xref: /llvm-project/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll (revision eeac0ffaf46cf9f9b0f680b9940cc4b68a0286d8)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
2; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX12 %s
3; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx940 < %s | FileCheck -check-prefix=GFX940 %s
4; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11 %s
5; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s
6; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx90a < %s | FileCheck -check-prefix=GFX90A %s
7; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx908 < %s | FileCheck -check-prefix=GFX908 %s
8; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=tonga < %s | FileCheck -check-prefix=GFX8 %s
9; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=hawaii < %s | FileCheck -check-prefix=GFX7 %s
10
11; --------------------------------------------------------------------
12; float
13; --------------------------------------------------------------------
14
; Test: value-returning `atomicrmw fmin` of a float through a flat pointer with
; no address offset, agent syncscope, seq_cst ordering, and the
; !amdgpu.no.fine.grained.memory annotation.
; Expected lowering per the check lines below:
;   - GFX12: a single flat_atomic_min_num_f32 with th:TH_ATOMIC_RETURN.
;   - GFX11: a single flat_atomic_min_f32 with glc.
;   - GFX10 and GFX7: a single flat_atomic_fmin with glc.
;   - GFX940, GFX90A, GFX908 and GFX8: an initial flat_load_dword followed by a
;     flat_atomic_cmpswap retry loop (%atomicrmw.start) that recomputes the
;     minimum until the compare-and-swap returns the value it expected.
15define float @flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory(ptr %ptr, float %val) #0 {
16; GFX12-LABEL: flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory:
17; GFX12:       ; %bb.0:
18; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
19; GFX12-NEXT:    s_wait_expcnt 0x0
20; GFX12-NEXT:    s_wait_samplecnt 0x0
21; GFX12-NEXT:    s_wait_bvhcnt 0x0
22; GFX12-NEXT:    s_wait_kmcnt 0x0
23; GFX12-NEXT:    s_wait_storecnt 0x0
24; GFX12-NEXT:    flat_atomic_min_num_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
25; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
26; GFX12-NEXT:    global_inv scope:SCOPE_DEV
27; GFX12-NEXT:    s_setpc_b64 s[30:31]
28;
29; GFX940-LABEL: flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory:
30; GFX940:       ; %bb.0:
31; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
32; GFX940-NEXT:    flat_load_dword v3, v[0:1]
33; GFX940-NEXT:    s_mov_b64 s[0:1], 0
34; GFX940-NEXT:    v_max_f32_e32 v2, v2, v2
35; GFX940-NEXT:  .LBB0_1: ; %atomicrmw.start
36; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
37; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
38; GFX940-NEXT:    v_mov_b32_e32 v5, v3
39; GFX940-NEXT:    v_max_f32_e32 v3, v5, v5
40; GFX940-NEXT:    v_min_f32_e32 v4, v3, v2
41; GFX940-NEXT:    buffer_wbl2 sc1
42; GFX940-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0
43; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
44; GFX940-NEXT:    buffer_inv sc1
45; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
46; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
47; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
48; GFX940-NEXT:    s_cbranch_execnz .LBB0_1
49; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
50; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
51; GFX940-NEXT:    v_mov_b32_e32 v0, v3
52; GFX940-NEXT:    s_setpc_b64 s[30:31]
53;
54; GFX11-LABEL: flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory:
55; GFX11:       ; %bb.0:
56; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
57; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
58; GFX11-NEXT:    flat_atomic_min_f32 v0, v[0:1], v2 glc
59; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
60; GFX11-NEXT:    buffer_gl1_inv
61; GFX11-NEXT:    buffer_gl0_inv
62; GFX11-NEXT:    s_setpc_b64 s[30:31]
63;
64; GFX10-LABEL: flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory:
65; GFX10:       ; %bb.0:
66; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
67; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
68; GFX10-NEXT:    flat_atomic_fmin v0, v[0:1], v2 glc
69; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
70; GFX10-NEXT:    buffer_gl1_inv
71; GFX10-NEXT:    buffer_gl0_inv
72; GFX10-NEXT:    s_setpc_b64 s[30:31]
73;
74; GFX90A-LABEL: flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory:
75; GFX90A:       ; %bb.0:
76; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
77; GFX90A-NEXT:    flat_load_dword v3, v[0:1]
78; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
79; GFX90A-NEXT:    v_max_f32_e32 v2, v2, v2
80; GFX90A-NEXT:  .LBB0_1: ; %atomicrmw.start
81; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
82; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
83; GFX90A-NEXT:    v_mov_b32_e32 v5, v3
84; GFX90A-NEXT:    v_max_f32_e32 v3, v5, v5
85; GFX90A-NEXT:    v_min_f32_e32 v4, v3, v2
86; GFX90A-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[4:5] glc
87; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
88; GFX90A-NEXT:    buffer_wbinvl1
89; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
90; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
91; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
92; GFX90A-NEXT:    s_cbranch_execnz .LBB0_1
93; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
94; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
95; GFX90A-NEXT:    v_mov_b32_e32 v0, v3
96; GFX90A-NEXT:    s_setpc_b64 s[30:31]
97;
98; GFX908-LABEL: flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory:
99; GFX908:       ; %bb.0:
100; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
101; GFX908-NEXT:    flat_load_dword v3, v[0:1]
102; GFX908-NEXT:    s_mov_b64 s[4:5], 0
103; GFX908-NEXT:    v_max_f32_e32 v2, v2, v2
104; GFX908-NEXT:  .LBB0_1: ; %atomicrmw.start
105; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
106; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
107; GFX908-NEXT:    v_mov_b32_e32 v4, v3
108; GFX908-NEXT:    v_max_f32_e32 v3, v4, v4
109; GFX908-NEXT:    v_min_f32_e32 v3, v3, v2
110; GFX908-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
111; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
112; GFX908-NEXT:    buffer_wbinvl1
113; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
114; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
115; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
116; GFX908-NEXT:    s_cbranch_execnz .LBB0_1
117; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
118; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
119; GFX908-NEXT:    v_mov_b32_e32 v0, v3
120; GFX908-NEXT:    s_setpc_b64 s[30:31]
121;
122; GFX8-LABEL: flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory:
123; GFX8:       ; %bb.0:
124; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
125; GFX8-NEXT:    flat_load_dword v3, v[0:1]
126; GFX8-NEXT:    s_mov_b64 s[4:5], 0
127; GFX8-NEXT:    v_mul_f32_e32 v2, 1.0, v2
128; GFX8-NEXT:  .LBB0_1: ; %atomicrmw.start
129; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
130; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
131; GFX8-NEXT:    v_mov_b32_e32 v4, v3
132; GFX8-NEXT:    v_mul_f32_e32 v3, 1.0, v4
133; GFX8-NEXT:    v_min_f32_e32 v3, v3, v2
134; GFX8-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
135; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
136; GFX8-NEXT:    buffer_wbinvl1
137; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
138; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
139; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
140; GFX8-NEXT:    s_cbranch_execnz .LBB0_1
141; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
142; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
143; GFX8-NEXT:    v_mov_b32_e32 v0, v3
144; GFX8-NEXT:    s_setpc_b64 s[30:31]
145;
146; GFX7-LABEL: flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory:
147; GFX7:       ; %bb.0:
148; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
149; GFX7-NEXT:    flat_atomic_fmin v0, v[0:1], v2 glc
150; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
151; GFX7-NEXT:    buffer_wbinvl1
152; GFX7-NEXT:    s_setpc_b64 s[30:31]
; IR under test: the atomic's result is returned, so the returning form of the
; operation is required.
153  %result = atomicrmw fmin ptr %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
154  ret float %result
155}
156
; Test: value-returning `atomicrmw fmin` of a float through a flat pointer that
; is first advanced by 511 float elements (the check lines show this as a byte
; offset of 2044 / 0x7fc), agent syncscope, seq_cst ordering, with the
; !amdgpu.no.fine.grained.memory annotation.
; Expected lowering per the check lines below:
;   - GFX12 and GFX11: native float-min atomic with the offset folded into the
;     instruction as offset:2044.
;   - GFX940, GFX90A and GFX908: flat_load_dword + flat_atomic_cmpswap retry
;     loop, with offset:2044 folded into both memory instructions.
;   - GFX10 and GFX7: the address is materialized with a 64-bit add of 0x7fc,
;     then a native flat_atomic_fmin with no immediate offset is used.
;   - GFX8: the address is materialized with a 64-bit add of 0x7fc, then a
;     flat_load_dword + flat_atomic_cmpswap retry loop is used.
157define float @flat_agent_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory(ptr %ptr, float %val) #0 {
158; GFX12-LABEL: flat_agent_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
159; GFX12:       ; %bb.0:
160; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
161; GFX12-NEXT:    s_wait_expcnt 0x0
162; GFX12-NEXT:    s_wait_samplecnt 0x0
163; GFX12-NEXT:    s_wait_bvhcnt 0x0
164; GFX12-NEXT:    s_wait_kmcnt 0x0
165; GFX12-NEXT:    s_wait_storecnt 0x0
166; GFX12-NEXT:    flat_atomic_min_num_f32 v0, v[0:1], v2 offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
167; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
168; GFX12-NEXT:    global_inv scope:SCOPE_DEV
169; GFX12-NEXT:    s_setpc_b64 s[30:31]
170;
171; GFX940-LABEL: flat_agent_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
172; GFX940:       ; %bb.0:
173; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
174; GFX940-NEXT:    flat_load_dword v3, v[0:1] offset:2044
175; GFX940-NEXT:    s_mov_b64 s[0:1], 0
176; GFX940-NEXT:    v_max_f32_e32 v2, v2, v2
177; GFX940-NEXT:  .LBB1_1: ; %atomicrmw.start
178; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
179; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
180; GFX940-NEXT:    v_mov_b32_e32 v5, v3
181; GFX940-NEXT:    v_max_f32_e32 v3, v5, v5
182; GFX940-NEXT:    v_min_f32_e32 v4, v3, v2
183; GFX940-NEXT:    buffer_wbl2 sc1
184; GFX940-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0
185; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
186; GFX940-NEXT:    buffer_inv sc1
187; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
188; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
189; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
190; GFX940-NEXT:    s_cbranch_execnz .LBB1_1
191; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
192; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
193; GFX940-NEXT:    v_mov_b32_e32 v0, v3
194; GFX940-NEXT:    s_setpc_b64 s[30:31]
195;
196; GFX11-LABEL: flat_agent_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
197; GFX11:       ; %bb.0:
198; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
199; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
200; GFX11-NEXT:    flat_atomic_min_f32 v0, v[0:1], v2 offset:2044 glc
201; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
202; GFX11-NEXT:    buffer_gl1_inv
203; GFX11-NEXT:    buffer_gl0_inv
204; GFX11-NEXT:    s_setpc_b64 s[30:31]
205;
206; GFX10-LABEL: flat_agent_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
207; GFX10:       ; %bb.0:
208; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
209; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, 0x7fc, v0
210; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
211; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
212; GFX10-NEXT:    flat_atomic_fmin v0, v[0:1], v2 glc
213; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
214; GFX10-NEXT:    buffer_gl1_inv
215; GFX10-NEXT:    buffer_gl0_inv
216; GFX10-NEXT:    s_setpc_b64 s[30:31]
217;
218; GFX90A-LABEL: flat_agent_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
219; GFX90A:       ; %bb.0:
220; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
221; GFX90A-NEXT:    flat_load_dword v3, v[0:1] offset:2044
222; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
223; GFX90A-NEXT:    v_max_f32_e32 v2, v2, v2
224; GFX90A-NEXT:  .LBB1_1: ; %atomicrmw.start
225; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
226; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
227; GFX90A-NEXT:    v_mov_b32_e32 v5, v3
228; GFX90A-NEXT:    v_max_f32_e32 v3, v5, v5
229; GFX90A-NEXT:    v_min_f32_e32 v4, v3, v2
230; GFX90A-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc
231; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
232; GFX90A-NEXT:    buffer_wbinvl1
233; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
234; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
235; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
236; GFX90A-NEXT:    s_cbranch_execnz .LBB1_1
237; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
238; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
239; GFX90A-NEXT:    v_mov_b32_e32 v0, v3
240; GFX90A-NEXT:    s_setpc_b64 s[30:31]
241;
242; GFX908-LABEL: flat_agent_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
243; GFX908:       ; %bb.0:
244; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
245; GFX908-NEXT:    flat_load_dword v3, v[0:1] offset:2044
246; GFX908-NEXT:    s_mov_b64 s[4:5], 0
247; GFX908-NEXT:    v_max_f32_e32 v2, v2, v2
248; GFX908-NEXT:  .LBB1_1: ; %atomicrmw.start
249; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
250; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
251; GFX908-NEXT:    v_mov_b32_e32 v4, v3
252; GFX908-NEXT:    v_max_f32_e32 v3, v4, v4
253; GFX908-NEXT:    v_min_f32_e32 v3, v3, v2
254; GFX908-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc
255; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
256; GFX908-NEXT:    buffer_wbinvl1
257; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
258; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
259; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
260; GFX908-NEXT:    s_cbranch_execnz .LBB1_1
261; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
262; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
263; GFX908-NEXT:    v_mov_b32_e32 v0, v3
264; GFX908-NEXT:    s_setpc_b64 s[30:31]
265;
266; GFX8-LABEL: flat_agent_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
267; GFX8:       ; %bb.0:
268; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
269; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x7fc, v0
270; GFX8-NEXT:    v_addc_u32_e32 v4, vcc, 0, v1, vcc
271; GFX8-NEXT:    flat_load_dword v0, v[3:4]
272; GFX8-NEXT:    s_mov_b64 s[4:5], 0
273; GFX8-NEXT:    v_mul_f32_e32 v1, 1.0, v2
274; GFX8-NEXT:  .LBB1_1: ; %atomicrmw.start
275; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
276; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
277; GFX8-NEXT:    v_mov_b32_e32 v6, v0
278; GFX8-NEXT:    v_mul_f32_e32 v0, 1.0, v6
279; GFX8-NEXT:    v_min_f32_e32 v5, v0, v1
280; GFX8-NEXT:    flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
281; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
282; GFX8-NEXT:    buffer_wbinvl1
283; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v6
284; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
285; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
286; GFX8-NEXT:    s_cbranch_execnz .LBB1_1
287; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
288; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
289; GFX8-NEXT:    s_setpc_b64 s[30:31]
290;
291; GFX7-LABEL: flat_agent_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
292; GFX7:       ; %bb.0:
293; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
294; GFX7-NEXT:    v_add_i32_e32 v0, vcc, 0x7fc, v0
295; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
296; GFX7-NEXT:    flat_atomic_fmin v0, v[0:1], v2 glc
297; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
298; GFX7-NEXT:    buffer_wbinvl1
299; GFX7-NEXT:    s_setpc_b64 s[30:31]
; IR under test: index 511 on a float element type produces the positive
; offset seen as 2044 / 0x7fc in the expected output above.
300  %gep = getelementptr float, ptr %ptr, i64 511
301  %result = atomicrmw fmin ptr %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
302  ret float %result
303}
304
; Test: value-returning `atomicrmw fmin` of a float through a flat pointer that
; is first moved back by 512 float elements (the check lines show this as a
; byte offset of -2048 / 0xfffff800), agent syncscope, seq_cst ordering, with
; the !amdgpu.no.fine.grained.memory annotation.
; Expected lowering per the check lines below:
;   - GFX12: native flat_atomic_min_num_f32 with the negative offset folded in
;     as offset:-2048.
;   - GFX11, GFX10 and GFX7: the address is materialized with a 64-bit add
;     (low half += 0xfffff800, high half += -1 plus carry), followed by a
;     native float-min atomic with no immediate offset.
;   - GFX940, GFX90A and GFX908: flat_load_dword + flat_atomic_cmpswap retry
;     loop. The currently expected output computes the adjusted address twice,
;     once for the initial load and once for the cmpswap (GFX940 uses
;     v_lshl_add_u64 for the second copy).
;   - GFX8: the address is computed once and used by both the initial load and
;     the flat_atomic_cmpswap retry loop.
305define float @flat_agent_atomic_fmin_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory(ptr %ptr, float %val) #0 {
306; GFX12-LABEL: flat_agent_atomic_fmin_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory:
307; GFX12:       ; %bb.0:
308; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
309; GFX12-NEXT:    s_wait_expcnt 0x0
310; GFX12-NEXT:    s_wait_samplecnt 0x0
311; GFX12-NEXT:    s_wait_bvhcnt 0x0
312; GFX12-NEXT:    s_wait_kmcnt 0x0
313; GFX12-NEXT:    s_wait_storecnt 0x0
314; GFX12-NEXT:    flat_atomic_min_num_f32 v0, v[0:1], v2 offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
315; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
316; GFX12-NEXT:    global_inv scope:SCOPE_DEV
317; GFX12-NEXT:    s_setpc_b64 s[30:31]
318;
319; GFX940-LABEL: flat_agent_atomic_fmin_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory:
320; GFX940:       ; %bb.0:
321; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
322; GFX940-NEXT:    v_mov_b32_e32 v4, v0
323; GFX940-NEXT:    v_mov_b32_e32 v5, v1
324; GFX940-NEXT:    v_add_co_u32_e32 v0, vcc, 0xfffff800, v4
325; GFX940-NEXT:    s_movk_i32 s0, 0xf800
326; GFX940-NEXT:    s_nop 0
327; GFX940-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v5, vcc
328; GFX940-NEXT:    flat_load_dword v0, v[0:1]
329; GFX940-NEXT:    s_mov_b32 s1, -1
330; GFX940-NEXT:    v_lshl_add_u64 v[4:5], v[4:5], 0, s[0:1]
331; GFX940-NEXT:    s_mov_b64 s[0:1], 0
332; GFX940-NEXT:    v_max_f32_e32 v1, v2, v2
333; GFX940-NEXT:  .LBB2_1: ; %atomicrmw.start
334; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
335; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
336; GFX940-NEXT:    v_mov_b32_e32 v3, v0
337; GFX940-NEXT:    v_max_f32_e32 v0, v3, v3
338; GFX940-NEXT:    v_min_f32_e32 v2, v0, v1
339; GFX940-NEXT:    buffer_wbl2 sc1
340; GFX940-NEXT:    flat_atomic_cmpswap v0, v[4:5], v[2:3] sc0
341; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
342; GFX940-NEXT:    buffer_inv sc1
343; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v3
344; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
345; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
346; GFX940-NEXT:    s_cbranch_execnz .LBB2_1
347; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
348; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
349; GFX940-NEXT:    s_setpc_b64 s[30:31]
350;
351; GFX11-LABEL: flat_agent_atomic_fmin_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory:
352; GFX11:       ; %bb.0:
353; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
354; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
355; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
356; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
357; GFX11-NEXT:    flat_atomic_min_f32 v0, v[0:1], v2 glc
358; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
359; GFX11-NEXT:    buffer_gl1_inv
360; GFX11-NEXT:    buffer_gl0_inv
361; GFX11-NEXT:    s_setpc_b64 s[30:31]
362;
363; GFX10-LABEL: flat_agent_atomic_fmin_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory:
364; GFX10:       ; %bb.0:
365; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
366; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
367; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
368; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
369; GFX10-NEXT:    flat_atomic_fmin v0, v[0:1], v2 glc
370; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
371; GFX10-NEXT:    buffer_gl1_inv
372; GFX10-NEXT:    buffer_gl0_inv
373; GFX10-NEXT:    s_setpc_b64 s[30:31]
374;
375; GFX90A-LABEL: flat_agent_atomic_fmin_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory:
376; GFX90A:       ; %bb.0:
377; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
378; GFX90A-NEXT:    v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
379; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, -1, v1, vcc
380; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
381; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
382; GFX90A-NEXT:    flat_load_dword v0, v[0:1]
383; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
384; GFX90A-NEXT:    v_max_f32_e32 v1, v2, v2
385; GFX90A-NEXT:  .LBB2_1: ; %atomicrmw.start
386; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
387; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
388; GFX90A-NEXT:    v_mov_b32_e32 v3, v0
389; GFX90A-NEXT:    v_max_f32_e32 v0, v3, v3
390; GFX90A-NEXT:    v_min_f32_e32 v2, v0, v1
391; GFX90A-NEXT:    flat_atomic_cmpswap v0, v[4:5], v[2:3] glc
392; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
393; GFX90A-NEXT:    buffer_wbinvl1
394; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v3
395; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
396; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
397; GFX90A-NEXT:    s_cbranch_execnz .LBB2_1
398; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
399; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
400; GFX90A-NEXT:    s_setpc_b64 s[30:31]
401;
402; GFX908-LABEL: flat_agent_atomic_fmin_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory:
403; GFX908:       ; %bb.0:
404; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
405; GFX908-NEXT:    v_add_co_u32_e32 v3, vcc, 0xfffff800, v0
406; GFX908-NEXT:    v_addc_co_u32_e32 v4, vcc, -1, v1, vcc
407; GFX908-NEXT:    v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
408; GFX908-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
409; GFX908-NEXT:    flat_load_dword v0, v[0:1]
410; GFX908-NEXT:    s_mov_b64 s[4:5], 0
411; GFX908-NEXT:    v_max_f32_e32 v1, v2, v2
412; GFX908-NEXT:  .LBB2_1: ; %atomicrmw.start
413; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
414; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
415; GFX908-NEXT:    v_mov_b32_e32 v6, v0
416; GFX908-NEXT:    v_max_f32_e32 v0, v6, v6
417; GFX908-NEXT:    v_min_f32_e32 v5, v0, v1
418; GFX908-NEXT:    flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
419; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
420; GFX908-NEXT:    buffer_wbinvl1
421; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v6
422; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
423; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
424; GFX908-NEXT:    s_cbranch_execnz .LBB2_1
425; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
426; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
427; GFX908-NEXT:    s_setpc_b64 s[30:31]
428;
429; GFX8-LABEL: flat_agent_atomic_fmin_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory:
430; GFX8:       ; %bb.0:
431; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
432; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0xfffff800, v0
433; GFX8-NEXT:    v_addc_u32_e32 v4, vcc, -1, v1, vcc
434; GFX8-NEXT:    flat_load_dword v0, v[3:4]
435; GFX8-NEXT:    s_mov_b64 s[4:5], 0
436; GFX8-NEXT:    v_mul_f32_e32 v1, 1.0, v2
437; GFX8-NEXT:  .LBB2_1: ; %atomicrmw.start
438; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
439; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
440; GFX8-NEXT:    v_mov_b32_e32 v6, v0
441; GFX8-NEXT:    v_mul_f32_e32 v0, 1.0, v6
442; GFX8-NEXT:    v_min_f32_e32 v5, v0, v1
443; GFX8-NEXT:    flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
444; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
445; GFX8-NEXT:    buffer_wbinvl1
446; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v6
447; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
448; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
449; GFX8-NEXT:    s_cbranch_execnz .LBB2_1
450; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
451; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
452; GFX8-NEXT:    s_setpc_b64 s[30:31]
453;
454; GFX7-LABEL: flat_agent_atomic_fmin_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory:
455; GFX7:       ; %bb.0:
456; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
457; GFX7-NEXT:    v_add_i32_e32 v0, vcc, 0xfffff800, v0
458; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, -1, v1, vcc
459; GFX7-NEXT:    flat_atomic_fmin v0, v[0:1], v2 glc
460; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
461; GFX7-NEXT:    buffer_wbinvl1
462; GFX7-NEXT:    s_setpc_b64 s[30:31]
; IR under test: index -512 on a float element type produces the negative
; offset seen as -2048 / 0xfffff800 in the expected output above.
463  %gep = getelementptr float, ptr %ptr, i64 -512
464  %result = atomicrmw fmin ptr %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
465  ret float %result
466}
467
; Test: `atomicrmw fmin` of a float through a flat pointer with no address
; offset whose result is unused (the function returns void), agent syncscope,
; seq_cst ordering, with the !amdgpu.no.fine.grained.memory annotation.
; Expected lowering per the check lines below:
;   - GFX12: flat_atomic_min_num_f32 without a destination register or
;     TH_ATOMIC_RETURN, followed by s_wait_storecnt_dscnt.
;   - GFX11: flat_atomic_min_f32 without a destination register or glc.
;   - GFX10 and GFX7: flat_atomic_fmin without a destination register or glc.
;   - GFX940, GFX90A, GFX908 and GFX8: flat_load_dword + flat_atomic_cmpswap
;     retry loop; the value returned by the cmpswap is copied back as the next
;     expected value before the loop repeats.
468define void @flat_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory(ptr %ptr, float %val) #0 {
469; GFX12-LABEL: flat_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory:
470; GFX12:       ; %bb.0:
471; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
472; GFX12-NEXT:    s_wait_expcnt 0x0
473; GFX12-NEXT:    s_wait_samplecnt 0x0
474; GFX12-NEXT:    s_wait_bvhcnt 0x0
475; GFX12-NEXT:    s_wait_kmcnt 0x0
476; GFX12-NEXT:    s_wait_storecnt 0x0
477; GFX12-NEXT:    flat_atomic_min_num_f32 v[0:1], v2 scope:SCOPE_DEV
478; GFX12-NEXT:    s_wait_storecnt_dscnt 0x0
479; GFX12-NEXT:    global_inv scope:SCOPE_DEV
480; GFX12-NEXT:    s_setpc_b64 s[30:31]
481;
482; GFX940-LABEL: flat_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory:
483; GFX940:       ; %bb.0:
484; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
485; GFX940-NEXT:    flat_load_dword v3, v[0:1]
486; GFX940-NEXT:    s_mov_b64 s[0:1], 0
487; GFX940-NEXT:    v_max_f32_e32 v4, v2, v2
488; GFX940-NEXT:  .LBB3_1: ; %atomicrmw.start
489; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
490; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
491; GFX940-NEXT:    v_max_f32_e32 v2, v3, v3
492; GFX940-NEXT:    v_min_f32_e32 v2, v2, v4
493; GFX940-NEXT:    buffer_wbl2 sc1
494; GFX940-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0
495; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
496; GFX940-NEXT:    buffer_inv sc1
497; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
498; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
499; GFX940-NEXT:    v_mov_b32_e32 v3, v2
500; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
501; GFX940-NEXT:    s_cbranch_execnz .LBB3_1
502; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
503; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
504; GFX940-NEXT:    s_setpc_b64 s[30:31]
505;
506; GFX11-LABEL: flat_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory:
507; GFX11:       ; %bb.0:
508; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
509; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
510; GFX11-NEXT:    flat_atomic_min_f32 v[0:1], v2
511; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
512; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
513; GFX11-NEXT:    buffer_gl1_inv
514; GFX11-NEXT:    buffer_gl0_inv
515; GFX11-NEXT:    s_setpc_b64 s[30:31]
516;
517; GFX10-LABEL: flat_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory:
518; GFX10:       ; %bb.0:
519; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
520; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
521; GFX10-NEXT:    flat_atomic_fmin v[0:1], v2
522; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
523; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
524; GFX10-NEXT:    buffer_gl1_inv
525; GFX10-NEXT:    buffer_gl0_inv
526; GFX10-NEXT:    s_setpc_b64 s[30:31]
527;
528; GFX90A-LABEL: flat_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory:
529; GFX90A:       ; %bb.0:
530; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
531; GFX90A-NEXT:    flat_load_dword v3, v[0:1]
532; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
533; GFX90A-NEXT:    v_max_f32_e32 v4, v2, v2
534; GFX90A-NEXT:  .LBB3_1: ; %atomicrmw.start
535; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
536; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
537; GFX90A-NEXT:    v_max_f32_e32 v2, v3, v3
538; GFX90A-NEXT:    v_min_f32_e32 v2, v2, v4
539; GFX90A-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
540; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
541; GFX90A-NEXT:    buffer_wbinvl1
542; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
543; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
544; GFX90A-NEXT:    v_mov_b32_e32 v3, v2
545; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
546; GFX90A-NEXT:    s_cbranch_execnz .LBB3_1
547; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
548; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
549; GFX90A-NEXT:    s_setpc_b64 s[30:31]
550;
551; GFX908-LABEL: flat_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory:
552; GFX908:       ; %bb.0:
553; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
554; GFX908-NEXT:    flat_load_dword v3, v[0:1]
555; GFX908-NEXT:    s_mov_b64 s[4:5], 0
556; GFX908-NEXT:    v_max_f32_e32 v4, v2, v2
557; GFX908-NEXT:  .LBB3_1: ; %atomicrmw.start
558; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
559; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
560; GFX908-NEXT:    v_max_f32_e32 v2, v3, v3
561; GFX908-NEXT:    v_min_f32_e32 v2, v2, v4
562; GFX908-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
563; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
564; GFX908-NEXT:    buffer_wbinvl1
565; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
566; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
567; GFX908-NEXT:    v_mov_b32_e32 v3, v2
568; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
569; GFX908-NEXT:    s_cbranch_execnz .LBB3_1
570; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
571; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
572; GFX908-NEXT:    s_setpc_b64 s[30:31]
573;
574; GFX8-LABEL: flat_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory:
575; GFX8:       ; %bb.0:
576; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
577; GFX8-NEXT:    flat_load_dword v3, v[0:1]
578; GFX8-NEXT:    s_mov_b64 s[4:5], 0
579; GFX8-NEXT:    v_mul_f32_e32 v4, 1.0, v2
580; GFX8-NEXT:  .LBB3_1: ; %atomicrmw.start
581; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
582; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
583; GFX8-NEXT:    v_mul_f32_e32 v2, 1.0, v3
584; GFX8-NEXT:    v_min_f32_e32 v2, v2, v4
585; GFX8-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
586; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
587; GFX8-NEXT:    buffer_wbinvl1
588; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
589; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
590; GFX8-NEXT:    v_mov_b32_e32 v3, v2
591; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
592; GFX8-NEXT:    s_cbranch_execnz .LBB3_1
593; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
594; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
595; GFX8-NEXT:    s_setpc_b64 s[30:31]
596;
597; GFX7-LABEL: flat_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory:
598; GFX7:       ; %bb.0:
599; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
600; GFX7-NEXT:    flat_atomic_fmin v[0:1], v2
601; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
602; GFX7-NEXT:    buffer_wbinvl1
603; GFX7-NEXT:    s_setpc_b64 s[30:31]
; IR under test: %unused is never read, which is what permits the no-return
; instruction forms expected above.
604  %unused = atomicrmw fmin ptr %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
605  ret void
606}
607
608define void @flat_agent_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory(ptr %ptr, float %val) #0 {
609; GFX12-LABEL: flat_agent_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
610; GFX12:       ; %bb.0:
611; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
612; GFX12-NEXT:    s_wait_expcnt 0x0
613; GFX12-NEXT:    s_wait_samplecnt 0x0
614; GFX12-NEXT:    s_wait_bvhcnt 0x0
615; GFX12-NEXT:    s_wait_kmcnt 0x0
616; GFX12-NEXT:    s_wait_storecnt 0x0
617; GFX12-NEXT:    flat_atomic_min_num_f32 v[0:1], v2 offset:2044 scope:SCOPE_DEV
618; GFX12-NEXT:    s_wait_storecnt_dscnt 0x0
619; GFX12-NEXT:    global_inv scope:SCOPE_DEV
620; GFX12-NEXT:    s_setpc_b64 s[30:31]
621;
622; GFX940-LABEL: flat_agent_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
623; GFX940:       ; %bb.0:
624; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
625; GFX940-NEXT:    flat_load_dword v3, v[0:1] offset:2044
626; GFX940-NEXT:    s_mov_b64 s[0:1], 0
627; GFX940-NEXT:    v_max_f32_e32 v4, v2, v2
628; GFX940-NEXT:  .LBB4_1: ; %atomicrmw.start
629; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
630; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
631; GFX940-NEXT:    v_max_f32_e32 v2, v3, v3
632; GFX940-NEXT:    v_min_f32_e32 v2, v2, v4
633; GFX940-NEXT:    buffer_wbl2 sc1
634; GFX940-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0
635; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
636; GFX940-NEXT:    buffer_inv sc1
637; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
638; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
639; GFX940-NEXT:    v_mov_b32_e32 v3, v2
640; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
641; GFX940-NEXT:    s_cbranch_execnz .LBB4_1
642; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
643; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
644; GFX940-NEXT:    s_setpc_b64 s[30:31]
645;
646; GFX11-LABEL: flat_agent_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
647; GFX11:       ; %bb.0:
648; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
649; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
650; GFX11-NEXT:    flat_atomic_min_f32 v[0:1], v2 offset:2044
651; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
652; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
653; GFX11-NEXT:    buffer_gl1_inv
654; GFX11-NEXT:    buffer_gl0_inv
655; GFX11-NEXT:    s_setpc_b64 s[30:31]
656;
657; GFX10-LABEL: flat_agent_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
658; GFX10:       ; %bb.0:
659; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
660; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, 0x7fc, v0
661; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
662; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
663; GFX10-NEXT:    flat_atomic_fmin v[0:1], v2
664; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
665; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
666; GFX10-NEXT:    buffer_gl1_inv
667; GFX10-NEXT:    buffer_gl0_inv
668; GFX10-NEXT:    s_setpc_b64 s[30:31]
669;
670; GFX90A-LABEL: flat_agent_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
671; GFX90A:       ; %bb.0:
672; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
673; GFX90A-NEXT:    flat_load_dword v3, v[0:1] offset:2044
674; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
675; GFX90A-NEXT:    v_max_f32_e32 v4, v2, v2
676; GFX90A-NEXT:  .LBB4_1: ; %atomicrmw.start
677; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
678; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
679; GFX90A-NEXT:    v_max_f32_e32 v2, v3, v3
680; GFX90A-NEXT:    v_min_f32_e32 v2, v2, v4
681; GFX90A-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc
682; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
683; GFX90A-NEXT:    buffer_wbinvl1
684; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
685; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
686; GFX90A-NEXT:    v_mov_b32_e32 v3, v2
687; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
688; GFX90A-NEXT:    s_cbranch_execnz .LBB4_1
689; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
690; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
691; GFX90A-NEXT:    s_setpc_b64 s[30:31]
692;
693; GFX908-LABEL: flat_agent_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
694; GFX908:       ; %bb.0:
695; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
696; GFX908-NEXT:    flat_load_dword v3, v[0:1] offset:2044
697; GFX908-NEXT:    s_mov_b64 s[4:5], 0
698; GFX908-NEXT:    v_max_f32_e32 v4, v2, v2
699; GFX908-NEXT:  .LBB4_1: ; %atomicrmw.start
700; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
701; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
702; GFX908-NEXT:    v_max_f32_e32 v2, v3, v3
703; GFX908-NEXT:    v_min_f32_e32 v2, v2, v4
704; GFX908-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc
705; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
706; GFX908-NEXT:    buffer_wbinvl1
707; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
708; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
709; GFX908-NEXT:    v_mov_b32_e32 v3, v2
710; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
711; GFX908-NEXT:    s_cbranch_execnz .LBB4_1
712; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
713; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
714; GFX908-NEXT:    s_setpc_b64 s[30:31]
715;
716; GFX8-LABEL: flat_agent_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
717; GFX8:       ; %bb.0:
718; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
719; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 0x7fc, v0
720; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
721; GFX8-NEXT:    flat_load_dword v3, v[0:1]
722; GFX8-NEXT:    s_mov_b64 s[4:5], 0
723; GFX8-NEXT:    v_mul_f32_e32 v4, 1.0, v2
724; GFX8-NEXT:  .LBB4_1: ; %atomicrmw.start
725; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
726; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
727; GFX8-NEXT:    v_mul_f32_e32 v2, 1.0, v3
728; GFX8-NEXT:    v_min_f32_e32 v2, v2, v4
729; GFX8-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
730; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
731; GFX8-NEXT:    buffer_wbinvl1
732; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
733; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
734; GFX8-NEXT:    v_mov_b32_e32 v3, v2
735; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
736; GFX8-NEXT:    s_cbranch_execnz .LBB4_1
737; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
738; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
739; GFX8-NEXT:    s_setpc_b64 s[30:31]
740;
741; GFX7-LABEL: flat_agent_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
742; GFX7:       ; %bb.0:
743; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
744; GFX7-NEXT:    v_add_i32_e32 v0, vcc, 0x7fc, v0
745; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
746; GFX7-NEXT:    flat_atomic_fmin v[0:1], v2
747; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
748; GFX7-NEXT:    buffer_wbinvl1
749; GFX7-NEXT:    s_setpc_b64 s[30:31]
750  %gep = getelementptr float, ptr %ptr, i64 511
751  %unused = atomicrmw fmin ptr %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
752  ret void
753}
754
; Agent-scope seq_cst `atomicrmw fmin` through a flat pointer, result unused,
; addressed at a negative element offset (-512 floats, i.e. -2048 bytes) and
; tagged with !amdgpu.no.fine.grained.memory.
; Per the autogenerated checks below: gfx1200 folds the offset into a single
; flat_atomic_min_num_f32; gfx1100, gfx1010 and hawaii add 0xfffff800 to the
; address and then issue a native flat float-min atomic; gfx940, gfx90a, gfx908
; and tonga expect a flat_atomic_cmpswap retry loop (%atomicrmw.start).
755define void @flat_agent_atomic_fmin_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory(ptr %ptr, float %val) #0 {
756; GFX12-LABEL: flat_agent_atomic_fmin_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory:
757; GFX12:       ; %bb.0:
758; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
759; GFX12-NEXT:    s_wait_expcnt 0x0
760; GFX12-NEXT:    s_wait_samplecnt 0x0
761; GFX12-NEXT:    s_wait_bvhcnt 0x0
762; GFX12-NEXT:    s_wait_kmcnt 0x0
763; GFX12-NEXT:    s_wait_storecnt 0x0
764; GFX12-NEXT:    flat_atomic_min_num_f32 v[0:1], v2 offset:-2048 scope:SCOPE_DEV
765; GFX12-NEXT:    s_wait_storecnt_dscnt 0x0
766; GFX12-NEXT:    global_inv scope:SCOPE_DEV
767; GFX12-NEXT:    s_setpc_b64 s[30:31]
768;
769; GFX940-LABEL: flat_agent_atomic_fmin_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory:
770; GFX940:       ; %bb.0:
771; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
772; GFX940-NEXT:    v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
773; GFX940-NEXT:    s_movk_i32 s0, 0xf800
774; GFX940-NEXT:    s_nop 0
775; GFX940-NEXT:    v_addc_co_u32_e32 v5, vcc, -1, v1, vcc
776; GFX940-NEXT:    flat_load_dword v3, v[4:5]
777; GFX940-NEXT:    s_mov_b32 s1, -1
778; GFX940-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1]
779; GFX940-NEXT:    s_mov_b64 s[0:1], 0
780; GFX940-NEXT:    v_max_f32_e32 v4, v2, v2
781; GFX940-NEXT:  .LBB5_1: ; %atomicrmw.start
782; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
783; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
784; GFX940-NEXT:    v_max_f32_e32 v2, v3, v3
785; GFX940-NEXT:    v_min_f32_e32 v2, v2, v4
786; GFX940-NEXT:    buffer_wbl2 sc1
787; GFX940-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0
788; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
789; GFX940-NEXT:    buffer_inv sc1
790; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
791; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
792; GFX940-NEXT:    v_mov_b32_e32 v3, v2
793; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
794; GFX940-NEXT:    s_cbranch_execnz .LBB5_1
795; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
796; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
797; GFX940-NEXT:    s_setpc_b64 s[30:31]
798;
799; GFX11-LABEL: flat_agent_atomic_fmin_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory:
800; GFX11:       ; %bb.0:
801; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
802; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
803; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
804; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
805; GFX11-NEXT:    flat_atomic_min_f32 v[0:1], v2
806; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
807; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
808; GFX11-NEXT:    buffer_gl1_inv
809; GFX11-NEXT:    buffer_gl0_inv
810; GFX11-NEXT:    s_setpc_b64 s[30:31]
811;
812; GFX10-LABEL: flat_agent_atomic_fmin_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory:
813; GFX10:       ; %bb.0:
814; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
815; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
816; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
817; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
818; GFX10-NEXT:    flat_atomic_fmin v[0:1], v2
819; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
820; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
821; GFX10-NEXT:    buffer_gl1_inv
822; GFX10-NEXT:    buffer_gl0_inv
823; GFX10-NEXT:    s_setpc_b64 s[30:31]
824;
825; GFX90A-LABEL: flat_agent_atomic_fmin_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory:
826; GFX90A:       ; %bb.0:
827; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
828; GFX90A-NEXT:    v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
829; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, -1, v1, vcc
830; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
831; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
832; GFX90A-NEXT:    flat_load_dword v1, v[0:1]
833; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
834; GFX90A-NEXT:    v_max_f32_e32 v2, v2, v2
835; GFX90A-NEXT:  .LBB5_1: ; %atomicrmw.start
836; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
837; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
838; GFX90A-NEXT:    v_max_f32_e32 v0, v1, v1
839; GFX90A-NEXT:    v_min_f32_e32 v0, v0, v2
840; GFX90A-NEXT:    flat_atomic_cmpswap v0, v[4:5], v[0:1] glc
841; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
842; GFX90A-NEXT:    buffer_wbinvl1
843; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
844; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
845; GFX90A-NEXT:    v_mov_b32_e32 v1, v0
846; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
847; GFX90A-NEXT:    s_cbranch_execnz .LBB5_1
848; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
849; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
850; GFX90A-NEXT:    s_setpc_b64 s[30:31]
851;
852; GFX908-LABEL: flat_agent_atomic_fmin_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory:
853; GFX908:       ; %bb.0:
854; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
855; GFX908-NEXT:    v_add_co_u32_e32 v3, vcc, 0xfffff800, v0
856; GFX908-NEXT:    v_addc_co_u32_e32 v4, vcc, -1, v1, vcc
857; GFX908-NEXT:    v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
858; GFX908-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
859; GFX908-NEXT:    flat_load_dword v1, v[0:1]
860; GFX908-NEXT:    s_mov_b64 s[4:5], 0
861; GFX908-NEXT:    v_max_f32_e32 v2, v2, v2
862; GFX908-NEXT:  .LBB5_1: ; %atomicrmw.start
863; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
864; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
865; GFX908-NEXT:    v_max_f32_e32 v0, v1, v1
866; GFX908-NEXT:    v_min_f32_e32 v0, v0, v2
867; GFX908-NEXT:    flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
868; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
869; GFX908-NEXT:    buffer_wbinvl1
870; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
871; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
872; GFX908-NEXT:    v_mov_b32_e32 v1, v0
873; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
874; GFX908-NEXT:    s_cbranch_execnz .LBB5_1
875; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
876; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
877; GFX908-NEXT:    s_setpc_b64 s[30:31]
878;
879; GFX8-LABEL: flat_agent_atomic_fmin_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory:
880; GFX8:       ; %bb.0:
881; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
882; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 0xfffff800, v0
883; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, -1, v1, vcc
884; GFX8-NEXT:    flat_load_dword v3, v[0:1]
885; GFX8-NEXT:    s_mov_b64 s[4:5], 0
886; GFX8-NEXT:    v_mul_f32_e32 v4, 1.0, v2
887; GFX8-NEXT:  .LBB5_1: ; %atomicrmw.start
888; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
889; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
890; GFX8-NEXT:    v_mul_f32_e32 v2, 1.0, v3
891; GFX8-NEXT:    v_min_f32_e32 v2, v2, v4
892; GFX8-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
893; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
894; GFX8-NEXT:    buffer_wbinvl1
895; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
896; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
897; GFX8-NEXT:    v_mov_b32_e32 v3, v2
898; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
899; GFX8-NEXT:    s_cbranch_execnz .LBB5_1
900; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
901; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
902; GFX8-NEXT:    s_setpc_b64 s[30:31]
903;
904; GFX7-LABEL: flat_agent_atomic_fmin_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory:
905; GFX7:       ; %bb.0:
906; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
907; GFX7-NEXT:    v_add_i32_e32 v0, vcc, 0xfffff800, v0
908; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, -1, v1, vcc
909; GFX7-NEXT:    flat_atomic_fmin v[0:1], v2
910; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
911; GFX7-NEXT:    buffer_wbinvl1
912; GFX7-NEXT:    s_setpc_b64 s[30:31]
  ; -512 float elements = -2048 bytes (offset:-2048 / 0xfffff800 in the checks above).
913  %gep = getelementptr float, ptr %ptr, i64 -512
  ; The atomicrmw result is never read and the function returns void.
914  %unused = atomicrmw fmin ptr %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
915  ret void
916}
917
; seq_cst `atomicrmw fmin` with no syncscope (the function name calls this the
; system case) through a flat pointer at +511 floats (2044 bytes), returning the
; atomicrmw result; tagged with !amdgpu.no.fine.grained.memory.
; Per the autogenerated checks below: gfx1200 and gfx1100 use a returning native
; float-min atomic with offset:2044; gfx1010 and hawaii first add 0x7fc to the
; address and then use a returning flat_atomic_fmin; gfx940, gfx90a, gfx908 and
; tonga expect a flat_atomic_cmpswap retry loop (%atomicrmw.start), with tonga
; also adding 0x7fc to the address up front.
918define float @flat_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory(ptr %ptr, float %val) #0 {
919; GFX12-LABEL: flat_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
920; GFX12:       ; %bb.0:
921; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
922; GFX12-NEXT:    s_wait_expcnt 0x0
923; GFX12-NEXT:    s_wait_samplecnt 0x0
924; GFX12-NEXT:    s_wait_bvhcnt 0x0
925; GFX12-NEXT:    s_wait_kmcnt 0x0
926; GFX12-NEXT:    global_wb scope:SCOPE_SYS
927; GFX12-NEXT:    s_wait_storecnt 0x0
928; GFX12-NEXT:    flat_atomic_min_num_f32 v0, v[0:1], v2 offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
929; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
930; GFX12-NEXT:    global_inv scope:SCOPE_SYS
931; GFX12-NEXT:    s_setpc_b64 s[30:31]
932;
933; GFX940-LABEL: flat_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
934; GFX940:       ; %bb.0:
935; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
936; GFX940-NEXT:    flat_load_dword v3, v[0:1] offset:2044
937; GFX940-NEXT:    s_mov_b64 s[0:1], 0
938; GFX940-NEXT:    v_max_f32_e32 v2, v2, v2
939; GFX940-NEXT:  .LBB6_1: ; %atomicrmw.start
940; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
941; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
942; GFX940-NEXT:    v_mov_b32_e32 v5, v3
943; GFX940-NEXT:    v_max_f32_e32 v3, v5, v5
944; GFX940-NEXT:    v_min_f32_e32 v4, v3, v2
945; GFX940-NEXT:    buffer_wbl2 sc0 sc1
946; GFX940-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 sc1
947; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
948; GFX940-NEXT:    buffer_inv sc0 sc1
949; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
950; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
951; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
952; GFX940-NEXT:    s_cbranch_execnz .LBB6_1
953; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
954; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
955; GFX940-NEXT:    v_mov_b32_e32 v0, v3
956; GFX940-NEXT:    s_setpc_b64 s[30:31]
957;
958; GFX11-LABEL: flat_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
959; GFX11:       ; %bb.0:
960; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
961; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
962; GFX11-NEXT:    flat_atomic_min_f32 v0, v[0:1], v2 offset:2044 glc
963; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
964; GFX11-NEXT:    buffer_gl1_inv
965; GFX11-NEXT:    buffer_gl0_inv
966; GFX11-NEXT:    s_setpc_b64 s[30:31]
967;
968; GFX10-LABEL: flat_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
969; GFX10:       ; %bb.0:
970; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
971; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, 0x7fc, v0
972; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
973; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
974; GFX10-NEXT:    flat_atomic_fmin v0, v[0:1], v2 glc
975; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
976; GFX10-NEXT:    buffer_gl1_inv
977; GFX10-NEXT:    buffer_gl0_inv
978; GFX10-NEXT:    s_setpc_b64 s[30:31]
979;
980; GFX90A-LABEL: flat_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
981; GFX90A:       ; %bb.0:
982; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
983; GFX90A-NEXT:    flat_load_dword v3, v[0:1] offset:2044
984; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
985; GFX90A-NEXT:    v_max_f32_e32 v2, v2, v2
986; GFX90A-NEXT:  .LBB6_1: ; %atomicrmw.start
987; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
988; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
989; GFX90A-NEXT:    v_mov_b32_e32 v5, v3
990; GFX90A-NEXT:    v_max_f32_e32 v3, v5, v5
991; GFX90A-NEXT:    v_min_f32_e32 v4, v3, v2
992; GFX90A-NEXT:    buffer_wbl2
993; GFX90A-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc
994; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
995; GFX90A-NEXT:    buffer_invl2
996; GFX90A-NEXT:    buffer_wbinvl1
997; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
998; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
999; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
1000; GFX90A-NEXT:    s_cbranch_execnz .LBB6_1
1001; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
1002; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
1003; GFX90A-NEXT:    v_mov_b32_e32 v0, v3
1004; GFX90A-NEXT:    s_setpc_b64 s[30:31]
1005;
1006; GFX908-LABEL: flat_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
1007; GFX908:       ; %bb.0:
1008; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1009; GFX908-NEXT:    flat_load_dword v3, v[0:1] offset:2044
1010; GFX908-NEXT:    s_mov_b64 s[4:5], 0
1011; GFX908-NEXT:    v_max_f32_e32 v2, v2, v2
1012; GFX908-NEXT:  .LBB6_1: ; %atomicrmw.start
1013; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
1014; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1015; GFX908-NEXT:    v_mov_b32_e32 v4, v3
1016; GFX908-NEXT:    v_max_f32_e32 v3, v4, v4
1017; GFX908-NEXT:    v_min_f32_e32 v3, v3, v2
1018; GFX908-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc
1019; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1020; GFX908-NEXT:    buffer_wbinvl1
1021; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
1022; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
1023; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
1024; GFX908-NEXT:    s_cbranch_execnz .LBB6_1
1025; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
1026; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
1027; GFX908-NEXT:    v_mov_b32_e32 v0, v3
1028; GFX908-NEXT:    s_setpc_b64 s[30:31]
1029;
1030; GFX8-LABEL: flat_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
1031; GFX8:       ; %bb.0:
1032; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1033; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x7fc, v0
1034; GFX8-NEXT:    v_addc_u32_e32 v4, vcc, 0, v1, vcc
1035; GFX8-NEXT:    flat_load_dword v0, v[3:4]
1036; GFX8-NEXT:    s_mov_b64 s[4:5], 0
1037; GFX8-NEXT:    v_mul_f32_e32 v1, 1.0, v2
1038; GFX8-NEXT:  .LBB6_1: ; %atomicrmw.start
1039; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
1040; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1041; GFX8-NEXT:    v_mov_b32_e32 v6, v0
1042; GFX8-NEXT:    v_mul_f32_e32 v0, 1.0, v6
1043; GFX8-NEXT:    v_min_f32_e32 v5, v0, v1
1044; GFX8-NEXT:    flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
1045; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1046; GFX8-NEXT:    buffer_wbinvl1
1047; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v6
1048; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
1049; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
1050; GFX8-NEXT:    s_cbranch_execnz .LBB6_1
1051; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
1052; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
1053; GFX8-NEXT:    s_setpc_b64 s[30:31]
1054;
1055; GFX7-LABEL: flat_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
1056; GFX7:       ; %bb.0:
1057; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1058; GFX7-NEXT:    v_add_i32_e32 v0, vcc, 0x7fc, v0
1059; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1060; GFX7-NEXT:    flat_atomic_fmin v0, v[0:1], v2 glc
1061; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1062; GFX7-NEXT:    buffer_wbinvl1
1063; GFX7-NEXT:    s_setpc_b64 s[30:31]
  ; 511 float elements = 2044 bytes (offset:2044 / 0x7fc in the checks above).
1064  %gep = getelementptr float, ptr %ptr, i64 511
  ; No syncscope operand here, unlike the syncscope("agent") variants in this file.
1065  %result = atomicrmw fmin ptr %gep, float %val seq_cst, !amdgpu.no.fine.grained.memory !0
1066  ret float %result
1067}
1068
; seq_cst `atomicrmw fmin` with no syncscope (the function name calls this the
; system case) through a flat pointer at +511 floats (2044 bytes), result
; unused; tagged with !amdgpu.no.fine.grained.memory.
; Per the autogenerated checks below: gfx1200 and gfx1100 use a native float-min
; atomic with offset:2044 and no destination register; gfx1010 and hawaii first
; add 0x7fc to the address and then use flat_atomic_fmin; gfx940, gfx90a, gfx908
; and tonga expect a flat_atomic_cmpswap retry loop (%atomicrmw.start), with
; tonga also adding 0x7fc to the address up front.
1069define void @flat_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory(ptr %ptr, float %val) #0 {
1070; GFX12-LABEL: flat_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
1071; GFX12:       ; %bb.0:
1072; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
1073; GFX12-NEXT:    s_wait_expcnt 0x0
1074; GFX12-NEXT:    s_wait_samplecnt 0x0
1075; GFX12-NEXT:    s_wait_bvhcnt 0x0
1076; GFX12-NEXT:    s_wait_kmcnt 0x0
1077; GFX12-NEXT:    global_wb scope:SCOPE_SYS
1078; GFX12-NEXT:    s_wait_storecnt 0x0
1079; GFX12-NEXT:    flat_atomic_min_num_f32 v[0:1], v2 offset:2044 scope:SCOPE_SYS
1080; GFX12-NEXT:    s_wait_storecnt_dscnt 0x0
1081; GFX12-NEXT:    global_inv scope:SCOPE_SYS
1082; GFX12-NEXT:    s_setpc_b64 s[30:31]
1083;
1084; GFX940-LABEL: flat_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
1085; GFX940:       ; %bb.0:
1086; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1087; GFX940-NEXT:    flat_load_dword v3, v[0:1] offset:2044
1088; GFX940-NEXT:    s_mov_b64 s[0:1], 0
1089; GFX940-NEXT:    v_max_f32_e32 v4, v2, v2
1090; GFX940-NEXT:  .LBB7_1: ; %atomicrmw.start
1091; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
1092; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1093; GFX940-NEXT:    v_max_f32_e32 v2, v3, v3
1094; GFX940-NEXT:    v_min_f32_e32 v2, v2, v4
1095; GFX940-NEXT:    buffer_wbl2 sc0 sc1
1096; GFX940-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 sc1
1097; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1098; GFX940-NEXT:    buffer_inv sc0 sc1
1099; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
1100; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
1101; GFX940-NEXT:    v_mov_b32_e32 v3, v2
1102; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
1103; GFX940-NEXT:    s_cbranch_execnz .LBB7_1
1104; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
1105; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
1106; GFX940-NEXT:    s_setpc_b64 s[30:31]
1107;
1108; GFX11-LABEL: flat_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
1109; GFX11:       ; %bb.0:
1110; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1111; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
1112; GFX11-NEXT:    flat_atomic_min_f32 v[0:1], v2 offset:2044
1113; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1114; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
1115; GFX11-NEXT:    buffer_gl1_inv
1116; GFX11-NEXT:    buffer_gl0_inv
1117; GFX11-NEXT:    s_setpc_b64 s[30:31]
1118;
1119; GFX10-LABEL: flat_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
1120; GFX10:       ; %bb.0:
1121; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1122; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, 0x7fc, v0
1123; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
1124; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1125; GFX10-NEXT:    flat_atomic_fmin v[0:1], v2
1126; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1127; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1128; GFX10-NEXT:    buffer_gl1_inv
1129; GFX10-NEXT:    buffer_gl0_inv
1130; GFX10-NEXT:    s_setpc_b64 s[30:31]
1131;
1132; GFX90A-LABEL: flat_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
1133; GFX90A:       ; %bb.0:
1134; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1135; GFX90A-NEXT:    flat_load_dword v3, v[0:1] offset:2044
1136; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
1137; GFX90A-NEXT:    v_max_f32_e32 v4, v2, v2
1138; GFX90A-NEXT:  .LBB7_1: ; %atomicrmw.start
1139; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
1140; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1141; GFX90A-NEXT:    v_max_f32_e32 v2, v3, v3
1142; GFX90A-NEXT:    v_min_f32_e32 v2, v2, v4
1143; GFX90A-NEXT:    buffer_wbl2
1144; GFX90A-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc
1145; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1146; GFX90A-NEXT:    buffer_invl2
1147; GFX90A-NEXT:    buffer_wbinvl1
1148; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
1149; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
1150; GFX90A-NEXT:    v_mov_b32_e32 v3, v2
1151; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
1152; GFX90A-NEXT:    s_cbranch_execnz .LBB7_1
1153; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
1154; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
1155; GFX90A-NEXT:    s_setpc_b64 s[30:31]
1156;
1157; GFX908-LABEL: flat_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
1158; GFX908:       ; %bb.0:
1159; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1160; GFX908-NEXT:    flat_load_dword v3, v[0:1] offset:2044
1161; GFX908-NEXT:    s_mov_b64 s[4:5], 0
1162; GFX908-NEXT:    v_max_f32_e32 v4, v2, v2
1163; GFX908-NEXT:  .LBB7_1: ; %atomicrmw.start
1164; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
1165; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1166; GFX908-NEXT:    v_max_f32_e32 v2, v3, v3
1167; GFX908-NEXT:    v_min_f32_e32 v2, v2, v4
1168; GFX908-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc
1169; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1170; GFX908-NEXT:    buffer_wbinvl1
1171; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
1172; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
1173; GFX908-NEXT:    v_mov_b32_e32 v3, v2
1174; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
1175; GFX908-NEXT:    s_cbranch_execnz .LBB7_1
1176; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
1177; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
1178; GFX908-NEXT:    s_setpc_b64 s[30:31]
1179;
1180; GFX8-LABEL: flat_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
1181; GFX8:       ; %bb.0:
1182; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1183; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 0x7fc, v0
1184; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1185; GFX8-NEXT:    flat_load_dword v3, v[0:1]
1186; GFX8-NEXT:    s_mov_b64 s[4:5], 0
1187; GFX8-NEXT:    v_mul_f32_e32 v4, 1.0, v2
1188; GFX8-NEXT:  .LBB7_1: ; %atomicrmw.start
1189; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
1190; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1191; GFX8-NEXT:    v_mul_f32_e32 v2, 1.0, v3
1192; GFX8-NEXT:    v_min_f32_e32 v2, v2, v4
1193; GFX8-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
1194; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1195; GFX8-NEXT:    buffer_wbinvl1
1196; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
1197; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
1198; GFX8-NEXT:    v_mov_b32_e32 v3, v2
1199; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
1200; GFX8-NEXT:    s_cbranch_execnz .LBB7_1
1201; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
1202; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
1203; GFX8-NEXT:    s_setpc_b64 s[30:31]
1204;
1205; GFX7-LABEL: flat_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
1206; GFX7:       ; %bb.0:
1207; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1208; GFX7-NEXT:    v_add_i32_e32 v0, vcc, 0x7fc, v0
1209; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1210; GFX7-NEXT:    flat_atomic_fmin v[0:1], v2
1211; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1212; GFX7-NEXT:    buffer_wbinvl1
1213; GFX7-NEXT:    s_setpc_b64 s[30:31]
  ; 511 float elements = 2044 bytes (offset:2044 / 0x7fc in the checks above).
1214  %gep = getelementptr float, ptr %ptr, i64 511
  ; The atomicrmw result is never read and the function returns void.
1215  %unused = atomicrmw fmin ptr %gep, float %val seq_cst, !amdgpu.no.fine.grained.memory !0
1216  ret void
1217}
1218
1219define float @flat_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory(ptr %ptr, float %val) #0 {
1220; GFX12-LABEL: flat_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory:
1221; GFX12:       ; %bb.0:
1222; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
1223; GFX12-NEXT:    s_wait_expcnt 0x0
1224; GFX12-NEXT:    s_wait_samplecnt 0x0
1225; GFX12-NEXT:    s_wait_bvhcnt 0x0
1226; GFX12-NEXT:    s_wait_kmcnt 0x0
1227; GFX12-NEXT:    s_wait_storecnt 0x0
1228; GFX12-NEXT:    flat_atomic_min_num_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
1229; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
1230; GFX12-NEXT:    global_inv scope:SCOPE_DEV
1231; GFX12-NEXT:    s_setpc_b64 s[30:31]
1232;
1233; GFX940-LABEL: flat_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory:
1234; GFX940:       ; %bb.0:
1235; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1236; GFX940-NEXT:    flat_load_dword v3, v[0:1]
1237; GFX940-NEXT:    s_mov_b64 s[0:1], 0
1238; GFX940-NEXT:    v_max_f32_e32 v2, v2, v2
1239; GFX940-NEXT:  .LBB8_1: ; %atomicrmw.start
1240; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
1241; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1242; GFX940-NEXT:    v_mov_b32_e32 v5, v3
1243; GFX940-NEXT:    v_max_f32_e32 v3, v5, v5
1244; GFX940-NEXT:    v_min_f32_e32 v4, v3, v2
1245; GFX940-NEXT:    buffer_wbl2 sc1
1246; GFX940-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0
1247; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1248; GFX940-NEXT:    buffer_inv sc1
1249; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
1250; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
1251; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
1252; GFX940-NEXT:    s_cbranch_execnz .LBB8_1
1253; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
1254; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
1255; GFX940-NEXT:    v_mov_b32_e32 v0, v3
1256; GFX940-NEXT:    s_setpc_b64 s[30:31]
1257;
1258; GFX11-LABEL: flat_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory:
1259; GFX11:       ; %bb.0:
1260; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1261; GFX11-NEXT:    flat_load_b32 v3, v[0:1]
1262; GFX11-NEXT:    v_max_f32_e32 v2, v2, v2
1263; GFX11-NEXT:    s_mov_b32 s0, 0
1264; GFX11-NEXT:  .LBB8_1: ; %atomicrmw.start
1265; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
1266; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1267; GFX11-NEXT:    v_mov_b32_e32 v4, v3
1268; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1269; GFX11-NEXT:    v_max_f32_e32 v3, v4, v4
1270; GFX11-NEXT:    v_min_f32_e32 v3, v3, v2
1271; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
1272; GFX11-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc
1273; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1274; GFX11-NEXT:    buffer_gl1_inv
1275; GFX11-NEXT:    buffer_gl0_inv
1276; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
1277; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
1278; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
1279; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
1280; GFX11-NEXT:    s_cbranch_execnz .LBB8_1
1281; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
1282; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
1283; GFX11-NEXT:    v_mov_b32_e32 v0, v3
1284; GFX11-NEXT:    s_setpc_b64 s[30:31]
1285;
1286; GFX10-LABEL: flat_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory:
1287; GFX10:       ; %bb.0:
1288; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1289; GFX10-NEXT:    flat_load_dword v3, v[0:1]
1290; GFX10-NEXT:    v_max_f32_e32 v2, v2, v2
1291; GFX10-NEXT:    s_mov_b32 s4, 0
1292; GFX10-NEXT:  .LBB8_1: ; %atomicrmw.start
1293; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
1294; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1295; GFX10-NEXT:    v_mov_b32_e32 v4, v3
1296; GFX10-NEXT:    v_max_f32_e32 v3, v4, v4
1297; GFX10-NEXT:    v_min_f32_e32 v3, v3, v2
1298; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1299; GFX10-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
1300; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1301; GFX10-NEXT:    buffer_gl1_inv
1302; GFX10-NEXT:    buffer_gl0_inv
1303; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
1304; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
1305; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
1306; GFX10-NEXT:    s_cbranch_execnz .LBB8_1
1307; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
1308; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
1309; GFX10-NEXT:    v_mov_b32_e32 v0, v3
1310; GFX10-NEXT:    s_setpc_b64 s[30:31]
1311;
1312; GFX90A-LABEL: flat_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory:
1313; GFX90A:       ; %bb.0:
1314; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1315; GFX90A-NEXT:    flat_load_dword v3, v[0:1]
1316; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
1317; GFX90A-NEXT:    v_max_f32_e32 v2, v2, v2
1318; GFX90A-NEXT:  .LBB8_1: ; %atomicrmw.start
1319; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
1320; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1321; GFX90A-NEXT:    v_mov_b32_e32 v5, v3
1322; GFX90A-NEXT:    v_max_f32_e32 v3, v5, v5
1323; GFX90A-NEXT:    v_min_f32_e32 v4, v3, v2
1324; GFX90A-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[4:5] glc
1325; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1326; GFX90A-NEXT:    buffer_wbinvl1
1327; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
1328; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
1329; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
1330; GFX90A-NEXT:    s_cbranch_execnz .LBB8_1
1331; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
1332; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
1333; GFX90A-NEXT:    v_mov_b32_e32 v0, v3
1334; GFX90A-NEXT:    s_setpc_b64 s[30:31]
1335;
1336; GFX908-LABEL: flat_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory:
1337; GFX908:       ; %bb.0:
1338; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1339; GFX908-NEXT:    flat_load_dword v3, v[0:1]
1340; GFX908-NEXT:    s_mov_b64 s[4:5], 0
1341; GFX908-NEXT:    v_max_f32_e32 v2, v2, v2
1342; GFX908-NEXT:  .LBB8_1: ; %atomicrmw.start
1343; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
1344; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1345; GFX908-NEXT:    v_mov_b32_e32 v4, v3
1346; GFX908-NEXT:    v_max_f32_e32 v3, v4, v4
1347; GFX908-NEXT:    v_min_f32_e32 v3, v3, v2
1348; GFX908-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
1349; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1350; GFX908-NEXT:    buffer_wbinvl1
1351; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
1352; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
1353; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
1354; GFX908-NEXT:    s_cbranch_execnz .LBB8_1
1355; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
1356; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
1357; GFX908-NEXT:    v_mov_b32_e32 v0, v3
1358; GFX908-NEXT:    s_setpc_b64 s[30:31]
1359;
1360; GFX8-LABEL: flat_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory:
1361; GFX8:       ; %bb.0:
1362; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1363; GFX8-NEXT:    flat_load_dword v3, v[0:1]
1364; GFX8-NEXT:    s_mov_b64 s[4:5], 0
1365; GFX8-NEXT:    v_mul_f32_e32 v2, 1.0, v2
1366; GFX8-NEXT:  .LBB8_1: ; %atomicrmw.start
1367; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
1368; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1369; GFX8-NEXT:    v_mov_b32_e32 v4, v3
1370; GFX8-NEXT:    v_mul_f32_e32 v3, 1.0, v4
1371; GFX8-NEXT:    v_min_f32_e32 v3, v3, v2
1372; GFX8-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
1373; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1374; GFX8-NEXT:    buffer_wbinvl1
1375; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
1376; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
1377; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
1378; GFX8-NEXT:    s_cbranch_execnz .LBB8_1
1379; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
1380; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
1381; GFX8-NEXT:    v_mov_b32_e32 v0, v3
1382; GFX8-NEXT:    s_setpc_b64 s[30:31]
1383;
1384; GFX7-LABEL: flat_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory:
1385; GFX7:       ; %bb.0:
1386; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1387; GFX7-NEXT:    flat_load_dword v3, v[0:1]
1388; GFX7-NEXT:    s_mov_b64 s[4:5], 0
1389; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
1390; GFX7-NEXT:  .LBB8_1: ; %atomicrmw.start
1391; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
1392; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1393; GFX7-NEXT:    v_mov_b32_e32 v4, v3
1394; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v4
1395; GFX7-NEXT:    v_min_f32_e32 v3, v3, v2
1396; GFX7-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
1397; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1398; GFX7-NEXT:    buffer_wbinvl1
1399; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
1400; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
1401; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
1402; GFX7-NEXT:    s_cbranch_execnz .LBB8_1
1403; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
1404; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
1405; GFX7-NEXT:    v_mov_b32_e32 v0, v3
1406; GFX7-NEXT:    s_setpc_b64 s[30:31]
1407  %result = atomicrmw fmin ptr %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
1408  ret float %result
1409}
1410
; Value-returning f32 `atomicrmw fmin` on a flat pointer, agent scope, seq_cst,
; tagged with both !amdgpu.no.fine.grained.memory and !amdgpu.no.remote.memory.
; Per the checks below, the GFX12, GFX11, GFX10 and GFX7 runs expect a single
; native flat float-min atomic, while the GFX940, GFX90A, GFX908 and GFX8 runs
; expect a flat_load_dword followed by a flat_atomic_cmpswap retry loop.
; The check lines are autogenerated (see the NOTE on the first line of the
; file); regenerate them with the script rather than editing them by hand.
1411define float @flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr %ptr, float %val) #0 {
1412; GFX12-LABEL: flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
1413; GFX12:       ; %bb.0:
1414; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
1415; GFX12-NEXT:    s_wait_expcnt 0x0
1416; GFX12-NEXT:    s_wait_samplecnt 0x0
1417; GFX12-NEXT:    s_wait_bvhcnt 0x0
1418; GFX12-NEXT:    s_wait_kmcnt 0x0
1419; GFX12-NEXT:    s_wait_storecnt 0x0
1420; GFX12-NEXT:    flat_atomic_min_num_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
1421; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
1422; GFX12-NEXT:    global_inv scope:SCOPE_DEV
1423; GFX12-NEXT:    s_setpc_b64 s[30:31]
1424;
1425; GFX940-LABEL: flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
1426; GFX940:       ; %bb.0:
1427; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1428; GFX940-NEXT:    flat_load_dword v3, v[0:1]
1429; GFX940-NEXT:    s_mov_b64 s[0:1], 0
1430; GFX940-NEXT:    v_max_f32_e32 v2, v2, v2
1431; GFX940-NEXT:  .LBB9_1: ; %atomicrmw.start
1432; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
1433; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1434; GFX940-NEXT:    v_mov_b32_e32 v5, v3
1435; GFX940-NEXT:    v_max_f32_e32 v3, v5, v5
1436; GFX940-NEXT:    v_min_f32_e32 v4, v3, v2
1437; GFX940-NEXT:    buffer_wbl2 sc1
1438; GFX940-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0
1439; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1440; GFX940-NEXT:    buffer_inv sc1
1441; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
1442; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
1443; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
1444; GFX940-NEXT:    s_cbranch_execnz .LBB9_1
1445; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
1446; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
1447; GFX940-NEXT:    v_mov_b32_e32 v0, v3
1448; GFX940-NEXT:    s_setpc_b64 s[30:31]
1449;
1450; GFX11-LABEL: flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
1451; GFX11:       ; %bb.0:
1452; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1453; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
1454; GFX11-NEXT:    flat_atomic_min_f32 v0, v[0:1], v2 glc
1455; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1456; GFX11-NEXT:    buffer_gl1_inv
1457; GFX11-NEXT:    buffer_gl0_inv
1458; GFX11-NEXT:    s_setpc_b64 s[30:31]
1459;
1460; GFX10-LABEL: flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
1461; GFX10:       ; %bb.0:
1462; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1463; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1464; GFX10-NEXT:    flat_atomic_fmin v0, v[0:1], v2 glc
1465; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1466; GFX10-NEXT:    buffer_gl1_inv
1467; GFX10-NEXT:    buffer_gl0_inv
1468; GFX10-NEXT:    s_setpc_b64 s[30:31]
1469;
1470; GFX90A-LABEL: flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
1471; GFX90A:       ; %bb.0:
1472; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1473; GFX90A-NEXT:    flat_load_dword v3, v[0:1]
1474; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
1475; GFX90A-NEXT:    v_max_f32_e32 v2, v2, v2
1476; GFX90A-NEXT:  .LBB9_1: ; %atomicrmw.start
1477; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
1478; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1479; GFX90A-NEXT:    v_mov_b32_e32 v5, v3
1480; GFX90A-NEXT:    v_max_f32_e32 v3, v5, v5
1481; GFX90A-NEXT:    v_min_f32_e32 v4, v3, v2
1482; GFX90A-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[4:5] glc
1483; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1484; GFX90A-NEXT:    buffer_wbinvl1
1485; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
1486; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
1487; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
1488; GFX90A-NEXT:    s_cbranch_execnz .LBB9_1
1489; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
1490; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
1491; GFX90A-NEXT:    v_mov_b32_e32 v0, v3
1492; GFX90A-NEXT:    s_setpc_b64 s[30:31]
1493;
1494; GFX908-LABEL: flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
1495; GFX908:       ; %bb.0:
1496; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1497; GFX908-NEXT:    flat_load_dword v3, v[0:1]
1498; GFX908-NEXT:    s_mov_b64 s[4:5], 0
1499; GFX908-NEXT:    v_max_f32_e32 v2, v2, v2
1500; GFX908-NEXT:  .LBB9_1: ; %atomicrmw.start
1501; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
1502; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1503; GFX908-NEXT:    v_mov_b32_e32 v4, v3
1504; GFX908-NEXT:    v_max_f32_e32 v3, v4, v4
1505; GFX908-NEXT:    v_min_f32_e32 v3, v3, v2
1506; GFX908-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
1507; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1508; GFX908-NEXT:    buffer_wbinvl1
1509; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
1510; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
1511; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
1512; GFX908-NEXT:    s_cbranch_execnz .LBB9_1
1513; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
1514; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
1515; GFX908-NEXT:    v_mov_b32_e32 v0, v3
1516; GFX908-NEXT:    s_setpc_b64 s[30:31]
1517;
1518; GFX8-LABEL: flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
1519; GFX8:       ; %bb.0:
1520; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1521; GFX8-NEXT:    flat_load_dword v3, v[0:1]
1522; GFX8-NEXT:    s_mov_b64 s[4:5], 0
1523; GFX8-NEXT:    v_mul_f32_e32 v2, 1.0, v2
1524; GFX8-NEXT:  .LBB9_1: ; %atomicrmw.start
1525; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
1526; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1527; GFX8-NEXT:    v_mov_b32_e32 v4, v3
1528; GFX8-NEXT:    v_mul_f32_e32 v3, 1.0, v4
1529; GFX8-NEXT:    v_min_f32_e32 v3, v3, v2
1530; GFX8-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
1531; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1532; GFX8-NEXT:    buffer_wbinvl1
1533; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
1534; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
1535; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
1536; GFX8-NEXT:    s_cbranch_execnz .LBB9_1
1537; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
1538; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
1539; GFX8-NEXT:    v_mov_b32_e32 v0, v3
1540; GFX8-NEXT:    s_setpc_b64 s[30:31]
1541;
1542; GFX7-LABEL: flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
1543; GFX7:       ; %bb.0:
1544; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1545; GFX7-NEXT:    flat_atomic_fmin v0, v[0:1], v2 glc
1546; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1547; GFX7-NEXT:    buffer_wbinvl1
1548; GFX7-NEXT:    s_setpc_b64 s[30:31]
1549  %result = atomicrmw fmin ptr %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
1550  ret float %result
1551}
1552
1553; --------------------------------------------------------------------
1554; float with ftz/daz
1555; --------------------------------------------------------------------
1556
; Value-returning f32 `atomicrmw fmin` on a flat pointer, agent scope, seq_cst,
; tagged with !amdgpu.no.fine.grained.memory only, compiled under attribute
; group #1.
; NOTE(review): #1 is presumably the ftz/daz attribute set named by the section
; banner and the "__ftz__" in the function name; its definition is not visible
; in this part of the file, so confirm there.
; Per the checks below, the GFX12, GFX11, GFX10 and GFX7 runs expect a single
; native flat float-min atomic, while the GFX940, GFX90A, GFX908 and GFX8 runs
; expect a flat_load_dword followed by a flat_atomic_cmpswap retry loop.
1557define float @flat_agent_atomic_fmin_ret_f32__ftz__amdgpu_no_fine_grained_memory(ptr %ptr, float %val) #1 {
1558; GFX12-LABEL: flat_agent_atomic_fmin_ret_f32__ftz__amdgpu_no_fine_grained_memory:
1559; GFX12:       ; %bb.0:
1560; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
1561; GFX12-NEXT:    s_wait_expcnt 0x0
1562; GFX12-NEXT:    s_wait_samplecnt 0x0
1563; GFX12-NEXT:    s_wait_bvhcnt 0x0
1564; GFX12-NEXT:    s_wait_kmcnt 0x0
1565; GFX12-NEXT:    s_wait_storecnt 0x0
1566; GFX12-NEXT:    flat_atomic_min_num_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
1567; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
1568; GFX12-NEXT:    global_inv scope:SCOPE_DEV
1569; GFX12-NEXT:    s_setpc_b64 s[30:31]
1570;
1571; GFX940-LABEL: flat_agent_atomic_fmin_ret_f32__ftz__amdgpu_no_fine_grained_memory:
1572; GFX940:       ; %bb.0:
1573; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1574; GFX940-NEXT:    flat_load_dword v3, v[0:1]
1575; GFX940-NEXT:    s_mov_b64 s[0:1], 0
1576; GFX940-NEXT:    v_max_f32_e32 v2, v2, v2
1577; GFX940-NEXT:  .LBB10_1: ; %atomicrmw.start
1578; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
1579; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1580; GFX940-NEXT:    v_mov_b32_e32 v5, v3
1581; GFX940-NEXT:    v_max_f32_e32 v3, v5, v5
1582; GFX940-NEXT:    v_min_f32_e32 v4, v3, v2
1583; GFX940-NEXT:    buffer_wbl2 sc1
1584; GFX940-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0
1585; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1586; GFX940-NEXT:    buffer_inv sc1
1587; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
1588; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
1589; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
1590; GFX940-NEXT:    s_cbranch_execnz .LBB10_1
1591; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
1592; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
1593; GFX940-NEXT:    v_mov_b32_e32 v0, v3
1594; GFX940-NEXT:    s_setpc_b64 s[30:31]
1595;
1596; GFX11-LABEL: flat_agent_atomic_fmin_ret_f32__ftz__amdgpu_no_fine_grained_memory:
1597; GFX11:       ; %bb.0:
1598; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1599; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
1600; GFX11-NEXT:    flat_atomic_min_f32 v0, v[0:1], v2 glc
1601; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1602; GFX11-NEXT:    buffer_gl1_inv
1603; GFX11-NEXT:    buffer_gl0_inv
1604; GFX11-NEXT:    s_setpc_b64 s[30:31]
1605;
1606; GFX10-LABEL: flat_agent_atomic_fmin_ret_f32__ftz__amdgpu_no_fine_grained_memory:
1607; GFX10:       ; %bb.0:
1608; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1609; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1610; GFX10-NEXT:    flat_atomic_fmin v0, v[0:1], v2 glc
1611; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1612; GFX10-NEXT:    buffer_gl1_inv
1613; GFX10-NEXT:    buffer_gl0_inv
1614; GFX10-NEXT:    s_setpc_b64 s[30:31]
1615;
1616; GFX90A-LABEL: flat_agent_atomic_fmin_ret_f32__ftz__amdgpu_no_fine_grained_memory:
1617; GFX90A:       ; %bb.0:
1618; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1619; GFX90A-NEXT:    flat_load_dword v3, v[0:1]
1620; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
1621; GFX90A-NEXT:    v_max_f32_e32 v2, v2, v2
1622; GFX90A-NEXT:  .LBB10_1: ; %atomicrmw.start
1623; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
1624; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1625; GFX90A-NEXT:    v_mov_b32_e32 v5, v3
1626; GFX90A-NEXT:    v_max_f32_e32 v3, v5, v5
1627; GFX90A-NEXT:    v_min_f32_e32 v4, v3, v2
1628; GFX90A-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[4:5] glc
1629; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1630; GFX90A-NEXT:    buffer_wbinvl1
1631; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
1632; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
1633; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
1634; GFX90A-NEXT:    s_cbranch_execnz .LBB10_1
1635; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
1636; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
1637; GFX90A-NEXT:    v_mov_b32_e32 v0, v3
1638; GFX90A-NEXT:    s_setpc_b64 s[30:31]
1639;
1640; GFX908-LABEL: flat_agent_atomic_fmin_ret_f32__ftz__amdgpu_no_fine_grained_memory:
1641; GFX908:       ; %bb.0:
1642; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1643; GFX908-NEXT:    flat_load_dword v3, v[0:1]
1644; GFX908-NEXT:    s_mov_b64 s[4:5], 0
1645; GFX908-NEXT:    v_max_f32_e32 v2, v2, v2
1646; GFX908-NEXT:  .LBB10_1: ; %atomicrmw.start
1647; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
1648; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1649; GFX908-NEXT:    v_mov_b32_e32 v4, v3
1650; GFX908-NEXT:    v_max_f32_e32 v3, v4, v4
1651; GFX908-NEXT:    v_min_f32_e32 v3, v3, v2
1652; GFX908-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
1653; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1654; GFX908-NEXT:    buffer_wbinvl1
1655; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
1656; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
1657; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
1658; GFX908-NEXT:    s_cbranch_execnz .LBB10_1
1659; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
1660; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
1661; GFX908-NEXT:    v_mov_b32_e32 v0, v3
1662; GFX908-NEXT:    s_setpc_b64 s[30:31]
1663;
1664; GFX8-LABEL: flat_agent_atomic_fmin_ret_f32__ftz__amdgpu_no_fine_grained_memory:
1665; GFX8:       ; %bb.0:
1666; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1667; GFX8-NEXT:    flat_load_dword v3, v[0:1]
1668; GFX8-NEXT:    s_mov_b64 s[4:5], 0
1669; GFX8-NEXT:    v_mul_f32_e32 v2, 1.0, v2
1670; GFX8-NEXT:  .LBB10_1: ; %atomicrmw.start
1671; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
1672; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1673; GFX8-NEXT:    v_mov_b32_e32 v4, v3
1674; GFX8-NEXT:    v_mul_f32_e32 v3, 1.0, v4
1675; GFX8-NEXT:    v_min_f32_e32 v3, v3, v2
1676; GFX8-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
1677; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1678; GFX8-NEXT:    buffer_wbinvl1
1679; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
1680; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
1681; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
1682; GFX8-NEXT:    s_cbranch_execnz .LBB10_1
1683; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
1684; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
1685; GFX8-NEXT:    v_mov_b32_e32 v0, v3
1686; GFX8-NEXT:    s_setpc_b64 s[30:31]
1687;
1688; GFX7-LABEL: flat_agent_atomic_fmin_ret_f32__ftz__amdgpu_no_fine_grained_memory:
1689; GFX7:       ; %bb.0:
1690; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1691; GFX7-NEXT:    flat_atomic_fmin v0, v[0:1], v2 glc
1692; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1693; GFX7-NEXT:    buffer_wbinvl1
1694; GFX7-NEXT:    s_setpc_b64 s[30:31]
1695  %result = atomicrmw fmin ptr %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
1696  ret float %result
1697}
1698
; Positive-offset variant of the value-returning f32 fmin test: the atomic
; operates on element 511 of a float array, which the checks below show as a
; byte offset of 2044 (0x7fc).
; Per the checks, the GFX12 and GFX11 runs fold the offset into the native
; atomic, and the GFX940, GFX90A and GFX908 runs fold it into both the initial
; load and the flat_atomic_cmpswap of their retry loop. The GFX10, GFX8 and
; GFX7 runs instead add 0x7fc to the 64-bit pointer with an explicit add/carry
; pair before the memory access.
; NOTE(review): attribute group #1 is presumably the ftz/daz set named by the
; section banner; its definition is not visible in this part of the file.
1699define float @flat_agent_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory(ptr %ptr, float %val) #1 {
1700; GFX12-LABEL: flat_agent_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
1701; GFX12:       ; %bb.0:
1702; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
1703; GFX12-NEXT:    s_wait_expcnt 0x0
1704; GFX12-NEXT:    s_wait_samplecnt 0x0
1705; GFX12-NEXT:    s_wait_bvhcnt 0x0
1706; GFX12-NEXT:    s_wait_kmcnt 0x0
1707; GFX12-NEXT:    s_wait_storecnt 0x0
1708; GFX12-NEXT:    flat_atomic_min_num_f32 v0, v[0:1], v2 offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
1709; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
1710; GFX12-NEXT:    global_inv scope:SCOPE_DEV
1711; GFX12-NEXT:    s_setpc_b64 s[30:31]
1712;
1713; GFX940-LABEL: flat_agent_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
1714; GFX940:       ; %bb.0:
1715; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1716; GFX940-NEXT:    flat_load_dword v3, v[0:1] offset:2044
1717; GFX940-NEXT:    s_mov_b64 s[0:1], 0
1718; GFX940-NEXT:    v_max_f32_e32 v2, v2, v2
1719; GFX940-NEXT:  .LBB11_1: ; %atomicrmw.start
1720; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
1721; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1722; GFX940-NEXT:    v_mov_b32_e32 v5, v3
1723; GFX940-NEXT:    v_max_f32_e32 v3, v5, v5
1724; GFX940-NEXT:    v_min_f32_e32 v4, v3, v2
1725; GFX940-NEXT:    buffer_wbl2 sc1
1726; GFX940-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0
1727; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1728; GFX940-NEXT:    buffer_inv sc1
1729; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
1730; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
1731; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
1732; GFX940-NEXT:    s_cbranch_execnz .LBB11_1
1733; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
1734; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
1735; GFX940-NEXT:    v_mov_b32_e32 v0, v3
1736; GFX940-NEXT:    s_setpc_b64 s[30:31]
1737;
1738; GFX11-LABEL: flat_agent_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
1739; GFX11:       ; %bb.0:
1740; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1741; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
1742; GFX11-NEXT:    flat_atomic_min_f32 v0, v[0:1], v2 offset:2044 glc
1743; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1744; GFX11-NEXT:    buffer_gl1_inv
1745; GFX11-NEXT:    buffer_gl0_inv
1746; GFX11-NEXT:    s_setpc_b64 s[30:31]
1747;
1748; GFX10-LABEL: flat_agent_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
1749; GFX10:       ; %bb.0:
1750; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1751; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, 0x7fc, v0
1752; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
1753; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1754; GFX10-NEXT:    flat_atomic_fmin v0, v[0:1], v2 glc
1755; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1756; GFX10-NEXT:    buffer_gl1_inv
1757; GFX10-NEXT:    buffer_gl0_inv
1758; GFX10-NEXT:    s_setpc_b64 s[30:31]
1759;
1760; GFX90A-LABEL: flat_agent_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
1761; GFX90A:       ; %bb.0:
1762; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1763; GFX90A-NEXT:    flat_load_dword v3, v[0:1] offset:2044
1764; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
1765; GFX90A-NEXT:    v_max_f32_e32 v2, v2, v2
1766; GFX90A-NEXT:  .LBB11_1: ; %atomicrmw.start
1767; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
1768; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1769; GFX90A-NEXT:    v_mov_b32_e32 v5, v3
1770; GFX90A-NEXT:    v_max_f32_e32 v3, v5, v5
1771; GFX90A-NEXT:    v_min_f32_e32 v4, v3, v2
1772; GFX90A-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc
1773; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1774; GFX90A-NEXT:    buffer_wbinvl1
1775; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
1776; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
1777; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
1778; GFX90A-NEXT:    s_cbranch_execnz .LBB11_1
1779; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
1780; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
1781; GFX90A-NEXT:    v_mov_b32_e32 v0, v3
1782; GFX90A-NEXT:    s_setpc_b64 s[30:31]
1783;
1784; GFX908-LABEL: flat_agent_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
1785; GFX908:       ; %bb.0:
1786; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1787; GFX908-NEXT:    flat_load_dword v3, v[0:1] offset:2044
1788; GFX908-NEXT:    s_mov_b64 s[4:5], 0
1789; GFX908-NEXT:    v_max_f32_e32 v2, v2, v2
1790; GFX908-NEXT:  .LBB11_1: ; %atomicrmw.start
1791; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
1792; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1793; GFX908-NEXT:    v_mov_b32_e32 v4, v3
1794; GFX908-NEXT:    v_max_f32_e32 v3, v4, v4
1795; GFX908-NEXT:    v_min_f32_e32 v3, v3, v2
1796; GFX908-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc
1797; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1798; GFX908-NEXT:    buffer_wbinvl1
1799; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
1800; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
1801; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
1802; GFX908-NEXT:    s_cbranch_execnz .LBB11_1
1803; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
1804; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
1805; GFX908-NEXT:    v_mov_b32_e32 v0, v3
1806; GFX908-NEXT:    s_setpc_b64 s[30:31]
1807;
1808; GFX8-LABEL: flat_agent_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
1809; GFX8:       ; %bb.0:
1810; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1811; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x7fc, v0
1812; GFX8-NEXT:    v_addc_u32_e32 v4, vcc, 0, v1, vcc
1813; GFX8-NEXT:    flat_load_dword v0, v[3:4]
1814; GFX8-NEXT:    s_mov_b64 s[4:5], 0
1815; GFX8-NEXT:    v_mul_f32_e32 v1, 1.0, v2
1816; GFX8-NEXT:  .LBB11_1: ; %atomicrmw.start
1817; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
1818; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1819; GFX8-NEXT:    v_mov_b32_e32 v6, v0
1820; GFX8-NEXT:    v_mul_f32_e32 v0, 1.0, v6
1821; GFX8-NEXT:    v_min_f32_e32 v5, v0, v1
1822; GFX8-NEXT:    flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
1823; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1824; GFX8-NEXT:    buffer_wbinvl1
1825; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v6
1826; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
1827; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
1828; GFX8-NEXT:    s_cbranch_execnz .LBB11_1
1829; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
1830; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
1831; GFX8-NEXT:    s_setpc_b64 s[30:31]
1832;
1833; GFX7-LABEL: flat_agent_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
1834; GFX7:       ; %bb.0:
1835; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1836; GFX7-NEXT:    v_add_i32_e32 v0, vcc, 0x7fc, v0
1837; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1838; GFX7-NEXT:    flat_atomic_fmin v0, v[0:1], v2 glc
1839; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1840; GFX7-NEXT:    buffer_wbinvl1
1841; GFX7-NEXT:    s_setpc_b64 s[30:31]
1842  %gep = getelementptr float, ptr %ptr, i64 511
1843  %result = atomicrmw fmin ptr %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
1844  ret float %result
1845}
1846
; Negative-offset variant of the value-returning f32 fmin test: the atomic
; operates on element -512 of a float array, which the checks below show as a
; byte offset of -2048.
; Per the checks, only the GFX12 run folds the offset into the instruction
; (offset:-2048). Every other run materializes the address with a 64-bit add
; of 0xfffff800 to the low word and -1 (plus carry) to the high word.
; The GFX11, GFX10 and GFX7 runs then use a native float-min atomic, while the
; GFX940, GFX90A, GFX908 and GFX8 runs use a flat_atomic_cmpswap retry loop.
; In the GFX940, GFX90A and GFX908 output the offset address is computed
; twice, once for the initial load and once for the cmpswap.
; NOTE(review): attribute group #1 is presumably the ftz/daz set named by the
; section banner; its definition is not visible in this part of the file.
1847define float @flat_agent_atomic_fmin_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory(ptr %ptr, float %val) #1 {
1848; GFX12-LABEL: flat_agent_atomic_fmin_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory:
1849; GFX12:       ; %bb.0:
1850; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
1851; GFX12-NEXT:    s_wait_expcnt 0x0
1852; GFX12-NEXT:    s_wait_samplecnt 0x0
1853; GFX12-NEXT:    s_wait_bvhcnt 0x0
1854; GFX12-NEXT:    s_wait_kmcnt 0x0
1855; GFX12-NEXT:    s_wait_storecnt 0x0
1856; GFX12-NEXT:    flat_atomic_min_num_f32 v0, v[0:1], v2 offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
1857; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
1858; GFX12-NEXT:    global_inv scope:SCOPE_DEV
1859; GFX12-NEXT:    s_setpc_b64 s[30:31]
1860;
1861; GFX940-LABEL: flat_agent_atomic_fmin_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory:
1862; GFX940:       ; %bb.0:
1863; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1864; GFX940-NEXT:    v_mov_b32_e32 v4, v0
1865; GFX940-NEXT:    v_mov_b32_e32 v5, v1
1866; GFX940-NEXT:    v_add_co_u32_e32 v0, vcc, 0xfffff800, v4
1867; GFX940-NEXT:    s_movk_i32 s0, 0xf800
1868; GFX940-NEXT:    s_nop 0
1869; GFX940-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v5, vcc
1870; GFX940-NEXT:    flat_load_dword v0, v[0:1]
1871; GFX940-NEXT:    s_mov_b32 s1, -1
1872; GFX940-NEXT:    v_lshl_add_u64 v[4:5], v[4:5], 0, s[0:1]
1873; GFX940-NEXT:    s_mov_b64 s[0:1], 0
1874; GFX940-NEXT:    v_max_f32_e32 v1, v2, v2
1875; GFX940-NEXT:  .LBB12_1: ; %atomicrmw.start
1876; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
1877; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1878; GFX940-NEXT:    v_mov_b32_e32 v3, v0
1879; GFX940-NEXT:    v_max_f32_e32 v0, v3, v3
1880; GFX940-NEXT:    v_min_f32_e32 v2, v0, v1
1881; GFX940-NEXT:    buffer_wbl2 sc1
1882; GFX940-NEXT:    flat_atomic_cmpswap v0, v[4:5], v[2:3] sc0
1883; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1884; GFX940-NEXT:    buffer_inv sc1
1885; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v3
1886; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
1887; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
1888; GFX940-NEXT:    s_cbranch_execnz .LBB12_1
1889; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
1890; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
1891; GFX940-NEXT:    s_setpc_b64 s[30:31]
1892;
1893; GFX11-LABEL: flat_agent_atomic_fmin_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory:
1894; GFX11:       ; %bb.0:
1895; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1896; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
1897; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
1898; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
1899; GFX11-NEXT:    flat_atomic_min_f32 v0, v[0:1], v2 glc
1900; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1901; GFX11-NEXT:    buffer_gl1_inv
1902; GFX11-NEXT:    buffer_gl0_inv
1903; GFX11-NEXT:    s_setpc_b64 s[30:31]
1904;
1905; GFX10-LABEL: flat_agent_atomic_fmin_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory:
1906; GFX10:       ; %bb.0:
1907; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1908; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
1909; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
1910; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1911; GFX10-NEXT:    flat_atomic_fmin v0, v[0:1], v2 glc
1912; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1913; GFX10-NEXT:    buffer_gl1_inv
1914; GFX10-NEXT:    buffer_gl0_inv
1915; GFX10-NEXT:    s_setpc_b64 s[30:31]
1916;
1917; GFX90A-LABEL: flat_agent_atomic_fmin_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory:
1918; GFX90A:       ; %bb.0:
1919; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1920; GFX90A-NEXT:    v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
1921; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, -1, v1, vcc
1922; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
1923; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
1924; GFX90A-NEXT:    flat_load_dword v0, v[0:1]
1925; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
1926; GFX90A-NEXT:    v_max_f32_e32 v1, v2, v2
1927; GFX90A-NEXT:  .LBB12_1: ; %atomicrmw.start
1928; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
1929; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1930; GFX90A-NEXT:    v_mov_b32_e32 v3, v0
1931; GFX90A-NEXT:    v_max_f32_e32 v0, v3, v3
1932; GFX90A-NEXT:    v_min_f32_e32 v2, v0, v1
1933; GFX90A-NEXT:    flat_atomic_cmpswap v0, v[4:5], v[2:3] glc
1934; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1935; GFX90A-NEXT:    buffer_wbinvl1
1936; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v3
1937; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
1938; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
1939; GFX90A-NEXT:    s_cbranch_execnz .LBB12_1
1940; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
1941; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
1942; GFX90A-NEXT:    s_setpc_b64 s[30:31]
1943;
1944; GFX908-LABEL: flat_agent_atomic_fmin_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory:
1945; GFX908:       ; %bb.0:
1946; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1947; GFX908-NEXT:    v_add_co_u32_e32 v3, vcc, 0xfffff800, v0
1948; GFX908-NEXT:    v_addc_co_u32_e32 v4, vcc, -1, v1, vcc
1949; GFX908-NEXT:    v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
1950; GFX908-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
1951; GFX908-NEXT:    flat_load_dword v0, v[0:1]
1952; GFX908-NEXT:    s_mov_b64 s[4:5], 0
1953; GFX908-NEXT:    v_max_f32_e32 v1, v2, v2
1954; GFX908-NEXT:  .LBB12_1: ; %atomicrmw.start
1955; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
1956; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1957; GFX908-NEXT:    v_mov_b32_e32 v6, v0
1958; GFX908-NEXT:    v_max_f32_e32 v0, v6, v6
1959; GFX908-NEXT:    v_min_f32_e32 v5, v0, v1
1960; GFX908-NEXT:    flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
1961; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1962; GFX908-NEXT:    buffer_wbinvl1
1963; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v6
1964; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
1965; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
1966; GFX908-NEXT:    s_cbranch_execnz .LBB12_1
1967; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
1968; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
1969; GFX908-NEXT:    s_setpc_b64 s[30:31]
1970;
1971; GFX8-LABEL: flat_agent_atomic_fmin_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory:
1972; GFX8:       ; %bb.0:
1973; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1974; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0xfffff800, v0
1975; GFX8-NEXT:    v_addc_u32_e32 v4, vcc, -1, v1, vcc
1976; GFX8-NEXT:    flat_load_dword v0, v[3:4]
1977; GFX8-NEXT:    s_mov_b64 s[4:5], 0
1978; GFX8-NEXT:    v_mul_f32_e32 v1, 1.0, v2
1979; GFX8-NEXT:  .LBB12_1: ; %atomicrmw.start
1980; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
1981; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1982; GFX8-NEXT:    v_mov_b32_e32 v6, v0
1983; GFX8-NEXT:    v_mul_f32_e32 v0, 1.0, v6
1984; GFX8-NEXT:    v_min_f32_e32 v5, v0, v1
1985; GFX8-NEXT:    flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
1986; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1987; GFX8-NEXT:    buffer_wbinvl1
1988; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v6
1989; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
1990; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
1991; GFX8-NEXT:    s_cbranch_execnz .LBB12_1
1992; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
1993; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
1994; GFX8-NEXT:    s_setpc_b64 s[30:31]
1995;
1996; GFX7-LABEL: flat_agent_atomic_fmin_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory:
1997; GFX7:       ; %bb.0:
1998; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1999; GFX7-NEXT:    v_add_i32_e32 v0, vcc, 0xfffff800, v0
2000; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, -1, v1, vcc
2001; GFX7-NEXT:    flat_atomic_fmin v0, v[0:1], v2 glc
2002; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2003; GFX7-NEXT:    buffer_wbinvl1
2004; GFX7-NEXT:    s_setpc_b64 s[30:31]
2005  %gep = getelementptr float, ptr %ptr, i64 -512
2006  %result = atomicrmw fmin ptr %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
2007  ret float %result
2008}
2009
; Agent-scope, seq_cst `atomicrmw fmin` of a float through a generic `ptr`
; with no address offset. The loaded value is never used, so where a native
; atomic is selected the assertions expect the form without a destination VGPR.
; Per the autogenerated assertions below, the gfx1200, gfx1100, gfx1010 and
; hawaii runs emit a single flat atomic min instruction, while the gfx940,
; gfx90a, gfx908 and tonga runs emit a flat_atomic_cmpswap retry loop.
; NOTE(review): attribute group #1 is defined outside this excerpt; the "__ftz"
; in the function name suggests it selects flush-to-zero denormal handling -
; confirm against the attribute definitions at the end of the file.
2010define void @flat_agent_atomic_fmin_noret_f32__ftz__amdgpu_no_fine_grained_memory(ptr %ptr, float %val) #1 {
2011; GFX12-LABEL: flat_agent_atomic_fmin_noret_f32__ftz__amdgpu_no_fine_grained_memory:
2012; GFX12:       ; %bb.0:
2013; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
2014; GFX12-NEXT:    s_wait_expcnt 0x0
2015; GFX12-NEXT:    s_wait_samplecnt 0x0
2016; GFX12-NEXT:    s_wait_bvhcnt 0x0
2017; GFX12-NEXT:    s_wait_kmcnt 0x0
2018; GFX12-NEXT:    s_wait_storecnt 0x0
2019; GFX12-NEXT:    flat_atomic_min_num_f32 v[0:1], v2 scope:SCOPE_DEV
2020; GFX12-NEXT:    s_wait_storecnt_dscnt 0x0
2021; GFX12-NEXT:    global_inv scope:SCOPE_DEV
2022; GFX12-NEXT:    s_setpc_b64 s[30:31]
2023;
2024; GFX940-LABEL: flat_agent_atomic_fmin_noret_f32__ftz__amdgpu_no_fine_grained_memory:
2025; GFX940:       ; %bb.0:
2026; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2027; GFX940-NEXT:    flat_load_dword v3, v[0:1]
2028; GFX940-NEXT:    s_mov_b64 s[0:1], 0
2029; GFX940-NEXT:    v_max_f32_e32 v4, v2, v2
2030; GFX940-NEXT:  .LBB13_1: ; %atomicrmw.start
2031; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
2032; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2033; GFX940-NEXT:    v_max_f32_e32 v2, v3, v3
2034; GFX940-NEXT:    v_min_f32_e32 v2, v2, v4
2035; GFX940-NEXT:    buffer_wbl2 sc1
2036; GFX940-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0
2037; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2038; GFX940-NEXT:    buffer_inv sc1
2039; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
2040; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
2041; GFX940-NEXT:    v_mov_b32_e32 v3, v2
2042; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
2043; GFX940-NEXT:    s_cbranch_execnz .LBB13_1
2044; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
2045; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
2046; GFX940-NEXT:    s_setpc_b64 s[30:31]
2047;
2048; GFX11-LABEL: flat_agent_atomic_fmin_noret_f32__ftz__amdgpu_no_fine_grained_memory:
2049; GFX11:       ; %bb.0:
2050; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2051; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
2052; GFX11-NEXT:    flat_atomic_min_f32 v[0:1], v2
2053; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
2054; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
2055; GFX11-NEXT:    buffer_gl1_inv
2056; GFX11-NEXT:    buffer_gl0_inv
2057; GFX11-NEXT:    s_setpc_b64 s[30:31]
2058;
2059; GFX10-LABEL: flat_agent_atomic_fmin_noret_f32__ftz__amdgpu_no_fine_grained_memory:
2060; GFX10:       ; %bb.0:
2061; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2062; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
2063; GFX10-NEXT:    flat_atomic_fmin v[0:1], v2
2064; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
2065; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
2066; GFX10-NEXT:    buffer_gl1_inv
2067; GFX10-NEXT:    buffer_gl0_inv
2068; GFX10-NEXT:    s_setpc_b64 s[30:31]
2069;
2070; GFX90A-LABEL: flat_agent_atomic_fmin_noret_f32__ftz__amdgpu_no_fine_grained_memory:
2071; GFX90A:       ; %bb.0:
2072; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2073; GFX90A-NEXT:    flat_load_dword v3, v[0:1]
2074; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
2075; GFX90A-NEXT:    v_max_f32_e32 v4, v2, v2
2076; GFX90A-NEXT:  .LBB13_1: ; %atomicrmw.start
2077; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
2078; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2079; GFX90A-NEXT:    v_max_f32_e32 v2, v3, v3
2080; GFX90A-NEXT:    v_min_f32_e32 v2, v2, v4
2081; GFX90A-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
2082; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2083; GFX90A-NEXT:    buffer_wbinvl1
2084; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
2085; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
2086; GFX90A-NEXT:    v_mov_b32_e32 v3, v2
2087; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
2088; GFX90A-NEXT:    s_cbranch_execnz .LBB13_1
2089; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
2090; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
2091; GFX90A-NEXT:    s_setpc_b64 s[30:31]
2092;
2093; GFX908-LABEL: flat_agent_atomic_fmin_noret_f32__ftz__amdgpu_no_fine_grained_memory:
2094; GFX908:       ; %bb.0:
2095; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2096; GFX908-NEXT:    flat_load_dword v3, v[0:1]
2097; GFX908-NEXT:    s_mov_b64 s[4:5], 0
2098; GFX908-NEXT:    v_max_f32_e32 v4, v2, v2
2099; GFX908-NEXT:  .LBB13_1: ; %atomicrmw.start
2100; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
2101; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2102; GFX908-NEXT:    v_max_f32_e32 v2, v3, v3
2103; GFX908-NEXT:    v_min_f32_e32 v2, v2, v4
2104; GFX908-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
2105; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2106; GFX908-NEXT:    buffer_wbinvl1
2107; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
2108; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
2109; GFX908-NEXT:    v_mov_b32_e32 v3, v2
2110; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
2111; GFX908-NEXT:    s_cbranch_execnz .LBB13_1
2112; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
2113; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
2114; GFX908-NEXT:    s_setpc_b64 s[30:31]
2115;
2116; GFX8-LABEL: flat_agent_atomic_fmin_noret_f32__ftz__amdgpu_no_fine_grained_memory:
2117; GFX8:       ; %bb.0:
2118; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2119; GFX8-NEXT:    flat_load_dword v3, v[0:1]
2120; GFX8-NEXT:    s_mov_b64 s[4:5], 0
2121; GFX8-NEXT:    v_mul_f32_e32 v4, 1.0, v2
2122; GFX8-NEXT:  .LBB13_1: ; %atomicrmw.start
2123; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
2124; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2125; GFX8-NEXT:    v_mul_f32_e32 v2, 1.0, v3
2126; GFX8-NEXT:    v_min_f32_e32 v2, v2, v4
2127; GFX8-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
2128; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2129; GFX8-NEXT:    buffer_wbinvl1
2130; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
2131; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
2132; GFX8-NEXT:    v_mov_b32_e32 v3, v2
2133; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
2134; GFX8-NEXT:    s_cbranch_execnz .LBB13_1
2135; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
2136; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
2137; GFX8-NEXT:    s_setpc_b64 s[30:31]
2138;
2139; GFX7-LABEL: flat_agent_atomic_fmin_noret_f32__ftz__amdgpu_no_fine_grained_memory:
2140; GFX7:       ; %bb.0:
2141; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2142; GFX7-NEXT:    flat_atomic_fmin v[0:1], v2
2143; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2144; GFX7-NEXT:    buffer_wbinvl1
2145; GFX7-NEXT:    s_setpc_b64 s[30:31]
; The atomic's old value is bound to %unused and never read, which is what
; makes this the "noret" variant of the test.
2146  %unused = atomicrmw fmin ptr %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
2147  ret void
2148}
2149
; Agent-scope, seq_cst `atomicrmw fmin` of a float at a positive offset of
; 511 floats from %ptr, with the old value unused.
; Per the autogenerated assertions below, the gfx1200, gfx940, gfx1100, gfx90a
; and gfx908 runs fold the offset into the memory instruction (offset:2044),
; while the gfx1010, tonga and hawaii runs add 0x7fc to the 64-bit address
; first. The gfx1200, gfx1100, gfx1010 and hawaii runs emit a single flat
; atomic min; the gfx940, gfx90a, gfx908 and tonga runs emit a
; flat_atomic_cmpswap retry loop.
; NOTE(review): attribute group #1 is defined outside this excerpt; the "__ftz"
; in the function name suggests it selects flush-to-zero denormal handling -
; confirm against the attribute definitions at the end of the file.
2150define void @flat_agent_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory(ptr %ptr, float %val) #1 {
2151; GFX12-LABEL: flat_agent_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
2152; GFX12:       ; %bb.0:
2153; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
2154; GFX12-NEXT:    s_wait_expcnt 0x0
2155; GFX12-NEXT:    s_wait_samplecnt 0x0
2156; GFX12-NEXT:    s_wait_bvhcnt 0x0
2157; GFX12-NEXT:    s_wait_kmcnt 0x0
2158; GFX12-NEXT:    s_wait_storecnt 0x0
2159; GFX12-NEXT:    flat_atomic_min_num_f32 v[0:1], v2 offset:2044 scope:SCOPE_DEV
2160; GFX12-NEXT:    s_wait_storecnt_dscnt 0x0
2161; GFX12-NEXT:    global_inv scope:SCOPE_DEV
2162; GFX12-NEXT:    s_setpc_b64 s[30:31]
2163;
2164; GFX940-LABEL: flat_agent_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
2165; GFX940:       ; %bb.0:
2166; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2167; GFX940-NEXT:    flat_load_dword v3, v[0:1] offset:2044
2168; GFX940-NEXT:    s_mov_b64 s[0:1], 0
2169; GFX940-NEXT:    v_max_f32_e32 v4, v2, v2
2170; GFX940-NEXT:  .LBB14_1: ; %atomicrmw.start
2171; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
2172; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2173; GFX940-NEXT:    v_max_f32_e32 v2, v3, v3
2174; GFX940-NEXT:    v_min_f32_e32 v2, v2, v4
2175; GFX940-NEXT:    buffer_wbl2 sc1
2176; GFX940-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0
2177; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2178; GFX940-NEXT:    buffer_inv sc1
2179; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
2180; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
2181; GFX940-NEXT:    v_mov_b32_e32 v3, v2
2182; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
2183; GFX940-NEXT:    s_cbranch_execnz .LBB14_1
2184; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
2185; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
2186; GFX940-NEXT:    s_setpc_b64 s[30:31]
2187;
2188; GFX11-LABEL: flat_agent_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
2189; GFX11:       ; %bb.0:
2190; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2191; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
2192; GFX11-NEXT:    flat_atomic_min_f32 v[0:1], v2 offset:2044
2193; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
2194; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
2195; GFX11-NEXT:    buffer_gl1_inv
2196; GFX11-NEXT:    buffer_gl0_inv
2197; GFX11-NEXT:    s_setpc_b64 s[30:31]
2198;
2199; GFX10-LABEL: flat_agent_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
2200; GFX10:       ; %bb.0:
2201; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2202; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, 0x7fc, v0
2203; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
2204; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
2205; GFX10-NEXT:    flat_atomic_fmin v[0:1], v2
2206; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
2207; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
2208; GFX10-NEXT:    buffer_gl1_inv
2209; GFX10-NEXT:    buffer_gl0_inv
2210; GFX10-NEXT:    s_setpc_b64 s[30:31]
2211;
2212; GFX90A-LABEL: flat_agent_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
2213; GFX90A:       ; %bb.0:
2214; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2215; GFX90A-NEXT:    flat_load_dword v3, v[0:1] offset:2044
2216; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
2217; GFX90A-NEXT:    v_max_f32_e32 v4, v2, v2
2218; GFX90A-NEXT:  .LBB14_1: ; %atomicrmw.start
2219; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
2220; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2221; GFX90A-NEXT:    v_max_f32_e32 v2, v3, v3
2222; GFX90A-NEXT:    v_min_f32_e32 v2, v2, v4
2223; GFX90A-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc
2224; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2225; GFX90A-NEXT:    buffer_wbinvl1
2226; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
2227; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
2228; GFX90A-NEXT:    v_mov_b32_e32 v3, v2
2229; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
2230; GFX90A-NEXT:    s_cbranch_execnz .LBB14_1
2231; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
2232; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
2233; GFX90A-NEXT:    s_setpc_b64 s[30:31]
2234;
2235; GFX908-LABEL: flat_agent_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
2236; GFX908:       ; %bb.0:
2237; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2238; GFX908-NEXT:    flat_load_dword v3, v[0:1] offset:2044
2239; GFX908-NEXT:    s_mov_b64 s[4:5], 0
2240; GFX908-NEXT:    v_max_f32_e32 v4, v2, v2
2241; GFX908-NEXT:  .LBB14_1: ; %atomicrmw.start
2242; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
2243; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2244; GFX908-NEXT:    v_max_f32_e32 v2, v3, v3
2245; GFX908-NEXT:    v_min_f32_e32 v2, v2, v4
2246; GFX908-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc
2247; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2248; GFX908-NEXT:    buffer_wbinvl1
2249; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
2250; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
2251; GFX908-NEXT:    v_mov_b32_e32 v3, v2
2252; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
2253; GFX908-NEXT:    s_cbranch_execnz .LBB14_1
2254; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
2255; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
2256; GFX908-NEXT:    s_setpc_b64 s[30:31]
2257;
2258; GFX8-LABEL: flat_agent_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
2259; GFX8:       ; %bb.0:
2260; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2261; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 0x7fc, v0
2262; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2263; GFX8-NEXT:    flat_load_dword v3, v[0:1]
2264; GFX8-NEXT:    s_mov_b64 s[4:5], 0
2265; GFX8-NEXT:    v_mul_f32_e32 v4, 1.0, v2
2266; GFX8-NEXT:  .LBB14_1: ; %atomicrmw.start
2267; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
2268; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2269; GFX8-NEXT:    v_mul_f32_e32 v2, 1.0, v3
2270; GFX8-NEXT:    v_min_f32_e32 v2, v2, v4
2271; GFX8-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
2272; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2273; GFX8-NEXT:    buffer_wbinvl1
2274; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
2275; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
2276; GFX8-NEXT:    v_mov_b32_e32 v3, v2
2277; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
2278; GFX8-NEXT:    s_cbranch_execnz .LBB14_1
2279; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
2280; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
2281; GFX8-NEXT:    s_setpc_b64 s[30:31]
2282;
2283; GFX7-LABEL: flat_agent_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
2284; GFX7:       ; %bb.0:
2285; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2286; GFX7-NEXT:    v_add_i32_e32 v0, vcc, 0x7fc, v0
2287; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2288; GFX7-NEXT:    flat_atomic_fmin v[0:1], v2
2289; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2290; GFX7-NEXT:    buffer_wbinvl1
2291; GFX7-NEXT:    s_setpc_b64 s[30:31]
; 511 floats * 4 bytes = byte offset 2044 (0x7fc), matching the immediates in
; the assertions above.
2292  %gep = getelementptr float, ptr %ptr, i64 511
; The old value is bound to %unused and never read ("noret" variant).
2293  %unused = atomicrmw fmin ptr %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
2294  ret void
2295}
2296
; Agent-scope, seq_cst `atomicrmw fmin` of a float at a negative offset of
; -512 floats from %ptr, with the old value unused.
; Per the autogenerated assertions below, only the gfx1200 run folds the
; negative offset into the instruction (offset:-2048); every other run computes
; the address explicitly by adding 0xfffff800 to the low dword and -1 (with
; carry) to the high dword. The gfx1200, gfx1100, gfx1010 and hawaii runs emit
; a single flat atomic min; the gfx940, gfx90a, gfx908 and tonga runs emit a
; flat_atomic_cmpswap retry loop.
; The expected gfx940, gfx90a and gfx908 output computes the same offset
; address twice into different registers; that is recorded compiler output,
; not something this test chooses.
; NOTE(review): attribute group #1 is defined outside this excerpt; the "__ftz"
; in the function name suggests it selects flush-to-zero denormal handling -
; confirm against the attribute definitions at the end of the file.
2297define void @flat_agent_atomic_fmin_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory(ptr %ptr, float %val) #1 {
2298; GFX12-LABEL: flat_agent_atomic_fmin_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory:
2299; GFX12:       ; %bb.0:
2300; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
2301; GFX12-NEXT:    s_wait_expcnt 0x0
2302; GFX12-NEXT:    s_wait_samplecnt 0x0
2303; GFX12-NEXT:    s_wait_bvhcnt 0x0
2304; GFX12-NEXT:    s_wait_kmcnt 0x0
2305; GFX12-NEXT:    s_wait_storecnt 0x0
2306; GFX12-NEXT:    flat_atomic_min_num_f32 v[0:1], v2 offset:-2048 scope:SCOPE_DEV
2307; GFX12-NEXT:    s_wait_storecnt_dscnt 0x0
2308; GFX12-NEXT:    global_inv scope:SCOPE_DEV
2309; GFX12-NEXT:    s_setpc_b64 s[30:31]
2310;
2311; GFX940-LABEL: flat_agent_atomic_fmin_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory:
2312; GFX940:       ; %bb.0:
2313; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2314; GFX940-NEXT:    v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
2315; GFX940-NEXT:    s_movk_i32 s0, 0xf800
2316; GFX940-NEXT:    s_nop 0
2317; GFX940-NEXT:    v_addc_co_u32_e32 v5, vcc, -1, v1, vcc
2318; GFX940-NEXT:    flat_load_dword v3, v[4:5]
2319; GFX940-NEXT:    s_mov_b32 s1, -1
2320; GFX940-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1]
2321; GFX940-NEXT:    s_mov_b64 s[0:1], 0
2322; GFX940-NEXT:    v_max_f32_e32 v4, v2, v2
2323; GFX940-NEXT:  .LBB15_1: ; %atomicrmw.start
2324; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
2325; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2326; GFX940-NEXT:    v_max_f32_e32 v2, v3, v3
2327; GFX940-NEXT:    v_min_f32_e32 v2, v2, v4
2328; GFX940-NEXT:    buffer_wbl2 sc1
2329; GFX940-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0
2330; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2331; GFX940-NEXT:    buffer_inv sc1
2332; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
2333; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
2334; GFX940-NEXT:    v_mov_b32_e32 v3, v2
2335; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
2336; GFX940-NEXT:    s_cbranch_execnz .LBB15_1
2337; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
2338; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
2339; GFX940-NEXT:    s_setpc_b64 s[30:31]
2340;
2341; GFX11-LABEL: flat_agent_atomic_fmin_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory:
2342; GFX11:       ; %bb.0:
2343; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2344; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
2345; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
2346; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
2347; GFX11-NEXT:    flat_atomic_min_f32 v[0:1], v2
2348; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
2349; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
2350; GFX11-NEXT:    buffer_gl1_inv
2351; GFX11-NEXT:    buffer_gl0_inv
2352; GFX11-NEXT:    s_setpc_b64 s[30:31]
2353;
2354; GFX10-LABEL: flat_agent_atomic_fmin_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory:
2355; GFX10:       ; %bb.0:
2356; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2357; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
2358; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
2359; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
2360; GFX10-NEXT:    flat_atomic_fmin v[0:1], v2
2361; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
2362; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
2363; GFX10-NEXT:    buffer_gl1_inv
2364; GFX10-NEXT:    buffer_gl0_inv
2365; GFX10-NEXT:    s_setpc_b64 s[30:31]
2366;
2367; GFX90A-LABEL: flat_agent_atomic_fmin_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory:
2368; GFX90A:       ; %bb.0:
2369; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2370; GFX90A-NEXT:    v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
2371; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, -1, v1, vcc
2372; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
2373; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
2374; GFX90A-NEXT:    flat_load_dword v1, v[0:1]
2375; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
2376; GFX90A-NEXT:    v_max_f32_e32 v2, v2, v2
2377; GFX90A-NEXT:  .LBB15_1: ; %atomicrmw.start
2378; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
2379; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2380; GFX90A-NEXT:    v_max_f32_e32 v0, v1, v1
2381; GFX90A-NEXT:    v_min_f32_e32 v0, v0, v2
2382; GFX90A-NEXT:    flat_atomic_cmpswap v0, v[4:5], v[0:1] glc
2383; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2384; GFX90A-NEXT:    buffer_wbinvl1
2385; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
2386; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
2387; GFX90A-NEXT:    v_mov_b32_e32 v1, v0
2388; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
2389; GFX90A-NEXT:    s_cbranch_execnz .LBB15_1
2390; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
2391; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
2392; GFX90A-NEXT:    s_setpc_b64 s[30:31]
2393;
2394; GFX908-LABEL: flat_agent_atomic_fmin_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory:
2395; GFX908:       ; %bb.0:
2396; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2397; GFX908-NEXT:    v_add_co_u32_e32 v3, vcc, 0xfffff800, v0
2398; GFX908-NEXT:    v_addc_co_u32_e32 v4, vcc, -1, v1, vcc
2399; GFX908-NEXT:    v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
2400; GFX908-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
2401; GFX908-NEXT:    flat_load_dword v1, v[0:1]
2402; GFX908-NEXT:    s_mov_b64 s[4:5], 0
2403; GFX908-NEXT:    v_max_f32_e32 v2, v2, v2
2404; GFX908-NEXT:  .LBB15_1: ; %atomicrmw.start
2405; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
2406; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2407; GFX908-NEXT:    v_max_f32_e32 v0, v1, v1
2408; GFX908-NEXT:    v_min_f32_e32 v0, v0, v2
2409; GFX908-NEXT:    flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
2410; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2411; GFX908-NEXT:    buffer_wbinvl1
2412; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
2413; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
2414; GFX908-NEXT:    v_mov_b32_e32 v1, v0
2415; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
2416; GFX908-NEXT:    s_cbranch_execnz .LBB15_1
2417; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
2418; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
2419; GFX908-NEXT:    s_setpc_b64 s[30:31]
2420;
2421; GFX8-LABEL: flat_agent_atomic_fmin_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory:
2422; GFX8:       ; %bb.0:
2423; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2424; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 0xfffff800, v0
2425; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, -1, v1, vcc
2426; GFX8-NEXT:    flat_load_dword v3, v[0:1]
2427; GFX8-NEXT:    s_mov_b64 s[4:5], 0
2428; GFX8-NEXT:    v_mul_f32_e32 v4, 1.0, v2
2429; GFX8-NEXT:  .LBB15_1: ; %atomicrmw.start
2430; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
2431; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2432; GFX8-NEXT:    v_mul_f32_e32 v2, 1.0, v3
2433; GFX8-NEXT:    v_min_f32_e32 v2, v2, v4
2434; GFX8-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
2435; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2436; GFX8-NEXT:    buffer_wbinvl1
2437; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
2438; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
2439; GFX8-NEXT:    v_mov_b32_e32 v3, v2
2440; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
2441; GFX8-NEXT:    s_cbranch_execnz .LBB15_1
2442; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
2443; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
2444; GFX8-NEXT:    s_setpc_b64 s[30:31]
2445;
2446; GFX7-LABEL: flat_agent_atomic_fmin_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory:
2447; GFX7:       ; %bb.0:
2448; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2449; GFX7-NEXT:    v_add_i32_e32 v0, vcc, 0xfffff800, v0
2450; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, -1, v1, vcc
2451; GFX7-NEXT:    flat_atomic_fmin v[0:1], v2
2452; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2453; GFX7-NEXT:    buffer_wbinvl1
2454; GFX7-NEXT:    s_setpc_b64 s[30:31]
; -512 floats * 4 bytes = byte offset -2048; its low 32 bits are 0xfffff800,
; the constant seen in the address arithmetic of the assertions above.
2455  %gep = getelementptr float, ptr %ptr, i64 -512
; The old value is bound to %unused and never read ("noret" variant).
2456  %unused = atomicrmw fmin ptr %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
2457  ret void
2458}
2459
; seq_cst `atomicrmw fmin` of a float at a positive offset of 511 floats from
; %ptr, with no syncscope on the instruction (the test name calls this the
; "system" variant); the old value is returned to the caller.
; Per the autogenerated assertions below, the gfx1200, gfx1100, gfx1010 and
; hawaii runs emit a single returning flat atomic min (TH_ATOMIC_RETURN or
; glc), while the gfx940, gfx90a, gfx908 and tonga runs emit a
; flat_atomic_cmpswap retry loop. Compared with the agent-scope tests, the
; expected output here uses SCOPE_SYS plus a preceding global_wb on gfx1200,
; "sc0 sc1" cache controls on gfx940, and buffer_wbl2 / buffer_invl2 around
; the cmpswap on gfx90a.
; NOTE(review): attribute group #1 is defined outside this excerpt; the "__ftz"
; in the function name suggests it selects flush-to-zero denormal handling -
; confirm against the attribute definitions at the end of the file.
2460define float @flat_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory(ptr %ptr, float %val) #1 {
2461; GFX12-LABEL: flat_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
2462; GFX12:       ; %bb.0:
2463; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
2464; GFX12-NEXT:    s_wait_expcnt 0x0
2465; GFX12-NEXT:    s_wait_samplecnt 0x0
2466; GFX12-NEXT:    s_wait_bvhcnt 0x0
2467; GFX12-NEXT:    s_wait_kmcnt 0x0
2468; GFX12-NEXT:    global_wb scope:SCOPE_SYS
2469; GFX12-NEXT:    s_wait_storecnt 0x0
2470; GFX12-NEXT:    flat_atomic_min_num_f32 v0, v[0:1], v2 offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
2471; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
2472; GFX12-NEXT:    global_inv scope:SCOPE_SYS
2473; GFX12-NEXT:    s_setpc_b64 s[30:31]
2474;
2475; GFX940-LABEL: flat_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
2476; GFX940:       ; %bb.0:
2477; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2478; GFX940-NEXT:    flat_load_dword v3, v[0:1] offset:2044
2479; GFX940-NEXT:    s_mov_b64 s[0:1], 0
2480; GFX940-NEXT:    v_max_f32_e32 v2, v2, v2
2481; GFX940-NEXT:  .LBB16_1: ; %atomicrmw.start
2482; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
2483; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2484; GFX940-NEXT:    v_mov_b32_e32 v5, v3
2485; GFX940-NEXT:    v_max_f32_e32 v3, v5, v5
2486; GFX940-NEXT:    v_min_f32_e32 v4, v3, v2
2487; GFX940-NEXT:    buffer_wbl2 sc0 sc1
2488; GFX940-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 sc1
2489; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2490; GFX940-NEXT:    buffer_inv sc0 sc1
2491; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
2492; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
2493; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
2494; GFX940-NEXT:    s_cbranch_execnz .LBB16_1
2495; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
2496; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
2497; GFX940-NEXT:    v_mov_b32_e32 v0, v3
2498; GFX940-NEXT:    s_setpc_b64 s[30:31]
2499;
2500; GFX11-LABEL: flat_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
2501; GFX11:       ; %bb.0:
2502; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2503; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
2504; GFX11-NEXT:    flat_atomic_min_f32 v0, v[0:1], v2 offset:2044 glc
2505; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2506; GFX11-NEXT:    buffer_gl1_inv
2507; GFX11-NEXT:    buffer_gl0_inv
2508; GFX11-NEXT:    s_setpc_b64 s[30:31]
2509;
2510; GFX10-LABEL: flat_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
2511; GFX10:       ; %bb.0:
2512; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2513; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, 0x7fc, v0
2514; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
2515; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
2516; GFX10-NEXT:    flat_atomic_fmin v0, v[0:1], v2 glc
2517; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2518; GFX10-NEXT:    buffer_gl1_inv
2519; GFX10-NEXT:    buffer_gl0_inv
2520; GFX10-NEXT:    s_setpc_b64 s[30:31]
2521;
2522; GFX90A-LABEL: flat_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
2523; GFX90A:       ; %bb.0:
2524; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2525; GFX90A-NEXT:    flat_load_dword v3, v[0:1] offset:2044
2526; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
2527; GFX90A-NEXT:    v_max_f32_e32 v2, v2, v2
2528; GFX90A-NEXT:  .LBB16_1: ; %atomicrmw.start
2529; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
2530; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2531; GFX90A-NEXT:    v_mov_b32_e32 v5, v3
2532; GFX90A-NEXT:    v_max_f32_e32 v3, v5, v5
2533; GFX90A-NEXT:    v_min_f32_e32 v4, v3, v2
2534; GFX90A-NEXT:    buffer_wbl2
2535; GFX90A-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc
2536; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2537; GFX90A-NEXT:    buffer_invl2
2538; GFX90A-NEXT:    buffer_wbinvl1
2539; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
2540; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
2541; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
2542; GFX90A-NEXT:    s_cbranch_execnz .LBB16_1
2543; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
2544; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
2545; GFX90A-NEXT:    v_mov_b32_e32 v0, v3
2546; GFX90A-NEXT:    s_setpc_b64 s[30:31]
2547;
2548; GFX908-LABEL: flat_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
2549; GFX908:       ; %bb.0:
2550; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2551; GFX908-NEXT:    flat_load_dword v3, v[0:1] offset:2044
2552; GFX908-NEXT:    s_mov_b64 s[4:5], 0
2553; GFX908-NEXT:    v_max_f32_e32 v2, v2, v2
2554; GFX908-NEXT:  .LBB16_1: ; %atomicrmw.start
2555; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
2556; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2557; GFX908-NEXT:    v_mov_b32_e32 v4, v3
2558; GFX908-NEXT:    v_max_f32_e32 v3, v4, v4
2559; GFX908-NEXT:    v_min_f32_e32 v3, v3, v2
2560; GFX908-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc
2561; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2562; GFX908-NEXT:    buffer_wbinvl1
2563; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
2564; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
2565; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
2566; GFX908-NEXT:    s_cbranch_execnz .LBB16_1
2567; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
2568; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
2569; GFX908-NEXT:    v_mov_b32_e32 v0, v3
2570; GFX908-NEXT:    s_setpc_b64 s[30:31]
2571;
2572; GFX8-LABEL: flat_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
2573; GFX8:       ; %bb.0:
2574; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2575; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x7fc, v0
2576; GFX8-NEXT:    v_addc_u32_e32 v4, vcc, 0, v1, vcc
2577; GFX8-NEXT:    flat_load_dword v0, v[3:4]
2578; GFX8-NEXT:    s_mov_b64 s[4:5], 0
2579; GFX8-NEXT:    v_mul_f32_e32 v1, 1.0, v2
2580; GFX8-NEXT:  .LBB16_1: ; %atomicrmw.start
2581; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
2582; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2583; GFX8-NEXT:    v_mov_b32_e32 v6, v0
2584; GFX8-NEXT:    v_mul_f32_e32 v0, 1.0, v6
2585; GFX8-NEXT:    v_min_f32_e32 v5, v0, v1
2586; GFX8-NEXT:    flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
2587; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2588; GFX8-NEXT:    buffer_wbinvl1
2589; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v6
2590; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
2591; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
2592; GFX8-NEXT:    s_cbranch_execnz .LBB16_1
2593; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
2594; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
2595; GFX8-NEXT:    s_setpc_b64 s[30:31]
2596;
2597; GFX7-LABEL: flat_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
2598; GFX7:       ; %bb.0:
2599; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2600; GFX7-NEXT:    v_add_i32_e32 v0, vcc, 0x7fc, v0
2601; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2602; GFX7-NEXT:    flat_atomic_fmin v0, v[0:1], v2 glc
2603; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2604; GFX7-NEXT:    buffer_wbinvl1
2605; GFX7-NEXT:    s_setpc_b64 s[30:31]
; 511 floats * 4 bytes = byte offset 2044 (0x7fc), matching the immediates in
; the assertions above.
2606  %gep = getelementptr float, ptr %ptr, i64 511
; Unlike the agent-scope tests, this atomicrmw carries no syncscope operand.
2607  %result = atomicrmw fmin ptr %gep, float %val seq_cst, !amdgpu.no.fine.grained.memory !0
2608  ret float %result
2609}
2610
; Test case: seq_cst `atomicrmw fmin` on a float through a flat pointer with no
; syncscope (the default, system-wide scope), result unused, returning void.
; The address is %ptr + 511 floats, i.e. a byte offset of 2044 (0x7fc), and the
; operation carries !amdgpu.no.fine.grained.memory metadata.
; Attribute group #1 is defined outside this function; the "ftz" in the name
; suggests it selects flush-to-zero f32 denormal handling -- TODO confirm.
; Per the autogenerated checks below:
;   * GFX12, GFX11, GFX10 and GFX7 emit a single native flat fmin atomic.
;   * GFX940, GFX90A, GFX908 and GFX8 emit a flat_atomic_cmpswap retry loop.
;   * GFX10, GFX8 and GFX7 add 0x7fc to the address with separate add
;     instructions; the other targets fold it into the access as offset:2044.
; The check lines are autogenerated (see the note on the first line of the
; file); regenerate them with utils/update_llc_test_checks.py rather than
; editing them by hand.
2611define void @flat_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory(ptr %ptr, float %val) #1 {
2612; GFX12-LABEL: flat_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
2613; GFX12:       ; %bb.0:
2614; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
2615; GFX12-NEXT:    s_wait_expcnt 0x0
2616; GFX12-NEXT:    s_wait_samplecnt 0x0
2617; GFX12-NEXT:    s_wait_bvhcnt 0x0
2618; GFX12-NEXT:    s_wait_kmcnt 0x0
2619; GFX12-NEXT:    global_wb scope:SCOPE_SYS
2620; GFX12-NEXT:    s_wait_storecnt 0x0
2621; GFX12-NEXT:    flat_atomic_min_num_f32 v[0:1], v2 offset:2044 scope:SCOPE_SYS
2622; GFX12-NEXT:    s_wait_storecnt_dscnt 0x0
2623; GFX12-NEXT:    global_inv scope:SCOPE_SYS
2624; GFX12-NEXT:    s_setpc_b64 s[30:31]
2625;
2626; GFX940-LABEL: flat_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
2627; GFX940:       ; %bb.0:
2628; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2629; GFX940-NEXT:    flat_load_dword v3, v[0:1] offset:2044
2630; GFX940-NEXT:    s_mov_b64 s[0:1], 0
2631; GFX940-NEXT:    v_max_f32_e32 v4, v2, v2
2632; GFX940-NEXT:  .LBB17_1: ; %atomicrmw.start
2633; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
2634; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2635; GFX940-NEXT:    v_max_f32_e32 v2, v3, v3
2636; GFX940-NEXT:    v_min_f32_e32 v2, v2, v4
2637; GFX940-NEXT:    buffer_wbl2 sc0 sc1
2638; GFX940-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 sc1
2639; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2640; GFX940-NEXT:    buffer_inv sc0 sc1
2641; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
2642; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
2643; GFX940-NEXT:    v_mov_b32_e32 v3, v2
2644; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
2645; GFX940-NEXT:    s_cbranch_execnz .LBB17_1
2646; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
2647; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
2648; GFX940-NEXT:    s_setpc_b64 s[30:31]
2649;
2650; GFX11-LABEL: flat_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
2651; GFX11:       ; %bb.0:
2652; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2653; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
2654; GFX11-NEXT:    flat_atomic_min_f32 v[0:1], v2 offset:2044
2655; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
2656; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
2657; GFX11-NEXT:    buffer_gl1_inv
2658; GFX11-NEXT:    buffer_gl0_inv
2659; GFX11-NEXT:    s_setpc_b64 s[30:31]
2660;
2661; GFX10-LABEL: flat_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
2662; GFX10:       ; %bb.0:
2663; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2664; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, 0x7fc, v0
2665; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
2666; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
2667; GFX10-NEXT:    flat_atomic_fmin v[0:1], v2
2668; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
2669; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
2670; GFX10-NEXT:    buffer_gl1_inv
2671; GFX10-NEXT:    buffer_gl0_inv
2672; GFX10-NEXT:    s_setpc_b64 s[30:31]
2673;
2674; GFX90A-LABEL: flat_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
2675; GFX90A:       ; %bb.0:
2676; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2677; GFX90A-NEXT:    flat_load_dword v3, v[0:1] offset:2044
2678; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
2679; GFX90A-NEXT:    v_max_f32_e32 v4, v2, v2
2680; GFX90A-NEXT:  .LBB17_1: ; %atomicrmw.start
2681; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
2682; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2683; GFX90A-NEXT:    v_max_f32_e32 v2, v3, v3
2684; GFX90A-NEXT:    v_min_f32_e32 v2, v2, v4
2685; GFX90A-NEXT:    buffer_wbl2
2686; GFX90A-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc
2687; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2688; GFX90A-NEXT:    buffer_invl2
2689; GFX90A-NEXT:    buffer_wbinvl1
2690; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
2691; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
2692; GFX90A-NEXT:    v_mov_b32_e32 v3, v2
2693; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
2694; GFX90A-NEXT:    s_cbranch_execnz .LBB17_1
2695; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
2696; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
2697; GFX90A-NEXT:    s_setpc_b64 s[30:31]
2698;
2699; GFX908-LABEL: flat_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
2700; GFX908:       ; %bb.0:
2701; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2702; GFX908-NEXT:    flat_load_dword v3, v[0:1] offset:2044
2703; GFX908-NEXT:    s_mov_b64 s[4:5], 0
2704; GFX908-NEXT:    v_max_f32_e32 v4, v2, v2
2705; GFX908-NEXT:  .LBB17_1: ; %atomicrmw.start
2706; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
2707; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2708; GFX908-NEXT:    v_max_f32_e32 v2, v3, v3
2709; GFX908-NEXT:    v_min_f32_e32 v2, v2, v4
2710; GFX908-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc
2711; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2712; GFX908-NEXT:    buffer_wbinvl1
2713; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
2714; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
2715; GFX908-NEXT:    v_mov_b32_e32 v3, v2
2716; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
2717; GFX908-NEXT:    s_cbranch_execnz .LBB17_1
2718; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
2719; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
2720; GFX908-NEXT:    s_setpc_b64 s[30:31]
2721;
2722; GFX8-LABEL: flat_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
2723; GFX8:       ; %bb.0:
2724; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2725; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 0x7fc, v0
2726; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2727; GFX8-NEXT:    flat_load_dword v3, v[0:1]
2728; GFX8-NEXT:    s_mov_b64 s[4:5], 0
2729; GFX8-NEXT:    v_mul_f32_e32 v4, 1.0, v2
2730; GFX8-NEXT:  .LBB17_1: ; %atomicrmw.start
2731; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
2732; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2733; GFX8-NEXT:    v_mul_f32_e32 v2, 1.0, v3
2734; GFX8-NEXT:    v_min_f32_e32 v2, v2, v4
2735; GFX8-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
2736; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2737; GFX8-NEXT:    buffer_wbinvl1
2738; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
2739; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
2740; GFX8-NEXT:    v_mov_b32_e32 v3, v2
2741; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
2742; GFX8-NEXT:    s_cbranch_execnz .LBB17_1
2743; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
2744; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
2745; GFX8-NEXT:    s_setpc_b64 s[30:31]
2746;
2747; GFX7-LABEL: flat_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
2748; GFX7:       ; %bb.0:
2749; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2750; GFX7-NEXT:    v_add_i32_e32 v0, vcc, 0x7fc, v0
2751; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2752; GFX7-NEXT:    flat_atomic_fmin v[0:1], v2
2753; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2754; GFX7-NEXT:    buffer_wbinvl1
2755; GFX7-NEXT:    s_setpc_b64 s[30:31]
; Element index 511 of a 4-byte float gives the 2044-byte (0x7fc) offset seen in
; the checks above. The atomic's result (%unused) is never read.
2756  %gep = getelementptr float, ptr %ptr, i64 511
2757  %unused = atomicrmw fmin ptr %gep, float %val seq_cst, !amdgpu.no.fine.grained.memory !0
2758  ret void
2759}
2760
2761; --------------------------------------------------------------------
2762; double
2763; --------------------------------------------------------------------
2764
; Test case: agent-scope seq_cst `atomicrmw fmin` on a double through a flat
; pointer with no address offset; the value returned by the atomic is the
; function result. The operation carries !amdgpu.no.fine.grained.memory
; metadata. Attribute group #0 is defined outside this function.
; Per the autogenerated checks below, every target first compares the high
; dword of the address against a private-aperture value (src_private_base, or
; on GFX8 and GFX7 a dword fetched with s_load_dword from 0xc0) and then splits
; execution into two paths:
;   * %atomicrmw.private: a plain scratch/buffer load, max + min, then store.
;   * %atomicrmw.global: a native f64 min atomic on GFX940, GFX90A, GFX10 and
;     GFX7; a 64-bit flat cmpswap retry loop on GFX12, GFX11, GFX908 and GFX8.
; The check lines are autogenerated (see the note on the first line of the
; file); regenerate them with utils/update_llc_test_checks.py rather than
; editing them by hand.
2765define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(ptr %ptr, double %val) #0 {
2766; GFX12-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory:
2767; GFX12:       ; %bb.0:
2768; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
2769; GFX12-NEXT:    s_wait_expcnt 0x0
2770; GFX12-NEXT:    s_wait_samplecnt 0x0
2771; GFX12-NEXT:    s_wait_bvhcnt 0x0
2772; GFX12-NEXT:    s_wait_kmcnt 0x0
2773; GFX12-NEXT:    v_max_num_f64_e32 v[4:5], v[2:3], v[2:3]
2774; GFX12-NEXT:    s_mov_b64 s[0:1], src_private_base
2775; GFX12-NEXT:    s_mov_b32 s0, exec_lo
2776; GFX12-NEXT:    ; implicit-def: $vgpr2_vgpr3
2777; GFX12-NEXT:    s_wait_alu 0xfffe
2778; GFX12-NEXT:    v_cmpx_ne_u32_e64 s1, v1
2779; GFX12-NEXT:    s_xor_b32 s0, exec_lo, s0
2780; GFX12-NEXT:    s_cbranch_execz .LBB18_4
2781; GFX12-NEXT:  ; %bb.1: ; %atomicrmw.global
2782; GFX12-NEXT:    flat_load_b64 v[2:3], v[0:1]
2783; GFX12-NEXT:    s_mov_b32 s1, 0
2784; GFX12-NEXT:  .LBB18_2: ; %atomicrmw.start
2785; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
2786; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
2787; GFX12-NEXT:    v_dual_mov_b32 v9, v3 :: v_dual_mov_b32 v8, v2
2788; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2789; GFX12-NEXT:    v_max_num_f64_e32 v[2:3], v[8:9], v[8:9]
2790; GFX12-NEXT:    v_min_num_f64_e32 v[6:7], v[2:3], v[4:5]
2791; GFX12-NEXT:    s_wait_storecnt 0x0
2792; GFX12-NEXT:    flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[6:9] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
2793; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
2794; GFX12-NEXT:    global_inv scope:SCOPE_DEV
2795; GFX12-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[8:9]
2796; GFX12-NEXT:    s_wait_alu 0xfffe
2797; GFX12-NEXT:    s_or_b32 s1, vcc_lo, s1
2798; GFX12-NEXT:    s_wait_alu 0xfffe
2799; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
2800; GFX12-NEXT:    s_cbranch_execnz .LBB18_2
2801; GFX12-NEXT:  ; %bb.3: ; %Flow
2802; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s1
2803; GFX12-NEXT:    ; implicit-def: $vgpr0_vgpr1
2804; GFX12-NEXT:    ; implicit-def: $vgpr4_vgpr5
2805; GFX12-NEXT:  .LBB18_4: ; %Flow2
2806; GFX12-NEXT:    s_wait_alu 0xfffe
2807; GFX12-NEXT:    s_and_not1_saveexec_b32 s0, s0
2808; GFX12-NEXT:    s_cbranch_execz .LBB18_6
2809; GFX12-NEXT:  ; %bb.5: ; %atomicrmw.private
2810; GFX12-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
2811; GFX12-NEXT:    v_cndmask_b32_e32 v6, -1, v0, vcc_lo
2812; GFX12-NEXT:    scratch_load_b64 v[2:3], v6, off
2813; GFX12-NEXT:    s_wait_loadcnt 0x0
2814; GFX12-NEXT:    v_max_num_f64_e32 v[0:1], v[2:3], v[2:3]
2815; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
2816; GFX12-NEXT:    v_min_num_f64_e32 v[0:1], v[0:1], v[4:5]
2817; GFX12-NEXT:    scratch_store_b64 v6, v[0:1], off
2818; GFX12-NEXT:  .LBB18_6: ; %atomicrmw.phi
2819; GFX12-NEXT:    s_wait_alu 0xfffe
2820; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
2821; GFX12-NEXT:    v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3
2822; GFX12-NEXT:    s_wait_alu 0xfffe
2823; GFX12-NEXT:    s_setpc_b64 s[30:31]
2824;
2825; GFX940-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory:
2826; GFX940:       ; %bb.0:
2827; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2828; GFX940-NEXT:    v_mov_b32_e32 v5, v1
2829; GFX940-NEXT:    s_mov_b64 s[0:1], src_private_base
2830; GFX940-NEXT:    v_mov_b32_e32 v4, v0
2831; GFX940-NEXT:    v_cmp_ne_u32_e32 vcc, s1, v5
2832; GFX940-NEXT:    ; implicit-def: $vgpr0_vgpr1
2833; GFX940-NEXT:    s_and_saveexec_b64 s[0:1], vcc
2834; GFX940-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
2835; GFX940-NEXT:    s_cbranch_execnz .LBB18_3
2836; GFX940-NEXT:  ; %bb.1: ; %Flow
2837; GFX940-NEXT:    s_andn2_saveexec_b64 s[0:1], s[0:1]
2838; GFX940-NEXT:    s_cbranch_execnz .LBB18_4
2839; GFX940-NEXT:  .LBB18_2: ; %atomicrmw.phi
2840; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
2841; GFX940-NEXT:    s_setpc_b64 s[30:31]
2842; GFX940-NEXT:  .LBB18_3: ; %atomicrmw.global
2843; GFX940-NEXT:    buffer_wbl2 sc1
2844; GFX940-NEXT:    flat_atomic_min_f64 v[0:1], v[4:5], v[2:3] sc0
2845; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2846; GFX940-NEXT:    buffer_inv sc1
2847; GFX940-NEXT:    ; implicit-def: $vgpr4_vgpr5
2848; GFX940-NEXT:    ; implicit-def: $vgpr2_vgpr3
2849; GFX940-NEXT:    s_andn2_saveexec_b64 s[0:1], s[0:1]
2850; GFX940-NEXT:    s_cbranch_execz .LBB18_2
2851; GFX940-NEXT:  .LBB18_4: ; %atomicrmw.private
2852; GFX940-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[4:5]
2853; GFX940-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
2854; GFX940-NEXT:    s_nop 0
2855; GFX940-NEXT:    v_cndmask_b32_e32 v6, -1, v4, vcc
2856; GFX940-NEXT:    scratch_load_dwordx2 v[0:1], v6, off
2857; GFX940-NEXT:    s_waitcnt vmcnt(0)
2858; GFX940-NEXT:    v_max_f64 v[4:5], v[0:1], v[0:1]
2859; GFX940-NEXT:    v_min_f64 v[2:3], v[4:5], v[2:3]
2860; GFX940-NEXT:    scratch_store_dwordx2 v6, v[2:3], off sc0 sc1
2861; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
2862; GFX940-NEXT:    s_waitcnt vmcnt(0)
2863; GFX940-NEXT:    s_setpc_b64 s[30:31]
2864;
2865; GFX11-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory:
2866; GFX11:       ; %bb.0:
2867; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2868; GFX11-NEXT:    v_max_f64 v[4:5], v[2:3], v[2:3]
2869; GFX11-NEXT:    s_mov_b64 s[0:1], src_private_base
2870; GFX11-NEXT:    s_mov_b32 s0, exec_lo
2871; GFX11-NEXT:    ; implicit-def: $vgpr2_vgpr3
2872; GFX11-NEXT:    v_cmpx_ne_u32_e64 s1, v1
2873; GFX11-NEXT:    s_xor_b32 s0, exec_lo, s0
2874; GFX11-NEXT:    s_cbranch_execz .LBB18_4
2875; GFX11-NEXT:  ; %bb.1: ; %atomicrmw.global
2876; GFX11-NEXT:    flat_load_b64 v[2:3], v[0:1]
2877; GFX11-NEXT:    s_mov_b32 s1, 0
2878; GFX11-NEXT:  .LBB18_2: ; %atomicrmw.start
2879; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
2880; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2881; GFX11-NEXT:    v_dual_mov_b32 v9, v3 :: v_dual_mov_b32 v8, v2
2882; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2883; GFX11-NEXT:    v_max_f64 v[2:3], v[8:9], v[8:9]
2884; GFX11-NEXT:    v_min_f64 v[6:7], v[2:3], v[4:5]
2885; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
2886; GFX11-NEXT:    flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[6:9] glc
2887; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2888; GFX11-NEXT:    buffer_gl1_inv
2889; GFX11-NEXT:    buffer_gl0_inv
2890; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[8:9]
2891; GFX11-NEXT:    s_or_b32 s1, vcc_lo, s1
2892; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
2893; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
2894; GFX11-NEXT:    s_cbranch_execnz .LBB18_2
2895; GFX11-NEXT:  ; %bb.3: ; %Flow
2896; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s1
2897; GFX11-NEXT:    ; implicit-def: $vgpr0_vgpr1
2898; GFX11-NEXT:    ; implicit-def: $vgpr4_vgpr5
2899; GFX11-NEXT:  .LBB18_4: ; %Flow2
2900; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
2901; GFX11-NEXT:    s_cbranch_execz .LBB18_6
2902; GFX11-NEXT:  ; %bb.5: ; %atomicrmw.private
2903; GFX11-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
2904; GFX11-NEXT:    v_cndmask_b32_e32 v6, -1, v0, vcc_lo
2905; GFX11-NEXT:    scratch_load_b64 v[2:3], v6, off
2906; GFX11-NEXT:    s_waitcnt vmcnt(0)
2907; GFX11-NEXT:    v_max_f64 v[0:1], v[2:3], v[2:3]
2908; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
2909; GFX11-NEXT:    v_min_f64 v[0:1], v[0:1], v[4:5]
2910; GFX11-NEXT:    scratch_store_b64 v6, v[0:1], off
2911; GFX11-NEXT:  .LBB18_6: ; %atomicrmw.phi
2912; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
2913; GFX11-NEXT:    v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3
2914; GFX11-NEXT:    s_setpc_b64 s[30:31]
2915;
2916; GFX10-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory:
2917; GFX10:       ; %bb.0:
2918; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2919; GFX10-NEXT:    v_mov_b32_e32 v5, v1
2920; GFX10-NEXT:    v_mov_b32_e32 v4, v0
2921; GFX10-NEXT:    s_mov_b64 s[4:5], src_private_base
2922; GFX10-NEXT:    ; implicit-def: $vgpr0_vgpr1
2923; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, s5, v5
2924; GFX10-NEXT:    s_and_saveexec_b32 s4, vcc_lo
2925; GFX10-NEXT:    s_xor_b32 s4, exec_lo, s4
2926; GFX10-NEXT:    s_cbranch_execnz .LBB18_3
2927; GFX10-NEXT:  ; %bb.1: ; %Flow
2928; GFX10-NEXT:    s_andn2_saveexec_b32 s4, s4
2929; GFX10-NEXT:    s_cbranch_execnz .LBB18_4
2930; GFX10-NEXT:  .LBB18_2: ; %atomicrmw.phi
2931; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
2932; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
2933; GFX10-NEXT:    s_setpc_b64 s[30:31]
2934; GFX10-NEXT:  .LBB18_3: ; %atomicrmw.global
2935; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
2936; GFX10-NEXT:    flat_atomic_fmin_x2 v[0:1], v[4:5], v[2:3] glc
2937; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2938; GFX10-NEXT:    buffer_gl1_inv
2939; GFX10-NEXT:    buffer_gl0_inv
2940; GFX10-NEXT:    ; implicit-def: $vgpr4_vgpr5
2941; GFX10-NEXT:    ; implicit-def: $vgpr2_vgpr3
2942; GFX10-NEXT:    s_andn2_saveexec_b32 s4, s4
2943; GFX10-NEXT:    s_cbranch_execz .LBB18_2
2944; GFX10-NEXT:  .LBB18_4: ; %atomicrmw.private
2945; GFX10-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
2946; GFX10-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
2947; GFX10-NEXT:    v_cndmask_b32_e32 v6, -1, v4, vcc_lo
2948; GFX10-NEXT:    s_clause 0x1
2949; GFX10-NEXT:    buffer_load_dword v0, v6, s[0:3], 0 offen
2950; GFX10-NEXT:    buffer_load_dword v1, v6, s[0:3], 0 offen offset:4
2951; GFX10-NEXT:    s_waitcnt vmcnt(0)
2952; GFX10-NEXT:    v_max_f64 v[4:5], v[0:1], v[0:1]
2953; GFX10-NEXT:    v_min_f64 v[2:3], v[4:5], v[2:3]
2954; GFX10-NEXT:    buffer_store_dword v2, v6, s[0:3], 0 offen
2955; GFX10-NEXT:    buffer_store_dword v3, v6, s[0:3], 0 offen offset:4
2956; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
2957; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
2958; GFX10-NEXT:    s_setpc_b64 s[30:31]
2959;
2960; GFX90A-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory:
2961; GFX90A:       ; %bb.0:
2962; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2963; GFX90A-NEXT:    v_mov_b32_e32 v5, v1
2964; GFX90A-NEXT:    s_mov_b64 s[4:5], src_private_base
2965; GFX90A-NEXT:    v_mov_b32_e32 v4, v0
2966; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v5
2967; GFX90A-NEXT:    ; implicit-def: $vgpr0_vgpr1
2968; GFX90A-NEXT:    s_and_saveexec_b64 s[4:5], vcc
2969; GFX90A-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
2970; GFX90A-NEXT:    s_cbranch_execnz .LBB18_3
2971; GFX90A-NEXT:  ; %bb.1: ; %Flow
2972; GFX90A-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
2973; GFX90A-NEXT:    s_cbranch_execnz .LBB18_4
2974; GFX90A-NEXT:  .LBB18_2: ; %atomicrmw.phi
2975; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
2976; GFX90A-NEXT:    s_setpc_b64 s[30:31]
2977; GFX90A-NEXT:  .LBB18_3: ; %atomicrmw.global
2978; GFX90A-NEXT:    flat_atomic_min_f64 v[0:1], v[4:5], v[2:3] glc
2979; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2980; GFX90A-NEXT:    buffer_wbinvl1
2981; GFX90A-NEXT:    ; implicit-def: $vgpr4_vgpr5
2982; GFX90A-NEXT:    ; implicit-def: $vgpr2_vgpr3
2983; GFX90A-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
2984; GFX90A-NEXT:    s_cbranch_execz .LBB18_2
2985; GFX90A-NEXT:  .LBB18_4: ; %atomicrmw.private
2986; GFX90A-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[4:5]
2987; GFX90A-NEXT:    v_cndmask_b32_e32 v6, -1, v4, vcc
2988; GFX90A-NEXT:    buffer_load_dword v0, v6, s[0:3], 0 offen
2989; GFX90A-NEXT:    buffer_load_dword v1, v6, s[0:3], 0 offen offset:4
2990; GFX90A-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
2991; GFX90A-NEXT:    s_waitcnt vmcnt(0)
2992; GFX90A-NEXT:    v_max_f64 v[4:5], v[0:1], v[0:1]
2993; GFX90A-NEXT:    v_min_f64 v[2:3], v[4:5], v[2:3]
2994; GFX90A-NEXT:    buffer_store_dword v2, v6, s[0:3], 0 offen
2995; GFX90A-NEXT:    buffer_store_dword v3, v6, s[0:3], 0 offen offset:4
2996; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
2997; GFX90A-NEXT:    s_waitcnt vmcnt(0)
2998; GFX90A-NEXT:    s_setpc_b64 s[30:31]
2999;
3000; GFX908-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory:
3001; GFX908:       ; %bb.0:
3002; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3003; GFX908-NEXT:    v_max_f64 v[4:5], v[2:3], v[2:3]
3004; GFX908-NEXT:    s_mov_b64 s[4:5], src_private_base
3005; GFX908-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v1
3006; GFX908-NEXT:    ; implicit-def: $vgpr2_vgpr3
3007; GFX908-NEXT:    s_and_saveexec_b64 s[4:5], vcc
3008; GFX908-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
3009; GFX908-NEXT:    s_cbranch_execz .LBB18_4
3010; GFX908-NEXT:  ; %bb.1: ; %atomicrmw.global
3011; GFX908-NEXT:    flat_load_dwordx2 v[2:3], v[0:1]
3012; GFX908-NEXT:    s_mov_b64 s[6:7], 0
3013; GFX908-NEXT:  .LBB18_2: ; %atomicrmw.start
3014; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
3015; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3016; GFX908-NEXT:    v_mov_b32_e32 v9, v3
3017; GFX908-NEXT:    v_mov_b32_e32 v8, v2
3018; GFX908-NEXT:    v_max_f64 v[2:3], v[8:9], v[8:9]
3019; GFX908-NEXT:    v_min_f64 v[6:7], v[2:3], v[4:5]
3020; GFX908-NEXT:    flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
3021; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3022; GFX908-NEXT:    buffer_wbinvl1
3023; GFX908-NEXT:    v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
3024; GFX908-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
3025; GFX908-NEXT:    s_andn2_b64 exec, exec, s[6:7]
3026; GFX908-NEXT:    s_cbranch_execnz .LBB18_2
3027; GFX908-NEXT:  ; %bb.3: ; %Flow
3028; GFX908-NEXT:    s_or_b64 exec, exec, s[6:7]
3029; GFX908-NEXT:    ; implicit-def: $vgpr0_vgpr1
3030; GFX908-NEXT:    ; implicit-def: $vgpr4_vgpr5
3031; GFX908-NEXT:  .LBB18_4: ; %Flow2
3032; GFX908-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
3033; GFX908-NEXT:    s_cbranch_execz .LBB18_6
3034; GFX908-NEXT:  ; %bb.5: ; %atomicrmw.private
3035; GFX908-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
3036; GFX908-NEXT:    v_cndmask_b32_e32 v6, -1, v0, vcc
3037; GFX908-NEXT:    buffer_load_dword v2, v6, s[0:3], 0 offen
3038; GFX908-NEXT:    buffer_load_dword v3, v6, s[0:3], 0 offen offset:4
3039; GFX908-NEXT:    s_waitcnt vmcnt(0)
3040; GFX908-NEXT:    v_max_f64 v[0:1], v[2:3], v[2:3]
3041; GFX908-NEXT:    v_min_f64 v[0:1], v[0:1], v[4:5]
3042; GFX908-NEXT:    buffer_store_dword v0, v6, s[0:3], 0 offen
3043; GFX908-NEXT:    buffer_store_dword v1, v6, s[0:3], 0 offen offset:4
3044; GFX908-NEXT:  .LBB18_6: ; %atomicrmw.phi
3045; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
3046; GFX908-NEXT:    v_mov_b32_e32 v0, v2
3047; GFX908-NEXT:    v_mov_b32_e32 v1, v3
3048; GFX908-NEXT:    s_waitcnt vmcnt(0)
3049; GFX908-NEXT:    s_setpc_b64 s[30:31]
3050;
3051; GFX8-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory:
3052; GFX8:       ; %bb.0:
3053; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3054; GFX8-NEXT:    v_max_f64 v[4:5], v[2:3], v[2:3]
3055; GFX8-NEXT:    s_mov_b64 s[4:5], 0xc0
3056; GFX8-NEXT:    s_load_dword s4, s[4:5], 0x0
3057; GFX8-NEXT:    ; implicit-def: $vgpr2_vgpr3
3058; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3059; GFX8-NEXT:    v_cmp_ne_u32_e32 vcc, s4, v1
3060; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
3061; GFX8-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
3062; GFX8-NEXT:    s_cbranch_execz .LBB18_4
3063; GFX8-NEXT:  ; %bb.1: ; %atomicrmw.global
3064; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 4, v0
3065; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
3066; GFX8-NEXT:    flat_load_dword v3, v[2:3]
3067; GFX8-NEXT:    flat_load_dword v2, v[0:1]
3068; GFX8-NEXT:    s_mov_b64 s[6:7], 0
3069; GFX8-NEXT:  .LBB18_2: ; %atomicrmw.start
3070; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
3071; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3072; GFX8-NEXT:    v_mov_b32_e32 v9, v3
3073; GFX8-NEXT:    v_mov_b32_e32 v8, v2
3074; GFX8-NEXT:    v_max_f64 v[2:3], v[8:9], v[8:9]
3075; GFX8-NEXT:    v_min_f64 v[6:7], v[2:3], v[4:5]
3076; GFX8-NEXT:    flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
3077; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3078; GFX8-NEXT:    buffer_wbinvl1
3079; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
3080; GFX8-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
3081; GFX8-NEXT:    s_andn2_b64 exec, exec, s[6:7]
3082; GFX8-NEXT:    s_cbranch_execnz .LBB18_2
3083; GFX8-NEXT:  ; %bb.3: ; %Flow
3084; GFX8-NEXT:    s_or_b64 exec, exec, s[6:7]
3085; GFX8-NEXT:    ; implicit-def: $vgpr0_vgpr1
3086; GFX8-NEXT:    ; implicit-def: $vgpr4_vgpr5
3087; GFX8-NEXT:  .LBB18_4: ; %Flow2
3088; GFX8-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
3089; GFX8-NEXT:    s_cbranch_execz .LBB18_6
3090; GFX8-NEXT:  ; %bb.5: ; %atomicrmw.private
3091; GFX8-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
3092; GFX8-NEXT:    v_cndmask_b32_e32 v6, -1, v0, vcc
3093; GFX8-NEXT:    v_add_u32_e32 v7, vcc, 4, v6
3094; GFX8-NEXT:    buffer_load_dword v2, v6, s[0:3], 0 offen
3095; GFX8-NEXT:    buffer_load_dword v3, v7, s[0:3], 0 offen
3096; GFX8-NEXT:    s_waitcnt vmcnt(0)
3097; GFX8-NEXT:    v_max_f64 v[0:1], v[2:3], v[2:3]
3098; GFX8-NEXT:    v_min_f64 v[0:1], v[0:1], v[4:5]
3099; GFX8-NEXT:    buffer_store_dword v0, v6, s[0:3], 0 offen
3100; GFX8-NEXT:    buffer_store_dword v1, v7, s[0:3], 0 offen
3101; GFX8-NEXT:  .LBB18_6: ; %atomicrmw.phi
3102; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
3103; GFX8-NEXT:    v_mov_b32_e32 v0, v2
3104; GFX8-NEXT:    v_mov_b32_e32 v1, v3
3105; GFX8-NEXT:    s_waitcnt vmcnt(0)
3106; GFX8-NEXT:    s_setpc_b64 s[30:31]
3107;
3108; GFX7-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory:
3109; GFX7:       ; %bb.0:
3110; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3111; GFX7-NEXT:    s_mov_b64 s[4:5], 0xc0
3112; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x0
3113; GFX7-NEXT:    v_mov_b32_e32 v5, v1
3114; GFX7-NEXT:    v_mov_b32_e32 v4, v0
3115; GFX7-NEXT:    ; implicit-def: $vgpr0_vgpr1
3116; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
3117; GFX7-NEXT:    v_cmp_ne_u32_e32 vcc, s4, v5
3118; GFX7-NEXT:    s_and_saveexec_b64 s[4:5], vcc
3119; GFX7-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
3120; GFX7-NEXT:    s_cbranch_execnz .LBB18_3
3121; GFX7-NEXT:  ; %bb.1: ; %Flow
3122; GFX7-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
3123; GFX7-NEXT:    s_cbranch_execnz .LBB18_4
3124; GFX7-NEXT:  .LBB18_2: ; %atomicrmw.phi
3125; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
3126; GFX7-NEXT:    s_setpc_b64 s[30:31]
3127; GFX7-NEXT:  .LBB18_3: ; %atomicrmw.global
3128; GFX7-NEXT:    flat_atomic_fmin_x2 v[0:1], v[4:5], v[2:3] glc
3129; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3130; GFX7-NEXT:    buffer_wbinvl1
3131; GFX7-NEXT:    ; implicit-def: $vgpr4_vgpr5
3132; GFX7-NEXT:    ; implicit-def: $vgpr2_vgpr3
3133; GFX7-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
3134; GFX7-NEXT:    s_cbranch_execz .LBB18_2
3135; GFX7-NEXT:  .LBB18_4: ; %atomicrmw.private
3136; GFX7-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[4:5]
3137; GFX7-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
3138; GFX7-NEXT:    v_cndmask_b32_e32 v6, -1, v4, vcc
3139; GFX7-NEXT:    v_add_i32_e32 v7, vcc, 4, v6
3140; GFX7-NEXT:    buffer_load_dword v0, v6, s[0:3], 0 offen
3141; GFX7-NEXT:    buffer_load_dword v1, v7, s[0:3], 0 offen
3142; GFX7-NEXT:    s_waitcnt vmcnt(0)
3143; GFX7-NEXT:    v_max_f64 v[4:5], v[0:1], v[0:1]
3144; GFX7-NEXT:    v_min_f64 v[2:3], v[4:5], v[2:3]
3145; GFX7-NEXT:    buffer_store_dword v2, v6, s[0:3], 0 offen
3146; GFX7-NEXT:    buffer_store_dword v3, v7, s[0:3], 0 offen
3147; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
3148; GFX7-NEXT:    s_waitcnt vmcnt(0)
3149; GFX7-NEXT:    s_setpc_b64 s[30:31]
; The atomic operates directly on %ptr (no getelementptr) and its returned
; value is passed straight back to the caller.
3150  %result = atomicrmw fmin ptr %ptr, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
3151  ret double %result
3152}
3153
3154define double @flat_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory(ptr %ptr, double %val) #0 {
3155; GFX12-LABEL: flat_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory:
3156; GFX12:       ; %bb.0:
3157; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
3158; GFX12-NEXT:    s_wait_expcnt 0x0
3159; GFX12-NEXT:    s_wait_samplecnt 0x0
3160; GFX12-NEXT:    s_wait_bvhcnt 0x0
3161; GFX12-NEXT:    s_wait_kmcnt 0x0
3162; GFX12-NEXT:    v_max_num_f64_e32 v[2:3], v[2:3], v[2:3]
3163; GFX12-NEXT:    v_add_co_u32 v4, vcc_lo, 0x7f8, v0
3164; GFX12-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo
3165; GFX12-NEXT:    s_mov_b64 s[0:1], src_private_base
3166; GFX12-NEXT:    s_mov_b32 s0, exec_lo
3167; GFX12-NEXT:    ; implicit-def: $vgpr0_vgpr1
3168; GFX12-NEXT:    s_wait_alu 0xfffe
3169; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
3170; GFX12-NEXT:    v_cmpx_ne_u32_e64 s1, v5
3171; GFX12-NEXT:    s_xor_b32 s0, exec_lo, s0
3172; GFX12-NEXT:    s_cbranch_execnz .LBB19_3
3173; GFX12-NEXT:  ; %bb.1: ; %Flow2
3174; GFX12-NEXT:    s_wait_alu 0xfffe
3175; GFX12-NEXT:    s_and_not1_saveexec_b32 s0, s0
3176; GFX12-NEXT:    s_cbranch_execnz .LBB19_6
3177; GFX12-NEXT:  .LBB19_2: ; %atomicrmw.phi
3178; GFX12-NEXT:    s_wait_alu 0xfffe
3179; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
3180; GFX12-NEXT:    s_wait_alu 0xfffe
3181; GFX12-NEXT:    s_setpc_b64 s[30:31]
3182; GFX12-NEXT:  .LBB19_3: ; %atomicrmw.global
3183; GFX12-NEXT:    flat_load_b64 v[0:1], v[4:5]
3184; GFX12-NEXT:    s_mov_b32 s1, 0
3185; GFX12-NEXT:  .LBB19_4: ; %atomicrmw.start
3186; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
3187; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
3188; GFX12-NEXT:    v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0
3189; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3190; GFX12-NEXT:    v_max_num_f64_e32 v[0:1], v[8:9], v[8:9]
3191; GFX12-NEXT:    v_min_num_f64_e32 v[6:7], v[0:1], v[2:3]
3192; GFX12-NEXT:    s_wait_storecnt 0x0
3193; GFX12-NEXT:    flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[6:9] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
3194; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
3195; GFX12-NEXT:    global_inv scope:SCOPE_DEV
3196; GFX12-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9]
3197; GFX12-NEXT:    s_wait_alu 0xfffe
3198; GFX12-NEXT:    s_or_b32 s1, vcc_lo, s1
3199; GFX12-NEXT:    s_wait_alu 0xfffe
3200; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
3201; GFX12-NEXT:    s_cbranch_execnz .LBB19_4
3202; GFX12-NEXT:  ; %bb.5: ; %Flow
3203; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s1
3204; GFX12-NEXT:    ; implicit-def: $vgpr4_vgpr5
3205; GFX12-NEXT:    ; implicit-def: $vgpr2_vgpr3
3206; GFX12-NEXT:    s_and_not1_saveexec_b32 s0, s0
3207; GFX12-NEXT:    s_cbranch_execz .LBB19_2
3208; GFX12-NEXT:  .LBB19_6: ; %atomicrmw.private
3209; GFX12-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
3210; GFX12-NEXT:    v_cndmask_b32_e32 v6, -1, v4, vcc_lo
3211; GFX12-NEXT:    scratch_load_b64 v[0:1], v6, off
3212; GFX12-NEXT:    s_wait_loadcnt 0x0
3213; GFX12-NEXT:    v_max_num_f64_e32 v[4:5], v[0:1], v[0:1]
3214; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
3215; GFX12-NEXT:    v_min_num_f64_e32 v[2:3], v[4:5], v[2:3]
3216; GFX12-NEXT:    scratch_store_b64 v6, v[2:3], off
3217; GFX12-NEXT:    s_wait_alu 0xfffe
3218; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
3219; GFX12-NEXT:    s_wait_alu 0xfffe
3220; GFX12-NEXT:    s_setpc_b64 s[30:31]
3221;
3222; GFX940-LABEL: flat_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory:
3223; GFX940:       ; %bb.0:
3224; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3225; GFX940-NEXT:    s_mov_b64 s[0:1], 0x7f8
3226; GFX940-NEXT:    v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
3227; GFX940-NEXT:    s_mov_b64 s[0:1], src_private_base
3228; GFX940-NEXT:    v_cmp_ne_u32_e32 vcc, s1, v5
3229; GFX940-NEXT:    ; implicit-def: $vgpr0_vgpr1
3230; GFX940-NEXT:    s_and_saveexec_b64 s[0:1], vcc
3231; GFX940-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
3232; GFX940-NEXT:    s_cbranch_execnz .LBB19_3
3233; GFX940-NEXT:  ; %bb.1: ; %Flow
3234; GFX940-NEXT:    s_andn2_saveexec_b64 s[0:1], s[0:1]
3235; GFX940-NEXT:    s_cbranch_execnz .LBB19_4
3236; GFX940-NEXT:  .LBB19_2: ; %atomicrmw.phi
3237; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
3238; GFX940-NEXT:    s_setpc_b64 s[30:31]
3239; GFX940-NEXT:  .LBB19_3: ; %atomicrmw.global
3240; GFX940-NEXT:    buffer_wbl2 sc1
3241; GFX940-NEXT:    flat_atomic_min_f64 v[0:1], v[4:5], v[2:3] sc0
3242; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3243; GFX940-NEXT:    buffer_inv sc1
3244; GFX940-NEXT:    ; implicit-def: $vgpr4_vgpr5
3245; GFX940-NEXT:    ; implicit-def: $vgpr2_vgpr3
3246; GFX940-NEXT:    s_andn2_saveexec_b64 s[0:1], s[0:1]
3247; GFX940-NEXT:    s_cbranch_execz .LBB19_2
3248; GFX940-NEXT:  .LBB19_4: ; %atomicrmw.private
3249; GFX940-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[4:5]
3250; GFX940-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
3251; GFX940-NEXT:    s_nop 0
3252; GFX940-NEXT:    v_cndmask_b32_e32 v6, -1, v4, vcc
3253; GFX940-NEXT:    scratch_load_dwordx2 v[0:1], v6, off
3254; GFX940-NEXT:    s_waitcnt vmcnt(0)
3255; GFX940-NEXT:    v_max_f64 v[4:5], v[0:1], v[0:1]
3256; GFX940-NEXT:    v_min_f64 v[2:3], v[4:5], v[2:3]
3257; GFX940-NEXT:    scratch_store_dwordx2 v6, v[2:3], off sc0 sc1
3258; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
3259; GFX940-NEXT:    s_waitcnt vmcnt(0)
3260; GFX940-NEXT:    s_setpc_b64 s[30:31]
3261;
3262; GFX11-LABEL: flat_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory:
3263; GFX11:       ; %bb.0:
3264; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3265; GFX11-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
3266; GFX11-NEXT:    v_add_co_u32 v4, vcc_lo, 0x7f8, v0
3267; GFX11-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo
3268; GFX11-NEXT:    s_mov_b64 s[0:1], src_private_base
3269; GFX11-NEXT:    s_mov_b32 s0, exec_lo
3270; GFX11-NEXT:    ; implicit-def: $vgpr0_vgpr1
3271; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
3272; GFX11-NEXT:    v_cmpx_ne_u32_e64 s1, v5
3273; GFX11-NEXT:    s_xor_b32 s0, exec_lo, s0
3274; GFX11-NEXT:    s_cbranch_execnz .LBB19_3
3275; GFX11-NEXT:  ; %bb.1: ; %Flow2
3276; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
3277; GFX11-NEXT:    s_cbranch_execnz .LBB19_6
3278; GFX11-NEXT:  .LBB19_2: ; %atomicrmw.phi
3279; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
3280; GFX11-NEXT:    s_setpc_b64 s[30:31]
3281; GFX11-NEXT:  .LBB19_3: ; %atomicrmw.global
3282; GFX11-NEXT:    flat_load_b64 v[0:1], v[4:5]
3283; GFX11-NEXT:    s_mov_b32 s1, 0
3284; GFX11-NEXT:  .LBB19_4: ; %atomicrmw.start
3285; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
3286; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3287; GFX11-NEXT:    v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0
3288; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3289; GFX11-NEXT:    v_max_f64 v[0:1], v[8:9], v[8:9]
3290; GFX11-NEXT:    v_min_f64 v[6:7], v[0:1], v[2:3]
3291; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
3292; GFX11-NEXT:    flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[6:9] glc
3293; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3294; GFX11-NEXT:    buffer_gl1_inv
3295; GFX11-NEXT:    buffer_gl0_inv
3296; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9]
3297; GFX11-NEXT:    s_or_b32 s1, vcc_lo, s1
3298; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
3299; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
3300; GFX11-NEXT:    s_cbranch_execnz .LBB19_4
3301; GFX11-NEXT:  ; %bb.5: ; %Flow
3302; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s1
3303; GFX11-NEXT:    ; implicit-def: $vgpr4_vgpr5
3304; GFX11-NEXT:    ; implicit-def: $vgpr2_vgpr3
3305; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
3306; GFX11-NEXT:    s_cbranch_execz .LBB19_2
3307; GFX11-NEXT:  .LBB19_6: ; %atomicrmw.private
3308; GFX11-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
3309; GFX11-NEXT:    v_cndmask_b32_e32 v6, -1, v4, vcc_lo
3310; GFX11-NEXT:    scratch_load_b64 v[0:1], v6, off
3311; GFX11-NEXT:    s_waitcnt vmcnt(0)
3312; GFX11-NEXT:    v_max_f64 v[4:5], v[0:1], v[0:1]
3313; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
3314; GFX11-NEXT:    v_min_f64 v[2:3], v[4:5], v[2:3]
3315; GFX11-NEXT:    scratch_store_b64 v6, v[2:3], off
3316; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
3317; GFX11-NEXT:    s_setpc_b64 s[30:31]
3318;
3319; GFX10-LABEL: flat_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory:
3320; GFX10:       ; %bb.0:
3321; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3322; GFX10-NEXT:    v_add_co_u32 v4, vcc_lo, 0x7f8, v0
3323; GFX10-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo
3324; GFX10-NEXT:    s_mov_b64 s[4:5], src_private_base
3325; GFX10-NEXT:    ; implicit-def: $vgpr0_vgpr1
3326; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, s5, v5
3327; GFX10-NEXT:    s_and_saveexec_b32 s4, vcc_lo
3328; GFX10-NEXT:    s_xor_b32 s4, exec_lo, s4
3329; GFX10-NEXT:    s_cbranch_execnz .LBB19_3
3330; GFX10-NEXT:  ; %bb.1: ; %Flow
3331; GFX10-NEXT:    s_andn2_saveexec_b32 s4, s4
3332; GFX10-NEXT:    s_cbranch_execnz .LBB19_4
3333; GFX10-NEXT:  .LBB19_2: ; %atomicrmw.phi
3334; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
3335; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
3336; GFX10-NEXT:    s_setpc_b64 s[30:31]
3337; GFX10-NEXT:  .LBB19_3: ; %atomicrmw.global
3338; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
3339; GFX10-NEXT:    flat_atomic_fmin_x2 v[0:1], v[4:5], v[2:3] glc
3340; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3341; GFX10-NEXT:    buffer_gl1_inv
3342; GFX10-NEXT:    buffer_gl0_inv
3343; GFX10-NEXT:    ; implicit-def: $vgpr4_vgpr5
3344; GFX10-NEXT:    ; implicit-def: $vgpr2_vgpr3
3345; GFX10-NEXT:    s_andn2_saveexec_b32 s4, s4
3346; GFX10-NEXT:    s_cbranch_execz .LBB19_2
3347; GFX10-NEXT:  .LBB19_4: ; %atomicrmw.private
3348; GFX10-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
3349; GFX10-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
3350; GFX10-NEXT:    v_cndmask_b32_e32 v6, -1, v4, vcc_lo
3351; GFX10-NEXT:    s_clause 0x1
3352; GFX10-NEXT:    buffer_load_dword v0, v6, s[0:3], 0 offen
3353; GFX10-NEXT:    buffer_load_dword v1, v6, s[0:3], 0 offen offset:4
3354; GFX10-NEXT:    s_waitcnt vmcnt(0)
3355; GFX10-NEXT:    v_max_f64 v[4:5], v[0:1], v[0:1]
3356; GFX10-NEXT:    v_min_f64 v[2:3], v[4:5], v[2:3]
3357; GFX10-NEXT:    buffer_store_dword v2, v6, s[0:3], 0 offen
3358; GFX10-NEXT:    buffer_store_dword v3, v6, s[0:3], 0 offen offset:4
3359; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
3360; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
3361; GFX10-NEXT:    s_setpc_b64 s[30:31]
3362;
3363; GFX90A-LABEL: flat_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory:
3364; GFX90A:       ; %bb.0:
3365; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3366; GFX90A-NEXT:    v_add_co_u32_e32 v4, vcc, 0x7f8, v0
3367; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
3368; GFX90A-NEXT:    s_mov_b64 s[4:5], src_private_base
3369; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v5
3370; GFX90A-NEXT:    ; implicit-def: $vgpr0_vgpr1
3371; GFX90A-NEXT:    s_and_saveexec_b64 s[4:5], vcc
3372; GFX90A-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
3373; GFX90A-NEXT:    s_cbranch_execnz .LBB19_3
3374; GFX90A-NEXT:  ; %bb.1: ; %Flow
3375; GFX90A-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
3376; GFX90A-NEXT:    s_cbranch_execnz .LBB19_4
3377; GFX90A-NEXT:  .LBB19_2: ; %atomicrmw.phi
3378; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
3379; GFX90A-NEXT:    s_setpc_b64 s[30:31]
3380; GFX90A-NEXT:  .LBB19_3: ; %atomicrmw.global
3381; GFX90A-NEXT:    flat_atomic_min_f64 v[0:1], v[4:5], v[2:3] glc
3382; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3383; GFX90A-NEXT:    buffer_wbinvl1
3384; GFX90A-NEXT:    ; implicit-def: $vgpr4_vgpr5
3385; GFX90A-NEXT:    ; implicit-def: $vgpr2_vgpr3
3386; GFX90A-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
3387; GFX90A-NEXT:    s_cbranch_execz .LBB19_2
3388; GFX90A-NEXT:  .LBB19_4: ; %atomicrmw.private
3389; GFX90A-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[4:5]
3390; GFX90A-NEXT:    v_cndmask_b32_e32 v6, -1, v4, vcc
3391; GFX90A-NEXT:    buffer_load_dword v0, v6, s[0:3], 0 offen
3392; GFX90A-NEXT:    buffer_load_dword v1, v6, s[0:3], 0 offen offset:4
3393; GFX90A-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
3394; GFX90A-NEXT:    s_waitcnt vmcnt(0)
3395; GFX90A-NEXT:    v_max_f64 v[4:5], v[0:1], v[0:1]
3396; GFX90A-NEXT:    v_min_f64 v[2:3], v[4:5], v[2:3]
3397; GFX90A-NEXT:    buffer_store_dword v2, v6, s[0:3], 0 offen
3398; GFX90A-NEXT:    buffer_store_dword v3, v6, s[0:3], 0 offen offset:4
3399; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
3400; GFX90A-NEXT:    s_waitcnt vmcnt(0)
3401; GFX90A-NEXT:    s_setpc_b64 s[30:31]
3402;
3403; GFX908-LABEL: flat_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory:
3404; GFX908:       ; %bb.0:
3405; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3406; GFX908-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
3407; GFX908-NEXT:    v_add_co_u32_e32 v4, vcc, 0x7f8, v0
3408; GFX908-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
3409; GFX908-NEXT:    s_mov_b64 s[4:5], src_private_base
3410; GFX908-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v5
3411; GFX908-NEXT:    ; implicit-def: $vgpr0_vgpr1
3412; GFX908-NEXT:    s_and_saveexec_b64 s[4:5], vcc
3413; GFX908-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
3414; GFX908-NEXT:    s_cbranch_execnz .LBB19_3
3415; GFX908-NEXT:  ; %bb.1: ; %Flow2
3416; GFX908-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
3417; GFX908-NEXT:    s_cbranch_execnz .LBB19_6
3418; GFX908-NEXT:  .LBB19_2: ; %atomicrmw.phi
3419; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
3420; GFX908-NEXT:    s_setpc_b64 s[30:31]
3421; GFX908-NEXT:  .LBB19_3: ; %atomicrmw.global
3422; GFX908-NEXT:    flat_load_dwordx2 v[0:1], v[4:5]
3423; GFX908-NEXT:    s_mov_b64 s[6:7], 0
3424; GFX908-NEXT:  .LBB19_4: ; %atomicrmw.start
3425; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
3426; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3427; GFX908-NEXT:    v_mov_b32_e32 v9, v1
3428; GFX908-NEXT:    v_mov_b32_e32 v8, v0
3429; GFX908-NEXT:    v_max_f64 v[0:1], v[8:9], v[8:9]
3430; GFX908-NEXT:    v_min_f64 v[6:7], v[0:1], v[2:3]
3431; GFX908-NEXT:    flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
3432; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3433; GFX908-NEXT:    buffer_wbinvl1
3434; GFX908-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
3435; GFX908-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
3436; GFX908-NEXT:    s_andn2_b64 exec, exec, s[6:7]
3437; GFX908-NEXT:    s_cbranch_execnz .LBB19_4
3438; GFX908-NEXT:  ; %bb.5: ; %Flow
3439; GFX908-NEXT:    s_or_b64 exec, exec, s[6:7]
3440; GFX908-NEXT:    ; implicit-def: $vgpr4_vgpr5
3441; GFX908-NEXT:    ; implicit-def: $vgpr2_vgpr3
3442; GFX908-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
3443; GFX908-NEXT:    s_cbranch_execz .LBB19_2
3444; GFX908-NEXT:  .LBB19_6: ; %atomicrmw.private
3445; GFX908-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[4:5]
3446; GFX908-NEXT:    v_cndmask_b32_e32 v6, -1, v4, vcc
3447; GFX908-NEXT:    buffer_load_dword v0, v6, s[0:3], 0 offen
3448; GFX908-NEXT:    buffer_load_dword v1, v6, s[0:3], 0 offen offset:4
3449; GFX908-NEXT:    s_waitcnt vmcnt(0)
3450; GFX908-NEXT:    v_max_f64 v[4:5], v[0:1], v[0:1]
3451; GFX908-NEXT:    v_min_f64 v[2:3], v[4:5], v[2:3]
3452; GFX908-NEXT:    buffer_store_dword v2, v6, s[0:3], 0 offen
3453; GFX908-NEXT:    buffer_store_dword v3, v6, s[0:3], 0 offen offset:4
3454; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
3455; GFX908-NEXT:    s_waitcnt vmcnt(0)
3456; GFX908-NEXT:    s_setpc_b64 s[30:31]
3457;
3458; GFX8-LABEL: flat_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory:
3459; GFX8:       ; %bb.0:
3460; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3461; GFX8-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
3462; GFX8-NEXT:    s_mov_b64 s[4:5], 0xc0
3463; GFX8-NEXT:    s_load_dword s4, s[4:5], 0x0
3464; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 0x7f8, v0
3465; GFX8-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
3466; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3467; GFX8-NEXT:    v_cmp_ne_u32_e32 vcc, s4, v5
3468; GFX8-NEXT:    ; implicit-def: $vgpr0_vgpr1
3469; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
3470; GFX8-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
3471; GFX8-NEXT:    s_cbranch_execnz .LBB19_3
3472; GFX8-NEXT:  ; %bb.1: ; %Flow2
3473; GFX8-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
3474; GFX8-NEXT:    s_cbranch_execnz .LBB19_6
3475; GFX8-NEXT:  .LBB19_2: ; %atomicrmw.phi
3476; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
3477; GFX8-NEXT:    s_setpc_b64 s[30:31]
3478; GFX8-NEXT:  .LBB19_3: ; %atomicrmw.global
3479; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 4, v4
3480; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v5, vcc
3481; GFX8-NEXT:    flat_load_dword v1, v[0:1]
3482; GFX8-NEXT:    flat_load_dword v0, v[4:5]
3483; GFX8-NEXT:    s_mov_b64 s[6:7], 0
3484; GFX8-NEXT:  .LBB19_4: ; %atomicrmw.start
3485; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
3486; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3487; GFX8-NEXT:    v_mov_b32_e32 v9, v1
3488; GFX8-NEXT:    v_mov_b32_e32 v8, v0
3489; GFX8-NEXT:    v_max_f64 v[0:1], v[8:9], v[8:9]
3490; GFX8-NEXT:    v_min_f64 v[6:7], v[0:1], v[2:3]
3491; GFX8-NEXT:    flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
3492; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3493; GFX8-NEXT:    buffer_wbinvl1
3494; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
3495; GFX8-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
3496; GFX8-NEXT:    s_andn2_b64 exec, exec, s[6:7]
3497; GFX8-NEXT:    s_cbranch_execnz .LBB19_4
3498; GFX8-NEXT:  ; %bb.5: ; %Flow
3499; GFX8-NEXT:    s_or_b64 exec, exec, s[6:7]
3500; GFX8-NEXT:    ; implicit-def: $vgpr4_vgpr5
3501; GFX8-NEXT:    ; implicit-def: $vgpr2_vgpr3
3502; GFX8-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
3503; GFX8-NEXT:    s_cbranch_execz .LBB19_2
3504; GFX8-NEXT:  .LBB19_6: ; %atomicrmw.private
3505; GFX8-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[4:5]
3506; GFX8-NEXT:    v_cndmask_b32_e32 v6, -1, v4, vcc
3507; GFX8-NEXT:    v_add_u32_e32 v7, vcc, 4, v6
3508; GFX8-NEXT:    buffer_load_dword v0, v6, s[0:3], 0 offen
3509; GFX8-NEXT:    buffer_load_dword v1, v7, s[0:3], 0 offen
3510; GFX8-NEXT:    s_waitcnt vmcnt(0)
3511; GFX8-NEXT:    v_max_f64 v[4:5], v[0:1], v[0:1]
3512; GFX8-NEXT:    v_min_f64 v[2:3], v[4:5], v[2:3]
3513; GFX8-NEXT:    buffer_store_dword v2, v6, s[0:3], 0 offen
3514; GFX8-NEXT:    buffer_store_dword v3, v7, s[0:3], 0 offen
3515; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
3516; GFX8-NEXT:    s_waitcnt vmcnt(0)
3517; GFX8-NEXT:    s_setpc_b64 s[30:31]
3518;
3519; GFX7-LABEL: flat_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory:
3520; GFX7:       ; %bb.0:
3521; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3522; GFX7-NEXT:    s_mov_b64 s[4:5], 0xc0
3523; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x0
3524; GFX7-NEXT:    v_add_i32_e32 v4, vcc, 0x7f8, v0
3525; GFX7-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
3526; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
3527; GFX7-NEXT:    v_cmp_ne_u32_e32 vcc, s4, v5
3528; GFX7-NEXT:    ; implicit-def: $vgpr0_vgpr1
3529; GFX7-NEXT:    s_and_saveexec_b64 s[4:5], vcc
3530; GFX7-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
3531; GFX7-NEXT:    s_cbranch_execnz .LBB19_3
3532; GFX7-NEXT:  ; %bb.1: ; %Flow
3533; GFX7-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
3534; GFX7-NEXT:    s_cbranch_execnz .LBB19_4
3535; GFX7-NEXT:  .LBB19_2: ; %atomicrmw.phi
3536; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
3537; GFX7-NEXT:    s_setpc_b64 s[30:31]
3538; GFX7-NEXT:  .LBB19_3: ; %atomicrmw.global
3539; GFX7-NEXT:    flat_atomic_fmin_x2 v[0:1], v[4:5], v[2:3] glc
3540; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3541; GFX7-NEXT:    buffer_wbinvl1
3542; GFX7-NEXT:    ; implicit-def: $vgpr4_vgpr5
3543; GFX7-NEXT:    ; implicit-def: $vgpr2_vgpr3
3544; GFX7-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
3545; GFX7-NEXT:    s_cbranch_execz .LBB19_2
3546; GFX7-NEXT:  .LBB19_4: ; %atomicrmw.private
3547; GFX7-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[4:5]
3548; GFX7-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
3549; GFX7-NEXT:    v_cndmask_b32_e32 v6, -1, v4, vcc
3550; GFX7-NEXT:    v_add_i32_e32 v7, vcc, 4, v6
3551; GFX7-NEXT:    buffer_load_dword v0, v6, s[0:3], 0 offen
3552; GFX7-NEXT:    buffer_load_dword v1, v7, s[0:3], 0 offen
3553; GFX7-NEXT:    s_waitcnt vmcnt(0)
3554; GFX7-NEXT:    v_max_f64 v[4:5], v[0:1], v[0:1]
3555; GFX7-NEXT:    v_min_f64 v[2:3], v[4:5], v[2:3]
3556; GFX7-NEXT:    buffer_store_dword v2, v6, s[0:3], 0 offen
3557; GFX7-NEXT:    buffer_store_dword v3, v7, s[0:3], 0 offen
3558; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
3559; GFX7-NEXT:    s_waitcnt vmcnt(0)
3560; GFX7-NEXT:    s_setpc_b64 s[30:31]
3561  %gep = getelementptr double, ptr %ptr, i64 255
3562  %result = atomicrmw fmin ptr %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
3563  ret double %result
3564}
3565
3566define double @flat_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory(ptr %ptr, double %val) #0 {
3567; GFX12-LABEL: flat_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory:
3568; GFX12:       ; %bb.0:
3569; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
3570; GFX12-NEXT:    s_wait_expcnt 0x0
3571; GFX12-NEXT:    s_wait_samplecnt 0x0
3572; GFX12-NEXT:    s_wait_bvhcnt 0x0
3573; GFX12-NEXT:    s_wait_kmcnt 0x0
3574; GFX12-NEXT:    v_max_num_f64_e32 v[2:3], v[2:3], v[2:3]
3575; GFX12-NEXT:    v_add_co_u32 v4, vcc_lo, 0xfffff800, v0
3576; GFX12-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, -1, v1, vcc_lo
3577; GFX12-NEXT:    s_mov_b64 s[0:1], src_private_base
3578; GFX12-NEXT:    s_mov_b32 s0, exec_lo
3579; GFX12-NEXT:    ; implicit-def: $vgpr0_vgpr1
3580; GFX12-NEXT:    s_wait_alu 0xfffe
3581; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
3582; GFX12-NEXT:    v_cmpx_ne_u32_e64 s1, v5
3583; GFX12-NEXT:    s_xor_b32 s0, exec_lo, s0
3584; GFX12-NEXT:    s_cbranch_execnz .LBB20_3
3585; GFX12-NEXT:  ; %bb.1: ; %Flow2
3586; GFX12-NEXT:    s_wait_alu 0xfffe
3587; GFX12-NEXT:    s_and_not1_saveexec_b32 s0, s0
3588; GFX12-NEXT:    s_cbranch_execnz .LBB20_6
3589; GFX12-NEXT:  .LBB20_2: ; %atomicrmw.phi
3590; GFX12-NEXT:    s_wait_alu 0xfffe
3591; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
3592; GFX12-NEXT:    s_wait_alu 0xfffe
3593; GFX12-NEXT:    s_setpc_b64 s[30:31]
3594; GFX12-NEXT:  .LBB20_3: ; %atomicrmw.global
3595; GFX12-NEXT:    flat_load_b64 v[0:1], v[4:5]
3596; GFX12-NEXT:    s_mov_b32 s1, 0
3597; GFX12-NEXT:  .LBB20_4: ; %atomicrmw.start
3598; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
3599; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
3600; GFX12-NEXT:    v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0
3601; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3602; GFX12-NEXT:    v_max_num_f64_e32 v[0:1], v[8:9], v[8:9]
3603; GFX12-NEXT:    v_min_num_f64_e32 v[6:7], v[0:1], v[2:3]
3604; GFX12-NEXT:    s_wait_storecnt 0x0
3605; GFX12-NEXT:    flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[6:9] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
3606; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
3607; GFX12-NEXT:    global_inv scope:SCOPE_DEV
3608; GFX12-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9]
3609; GFX12-NEXT:    s_wait_alu 0xfffe
3610; GFX12-NEXT:    s_or_b32 s1, vcc_lo, s1
3611; GFX12-NEXT:    s_wait_alu 0xfffe
3612; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
3613; GFX12-NEXT:    s_cbranch_execnz .LBB20_4
3614; GFX12-NEXT:  ; %bb.5: ; %Flow
3615; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s1
3616; GFX12-NEXT:    ; implicit-def: $vgpr4_vgpr5
3617; GFX12-NEXT:    ; implicit-def: $vgpr2_vgpr3
3618; GFX12-NEXT:    s_and_not1_saveexec_b32 s0, s0
3619; GFX12-NEXT:    s_cbranch_execz .LBB20_2
3620; GFX12-NEXT:  .LBB20_6: ; %atomicrmw.private
3621; GFX12-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
3622; GFX12-NEXT:    v_cndmask_b32_e32 v6, -1, v4, vcc_lo
3623; GFX12-NEXT:    scratch_load_b64 v[0:1], v6, off
3624; GFX12-NEXT:    s_wait_loadcnt 0x0
3625; GFX12-NEXT:    v_max_num_f64_e32 v[4:5], v[0:1], v[0:1]
3626; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
3627; GFX12-NEXT:    v_min_num_f64_e32 v[2:3], v[4:5], v[2:3]
3628; GFX12-NEXT:    scratch_store_b64 v6, v[2:3], off
3629; GFX12-NEXT:    s_wait_alu 0xfffe
3630; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
3631; GFX12-NEXT:    s_wait_alu 0xfffe
3632; GFX12-NEXT:    s_setpc_b64 s[30:31]
3633;
3634; GFX940-LABEL: flat_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory:
3635; GFX940:       ; %bb.0:
3636; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3637; GFX940-NEXT:    s_movk_i32 s0, 0xf800
3638; GFX940-NEXT:    s_mov_b32 s1, -1
3639; GFX940-NEXT:    v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
3640; GFX940-NEXT:    s_mov_b64 s[0:1], src_private_base
3641; GFX940-NEXT:    v_cmp_ne_u32_e32 vcc, s1, v5
3642; GFX940-NEXT:    ; implicit-def: $vgpr0_vgpr1
3643; GFX940-NEXT:    s_and_saveexec_b64 s[0:1], vcc
3644; GFX940-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
3645; GFX940-NEXT:    s_cbranch_execnz .LBB20_3
3646; GFX940-NEXT:  ; %bb.1: ; %Flow
3647; GFX940-NEXT:    s_andn2_saveexec_b64 s[0:1], s[0:1]
3648; GFX940-NEXT:    s_cbranch_execnz .LBB20_4
3649; GFX940-NEXT:  .LBB20_2: ; %atomicrmw.phi
3650; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
3651; GFX940-NEXT:    s_setpc_b64 s[30:31]
3652; GFX940-NEXT:  .LBB20_3: ; %atomicrmw.global
3653; GFX940-NEXT:    buffer_wbl2 sc1
3654; GFX940-NEXT:    flat_atomic_min_f64 v[0:1], v[4:5], v[2:3] sc0
3655; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3656; GFX940-NEXT:    buffer_inv sc1
3657; GFX940-NEXT:    ; implicit-def: $vgpr4_vgpr5
3658; GFX940-NEXT:    ; implicit-def: $vgpr2_vgpr3
3659; GFX940-NEXT:    s_andn2_saveexec_b64 s[0:1], s[0:1]
3660; GFX940-NEXT:    s_cbranch_execz .LBB20_2
3661; GFX940-NEXT:  .LBB20_4: ; %atomicrmw.private
3662; GFX940-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[4:5]
3663; GFX940-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
3664; GFX940-NEXT:    s_nop 0
3665; GFX940-NEXT:    v_cndmask_b32_e32 v6, -1, v4, vcc
3666; GFX940-NEXT:    scratch_load_dwordx2 v[0:1], v6, off
3667; GFX940-NEXT:    s_waitcnt vmcnt(0)
3668; GFX940-NEXT:    v_max_f64 v[4:5], v[0:1], v[0:1]
3669; GFX940-NEXT:    v_min_f64 v[2:3], v[4:5], v[2:3]
3670; GFX940-NEXT:    scratch_store_dwordx2 v6, v[2:3], off sc0 sc1
3671; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
3672; GFX940-NEXT:    s_waitcnt vmcnt(0)
3673; GFX940-NEXT:    s_setpc_b64 s[30:31]
3674;
3675; GFX11-LABEL: flat_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory:
3676; GFX11:       ; %bb.0:
3677; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3678; GFX11-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
3679; GFX11-NEXT:    v_add_co_u32 v4, vcc_lo, 0xfffff800, v0
3680; GFX11-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, -1, v1, vcc_lo
3681; GFX11-NEXT:    s_mov_b64 s[0:1], src_private_base
3682; GFX11-NEXT:    s_mov_b32 s0, exec_lo
3683; GFX11-NEXT:    ; implicit-def: $vgpr0_vgpr1
3684; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
3685; GFX11-NEXT:    v_cmpx_ne_u32_e64 s1, v5
3686; GFX11-NEXT:    s_xor_b32 s0, exec_lo, s0
3687; GFX11-NEXT:    s_cbranch_execnz .LBB20_3
3688; GFX11-NEXT:  ; %bb.1: ; %Flow2
3689; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
3690; GFX11-NEXT:    s_cbranch_execnz .LBB20_6
3691; GFX11-NEXT:  .LBB20_2: ; %atomicrmw.phi
3692; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
3693; GFX11-NEXT:    s_setpc_b64 s[30:31]
3694; GFX11-NEXT:  .LBB20_3: ; %atomicrmw.global
3695; GFX11-NEXT:    flat_load_b64 v[0:1], v[4:5]
3696; GFX11-NEXT:    s_mov_b32 s1, 0
3697; GFX11-NEXT:  .LBB20_4: ; %atomicrmw.start
3698; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
3699; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3700; GFX11-NEXT:    v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0
3701; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3702; GFX11-NEXT:    v_max_f64 v[0:1], v[8:9], v[8:9]
3703; GFX11-NEXT:    v_min_f64 v[6:7], v[0:1], v[2:3]
3704; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
3705; GFX11-NEXT:    flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[6:9] glc
3706; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3707; GFX11-NEXT:    buffer_gl1_inv
3708; GFX11-NEXT:    buffer_gl0_inv
3709; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9]
3710; GFX11-NEXT:    s_or_b32 s1, vcc_lo, s1
3711; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
3712; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
3713; GFX11-NEXT:    s_cbranch_execnz .LBB20_4
3714; GFX11-NEXT:  ; %bb.5: ; %Flow
3715; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s1
3716; GFX11-NEXT:    ; implicit-def: $vgpr4_vgpr5
3717; GFX11-NEXT:    ; implicit-def: $vgpr2_vgpr3
3718; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
3719; GFX11-NEXT:    s_cbranch_execz .LBB20_2
3720; GFX11-NEXT:  .LBB20_6: ; %atomicrmw.private
3721; GFX11-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
3722; GFX11-NEXT:    v_cndmask_b32_e32 v6, -1, v4, vcc_lo
3723; GFX11-NEXT:    scratch_load_b64 v[0:1], v6, off
3724; GFX11-NEXT:    s_waitcnt vmcnt(0)
3725; GFX11-NEXT:    v_max_f64 v[4:5], v[0:1], v[0:1]
3726; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
3727; GFX11-NEXT:    v_min_f64 v[2:3], v[4:5], v[2:3]
3728; GFX11-NEXT:    scratch_store_b64 v6, v[2:3], off
3729; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
3730; GFX11-NEXT:    s_setpc_b64 s[30:31]
3731;
3732; GFX10-LABEL: flat_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory:
3733; GFX10:       ; %bb.0:
3734; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3735; GFX10-NEXT:    v_add_co_u32 v4, vcc_lo, 0xfffff800, v0
3736; GFX10-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, -1, v1, vcc_lo
3737; GFX10-NEXT:    s_mov_b64 s[4:5], src_private_base
3738; GFX10-NEXT:    ; implicit-def: $vgpr0_vgpr1
3739; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, s5, v5
3740; GFX10-NEXT:    s_and_saveexec_b32 s4, vcc_lo
3741; GFX10-NEXT:    s_xor_b32 s4, exec_lo, s4
3742; GFX10-NEXT:    s_cbranch_execnz .LBB20_3
3743; GFX10-NEXT:  ; %bb.1: ; %Flow
3744; GFX10-NEXT:    s_andn2_saveexec_b32 s4, s4
3745; GFX10-NEXT:    s_cbranch_execnz .LBB20_4
3746; GFX10-NEXT:  .LBB20_2: ; %atomicrmw.phi
3747; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
3748; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
3749; GFX10-NEXT:    s_setpc_b64 s[30:31]
3750; GFX10-NEXT:  .LBB20_3: ; %atomicrmw.global
3751; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
3752; GFX10-NEXT:    flat_atomic_fmin_x2 v[0:1], v[4:5], v[2:3] glc
3753; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3754; GFX10-NEXT:    buffer_gl1_inv
3755; GFX10-NEXT:    buffer_gl0_inv
3756; GFX10-NEXT:    ; implicit-def: $vgpr4_vgpr5
3757; GFX10-NEXT:    ; implicit-def: $vgpr2_vgpr3
3758; GFX10-NEXT:    s_andn2_saveexec_b32 s4, s4
3759; GFX10-NEXT:    s_cbranch_execz .LBB20_2
3760; GFX10-NEXT:  .LBB20_4: ; %atomicrmw.private
3761; GFX10-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
3762; GFX10-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
3763; GFX10-NEXT:    v_cndmask_b32_e32 v6, -1, v4, vcc_lo
3764; GFX10-NEXT:    s_clause 0x1
3765; GFX10-NEXT:    buffer_load_dword v0, v6, s[0:3], 0 offen
3766; GFX10-NEXT:    buffer_load_dword v1, v6, s[0:3], 0 offen offset:4
3767; GFX10-NEXT:    s_waitcnt vmcnt(0)
3768; GFX10-NEXT:    v_max_f64 v[4:5], v[0:1], v[0:1]
3769; GFX10-NEXT:    v_min_f64 v[2:3], v[4:5], v[2:3]
3770; GFX10-NEXT:    buffer_store_dword v2, v6, s[0:3], 0 offen
3771; GFX10-NEXT:    buffer_store_dword v3, v6, s[0:3], 0 offen offset:4
3772; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
3773; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
3774; GFX10-NEXT:    s_setpc_b64 s[30:31]
3775;
3776; GFX90A-LABEL: flat_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory:
3777; GFX90A:       ; %bb.0:
3778; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3779; GFX90A-NEXT:    v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
3780; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, -1, v1, vcc
3781; GFX90A-NEXT:    s_mov_b64 s[4:5], src_private_base
3782; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v5
3783; GFX90A-NEXT:    ; implicit-def: $vgpr0_vgpr1
3784; GFX90A-NEXT:    s_and_saveexec_b64 s[4:5], vcc
3785; GFX90A-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
3786; GFX90A-NEXT:    s_cbranch_execnz .LBB20_3
3787; GFX90A-NEXT:  ; %bb.1: ; %Flow
3788; GFX90A-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
3789; GFX90A-NEXT:    s_cbranch_execnz .LBB20_4
3790; GFX90A-NEXT:  .LBB20_2: ; %atomicrmw.phi
3791; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
3792; GFX90A-NEXT:    s_setpc_b64 s[30:31]
3793; GFX90A-NEXT:  .LBB20_3: ; %atomicrmw.global
3794; GFX90A-NEXT:    flat_atomic_min_f64 v[0:1], v[4:5], v[2:3] glc
3795; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3796; GFX90A-NEXT:    buffer_wbinvl1
3797; GFX90A-NEXT:    ; implicit-def: $vgpr4_vgpr5
3798; GFX90A-NEXT:    ; implicit-def: $vgpr2_vgpr3
3799; GFX90A-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
3800; GFX90A-NEXT:    s_cbranch_execz .LBB20_2
3801; GFX90A-NEXT:  .LBB20_4: ; %atomicrmw.private
3802; GFX90A-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[4:5]
3803; GFX90A-NEXT:    v_cndmask_b32_e32 v6, -1, v4, vcc
3804; GFX90A-NEXT:    buffer_load_dword v0, v6, s[0:3], 0 offen
3805; GFX90A-NEXT:    buffer_load_dword v1, v6, s[0:3], 0 offen offset:4
3806; GFX90A-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
3807; GFX90A-NEXT:    s_waitcnt vmcnt(0)
3808; GFX90A-NEXT:    v_max_f64 v[4:5], v[0:1], v[0:1]
3809; GFX90A-NEXT:    v_min_f64 v[2:3], v[4:5], v[2:3]
3810; GFX90A-NEXT:    buffer_store_dword v2, v6, s[0:3], 0 offen
3811; GFX90A-NEXT:    buffer_store_dword v3, v6, s[0:3], 0 offen offset:4
3812; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
3813; GFX90A-NEXT:    s_waitcnt vmcnt(0)
3814; GFX90A-NEXT:    s_setpc_b64 s[30:31]
3815;
3816; GFX908-LABEL: flat_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory:
3817; GFX908:       ; %bb.0:
3818; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3819; GFX908-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
3820; GFX908-NEXT:    v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
3821; GFX908-NEXT:    v_addc_co_u32_e32 v5, vcc, -1, v1, vcc
3822; GFX908-NEXT:    s_mov_b64 s[4:5], src_private_base
3823; GFX908-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v5
3824; GFX908-NEXT:    ; implicit-def: $vgpr0_vgpr1
3825; GFX908-NEXT:    s_and_saveexec_b64 s[4:5], vcc
3826; GFX908-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
3827; GFX908-NEXT:    s_cbranch_execnz .LBB20_3
3828; GFX908-NEXT:  ; %bb.1: ; %Flow2
3829; GFX908-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
3830; GFX908-NEXT:    s_cbranch_execnz .LBB20_6
3831; GFX908-NEXT:  .LBB20_2: ; %atomicrmw.phi
3832; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
3833; GFX908-NEXT:    s_setpc_b64 s[30:31]
3834; GFX908-NEXT:  .LBB20_3: ; %atomicrmw.global
3835; GFX908-NEXT:    flat_load_dwordx2 v[0:1], v[4:5]
3836; GFX908-NEXT:    s_mov_b64 s[6:7], 0
3837; GFX908-NEXT:  .LBB20_4: ; %atomicrmw.start
3838; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
3839; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3840; GFX908-NEXT:    v_mov_b32_e32 v9, v1
3841; GFX908-NEXT:    v_mov_b32_e32 v8, v0
3842; GFX908-NEXT:    v_max_f64 v[0:1], v[8:9], v[8:9]
3843; GFX908-NEXT:    v_min_f64 v[6:7], v[0:1], v[2:3]
3844; GFX908-NEXT:    flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
3845; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3846; GFX908-NEXT:    buffer_wbinvl1
3847; GFX908-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
3848; GFX908-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
3849; GFX908-NEXT:    s_andn2_b64 exec, exec, s[6:7]
3850; GFX908-NEXT:    s_cbranch_execnz .LBB20_4
3851; GFX908-NEXT:  ; %bb.5: ; %Flow
3852; GFX908-NEXT:    s_or_b64 exec, exec, s[6:7]
3853; GFX908-NEXT:    ; implicit-def: $vgpr4_vgpr5
3854; GFX908-NEXT:    ; implicit-def: $vgpr2_vgpr3
3855; GFX908-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
3856; GFX908-NEXT:    s_cbranch_execz .LBB20_2
3857; GFX908-NEXT:  .LBB20_6: ; %atomicrmw.private
3858; GFX908-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[4:5]
3859; GFX908-NEXT:    v_cndmask_b32_e32 v6, -1, v4, vcc
3860; GFX908-NEXT:    buffer_load_dword v0, v6, s[0:3], 0 offen
3861; GFX908-NEXT:    buffer_load_dword v1, v6, s[0:3], 0 offen offset:4
3862; GFX908-NEXT:    s_waitcnt vmcnt(0)
3863; GFX908-NEXT:    v_max_f64 v[4:5], v[0:1], v[0:1]
3864; GFX908-NEXT:    v_min_f64 v[2:3], v[4:5], v[2:3]
3865; GFX908-NEXT:    buffer_store_dword v2, v6, s[0:3], 0 offen
3866; GFX908-NEXT:    buffer_store_dword v3, v6, s[0:3], 0 offen offset:4
3867; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
3868; GFX908-NEXT:    s_waitcnt vmcnt(0)
3869; GFX908-NEXT:    s_setpc_b64 s[30:31]
3870;
3871; GFX8-LABEL: flat_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory:
3872; GFX8:       ; %bb.0:
3873; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3874; GFX8-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
3875; GFX8-NEXT:    s_mov_b64 s[4:5], 0xc0
3876; GFX8-NEXT:    s_load_dword s4, s[4:5], 0x0
3877; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 0xfffff800, v0
3878; GFX8-NEXT:    v_addc_u32_e32 v5, vcc, -1, v1, vcc
3879; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3880; GFX8-NEXT:    v_cmp_ne_u32_e32 vcc, s4, v5
3881; GFX8-NEXT:    ; implicit-def: $vgpr0_vgpr1
3882; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
3883; GFX8-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
3884; GFX8-NEXT:    s_cbranch_execnz .LBB20_3
3885; GFX8-NEXT:  ; %bb.1: ; %Flow2
3886; GFX8-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
3887; GFX8-NEXT:    s_cbranch_execnz .LBB20_6
3888; GFX8-NEXT:  .LBB20_2: ; %atomicrmw.phi
3889; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
3890; GFX8-NEXT:    s_setpc_b64 s[30:31]
3891; GFX8-NEXT:  .LBB20_3: ; %atomicrmw.global
3892; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 4, v4
3893; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v5, vcc
3894; GFX8-NEXT:    flat_load_dword v1, v[0:1]
3895; GFX8-NEXT:    flat_load_dword v0, v[4:5]
3896; GFX8-NEXT:    s_mov_b64 s[6:7], 0
3897; GFX8-NEXT:  .LBB20_4: ; %atomicrmw.start
3898; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
3899; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3900; GFX8-NEXT:    v_mov_b32_e32 v9, v1
3901; GFX8-NEXT:    v_mov_b32_e32 v8, v0
3902; GFX8-NEXT:    v_max_f64 v[0:1], v[8:9], v[8:9]
3903; GFX8-NEXT:    v_min_f64 v[6:7], v[0:1], v[2:3]
3904; GFX8-NEXT:    flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
3905; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3906; GFX8-NEXT:    buffer_wbinvl1
3907; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
3908; GFX8-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
3909; GFX8-NEXT:    s_andn2_b64 exec, exec, s[6:7]
3910; GFX8-NEXT:    s_cbranch_execnz .LBB20_4
3911; GFX8-NEXT:  ; %bb.5: ; %Flow
3912; GFX8-NEXT:    s_or_b64 exec, exec, s[6:7]
3913; GFX8-NEXT:    ; implicit-def: $vgpr4_vgpr5
3914; GFX8-NEXT:    ; implicit-def: $vgpr2_vgpr3
3915; GFX8-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
3916; GFX8-NEXT:    s_cbranch_execz .LBB20_2
3917; GFX8-NEXT:  .LBB20_6: ; %atomicrmw.private
3918; GFX8-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[4:5]
3919; GFX8-NEXT:    v_cndmask_b32_e32 v6, -1, v4, vcc
3920; GFX8-NEXT:    v_add_u32_e32 v7, vcc, 4, v6
3921; GFX8-NEXT:    buffer_load_dword v0, v6, s[0:3], 0 offen
3922; GFX8-NEXT:    buffer_load_dword v1, v7, s[0:3], 0 offen
3923; GFX8-NEXT:    s_waitcnt vmcnt(0)
3924; GFX8-NEXT:    v_max_f64 v[4:5], v[0:1], v[0:1]
3925; GFX8-NEXT:    v_min_f64 v[2:3], v[4:5], v[2:3]
3926; GFX8-NEXT:    buffer_store_dword v2, v6, s[0:3], 0 offen
3927; GFX8-NEXT:    buffer_store_dword v3, v7, s[0:3], 0 offen
3928; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
3929; GFX8-NEXT:    s_waitcnt vmcnt(0)
3930; GFX8-NEXT:    s_setpc_b64 s[30:31]
3931;
3932; GFX7-LABEL: flat_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory:
3933; GFX7:       ; %bb.0:
3934; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3935; GFX7-NEXT:    s_mov_b64 s[4:5], 0xc0
3936; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x0
3937; GFX7-NEXT:    v_add_i32_e32 v4, vcc, 0xfffff800, v0
3938; GFX7-NEXT:    v_addc_u32_e32 v5, vcc, -1, v1, vcc
3939; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
3940; GFX7-NEXT:    v_cmp_ne_u32_e32 vcc, s4, v5
3941; GFX7-NEXT:    ; implicit-def: $vgpr0_vgpr1
3942; GFX7-NEXT:    s_and_saveexec_b64 s[4:5], vcc
3943; GFX7-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
3944; GFX7-NEXT:    s_cbranch_execnz .LBB20_3
3945; GFX7-NEXT:  ; %bb.1: ; %Flow
3946; GFX7-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
3947; GFX7-NEXT:    s_cbranch_execnz .LBB20_4
3948; GFX7-NEXT:  .LBB20_2: ; %atomicrmw.phi
3949; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
3950; GFX7-NEXT:    s_setpc_b64 s[30:31]
3951; GFX7-NEXT:  .LBB20_3: ; %atomicrmw.global
3952; GFX7-NEXT:    flat_atomic_fmin_x2 v[0:1], v[4:5], v[2:3] glc
3953; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3954; GFX7-NEXT:    buffer_wbinvl1
3955; GFX7-NEXT:    ; implicit-def: $vgpr4_vgpr5
3956; GFX7-NEXT:    ; implicit-def: $vgpr2_vgpr3
3957; GFX7-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
3958; GFX7-NEXT:    s_cbranch_execz .LBB20_2
3959; GFX7-NEXT:  .LBB20_4: ; %atomicrmw.private
3960; GFX7-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[4:5]
3961; GFX7-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
3962; GFX7-NEXT:    v_cndmask_b32_e32 v6, -1, v4, vcc
3963; GFX7-NEXT:    v_add_i32_e32 v7, vcc, 4, v6
3964; GFX7-NEXT:    buffer_load_dword v0, v6, s[0:3], 0 offen
3965; GFX7-NEXT:    buffer_load_dword v1, v7, s[0:3], 0 offen
3966; GFX7-NEXT:    s_waitcnt vmcnt(0)
3967; GFX7-NEXT:    v_max_f64 v[4:5], v[0:1], v[0:1]
3968; GFX7-NEXT:    v_min_f64 v[2:3], v[4:5], v[2:3]
3969; GFX7-NEXT:    buffer_store_dword v2, v6, s[0:3], 0 offen
3970; GFX7-NEXT:    buffer_store_dword v3, v7, s[0:3], 0 offen
3971; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
3972; GFX7-NEXT:    s_waitcnt vmcnt(0)
3973; GFX7-NEXT:    s_setpc_b64 s[30:31]
3974  %gep = getelementptr double, ptr %ptr, i64 -256
3975  %result = atomicrmw fmin ptr %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
3976  ret double %result
3977}
3978
; Agent-scope, seq_cst `atomicrmw fmin` of a double through a flat pointer,
; tagged with !amdgpu.no.fine.grained.memory, whose result is never used
; (the function returns void). In every target's expected output below the
; code splits into an %atomicrmw.global block and an %atomicrmw.private block:
;   * %atomicrmw.private - plain load, v_max_f64 + v_min_f64 (v_max_num_f64 +
;     v_min_num_f64 on gfx1200), then a plain store back to the same address.
;   * %atomicrmw.global  - a single FP-min flat atomic without a returned
;     value on gfx940, gfx90a, gfx1010 and hawaii; a flat cmpswap retry loop
;     (%atomicrmw.start) on gfx1200, gfx1100, gfx908 and tonga.
3979define void @flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(ptr %ptr, double %val) #0 {
3980; GFX12-LABEL: flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory:
3981; GFX12:       ; %bb.0:
3982; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
3983; GFX12-NEXT:    s_wait_expcnt 0x0
3984; GFX12-NEXT:    s_wait_samplecnt 0x0
3985; GFX12-NEXT:    s_wait_bvhcnt 0x0
3986; GFX12-NEXT:    s_wait_kmcnt 0x0
3987; GFX12-NEXT:    v_max_num_f64_e32 v[6:7], v[2:3], v[2:3]
3988; GFX12-NEXT:    s_mov_b64 s[0:1], src_private_base
3989; GFX12-NEXT:    s_mov_b32 s0, exec_lo
3990; GFX12-NEXT:    s_wait_alu 0xfffe
3991; GFX12-NEXT:    v_cmpx_ne_u32_e64 s1, v1
3992; GFX12-NEXT:    s_xor_b32 s0, exec_lo, s0
3993; GFX12-NEXT:    s_cbranch_execnz .LBB21_3
3994; GFX12-NEXT:  ; %bb.1: ; %Flow2
3995; GFX12-NEXT:    s_wait_alu 0xfffe
3996; GFX12-NEXT:    s_and_not1_saveexec_b32 s0, s0
3997; GFX12-NEXT:    s_cbranch_execnz .LBB21_6
3998; GFX12-NEXT:  .LBB21_2: ; %atomicrmw.phi
3999; GFX12-NEXT:    s_wait_alu 0xfffe
4000; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
4001; GFX12-NEXT:    s_wait_alu 0xfffe
4002; GFX12-NEXT:    s_setpc_b64 s[30:31]
4003; GFX12-NEXT:  .LBB21_3: ; %atomicrmw.global
4004; GFX12-NEXT:    flat_load_b64 v[4:5], v[0:1]
4005; GFX12-NEXT:    s_mov_b32 s1, 0
4006; GFX12-NEXT:  .LBB21_4: ; %atomicrmw.start
4007; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
4008; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
4009; GFX12-NEXT:    v_max_num_f64_e32 v[2:3], v[4:5], v[4:5]
4010; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
4011; GFX12-NEXT:    v_min_num_f64_e32 v[2:3], v[2:3], v[6:7]
4012; GFX12-NEXT:    s_wait_storecnt 0x0
4013; GFX12-NEXT:    flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
4014; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
4015; GFX12-NEXT:    global_inv scope:SCOPE_DEV
4016; GFX12-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
4017; GFX12-NEXT:    v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
4018; GFX12-NEXT:    s_wait_alu 0xfffe
4019; GFX12-NEXT:    s_or_b32 s1, vcc_lo, s1
4020; GFX12-NEXT:    s_wait_alu 0xfffe
4021; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
4022; GFX12-NEXT:    s_cbranch_execnz .LBB21_4
4023; GFX12-NEXT:  ; %bb.5: ; %Flow
4024; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s1
4025; GFX12-NEXT:    ; implicit-def: $vgpr0_vgpr1
4026; GFX12-NEXT:    ; implicit-def: $vgpr6_vgpr7
4027; GFX12-NEXT:    s_and_not1_saveexec_b32 s0, s0
4028; GFX12-NEXT:    s_cbranch_execz .LBB21_2
4029; GFX12-NEXT:  .LBB21_6: ; %atomicrmw.private
4030; GFX12-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
4031; GFX12-NEXT:    v_cndmask_b32_e32 v2, -1, v0, vcc_lo
4032; GFX12-NEXT:    scratch_load_b64 v[0:1], v2, off
4033; GFX12-NEXT:    s_wait_loadcnt 0x0
4034; GFX12-NEXT:    v_max_num_f64_e32 v[0:1], v[0:1], v[0:1]
4035; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
4036; GFX12-NEXT:    v_min_num_f64_e32 v[0:1], v[0:1], v[6:7]
4037; GFX12-NEXT:    scratch_store_b64 v2, v[0:1], off
4038; GFX12-NEXT:    s_wait_alu 0xfffe
4039; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
4040; GFX12-NEXT:    s_wait_alu 0xfffe
4041; GFX12-NEXT:    s_setpc_b64 s[30:31]
4042;
4043; GFX940-LABEL: flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory:
4044; GFX940:       ; %bb.0:
4045; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4046; GFX940-NEXT:    s_mov_b64 s[0:1], src_private_base
4047; GFX940-NEXT:    v_cmp_ne_u32_e32 vcc, s1, v1
4048; GFX940-NEXT:    s_and_saveexec_b64 s[0:1], vcc
4049; GFX940-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
4050; GFX940-NEXT:    s_cbranch_execnz .LBB21_3
4051; GFX940-NEXT:  ; %bb.1: ; %Flow
4052; GFX940-NEXT:    s_andn2_saveexec_b64 s[0:1], s[0:1]
4053; GFX940-NEXT:    s_cbranch_execnz .LBB21_4
4054; GFX940-NEXT:  .LBB21_2: ; %atomicrmw.phi
4055; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
4056; GFX940-NEXT:    s_setpc_b64 s[30:31]
4057; GFX940-NEXT:  .LBB21_3: ; %atomicrmw.global
4058; GFX940-NEXT:    buffer_wbl2 sc1
4059; GFX940-NEXT:    flat_atomic_min_f64 v[0:1], v[2:3]
4060; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4061; GFX940-NEXT:    buffer_inv sc1
4062; GFX940-NEXT:    ; implicit-def: $vgpr0_vgpr1
4063; GFX940-NEXT:    ; implicit-def: $vgpr2_vgpr3
4064; GFX940-NEXT:    s_andn2_saveexec_b64 s[0:1], s[0:1]
4065; GFX940-NEXT:    s_cbranch_execz .LBB21_2
4066; GFX940-NEXT:  .LBB21_4: ; %atomicrmw.private
4067; GFX940-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
4068; GFX940-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
4069; GFX940-NEXT:    s_nop 0
4070; GFX940-NEXT:    v_cndmask_b32_e32 v4, -1, v0, vcc
4071; GFX940-NEXT:    scratch_load_dwordx2 v[0:1], v4, off
4072; GFX940-NEXT:    s_waitcnt vmcnt(0)
4073; GFX940-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
4074; GFX940-NEXT:    v_min_f64 v[0:1], v[0:1], v[2:3]
4075; GFX940-NEXT:    scratch_store_dwordx2 v4, v[0:1], off sc0 sc1
4076; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
4077; GFX940-NEXT:    s_waitcnt vmcnt(0)
4078; GFX940-NEXT:    s_setpc_b64 s[30:31]
4079;
4080; GFX11-LABEL: flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory:
4081; GFX11:       ; %bb.0:
4082; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4083; GFX11-NEXT:    v_max_f64 v[6:7], v[2:3], v[2:3]
4084; GFX11-NEXT:    s_mov_b64 s[0:1], src_private_base
4085; GFX11-NEXT:    s_mov_b32 s0, exec_lo
4086; GFX11-NEXT:    v_cmpx_ne_u32_e64 s1, v1
4087; GFX11-NEXT:    s_xor_b32 s0, exec_lo, s0
4088; GFX11-NEXT:    s_cbranch_execnz .LBB21_3
4089; GFX11-NEXT:  ; %bb.1: ; %Flow2
4090; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
4091; GFX11-NEXT:    s_cbranch_execnz .LBB21_6
4092; GFX11-NEXT:  .LBB21_2: ; %atomicrmw.phi
4093; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
4094; GFX11-NEXT:    s_setpc_b64 s[30:31]
4095; GFX11-NEXT:  .LBB21_3: ; %atomicrmw.global
4096; GFX11-NEXT:    flat_load_b64 v[4:5], v[0:1]
4097; GFX11-NEXT:    s_mov_b32 s1, 0
4098; GFX11-NEXT:  .LBB21_4: ; %atomicrmw.start
4099; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
4100; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4101; GFX11-NEXT:    v_max_f64 v[2:3], v[4:5], v[4:5]
4102; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
4103; GFX11-NEXT:    v_min_f64 v[2:3], v[2:3], v[6:7]
4104; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
4105; GFX11-NEXT:    flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5] glc
4106; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4107; GFX11-NEXT:    buffer_gl1_inv
4108; GFX11-NEXT:    buffer_gl0_inv
4109; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
4110; GFX11-NEXT:    v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
4111; GFX11-NEXT:    s_or_b32 s1, vcc_lo, s1
4112; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
4113; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
4114; GFX11-NEXT:    s_cbranch_execnz .LBB21_4
4115; GFX11-NEXT:  ; %bb.5: ; %Flow
4116; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s1
4117; GFX11-NEXT:    ; implicit-def: $vgpr0_vgpr1
4118; GFX11-NEXT:    ; implicit-def: $vgpr6_vgpr7
4119; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
4120; GFX11-NEXT:    s_cbranch_execz .LBB21_2
4121; GFX11-NEXT:  .LBB21_6: ; %atomicrmw.private
4122; GFX11-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
4123; GFX11-NEXT:    v_cndmask_b32_e32 v2, -1, v0, vcc_lo
4124; GFX11-NEXT:    scratch_load_b64 v[0:1], v2, off
4125; GFX11-NEXT:    s_waitcnt vmcnt(0)
4126; GFX11-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
4127; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
4128; GFX11-NEXT:    v_min_f64 v[0:1], v[0:1], v[6:7]
4129; GFX11-NEXT:    scratch_store_b64 v2, v[0:1], off
4130; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
4131; GFX11-NEXT:    s_setpc_b64 s[30:31]
4132;
4133; GFX10-LABEL: flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory:
4134; GFX10:       ; %bb.0:
4135; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4136; GFX10-NEXT:    s_mov_b64 s[4:5], src_private_base
4137; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, s5, v1
4138; GFX10-NEXT:    s_and_saveexec_b32 s4, vcc_lo
4139; GFX10-NEXT:    s_xor_b32 s4, exec_lo, s4
4140; GFX10-NEXT:    s_cbranch_execnz .LBB21_3
4141; GFX10-NEXT:  ; %bb.1: ; %Flow
4142; GFX10-NEXT:    s_andn2_saveexec_b32 s4, s4
4143; GFX10-NEXT:    s_cbranch_execnz .LBB21_4
4144; GFX10-NEXT:  .LBB21_2: ; %atomicrmw.phi
4145; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
4146; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
4147; GFX10-NEXT:    s_setpc_b64 s[30:31]
4148; GFX10-NEXT:  .LBB21_3: ; %atomicrmw.global
4149; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
4150; GFX10-NEXT:    flat_atomic_fmin_x2 v[0:1], v[2:3]
4151; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
4152; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
4153; GFX10-NEXT:    buffer_gl1_inv
4154; GFX10-NEXT:    buffer_gl0_inv
4155; GFX10-NEXT:    ; implicit-def: $vgpr0_vgpr1
4156; GFX10-NEXT:    ; implicit-def: $vgpr2_vgpr3
4157; GFX10-NEXT:    s_andn2_saveexec_b32 s4, s4
4158; GFX10-NEXT:    s_cbranch_execz .LBB21_2
4159; GFX10-NEXT:  .LBB21_4: ; %atomicrmw.private
4160; GFX10-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
4161; GFX10-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
4162; GFX10-NEXT:    v_cndmask_b32_e32 v4, -1, v0, vcc_lo
4163; GFX10-NEXT:    s_clause 0x1
4164; GFX10-NEXT:    buffer_load_dword v0, v4, s[0:3], 0 offen
4165; GFX10-NEXT:    buffer_load_dword v1, v4, s[0:3], 0 offen offset:4
4166; GFX10-NEXT:    s_waitcnt vmcnt(0)
4167; GFX10-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
4168; GFX10-NEXT:    v_min_f64 v[0:1], v[0:1], v[2:3]
4169; GFX10-NEXT:    buffer_store_dword v0, v4, s[0:3], 0 offen
4170; GFX10-NEXT:    buffer_store_dword v1, v4, s[0:3], 0 offen offset:4
4171; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
4172; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
4173; GFX10-NEXT:    s_setpc_b64 s[30:31]
4174;
4175; GFX90A-LABEL: flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory:
4176; GFX90A:       ; %bb.0:
4177; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4178; GFX90A-NEXT:    s_mov_b64 s[4:5], src_private_base
4179; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v1
4180; GFX90A-NEXT:    s_and_saveexec_b64 s[4:5], vcc
4181; GFX90A-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
4182; GFX90A-NEXT:    s_cbranch_execnz .LBB21_3
4183; GFX90A-NEXT:  ; %bb.1: ; %Flow
4184; GFX90A-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
4185; GFX90A-NEXT:    s_cbranch_execnz .LBB21_4
4186; GFX90A-NEXT:  .LBB21_2: ; %atomicrmw.phi
4187; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
4188; GFX90A-NEXT:    s_setpc_b64 s[30:31]
4189; GFX90A-NEXT:  .LBB21_3: ; %atomicrmw.global
4190; GFX90A-NEXT:    flat_atomic_min_f64 v[0:1], v[2:3]
4191; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4192; GFX90A-NEXT:    buffer_wbinvl1
4193; GFX90A-NEXT:    ; implicit-def: $vgpr0_vgpr1
4194; GFX90A-NEXT:    ; implicit-def: $vgpr2_vgpr3
4195; GFX90A-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
4196; GFX90A-NEXT:    s_cbranch_execz .LBB21_2
4197; GFX90A-NEXT:  .LBB21_4: ; %atomicrmw.private
4198; GFX90A-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
4199; GFX90A-NEXT:    v_cndmask_b32_e32 v4, -1, v0, vcc
4200; GFX90A-NEXT:    buffer_load_dword v0, v4, s[0:3], 0 offen
4201; GFX90A-NEXT:    buffer_load_dword v1, v4, s[0:3], 0 offen offset:4
4202; GFX90A-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
4203; GFX90A-NEXT:    s_waitcnt vmcnt(0)
4204; GFX90A-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
4205; GFX90A-NEXT:    v_min_f64 v[0:1], v[0:1], v[2:3]
4206; GFX90A-NEXT:    buffer_store_dword v0, v4, s[0:3], 0 offen
4207; GFX90A-NEXT:    buffer_store_dword v1, v4, s[0:3], 0 offen offset:4
4208; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
4209; GFX90A-NEXT:    s_waitcnt vmcnt(0)
4210; GFX90A-NEXT:    s_setpc_b64 s[30:31]
4211;
4212; GFX908-LABEL: flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory:
4213; GFX908:       ; %bb.0:
4214; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4215; GFX908-NEXT:    v_max_f64 v[6:7], v[2:3], v[2:3]
4216; GFX908-NEXT:    s_mov_b64 s[4:5], src_private_base
4217; GFX908-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v1
4218; GFX908-NEXT:    s_and_saveexec_b64 s[4:5], vcc
4219; GFX908-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
4220; GFX908-NEXT:    s_cbranch_execnz .LBB21_3
4221; GFX908-NEXT:  ; %bb.1: ; %Flow2
4222; GFX908-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
4223; GFX908-NEXT:    s_cbranch_execnz .LBB21_6
4224; GFX908-NEXT:  .LBB21_2: ; %atomicrmw.phi
4225; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
4226; GFX908-NEXT:    s_setpc_b64 s[30:31]
4227; GFX908-NEXT:  .LBB21_3: ; %atomicrmw.global
4228; GFX908-NEXT:    flat_load_dwordx2 v[4:5], v[0:1]
4229; GFX908-NEXT:    s_mov_b64 s[6:7], 0
4230; GFX908-NEXT:  .LBB21_4: ; %atomicrmw.start
4231; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
4232; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4233; GFX908-NEXT:    v_max_f64 v[2:3], v[4:5], v[4:5]
4234; GFX908-NEXT:    v_min_f64 v[2:3], v[2:3], v[6:7]
4235; GFX908-NEXT:    flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc
4236; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4237; GFX908-NEXT:    buffer_wbinvl1
4238; GFX908-NEXT:    v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
4239; GFX908-NEXT:    v_mov_b32_e32 v5, v3
4240; GFX908-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
4241; GFX908-NEXT:    v_mov_b32_e32 v4, v2
4242; GFX908-NEXT:    s_andn2_b64 exec, exec, s[6:7]
4243; GFX908-NEXT:    s_cbranch_execnz .LBB21_4
4244; GFX908-NEXT:  ; %bb.5: ; %Flow
4245; GFX908-NEXT:    s_or_b64 exec, exec, s[6:7]
4246; GFX908-NEXT:    ; implicit-def: $vgpr0_vgpr1
4247; GFX908-NEXT:    ; implicit-def: $vgpr6_vgpr7
4248; GFX908-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
4249; GFX908-NEXT:    s_cbranch_execz .LBB21_2
4250; GFX908-NEXT:  .LBB21_6: ; %atomicrmw.private
4251; GFX908-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
4252; GFX908-NEXT:    v_cndmask_b32_e32 v2, -1, v0, vcc
4253; GFX908-NEXT:    buffer_load_dword v0, v2, s[0:3], 0 offen
4254; GFX908-NEXT:    buffer_load_dword v1, v2, s[0:3], 0 offen offset:4
4255; GFX908-NEXT:    s_waitcnt vmcnt(0)
4256; GFX908-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
4257; GFX908-NEXT:    v_min_f64 v[0:1], v[0:1], v[6:7]
4258; GFX908-NEXT:    buffer_store_dword v0, v2, s[0:3], 0 offen
4259; GFX908-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen offset:4
4260; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
4261; GFX908-NEXT:    s_waitcnt vmcnt(0)
4262; GFX908-NEXT:    s_setpc_b64 s[30:31]
4263;
4264; GFX8-LABEL: flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory:
4265; GFX8:       ; %bb.0:
4266; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4267; GFX8-NEXT:    v_max_f64 v[6:7], v[2:3], v[2:3]
4268; GFX8-NEXT:    s_mov_b64 s[4:5], 0xc0
4269; GFX8-NEXT:    s_load_dword s4, s[4:5], 0x0
4270; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
4271; GFX8-NEXT:    v_cmp_ne_u32_e32 vcc, s4, v1
4272; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
4273; GFX8-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
4274; GFX8-NEXT:    s_cbranch_execnz .LBB21_3
4275; GFX8-NEXT:  ; %bb.1: ; %Flow2
4276; GFX8-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
4277; GFX8-NEXT:    s_cbranch_execnz .LBB21_6
4278; GFX8-NEXT:  .LBB21_2: ; %atomicrmw.phi
4279; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
4280; GFX8-NEXT:    s_setpc_b64 s[30:31]
4281; GFX8-NEXT:  .LBB21_3: ; %atomicrmw.global
4282; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 4, v0
4283; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
4284; GFX8-NEXT:    flat_load_dword v5, v[2:3]
4285; GFX8-NEXT:    flat_load_dword v4, v[0:1]
4286; GFX8-NEXT:    s_mov_b64 s[6:7], 0
4287; GFX8-NEXT:  .LBB21_4: ; %atomicrmw.start
4288; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
4289; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4290; GFX8-NEXT:    v_max_f64 v[2:3], v[4:5], v[4:5]
4291; GFX8-NEXT:    v_min_f64 v[2:3], v[2:3], v[6:7]
4292; GFX8-NEXT:    flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc
4293; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4294; GFX8-NEXT:    buffer_wbinvl1
4295; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
4296; GFX8-NEXT:    v_mov_b32_e32 v5, v3
4297; GFX8-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
4298; GFX8-NEXT:    v_mov_b32_e32 v4, v2
4299; GFX8-NEXT:    s_andn2_b64 exec, exec, s[6:7]
4300; GFX8-NEXT:    s_cbranch_execnz .LBB21_4
4301; GFX8-NEXT:  ; %bb.5: ; %Flow
4302; GFX8-NEXT:    s_or_b64 exec, exec, s[6:7]
4303; GFX8-NEXT:    ; implicit-def: $vgpr0_vgpr1
4304; GFX8-NEXT:    ; implicit-def: $vgpr6_vgpr7
4305; GFX8-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
4306; GFX8-NEXT:    s_cbranch_execz .LBB21_2
4307; GFX8-NEXT:  .LBB21_6: ; %atomicrmw.private
4308; GFX8-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
4309; GFX8-NEXT:    v_cndmask_b32_e32 v2, -1, v0, vcc
4310; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 4, v2
4311; GFX8-NEXT:    buffer_load_dword v0, v2, s[0:3], 0 offen
4312; GFX8-NEXT:    buffer_load_dword v1, v3, s[0:3], 0 offen
4313; GFX8-NEXT:    s_waitcnt vmcnt(0)
4314; GFX8-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
4315; GFX8-NEXT:    v_min_f64 v[0:1], v[0:1], v[6:7]
4316; GFX8-NEXT:    buffer_store_dword v0, v2, s[0:3], 0 offen
4317; GFX8-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
4318; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
4319; GFX8-NEXT:    s_waitcnt vmcnt(0)
4320; GFX8-NEXT:    s_setpc_b64 s[30:31]
4321;
4322; GFX7-LABEL: flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory:
4323; GFX7:       ; %bb.0:
4324; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4325; GFX7-NEXT:    s_mov_b64 s[4:5], 0xc0
4326; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x0
4327; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
4328; GFX7-NEXT:    v_cmp_ne_u32_e32 vcc, s4, v1
4329; GFX7-NEXT:    s_and_saveexec_b64 s[4:5], vcc
4330; GFX7-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
4331; GFX7-NEXT:    s_cbranch_execnz .LBB21_3
4332; GFX7-NEXT:  ; %bb.1: ; %Flow
4333; GFX7-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
4334; GFX7-NEXT:    s_cbranch_execnz .LBB21_4
4335; GFX7-NEXT:  .LBB21_2: ; %atomicrmw.phi
4336; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
4337; GFX7-NEXT:    s_setpc_b64 s[30:31]
4338; GFX7-NEXT:  .LBB21_3: ; %atomicrmw.global
4339; GFX7-NEXT:    flat_atomic_fmin_x2 v[0:1], v[2:3]
4340; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4341; GFX7-NEXT:    buffer_wbinvl1
4342; GFX7-NEXT:    ; implicit-def: $vgpr0_vgpr1
4343; GFX7-NEXT:    ; implicit-def: $vgpr2_vgpr3
4344; GFX7-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
4345; GFX7-NEXT:    s_cbranch_execz .LBB21_2
4346; GFX7-NEXT:  .LBB21_4: ; %atomicrmw.private
4347; GFX7-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
4348; GFX7-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
4349; GFX7-NEXT:    v_cndmask_b32_e32 v4, -1, v0, vcc
4350; GFX7-NEXT:    v_add_i32_e32 v5, vcc, 4, v4
4351; GFX7-NEXT:    buffer_load_dword v0, v4, s[0:3], 0 offen
4352; GFX7-NEXT:    buffer_load_dword v1, v5, s[0:3], 0 offen
4353; GFX7-NEXT:    s_waitcnt vmcnt(0)
4354; GFX7-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
4355; GFX7-NEXT:    v_min_f64 v[0:1], v[0:1], v[2:3]
4356; GFX7-NEXT:    buffer_store_dword v0, v4, s[0:3], 0 offen
4357; GFX7-NEXT:    buffer_store_dword v1, v5, s[0:3], 0 offen
4358; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
4359; GFX7-NEXT:    s_waitcnt vmcnt(0)
4360; GFX7-NEXT:    s_setpc_b64 s[30:31]
4361  %unused = atomicrmw fmin ptr %ptr, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
4362  ret void
4363}
4364
4365define void @flat_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory(ptr %ptr, double %val) #0 {
4366; GFX12-LABEL: flat_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory:
4367; GFX12:       ; %bb.0:
4368; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
4369; GFX12-NEXT:    s_wait_expcnt 0x0
4370; GFX12-NEXT:    s_wait_samplecnt 0x0
4371; GFX12-NEXT:    s_wait_bvhcnt 0x0
4372; GFX12-NEXT:    s_wait_kmcnt 0x0
4373; GFX12-NEXT:    v_max_num_f64_e32 v[4:5], v[2:3], v[2:3]
4374; GFX12-NEXT:    v_add_co_u32 v6, vcc_lo, 0x7f8, v0
4375; GFX12-NEXT:    v_add_co_ci_u32_e32 v7, vcc_lo, 0, v1, vcc_lo
4376; GFX12-NEXT:    s_mov_b64 s[0:1], src_private_base
4377; GFX12-NEXT:    s_mov_b32 s0, exec_lo
4378; GFX12-NEXT:    s_wait_alu 0xfffe
4379; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
4380; GFX12-NEXT:    v_cmpx_ne_u32_e64 s1, v7
4381; GFX12-NEXT:    s_xor_b32 s0, exec_lo, s0
4382; GFX12-NEXT:    s_cbranch_execnz .LBB22_3
4383; GFX12-NEXT:  ; %bb.1: ; %Flow2
4384; GFX12-NEXT:    s_wait_alu 0xfffe
4385; GFX12-NEXT:    s_and_not1_saveexec_b32 s0, s0
4386; GFX12-NEXT:    s_cbranch_execnz .LBB22_6
4387; GFX12-NEXT:  .LBB22_2: ; %atomicrmw.phi
4388; GFX12-NEXT:    s_wait_alu 0xfffe
4389; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
4390; GFX12-NEXT:    s_wait_alu 0xfffe
4391; GFX12-NEXT:    s_setpc_b64 s[30:31]
4392; GFX12-NEXT:  .LBB22_3: ; %atomicrmw.global
4393; GFX12-NEXT:    flat_load_b64 v[2:3], v[6:7]
4394; GFX12-NEXT:    s_mov_b32 s1, 0
4395; GFX12-NEXT:  .LBB22_4: ; %atomicrmw.start
4396; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
4397; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
4398; GFX12-NEXT:    v_max_num_f64_e32 v[0:1], v[2:3], v[2:3]
4399; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
4400; GFX12-NEXT:    v_min_num_f64_e32 v[0:1], v[0:1], v[4:5]
4401; GFX12-NEXT:    s_wait_storecnt 0x0
4402; GFX12-NEXT:    flat_atomic_cmpswap_b64 v[0:1], v[6:7], v[0:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
4403; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
4404; GFX12-NEXT:    global_inv scope:SCOPE_DEV
4405; GFX12-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
4406; GFX12-NEXT:    v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
4407; GFX12-NEXT:    s_wait_alu 0xfffe
4408; GFX12-NEXT:    s_or_b32 s1, vcc_lo, s1
4409; GFX12-NEXT:    s_wait_alu 0xfffe
4410; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
4411; GFX12-NEXT:    s_cbranch_execnz .LBB22_4
4412; GFX12-NEXT:  ; %bb.5: ; %Flow
4413; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s1
4414; GFX12-NEXT:    ; implicit-def: $vgpr6_vgpr7
4415; GFX12-NEXT:    ; implicit-def: $vgpr4_vgpr5
4416; GFX12-NEXT:    s_and_not1_saveexec_b32 s0, s0
4417; GFX12-NEXT:    s_cbranch_execz .LBB22_2
4418; GFX12-NEXT:  .LBB22_6: ; %atomicrmw.private
4419; GFX12-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
4420; GFX12-NEXT:    v_cndmask_b32_e32 v2, -1, v6, vcc_lo
4421; GFX12-NEXT:    scratch_load_b64 v[0:1], v2, off
4422; GFX12-NEXT:    s_wait_loadcnt 0x0
4423; GFX12-NEXT:    v_max_num_f64_e32 v[0:1], v[0:1], v[0:1]
4424; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
4425; GFX12-NEXT:    v_min_num_f64_e32 v[0:1], v[0:1], v[4:5]
4426; GFX12-NEXT:    scratch_store_b64 v2, v[0:1], off
4427; GFX12-NEXT:    s_wait_alu 0xfffe
4428; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
4429; GFX12-NEXT:    s_wait_alu 0xfffe
4430; GFX12-NEXT:    s_setpc_b64 s[30:31]
4431;
4432; GFX940-LABEL: flat_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory:
4433; GFX940:       ; %bb.0:
4434; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4435; GFX940-NEXT:    s_mov_b64 s[0:1], 0x7f8
4436; GFX940-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1]
4437; GFX940-NEXT:    s_mov_b64 s[0:1], src_private_base
4438; GFX940-NEXT:    v_cmp_ne_u32_e32 vcc, s1, v1
4439; GFX940-NEXT:    s_and_saveexec_b64 s[0:1], vcc
4440; GFX940-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
4441; GFX940-NEXT:    s_cbranch_execnz .LBB22_3
4442; GFX940-NEXT:  ; %bb.1: ; %Flow
4443; GFX940-NEXT:    s_andn2_saveexec_b64 s[0:1], s[0:1]
4444; GFX940-NEXT:    s_cbranch_execnz .LBB22_4
4445; GFX940-NEXT:  .LBB22_2: ; %atomicrmw.phi
4446; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
4447; GFX940-NEXT:    s_setpc_b64 s[30:31]
4448; GFX940-NEXT:  .LBB22_3: ; %atomicrmw.global
4449; GFX940-NEXT:    buffer_wbl2 sc1
4450; GFX940-NEXT:    flat_atomic_min_f64 v[0:1], v[2:3]
4451; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4452; GFX940-NEXT:    buffer_inv sc1
4453; GFX940-NEXT:    ; implicit-def: $vgpr0_vgpr1
4454; GFX940-NEXT:    ; implicit-def: $vgpr2_vgpr3
4455; GFX940-NEXT:    s_andn2_saveexec_b64 s[0:1], s[0:1]
4456; GFX940-NEXT:    s_cbranch_execz .LBB22_2
4457; GFX940-NEXT:  .LBB22_4: ; %atomicrmw.private
4458; GFX940-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
4459; GFX940-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
4460; GFX940-NEXT:    s_nop 0
4461; GFX940-NEXT:    v_cndmask_b32_e32 v4, -1, v0, vcc
4462; GFX940-NEXT:    scratch_load_dwordx2 v[0:1], v4, off
4463; GFX940-NEXT:    s_waitcnt vmcnt(0)
4464; GFX940-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
4465; GFX940-NEXT:    v_min_f64 v[0:1], v[0:1], v[2:3]
4466; GFX940-NEXT:    scratch_store_dwordx2 v4, v[0:1], off sc0 sc1
4467; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
4468; GFX940-NEXT:    s_waitcnt vmcnt(0)
4469; GFX940-NEXT:    s_setpc_b64 s[30:31]
4470;
4471; GFX11-LABEL: flat_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory:
4472; GFX11:       ; %bb.0:
4473; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4474; GFX11-NEXT:    v_max_f64 v[4:5], v[2:3], v[2:3]
4475; GFX11-NEXT:    v_add_co_u32 v6, vcc_lo, 0x7f8, v0
4476; GFX11-NEXT:    v_add_co_ci_u32_e32 v7, vcc_lo, 0, v1, vcc_lo
4477; GFX11-NEXT:    s_mov_b64 s[0:1], src_private_base
4478; GFX11-NEXT:    s_mov_b32 s0, exec_lo
4479; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
4480; GFX11-NEXT:    v_cmpx_ne_u32_e64 s1, v7
4481; GFX11-NEXT:    s_xor_b32 s0, exec_lo, s0
4482; GFX11-NEXT:    s_cbranch_execnz .LBB22_3
4483; GFX11-NEXT:  ; %bb.1: ; %Flow2
4484; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
4485; GFX11-NEXT:    s_cbranch_execnz .LBB22_6
4486; GFX11-NEXT:  .LBB22_2: ; %atomicrmw.phi
4487; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
4488; GFX11-NEXT:    s_setpc_b64 s[30:31]
4489; GFX11-NEXT:  .LBB22_3: ; %atomicrmw.global
4490; GFX11-NEXT:    flat_load_b64 v[2:3], v[6:7]
4491; GFX11-NEXT:    s_mov_b32 s1, 0
4492; GFX11-NEXT:  .LBB22_4: ; %atomicrmw.start
4493; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
4494; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4495; GFX11-NEXT:    v_max_f64 v[0:1], v[2:3], v[2:3]
4496; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
4497; GFX11-NEXT:    v_min_f64 v[0:1], v[0:1], v[4:5]
4498; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
4499; GFX11-NEXT:    flat_atomic_cmpswap_b64 v[0:1], v[6:7], v[0:3] glc
4500; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4501; GFX11-NEXT:    buffer_gl1_inv
4502; GFX11-NEXT:    buffer_gl0_inv
4503; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
4504; GFX11-NEXT:    v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
4505; GFX11-NEXT:    s_or_b32 s1, vcc_lo, s1
4506; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
4507; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
4508; GFX11-NEXT:    s_cbranch_execnz .LBB22_4
4509; GFX11-NEXT:  ; %bb.5: ; %Flow
4510; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s1
4511; GFX11-NEXT:    ; implicit-def: $vgpr6_vgpr7
4512; GFX11-NEXT:    ; implicit-def: $vgpr4_vgpr5
4513; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
4514; GFX11-NEXT:    s_cbranch_execz .LBB22_2
4515; GFX11-NEXT:  .LBB22_6: ; %atomicrmw.private
4516; GFX11-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
4517; GFX11-NEXT:    v_cndmask_b32_e32 v2, -1, v6, vcc_lo
4518; GFX11-NEXT:    scratch_load_b64 v[0:1], v2, off
4519; GFX11-NEXT:    s_waitcnt vmcnt(0)
4520; GFX11-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
4521; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
4522; GFX11-NEXT:    v_min_f64 v[0:1], v[0:1], v[4:5]
4523; GFX11-NEXT:    scratch_store_b64 v2, v[0:1], off
4524; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
4525; GFX11-NEXT:    s_setpc_b64 s[30:31]
4526;
4527; GFX10-LABEL: flat_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory:
4528; GFX10:       ; %bb.0:
4529; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4530; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, 0x7f8, v0
4531; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
4532; GFX10-NEXT:    s_mov_b64 s[4:5], src_private_base
4533; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, s5, v1
4534; GFX10-NEXT:    s_and_saveexec_b32 s4, vcc_lo
4535; GFX10-NEXT:    s_xor_b32 s4, exec_lo, s4
4536; GFX10-NEXT:    s_cbranch_execnz .LBB22_3
4537; GFX10-NEXT:  ; %bb.1: ; %Flow
4538; GFX10-NEXT:    s_andn2_saveexec_b32 s4, s4
4539; GFX10-NEXT:    s_cbranch_execnz .LBB22_4
4540; GFX10-NEXT:  .LBB22_2: ; %atomicrmw.phi
4541; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
4542; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
4543; GFX10-NEXT:    s_setpc_b64 s[30:31]
4544; GFX10-NEXT:  .LBB22_3: ; %atomicrmw.global
4545; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
4546; GFX10-NEXT:    flat_atomic_fmin_x2 v[0:1], v[2:3]
4547; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
4548; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
4549; GFX10-NEXT:    buffer_gl1_inv
4550; GFX10-NEXT:    buffer_gl0_inv
4551; GFX10-NEXT:    ; implicit-def: $vgpr0_vgpr1
4552; GFX10-NEXT:    ; implicit-def: $vgpr2_vgpr3
4553; GFX10-NEXT:    s_andn2_saveexec_b32 s4, s4
4554; GFX10-NEXT:    s_cbranch_execz .LBB22_2
4555; GFX10-NEXT:  .LBB22_4: ; %atomicrmw.private
4556; GFX10-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
4557; GFX10-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
4558; GFX10-NEXT:    v_cndmask_b32_e32 v4, -1, v0, vcc_lo
4559; GFX10-NEXT:    s_clause 0x1
4560; GFX10-NEXT:    buffer_load_dword v0, v4, s[0:3], 0 offen
4561; GFX10-NEXT:    buffer_load_dword v1, v4, s[0:3], 0 offen offset:4
4562; GFX10-NEXT:    s_waitcnt vmcnt(0)
4563; GFX10-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
4564; GFX10-NEXT:    v_min_f64 v[0:1], v[0:1], v[2:3]
4565; GFX10-NEXT:    buffer_store_dword v0, v4, s[0:3], 0 offen
4566; GFX10-NEXT:    buffer_store_dword v1, v4, s[0:3], 0 offen offset:4
4567; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
4568; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
4569; GFX10-NEXT:    s_setpc_b64 s[30:31]
4570;
4571; GFX90A-LABEL: flat_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory:
4572; GFX90A:       ; %bb.0:
4573; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4574; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, 0x7f8, v0
4575; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
4576; GFX90A-NEXT:    s_mov_b64 s[4:5], src_private_base
4577; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v1
4578; GFX90A-NEXT:    s_and_saveexec_b64 s[4:5], vcc
4579; GFX90A-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
4580; GFX90A-NEXT:    s_cbranch_execnz .LBB22_3
4581; GFX90A-NEXT:  ; %bb.1: ; %Flow
4582; GFX90A-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
4583; GFX90A-NEXT:    s_cbranch_execnz .LBB22_4
4584; GFX90A-NEXT:  .LBB22_2: ; %atomicrmw.phi
4585; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
4586; GFX90A-NEXT:    s_setpc_b64 s[30:31]
4587; GFX90A-NEXT:  .LBB22_3: ; %atomicrmw.global
4588; GFX90A-NEXT:    flat_atomic_min_f64 v[0:1], v[2:3]
4589; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4590; GFX90A-NEXT:    buffer_wbinvl1
4591; GFX90A-NEXT:    ; implicit-def: $vgpr0_vgpr1
4592; GFX90A-NEXT:    ; implicit-def: $vgpr2_vgpr3
4593; GFX90A-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
4594; GFX90A-NEXT:    s_cbranch_execz .LBB22_2
4595; GFX90A-NEXT:  .LBB22_4: ; %atomicrmw.private
4596; GFX90A-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
4597; GFX90A-NEXT:    v_cndmask_b32_e32 v4, -1, v0, vcc
4598; GFX90A-NEXT:    buffer_load_dword v0, v4, s[0:3], 0 offen
4599; GFX90A-NEXT:    buffer_load_dword v1, v4, s[0:3], 0 offen offset:4
4600; GFX90A-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
4601; GFX90A-NEXT:    s_waitcnt vmcnt(0)
4602; GFX90A-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
4603; GFX90A-NEXT:    v_min_f64 v[0:1], v[0:1], v[2:3]
4604; GFX90A-NEXT:    buffer_store_dword v0, v4, s[0:3], 0 offen
4605; GFX90A-NEXT:    buffer_store_dword v1, v4, s[0:3], 0 offen offset:4
4606; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
4607; GFX90A-NEXT:    s_waitcnt vmcnt(0)
4608; GFX90A-NEXT:    s_setpc_b64 s[30:31]
4609;
4610; GFX908-LABEL: flat_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory:
4611; GFX908:       ; %bb.0:
4612; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4613; GFX908-NEXT:    v_max_f64 v[4:5], v[2:3], v[2:3]
4614; GFX908-NEXT:    v_add_co_u32_e32 v6, vcc, 0x7f8, v0
4615; GFX908-NEXT:    v_addc_co_u32_e32 v7, vcc, 0, v1, vcc
4616; GFX908-NEXT:    s_mov_b64 s[4:5], src_private_base
4617; GFX908-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v7
4618; GFX908-NEXT:    s_and_saveexec_b64 s[4:5], vcc
4619; GFX908-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
4620; GFX908-NEXT:    s_cbranch_execnz .LBB22_3
4621; GFX908-NEXT:  ; %bb.1: ; %Flow2
4622; GFX908-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
4623; GFX908-NEXT:    s_cbranch_execnz .LBB22_6
4624; GFX908-NEXT:  .LBB22_2: ; %atomicrmw.phi
4625; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
4626; GFX908-NEXT:    s_setpc_b64 s[30:31]
4627; GFX908-NEXT:  .LBB22_3: ; %atomicrmw.global
4628; GFX908-NEXT:    flat_load_dwordx2 v[2:3], v[6:7]
4629; GFX908-NEXT:    s_mov_b64 s[6:7], 0
4630; GFX908-NEXT:  .LBB22_4: ; %atomicrmw.start
4631; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
4632; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4633; GFX908-NEXT:    v_max_f64 v[0:1], v[2:3], v[2:3]
4634; GFX908-NEXT:    v_min_f64 v[0:1], v[0:1], v[4:5]
4635; GFX908-NEXT:    flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc
4636; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4637; GFX908-NEXT:    buffer_wbinvl1
4638; GFX908-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
4639; GFX908-NEXT:    v_mov_b32_e32 v3, v1
4640; GFX908-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
4641; GFX908-NEXT:    v_mov_b32_e32 v2, v0
4642; GFX908-NEXT:    s_andn2_b64 exec, exec, s[6:7]
4643; GFX908-NEXT:    s_cbranch_execnz .LBB22_4
4644; GFX908-NEXT:  ; %bb.5: ; %Flow
4645; GFX908-NEXT:    s_or_b64 exec, exec, s[6:7]
4646; GFX908-NEXT:    ; implicit-def: $vgpr6_vgpr7
4647; GFX908-NEXT:    ; implicit-def: $vgpr4_vgpr5
4648; GFX908-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
4649; GFX908-NEXT:    s_cbranch_execz .LBB22_2
4650; GFX908-NEXT:  .LBB22_6: ; %atomicrmw.private
4651; GFX908-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[6:7]
4652; GFX908-NEXT:    v_cndmask_b32_e32 v2, -1, v6, vcc
4653; GFX908-NEXT:    buffer_load_dword v0, v2, s[0:3], 0 offen
4654; GFX908-NEXT:    buffer_load_dword v1, v2, s[0:3], 0 offen offset:4
4655; GFX908-NEXT:    s_waitcnt vmcnt(0)
4656; GFX908-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
4657; GFX908-NEXT:    v_min_f64 v[0:1], v[0:1], v[4:5]
4658; GFX908-NEXT:    buffer_store_dword v0, v2, s[0:3], 0 offen
4659; GFX908-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen offset:4
4660; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
4661; GFX908-NEXT:    s_waitcnt vmcnt(0)
4662; GFX908-NEXT:    s_setpc_b64 s[30:31]
4663;
4664; GFX8-LABEL: flat_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory:
4665; GFX8:       ; %bb.0:
4666; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4667; GFX8-NEXT:    v_max_f64 v[4:5], v[2:3], v[2:3]
4668; GFX8-NEXT:    s_mov_b64 s[4:5], 0xc0
4669; GFX8-NEXT:    s_load_dword s4, s[4:5], 0x0
4670; GFX8-NEXT:    v_add_u32_e32 v6, vcc, 0x7f8, v0
4671; GFX8-NEXT:    v_addc_u32_e32 v7, vcc, 0, v1, vcc
4672; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
4673; GFX8-NEXT:    v_cmp_ne_u32_e32 vcc, s4, v7
4674; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
4675; GFX8-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
4676; GFX8-NEXT:    s_cbranch_execnz .LBB22_3
4677; GFX8-NEXT:  ; %bb.1: ; %Flow2
4678; GFX8-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
4679; GFX8-NEXT:    s_cbranch_execnz .LBB22_6
4680; GFX8-NEXT:  .LBB22_2: ; %atomicrmw.phi
4681; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
4682; GFX8-NEXT:    s_setpc_b64 s[30:31]
4683; GFX8-NEXT:  .LBB22_3: ; %atomicrmw.global
4684; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 4, v6
4685; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v7, vcc
4686; GFX8-NEXT:    flat_load_dword v3, v[0:1]
4687; GFX8-NEXT:    flat_load_dword v2, v[6:7]
4688; GFX8-NEXT:    s_mov_b64 s[6:7], 0
4689; GFX8-NEXT:  .LBB22_4: ; %atomicrmw.start
4690; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
4691; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4692; GFX8-NEXT:    v_max_f64 v[0:1], v[2:3], v[2:3]
4693; GFX8-NEXT:    v_min_f64 v[0:1], v[0:1], v[4:5]
4694; GFX8-NEXT:    flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc
4695; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4696; GFX8-NEXT:    buffer_wbinvl1
4697; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
4698; GFX8-NEXT:    v_mov_b32_e32 v3, v1
4699; GFX8-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
4700; GFX8-NEXT:    v_mov_b32_e32 v2, v0
4701; GFX8-NEXT:    s_andn2_b64 exec, exec, s[6:7]
4702; GFX8-NEXT:    s_cbranch_execnz .LBB22_4
4703; GFX8-NEXT:  ; %bb.5: ; %Flow
4704; GFX8-NEXT:    s_or_b64 exec, exec, s[6:7]
4705; GFX8-NEXT:    ; implicit-def: $vgpr6_vgpr7
4706; GFX8-NEXT:    ; implicit-def: $vgpr4_vgpr5
4707; GFX8-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
4708; GFX8-NEXT:    s_cbranch_execz .LBB22_2
4709; GFX8-NEXT:  .LBB22_6: ; %atomicrmw.private
4710; GFX8-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[6:7]
4711; GFX8-NEXT:    v_cndmask_b32_e32 v2, -1, v6, vcc
4712; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 4, v2
4713; GFX8-NEXT:    buffer_load_dword v0, v2, s[0:3], 0 offen
4714; GFX8-NEXT:    buffer_load_dword v1, v3, s[0:3], 0 offen
4715; GFX8-NEXT:    s_waitcnt vmcnt(0)
4716; GFX8-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
4717; GFX8-NEXT:    v_min_f64 v[0:1], v[0:1], v[4:5]
4718; GFX8-NEXT:    buffer_store_dword v0, v2, s[0:3], 0 offen
4719; GFX8-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
4720; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
4721; GFX8-NEXT:    s_waitcnt vmcnt(0)
4722; GFX8-NEXT:    s_setpc_b64 s[30:31]
4723;
4724; GFX7-LABEL: flat_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory:
4725; GFX7:       ; %bb.0:
4726; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4727; GFX7-NEXT:    s_mov_b64 s[4:5], 0xc0
4728; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x0
4729; GFX7-NEXT:    v_add_i32_e32 v0, vcc, 0x7f8, v0
4730; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
4731; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
4732; GFX7-NEXT:    v_cmp_ne_u32_e32 vcc, s4, v1
4733; GFX7-NEXT:    s_and_saveexec_b64 s[4:5], vcc
4734; GFX7-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
4735; GFX7-NEXT:    s_cbranch_execnz .LBB22_3
4736; GFX7-NEXT:  ; %bb.1: ; %Flow
4737; GFX7-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
4738; GFX7-NEXT:    s_cbranch_execnz .LBB22_4
4739; GFX7-NEXT:  .LBB22_2: ; %atomicrmw.phi
4740; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
4741; GFX7-NEXT:    s_setpc_b64 s[30:31]
4742; GFX7-NEXT:  .LBB22_3: ; %atomicrmw.global
4743; GFX7-NEXT:    flat_atomic_fmin_x2 v[0:1], v[2:3]
4744; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4745; GFX7-NEXT:    buffer_wbinvl1
4746; GFX7-NEXT:    ; implicit-def: $vgpr0_vgpr1
4747; GFX7-NEXT:    ; implicit-def: $vgpr2_vgpr3
4748; GFX7-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
4749; GFX7-NEXT:    s_cbranch_execz .LBB22_2
4750; GFX7-NEXT:  .LBB22_4: ; %atomicrmw.private
4751; GFX7-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
4752; GFX7-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
4753; GFX7-NEXT:    v_cndmask_b32_e32 v4, -1, v0, vcc
4754; GFX7-NEXT:    v_add_i32_e32 v5, vcc, 4, v4
4755; GFX7-NEXT:    buffer_load_dword v0, v4, s[0:3], 0 offen
4756; GFX7-NEXT:    buffer_load_dword v1, v5, s[0:3], 0 offen
4757; GFX7-NEXT:    s_waitcnt vmcnt(0)
4758; GFX7-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
4759; GFX7-NEXT:    v_min_f64 v[0:1], v[0:1], v[2:3]
4760; GFX7-NEXT:    buffer_store_dword v0, v4, s[0:3], 0 offen
4761; GFX7-NEXT:    buffer_store_dword v1, v5, s[0:3], 0 offen
4762; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
4763; GFX7-NEXT:    s_waitcnt vmcnt(0)
4764; GFX7-NEXT:    s_setpc_b64 s[30:31]
4765  %gep = getelementptr double, ptr %ptr, i64 255
4766  %unused = atomicrmw fmin ptr %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
4767  ret void
4768}
4769
; Non-returning `atomicrmw fmin` on a double, agent scope, seq_cst, applied at a
; negative offset from %ptr. The getelementptr at the end of the function steps
; back 256 doubles (2048 bytes), which shows up in the checks as a low dword of
; 0xfffff800 (s_movk_i32 0xf800 on the GFX940 prefix) and a high dword of -1.
;
; What the autogenerated checks below pin down
;   * Address split - the high dword of the flat address is compared with
;     src_private_base (on the GFX8 and GFX7 prefixes, with a dword fetched by
;     s_load_dword from 0xc0, presumably the same private-base value - TODO
;     confirm), and control diverges into %atomicrmw.global and
;     %atomicrmw.private.
;   * Global path - the GFX940, GFX10, GFX90A and GFX7 prefixes expect a single
;     native instruction (flat_atomic_min_f64 or flat_atomic_fmin_x2), while the
;     GFX12, GFX11, GFX908 and GFX8 prefixes expect a flat_atomic_cmpswap retry
;     loop (%atomicrmw.start).
;   * Private path - every prefix expects a non-atomic load, a v_max of each
;     operand with itself, a v_min, and a store back (scratch_* or buffer_*).
;
; The check lines come from utils/update_llc_test_checks.py (see the note on the
; first line of this file) - regenerate them instead of editing them by hand.
4770define void @flat_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory(ptr %ptr, double %val) #0 {
4771; GFX12-LABEL: flat_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory:
4772; GFX12:       ; %bb.0:
4773; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
4774; GFX12-NEXT:    s_wait_expcnt 0x0
4775; GFX12-NEXT:    s_wait_samplecnt 0x0
4776; GFX12-NEXT:    s_wait_bvhcnt 0x0
4777; GFX12-NEXT:    s_wait_kmcnt 0x0
4778; GFX12-NEXT:    v_max_num_f64_e32 v[4:5], v[2:3], v[2:3]
4779; GFX12-NEXT:    v_add_co_u32 v6, vcc_lo, 0xfffff800, v0
4780; GFX12-NEXT:    v_add_co_ci_u32_e32 v7, vcc_lo, -1, v1, vcc_lo
4781; GFX12-NEXT:    s_mov_b64 s[0:1], src_private_base
4782; GFX12-NEXT:    s_mov_b32 s0, exec_lo
4783; GFX12-NEXT:    s_wait_alu 0xfffe
4784; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
4785; GFX12-NEXT:    v_cmpx_ne_u32_e64 s1, v7
4786; GFX12-NEXT:    s_xor_b32 s0, exec_lo, s0
4787; GFX12-NEXT:    s_cbranch_execnz .LBB23_3
4788; GFX12-NEXT:  ; %bb.1: ; %Flow2
4789; GFX12-NEXT:    s_wait_alu 0xfffe
4790; GFX12-NEXT:    s_and_not1_saveexec_b32 s0, s0
4791; GFX12-NEXT:    s_cbranch_execnz .LBB23_6
4792; GFX12-NEXT:  .LBB23_2: ; %atomicrmw.phi
4793; GFX12-NEXT:    s_wait_alu 0xfffe
4794; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
4795; GFX12-NEXT:    s_wait_alu 0xfffe
4796; GFX12-NEXT:    s_setpc_b64 s[30:31]
4797; GFX12-NEXT:  .LBB23_3: ; %atomicrmw.global
4798; GFX12-NEXT:    flat_load_b64 v[2:3], v[6:7]
4799; GFX12-NEXT:    s_mov_b32 s1, 0
4800; GFX12-NEXT:  .LBB23_4: ; %atomicrmw.start
4801; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
4802; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
4803; GFX12-NEXT:    v_max_num_f64_e32 v[0:1], v[2:3], v[2:3]
4804; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
4805; GFX12-NEXT:    v_min_num_f64_e32 v[0:1], v[0:1], v[4:5]
4806; GFX12-NEXT:    s_wait_storecnt 0x0
4807; GFX12-NEXT:    flat_atomic_cmpswap_b64 v[0:1], v[6:7], v[0:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
4808; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
4809; GFX12-NEXT:    global_inv scope:SCOPE_DEV
4810; GFX12-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
4811; GFX12-NEXT:    v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
4812; GFX12-NEXT:    s_wait_alu 0xfffe
4813; GFX12-NEXT:    s_or_b32 s1, vcc_lo, s1
4814; GFX12-NEXT:    s_wait_alu 0xfffe
4815; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
4816; GFX12-NEXT:    s_cbranch_execnz .LBB23_4
4817; GFX12-NEXT:  ; %bb.5: ; %Flow
4818; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s1
4819; GFX12-NEXT:    ; implicit-def: $vgpr6_vgpr7
4820; GFX12-NEXT:    ; implicit-def: $vgpr4_vgpr5
4821; GFX12-NEXT:    s_and_not1_saveexec_b32 s0, s0
4822; GFX12-NEXT:    s_cbranch_execz .LBB23_2
4823; GFX12-NEXT:  .LBB23_6: ; %atomicrmw.private
4824; GFX12-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
4825; GFX12-NEXT:    v_cndmask_b32_e32 v2, -1, v6, vcc_lo
4826; GFX12-NEXT:    scratch_load_b64 v[0:1], v2, off
4827; GFX12-NEXT:    s_wait_loadcnt 0x0
4828; GFX12-NEXT:    v_max_num_f64_e32 v[0:1], v[0:1], v[0:1]
4829; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
4830; GFX12-NEXT:    v_min_num_f64_e32 v[0:1], v[0:1], v[4:5]
4831; GFX12-NEXT:    scratch_store_b64 v2, v[0:1], off
4832; GFX12-NEXT:    s_wait_alu 0xfffe
4833; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
4834; GFX12-NEXT:    s_wait_alu 0xfffe
4835; GFX12-NEXT:    s_setpc_b64 s[30:31]
4836;
4837; GFX940-LABEL: flat_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory:
4838; GFX940:       ; %bb.0:
4839; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4840; GFX940-NEXT:    s_movk_i32 s0, 0xf800
4841; GFX940-NEXT:    s_mov_b32 s1, -1
4842; GFX940-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1]
4843; GFX940-NEXT:    s_mov_b64 s[0:1], src_private_base
4844; GFX940-NEXT:    v_cmp_ne_u32_e32 vcc, s1, v1
4845; GFX940-NEXT:    s_and_saveexec_b64 s[0:1], vcc
4846; GFX940-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
4847; GFX940-NEXT:    s_cbranch_execnz .LBB23_3
4848; GFX940-NEXT:  ; %bb.1: ; %Flow
4849; GFX940-NEXT:    s_andn2_saveexec_b64 s[0:1], s[0:1]
4850; GFX940-NEXT:    s_cbranch_execnz .LBB23_4
4851; GFX940-NEXT:  .LBB23_2: ; %atomicrmw.phi
4852; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
4853; GFX940-NEXT:    s_setpc_b64 s[30:31]
4854; GFX940-NEXT:  .LBB23_3: ; %atomicrmw.global
4855; GFX940-NEXT:    buffer_wbl2 sc1
4856; GFX940-NEXT:    flat_atomic_min_f64 v[0:1], v[2:3]
4857; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4858; GFX940-NEXT:    buffer_inv sc1
4859; GFX940-NEXT:    ; implicit-def: $vgpr0_vgpr1
4860; GFX940-NEXT:    ; implicit-def: $vgpr2_vgpr3
4861; GFX940-NEXT:    s_andn2_saveexec_b64 s[0:1], s[0:1]
4862; GFX940-NEXT:    s_cbranch_execz .LBB23_2
4863; GFX940-NEXT:  .LBB23_4: ; %atomicrmw.private
4864; GFX940-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
4865; GFX940-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
4866; GFX940-NEXT:    s_nop 0
4867; GFX940-NEXT:    v_cndmask_b32_e32 v4, -1, v0, vcc
4868; GFX940-NEXT:    scratch_load_dwordx2 v[0:1], v4, off
4869; GFX940-NEXT:    s_waitcnt vmcnt(0)
4870; GFX940-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
4871; GFX940-NEXT:    v_min_f64 v[0:1], v[0:1], v[2:3]
4872; GFX940-NEXT:    scratch_store_dwordx2 v4, v[0:1], off sc0 sc1
4873; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
4874; GFX940-NEXT:    s_waitcnt vmcnt(0)
4875; GFX940-NEXT:    s_setpc_b64 s[30:31]
4876;
4877; GFX11-LABEL: flat_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory:
4878; GFX11:       ; %bb.0:
4879; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4880; GFX11-NEXT:    v_max_f64 v[4:5], v[2:3], v[2:3]
4881; GFX11-NEXT:    v_add_co_u32 v6, vcc_lo, 0xfffff800, v0
4882; GFX11-NEXT:    v_add_co_ci_u32_e32 v7, vcc_lo, -1, v1, vcc_lo
4883; GFX11-NEXT:    s_mov_b64 s[0:1], src_private_base
4884; GFX11-NEXT:    s_mov_b32 s0, exec_lo
4885; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
4886; GFX11-NEXT:    v_cmpx_ne_u32_e64 s1, v7
4887; GFX11-NEXT:    s_xor_b32 s0, exec_lo, s0
4888; GFX11-NEXT:    s_cbranch_execnz .LBB23_3
4889; GFX11-NEXT:  ; %bb.1: ; %Flow2
4890; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
4891; GFX11-NEXT:    s_cbranch_execnz .LBB23_6
4892; GFX11-NEXT:  .LBB23_2: ; %atomicrmw.phi
4893; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
4894; GFX11-NEXT:    s_setpc_b64 s[30:31]
4895; GFX11-NEXT:  .LBB23_3: ; %atomicrmw.global
4896; GFX11-NEXT:    flat_load_b64 v[2:3], v[6:7]
4897; GFX11-NEXT:    s_mov_b32 s1, 0
4898; GFX11-NEXT:  .LBB23_4: ; %atomicrmw.start
4899; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
4900; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4901; GFX11-NEXT:    v_max_f64 v[0:1], v[2:3], v[2:3]
4902; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
4903; GFX11-NEXT:    v_min_f64 v[0:1], v[0:1], v[4:5]
4904; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
4905; GFX11-NEXT:    flat_atomic_cmpswap_b64 v[0:1], v[6:7], v[0:3] glc
4906; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4907; GFX11-NEXT:    buffer_gl1_inv
4908; GFX11-NEXT:    buffer_gl0_inv
4909; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
4910; GFX11-NEXT:    v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
4911; GFX11-NEXT:    s_or_b32 s1, vcc_lo, s1
4912; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
4913; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
4914; GFX11-NEXT:    s_cbranch_execnz .LBB23_4
4915; GFX11-NEXT:  ; %bb.5: ; %Flow
4916; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s1
4917; GFX11-NEXT:    ; implicit-def: $vgpr6_vgpr7
4918; GFX11-NEXT:    ; implicit-def: $vgpr4_vgpr5
4919; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
4920; GFX11-NEXT:    s_cbranch_execz .LBB23_2
4921; GFX11-NEXT:  .LBB23_6: ; %atomicrmw.private
4922; GFX11-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
4923; GFX11-NEXT:    v_cndmask_b32_e32 v2, -1, v6, vcc_lo
4924; GFX11-NEXT:    scratch_load_b64 v[0:1], v2, off
4925; GFX11-NEXT:    s_waitcnt vmcnt(0)
4926; GFX11-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
4927; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
4928; GFX11-NEXT:    v_min_f64 v[0:1], v[0:1], v[4:5]
4929; GFX11-NEXT:    scratch_store_b64 v2, v[0:1], off
4930; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
4931; GFX11-NEXT:    s_setpc_b64 s[30:31]
4932;
4933; GFX10-LABEL: flat_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory:
4934; GFX10:       ; %bb.0:
4935; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4936; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
4937; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
4938; GFX10-NEXT:    s_mov_b64 s[4:5], src_private_base
4939; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, s5, v1
4940; GFX10-NEXT:    s_and_saveexec_b32 s4, vcc_lo
4941; GFX10-NEXT:    s_xor_b32 s4, exec_lo, s4
4942; GFX10-NEXT:    s_cbranch_execnz .LBB23_3
4943; GFX10-NEXT:  ; %bb.1: ; %Flow
4944; GFX10-NEXT:    s_andn2_saveexec_b32 s4, s4
4945; GFX10-NEXT:    s_cbranch_execnz .LBB23_4
4946; GFX10-NEXT:  .LBB23_2: ; %atomicrmw.phi
4947; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
4948; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
4949; GFX10-NEXT:    s_setpc_b64 s[30:31]
4950; GFX10-NEXT:  .LBB23_3: ; %atomicrmw.global
4951; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
4952; GFX10-NEXT:    flat_atomic_fmin_x2 v[0:1], v[2:3]
4953; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
4954; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
4955; GFX10-NEXT:    buffer_gl1_inv
4956; GFX10-NEXT:    buffer_gl0_inv
4957; GFX10-NEXT:    ; implicit-def: $vgpr0_vgpr1
4958; GFX10-NEXT:    ; implicit-def: $vgpr2_vgpr3
4959; GFX10-NEXT:    s_andn2_saveexec_b32 s4, s4
4960; GFX10-NEXT:    s_cbranch_execz .LBB23_2
4961; GFX10-NEXT:  .LBB23_4: ; %atomicrmw.private
4962; GFX10-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
4963; GFX10-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
4964; GFX10-NEXT:    v_cndmask_b32_e32 v4, -1, v0, vcc_lo
4965; GFX10-NEXT:    s_clause 0x1
4966; GFX10-NEXT:    buffer_load_dword v0, v4, s[0:3], 0 offen
4967; GFX10-NEXT:    buffer_load_dword v1, v4, s[0:3], 0 offen offset:4
4968; GFX10-NEXT:    s_waitcnt vmcnt(0)
4969; GFX10-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
4970; GFX10-NEXT:    v_min_f64 v[0:1], v[0:1], v[2:3]
4971; GFX10-NEXT:    buffer_store_dword v0, v4, s[0:3], 0 offen
4972; GFX10-NEXT:    buffer_store_dword v1, v4, s[0:3], 0 offen offset:4
4973; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
4974; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
4975; GFX10-NEXT:    s_setpc_b64 s[30:31]
4976;
4977; GFX90A-LABEL: flat_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory:
4978; GFX90A:       ; %bb.0:
4979; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4980; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
4981; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
4982; GFX90A-NEXT:    s_mov_b64 s[4:5], src_private_base
4983; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v1
4984; GFX90A-NEXT:    s_and_saveexec_b64 s[4:5], vcc
4985; GFX90A-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
4986; GFX90A-NEXT:    s_cbranch_execnz .LBB23_3
4987; GFX90A-NEXT:  ; %bb.1: ; %Flow
4988; GFX90A-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
4989; GFX90A-NEXT:    s_cbranch_execnz .LBB23_4
4990; GFX90A-NEXT:  .LBB23_2: ; %atomicrmw.phi
4991; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
4992; GFX90A-NEXT:    s_setpc_b64 s[30:31]
4993; GFX90A-NEXT:  .LBB23_3: ; %atomicrmw.global
4994; GFX90A-NEXT:    flat_atomic_min_f64 v[0:1], v[2:3]
4995; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
4996; GFX90A-NEXT:    buffer_wbinvl1
4997; GFX90A-NEXT:    ; implicit-def: $vgpr0_vgpr1
4998; GFX90A-NEXT:    ; implicit-def: $vgpr2_vgpr3
4999; GFX90A-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
5000; GFX90A-NEXT:    s_cbranch_execz .LBB23_2
5001; GFX90A-NEXT:  .LBB23_4: ; %atomicrmw.private
5002; GFX90A-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
5003; GFX90A-NEXT:    v_cndmask_b32_e32 v4, -1, v0, vcc
5004; GFX90A-NEXT:    buffer_load_dword v0, v4, s[0:3], 0 offen
5005; GFX90A-NEXT:    buffer_load_dword v1, v4, s[0:3], 0 offen offset:4
5006; GFX90A-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
5007; GFX90A-NEXT:    s_waitcnt vmcnt(0)
5008; GFX90A-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
5009; GFX90A-NEXT:    v_min_f64 v[0:1], v[0:1], v[2:3]
5010; GFX90A-NEXT:    buffer_store_dword v0, v4, s[0:3], 0 offen
5011; GFX90A-NEXT:    buffer_store_dword v1, v4, s[0:3], 0 offen offset:4
5012; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
5013; GFX90A-NEXT:    s_waitcnt vmcnt(0)
5014; GFX90A-NEXT:    s_setpc_b64 s[30:31]
5015;
5016; GFX908-LABEL: flat_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory:
5017; GFX908:       ; %bb.0:
5018; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5019; GFX908-NEXT:    v_max_f64 v[4:5], v[2:3], v[2:3]
5020; GFX908-NEXT:    v_add_co_u32_e32 v6, vcc, 0xfffff800, v0
5021; GFX908-NEXT:    v_addc_co_u32_e32 v7, vcc, -1, v1, vcc
5022; GFX908-NEXT:    s_mov_b64 s[4:5], src_private_base
5023; GFX908-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v7
5024; GFX908-NEXT:    s_and_saveexec_b64 s[4:5], vcc
5025; GFX908-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
5026; GFX908-NEXT:    s_cbranch_execnz .LBB23_3
5027; GFX908-NEXT:  ; %bb.1: ; %Flow2
5028; GFX908-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
5029; GFX908-NEXT:    s_cbranch_execnz .LBB23_6
5030; GFX908-NEXT:  .LBB23_2: ; %atomicrmw.phi
5031; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
5032; GFX908-NEXT:    s_setpc_b64 s[30:31]
5033; GFX908-NEXT:  .LBB23_3: ; %atomicrmw.global
5034; GFX908-NEXT:    flat_load_dwordx2 v[2:3], v[6:7]
5035; GFX908-NEXT:    s_mov_b64 s[6:7], 0
5036; GFX908-NEXT:  .LBB23_4: ; %atomicrmw.start
5037; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
5038; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5039; GFX908-NEXT:    v_max_f64 v[0:1], v[2:3], v[2:3]
5040; GFX908-NEXT:    v_min_f64 v[0:1], v[0:1], v[4:5]
5041; GFX908-NEXT:    flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc
5042; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5043; GFX908-NEXT:    buffer_wbinvl1
5044; GFX908-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
5045; GFX908-NEXT:    v_mov_b32_e32 v3, v1
5046; GFX908-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
5047; GFX908-NEXT:    v_mov_b32_e32 v2, v0
5048; GFX908-NEXT:    s_andn2_b64 exec, exec, s[6:7]
5049; GFX908-NEXT:    s_cbranch_execnz .LBB23_4
5050; GFX908-NEXT:  ; %bb.5: ; %Flow
5051; GFX908-NEXT:    s_or_b64 exec, exec, s[6:7]
5052; GFX908-NEXT:    ; implicit-def: $vgpr6_vgpr7
5053; GFX908-NEXT:    ; implicit-def: $vgpr4_vgpr5
5054; GFX908-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
5055; GFX908-NEXT:    s_cbranch_execz .LBB23_2
5056; GFX908-NEXT:  .LBB23_6: ; %atomicrmw.private
5057; GFX908-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[6:7]
5058; GFX908-NEXT:    v_cndmask_b32_e32 v2, -1, v6, vcc
5059; GFX908-NEXT:    buffer_load_dword v0, v2, s[0:3], 0 offen
5060; GFX908-NEXT:    buffer_load_dword v1, v2, s[0:3], 0 offen offset:4
5061; GFX908-NEXT:    s_waitcnt vmcnt(0)
5062; GFX908-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
5063; GFX908-NEXT:    v_min_f64 v[0:1], v[0:1], v[4:5]
5064; GFX908-NEXT:    buffer_store_dword v0, v2, s[0:3], 0 offen
5065; GFX908-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen offset:4
5066; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
5067; GFX908-NEXT:    s_waitcnt vmcnt(0)
5068; GFX908-NEXT:    s_setpc_b64 s[30:31]
5069;
5070; GFX8-LABEL: flat_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory:
5071; GFX8:       ; %bb.0:
5072; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5073; GFX8-NEXT:    v_max_f64 v[4:5], v[2:3], v[2:3]
5074; GFX8-NEXT:    s_mov_b64 s[4:5], 0xc0
5075; GFX8-NEXT:    s_load_dword s4, s[4:5], 0x0
5076; GFX8-NEXT:    v_add_u32_e32 v6, vcc, 0xfffff800, v0
5077; GFX8-NEXT:    v_addc_u32_e32 v7, vcc, -1, v1, vcc
5078; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
5079; GFX8-NEXT:    v_cmp_ne_u32_e32 vcc, s4, v7
5080; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
5081; GFX8-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
5082; GFX8-NEXT:    s_cbranch_execnz .LBB23_3
5083; GFX8-NEXT:  ; %bb.1: ; %Flow2
5084; GFX8-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
5085; GFX8-NEXT:    s_cbranch_execnz .LBB23_6
5086; GFX8-NEXT:  .LBB23_2: ; %atomicrmw.phi
5087; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
5088; GFX8-NEXT:    s_setpc_b64 s[30:31]
5089; GFX8-NEXT:  .LBB23_3: ; %atomicrmw.global
5090; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 4, v6
5091; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v7, vcc
5092; GFX8-NEXT:    flat_load_dword v3, v[0:1]
5093; GFX8-NEXT:    flat_load_dword v2, v[6:7]
5094; GFX8-NEXT:    s_mov_b64 s[6:7], 0
5095; GFX8-NEXT:  .LBB23_4: ; %atomicrmw.start
5096; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
5097; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5098; GFX8-NEXT:    v_max_f64 v[0:1], v[2:3], v[2:3]
5099; GFX8-NEXT:    v_min_f64 v[0:1], v[0:1], v[4:5]
5100; GFX8-NEXT:    flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc
5101; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5102; GFX8-NEXT:    buffer_wbinvl1
5103; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
5104; GFX8-NEXT:    v_mov_b32_e32 v3, v1
5105; GFX8-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
5106; GFX8-NEXT:    v_mov_b32_e32 v2, v0
5107; GFX8-NEXT:    s_andn2_b64 exec, exec, s[6:7]
5108; GFX8-NEXT:    s_cbranch_execnz .LBB23_4
5109; GFX8-NEXT:  ; %bb.5: ; %Flow
5110; GFX8-NEXT:    s_or_b64 exec, exec, s[6:7]
5111; GFX8-NEXT:    ; implicit-def: $vgpr6_vgpr7
5112; GFX8-NEXT:    ; implicit-def: $vgpr4_vgpr5
5113; GFX8-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
5114; GFX8-NEXT:    s_cbranch_execz .LBB23_2
5115; GFX8-NEXT:  .LBB23_6: ; %atomicrmw.private
5116; GFX8-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[6:7]
5117; GFX8-NEXT:    v_cndmask_b32_e32 v2, -1, v6, vcc
5118; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 4, v2
5119; GFX8-NEXT:    buffer_load_dword v0, v2, s[0:3], 0 offen
5120; GFX8-NEXT:    buffer_load_dword v1, v3, s[0:3], 0 offen
5121; GFX8-NEXT:    s_waitcnt vmcnt(0)
5122; GFX8-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
5123; GFX8-NEXT:    v_min_f64 v[0:1], v[0:1], v[4:5]
5124; GFX8-NEXT:    buffer_store_dword v0, v2, s[0:3], 0 offen
5125; GFX8-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
5126; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
5127; GFX8-NEXT:    s_waitcnt vmcnt(0)
5128; GFX8-NEXT:    s_setpc_b64 s[30:31]
5129;
5130; GFX7-LABEL: flat_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory:
5131; GFX7:       ; %bb.0:
5132; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5133; GFX7-NEXT:    s_mov_b64 s[4:5], 0xc0
5134; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x0
5135; GFX7-NEXT:    v_add_i32_e32 v0, vcc, 0xfffff800, v0
5136; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, -1, v1, vcc
5137; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
5138; GFX7-NEXT:    v_cmp_ne_u32_e32 vcc, s4, v1
5139; GFX7-NEXT:    s_and_saveexec_b64 s[4:5], vcc
5140; GFX7-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
5141; GFX7-NEXT:    s_cbranch_execnz .LBB23_3
5142; GFX7-NEXT:  ; %bb.1: ; %Flow
5143; GFX7-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
5144; GFX7-NEXT:    s_cbranch_execnz .LBB23_4
5145; GFX7-NEXT:  .LBB23_2: ; %atomicrmw.phi
5146; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
5147; GFX7-NEXT:    s_setpc_b64 s[30:31]
5148; GFX7-NEXT:  .LBB23_3: ; %atomicrmw.global
5149; GFX7-NEXT:    flat_atomic_fmin_x2 v[0:1], v[2:3]
5150; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5151; GFX7-NEXT:    buffer_wbinvl1
5152; GFX7-NEXT:    ; implicit-def: $vgpr0_vgpr1
5153; GFX7-NEXT:    ; implicit-def: $vgpr2_vgpr3
5154; GFX7-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
5155; GFX7-NEXT:    s_cbranch_execz .LBB23_2
5156; GFX7-NEXT:  .LBB23_4: ; %atomicrmw.private
5157; GFX7-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
5158; GFX7-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
5159; GFX7-NEXT:    v_cndmask_b32_e32 v4, -1, v0, vcc
5160; GFX7-NEXT:    v_add_i32_e32 v5, vcc, 4, v4
5161; GFX7-NEXT:    buffer_load_dword v0, v4, s[0:3], 0 offen
5162; GFX7-NEXT:    buffer_load_dword v1, v5, s[0:3], 0 offen
5163; GFX7-NEXT:    s_waitcnt vmcnt(0)
5164; GFX7-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
5165; GFX7-NEXT:    v_min_f64 v[0:1], v[0:1], v[2:3]
5166; GFX7-NEXT:    buffer_store_dword v0, v4, s[0:3], 0 offen
5167; GFX7-NEXT:    buffer_store_dword v1, v5, s[0:3], 0 offen
5168; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
5169; GFX7-NEXT:    s_waitcnt vmcnt(0)
5170; GFX7-NEXT:    s_setpc_b64 s[30:31]
5171  %gep = getelementptr double, ptr %ptr, i64 -256
5172  %unused = atomicrmw fmin ptr %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
5173  ret void
5174}
5175
; Value-returning f64 `atomicrmw fmin` through a flat pointer, syncscope
; "agent", seq_cst, tagged only with !amdgpu.no.remote.memory.
;
; What the autogenerated assertions below record for this case:
;  - Every target compares the high dword of the pointer against
;    src_private_base (tonga/hawaii instead compare against a dword loaded
;    from address 0xc0) and splits into the %atomicrmw.global and
;    %atomicrmw.private blocks, rejoining at %atomicrmw.phi.
;  - On the global path gfx940 emits a single flat_atomic_min_f64; every other
;    target emits a flat_atomic_cmpswap retry loop (%atomicrmw.start).
;  - The private path is a non-atomic scratch/buffer load, v_max + v_min, and
;    store, with a null pointer remapped to address -1 via v_cndmask.
;
; The assertion lines are produced by utils/update_llc_test_checks.py (see the
; note on the first line of this file); regenerate them rather than editing
; them by hand.
; NOTE(review): attribute group #0 and metadata node !0 are defined outside
; this part of the file -- confirm their contents there.
5176define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory(ptr %ptr, double %val) #0 {
5177; GFX12-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory:
5178; GFX12:       ; %bb.0:
5179; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
5180; GFX12-NEXT:    s_wait_expcnt 0x0
5181; GFX12-NEXT:    s_wait_samplecnt 0x0
5182; GFX12-NEXT:    s_wait_bvhcnt 0x0
5183; GFX12-NEXT:    s_wait_kmcnt 0x0
5184; GFX12-NEXT:    v_max_num_f64_e32 v[4:5], v[2:3], v[2:3]
5185; GFX12-NEXT:    s_mov_b64 s[0:1], src_private_base
5186; GFX12-NEXT:    s_mov_b32 s0, exec_lo
5187; GFX12-NEXT:    ; implicit-def: $vgpr2_vgpr3
5188; GFX12-NEXT:    s_wait_alu 0xfffe
5189; GFX12-NEXT:    v_cmpx_ne_u32_e64 s1, v1
5190; GFX12-NEXT:    s_xor_b32 s0, exec_lo, s0
5191; GFX12-NEXT:    s_cbranch_execz .LBB24_4
5192; GFX12-NEXT:  ; %bb.1: ; %atomicrmw.global
5193; GFX12-NEXT:    flat_load_b64 v[2:3], v[0:1]
5194; GFX12-NEXT:    s_mov_b32 s1, 0
5195; GFX12-NEXT:  .LBB24_2: ; %atomicrmw.start
5196; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
5197; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
5198; GFX12-NEXT:    v_dual_mov_b32 v9, v3 :: v_dual_mov_b32 v8, v2
5199; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
5200; GFX12-NEXT:    v_max_num_f64_e32 v[2:3], v[8:9], v[8:9]
5201; GFX12-NEXT:    v_min_num_f64_e32 v[6:7], v[2:3], v[4:5]
5202; GFX12-NEXT:    s_wait_storecnt 0x0
5203; GFX12-NEXT:    flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[6:9] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
5204; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
5205; GFX12-NEXT:    global_inv scope:SCOPE_DEV
5206; GFX12-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[8:9]
5207; GFX12-NEXT:    s_wait_alu 0xfffe
5208; GFX12-NEXT:    s_or_b32 s1, vcc_lo, s1
5209; GFX12-NEXT:    s_wait_alu 0xfffe
5210; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
5211; GFX12-NEXT:    s_cbranch_execnz .LBB24_2
5212; GFX12-NEXT:  ; %bb.3: ; %Flow
5213; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s1
5214; GFX12-NEXT:    ; implicit-def: $vgpr0_vgpr1
5215; GFX12-NEXT:    ; implicit-def: $vgpr4_vgpr5
5216; GFX12-NEXT:  .LBB24_4: ; %Flow2
5217; GFX12-NEXT:    s_wait_alu 0xfffe
5218; GFX12-NEXT:    s_and_not1_saveexec_b32 s0, s0
5219; GFX12-NEXT:    s_cbranch_execz .LBB24_6
5220; GFX12-NEXT:  ; %bb.5: ; %atomicrmw.private
5221; GFX12-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
5222; GFX12-NEXT:    v_cndmask_b32_e32 v6, -1, v0, vcc_lo
5223; GFX12-NEXT:    scratch_load_b64 v[2:3], v6, off
5224; GFX12-NEXT:    s_wait_loadcnt 0x0
5225; GFX12-NEXT:    v_max_num_f64_e32 v[0:1], v[2:3], v[2:3]
5226; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
5227; GFX12-NEXT:    v_min_num_f64_e32 v[0:1], v[0:1], v[4:5]
5228; GFX12-NEXT:    scratch_store_b64 v6, v[0:1], off
5229; GFX12-NEXT:  .LBB24_6: ; %atomicrmw.phi
5230; GFX12-NEXT:    s_wait_alu 0xfffe
5231; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
5232; GFX12-NEXT:    v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3
5233; GFX12-NEXT:    s_wait_alu 0xfffe
5234; GFX12-NEXT:    s_setpc_b64 s[30:31]
5235;
5236; GFX940-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory:
5237; GFX940:       ; %bb.0:
5238; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5239; GFX940-NEXT:    v_mov_b32_e32 v5, v1
5240; GFX940-NEXT:    s_mov_b64 s[0:1], src_private_base
5241; GFX940-NEXT:    v_mov_b32_e32 v4, v0
5242; GFX940-NEXT:    v_cmp_ne_u32_e32 vcc, s1, v5
5243; GFX940-NEXT:    ; implicit-def: $vgpr0_vgpr1
5244; GFX940-NEXT:    s_and_saveexec_b64 s[0:1], vcc
5245; GFX940-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
5246; GFX940-NEXT:    s_cbranch_execnz .LBB24_3
5247; GFX940-NEXT:  ; %bb.1: ; %Flow
5248; GFX940-NEXT:    s_andn2_saveexec_b64 s[0:1], s[0:1]
5249; GFX940-NEXT:    s_cbranch_execnz .LBB24_4
5250; GFX940-NEXT:  .LBB24_2: ; %atomicrmw.phi
5251; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
5252; GFX940-NEXT:    s_setpc_b64 s[30:31]
5253; GFX940-NEXT:  .LBB24_3: ; %atomicrmw.global
5254; GFX940-NEXT:    buffer_wbl2 sc1
5255; GFX940-NEXT:    flat_atomic_min_f64 v[0:1], v[4:5], v[2:3] sc0
5256; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5257; GFX940-NEXT:    buffer_inv sc1
5258; GFX940-NEXT:    ; implicit-def: $vgpr4_vgpr5
5259; GFX940-NEXT:    ; implicit-def: $vgpr2_vgpr3
5260; GFX940-NEXT:    s_andn2_saveexec_b64 s[0:1], s[0:1]
5261; GFX940-NEXT:    s_cbranch_execz .LBB24_2
5262; GFX940-NEXT:  .LBB24_4: ; %atomicrmw.private
5263; GFX940-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[4:5]
5264; GFX940-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
5265; GFX940-NEXT:    s_nop 0
5266; GFX940-NEXT:    v_cndmask_b32_e32 v6, -1, v4, vcc
5267; GFX940-NEXT:    scratch_load_dwordx2 v[0:1], v6, off
5268; GFX940-NEXT:    s_waitcnt vmcnt(0)
5269; GFX940-NEXT:    v_max_f64 v[4:5], v[0:1], v[0:1]
5270; GFX940-NEXT:    v_min_f64 v[2:3], v[4:5], v[2:3]
5271; GFX940-NEXT:    scratch_store_dwordx2 v6, v[2:3], off sc0 sc1
5272; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
5273; GFX940-NEXT:    s_waitcnt vmcnt(0)
5274; GFX940-NEXT:    s_setpc_b64 s[30:31]
5275;
5276; GFX11-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory:
5277; GFX11:       ; %bb.0:
5278; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5279; GFX11-NEXT:    v_max_f64 v[4:5], v[2:3], v[2:3]
5280; GFX11-NEXT:    s_mov_b64 s[0:1], src_private_base
5281; GFX11-NEXT:    s_mov_b32 s0, exec_lo
5282; GFX11-NEXT:    ; implicit-def: $vgpr2_vgpr3
5283; GFX11-NEXT:    v_cmpx_ne_u32_e64 s1, v1
5284; GFX11-NEXT:    s_xor_b32 s0, exec_lo, s0
5285; GFX11-NEXT:    s_cbranch_execz .LBB24_4
5286; GFX11-NEXT:  ; %bb.1: ; %atomicrmw.global
5287; GFX11-NEXT:    flat_load_b64 v[2:3], v[0:1]
5288; GFX11-NEXT:    s_mov_b32 s1, 0
5289; GFX11-NEXT:  .LBB24_2: ; %atomicrmw.start
5290; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
5291; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5292; GFX11-NEXT:    v_dual_mov_b32 v9, v3 :: v_dual_mov_b32 v8, v2
5293; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
5294; GFX11-NEXT:    v_max_f64 v[2:3], v[8:9], v[8:9]
5295; GFX11-NEXT:    v_min_f64 v[6:7], v[2:3], v[4:5]
5296; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
5297; GFX11-NEXT:    flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[6:9] glc
5298; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5299; GFX11-NEXT:    buffer_gl1_inv
5300; GFX11-NEXT:    buffer_gl0_inv
5301; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[8:9]
5302; GFX11-NEXT:    s_or_b32 s1, vcc_lo, s1
5303; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
5304; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
5305; GFX11-NEXT:    s_cbranch_execnz .LBB24_2
5306; GFX11-NEXT:  ; %bb.3: ; %Flow
5307; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s1
5308; GFX11-NEXT:    ; implicit-def: $vgpr0_vgpr1
5309; GFX11-NEXT:    ; implicit-def: $vgpr4_vgpr5
5310; GFX11-NEXT:  .LBB24_4: ; %Flow2
5311; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
5312; GFX11-NEXT:    s_cbranch_execz .LBB24_6
5313; GFX11-NEXT:  ; %bb.5: ; %atomicrmw.private
5314; GFX11-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
5315; GFX11-NEXT:    v_cndmask_b32_e32 v6, -1, v0, vcc_lo
5316; GFX11-NEXT:    scratch_load_b64 v[2:3], v6, off
5317; GFX11-NEXT:    s_waitcnt vmcnt(0)
5318; GFX11-NEXT:    v_max_f64 v[0:1], v[2:3], v[2:3]
5319; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
5320; GFX11-NEXT:    v_min_f64 v[0:1], v[0:1], v[4:5]
5321; GFX11-NEXT:    scratch_store_b64 v6, v[0:1], off
5322; GFX11-NEXT:  .LBB24_6: ; %atomicrmw.phi
5323; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
5324; GFX11-NEXT:    v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3
5325; GFX11-NEXT:    s_setpc_b64 s[30:31]
5326;
5327; GFX10-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory:
5328; GFX10:       ; %bb.0:
5329; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5330; GFX10-NEXT:    v_max_f64 v[4:5], v[2:3], v[2:3]
5331; GFX10-NEXT:    s_mov_b64 s[4:5], src_private_base
5332; GFX10-NEXT:    ; implicit-def: $vgpr2_vgpr3
5333; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, s5, v1
5334; GFX10-NEXT:    s_and_saveexec_b32 s4, vcc_lo
5335; GFX10-NEXT:    s_xor_b32 s4, exec_lo, s4
5336; GFX10-NEXT:    s_cbranch_execz .LBB24_4
5337; GFX10-NEXT:  ; %bb.1: ; %atomicrmw.global
5338; GFX10-NEXT:    flat_load_dwordx2 v[2:3], v[0:1]
5339; GFX10-NEXT:    s_mov_b32 s5, 0
5340; GFX10-NEXT:  .LBB24_2: ; %atomicrmw.start
5341; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
5342; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5343; GFX10-NEXT:    v_mov_b32_e32 v9, v3
5344; GFX10-NEXT:    v_mov_b32_e32 v8, v2
5345; GFX10-NEXT:    v_max_f64 v[2:3], v[8:9], v[8:9]
5346; GFX10-NEXT:    v_min_f64 v[6:7], v[2:3], v[4:5]
5347; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
5348; GFX10-NEXT:    flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
5349; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5350; GFX10-NEXT:    buffer_gl1_inv
5351; GFX10-NEXT:    buffer_gl0_inv
5352; GFX10-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[8:9]
5353; GFX10-NEXT:    s_or_b32 s5, vcc_lo, s5
5354; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s5
5355; GFX10-NEXT:    s_cbranch_execnz .LBB24_2
5356; GFX10-NEXT:  ; %bb.3: ; %Flow
5357; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s5
5358; GFX10-NEXT:    ; implicit-def: $vgpr0_vgpr1
5359; GFX10-NEXT:    ; implicit-def: $vgpr4_vgpr5
5360; GFX10-NEXT:  .LBB24_4: ; %Flow2
5361; GFX10-NEXT:    s_andn2_saveexec_b32 s4, s4
5362; GFX10-NEXT:    s_cbranch_execz .LBB24_6
5363; GFX10-NEXT:  ; %bb.5: ; %atomicrmw.private
5364; GFX10-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
5365; GFX10-NEXT:    v_cndmask_b32_e32 v6, -1, v0, vcc_lo
5366; GFX10-NEXT:    s_clause 0x1
5367; GFX10-NEXT:    buffer_load_dword v2, v6, s[0:3], 0 offen
5368; GFX10-NEXT:    buffer_load_dword v3, v6, s[0:3], 0 offen offset:4
5369; GFX10-NEXT:    s_waitcnt vmcnt(0)
5370; GFX10-NEXT:    v_max_f64 v[0:1], v[2:3], v[2:3]
5371; GFX10-NEXT:    v_min_f64 v[0:1], v[0:1], v[4:5]
5372; GFX10-NEXT:    buffer_store_dword v0, v6, s[0:3], 0 offen
5373; GFX10-NEXT:    buffer_store_dword v1, v6, s[0:3], 0 offen offset:4
5374; GFX10-NEXT:  .LBB24_6: ; %atomicrmw.phi
5375; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
5376; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
5377; GFX10-NEXT:    v_mov_b32_e32 v0, v2
5378; GFX10-NEXT:    v_mov_b32_e32 v1, v3
5379; GFX10-NEXT:    s_setpc_b64 s[30:31]
5380;
5381; GFX90A-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory:
5382; GFX90A:       ; %bb.0:
5383; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5384; GFX90A-NEXT:    s_mov_b64 s[4:5], src_private_base
5385; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v1
5386; GFX90A-NEXT:    v_max_f64 v[4:5], v[2:3], v[2:3]
5387; GFX90A-NEXT:    ; implicit-def: $vgpr2_vgpr3
5388; GFX90A-NEXT:    s_and_saveexec_b64 s[4:5], vcc
5389; GFX90A-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
5390; GFX90A-NEXT:    s_cbranch_execz .LBB24_4
5391; GFX90A-NEXT:  ; %bb.1: ; %atomicrmw.global
5392; GFX90A-NEXT:    flat_load_dwordx2 v[2:3], v[0:1]
5393; GFX90A-NEXT:    s_mov_b64 s[6:7], 0
5394; GFX90A-NEXT:  .LBB24_2: ; %atomicrmw.start
5395; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
5396; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5397; GFX90A-NEXT:    v_pk_mov_b32 v[8:9], v[2:3], v[2:3] op_sel:[0,1]
5398; GFX90A-NEXT:    v_max_f64 v[2:3], v[8:9], v[8:9]
5399; GFX90A-NEXT:    v_min_f64 v[6:7], v[2:3], v[4:5]
5400; GFX90A-NEXT:    flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
5401; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5402; GFX90A-NEXT:    buffer_wbinvl1
5403; GFX90A-NEXT:    v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
5404; GFX90A-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
5405; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[6:7]
5406; GFX90A-NEXT:    s_cbranch_execnz .LBB24_2
5407; GFX90A-NEXT:  ; %bb.3: ; %Flow
5408; GFX90A-NEXT:    s_or_b64 exec, exec, s[6:7]
5409; GFX90A-NEXT:    ; implicit-def: $vgpr0_vgpr1
5410; GFX90A-NEXT:    ; implicit-def: $vgpr4_vgpr5
5411; GFX90A-NEXT:  .LBB24_4: ; %Flow2
5412; GFX90A-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
5413; GFX90A-NEXT:    s_cbranch_execz .LBB24_6
5414; GFX90A-NEXT:  ; %bb.5: ; %atomicrmw.private
5415; GFX90A-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
5416; GFX90A-NEXT:    v_cndmask_b32_e32 v6, -1, v0, vcc
5417; GFX90A-NEXT:    buffer_load_dword v2, v6, s[0:3], 0 offen
5418; GFX90A-NEXT:    buffer_load_dword v3, v6, s[0:3], 0 offen offset:4
5419; GFX90A-NEXT:    s_waitcnt vmcnt(0)
5420; GFX90A-NEXT:    v_max_f64 v[0:1], v[2:3], v[2:3]
5421; GFX90A-NEXT:    v_min_f64 v[0:1], v[0:1], v[4:5]
5422; GFX90A-NEXT:    buffer_store_dword v0, v6, s[0:3], 0 offen
5423; GFX90A-NEXT:    buffer_store_dword v1, v6, s[0:3], 0 offen offset:4
5424; GFX90A-NEXT:  .LBB24_6: ; %atomicrmw.phi
5425; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
5426; GFX90A-NEXT:    v_mov_b32_e32 v0, v2
5427; GFX90A-NEXT:    v_mov_b32_e32 v1, v3
5428; GFX90A-NEXT:    s_waitcnt vmcnt(0)
5429; GFX90A-NEXT:    s_setpc_b64 s[30:31]
5430;
5431; GFX908-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory:
5432; GFX908:       ; %bb.0:
5433; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5434; GFX908-NEXT:    v_max_f64 v[4:5], v[2:3], v[2:3]
5435; GFX908-NEXT:    s_mov_b64 s[4:5], src_private_base
5436; GFX908-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v1
5437; GFX908-NEXT:    ; implicit-def: $vgpr2_vgpr3
5438; GFX908-NEXT:    s_and_saveexec_b64 s[4:5], vcc
5439; GFX908-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
5440; GFX908-NEXT:    s_cbranch_execz .LBB24_4
5441; GFX908-NEXT:  ; %bb.1: ; %atomicrmw.global
5442; GFX908-NEXT:    flat_load_dwordx2 v[2:3], v[0:1]
5443; GFX908-NEXT:    s_mov_b64 s[6:7], 0
5444; GFX908-NEXT:  .LBB24_2: ; %atomicrmw.start
5445; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
5446; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5447; GFX908-NEXT:    v_mov_b32_e32 v9, v3
5448; GFX908-NEXT:    v_mov_b32_e32 v8, v2
5449; GFX908-NEXT:    v_max_f64 v[2:3], v[8:9], v[8:9]
5450; GFX908-NEXT:    v_min_f64 v[6:7], v[2:3], v[4:5]
5451; GFX908-NEXT:    flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
5452; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5453; GFX908-NEXT:    buffer_wbinvl1
5454; GFX908-NEXT:    v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
5455; GFX908-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
5456; GFX908-NEXT:    s_andn2_b64 exec, exec, s[6:7]
5457; GFX908-NEXT:    s_cbranch_execnz .LBB24_2
5458; GFX908-NEXT:  ; %bb.3: ; %Flow
5459; GFX908-NEXT:    s_or_b64 exec, exec, s[6:7]
5460; GFX908-NEXT:    ; implicit-def: $vgpr0_vgpr1
5461; GFX908-NEXT:    ; implicit-def: $vgpr4_vgpr5
5462; GFX908-NEXT:  .LBB24_4: ; %Flow2
5463; GFX908-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
5464; GFX908-NEXT:    s_cbranch_execz .LBB24_6
5465; GFX908-NEXT:  ; %bb.5: ; %atomicrmw.private
5466; GFX908-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
5467; GFX908-NEXT:    v_cndmask_b32_e32 v6, -1, v0, vcc
5468; GFX908-NEXT:    buffer_load_dword v2, v6, s[0:3], 0 offen
5469; GFX908-NEXT:    buffer_load_dword v3, v6, s[0:3], 0 offen offset:4
5470; GFX908-NEXT:    s_waitcnt vmcnt(0)
5471; GFX908-NEXT:    v_max_f64 v[0:1], v[2:3], v[2:3]
5472; GFX908-NEXT:    v_min_f64 v[0:1], v[0:1], v[4:5]
5473; GFX908-NEXT:    buffer_store_dword v0, v6, s[0:3], 0 offen
5474; GFX908-NEXT:    buffer_store_dword v1, v6, s[0:3], 0 offen offset:4
5475; GFX908-NEXT:  .LBB24_6: ; %atomicrmw.phi
5476; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
5477; GFX908-NEXT:    v_mov_b32_e32 v0, v2
5478; GFX908-NEXT:    v_mov_b32_e32 v1, v3
5479; GFX908-NEXT:    s_waitcnt vmcnt(0)
5480; GFX908-NEXT:    s_setpc_b64 s[30:31]
5481;
5482; GFX8-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory:
5483; GFX8:       ; %bb.0:
5484; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5485; GFX8-NEXT:    v_max_f64 v[4:5], v[2:3], v[2:3]
5486; GFX8-NEXT:    s_mov_b64 s[4:5], 0xc0
5487; GFX8-NEXT:    s_load_dword s4, s[4:5], 0x0
5488; GFX8-NEXT:    ; implicit-def: $vgpr2_vgpr3
5489; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
5490; GFX8-NEXT:    v_cmp_ne_u32_e32 vcc, s4, v1
5491; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
5492; GFX8-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
5493; GFX8-NEXT:    s_cbranch_execz .LBB24_4
5494; GFX8-NEXT:  ; %bb.1: ; %atomicrmw.global
5495; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 4, v0
5496; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
5497; GFX8-NEXT:    flat_load_dword v3, v[2:3]
5498; GFX8-NEXT:    flat_load_dword v2, v[0:1]
5499; GFX8-NEXT:    s_mov_b64 s[6:7], 0
5500; GFX8-NEXT:  .LBB24_2: ; %atomicrmw.start
5501; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
5502; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5503; GFX8-NEXT:    v_mov_b32_e32 v9, v3
5504; GFX8-NEXT:    v_mov_b32_e32 v8, v2
5505; GFX8-NEXT:    v_max_f64 v[2:3], v[8:9], v[8:9]
5506; GFX8-NEXT:    v_min_f64 v[6:7], v[2:3], v[4:5]
5507; GFX8-NEXT:    flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
5508; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5509; GFX8-NEXT:    buffer_wbinvl1
5510; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
5511; GFX8-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
5512; GFX8-NEXT:    s_andn2_b64 exec, exec, s[6:7]
5513; GFX8-NEXT:    s_cbranch_execnz .LBB24_2
5514; GFX8-NEXT:  ; %bb.3: ; %Flow
5515; GFX8-NEXT:    s_or_b64 exec, exec, s[6:7]
5516; GFX8-NEXT:    ; implicit-def: $vgpr0_vgpr1
5517; GFX8-NEXT:    ; implicit-def: $vgpr4_vgpr5
5518; GFX8-NEXT:  .LBB24_4: ; %Flow2
5519; GFX8-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
5520; GFX8-NEXT:    s_cbranch_execz .LBB24_6
5521; GFX8-NEXT:  ; %bb.5: ; %atomicrmw.private
5522; GFX8-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
5523; GFX8-NEXT:    v_cndmask_b32_e32 v6, -1, v0, vcc
5524; GFX8-NEXT:    v_add_u32_e32 v7, vcc, 4, v6
5525; GFX8-NEXT:    buffer_load_dword v2, v6, s[0:3], 0 offen
5526; GFX8-NEXT:    buffer_load_dword v3, v7, s[0:3], 0 offen
5527; GFX8-NEXT:    s_waitcnt vmcnt(0)
5528; GFX8-NEXT:    v_max_f64 v[0:1], v[2:3], v[2:3]
5529; GFX8-NEXT:    v_min_f64 v[0:1], v[0:1], v[4:5]
5530; GFX8-NEXT:    buffer_store_dword v0, v6, s[0:3], 0 offen
5531; GFX8-NEXT:    buffer_store_dword v1, v7, s[0:3], 0 offen
5532; GFX8-NEXT:  .LBB24_6: ; %atomicrmw.phi
5533; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
5534; GFX8-NEXT:    v_mov_b32_e32 v0, v2
5535; GFX8-NEXT:    v_mov_b32_e32 v1, v3
5536; GFX8-NEXT:    s_waitcnt vmcnt(0)
5537; GFX8-NEXT:    s_setpc_b64 s[30:31]
5538;
5539; GFX7-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory:
5540; GFX7:       ; %bb.0:
5541; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5542; GFX7-NEXT:    s_mov_b64 s[4:5], 0xc0
5543; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x0
5544; GFX7-NEXT:    v_max_f64 v[4:5], v[2:3], v[2:3]
5545; GFX7-NEXT:    ; implicit-def: $vgpr2_vgpr3
5546; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
5547; GFX7-NEXT:    v_cmp_ne_u32_e32 vcc, s4, v1
5548; GFX7-NEXT:    s_and_saveexec_b64 s[4:5], vcc
5549; GFX7-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
5550; GFX7-NEXT:    s_cbranch_execz .LBB24_4
5551; GFX7-NEXT:  ; %bb.1: ; %atomicrmw.global
5552; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 4, v0
5553; GFX7-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
5554; GFX7-NEXT:    flat_load_dword v3, v[2:3]
5555; GFX7-NEXT:    flat_load_dword v2, v[0:1]
5556; GFX7-NEXT:    s_mov_b64 s[6:7], 0
5557; GFX7-NEXT:  .LBB24_2: ; %atomicrmw.start
5558; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
5559; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5560; GFX7-NEXT:    v_mov_b32_e32 v9, v3
5561; GFX7-NEXT:    v_mov_b32_e32 v8, v2
5562; GFX7-NEXT:    v_max_f64 v[2:3], v[8:9], v[8:9]
5563; GFX7-NEXT:    v_min_f64 v[6:7], v[2:3], v[4:5]
5564; GFX7-NEXT:    flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
5565; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5566; GFX7-NEXT:    buffer_wbinvl1
5567; GFX7-NEXT:    v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
5568; GFX7-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
5569; GFX7-NEXT:    s_andn2_b64 exec, exec, s[6:7]
5570; GFX7-NEXT:    s_cbranch_execnz .LBB24_2
5571; GFX7-NEXT:  ; %bb.3: ; %Flow
5572; GFX7-NEXT:    s_or_b64 exec, exec, s[6:7]
5573; GFX7-NEXT:    ; implicit-def: $vgpr0_vgpr1
5574; GFX7-NEXT:    ; implicit-def: $vgpr4_vgpr5
5575; GFX7-NEXT:  .LBB24_4: ; %Flow2
5576; GFX7-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
5577; GFX7-NEXT:    s_cbranch_execz .LBB24_6
5578; GFX7-NEXT:  ; %bb.5: ; %atomicrmw.private
5579; GFX7-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
5580; GFX7-NEXT:    v_cndmask_b32_e32 v6, -1, v0, vcc
5581; GFX7-NEXT:    v_add_i32_e32 v7, vcc, 4, v6
5582; GFX7-NEXT:    buffer_load_dword v2, v6, s[0:3], 0 offen
5583; GFX7-NEXT:    buffer_load_dword v3, v7, s[0:3], 0 offen
5584; GFX7-NEXT:    s_waitcnt vmcnt(0)
5585; GFX7-NEXT:    v_max_f64 v[0:1], v[2:3], v[2:3]
5586; GFX7-NEXT:    v_min_f64 v[0:1], v[0:1], v[4:5]
5587; GFX7-NEXT:    buffer_store_dword v0, v6, s[0:3], 0 offen
5588; GFX7-NEXT:    buffer_store_dword v1, v7, s[0:3], 0 offen
5589; GFX7-NEXT:  .LBB24_6: ; %atomicrmw.phi
5590; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
5591; GFX7-NEXT:    v_mov_b32_e32 v0, v2
5592; GFX7-NEXT:    v_mov_b32_e32 v1, v3
5593; GFX7-NEXT:    s_waitcnt vmcnt(0)
5594; GFX7-NEXT:    s_setpc_b64 s[30:31]
  ; The operation under test: no address offset is applied to %ptr, and the
  ; value produced by the atomicrmw is what the function returns.
5595  %result = atomicrmw fmin ptr %ptr, double %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
5596  ret double %result
5597}
5598
5599define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr %ptr, double %val) #0 {
5600; GFX12-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
5601; GFX12:       ; %bb.0:
5602; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
5603; GFX12-NEXT:    s_wait_expcnt 0x0
5604; GFX12-NEXT:    s_wait_samplecnt 0x0
5605; GFX12-NEXT:    s_wait_bvhcnt 0x0
5606; GFX12-NEXT:    s_wait_kmcnt 0x0
5607; GFX12-NEXT:    v_max_num_f64_e32 v[4:5], v[2:3], v[2:3]
5608; GFX12-NEXT:    s_mov_b64 s[0:1], src_private_base
5609; GFX12-NEXT:    s_mov_b32 s0, exec_lo
5610; GFX12-NEXT:    ; implicit-def: $vgpr2_vgpr3
5611; GFX12-NEXT:    s_wait_alu 0xfffe
5612; GFX12-NEXT:    v_cmpx_ne_u32_e64 s1, v1
5613; GFX12-NEXT:    s_xor_b32 s0, exec_lo, s0
5614; GFX12-NEXT:    s_cbranch_execz .LBB25_4
5615; GFX12-NEXT:  ; %bb.1: ; %atomicrmw.global
5616; GFX12-NEXT:    flat_load_b64 v[2:3], v[0:1]
5617; GFX12-NEXT:    s_mov_b32 s1, 0
5618; GFX12-NEXT:  .LBB25_2: ; %atomicrmw.start
5619; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
5620; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
5621; GFX12-NEXT:    v_dual_mov_b32 v9, v3 :: v_dual_mov_b32 v8, v2
5622; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
5623; GFX12-NEXT:    v_max_num_f64_e32 v[2:3], v[8:9], v[8:9]
5624; GFX12-NEXT:    v_min_num_f64_e32 v[6:7], v[2:3], v[4:5]
5625; GFX12-NEXT:    s_wait_storecnt 0x0
5626; GFX12-NEXT:    flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[6:9] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
5627; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
5628; GFX12-NEXT:    global_inv scope:SCOPE_DEV
5629; GFX12-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[8:9]
5630; GFX12-NEXT:    s_wait_alu 0xfffe
5631; GFX12-NEXT:    s_or_b32 s1, vcc_lo, s1
5632; GFX12-NEXT:    s_wait_alu 0xfffe
5633; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
5634; GFX12-NEXT:    s_cbranch_execnz .LBB25_2
5635; GFX12-NEXT:  ; %bb.3: ; %Flow
5636; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s1
5637; GFX12-NEXT:    ; implicit-def: $vgpr0_vgpr1
5638; GFX12-NEXT:    ; implicit-def: $vgpr4_vgpr5
5639; GFX12-NEXT:  .LBB25_4: ; %Flow2
5640; GFX12-NEXT:    s_wait_alu 0xfffe
5641; GFX12-NEXT:    s_and_not1_saveexec_b32 s0, s0
5642; GFX12-NEXT:    s_cbranch_execz .LBB25_6
5643; GFX12-NEXT:  ; %bb.5: ; %atomicrmw.private
5644; GFX12-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
5645; GFX12-NEXT:    v_cndmask_b32_e32 v6, -1, v0, vcc_lo
5646; GFX12-NEXT:    scratch_load_b64 v[2:3], v6, off
5647; GFX12-NEXT:    s_wait_loadcnt 0x0
5648; GFX12-NEXT:    v_max_num_f64_e32 v[0:1], v[2:3], v[2:3]
5649; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
5650; GFX12-NEXT:    v_min_num_f64_e32 v[0:1], v[0:1], v[4:5]
5651; GFX12-NEXT:    scratch_store_b64 v6, v[0:1], off
5652; GFX12-NEXT:  .LBB25_6: ; %atomicrmw.phi
5653; GFX12-NEXT:    s_wait_alu 0xfffe
5654; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
5655; GFX12-NEXT:    v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3
5656; GFX12-NEXT:    s_wait_alu 0xfffe
5657; GFX12-NEXT:    s_setpc_b64 s[30:31]
5658;
5659; GFX940-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
5660; GFX940:       ; %bb.0:
5661; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5662; GFX940-NEXT:    v_mov_b32_e32 v5, v1
5663; GFX940-NEXT:    s_mov_b64 s[0:1], src_private_base
5664; GFX940-NEXT:    v_mov_b32_e32 v4, v0
5665; GFX940-NEXT:    v_cmp_ne_u32_e32 vcc, s1, v5
5666; GFX940-NEXT:    ; implicit-def: $vgpr0_vgpr1
5667; GFX940-NEXT:    s_and_saveexec_b64 s[0:1], vcc
5668; GFX940-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
5669; GFX940-NEXT:    s_cbranch_execnz .LBB25_3
5670; GFX940-NEXT:  ; %bb.1: ; %Flow
5671; GFX940-NEXT:    s_andn2_saveexec_b64 s[0:1], s[0:1]
5672; GFX940-NEXT:    s_cbranch_execnz .LBB25_4
5673; GFX940-NEXT:  .LBB25_2: ; %atomicrmw.phi
5674; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
5675; GFX940-NEXT:    s_setpc_b64 s[30:31]
5676; GFX940-NEXT:  .LBB25_3: ; %atomicrmw.global
5677; GFX940-NEXT:    buffer_wbl2 sc1
5678; GFX940-NEXT:    flat_atomic_min_f64 v[0:1], v[4:5], v[2:3] sc0
5679; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5680; GFX940-NEXT:    buffer_inv sc1
5681; GFX940-NEXT:    ; implicit-def: $vgpr4_vgpr5
5682; GFX940-NEXT:    ; implicit-def: $vgpr2_vgpr3
5683; GFX940-NEXT:    s_andn2_saveexec_b64 s[0:1], s[0:1]
5684; GFX940-NEXT:    s_cbranch_execz .LBB25_2
5685; GFX940-NEXT:  .LBB25_4: ; %atomicrmw.private
5686; GFX940-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[4:5]
5687; GFX940-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
5688; GFX940-NEXT:    s_nop 0
5689; GFX940-NEXT:    v_cndmask_b32_e32 v6, -1, v4, vcc
5690; GFX940-NEXT:    scratch_load_dwordx2 v[0:1], v6, off
5691; GFX940-NEXT:    s_waitcnt vmcnt(0)
5692; GFX940-NEXT:    v_max_f64 v[4:5], v[0:1], v[0:1]
5693; GFX940-NEXT:    v_min_f64 v[2:3], v[4:5], v[2:3]
5694; GFX940-NEXT:    scratch_store_dwordx2 v6, v[2:3], off sc0 sc1
5695; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
5696; GFX940-NEXT:    s_waitcnt vmcnt(0)
5697; GFX940-NEXT:    s_setpc_b64 s[30:31]
5698;
5699; GFX11-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
5700; GFX11:       ; %bb.0:
5701; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5702; GFX11-NEXT:    v_max_f64 v[4:5], v[2:3], v[2:3]
5703; GFX11-NEXT:    s_mov_b64 s[0:1], src_private_base
5704; GFX11-NEXT:    s_mov_b32 s0, exec_lo
5705; GFX11-NEXT:    ; implicit-def: $vgpr2_vgpr3
5706; GFX11-NEXT:    v_cmpx_ne_u32_e64 s1, v1
5707; GFX11-NEXT:    s_xor_b32 s0, exec_lo, s0
5708; GFX11-NEXT:    s_cbranch_execz .LBB25_4
5709; GFX11-NEXT:  ; %bb.1: ; %atomicrmw.global
5710; GFX11-NEXT:    flat_load_b64 v[2:3], v[0:1]
5711; GFX11-NEXT:    s_mov_b32 s1, 0
5712; GFX11-NEXT:  .LBB25_2: ; %atomicrmw.start
5713; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
5714; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5715; GFX11-NEXT:    v_dual_mov_b32 v9, v3 :: v_dual_mov_b32 v8, v2
5716; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
5717; GFX11-NEXT:    v_max_f64 v[2:3], v[8:9], v[8:9]
5718; GFX11-NEXT:    v_min_f64 v[6:7], v[2:3], v[4:5]
5719; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
5720; GFX11-NEXT:    flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[6:9] glc
5721; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5722; GFX11-NEXT:    buffer_gl1_inv
5723; GFX11-NEXT:    buffer_gl0_inv
5724; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[8:9]
5725; GFX11-NEXT:    s_or_b32 s1, vcc_lo, s1
5726; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
5727; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
5728; GFX11-NEXT:    s_cbranch_execnz .LBB25_2
5729; GFX11-NEXT:  ; %bb.3: ; %Flow
5730; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s1
5731; GFX11-NEXT:    ; implicit-def: $vgpr0_vgpr1
5732; GFX11-NEXT:    ; implicit-def: $vgpr4_vgpr5
5733; GFX11-NEXT:  .LBB25_4: ; %Flow2
5734; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
5735; GFX11-NEXT:    s_cbranch_execz .LBB25_6
5736; GFX11-NEXT:  ; %bb.5: ; %atomicrmw.private
5737; GFX11-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
5738; GFX11-NEXT:    v_cndmask_b32_e32 v6, -1, v0, vcc_lo
5739; GFX11-NEXT:    scratch_load_b64 v[2:3], v6, off
5740; GFX11-NEXT:    s_waitcnt vmcnt(0)
5741; GFX11-NEXT:    v_max_f64 v[0:1], v[2:3], v[2:3]
5742; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
5743; GFX11-NEXT:    v_min_f64 v[0:1], v[0:1], v[4:5]
5744; GFX11-NEXT:    scratch_store_b64 v6, v[0:1], off
5745; GFX11-NEXT:  .LBB25_6: ; %atomicrmw.phi
5746; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
5747; GFX11-NEXT:    v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3
5748; GFX11-NEXT:    s_setpc_b64 s[30:31]
5749;
5750; GFX10-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
5751; GFX10:       ; %bb.0:
5752; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5753; GFX10-NEXT:    v_mov_b32_e32 v5, v1
5754; GFX10-NEXT:    v_mov_b32_e32 v4, v0
5755; GFX10-NEXT:    s_mov_b64 s[4:5], src_private_base
5756; GFX10-NEXT:    ; implicit-def: $vgpr0_vgpr1
5757; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, s5, v5
5758; GFX10-NEXT:    s_and_saveexec_b32 s4, vcc_lo
5759; GFX10-NEXT:    s_xor_b32 s4, exec_lo, s4
5760; GFX10-NEXT:    s_cbranch_execnz .LBB25_3
5761; GFX10-NEXT:  ; %bb.1: ; %Flow
5762; GFX10-NEXT:    s_andn2_saveexec_b32 s4, s4
5763; GFX10-NEXT:    s_cbranch_execnz .LBB25_4
5764; GFX10-NEXT:  .LBB25_2: ; %atomicrmw.phi
5765; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
5766; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
5767; GFX10-NEXT:    s_setpc_b64 s[30:31]
5768; GFX10-NEXT:  .LBB25_3: ; %atomicrmw.global
5769; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
5770; GFX10-NEXT:    flat_atomic_fmin_x2 v[0:1], v[4:5], v[2:3] glc
5771; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5772; GFX10-NEXT:    buffer_gl1_inv
5773; GFX10-NEXT:    buffer_gl0_inv
5774; GFX10-NEXT:    ; implicit-def: $vgpr4_vgpr5
5775; GFX10-NEXT:    ; implicit-def: $vgpr2_vgpr3
5776; GFX10-NEXT:    s_andn2_saveexec_b32 s4, s4
5777; GFX10-NEXT:    s_cbranch_execz .LBB25_2
5778; GFX10-NEXT:  .LBB25_4: ; %atomicrmw.private
5779; GFX10-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
5780; GFX10-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
5781; GFX10-NEXT:    v_cndmask_b32_e32 v6, -1, v4, vcc_lo
5782; GFX10-NEXT:    s_clause 0x1
5783; GFX10-NEXT:    buffer_load_dword v0, v6, s[0:3], 0 offen
5784; GFX10-NEXT:    buffer_load_dword v1, v6, s[0:3], 0 offen offset:4
5785; GFX10-NEXT:    s_waitcnt vmcnt(0)
5786; GFX10-NEXT:    v_max_f64 v[4:5], v[0:1], v[0:1]
5787; GFX10-NEXT:    v_min_f64 v[2:3], v[4:5], v[2:3]
5788; GFX10-NEXT:    buffer_store_dword v2, v6, s[0:3], 0 offen
5789; GFX10-NEXT:    buffer_store_dword v3, v6, s[0:3], 0 offen offset:4
5790; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
5791; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
5792; GFX10-NEXT:    s_setpc_b64 s[30:31]
5793;
5794; GFX90A-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
5795; GFX90A:       ; %bb.0:
5796; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5797; GFX90A-NEXT:    v_mov_b32_e32 v5, v1
5798; GFX90A-NEXT:    s_mov_b64 s[4:5], src_private_base
5799; GFX90A-NEXT:    v_mov_b32_e32 v4, v0
5800; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v5
5801; GFX90A-NEXT:    ; implicit-def: $vgpr0_vgpr1
5802; GFX90A-NEXT:    s_and_saveexec_b64 s[4:5], vcc
5803; GFX90A-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
5804; GFX90A-NEXT:    s_cbranch_execnz .LBB25_3
5805; GFX90A-NEXT:  ; %bb.1: ; %Flow
5806; GFX90A-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
5807; GFX90A-NEXT:    s_cbranch_execnz .LBB25_4
5808; GFX90A-NEXT:  .LBB25_2: ; %atomicrmw.phi
5809; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
5810; GFX90A-NEXT:    s_setpc_b64 s[30:31]
5811; GFX90A-NEXT:  .LBB25_3: ; %atomicrmw.global
5812; GFX90A-NEXT:    flat_atomic_min_f64 v[0:1], v[4:5], v[2:3] glc
5813; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5814; GFX90A-NEXT:    buffer_wbinvl1
5815; GFX90A-NEXT:    ; implicit-def: $vgpr4_vgpr5
5816; GFX90A-NEXT:    ; implicit-def: $vgpr2_vgpr3
5817; GFX90A-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
5818; GFX90A-NEXT:    s_cbranch_execz .LBB25_2
5819; GFX90A-NEXT:  .LBB25_4: ; %atomicrmw.private
5820; GFX90A-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[4:5]
5821; GFX90A-NEXT:    v_cndmask_b32_e32 v6, -1, v4, vcc
5822; GFX90A-NEXT:    buffer_load_dword v0, v6, s[0:3], 0 offen
5823; GFX90A-NEXT:    buffer_load_dword v1, v6, s[0:3], 0 offen offset:4
5824; GFX90A-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
5825; GFX90A-NEXT:    s_waitcnt vmcnt(0)
5826; GFX90A-NEXT:    v_max_f64 v[4:5], v[0:1], v[0:1]
5827; GFX90A-NEXT:    v_min_f64 v[2:3], v[4:5], v[2:3]
5828; GFX90A-NEXT:    buffer_store_dword v2, v6, s[0:3], 0 offen
5829; GFX90A-NEXT:    buffer_store_dword v3, v6, s[0:3], 0 offen offset:4
5830; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
5831; GFX90A-NEXT:    s_waitcnt vmcnt(0)
5832; GFX90A-NEXT:    s_setpc_b64 s[30:31]
5833;
5834; GFX908-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
5835; GFX908:       ; %bb.0:
5836; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5837; GFX908-NEXT:    v_max_f64 v[4:5], v[2:3], v[2:3]
5838; GFX908-NEXT:    s_mov_b64 s[4:5], src_private_base
5839; GFX908-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v1
5840; GFX908-NEXT:    ; implicit-def: $vgpr2_vgpr3
5841; GFX908-NEXT:    s_and_saveexec_b64 s[4:5], vcc
5842; GFX908-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
5843; GFX908-NEXT:    s_cbranch_execz .LBB25_4
5844; GFX908-NEXT:  ; %bb.1: ; %atomicrmw.global
5845; GFX908-NEXT:    flat_load_dwordx2 v[2:3], v[0:1]
5846; GFX908-NEXT:    s_mov_b64 s[6:7], 0
5847; GFX908-NEXT:  .LBB25_2: ; %atomicrmw.start
5848; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
5849; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5850; GFX908-NEXT:    v_mov_b32_e32 v9, v3
5851; GFX908-NEXT:    v_mov_b32_e32 v8, v2
5852; GFX908-NEXT:    v_max_f64 v[2:3], v[8:9], v[8:9]
5853; GFX908-NEXT:    v_min_f64 v[6:7], v[2:3], v[4:5]
5854; GFX908-NEXT:    flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
5855; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5856; GFX908-NEXT:    buffer_wbinvl1
5857; GFX908-NEXT:    v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
5858; GFX908-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
5859; GFX908-NEXT:    s_andn2_b64 exec, exec, s[6:7]
5860; GFX908-NEXT:    s_cbranch_execnz .LBB25_2
5861; GFX908-NEXT:  ; %bb.3: ; %Flow
5862; GFX908-NEXT:    s_or_b64 exec, exec, s[6:7]
5863; GFX908-NEXT:    ; implicit-def: $vgpr0_vgpr1
5864; GFX908-NEXT:    ; implicit-def: $vgpr4_vgpr5
5865; GFX908-NEXT:  .LBB25_4: ; %Flow2
5866; GFX908-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
5867; GFX908-NEXT:    s_cbranch_execz .LBB25_6
5868; GFX908-NEXT:  ; %bb.5: ; %atomicrmw.private
5869; GFX908-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
5870; GFX908-NEXT:    v_cndmask_b32_e32 v6, -1, v0, vcc
5871; GFX908-NEXT:    buffer_load_dword v2, v6, s[0:3], 0 offen
5872; GFX908-NEXT:    buffer_load_dword v3, v6, s[0:3], 0 offen offset:4
5873; GFX908-NEXT:    s_waitcnt vmcnt(0)
5874; GFX908-NEXT:    v_max_f64 v[0:1], v[2:3], v[2:3]
5875; GFX908-NEXT:    v_min_f64 v[0:1], v[0:1], v[4:5]
5876; GFX908-NEXT:    buffer_store_dword v0, v6, s[0:3], 0 offen
5877; GFX908-NEXT:    buffer_store_dword v1, v6, s[0:3], 0 offen offset:4
5878; GFX908-NEXT:  .LBB25_6: ; %atomicrmw.phi
5879; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
5880; GFX908-NEXT:    v_mov_b32_e32 v0, v2
5881; GFX908-NEXT:    v_mov_b32_e32 v1, v3
5882; GFX908-NEXT:    s_waitcnt vmcnt(0)
5883; GFX908-NEXT:    s_setpc_b64 s[30:31]
5884;
5885; GFX8-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
5886; GFX8:       ; %bb.0:
5887; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5888; GFX8-NEXT:    v_max_f64 v[4:5], v[2:3], v[2:3]
5889; GFX8-NEXT:    s_mov_b64 s[4:5], 0xc0
5890; GFX8-NEXT:    s_load_dword s4, s[4:5], 0x0
5891; GFX8-NEXT:    ; implicit-def: $vgpr2_vgpr3
5892; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
5893; GFX8-NEXT:    v_cmp_ne_u32_e32 vcc, s4, v1
5894; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
5895; GFX8-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
5896; GFX8-NEXT:    s_cbranch_execz .LBB25_4
5897; GFX8-NEXT:  ; %bb.1: ; %atomicrmw.global
5898; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 4, v0
5899; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
5900; GFX8-NEXT:    flat_load_dword v3, v[2:3]
5901; GFX8-NEXT:    flat_load_dword v2, v[0:1]
5902; GFX8-NEXT:    s_mov_b64 s[6:7], 0
5903; GFX8-NEXT:  .LBB25_2: ; %atomicrmw.start
5904; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
5905; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5906; GFX8-NEXT:    v_mov_b32_e32 v9, v3
5907; GFX8-NEXT:    v_mov_b32_e32 v8, v2
5908; GFX8-NEXT:    v_max_f64 v[2:3], v[8:9], v[8:9]
5909; GFX8-NEXT:    v_min_f64 v[6:7], v[2:3], v[4:5]
5910; GFX8-NEXT:    flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
5911; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5912; GFX8-NEXT:    buffer_wbinvl1
5913; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
5914; GFX8-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
5915; GFX8-NEXT:    s_andn2_b64 exec, exec, s[6:7]
5916; GFX8-NEXT:    s_cbranch_execnz .LBB25_2
5917; GFX8-NEXT:  ; %bb.3: ; %Flow
5918; GFX8-NEXT:    s_or_b64 exec, exec, s[6:7]
5919; GFX8-NEXT:    ; implicit-def: $vgpr0_vgpr1
5920; GFX8-NEXT:    ; implicit-def: $vgpr4_vgpr5
5921; GFX8-NEXT:  .LBB25_4: ; %Flow2
5922; GFX8-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
5923; GFX8-NEXT:    s_cbranch_execz .LBB25_6
5924; GFX8-NEXT:  ; %bb.5: ; %atomicrmw.private
5925; GFX8-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
5926; GFX8-NEXT:    v_cndmask_b32_e32 v6, -1, v0, vcc
5927; GFX8-NEXT:    v_add_u32_e32 v7, vcc, 4, v6
5928; GFX8-NEXT:    buffer_load_dword v2, v6, s[0:3], 0 offen
5929; GFX8-NEXT:    buffer_load_dword v3, v7, s[0:3], 0 offen
5930; GFX8-NEXT:    s_waitcnt vmcnt(0)
5931; GFX8-NEXT:    v_max_f64 v[0:1], v[2:3], v[2:3]
5932; GFX8-NEXT:    v_min_f64 v[0:1], v[0:1], v[4:5]
5933; GFX8-NEXT:    buffer_store_dword v0, v6, s[0:3], 0 offen
5934; GFX8-NEXT:    buffer_store_dword v1, v7, s[0:3], 0 offen
5935; GFX8-NEXT:  .LBB25_6: ; %atomicrmw.phi
5936; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
5937; GFX8-NEXT:    v_mov_b32_e32 v0, v2
5938; GFX8-NEXT:    v_mov_b32_e32 v1, v3
5939; GFX8-NEXT:    s_waitcnt vmcnt(0)
5940; GFX8-NEXT:    s_setpc_b64 s[30:31]
5941;
5942; GFX7-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
5943; GFX7:       ; %bb.0:
5944; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5945; GFX7-NEXT:    s_mov_b64 s[4:5], 0xc0
5946; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x0
5947; GFX7-NEXT:    v_mov_b32_e32 v5, v1
5948; GFX7-NEXT:    v_mov_b32_e32 v4, v0
5949; GFX7-NEXT:    ; implicit-def: $vgpr0_vgpr1
5950; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
5951; GFX7-NEXT:    v_cmp_ne_u32_e32 vcc, s4, v5
5952; GFX7-NEXT:    s_and_saveexec_b64 s[4:5], vcc
5953; GFX7-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
5954; GFX7-NEXT:    s_cbranch_execnz .LBB25_3
5955; GFX7-NEXT:  ; %bb.1: ; %Flow
5956; GFX7-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
5957; GFX7-NEXT:    s_cbranch_execnz .LBB25_4
5958; GFX7-NEXT:  .LBB25_2: ; %atomicrmw.phi
5959; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
5960; GFX7-NEXT:    s_setpc_b64 s[30:31]
5961; GFX7-NEXT:  .LBB25_3: ; %atomicrmw.global
5962; GFX7-NEXT:    flat_atomic_fmin_x2 v[0:1], v[4:5], v[2:3] glc
5963; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
5964; GFX7-NEXT:    buffer_wbinvl1
5965; GFX7-NEXT:    ; implicit-def: $vgpr4_vgpr5
5966; GFX7-NEXT:    ; implicit-def: $vgpr2_vgpr3
5967; GFX7-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
5968; GFX7-NEXT:    s_cbranch_execz .LBB25_2
5969; GFX7-NEXT:  .LBB25_4: ; %atomicrmw.private
5970; GFX7-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[4:5]
5971; GFX7-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
5972; GFX7-NEXT:    v_cndmask_b32_e32 v6, -1, v4, vcc
5973; GFX7-NEXT:    v_add_i32_e32 v7, vcc, 4, v6
5974; GFX7-NEXT:    buffer_load_dword v0, v6, s[0:3], 0 offen
5975; GFX7-NEXT:    buffer_load_dword v1, v7, s[0:3], 0 offen
5976; GFX7-NEXT:    s_waitcnt vmcnt(0)
5977; GFX7-NEXT:    v_max_f64 v[4:5], v[0:1], v[0:1]
5978; GFX7-NEXT:    v_min_f64 v[2:3], v[4:5], v[2:3]
5979; GFX7-NEXT:    buffer_store_dword v2, v6, s[0:3], 0 offen
5980; GFX7-NEXT:    buffer_store_dword v3, v7, s[0:3], 0 offen
5981; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
5982; GFX7-NEXT:    s_waitcnt vmcnt(0)
5983; GFX7-NEXT:    s_setpc_b64 s[30:31]
5984  %result = atomicrmw fmin ptr %ptr, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
5985  ret double %result
5986}
5987
5988; --------------------------------------------------------------------
5989; half
5990; --------------------------------------------------------------------
5991
; Value-returning `atomicrmw fmin` of a half through a plain `ptr` argument,
; with syncscope("agent"), seq_cst ordering and the
; !amdgpu.no.fine.grained.memory annotation.
;
; In every run line checked below the operation is expanded into a
; compare-and-swap loop (%atomicrmw.start) over the 32-bit word that contains
; the half. The address is rounded down with `and -4`, the bit offset is
; (addr & 3) * 8, and an inverted (0xffff << offset) mask preserves the
; neighbouring 16 bits. After the loop the old word is shifted right by the
; same offset to produce the returned value. The hawaii (gfx7) output performs
; the min in f32, converting with v_cvt_f32_f16 / v_cvt_f16_f32.
;
; The assertions are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
5992define half @flat_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr %ptr, half %val) #0 {
5993; GFX12-LABEL: flat_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory:
5994; GFX12:       ; %bb.0:
5995; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
5996; GFX12-NEXT:    s_wait_expcnt 0x0
5997; GFX12-NEXT:    s_wait_samplecnt 0x0
5998; GFX12-NEXT:    s_wait_bvhcnt 0x0
5999; GFX12-NEXT:    s_wait_kmcnt 0x0
6000; GFX12-NEXT:    v_mov_b32_e32 v3, v0
6001; GFX12-NEXT:    v_max_num_f16_e32 v2, v2, v2
6002; GFX12-NEXT:    s_mov_b32 s0, 0
6003; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
6004; GFX12-NEXT:    v_and_b32_e32 v0, -4, v3
6005; GFX12-NEXT:    v_and_b32_e32 v3, 3, v3
6006; GFX12-NEXT:    flat_load_b32 v5, v[0:1]
6007; GFX12-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
6008; GFX12-NEXT:    v_lshlrev_b32_e64 v4, v3, 0xffff
6009; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
6010; GFX12-NEXT:    v_not_b32_e32 v4, v4
6011; GFX12-NEXT:  .LBB26_1: ; %atomicrmw.start
6012; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
6013; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
6014; GFX12-NEXT:    v_mov_b32_e32 v6, v5
6015; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
6016; GFX12-NEXT:    v_lshrrev_b32_e32 v5, v3, v6
6017; GFX12-NEXT:    v_max_num_f16_e32 v5, v5, v5
6018; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
6019; GFX12-NEXT:    v_min_num_f16_e32 v5, v5, v2
6020; GFX12-NEXT:    v_and_b32_e32 v5, 0xffff, v5
6021; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
6022; GFX12-NEXT:    v_lshlrev_b32_e32 v5, v3, v5
6023; GFX12-NEXT:    v_and_or_b32 v5, v6, v4, v5
6024; GFX12-NEXT:    s_wait_storecnt 0x0
6025; GFX12-NEXT:    flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
6026; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
6027; GFX12-NEXT:    global_inv scope:SCOPE_DEV
6028; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v5, v6
6029; GFX12-NEXT:    s_wait_alu 0xfffe
6030; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
6031; GFX12-NEXT:    s_wait_alu 0xfffe
6032; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
6033; GFX12-NEXT:    s_cbranch_execnz .LBB26_1
6034; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
6035; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
6036; GFX12-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
6037; GFX12-NEXT:    s_wait_alu 0xfffe
6038; GFX12-NEXT:    s_setpc_b64 s[30:31]
6039;
6040; GFX940-LABEL: flat_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory:
6041; GFX940:       ; %bb.0:
6042; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6043; GFX940-NEXT:    v_mov_b32_e32 v3, v0
6044; GFX940-NEXT:    v_and_b32_e32 v0, -4, v3
6045; GFX940-NEXT:    flat_load_dword v5, v[0:1]
6046; GFX940-NEXT:    v_and_b32_e32 v3, 3, v3
6047; GFX940-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
6048; GFX940-NEXT:    s_mov_b32 s0, 0xffff
6049; GFX940-NEXT:    v_lshlrev_b32_e64 v4, v3, s0
6050; GFX940-NEXT:    v_not_b32_e32 v4, v4
6051; GFX940-NEXT:    s_mov_b64 s[0:1], 0
6052; GFX940-NEXT:    v_max_f16_e32 v2, v2, v2
6053; GFX940-NEXT:  .LBB26_1: ; %atomicrmw.start
6054; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
6055; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6056; GFX940-NEXT:    v_mov_b32_e32 v7, v5
6057; GFX940-NEXT:    v_lshrrev_b32_e32 v5, v3, v7
6058; GFX940-NEXT:    v_max_f16_e32 v5, v5, v5
6059; GFX940-NEXT:    v_min_f16_e32 v5, v5, v2
6060; GFX940-NEXT:    v_lshlrev_b32_e32 v5, v3, v5
6061; GFX940-NEXT:    v_and_or_b32 v6, v7, v4, v5
6062; GFX940-NEXT:    buffer_wbl2 sc1
6063; GFX940-NEXT:    flat_atomic_cmpswap v5, v[0:1], v[6:7] sc0
6064; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6065; GFX940-NEXT:    buffer_inv sc1
6066; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v7
6067; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
6068; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
6069; GFX940-NEXT:    s_cbranch_execnz .LBB26_1
6070; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
6071; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
6072; GFX940-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
6073; GFX940-NEXT:    s_setpc_b64 s[30:31]
6074;
6075; GFX11-LABEL: flat_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory:
6076; GFX11:       ; %bb.0:
6077; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6078; GFX11-NEXT:    v_mov_b32_e32 v3, v0
6079; GFX11-NEXT:    v_max_f16_e32 v2, v2, v2
6080; GFX11-NEXT:    s_mov_b32 s0, 0
6081; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
6082; GFX11-NEXT:    v_and_b32_e32 v0, -4, v3
6083; GFX11-NEXT:    v_and_b32_e32 v3, 3, v3
6084; GFX11-NEXT:    flat_load_b32 v5, v[0:1]
6085; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
6086; GFX11-NEXT:    v_lshlrev_b32_e64 v4, v3, 0xffff
6087; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
6088; GFX11-NEXT:    v_not_b32_e32 v4, v4
6089; GFX11-NEXT:  .LBB26_1: ; %atomicrmw.start
6090; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
6091; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6092; GFX11-NEXT:    v_mov_b32_e32 v6, v5
6093; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
6094; GFX11-NEXT:    v_lshrrev_b32_e32 v5, v3, v6
6095; GFX11-NEXT:    v_max_f16_e32 v5, v5, v5
6096; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
6097; GFX11-NEXT:    v_min_f16_e32 v5, v5, v2
6098; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff, v5
6099; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
6100; GFX11-NEXT:    v_lshlrev_b32_e32 v5, v3, v5
6101; GFX11-NEXT:    v_and_or_b32 v5, v6, v4, v5
6102; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
6103; GFX11-NEXT:    flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc
6104; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6105; GFX11-NEXT:    buffer_gl1_inv
6106; GFX11-NEXT:    buffer_gl0_inv
6107; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v5, v6
6108; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
6109; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
6110; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
6111; GFX11-NEXT:    s_cbranch_execnz .LBB26_1
6112; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
6113; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
6114; GFX11-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
6115; GFX11-NEXT:    s_setpc_b64 s[30:31]
6116;
6117; GFX10-LABEL: flat_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory:
6118; GFX10:       ; %bb.0:
6119; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6120; GFX10-NEXT:    v_mov_b32_e32 v3, v0
6121; GFX10-NEXT:    v_max_f16_e32 v2, v2, v2
6122; GFX10-NEXT:    s_mov_b32 s4, 0
6123; GFX10-NEXT:    v_and_b32_e32 v0, -4, v3
6124; GFX10-NEXT:    v_and_b32_e32 v3, 3, v3
6125; GFX10-NEXT:    flat_load_dword v5, v[0:1]
6126; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
6127; GFX10-NEXT:    v_lshlrev_b32_e64 v4, v3, 0xffff
6128; GFX10-NEXT:    v_not_b32_e32 v4, v4
6129; GFX10-NEXT:  .LBB26_1: ; %atomicrmw.start
6130; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
6131; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6132; GFX10-NEXT:    v_mov_b32_e32 v6, v5
6133; GFX10-NEXT:    v_lshrrev_b32_e32 v5, v3, v6
6134; GFX10-NEXT:    v_max_f16_e32 v5, v5, v5
6135; GFX10-NEXT:    v_min_f16_e32 v5, v5, v2
6136; GFX10-NEXT:    v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
6137; GFX10-NEXT:    v_and_or_b32 v5, v6, v4, v5
6138; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
6139; GFX10-NEXT:    flat_atomic_cmpswap v5, v[0:1], v[5:6] glc
6140; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6141; GFX10-NEXT:    buffer_gl1_inv
6142; GFX10-NEXT:    buffer_gl0_inv
6143; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v5, v6
6144; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
6145; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
6146; GFX10-NEXT:    s_cbranch_execnz .LBB26_1
6147; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
6148; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
6149; GFX10-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
6150; GFX10-NEXT:    s_setpc_b64 s[30:31]
6151;
6152; GFX90A-LABEL: flat_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory:
6153; GFX90A:       ; %bb.0:
6154; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6155; GFX90A-NEXT:    v_mov_b32_e32 v3, v0
6156; GFX90A-NEXT:    v_and_b32_e32 v0, -4, v3
6157; GFX90A-NEXT:    flat_load_dword v5, v[0:1]
6158; GFX90A-NEXT:    v_and_b32_e32 v3, 3, v3
6159; GFX90A-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
6160; GFX90A-NEXT:    s_mov_b32 s4, 0xffff
6161; GFX90A-NEXT:    v_lshlrev_b32_e64 v4, v3, s4
6162; GFX90A-NEXT:    v_not_b32_e32 v4, v4
6163; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
6164; GFX90A-NEXT:    v_max_f16_e32 v2, v2, v2
6165; GFX90A-NEXT:  .LBB26_1: ; %atomicrmw.start
6166; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
6167; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6168; GFX90A-NEXT:    v_mov_b32_e32 v7, v5
6169; GFX90A-NEXT:    v_lshrrev_b32_e32 v5, v3, v7
6170; GFX90A-NEXT:    v_max_f16_e32 v5, v5, v5
6171; GFX90A-NEXT:    v_min_f16_e32 v5, v5, v2
6172; GFX90A-NEXT:    v_lshlrev_b32_e32 v5, v3, v5
6173; GFX90A-NEXT:    v_and_or_b32 v6, v7, v4, v5
6174; GFX90A-NEXT:    flat_atomic_cmpswap v5, v[0:1], v[6:7] glc
6175; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6176; GFX90A-NEXT:    buffer_wbinvl1
6177; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v7
6178; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
6179; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
6180; GFX90A-NEXT:    s_cbranch_execnz .LBB26_1
6181; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
6182; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
6183; GFX90A-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
6184; GFX90A-NEXT:    s_setpc_b64 s[30:31]
6185;
6186; GFX908-LABEL: flat_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory:
6187; GFX908:       ; %bb.0:
6188; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6189; GFX908-NEXT:    v_mov_b32_e32 v3, v0
6190; GFX908-NEXT:    v_and_b32_e32 v0, -4, v3
6191; GFX908-NEXT:    flat_load_dword v5, v[0:1]
6192; GFX908-NEXT:    v_and_b32_e32 v3, 3, v3
6193; GFX908-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
6194; GFX908-NEXT:    s_mov_b32 s4, 0xffff
6195; GFX908-NEXT:    v_lshlrev_b32_e64 v4, v3, s4
6196; GFX908-NEXT:    v_not_b32_e32 v4, v4
6197; GFX908-NEXT:    s_mov_b64 s[4:5], 0
6198; GFX908-NEXT:    v_max_f16_e32 v2, v2, v2
6199; GFX908-NEXT:  .LBB26_1: ; %atomicrmw.start
6200; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
6201; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6202; GFX908-NEXT:    v_mov_b32_e32 v6, v5
6203; GFX908-NEXT:    v_lshrrev_b32_e32 v5, v3, v6
6204; GFX908-NEXT:    v_max_f16_e32 v5, v5, v5
6205; GFX908-NEXT:    v_min_f16_e32 v5, v5, v2
6206; GFX908-NEXT:    v_lshlrev_b32_e32 v5, v3, v5
6207; GFX908-NEXT:    v_and_or_b32 v5, v6, v4, v5
6208; GFX908-NEXT:    flat_atomic_cmpswap v5, v[0:1], v[5:6] glc
6209; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6210; GFX908-NEXT:    buffer_wbinvl1
6211; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v6
6212; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
6213; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
6214; GFX908-NEXT:    s_cbranch_execnz .LBB26_1
6215; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
6216; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
6217; GFX908-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
6218; GFX908-NEXT:    s_setpc_b64 s[30:31]
6219;
6220; GFX8-LABEL: flat_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory:
6221; GFX8:       ; %bb.0:
6222; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6223; GFX8-NEXT:    v_mov_b32_e32 v3, v0
6224; GFX8-NEXT:    v_and_b32_e32 v0, -4, v3
6225; GFX8-NEXT:    flat_load_dword v5, v[0:1]
6226; GFX8-NEXT:    v_and_b32_e32 v3, 3, v3
6227; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
6228; GFX8-NEXT:    s_mov_b32 s4, 0xffff
6229; GFX8-NEXT:    v_lshlrev_b32_e64 v4, v3, s4
6230; GFX8-NEXT:    v_not_b32_e32 v4, v4
6231; GFX8-NEXT:    s_mov_b64 s[4:5], 0
6232; GFX8-NEXT:    v_max_f16_e32 v2, v2, v2
6233; GFX8-NEXT:  .LBB26_1: ; %atomicrmw.start
6234; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
6235; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6236; GFX8-NEXT:    v_mov_b32_e32 v6, v5
6237; GFX8-NEXT:    v_lshrrev_b32_e32 v5, v3, v6
6238; GFX8-NEXT:    v_max_f16_e32 v5, v5, v5
6239; GFX8-NEXT:    v_min_f16_e32 v5, v5, v2
6240; GFX8-NEXT:    v_and_b32_e32 v7, v6, v4
6241; GFX8-NEXT:    v_lshlrev_b32_e32 v5, v3, v5
6242; GFX8-NEXT:    v_or_b32_e32 v5, v7, v5
6243; GFX8-NEXT:    flat_atomic_cmpswap v5, v[0:1], v[5:6] glc
6244; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6245; GFX8-NEXT:    buffer_wbinvl1
6246; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v6
6247; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
6248; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
6249; GFX8-NEXT:    s_cbranch_execnz .LBB26_1
6250; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
6251; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
6252; GFX8-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
6253; GFX8-NEXT:    s_setpc_b64 s[30:31]
6254;
6255; GFX7-LABEL: flat_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory:
6256; GFX7:       ; %bb.0:
6257; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6258; GFX7-NEXT:    v_mov_b32_e32 v3, v0
6259; GFX7-NEXT:    v_and_b32_e32 v0, -4, v3
6260; GFX7-NEXT:    flat_load_dword v5, v[0:1]
6261; GFX7-NEXT:    v_cvt_f16_f32_e32 v4, v2
6262; GFX7-NEXT:    v_and_b32_e32 v2, 3, v3
6263; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 3, v2
6264; GFX7-NEXT:    s_mov_b64 s[4:5], 0
6265; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v4
6266; GFX7-NEXT:    v_lshl_b32_e32 v4, 0xffff, v2
6267; GFX7-NEXT:    v_not_b32_e32 v4, v4
6268; GFX7-NEXT:  .LBB26_1: ; %atomicrmw.start
6269; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
6270; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6271; GFX7-NEXT:    v_mov_b32_e32 v6, v5
6272; GFX7-NEXT:    v_lshrrev_b32_e32 v5, v2, v6
6273; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v5
6274; GFX7-NEXT:    v_and_b32_e32 v7, v6, v4
6275; GFX7-NEXT:    v_min_f32_e32 v5, v5, v3
6276; GFX7-NEXT:    v_cvt_f16_f32_e32 v5, v5
6277; GFX7-NEXT:    v_lshlrev_b32_e32 v5, v2, v5
6278; GFX7-NEXT:    v_or_b32_e32 v5, v7, v5
6279; GFX7-NEXT:    flat_atomic_cmpswap v5, v[0:1], v[5:6] glc
6280; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6281; GFX7-NEXT:    buffer_wbinvl1
6282; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v6
6283; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
6284; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
6285; GFX7-NEXT:    s_cbranch_execnz .LBB26_1
6286; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
6287; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
6288; GFX7-NEXT:    v_lshrrev_b32_e32 v0, v2, v5
6289; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
6290; GFX7-NEXT:    s_setpc_b64 s[30:31]
6291  %result = atomicrmw fmin ptr %ptr, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
6292  ret half %result
6293}
6294
6295define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr %ptr, half %val) #0 {
6296; GFX12-LABEL: flat_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
6297; GFX12:       ; %bb.0:
6298; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
6299; GFX12-NEXT:    s_wait_expcnt 0x0
6300; GFX12-NEXT:    s_wait_samplecnt 0x0
6301; GFX12-NEXT:    s_wait_bvhcnt 0x0
6302; GFX12-NEXT:    s_wait_kmcnt 0x0
6303; GFX12-NEXT:    v_add_co_u32 v3, vcc_lo, 0x7fe, v0
6304; GFX12-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
6305; GFX12-NEXT:    v_max_num_f16_e32 v2, v2, v2
6306; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
6307; GFX12-NEXT:    v_and_b32_e32 v0, -4, v3
6308; GFX12-NEXT:    v_and_b32_e32 v3, 3, v3
6309; GFX12-NEXT:    s_mov_b32 s0, 0
6310; GFX12-NEXT:    flat_load_b32 v5, v[0:1]
6311; GFX12-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
6312; GFX12-NEXT:    v_lshlrev_b32_e64 v4, v3, 0xffff
6313; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
6314; GFX12-NEXT:    v_not_b32_e32 v4, v4
6315; GFX12-NEXT:  .LBB27_1: ; %atomicrmw.start
6316; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
6317; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
6318; GFX12-NEXT:    v_mov_b32_e32 v6, v5
6319; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
6320; GFX12-NEXT:    v_lshrrev_b32_e32 v5, v3, v6
6321; GFX12-NEXT:    v_max_num_f16_e32 v5, v5, v5
6322; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
6323; GFX12-NEXT:    v_min_num_f16_e32 v5, v5, v2
6324; GFX12-NEXT:    v_and_b32_e32 v5, 0xffff, v5
6325; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
6326; GFX12-NEXT:    v_lshlrev_b32_e32 v5, v3, v5
6327; GFX12-NEXT:    v_and_or_b32 v5, v6, v4, v5
6328; GFX12-NEXT:    s_wait_storecnt 0x0
6329; GFX12-NEXT:    flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
6330; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
6331; GFX12-NEXT:    global_inv scope:SCOPE_DEV
6332; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v5, v6
6333; GFX12-NEXT:    s_wait_alu 0xfffe
6334; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
6335; GFX12-NEXT:    s_wait_alu 0xfffe
6336; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
6337; GFX12-NEXT:    s_cbranch_execnz .LBB27_1
6338; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
6339; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
6340; GFX12-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
6341; GFX12-NEXT:    s_wait_alu 0xfffe
6342; GFX12-NEXT:    s_setpc_b64 s[30:31]
6343;
6344; GFX940-LABEL: flat_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
6345; GFX940:       ; %bb.0:
6346; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6347; GFX940-NEXT:    s_mov_b64 s[0:1], 0x7fe
6348; GFX940-NEXT:    v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
6349; GFX940-NEXT:    v_and_b32_e32 v0, -4, v4
6350; GFX940-NEXT:    v_mov_b32_e32 v1, v5
6351; GFX940-NEXT:    flat_load_dword v5, v[0:1]
6352; GFX940-NEXT:    v_and_b32_e32 v3, 3, v4
6353; GFX940-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
6354; GFX940-NEXT:    s_mov_b32 s0, 0xffff
6355; GFX940-NEXT:    v_lshlrev_b32_e64 v4, v3, s0
6356; GFX940-NEXT:    v_not_b32_e32 v4, v4
6357; GFX940-NEXT:    s_mov_b64 s[0:1], 0
6358; GFX940-NEXT:    v_max_f16_e32 v2, v2, v2
6359; GFX940-NEXT:  .LBB27_1: ; %atomicrmw.start
6360; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
6361; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6362; GFX940-NEXT:    v_mov_b32_e32 v7, v5
6363; GFX940-NEXT:    v_lshrrev_b32_e32 v5, v3, v7
6364; GFX940-NEXT:    v_max_f16_e32 v5, v5, v5
6365; GFX940-NEXT:    v_min_f16_e32 v5, v5, v2
6366; GFX940-NEXT:    v_lshlrev_b32_e32 v5, v3, v5
6367; GFX940-NEXT:    v_and_or_b32 v6, v7, v4, v5
6368; GFX940-NEXT:    buffer_wbl2 sc1
6369; GFX940-NEXT:    flat_atomic_cmpswap v5, v[0:1], v[6:7] sc0
6370; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6371; GFX940-NEXT:    buffer_inv sc1
6372; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v7
6373; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
6374; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
6375; GFX940-NEXT:    s_cbranch_execnz .LBB27_1
6376; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
6377; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
6378; GFX940-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
6379; GFX940-NEXT:    s_setpc_b64 s[30:31]
6380;
6381; GFX11-LABEL: flat_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
6382; GFX11:       ; %bb.0:
6383; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6384; GFX11-NEXT:    v_add_co_u32 v3, vcc_lo, 0x7fe, v0
6385; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
6386; GFX11-NEXT:    v_max_f16_e32 v2, v2, v2
6387; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
6388; GFX11-NEXT:    v_and_b32_e32 v0, -4, v3
6389; GFX11-NEXT:    v_and_b32_e32 v3, 3, v3
6390; GFX11-NEXT:    s_mov_b32 s0, 0
6391; GFX11-NEXT:    flat_load_b32 v5, v[0:1]
6392; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
6393; GFX11-NEXT:    v_lshlrev_b32_e64 v4, v3, 0xffff
6394; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
6395; GFX11-NEXT:    v_not_b32_e32 v4, v4
6396; GFX11-NEXT:  .LBB27_1: ; %atomicrmw.start
6397; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
6398; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6399; GFX11-NEXT:    v_mov_b32_e32 v6, v5
6400; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
6401; GFX11-NEXT:    v_lshrrev_b32_e32 v5, v3, v6
6402; GFX11-NEXT:    v_max_f16_e32 v5, v5, v5
6403; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
6404; GFX11-NEXT:    v_min_f16_e32 v5, v5, v2
6405; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff, v5
6406; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
6407; GFX11-NEXT:    v_lshlrev_b32_e32 v5, v3, v5
6408; GFX11-NEXT:    v_and_or_b32 v5, v6, v4, v5
6409; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
6410; GFX11-NEXT:    flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc
6411; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6412; GFX11-NEXT:    buffer_gl1_inv
6413; GFX11-NEXT:    buffer_gl0_inv
6414; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v5, v6
6415; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
6416; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
6417; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
6418; GFX11-NEXT:    s_cbranch_execnz .LBB27_1
6419; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
6420; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
6421; GFX11-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
6422; GFX11-NEXT:    s_setpc_b64 s[30:31]
6423;
6424; GFX10-LABEL: flat_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
6425; GFX10:       ; %bb.0:
6426; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6427; GFX10-NEXT:    v_add_co_u32 v3, vcc_lo, 0x7fe, v0
6428; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
6429; GFX10-NEXT:    v_max_f16_e32 v2, v2, v2
6430; GFX10-NEXT:    v_and_b32_e32 v0, -4, v3
6431; GFX10-NEXT:    v_and_b32_e32 v3, 3, v3
6432; GFX10-NEXT:    s_mov_b32 s4, 0
6433; GFX10-NEXT:    flat_load_dword v5, v[0:1]
6434; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
6435; GFX10-NEXT:    v_lshlrev_b32_e64 v4, v3, 0xffff
6436; GFX10-NEXT:    v_not_b32_e32 v4, v4
6437; GFX10-NEXT:  .LBB27_1: ; %atomicrmw.start
6438; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
6439; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6440; GFX10-NEXT:    v_mov_b32_e32 v6, v5
6441; GFX10-NEXT:    v_lshrrev_b32_e32 v5, v3, v6
6442; GFX10-NEXT:    v_max_f16_e32 v5, v5, v5
6443; GFX10-NEXT:    v_min_f16_e32 v5, v5, v2
6444; GFX10-NEXT:    v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
6445; GFX10-NEXT:    v_and_or_b32 v5, v6, v4, v5
6446; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
6447; GFX10-NEXT:    flat_atomic_cmpswap v5, v[0:1], v[5:6] glc
6448; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6449; GFX10-NEXT:    buffer_gl1_inv
6450; GFX10-NEXT:    buffer_gl0_inv
6451; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v5, v6
6452; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
6453; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
6454; GFX10-NEXT:    s_cbranch_execnz .LBB27_1
6455; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
6456; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
6457; GFX10-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
6458; GFX10-NEXT:    s_setpc_b64 s[30:31]
6459;
6460; GFX90A-LABEL: flat_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
6461; GFX90A:       ; %bb.0:
6462; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6463; GFX90A-NEXT:    v_add_co_u32_e32 v3, vcc, 0x7fe, v0
6464; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
6465; GFX90A-NEXT:    v_and_b32_e32 v0, -4, v3
6466; GFX90A-NEXT:    flat_load_dword v5, v[0:1]
6467; GFX90A-NEXT:    v_and_b32_e32 v3, 3, v3
6468; GFX90A-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
6469; GFX90A-NEXT:    s_mov_b32 s4, 0xffff
6470; GFX90A-NEXT:    v_lshlrev_b32_e64 v4, v3, s4
6471; GFX90A-NEXT:    v_not_b32_e32 v4, v4
6472; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
6473; GFX90A-NEXT:    v_max_f16_e32 v2, v2, v2
6474; GFX90A-NEXT:  .LBB27_1: ; %atomicrmw.start
6475; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
6476; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6477; GFX90A-NEXT:    v_mov_b32_e32 v7, v5
6478; GFX90A-NEXT:    v_lshrrev_b32_e32 v5, v3, v7
6479; GFX90A-NEXT:    v_max_f16_e32 v5, v5, v5
6480; GFX90A-NEXT:    v_min_f16_e32 v5, v5, v2
6481; GFX90A-NEXT:    v_lshlrev_b32_e32 v5, v3, v5
6482; GFX90A-NEXT:    v_and_or_b32 v6, v7, v4, v5
6483; GFX90A-NEXT:    flat_atomic_cmpswap v5, v[0:1], v[6:7] glc
6484; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6485; GFX90A-NEXT:    buffer_wbinvl1
6486; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v7
6487; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
6488; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
6489; GFX90A-NEXT:    s_cbranch_execnz .LBB27_1
6490; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
6491; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
6492; GFX90A-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
6493; GFX90A-NEXT:    s_setpc_b64 s[30:31]
6494;
6495; GFX908-LABEL: flat_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
6496; GFX908:       ; %bb.0:
6497; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6498; GFX908-NEXT:    v_add_co_u32_e32 v3, vcc, 0x7fe, v0
6499; GFX908-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
6500; GFX908-NEXT:    v_and_b32_e32 v0, -4, v3
6501; GFX908-NEXT:    flat_load_dword v5, v[0:1]
6502; GFX908-NEXT:    v_and_b32_e32 v3, 3, v3
6503; GFX908-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
6504; GFX908-NEXT:    s_mov_b32 s4, 0xffff
6505; GFX908-NEXT:    v_lshlrev_b32_e64 v4, v3, s4
6506; GFX908-NEXT:    v_not_b32_e32 v4, v4
6507; GFX908-NEXT:    s_mov_b64 s[4:5], 0
6508; GFX908-NEXT:    v_max_f16_e32 v2, v2, v2
6509; GFX908-NEXT:  .LBB27_1: ; %atomicrmw.start
6510; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
6511; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6512; GFX908-NEXT:    v_mov_b32_e32 v6, v5
6513; GFX908-NEXT:    v_lshrrev_b32_e32 v5, v3, v6
6514; GFX908-NEXT:    v_max_f16_e32 v5, v5, v5
6515; GFX908-NEXT:    v_min_f16_e32 v5, v5, v2
6516; GFX908-NEXT:    v_lshlrev_b32_e32 v5, v3, v5
6517; GFX908-NEXT:    v_and_or_b32 v5, v6, v4, v5
6518; GFX908-NEXT:    flat_atomic_cmpswap v5, v[0:1], v[5:6] glc
6519; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6520; GFX908-NEXT:    buffer_wbinvl1
6521; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v6
6522; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
6523; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
6524; GFX908-NEXT:    s_cbranch_execnz .LBB27_1
6525; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
6526; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
6527; GFX908-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
6528; GFX908-NEXT:    s_setpc_b64 s[30:31]
6529;
6530; GFX8-LABEL: flat_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
6531; GFX8:       ; %bb.0:
6532; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6533; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x7fe, v0
6534; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
6535; GFX8-NEXT:    v_and_b32_e32 v0, -4, v3
6536; GFX8-NEXT:    flat_load_dword v5, v[0:1]
6537; GFX8-NEXT:    v_and_b32_e32 v3, 3, v3
6538; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
6539; GFX8-NEXT:    s_mov_b32 s4, 0xffff
6540; GFX8-NEXT:    v_lshlrev_b32_e64 v4, v3, s4
6541; GFX8-NEXT:    v_not_b32_e32 v4, v4
6542; GFX8-NEXT:    s_mov_b64 s[4:5], 0
6543; GFX8-NEXT:    v_max_f16_e32 v2, v2, v2
6544; GFX8-NEXT:  .LBB27_1: ; %atomicrmw.start
6545; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
6546; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6547; GFX8-NEXT:    v_mov_b32_e32 v6, v5
6548; GFX8-NEXT:    v_lshrrev_b32_e32 v5, v3, v6
6549; GFX8-NEXT:    v_max_f16_e32 v5, v5, v5
6550; GFX8-NEXT:    v_min_f16_e32 v5, v5, v2
6551; GFX8-NEXT:    v_and_b32_e32 v7, v6, v4
6552; GFX8-NEXT:    v_lshlrev_b32_e32 v5, v3, v5
6553; GFX8-NEXT:    v_or_b32_e32 v5, v7, v5
6554; GFX8-NEXT:    flat_atomic_cmpswap v5, v[0:1], v[5:6] glc
6555; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6556; GFX8-NEXT:    buffer_wbinvl1
6557; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v6
6558; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
6559; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
6560; GFX8-NEXT:    s_cbranch_execnz .LBB27_1
6561; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
6562; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
6563; GFX8-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
6564; GFX8-NEXT:    s_setpc_b64 s[30:31]
6565;
6566; GFX7-LABEL: flat_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
6567; GFX7:       ; %bb.0:
6568; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6569; GFX7-NEXT:    v_add_i32_e32 v3, vcc, 0x7fe, v0
6570; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
6571; GFX7-NEXT:    v_and_b32_e32 v0, -4, v3
6572; GFX7-NEXT:    flat_load_dword v5, v[0:1]
6573; GFX7-NEXT:    v_cvt_f16_f32_e32 v4, v2
6574; GFX7-NEXT:    v_and_b32_e32 v2, 3, v3
6575; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 3, v2
6576; GFX7-NEXT:    s_mov_b64 s[4:5], 0
6577; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v4
6578; GFX7-NEXT:    v_lshl_b32_e32 v4, 0xffff, v2
6579; GFX7-NEXT:    v_not_b32_e32 v4, v4
6580; GFX7-NEXT:  .LBB27_1: ; %atomicrmw.start
6581; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
6582; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6583; GFX7-NEXT:    v_mov_b32_e32 v6, v5
6584; GFX7-NEXT:    v_lshrrev_b32_e32 v5, v2, v6
6585; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v5
6586; GFX7-NEXT:    v_and_b32_e32 v7, v6, v4
6587; GFX7-NEXT:    v_min_f32_e32 v5, v5, v3
6588; GFX7-NEXT:    v_cvt_f16_f32_e32 v5, v5
6589; GFX7-NEXT:    v_lshlrev_b32_e32 v5, v2, v5
6590; GFX7-NEXT:    v_or_b32_e32 v5, v7, v5
6591; GFX7-NEXT:    flat_atomic_cmpswap v5, v[0:1], v[5:6] glc
6592; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6593; GFX7-NEXT:    buffer_wbinvl1
6594; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v6
6595; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
6596; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
6597; GFX7-NEXT:    s_cbranch_execnz .LBB27_1
6598; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
6599; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
6600; GFX7-NEXT:    v_lshrrev_b32_e32 v0, v2, v5
6601; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
6602; GFX7-NEXT:    s_setpc_b64 s[30:31]
6603  %gep = getelementptr half, ptr %ptr, i64 1023
6604  %result = atomicrmw fmin ptr %gep, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
6605  ret half %result
6606}
6607
; Returning half-precision `atomicrmw fmin` at agent scope (seq_cst) through a
; flat pointer displaced by a negative offset, tagged with
; !amdgpu.no.fine.grained.memory.
;
; What the assertions below pin down for every checked target:
;   * the displaced address is formed with an explicit 64-bit add of
;     0xfffff800 (low half) and -1 (high half) rather than an instruction
;     offset field;
;   * the operation is expanded to a flat cmpswap loop on the containing
;     dword: the address is masked with -4, the bit shift is (addr & 3) * 8,
;     and a shifted 0xffff mask selects the half lane to replace;
;   * the old dword returned by the cmpswap is shifted right by the same
;     amount to produce the function result. The hawaii run additionally
;     converts through f32 (v_cvt_f32_f16 / v_min_f32 / v_cvt_f16_f32).
;
; The assertion lines are autogenerated (see the update_llc_test_checks.py
; note at the top of the file); regenerate them instead of editing by hand.
6608define half @flat_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr %ptr, half %val) #0 {
6609; GFX12-LABEL: flat_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
6610; GFX12:       ; %bb.0:
6611; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
6612; GFX12-NEXT:    s_wait_expcnt 0x0
6613; GFX12-NEXT:    s_wait_samplecnt 0x0
6614; GFX12-NEXT:    s_wait_bvhcnt 0x0
6615; GFX12-NEXT:    s_wait_kmcnt 0x0
6616; GFX12-NEXT:    v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
6617; GFX12-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
6618; GFX12-NEXT:    v_max_num_f16_e32 v2, v2, v2
6619; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
6620; GFX12-NEXT:    v_and_b32_e32 v0, -4, v3
6621; GFX12-NEXT:    v_and_b32_e32 v3, 3, v3
6622; GFX12-NEXT:    s_mov_b32 s0, 0
6623; GFX12-NEXT:    flat_load_b32 v5, v[0:1]
6624; GFX12-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
6625; GFX12-NEXT:    v_lshlrev_b32_e64 v4, v3, 0xffff
6626; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
6627; GFX12-NEXT:    v_not_b32_e32 v4, v4
6628; GFX12-NEXT:  .LBB28_1: ; %atomicrmw.start
6629; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
6630; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
6631; GFX12-NEXT:    v_mov_b32_e32 v6, v5
6632; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
6633; GFX12-NEXT:    v_lshrrev_b32_e32 v5, v3, v6
6634; GFX12-NEXT:    v_max_num_f16_e32 v5, v5, v5
6635; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
6636; GFX12-NEXT:    v_min_num_f16_e32 v5, v5, v2
6637; GFX12-NEXT:    v_and_b32_e32 v5, 0xffff, v5
6638; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
6639; GFX12-NEXT:    v_lshlrev_b32_e32 v5, v3, v5
6640; GFX12-NEXT:    v_and_or_b32 v5, v6, v4, v5
6641; GFX12-NEXT:    s_wait_storecnt 0x0
6642; GFX12-NEXT:    flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
6643; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
6644; GFX12-NEXT:    global_inv scope:SCOPE_DEV
6645; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v5, v6
6646; GFX12-NEXT:    s_wait_alu 0xfffe
6647; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
6648; GFX12-NEXT:    s_wait_alu 0xfffe
6649; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
6650; GFX12-NEXT:    s_cbranch_execnz .LBB28_1
6651; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
6652; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
6653; GFX12-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
6654; GFX12-NEXT:    s_wait_alu 0xfffe
6655; GFX12-NEXT:    s_setpc_b64 s[30:31]
6656;
6657; GFX940-LABEL: flat_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
6658; GFX940:       ; %bb.0:
6659; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6660; GFX940-NEXT:    s_movk_i32 s0, 0xf800
6661; GFX940-NEXT:    s_mov_b32 s1, -1
6662; GFX940-NEXT:    v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
6663; GFX940-NEXT:    v_and_b32_e32 v0, -4, v4
6664; GFX940-NEXT:    v_mov_b32_e32 v1, v5
6665; GFX940-NEXT:    flat_load_dword v5, v[0:1]
6666; GFX940-NEXT:    v_and_b32_e32 v3, 3, v4
6667; GFX940-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
6668; GFX940-NEXT:    s_mov_b32 s0, 0xffff
6669; GFX940-NEXT:    v_lshlrev_b32_e64 v4, v3, s0
6670; GFX940-NEXT:    v_not_b32_e32 v4, v4
6671; GFX940-NEXT:    s_mov_b64 s[0:1], 0
6672; GFX940-NEXT:    v_max_f16_e32 v2, v2, v2
6673; GFX940-NEXT:  .LBB28_1: ; %atomicrmw.start
6674; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
6675; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6676; GFX940-NEXT:    v_mov_b32_e32 v7, v5
6677; GFX940-NEXT:    v_lshrrev_b32_e32 v5, v3, v7
6678; GFX940-NEXT:    v_max_f16_e32 v5, v5, v5
6679; GFX940-NEXT:    v_min_f16_e32 v5, v5, v2
6680; GFX940-NEXT:    v_lshlrev_b32_e32 v5, v3, v5
6681; GFX940-NEXT:    v_and_or_b32 v6, v7, v4, v5
6682; GFX940-NEXT:    buffer_wbl2 sc1
6683; GFX940-NEXT:    flat_atomic_cmpswap v5, v[0:1], v[6:7] sc0
6684; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6685; GFX940-NEXT:    buffer_inv sc1
6686; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v7
6687; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
6688; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
6689; GFX940-NEXT:    s_cbranch_execnz .LBB28_1
6690; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
6691; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
6692; GFX940-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
6693; GFX940-NEXT:    s_setpc_b64 s[30:31]
6694;
6695; GFX11-LABEL: flat_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
6696; GFX11:       ; %bb.0:
6697; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6698; GFX11-NEXT:    v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
6699; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
6700; GFX11-NEXT:    v_max_f16_e32 v2, v2, v2
6701; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
6702; GFX11-NEXT:    v_and_b32_e32 v0, -4, v3
6703; GFX11-NEXT:    v_and_b32_e32 v3, 3, v3
6704; GFX11-NEXT:    s_mov_b32 s0, 0
6705; GFX11-NEXT:    flat_load_b32 v5, v[0:1]
6706; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
6707; GFX11-NEXT:    v_lshlrev_b32_e64 v4, v3, 0xffff
6708; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
6709; GFX11-NEXT:    v_not_b32_e32 v4, v4
6710; GFX11-NEXT:  .LBB28_1: ; %atomicrmw.start
6711; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
6712; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6713; GFX11-NEXT:    v_mov_b32_e32 v6, v5
6714; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
6715; GFX11-NEXT:    v_lshrrev_b32_e32 v5, v3, v6
6716; GFX11-NEXT:    v_max_f16_e32 v5, v5, v5
6717; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
6718; GFX11-NEXT:    v_min_f16_e32 v5, v5, v2
6719; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff, v5
6720; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
6721; GFX11-NEXT:    v_lshlrev_b32_e32 v5, v3, v5
6722; GFX11-NEXT:    v_and_or_b32 v5, v6, v4, v5
6723; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
6724; GFX11-NEXT:    flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc
6725; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6726; GFX11-NEXT:    buffer_gl1_inv
6727; GFX11-NEXT:    buffer_gl0_inv
6728; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v5, v6
6729; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
6730; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
6731; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
6732; GFX11-NEXT:    s_cbranch_execnz .LBB28_1
6733; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
6734; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
6735; GFX11-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
6736; GFX11-NEXT:    s_setpc_b64 s[30:31]
6737;
6738; GFX10-LABEL: flat_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
6739; GFX10:       ; %bb.0:
6740; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6741; GFX10-NEXT:    v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
6742; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
6743; GFX10-NEXT:    v_max_f16_e32 v2, v2, v2
6744; GFX10-NEXT:    v_and_b32_e32 v0, -4, v3
6745; GFX10-NEXT:    v_and_b32_e32 v3, 3, v3
6746; GFX10-NEXT:    s_mov_b32 s4, 0
6747; GFX10-NEXT:    flat_load_dword v5, v[0:1]
6748; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
6749; GFX10-NEXT:    v_lshlrev_b32_e64 v4, v3, 0xffff
6750; GFX10-NEXT:    v_not_b32_e32 v4, v4
6751; GFX10-NEXT:  .LBB28_1: ; %atomicrmw.start
6752; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
6753; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6754; GFX10-NEXT:    v_mov_b32_e32 v6, v5
6755; GFX10-NEXT:    v_lshrrev_b32_e32 v5, v3, v6
6756; GFX10-NEXT:    v_max_f16_e32 v5, v5, v5
6757; GFX10-NEXT:    v_min_f16_e32 v5, v5, v2
6758; GFX10-NEXT:    v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
6759; GFX10-NEXT:    v_and_or_b32 v5, v6, v4, v5
6760; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
6761; GFX10-NEXT:    flat_atomic_cmpswap v5, v[0:1], v[5:6] glc
6762; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6763; GFX10-NEXT:    buffer_gl1_inv
6764; GFX10-NEXT:    buffer_gl0_inv
6765; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v5, v6
6766; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
6767; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
6768; GFX10-NEXT:    s_cbranch_execnz .LBB28_1
6769; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
6770; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
6771; GFX10-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
6772; GFX10-NEXT:    s_setpc_b64 s[30:31]
6773;
6774; GFX90A-LABEL: flat_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
6775; GFX90A:       ; %bb.0:
6776; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6777; GFX90A-NEXT:    v_add_co_u32_e32 v3, vcc, 0xfffff800, v0
6778; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
6779; GFX90A-NEXT:    v_and_b32_e32 v0, -4, v3
6780; GFX90A-NEXT:    flat_load_dword v5, v[0:1]
6781; GFX90A-NEXT:    v_and_b32_e32 v3, 3, v3
6782; GFX90A-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
6783; GFX90A-NEXT:    s_mov_b32 s4, 0xffff
6784; GFX90A-NEXT:    v_lshlrev_b32_e64 v4, v3, s4
6785; GFX90A-NEXT:    v_not_b32_e32 v4, v4
6786; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
6787; GFX90A-NEXT:    v_max_f16_e32 v2, v2, v2
6788; GFX90A-NEXT:  .LBB28_1: ; %atomicrmw.start
6789; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
6790; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6791; GFX90A-NEXT:    v_mov_b32_e32 v7, v5
6792; GFX90A-NEXT:    v_lshrrev_b32_e32 v5, v3, v7
6793; GFX90A-NEXT:    v_max_f16_e32 v5, v5, v5
6794; GFX90A-NEXT:    v_min_f16_e32 v5, v5, v2
6795; GFX90A-NEXT:    v_lshlrev_b32_e32 v5, v3, v5
6796; GFX90A-NEXT:    v_and_or_b32 v6, v7, v4, v5
6797; GFX90A-NEXT:    flat_atomic_cmpswap v5, v[0:1], v[6:7] glc
6798; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6799; GFX90A-NEXT:    buffer_wbinvl1
6800; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v7
6801; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
6802; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
6803; GFX90A-NEXT:    s_cbranch_execnz .LBB28_1
6804; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
6805; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
6806; GFX90A-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
6807; GFX90A-NEXT:    s_setpc_b64 s[30:31]
6808;
6809; GFX908-LABEL: flat_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
6810; GFX908:       ; %bb.0:
6811; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6812; GFX908-NEXT:    v_add_co_u32_e32 v3, vcc, 0xfffff800, v0
6813; GFX908-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
6814; GFX908-NEXT:    v_and_b32_e32 v0, -4, v3
6815; GFX908-NEXT:    flat_load_dword v5, v[0:1]
6816; GFX908-NEXT:    v_and_b32_e32 v3, 3, v3
6817; GFX908-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
6818; GFX908-NEXT:    s_mov_b32 s4, 0xffff
6819; GFX908-NEXT:    v_lshlrev_b32_e64 v4, v3, s4
6820; GFX908-NEXT:    v_not_b32_e32 v4, v4
6821; GFX908-NEXT:    s_mov_b64 s[4:5], 0
6822; GFX908-NEXT:    v_max_f16_e32 v2, v2, v2
6823; GFX908-NEXT:  .LBB28_1: ; %atomicrmw.start
6824; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
6825; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6826; GFX908-NEXT:    v_mov_b32_e32 v6, v5
6827; GFX908-NEXT:    v_lshrrev_b32_e32 v5, v3, v6
6828; GFX908-NEXT:    v_max_f16_e32 v5, v5, v5
6829; GFX908-NEXT:    v_min_f16_e32 v5, v5, v2
6830; GFX908-NEXT:    v_lshlrev_b32_e32 v5, v3, v5
6831; GFX908-NEXT:    v_and_or_b32 v5, v6, v4, v5
6832; GFX908-NEXT:    flat_atomic_cmpswap v5, v[0:1], v[5:6] glc
6833; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6834; GFX908-NEXT:    buffer_wbinvl1
6835; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v6
6836; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
6837; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
6838; GFX908-NEXT:    s_cbranch_execnz .LBB28_1
6839; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
6840; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
6841; GFX908-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
6842; GFX908-NEXT:    s_setpc_b64 s[30:31]
6843;
6844; GFX8-LABEL: flat_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
6845; GFX8:       ; %bb.0:
6846; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6847; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0xfffff800, v0
6848; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, -1, v1, vcc
6849; GFX8-NEXT:    v_and_b32_e32 v0, -4, v3
6850; GFX8-NEXT:    flat_load_dword v5, v[0:1]
6851; GFX8-NEXT:    v_and_b32_e32 v3, 3, v3
6852; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
6853; GFX8-NEXT:    s_mov_b32 s4, 0xffff
6854; GFX8-NEXT:    v_lshlrev_b32_e64 v4, v3, s4
6855; GFX8-NEXT:    v_not_b32_e32 v4, v4
6856; GFX8-NEXT:    s_mov_b64 s[4:5], 0
6857; GFX8-NEXT:    v_max_f16_e32 v2, v2, v2
6858; GFX8-NEXT:  .LBB28_1: ; %atomicrmw.start
6859; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
6860; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6861; GFX8-NEXT:    v_mov_b32_e32 v6, v5
6862; GFX8-NEXT:    v_lshrrev_b32_e32 v5, v3, v6
6863; GFX8-NEXT:    v_max_f16_e32 v5, v5, v5
6864; GFX8-NEXT:    v_min_f16_e32 v5, v5, v2
6865; GFX8-NEXT:    v_and_b32_e32 v7, v6, v4
6866; GFX8-NEXT:    v_lshlrev_b32_e32 v5, v3, v5
6867; GFX8-NEXT:    v_or_b32_e32 v5, v7, v5
6868; GFX8-NEXT:    flat_atomic_cmpswap v5, v[0:1], v[5:6] glc
6869; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6870; GFX8-NEXT:    buffer_wbinvl1
6871; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v6
6872; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
6873; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
6874; GFX8-NEXT:    s_cbranch_execnz .LBB28_1
6875; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
6876; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
6877; GFX8-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
6878; GFX8-NEXT:    s_setpc_b64 s[30:31]
6879;
6880; GFX7-LABEL: flat_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
6881; GFX7:       ; %bb.0:
6882; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6883; GFX7-NEXT:    v_add_i32_e32 v3, vcc, 0xfffff800, v0
6884; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, -1, v1, vcc
6885; GFX7-NEXT:    v_and_b32_e32 v0, -4, v3
6886; GFX7-NEXT:    flat_load_dword v5, v[0:1]
6887; GFX7-NEXT:    v_cvt_f16_f32_e32 v4, v2
6888; GFX7-NEXT:    v_and_b32_e32 v2, 3, v3
6889; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 3, v2
6890; GFX7-NEXT:    s_mov_b64 s[4:5], 0
6891; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v4
6892; GFX7-NEXT:    v_lshl_b32_e32 v4, 0xffff, v2
6893; GFX7-NEXT:    v_not_b32_e32 v4, v4
6894; GFX7-NEXT:  .LBB28_1: ; %atomicrmw.start
6895; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
6896; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6897; GFX7-NEXT:    v_mov_b32_e32 v6, v5
6898; GFX7-NEXT:    v_lshrrev_b32_e32 v5, v2, v6
6899; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v5
6900; GFX7-NEXT:    v_and_b32_e32 v7, v6, v4
6901; GFX7-NEXT:    v_min_f32_e32 v5, v5, v3
6902; GFX7-NEXT:    v_cvt_f16_f32_e32 v5, v5
6903; GFX7-NEXT:    v_lshlrev_b32_e32 v5, v2, v5
6904; GFX7-NEXT:    v_or_b32_e32 v5, v7, v5
6905; GFX7-NEXT:    flat_atomic_cmpswap v5, v[0:1], v[5:6] glc
6906; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6907; GFX7-NEXT:    buffer_wbinvl1
6908; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v6
6909; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
6910; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
6911; GFX7-NEXT:    s_cbranch_execnz .LBB28_1
6912; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
6913; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
6914; GFX7-NEXT:    v_lshrrev_b32_e32 v0, v2, v5
6915; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
6916; GFX7-NEXT:    s_setpc_b64 s[30:31]
  ; Step back 1024 half elements from %ptr; the assertions above show this as
  ; the 0xfffff800 / -1 pair added to the incoming 64-bit address.
6917  %gep = getelementptr half, ptr %ptr, i64 -1024
6918  %result = atomicrmw fmin ptr %gep, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
6919  ret half %result
6920 }
6921
6922define void @flat_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(ptr %ptr, half %val) #0 {
6923; GFX12-LABEL: flat_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory:
6924; GFX12:       ; %bb.0:
6925; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
6926; GFX12-NEXT:    s_wait_expcnt 0x0
6927; GFX12-NEXT:    s_wait_samplecnt 0x0
6928; GFX12-NEXT:    s_wait_bvhcnt 0x0
6929; GFX12-NEXT:    s_wait_kmcnt 0x0
6930; GFX12-NEXT:    v_mov_b32_e32 v3, v0
6931; GFX12-NEXT:    v_max_num_f16_e32 v2, v2, v2
6932; GFX12-NEXT:    s_mov_b32 s0, 0
6933; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
6934; GFX12-NEXT:    v_and_b32_e32 v0, -4, v3
6935; GFX12-NEXT:    v_and_b32_e32 v3, 3, v3
6936; GFX12-NEXT:    flat_load_b32 v4, v[0:1]
6937; GFX12-NEXT:    v_lshlrev_b32_e32 v5, 3, v3
6938; GFX12-NEXT:    v_lshlrev_b32_e64 v3, v5, 0xffff
6939; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
6940; GFX12-NEXT:    v_not_b32_e32 v6, v3
6941; GFX12-NEXT:  .LBB29_1: ; %atomicrmw.start
6942; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
6943; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
6944; GFX12-NEXT:    v_lshrrev_b32_e32 v3, v5, v4
6945; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
6946; GFX12-NEXT:    v_max_num_f16_e32 v3, v3, v3
6947; GFX12-NEXT:    v_min_num_f16_e32 v3, v3, v2
6948; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
6949; GFX12-NEXT:    v_and_b32_e32 v3, 0xffff, v3
6950; GFX12-NEXT:    v_lshlrev_b32_e32 v3, v5, v3
6951; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
6952; GFX12-NEXT:    v_and_or_b32 v3, v4, v6, v3
6953; GFX12-NEXT:    s_wait_storecnt 0x0
6954; GFX12-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
6955; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
6956; GFX12-NEXT:    global_inv scope:SCOPE_DEV
6957; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
6958; GFX12-NEXT:    v_mov_b32_e32 v4, v3
6959; GFX12-NEXT:    s_wait_alu 0xfffe
6960; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
6961; GFX12-NEXT:    s_wait_alu 0xfffe
6962; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
6963; GFX12-NEXT:    s_cbranch_execnz .LBB29_1
6964; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
6965; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
6966; GFX12-NEXT:    s_wait_alu 0xfffe
6967; GFX12-NEXT:    s_setpc_b64 s[30:31]
6968;
6969; GFX940-LABEL: flat_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory:
6970; GFX940:       ; %bb.0:
6971; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6972; GFX940-NEXT:    v_mov_b32_e32 v3, v0
6973; GFX940-NEXT:    v_and_b32_e32 v0, -4, v3
6974; GFX940-NEXT:    flat_load_dword v5, v[0:1]
6975; GFX940-NEXT:    v_and_b32_e32 v3, 3, v3
6976; GFX940-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
6977; GFX940-NEXT:    s_mov_b32 s0, 0xffff
6978; GFX940-NEXT:    v_lshlrev_b32_e64 v4, v3, s0
6979; GFX940-NEXT:    v_not_b32_e32 v6, v4
6980; GFX940-NEXT:    s_mov_b64 s[0:1], 0
6981; GFX940-NEXT:    v_max_f16_e32 v2, v2, v2
6982; GFX940-NEXT:  .LBB29_1: ; %atomicrmw.start
6983; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
6984; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6985; GFX940-NEXT:    v_lshrrev_b32_e32 v4, v3, v5
6986; GFX940-NEXT:    v_max_f16_e32 v4, v4, v4
6987; GFX940-NEXT:    v_min_f16_e32 v4, v4, v2
6988; GFX940-NEXT:    v_lshlrev_b32_e32 v4, v3, v4
6989; GFX940-NEXT:    v_and_or_b32 v4, v5, v6, v4
6990; GFX940-NEXT:    buffer_wbl2 sc1
6991; GFX940-NEXT:    flat_atomic_cmpswap v4, v[0:1], v[4:5] sc0
6992; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
6993; GFX940-NEXT:    buffer_inv sc1
6994; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v5
6995; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
6996; GFX940-NEXT:    v_mov_b32_e32 v5, v4
6997; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
6998; GFX940-NEXT:    s_cbranch_execnz .LBB29_1
6999; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
7000; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
7001; GFX940-NEXT:    s_setpc_b64 s[30:31]
7002;
7003; GFX11-LABEL: flat_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory:
7004; GFX11:       ; %bb.0:
7005; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7006; GFX11-NEXT:    v_mov_b32_e32 v3, v0
7007; GFX11-NEXT:    v_max_f16_e32 v2, v2, v2
7008; GFX11-NEXT:    s_mov_b32 s0, 0
7009; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
7010; GFX11-NEXT:    v_and_b32_e32 v0, -4, v3
7011; GFX11-NEXT:    v_and_b32_e32 v3, 3, v3
7012; GFX11-NEXT:    flat_load_b32 v4, v[0:1]
7013; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 3, v3
7014; GFX11-NEXT:    v_lshlrev_b32_e64 v3, v5, 0xffff
7015; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
7016; GFX11-NEXT:    v_not_b32_e32 v6, v3
7017; GFX11-NEXT:  .LBB29_1: ; %atomicrmw.start
7018; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
7019; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7020; GFX11-NEXT:    v_lshrrev_b32_e32 v3, v5, v4
7021; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
7022; GFX11-NEXT:    v_max_f16_e32 v3, v3, v3
7023; GFX11-NEXT:    v_min_f16_e32 v3, v3, v2
7024; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
7025; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff, v3
7026; GFX11-NEXT:    v_lshlrev_b32_e32 v3, v5, v3
7027; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
7028; GFX11-NEXT:    v_and_or_b32 v3, v4, v6, v3
7029; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
7030; GFX11-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc
7031; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7032; GFX11-NEXT:    buffer_gl1_inv
7033; GFX11-NEXT:    buffer_gl0_inv
7034; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
7035; GFX11-NEXT:    v_mov_b32_e32 v4, v3
7036; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
7037; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
7038; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
7039; GFX11-NEXT:    s_cbranch_execnz .LBB29_1
7040; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
7041; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
7042; GFX11-NEXT:    s_setpc_b64 s[30:31]
7043;
7044; GFX10-LABEL: flat_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory:
7045; GFX10:       ; %bb.0:
7046; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7047; GFX10-NEXT:    v_mov_b32_e32 v3, v0
7048; GFX10-NEXT:    v_max_f16_e32 v2, v2, v2
7049; GFX10-NEXT:    s_mov_b32 s4, 0
7050; GFX10-NEXT:    v_and_b32_e32 v0, -4, v3
7051; GFX10-NEXT:    v_and_b32_e32 v3, 3, v3
7052; GFX10-NEXT:    flat_load_dword v4, v[0:1]
7053; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 3, v3
7054; GFX10-NEXT:    v_lshlrev_b32_e64 v3, v5, 0xffff
7055; GFX10-NEXT:    v_not_b32_e32 v6, v3
7056; GFX10-NEXT:  .LBB29_1: ; %atomicrmw.start
7057; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
7058; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7059; GFX10-NEXT:    v_lshrrev_b32_e32 v3, v5, v4
7060; GFX10-NEXT:    v_max_f16_e32 v3, v3, v3
7061; GFX10-NEXT:    v_min_f16_e32 v3, v3, v2
7062; GFX10-NEXT:    v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
7063; GFX10-NEXT:    v_and_or_b32 v3, v4, v6, v3
7064; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
7065; GFX10-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
7066; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7067; GFX10-NEXT:    buffer_gl1_inv
7068; GFX10-NEXT:    buffer_gl0_inv
7069; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
7070; GFX10-NEXT:    v_mov_b32_e32 v4, v3
7071; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
7072; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
7073; GFX10-NEXT:    s_cbranch_execnz .LBB29_1
7074; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
7075; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
7076; GFX10-NEXT:    s_setpc_b64 s[30:31]
7077;
7078; GFX90A-LABEL: flat_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory:
7079; GFX90A:       ; %bb.0:
7080; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7081; GFX90A-NEXT:    v_mov_b32_e32 v3, v0
7082; GFX90A-NEXT:    v_and_b32_e32 v0, -4, v3
7083; GFX90A-NEXT:    flat_load_dword v5, v[0:1]
7084; GFX90A-NEXT:    v_and_b32_e32 v3, 3, v3
7085; GFX90A-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
7086; GFX90A-NEXT:    s_mov_b32 s4, 0xffff
7087; GFX90A-NEXT:    v_lshlrev_b32_e64 v4, v3, s4
7088; GFX90A-NEXT:    v_not_b32_e32 v6, v4
7089; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
7090; GFX90A-NEXT:    v_max_f16_e32 v2, v2, v2
7091; GFX90A-NEXT:  .LBB29_1: ; %atomicrmw.start
7092; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
7093; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7094; GFX90A-NEXT:    v_lshrrev_b32_e32 v4, v3, v5
7095; GFX90A-NEXT:    v_max_f16_e32 v4, v4, v4
7096; GFX90A-NEXT:    v_min_f16_e32 v4, v4, v2
7097; GFX90A-NEXT:    v_lshlrev_b32_e32 v4, v3, v4
7098; GFX90A-NEXT:    v_and_or_b32 v4, v5, v6, v4
7099; GFX90A-NEXT:    flat_atomic_cmpswap v4, v[0:1], v[4:5] glc
7100; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7101; GFX90A-NEXT:    buffer_wbinvl1
7102; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v5
7103; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
7104; GFX90A-NEXT:    v_mov_b32_e32 v5, v4
7105; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
7106; GFX90A-NEXT:    s_cbranch_execnz .LBB29_1
7107; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
7108; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
7109; GFX90A-NEXT:    s_setpc_b64 s[30:31]
7110;
7111; GFX908-LABEL: flat_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory:
7112; GFX908:       ; %bb.0:
7113; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7114; GFX908-NEXT:    v_mov_b32_e32 v3, v0
7115; GFX908-NEXT:    v_and_b32_e32 v0, -4, v3
7116; GFX908-NEXT:    flat_load_dword v4, v[0:1]
7117; GFX908-NEXT:    v_and_b32_e32 v3, 3, v3
7118; GFX908-NEXT:    v_lshlrev_b32_e32 v5, 3, v3
7119; GFX908-NEXT:    s_mov_b32 s4, 0xffff
7120; GFX908-NEXT:    v_lshlrev_b32_e64 v3, v5, s4
7121; GFX908-NEXT:    v_not_b32_e32 v6, v3
7122; GFX908-NEXT:    s_mov_b64 s[4:5], 0
7123; GFX908-NEXT:    v_max_f16_e32 v2, v2, v2
7124; GFX908-NEXT:  .LBB29_1: ; %atomicrmw.start
7125; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
7126; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7127; GFX908-NEXT:    v_lshrrev_b32_e32 v3, v5, v4
7128; GFX908-NEXT:    v_max_f16_e32 v3, v3, v3
7129; GFX908-NEXT:    v_min_f16_e32 v3, v3, v2
7130; GFX908-NEXT:    v_lshlrev_b32_e32 v3, v5, v3
7131; GFX908-NEXT:    v_and_or_b32 v3, v4, v6, v3
7132; GFX908-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
7133; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7134; GFX908-NEXT:    buffer_wbinvl1
7135; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
7136; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
7137; GFX908-NEXT:    v_mov_b32_e32 v4, v3
7138; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
7139; GFX908-NEXT:    s_cbranch_execnz .LBB29_1
7140; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
7141; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
7142; GFX908-NEXT:    s_setpc_b64 s[30:31]
7143;
7144; GFX8-LABEL: flat_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory:
7145; GFX8:       ; %bb.0:
7146; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7147; GFX8-NEXT:    v_mov_b32_e32 v3, v0
7148; GFX8-NEXT:    v_and_b32_e32 v0, -4, v3
7149; GFX8-NEXT:    flat_load_dword v4, v[0:1]
7150; GFX8-NEXT:    v_and_b32_e32 v3, 3, v3
7151; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 3, v3
7152; GFX8-NEXT:    s_mov_b32 s4, 0xffff
7153; GFX8-NEXT:    v_lshlrev_b32_e64 v3, v5, s4
7154; GFX8-NEXT:    v_not_b32_e32 v6, v3
7155; GFX8-NEXT:    s_mov_b64 s[4:5], 0
7156; GFX8-NEXT:    v_max_f16_e32 v2, v2, v2
7157; GFX8-NEXT:  .LBB29_1: ; %atomicrmw.start
7158; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
7159; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7160; GFX8-NEXT:    v_lshrrev_b32_e32 v3, v5, v4
7161; GFX8-NEXT:    v_max_f16_e32 v3, v3, v3
7162; GFX8-NEXT:    v_min_f16_e32 v3, v3, v2
7163; GFX8-NEXT:    v_and_b32_e32 v7, v4, v6
7164; GFX8-NEXT:    v_lshlrev_b32_e32 v3, v5, v3
7165; GFX8-NEXT:    v_or_b32_e32 v3, v7, v3
7166; GFX8-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
7167; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7168; GFX8-NEXT:    buffer_wbinvl1
7169; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
7170; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
7171; GFX8-NEXT:    v_mov_b32_e32 v4, v3
7172; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
7173; GFX8-NEXT:    s_cbranch_execnz .LBB29_1
7174; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
7175; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
7176; GFX8-NEXT:    s_setpc_b64 s[30:31]
7177;
7178; GFX7-LABEL: flat_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory:
7179; GFX7:       ; %bb.0:
7180; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7181; GFX7-NEXT:    v_mov_b32_e32 v3, v0
7182; GFX7-NEXT:    v_and_b32_e32 v0, -4, v3
7183; GFX7-NEXT:    flat_load_dword v4, v[0:1]
7184; GFX7-NEXT:    v_cvt_f16_f32_e32 v5, v2
7185; GFX7-NEXT:    v_and_b32_e32 v2, 3, v3
7186; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 3, v2
7187; GFX7-NEXT:    v_lshl_b32_e32 v3, 0xffff, v2
7188; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v5
7189; GFX7-NEXT:    v_not_b32_e32 v6, v3
7190; GFX7-NEXT:    s_mov_b64 s[4:5], 0
7191; GFX7-NEXT:  .LBB29_1: ; %atomicrmw.start
7192; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
7193; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7194; GFX7-NEXT:    v_lshrrev_b32_e32 v3, v2, v4
7195; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v3
7196; GFX7-NEXT:    v_and_b32_e32 v7, v4, v6
7197; GFX7-NEXT:    v_min_f32_e32 v3, v3, v5
7198; GFX7-NEXT:    v_cvt_f16_f32_e32 v3, v3
7199; GFX7-NEXT:    v_lshlrev_b32_e32 v3, v2, v3
7200; GFX7-NEXT:    v_or_b32_e32 v3, v7, v3
7201; GFX7-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
7202; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7203; GFX7-NEXT:    buffer_wbinvl1
7204; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
7205; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
7206; GFX7-NEXT:    v_mov_b32_e32 v4, v3
7207; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
7208; GFX7-NEXT:    s_cbranch_execnz .LBB29_1
7209; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
7210; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
7211; GFX7-NEXT:    s_setpc_b64 s[30:31]
7212  %unused = atomicrmw fmin ptr %ptr, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
7213  ret void
7214}
7215
; Noret f16 fmin at a positive offset. The address is %ptr plus 1023 halfs,
; which shows up in the checks below as the byte constant 0x7fe being added
; to the pointer. The atomicrmw result is unused and the function returns void.
; For every checked prefix the expected code is a flat_atomic_cmpswap retry
; loop on the dword at (address & -4), with the 16-bit lane selected by a
; shift of (address & 3) * 8. The GFX7 checks do the min in f32 through
; v_cvt_f32_f16 / v_cvt_f16_f32, the other prefixes use an f16 min directly.
; The check lines are autogenerated, regenerate them with
; utils/update_llc_test_checks.py instead of editing by hand.
7216define void @flat_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr %ptr, half %val) #0 {
7217; GFX12-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
7218; GFX12:       ; %bb.0:
7219; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
7220; GFX12-NEXT:    s_wait_expcnt 0x0
7221; GFX12-NEXT:    s_wait_samplecnt 0x0
7222; GFX12-NEXT:    s_wait_bvhcnt 0x0
7223; GFX12-NEXT:    s_wait_kmcnt 0x0
7224; GFX12-NEXT:    v_add_co_u32 v4, vcc_lo, 0x7fe, v0
7225; GFX12-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
7226; GFX12-NEXT:    v_max_num_f16_e32 v6, v2, v2
7227; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
7228; GFX12-NEXT:    v_and_b32_e32 v0, -4, v4
7229; GFX12-NEXT:    v_and_b32_e32 v4, 3, v4
7230; GFX12-NEXT:    s_mov_b32 s0, 0
7231; GFX12-NEXT:    flat_load_b32 v3, v[0:1]
7232; GFX12-NEXT:    v_lshlrev_b32_e32 v4, 3, v4
7233; GFX12-NEXT:    v_lshlrev_b32_e64 v5, v4, 0xffff
7234; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
7235; GFX12-NEXT:    v_not_b32_e32 v5, v5
7236; GFX12-NEXT:  .LBB30_1: ; %atomicrmw.start
7237; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
7238; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
7239; GFX12-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
7240; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
7241; GFX12-NEXT:    v_max_num_f16_e32 v2, v2, v2
7242; GFX12-NEXT:    v_min_num_f16_e32 v2, v2, v6
7243; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
7244; GFX12-NEXT:    v_and_b32_e32 v2, 0xffff, v2
7245; GFX12-NEXT:    v_lshlrev_b32_e32 v2, v4, v2
7246; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
7247; GFX12-NEXT:    v_and_or_b32 v2, v3, v5, v2
7248; GFX12-NEXT:    s_wait_storecnt 0x0
7249; GFX12-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
7250; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
7251; GFX12-NEXT:    global_inv scope:SCOPE_DEV
7252; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
7253; GFX12-NEXT:    v_mov_b32_e32 v3, v2
7254; GFX12-NEXT:    s_wait_alu 0xfffe
7255; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
7256; GFX12-NEXT:    s_wait_alu 0xfffe
7257; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
7258; GFX12-NEXT:    s_cbranch_execnz .LBB30_1
7259; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
7260; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
7261; GFX12-NEXT:    s_wait_alu 0xfffe
7262; GFX12-NEXT:    s_setpc_b64 s[30:31]
7263;
7264; GFX940-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
7265; GFX940:       ; %bb.0:
7266; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7267; GFX940-NEXT:    s_mov_b64 s[0:1], 0x7fe
7268; GFX940-NEXT:    v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
7269; GFX940-NEXT:    v_and_b32_e32 v0, -4, v4
7270; GFX940-NEXT:    v_mov_b32_e32 v1, v5
7271; GFX940-NEXT:    flat_load_dword v3, v[0:1]
7272; GFX940-NEXT:    v_and_b32_e32 v4, 3, v4
7273; GFX940-NEXT:    v_lshlrev_b32_e32 v4, 3, v4
7274; GFX940-NEXT:    s_mov_b32 s0, 0xffff
7275; GFX940-NEXT:    v_lshlrev_b32_e64 v5, v4, s0
7276; GFX940-NEXT:    v_not_b32_e32 v5, v5
7277; GFX940-NEXT:    s_mov_b64 s[0:1], 0
7278; GFX940-NEXT:    v_max_f16_e32 v6, v2, v2
7279; GFX940-NEXT:  .LBB30_1: ; %atomicrmw.start
7280; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
7281; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7282; GFX940-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
7283; GFX940-NEXT:    v_max_f16_e32 v2, v2, v2
7284; GFX940-NEXT:    v_min_f16_e32 v2, v2, v6
7285; GFX940-NEXT:    v_lshlrev_b32_e32 v2, v4, v2
7286; GFX940-NEXT:    v_and_or_b32 v2, v3, v5, v2
7287; GFX940-NEXT:    buffer_wbl2 sc1
7288; GFX940-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0
7289; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7290; GFX940-NEXT:    buffer_inv sc1
7291; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
7292; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
7293; GFX940-NEXT:    v_mov_b32_e32 v3, v2
7294; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
7295; GFX940-NEXT:    s_cbranch_execnz .LBB30_1
7296; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
7297; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
7298; GFX940-NEXT:    s_setpc_b64 s[30:31]
7299;
7300; GFX11-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
7301; GFX11:       ; %bb.0:
7302; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7303; GFX11-NEXT:    v_add_co_u32 v4, vcc_lo, 0x7fe, v0
7304; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
7305; GFX11-NEXT:    v_max_f16_e32 v6, v2, v2
7306; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
7307; GFX11-NEXT:    v_and_b32_e32 v0, -4, v4
7308; GFX11-NEXT:    v_and_b32_e32 v4, 3, v4
7309; GFX11-NEXT:    s_mov_b32 s0, 0
7310; GFX11-NEXT:    flat_load_b32 v3, v[0:1]
7311; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 3, v4
7312; GFX11-NEXT:    v_lshlrev_b32_e64 v5, v4, 0xffff
7313; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
7314; GFX11-NEXT:    v_not_b32_e32 v5, v5
7315; GFX11-NEXT:  .LBB30_1: ; %atomicrmw.start
7316; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
7317; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7318; GFX11-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
7319; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
7320; GFX11-NEXT:    v_max_f16_e32 v2, v2, v2
7321; GFX11-NEXT:    v_min_f16_e32 v2, v2, v6
7322; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
7323; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff, v2
7324; GFX11-NEXT:    v_lshlrev_b32_e32 v2, v4, v2
7325; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
7326; GFX11-NEXT:    v_and_or_b32 v2, v3, v5, v2
7327; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
7328; GFX11-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc
7329; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7330; GFX11-NEXT:    buffer_gl1_inv
7331; GFX11-NEXT:    buffer_gl0_inv
7332; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
7333; GFX11-NEXT:    v_mov_b32_e32 v3, v2
7334; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
7335; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
7336; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
7337; GFX11-NEXT:    s_cbranch_execnz .LBB30_1
7338; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
7339; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
7340; GFX11-NEXT:    s_setpc_b64 s[30:31]
7341;
7342; GFX10-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
7343; GFX10:       ; %bb.0:
7344; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7345; GFX10-NEXT:    v_add_co_u32 v4, vcc_lo, 0x7fe, v0
7346; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
7347; GFX10-NEXT:    v_max_f16_e32 v6, v2, v2
7348; GFX10-NEXT:    v_and_b32_e32 v0, -4, v4
7349; GFX10-NEXT:    v_and_b32_e32 v4, 3, v4
7350; GFX10-NEXT:    s_mov_b32 s4, 0
7351; GFX10-NEXT:    flat_load_dword v3, v[0:1]
7352; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 3, v4
7353; GFX10-NEXT:    v_lshlrev_b32_e64 v5, v4, 0xffff
7354; GFX10-NEXT:    v_not_b32_e32 v5, v5
7355; GFX10-NEXT:  .LBB30_1: ; %atomicrmw.start
7356; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
7357; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7358; GFX10-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
7359; GFX10-NEXT:    v_max_f16_e32 v2, v2, v2
7360; GFX10-NEXT:    v_min_f16_e32 v2, v2, v6
7361; GFX10-NEXT:    v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
7362; GFX10-NEXT:    v_and_or_b32 v2, v3, v5, v2
7363; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
7364; GFX10-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
7365; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7366; GFX10-NEXT:    buffer_gl1_inv
7367; GFX10-NEXT:    buffer_gl0_inv
7368; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
7369; GFX10-NEXT:    v_mov_b32_e32 v3, v2
7370; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
7371; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
7372; GFX10-NEXT:    s_cbranch_execnz .LBB30_1
7373; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
7374; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
7375; GFX10-NEXT:    s_setpc_b64 s[30:31]
7376;
7377; GFX90A-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
7378; GFX90A:       ; %bb.0:
7379; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7380; GFX90A-NEXT:    v_add_co_u32_e32 v4, vcc, 0x7fe, v0
7381; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
7382; GFX90A-NEXT:    v_and_b32_e32 v0, -4, v4
7383; GFX90A-NEXT:    flat_load_dword v3, v[0:1]
7384; GFX90A-NEXT:    v_and_b32_e32 v4, 3, v4
7385; GFX90A-NEXT:    v_lshlrev_b32_e32 v4, 3, v4
7386; GFX90A-NEXT:    s_mov_b32 s4, 0xffff
7387; GFX90A-NEXT:    v_lshlrev_b32_e64 v5, v4, s4
7388; GFX90A-NEXT:    v_not_b32_e32 v5, v5
7389; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
7390; GFX90A-NEXT:    v_max_f16_e32 v6, v2, v2
7391; GFX90A-NEXT:  .LBB30_1: ; %atomicrmw.start
7392; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
7393; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7394; GFX90A-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
7395; GFX90A-NEXT:    v_max_f16_e32 v2, v2, v2
7396; GFX90A-NEXT:    v_min_f16_e32 v2, v2, v6
7397; GFX90A-NEXT:    v_lshlrev_b32_e32 v2, v4, v2
7398; GFX90A-NEXT:    v_and_or_b32 v2, v3, v5, v2
7399; GFX90A-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
7400; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7401; GFX90A-NEXT:    buffer_wbinvl1
7402; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
7403; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
7404; GFX90A-NEXT:    v_mov_b32_e32 v3, v2
7405; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
7406; GFX90A-NEXT:    s_cbranch_execnz .LBB30_1
7407; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
7408; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
7409; GFX90A-NEXT:    s_setpc_b64 s[30:31]
7410;
7411; GFX908-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
7412; GFX908:       ; %bb.0:
7413; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7414; GFX908-NEXT:    v_add_co_u32_e32 v4, vcc, 0x7fe, v0
7415; GFX908-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
7416; GFX908-NEXT:    v_and_b32_e32 v0, -4, v4
7417; GFX908-NEXT:    flat_load_dword v3, v[0:1]
7418; GFX908-NEXT:    v_and_b32_e32 v4, 3, v4
7419; GFX908-NEXT:    v_lshlrev_b32_e32 v4, 3, v4
7420; GFX908-NEXT:    s_mov_b32 s4, 0xffff
7421; GFX908-NEXT:    v_lshlrev_b32_e64 v5, v4, s4
7422; GFX908-NEXT:    v_not_b32_e32 v5, v5
7423; GFX908-NEXT:    s_mov_b64 s[4:5], 0
7424; GFX908-NEXT:    v_max_f16_e32 v6, v2, v2
7425; GFX908-NEXT:  .LBB30_1: ; %atomicrmw.start
7426; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
7427; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7428; GFX908-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
7429; GFX908-NEXT:    v_max_f16_e32 v2, v2, v2
7430; GFX908-NEXT:    v_min_f16_e32 v2, v2, v6
7431; GFX908-NEXT:    v_lshlrev_b32_e32 v2, v4, v2
7432; GFX908-NEXT:    v_and_or_b32 v2, v3, v5, v2
7433; GFX908-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
7434; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7435; GFX908-NEXT:    buffer_wbinvl1
7436; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
7437; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
7438; GFX908-NEXT:    v_mov_b32_e32 v3, v2
7439; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
7440; GFX908-NEXT:    s_cbranch_execnz .LBB30_1
7441; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
7442; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
7443; GFX908-NEXT:    s_setpc_b64 s[30:31]
7444;
7445; GFX8-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
7446; GFX8:       ; %bb.0:
7447; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7448; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 0x7fe, v0
7449; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
7450; GFX8-NEXT:    v_and_b32_e32 v0, -4, v4
7451; GFX8-NEXT:    flat_load_dword v3, v[0:1]
7452; GFX8-NEXT:    v_and_b32_e32 v4, 3, v4
7453; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 3, v4
7454; GFX8-NEXT:    s_mov_b32 s4, 0xffff
7455; GFX8-NEXT:    v_lshlrev_b32_e64 v5, v4, s4
7456; GFX8-NEXT:    v_not_b32_e32 v5, v5
7457; GFX8-NEXT:    s_mov_b64 s[4:5], 0
7458; GFX8-NEXT:    v_max_f16_e32 v6, v2, v2
7459; GFX8-NEXT:  .LBB30_1: ; %atomicrmw.start
7460; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
7461; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7462; GFX8-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
7463; GFX8-NEXT:    v_max_f16_e32 v2, v2, v2
7464; GFX8-NEXT:    v_min_f16_e32 v2, v2, v6
7465; GFX8-NEXT:    v_and_b32_e32 v7, v3, v5
7466; GFX8-NEXT:    v_lshlrev_b32_e32 v2, v4, v2
7467; GFX8-NEXT:    v_or_b32_e32 v2, v7, v2
7468; GFX8-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
7469; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7470; GFX8-NEXT:    buffer_wbinvl1
7471; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
7472; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
7473; GFX8-NEXT:    v_mov_b32_e32 v3, v2
7474; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
7475; GFX8-NEXT:    s_cbranch_execnz .LBB30_1
7476; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
7477; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
7478; GFX8-NEXT:    s_setpc_b64 s[30:31]
7479;
7480; GFX7-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
7481; GFX7:       ; %bb.0:
7482; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7483; GFX7-NEXT:    v_add_i32_e32 v4, vcc, 0x7fe, v0
7484; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
7485; GFX7-NEXT:    v_and_b32_e32 v0, -4, v4
7486; GFX7-NEXT:    flat_load_dword v3, v[0:1]
7487; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
7488; GFX7-NEXT:    v_and_b32_e32 v4, 3, v4
7489; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 3, v4
7490; GFX7-NEXT:    s_mov_b64 s[4:5], 0
7491; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v2
7492; GFX7-NEXT:    v_lshl_b32_e32 v2, 0xffff, v4
7493; GFX7-NEXT:    v_not_b32_e32 v6, v2
7494; GFX7-NEXT:  .LBB30_1: ; %atomicrmw.start
7495; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
7496; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7497; GFX7-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
7498; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v2
7499; GFX7-NEXT:    v_and_b32_e32 v7, v3, v6
7500; GFX7-NEXT:    v_min_f32_e32 v2, v2, v5
7501; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
7502; GFX7-NEXT:    v_lshlrev_b32_e32 v2, v4, v2
7503; GFX7-NEXT:    v_or_b32_e32 v2, v7, v2
7504; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
7505; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7506; GFX7-NEXT:    buffer_wbinvl1
7507; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
7508; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
7509; GFX7-NEXT:    v_mov_b32_e32 v3, v2
7510; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
7511; GFX7-NEXT:    s_cbranch_execnz .LBB30_1
7512; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
7513; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
7514; GFX7-NEXT:    s_setpc_b64 s[30:31]
; 1023 half elements, i.e. 2046 (0x7fe) bytes past %ptr.
7515  %gep = getelementptr half, ptr %ptr, i64 1023
; Agent-scope seq_cst fmin whose result is deliberately left unused.
7516  %unused = atomicrmw fmin ptr %gep, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
7517  ret void
7518}
7519
7520define void @flat_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr %ptr, half %val) #0 {
7521; GFX12-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
7522; GFX12:       ; %bb.0:
7523; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
7524; GFX12-NEXT:    s_wait_expcnt 0x0
7525; GFX12-NEXT:    s_wait_samplecnt 0x0
7526; GFX12-NEXT:    s_wait_bvhcnt 0x0
7527; GFX12-NEXT:    s_wait_kmcnt 0x0
7528; GFX12-NEXT:    v_add_co_u32 v4, vcc_lo, 0xfffff800, v0
7529; GFX12-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
7530; GFX12-NEXT:    v_max_num_f16_e32 v6, v2, v2
7531; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
7532; GFX12-NEXT:    v_and_b32_e32 v0, -4, v4
7533; GFX12-NEXT:    v_and_b32_e32 v4, 3, v4
7534; GFX12-NEXT:    s_mov_b32 s0, 0
7535; GFX12-NEXT:    flat_load_b32 v3, v[0:1]
7536; GFX12-NEXT:    v_lshlrev_b32_e32 v4, 3, v4
7537; GFX12-NEXT:    v_lshlrev_b32_e64 v5, v4, 0xffff
7538; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
7539; GFX12-NEXT:    v_not_b32_e32 v5, v5
7540; GFX12-NEXT:  .LBB31_1: ; %atomicrmw.start
7541; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
7542; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
7543; GFX12-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
7544; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
7545; GFX12-NEXT:    v_max_num_f16_e32 v2, v2, v2
7546; GFX12-NEXT:    v_min_num_f16_e32 v2, v2, v6
7547; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
7548; GFX12-NEXT:    v_and_b32_e32 v2, 0xffff, v2
7549; GFX12-NEXT:    v_lshlrev_b32_e32 v2, v4, v2
7550; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
7551; GFX12-NEXT:    v_and_or_b32 v2, v3, v5, v2
7552; GFX12-NEXT:    s_wait_storecnt 0x0
7553; GFX12-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
7554; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
7555; GFX12-NEXT:    global_inv scope:SCOPE_DEV
7556; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
7557; GFX12-NEXT:    v_mov_b32_e32 v3, v2
7558; GFX12-NEXT:    s_wait_alu 0xfffe
7559; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
7560; GFX12-NEXT:    s_wait_alu 0xfffe
7561; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
7562; GFX12-NEXT:    s_cbranch_execnz .LBB31_1
7563; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
7564; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
7565; GFX12-NEXT:    s_wait_alu 0xfffe
7566; GFX12-NEXT:    s_setpc_b64 s[30:31]
7567;
7568; GFX940-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
7569; GFX940:       ; %bb.0:
7570; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7571; GFX940-NEXT:    s_movk_i32 s0, 0xf800
7572; GFX940-NEXT:    s_mov_b32 s1, -1
7573; GFX940-NEXT:    v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
7574; GFX940-NEXT:    v_and_b32_e32 v0, -4, v4
7575; GFX940-NEXT:    v_mov_b32_e32 v1, v5
7576; GFX940-NEXT:    flat_load_dword v3, v[0:1]
7577; GFX940-NEXT:    v_and_b32_e32 v4, 3, v4
7578; GFX940-NEXT:    v_lshlrev_b32_e32 v4, 3, v4
7579; GFX940-NEXT:    s_mov_b32 s0, 0xffff
7580; GFX940-NEXT:    v_lshlrev_b32_e64 v5, v4, s0
7581; GFX940-NEXT:    v_not_b32_e32 v5, v5
7582; GFX940-NEXT:    s_mov_b64 s[0:1], 0
7583; GFX940-NEXT:    v_max_f16_e32 v6, v2, v2
7584; GFX940-NEXT:  .LBB31_1: ; %atomicrmw.start
7585; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
7586; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7587; GFX940-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
7588; GFX940-NEXT:    v_max_f16_e32 v2, v2, v2
7589; GFX940-NEXT:    v_min_f16_e32 v2, v2, v6
7590; GFX940-NEXT:    v_lshlrev_b32_e32 v2, v4, v2
7591; GFX940-NEXT:    v_and_or_b32 v2, v3, v5, v2
7592; GFX940-NEXT:    buffer_wbl2 sc1
7593; GFX940-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0
7594; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7595; GFX940-NEXT:    buffer_inv sc1
7596; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
7597; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
7598; GFX940-NEXT:    v_mov_b32_e32 v3, v2
7599; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
7600; GFX940-NEXT:    s_cbranch_execnz .LBB31_1
7601; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
7602; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
7603; GFX940-NEXT:    s_setpc_b64 s[30:31]
7604;
7605; GFX11-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
7606; GFX11:       ; %bb.0:
7607; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7608; GFX11-NEXT:    v_add_co_u32 v4, vcc_lo, 0xfffff800, v0
7609; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
7610; GFX11-NEXT:    v_max_f16_e32 v6, v2, v2
7611; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
7612; GFX11-NEXT:    v_and_b32_e32 v0, -4, v4
7613; GFX11-NEXT:    v_and_b32_e32 v4, 3, v4
7614; GFX11-NEXT:    s_mov_b32 s0, 0
7615; GFX11-NEXT:    flat_load_b32 v3, v[0:1]
7616; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 3, v4
7617; GFX11-NEXT:    v_lshlrev_b32_e64 v5, v4, 0xffff
7618; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
7619; GFX11-NEXT:    v_not_b32_e32 v5, v5
7620; GFX11-NEXT:  .LBB31_1: ; %atomicrmw.start
7621; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
7622; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7623; GFX11-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
7624; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
7625; GFX11-NEXT:    v_max_f16_e32 v2, v2, v2
7626; GFX11-NEXT:    v_min_f16_e32 v2, v2, v6
7627; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
7628; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff, v2
7629; GFX11-NEXT:    v_lshlrev_b32_e32 v2, v4, v2
7630; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
7631; GFX11-NEXT:    v_and_or_b32 v2, v3, v5, v2
7632; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
7633; GFX11-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc
7634; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7635; GFX11-NEXT:    buffer_gl1_inv
7636; GFX11-NEXT:    buffer_gl0_inv
7637; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
7638; GFX11-NEXT:    v_mov_b32_e32 v3, v2
7639; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
7640; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
7641; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
7642; GFX11-NEXT:    s_cbranch_execnz .LBB31_1
7643; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
7644; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
7645; GFX11-NEXT:    s_setpc_b64 s[30:31]
7646;
7647; GFX10-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
7648; GFX10:       ; %bb.0:
7649; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7650; GFX10-NEXT:    v_add_co_u32 v4, vcc_lo, 0xfffff800, v0
7651; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
7652; GFX10-NEXT:    v_max_f16_e32 v6, v2, v2
7653; GFX10-NEXT:    v_and_b32_e32 v0, -4, v4
7654; GFX10-NEXT:    v_and_b32_e32 v4, 3, v4
7655; GFX10-NEXT:    s_mov_b32 s4, 0
7656; GFX10-NEXT:    flat_load_dword v3, v[0:1]
7657; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 3, v4
7658; GFX10-NEXT:    v_lshlrev_b32_e64 v5, v4, 0xffff
7659; GFX10-NEXT:    v_not_b32_e32 v5, v5
7660; GFX10-NEXT:  .LBB31_1: ; %atomicrmw.start
7661; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
7662; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7663; GFX10-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
7664; GFX10-NEXT:    v_max_f16_e32 v2, v2, v2
7665; GFX10-NEXT:    v_min_f16_e32 v2, v2, v6
7666; GFX10-NEXT:    v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
7667; GFX10-NEXT:    v_and_or_b32 v2, v3, v5, v2
7668; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
7669; GFX10-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
7670; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7671; GFX10-NEXT:    buffer_gl1_inv
7672; GFX10-NEXT:    buffer_gl0_inv
7673; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
7674; GFX10-NEXT:    v_mov_b32_e32 v3, v2
7675; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
7676; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
7677; GFX10-NEXT:    s_cbranch_execnz .LBB31_1
7678; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
7679; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
7680; GFX10-NEXT:    s_setpc_b64 s[30:31]
7681;
7682; GFX90A-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
7683; GFX90A:       ; %bb.0:
7684; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7685; GFX90A-NEXT:    v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
7686; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
7687; GFX90A-NEXT:    v_and_b32_e32 v0, -4, v4
7688; GFX90A-NEXT:    flat_load_dword v3, v[0:1]
7689; GFX90A-NEXT:    v_and_b32_e32 v4, 3, v4
7690; GFX90A-NEXT:    v_lshlrev_b32_e32 v4, 3, v4
7691; GFX90A-NEXT:    s_mov_b32 s4, 0xffff
7692; GFX90A-NEXT:    v_lshlrev_b32_e64 v5, v4, s4
7693; GFX90A-NEXT:    v_not_b32_e32 v5, v5
7694; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
7695; GFX90A-NEXT:    v_max_f16_e32 v6, v2, v2
7696; GFX90A-NEXT:  .LBB31_1: ; %atomicrmw.start
7697; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
7698; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7699; GFX90A-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
7700; GFX90A-NEXT:    v_max_f16_e32 v2, v2, v2
7701; GFX90A-NEXT:    v_min_f16_e32 v2, v2, v6
7702; GFX90A-NEXT:    v_lshlrev_b32_e32 v2, v4, v2
7703; GFX90A-NEXT:    v_and_or_b32 v2, v3, v5, v2
7704; GFX90A-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
7705; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7706; GFX90A-NEXT:    buffer_wbinvl1
7707; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
7708; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
7709; GFX90A-NEXT:    v_mov_b32_e32 v3, v2
7710; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
7711; GFX90A-NEXT:    s_cbranch_execnz .LBB31_1
7712; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
7713; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
7714; GFX90A-NEXT:    s_setpc_b64 s[30:31]
7715;
7716; GFX908-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
7717; GFX908:       ; %bb.0:
7718; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7719; GFX908-NEXT:    v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
7720; GFX908-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
7721; GFX908-NEXT:    v_and_b32_e32 v0, -4, v4
7722; GFX908-NEXT:    flat_load_dword v3, v[0:1]
7723; GFX908-NEXT:    v_and_b32_e32 v4, 3, v4
7724; GFX908-NEXT:    v_lshlrev_b32_e32 v4, 3, v4
7725; GFX908-NEXT:    s_mov_b32 s4, 0xffff
7726; GFX908-NEXT:    v_lshlrev_b32_e64 v5, v4, s4
7727; GFX908-NEXT:    v_not_b32_e32 v5, v5
7728; GFX908-NEXT:    s_mov_b64 s[4:5], 0
7729; GFX908-NEXT:    v_max_f16_e32 v6, v2, v2
7730; GFX908-NEXT:  .LBB31_1: ; %atomicrmw.start
7731; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
7732; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7733; GFX908-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
7734; GFX908-NEXT:    v_max_f16_e32 v2, v2, v2
7735; GFX908-NEXT:    v_min_f16_e32 v2, v2, v6
7736; GFX908-NEXT:    v_lshlrev_b32_e32 v2, v4, v2
7737; GFX908-NEXT:    v_and_or_b32 v2, v3, v5, v2
7738; GFX908-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
7739; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7740; GFX908-NEXT:    buffer_wbinvl1
7741; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
7742; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
7743; GFX908-NEXT:    v_mov_b32_e32 v3, v2
7744; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
7745; GFX908-NEXT:    s_cbranch_execnz .LBB31_1
7746; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
7747; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
7748; GFX908-NEXT:    s_setpc_b64 s[30:31]
7749;
7750; GFX8-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
7751; GFX8:       ; %bb.0:
7752; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7753; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 0xfffff800, v0
7754; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, -1, v1, vcc
7755; GFX8-NEXT:    v_and_b32_e32 v0, -4, v4
7756; GFX8-NEXT:    flat_load_dword v3, v[0:1]
7757; GFX8-NEXT:    v_and_b32_e32 v4, 3, v4
7758; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 3, v4
7759; GFX8-NEXT:    s_mov_b32 s4, 0xffff
7760; GFX8-NEXT:    v_lshlrev_b32_e64 v5, v4, s4
7761; GFX8-NEXT:    v_not_b32_e32 v5, v5
7762; GFX8-NEXT:    s_mov_b64 s[4:5], 0
7763; GFX8-NEXT:    v_max_f16_e32 v6, v2, v2
7764; GFX8-NEXT:  .LBB31_1: ; %atomicrmw.start
7765; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
7766; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7767; GFX8-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
7768; GFX8-NEXT:    v_max_f16_e32 v2, v2, v2
7769; GFX8-NEXT:    v_min_f16_e32 v2, v2, v6
7770; GFX8-NEXT:    v_and_b32_e32 v7, v3, v5
7771; GFX8-NEXT:    v_lshlrev_b32_e32 v2, v4, v2
7772; GFX8-NEXT:    v_or_b32_e32 v2, v7, v2
7773; GFX8-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
7774; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7775; GFX8-NEXT:    buffer_wbinvl1
7776; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
7777; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
7778; GFX8-NEXT:    v_mov_b32_e32 v3, v2
7779; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
7780; GFX8-NEXT:    s_cbranch_execnz .LBB31_1
7781; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
7782; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
7783; GFX8-NEXT:    s_setpc_b64 s[30:31]
7784;
7785; GFX7-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
7786; GFX7:       ; %bb.0:
7787; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7788; GFX7-NEXT:    v_add_i32_e32 v4, vcc, 0xfffff800, v0
7789; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, -1, v1, vcc
7790; GFX7-NEXT:    v_and_b32_e32 v0, -4, v4
7791; GFX7-NEXT:    flat_load_dword v3, v[0:1]
7792; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
7793; GFX7-NEXT:    v_and_b32_e32 v4, 3, v4
7794; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 3, v4
7795; GFX7-NEXT:    s_mov_b64 s[4:5], 0
7796; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v2
7797; GFX7-NEXT:    v_lshl_b32_e32 v2, 0xffff, v4
7798; GFX7-NEXT:    v_not_b32_e32 v6, v2
7799; GFX7-NEXT:  .LBB31_1: ; %atomicrmw.start
7800; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
7801; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7802; GFX7-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
7803; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v2
7804; GFX7-NEXT:    v_and_b32_e32 v7, v3, v6
7805; GFX7-NEXT:    v_min_f32_e32 v2, v2, v5
7806; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
7807; GFX7-NEXT:    v_lshlrev_b32_e32 v2, v4, v2
7808; GFX7-NEXT:    v_or_b32_e32 v2, v7, v2
7809; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
7810; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7811; GFX7-NEXT:    buffer_wbinvl1
7812; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
7813; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
7814; GFX7-NEXT:    v_mov_b32_e32 v3, v2
7815; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
7816; GFX7-NEXT:    s_cbranch_execnz .LBB31_1
7817; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
7818; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
7819; GFX7-NEXT:    s_setpc_b64 s[30:31]
7820  %gep = getelementptr half, ptr %ptr, i64 -1024
7821  %unused = atomicrmw fmin ptr %gep, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
7822  ret void
7823}
7824
; Returning variant: f16 `atomicrmw fmin` at syncscope("agent"), seq_cst, on a
; flat pointer advanced by 1023 half elements (2046 bytes, 0x7fe), with an
; explicit `align 4` on the atomic and !amdgpu.no.fine.grained.memory attached.
;
; What the expected output below shows:
;  * Every target expands the operation into a compare-and-swap retry loop
;    (%atomicrmw.start) around a 32-bit flat cmpswap.
;  * The containing dword's upper half is preserved with a constant 0xffff0000
;    mask and the new f16 value is merged into the low half; no per-address
;    shift amount or mask is computed in this test.
;  * The GFX12, GFX940, GFX11, GFX90A and GFX908 blocks carry the byte offset
;    in the memory instructions (offset:2046); the GFX10, GFX8 and GFX7 blocks
;    add 0x7fe to the address before the loop instead.
;  * The GFX7 block performs the min in f32 (v_cvt_f32_f16 / v_min_f32 /
;    v_cvt_f16_f32) and converts the loaded value back to f32 for the return.
;
; The assertions are autogenerated (see the NOTE on the first line of this
; file); regenerate them with the update script rather than editing by hand.
7825define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory(ptr %ptr, half %val) #0 {
7826; GFX12-LABEL: flat_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
7827; GFX12:       ; %bb.0:
7828; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
7829; GFX12-NEXT:    s_wait_expcnt 0x0
7830; GFX12-NEXT:    s_wait_samplecnt 0x0
7831; GFX12-NEXT:    s_wait_bvhcnt 0x0
7832; GFX12-NEXT:    s_wait_kmcnt 0x0
7833; GFX12-NEXT:    flat_load_b32 v3, v[0:1] offset:2046
7834; GFX12-NEXT:    v_max_num_f16_e32 v2, v2, v2
7835; GFX12-NEXT:    s_mov_b32 s0, 0
7836; GFX12-NEXT:  .LBB32_1: ; %atomicrmw.start
7837; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
7838; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
7839; GFX12-NEXT:    v_mov_b32_e32 v4, v3
7840; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
7841; GFX12-NEXT:    v_max_num_f16_e32 v3, v4, v4
7842; GFX12-NEXT:    v_min_num_f16_e32 v3, v3, v2
7843; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
7844; GFX12-NEXT:    v_and_b32_e32 v3, 0xffff, v3
7845; GFX12-NEXT:    v_and_or_b32 v3, 0xffff0000, v4, v3
7846; GFX12-NEXT:    s_wait_storecnt 0x0
7847; GFX12-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
7848; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
7849; GFX12-NEXT:    global_inv scope:SCOPE_DEV
7850; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
7851; GFX12-NEXT:    s_wait_alu 0xfffe
7852; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
7853; GFX12-NEXT:    s_wait_alu 0xfffe
7854; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
7855; GFX12-NEXT:    s_cbranch_execnz .LBB32_1
7856; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
7857; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
7858; GFX12-NEXT:    v_mov_b32_e32 v0, v3
7859; GFX12-NEXT:    s_wait_alu 0xfffe
7860; GFX12-NEXT:    s_setpc_b64 s[30:31]
7861;
7862; GFX940-LABEL: flat_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
7863; GFX940:       ; %bb.0:
7864; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7865; GFX940-NEXT:    flat_load_dword v3, v[0:1] offset:2046
7866; GFX940-NEXT:    s_mov_b64 s[0:1], 0
7867; GFX940-NEXT:    v_max_f16_e32 v2, v2, v2
7868; GFX940-NEXT:    s_mov_b32 s2, 0xffff0000
7869; GFX940-NEXT:  .LBB32_1: ; %atomicrmw.start
7870; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
7871; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7872; GFX940-NEXT:    v_mov_b32_e32 v5, v3
7873; GFX940-NEXT:    v_max_f16_e32 v3, v5, v5
7874; GFX940-NEXT:    v_min_f16_e32 v3, v3, v2
7875; GFX940-NEXT:    v_and_or_b32 v4, v5, s2, v3
7876; GFX940-NEXT:    buffer_wbl2 sc1
7877; GFX940-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2046 sc0
7878; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7879; GFX940-NEXT:    buffer_inv sc1
7880; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
7881; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
7882; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
7883; GFX940-NEXT:    s_cbranch_execnz .LBB32_1
7884; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
7885; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
7886; GFX940-NEXT:    v_mov_b32_e32 v0, v3
7887; GFX940-NEXT:    s_setpc_b64 s[30:31]
7888;
7889; GFX11-LABEL: flat_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
7890; GFX11:       ; %bb.0:
7891; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7892; GFX11-NEXT:    flat_load_b32 v3, v[0:1] offset:2046
7893; GFX11-NEXT:    v_max_f16_e32 v2, v2, v2
7894; GFX11-NEXT:    s_mov_b32 s0, 0
7895; GFX11-NEXT:  .LBB32_1: ; %atomicrmw.start
7896; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
7897; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7898; GFX11-NEXT:    v_mov_b32_e32 v4, v3
7899; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
7900; GFX11-NEXT:    v_max_f16_e32 v3, v4, v4
7901; GFX11-NEXT:    v_min_f16_e32 v3, v3, v2
7902; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
7903; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff, v3
7904; GFX11-NEXT:    v_and_or_b32 v3, 0xffff0000, v4, v3
7905; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
7906; GFX11-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 glc
7907; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7908; GFX11-NEXT:    buffer_gl1_inv
7909; GFX11-NEXT:    buffer_gl0_inv
7910; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
7911; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
7912; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
7913; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
7914; GFX11-NEXT:    s_cbranch_execnz .LBB32_1
7915; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
7916; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
7917; GFX11-NEXT:    v_mov_b32_e32 v0, v3
7918; GFX11-NEXT:    s_setpc_b64 s[30:31]
7919;
7920; GFX10-LABEL: flat_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
7921; GFX10:       ; %bb.0:
7922; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7923; GFX10-NEXT:    v_add_co_u32 v3, vcc_lo, 0x7fe, v0
7924; GFX10-NEXT:    v_add_co_ci_u32_e32 v4, vcc_lo, 0, v1, vcc_lo
7925; GFX10-NEXT:    v_max_f16_e32 v1, v2, v2
7926; GFX10-NEXT:    s_mov_b32 s4, 0
7927; GFX10-NEXT:    flat_load_dword v0, v[3:4]
7928; GFX10-NEXT:  .LBB32_1: ; %atomicrmw.start
7929; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
7930; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7931; GFX10-NEXT:    v_mov_b32_e32 v6, v0
7932; GFX10-NEXT:    v_max_f16_e32 v0, v6, v6
7933; GFX10-NEXT:    v_min_f16_e32 v0, v0, v1
7934; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff, v0
7935; GFX10-NEXT:    v_and_or_b32 v5, 0xffff0000, v6, v0
7936; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
7937; GFX10-NEXT:    flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
7938; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7939; GFX10-NEXT:    buffer_gl1_inv
7940; GFX10-NEXT:    buffer_gl0_inv
7941; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v6
7942; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
7943; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
7944; GFX10-NEXT:    s_cbranch_execnz .LBB32_1
7945; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
7946; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
7947; GFX10-NEXT:    s_setpc_b64 s[30:31]
7948;
7949; GFX90A-LABEL: flat_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
7950; GFX90A:       ; %bb.0:
7951; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7952; GFX90A-NEXT:    flat_load_dword v3, v[0:1] offset:2046
7953; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
7954; GFX90A-NEXT:    v_max_f16_e32 v2, v2, v2
7955; GFX90A-NEXT:    s_mov_b32 s6, 0xffff0000
7956; GFX90A-NEXT:  .LBB32_1: ; %atomicrmw.start
7957; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
7958; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7959; GFX90A-NEXT:    v_mov_b32_e32 v5, v3
7960; GFX90A-NEXT:    v_max_f16_e32 v3, v5, v5
7961; GFX90A-NEXT:    v_min_f16_e32 v3, v3, v2
7962; GFX90A-NEXT:    v_and_or_b32 v4, v5, s6, v3
7963; GFX90A-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2046 glc
7964; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7965; GFX90A-NEXT:    buffer_wbinvl1
7966; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
7967; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
7968; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
7969; GFX90A-NEXT:    s_cbranch_execnz .LBB32_1
7970; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
7971; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
7972; GFX90A-NEXT:    v_mov_b32_e32 v0, v3
7973; GFX90A-NEXT:    s_setpc_b64 s[30:31]
7974;
7975; GFX908-LABEL: flat_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
7976; GFX908:       ; %bb.0:
7977; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7978; GFX908-NEXT:    flat_load_dword v3, v[0:1] offset:2046
7979; GFX908-NEXT:    s_mov_b64 s[4:5], 0
7980; GFX908-NEXT:    v_max_f16_e32 v2, v2, v2
7981; GFX908-NEXT:    s_mov_b32 s6, 0xffff0000
7982; GFX908-NEXT:  .LBB32_1: ; %atomicrmw.start
7983; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
7984; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7985; GFX908-NEXT:    v_mov_b32_e32 v4, v3
7986; GFX908-NEXT:    v_max_f16_e32 v3, v4, v4
7987; GFX908-NEXT:    v_min_f16_e32 v3, v3, v2
7988; GFX908-NEXT:    v_and_or_b32 v3, v4, s6, v3
7989; GFX908-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2046 glc
7990; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
7991; GFX908-NEXT:    buffer_wbinvl1
7992; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
7993; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
7994; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
7995; GFX908-NEXT:    s_cbranch_execnz .LBB32_1
7996; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
7997; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
7998; GFX908-NEXT:    v_mov_b32_e32 v0, v3
7999; GFX908-NEXT:    s_setpc_b64 s[30:31]
8000;
8001; GFX8-LABEL: flat_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
8002; GFX8:       ; %bb.0:
8003; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8004; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x7fe, v0
8005; GFX8-NEXT:    v_addc_u32_e32 v4, vcc, 0, v1, vcc
8006; GFX8-NEXT:    flat_load_dword v0, v[3:4]
8007; GFX8-NEXT:    s_mov_b64 s[4:5], 0
8008; GFX8-NEXT:    v_max_f16_e32 v1, v2, v2
8009; GFX8-NEXT:  .LBB32_1: ; %atomicrmw.start
8010; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
8011; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8012; GFX8-NEXT:    v_mov_b32_e32 v6, v0
8013; GFX8-NEXT:    v_max_f16_e32 v0, v6, v6
8014; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v6
8015; GFX8-NEXT:    v_min_f16_e32 v0, v0, v1
8016; GFX8-NEXT:    v_or_b32_e32 v5, v2, v0
8017; GFX8-NEXT:    flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
8018; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8019; GFX8-NEXT:    buffer_wbinvl1
8020; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v6
8021; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
8022; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
8023; GFX8-NEXT:    s_cbranch_execnz .LBB32_1
8024; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
8025; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
8026; GFX8-NEXT:    s_setpc_b64 s[30:31]
8027;
8028; GFX7-LABEL: flat_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
8029; GFX7:       ; %bb.0:
8030; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8031; GFX7-NEXT:    v_add_i32_e32 v0, vcc, 0x7fe, v0
8032; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
8033; GFX7-NEXT:    flat_load_dword v3, v[0:1]
8034; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
8035; GFX7-NEXT:    s_mov_b64 s[4:5], 0
8036; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v2
8037; GFX7-NEXT:  .LBB32_1: ; %atomicrmw.start
8038; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
8039; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8040; GFX7-NEXT:    v_mov_b32_e32 v4, v3
8041; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v4
8042; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v4
8043; GFX7-NEXT:    v_min_f32_e32 v3, v3, v2
8044; GFX7-NEXT:    v_cvt_f16_f32_e32 v3, v3
8045; GFX7-NEXT:    v_or_b32_e32 v3, v5, v3
8046; GFX7-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
8047; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8048; GFX7-NEXT:    buffer_wbinvl1
8049; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
8050; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
8051; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
8052; GFX7-NEXT:    s_cbranch_execnz .LBB32_1
8053; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
8054; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
8055; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v3
8056; GFX7-NEXT:    s_setpc_b64 s[30:31]
; IR under test: element index 1023 of type half is byte offset 2046, which is
; the offset:2046 / 0x7fe constant seen in the expected output above.
8057  %gep = getelementptr half, ptr %ptr, i64 1023
8058  %result = atomicrmw fmin ptr %gep, half %val syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0
8059  ret half %result
8060}
8061
; No-return variant: f16 `atomicrmw fmin` at syncscope("agent"), seq_cst, on a
; flat pointer advanced by 1023 half elements (2046 bytes, 0x7fe), with an
; explicit `align 4` on the atomic and !amdgpu.no.fine.grained.memory attached.
; The atomic's result (%unused) is discarded and the function returns void.
;
; What the expected output below shows:
;  * Every target expands the operation into a compare-and-swap retry loop
;    (%atomicrmw.start) around a 32-bit flat cmpswap; the value returned by the
;    cmpswap is only used for the loop-exit compare and as the next iteration's
;    expected value, and nothing is written to v0 for a return value.
;  * The containing dword's upper half is preserved with a constant 0xffff0000
;    mask and the new f16 value is merged into the low half.
;  * The GFX12, GFX940, GFX11, GFX90A and GFX908 blocks carry the byte offset
;    in the memory instructions (offset:2046); the GFX10, GFX8 and GFX7 blocks
;    add 0x7fe to the address before the loop instead.
;  * The GFX7 block performs the min in f32 (v_cvt_f32_f16 / v_min_f32 /
;    v_cvt_f16_f32).
;
; NOTE(review): this name spells the suffix `__offset12b__align4_pos`, while the
; returning function just above it in this file uses `__offset12b_pos__align4`.
; Looks like an unintentional inconsistency; confirm before renaming, since
; the symbol appears in every -LABEL line.
;
; The assertions are autogenerated (see the NOTE on the first line of this
; file); regenerate them with the update script rather than editing by hand.
8062define void @flat_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory(ptr %ptr, half %val) #0 {
8063; GFX12-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
8064; GFX12:       ; %bb.0:
8065; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
8066; GFX12-NEXT:    s_wait_expcnt 0x0
8067; GFX12-NEXT:    s_wait_samplecnt 0x0
8068; GFX12-NEXT:    s_wait_bvhcnt 0x0
8069; GFX12-NEXT:    s_wait_kmcnt 0x0
8070; GFX12-NEXT:    flat_load_b32 v3, v[0:1] offset:2046
8071; GFX12-NEXT:    v_max_num_f16_e32 v4, v2, v2
8072; GFX12-NEXT:    s_mov_b32 s0, 0
8073; GFX12-NEXT:  .LBB33_1: ; %atomicrmw.start
8074; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
8075; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
8076; GFX12-NEXT:    v_max_num_f16_e32 v2, v3, v3
8077; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
8078; GFX12-NEXT:    v_min_num_f16_e32 v2, v2, v4
8079; GFX12-NEXT:    v_and_b32_e32 v2, 0xffff, v2
8080; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
8081; GFX12-NEXT:    v_and_or_b32 v2, 0xffff0000, v3, v2
8082; GFX12-NEXT:    s_wait_storecnt 0x0
8083; GFX12-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
8084; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
8085; GFX12-NEXT:    global_inv scope:SCOPE_DEV
8086; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
8087; GFX12-NEXT:    v_mov_b32_e32 v3, v2
8088; GFX12-NEXT:    s_wait_alu 0xfffe
8089; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
8090; GFX12-NEXT:    s_wait_alu 0xfffe
8091; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
8092; GFX12-NEXT:    s_cbranch_execnz .LBB33_1
8093; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
8094; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
8095; GFX12-NEXT:    s_wait_alu 0xfffe
8096; GFX12-NEXT:    s_setpc_b64 s[30:31]
8097;
8098; GFX940-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
8099; GFX940:       ; %bb.0:
8100; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8101; GFX940-NEXT:    flat_load_dword v3, v[0:1] offset:2046
8102; GFX940-NEXT:    s_mov_b64 s[0:1], 0
8103; GFX940-NEXT:    v_max_f16_e32 v4, v2, v2
8104; GFX940-NEXT:    s_mov_b32 s2, 0xffff0000
8105; GFX940-NEXT:  .LBB33_1: ; %atomicrmw.start
8106; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
8107; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8108; GFX940-NEXT:    v_max_f16_e32 v2, v3, v3
8109; GFX940-NEXT:    v_min_f16_e32 v2, v2, v4
8110; GFX940-NEXT:    v_and_or_b32 v2, v3, s2, v2
8111; GFX940-NEXT:    buffer_wbl2 sc1
8112; GFX940-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2046 sc0
8113; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8114; GFX940-NEXT:    buffer_inv sc1
8115; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
8116; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
8117; GFX940-NEXT:    v_mov_b32_e32 v3, v2
8118; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
8119; GFX940-NEXT:    s_cbranch_execnz .LBB33_1
8120; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
8121; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
8122; GFX940-NEXT:    s_setpc_b64 s[30:31]
8123;
8124; GFX11-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
8125; GFX11:       ; %bb.0:
8126; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8127; GFX11-NEXT:    flat_load_b32 v3, v[0:1] offset:2046
8128; GFX11-NEXT:    v_max_f16_e32 v4, v2, v2
8129; GFX11-NEXT:    s_mov_b32 s0, 0
8130; GFX11-NEXT:  .LBB33_1: ; %atomicrmw.start
8131; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
8132; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8133; GFX11-NEXT:    v_max_f16_e32 v2, v3, v3
8134; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
8135; GFX11-NEXT:    v_min_f16_e32 v2, v2, v4
8136; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff, v2
8137; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
8138; GFX11-NEXT:    v_and_or_b32 v2, 0xffff0000, v3, v2
8139; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
8140; GFX11-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2046 glc
8141; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8142; GFX11-NEXT:    buffer_gl1_inv
8143; GFX11-NEXT:    buffer_gl0_inv
8144; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
8145; GFX11-NEXT:    v_mov_b32_e32 v3, v2
8146; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
8147; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
8148; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
8149; GFX11-NEXT:    s_cbranch_execnz .LBB33_1
8150; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
8151; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
8152; GFX11-NEXT:    s_setpc_b64 s[30:31]
8153;
8154; GFX10-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
8155; GFX10:       ; %bb.0:
8156; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8157; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, 0x7fe, v0
8158; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
8159; GFX10-NEXT:    v_max_f16_e32 v4, v2, v2
8160; GFX10-NEXT:    s_mov_b32 s4, 0
8161; GFX10-NEXT:    flat_load_dword v3, v[0:1]
8162; GFX10-NEXT:  .LBB33_1: ; %atomicrmw.start
8163; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
8164; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8165; GFX10-NEXT:    v_max_f16_e32 v2, v3, v3
8166; GFX10-NEXT:    v_min_f16_e32 v2, v2, v4
8167; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff, v2
8168; GFX10-NEXT:    v_and_or_b32 v2, 0xffff0000, v3, v2
8169; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
8170; GFX10-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
8171; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8172; GFX10-NEXT:    buffer_gl1_inv
8173; GFX10-NEXT:    buffer_gl0_inv
8174; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
8175; GFX10-NEXT:    v_mov_b32_e32 v3, v2
8176; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
8177; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
8178; GFX10-NEXT:    s_cbranch_execnz .LBB33_1
8179; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
8180; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
8181; GFX10-NEXT:    s_setpc_b64 s[30:31]
8182;
8183; GFX90A-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
8184; GFX90A:       ; %bb.0:
8185; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8186; GFX90A-NEXT:    flat_load_dword v3, v[0:1] offset:2046
8187; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
8188; GFX90A-NEXT:    v_max_f16_e32 v4, v2, v2
8189; GFX90A-NEXT:    s_mov_b32 s6, 0xffff0000
8190; GFX90A-NEXT:  .LBB33_1: ; %atomicrmw.start
8191; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
8192; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8193; GFX90A-NEXT:    v_max_f16_e32 v2, v3, v3
8194; GFX90A-NEXT:    v_min_f16_e32 v2, v2, v4
8195; GFX90A-NEXT:    v_and_or_b32 v2, v3, s6, v2
8196; GFX90A-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2046 glc
8197; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8198; GFX90A-NEXT:    buffer_wbinvl1
8199; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
8200; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
8201; GFX90A-NEXT:    v_mov_b32_e32 v3, v2
8202; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
8203; GFX90A-NEXT:    s_cbranch_execnz .LBB33_1
8204; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
8205; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
8206; GFX90A-NEXT:    s_setpc_b64 s[30:31]
8207;
8208; GFX908-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
8209; GFX908:       ; %bb.0:
8210; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8211; GFX908-NEXT:    flat_load_dword v3, v[0:1] offset:2046
8212; GFX908-NEXT:    s_mov_b64 s[4:5], 0
8213; GFX908-NEXT:    v_max_f16_e32 v4, v2, v2
8214; GFX908-NEXT:    s_mov_b32 s6, 0xffff0000
8215; GFX908-NEXT:  .LBB33_1: ; %atomicrmw.start
8216; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
8217; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8218; GFX908-NEXT:    v_max_f16_e32 v2, v3, v3
8219; GFX908-NEXT:    v_min_f16_e32 v2, v2, v4
8220; GFX908-NEXT:    v_and_or_b32 v2, v3, s6, v2
8221; GFX908-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2046 glc
8222; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8223; GFX908-NEXT:    buffer_wbinvl1
8224; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
8225; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
8226; GFX908-NEXT:    v_mov_b32_e32 v3, v2
8227; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
8228; GFX908-NEXT:    s_cbranch_execnz .LBB33_1
8229; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
8230; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
8231; GFX908-NEXT:    s_setpc_b64 s[30:31]
8232;
8233; GFX8-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
8234; GFX8:       ; %bb.0:
8235; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8236; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 0x7fe, v0
8237; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
8238; GFX8-NEXT:    flat_load_dword v3, v[0:1]
8239; GFX8-NEXT:    s_mov_b64 s[4:5], 0
8240; GFX8-NEXT:    v_max_f16_e32 v4, v2, v2
8241; GFX8-NEXT:  .LBB33_1: ; %atomicrmw.start
8242; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
8243; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8244; GFX8-NEXT:    v_max_f16_e32 v2, v3, v3
8245; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v3
8246; GFX8-NEXT:    v_min_f16_e32 v2, v2, v4
8247; GFX8-NEXT:    v_or_b32_e32 v2, v5, v2
8248; GFX8-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
8249; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8250; GFX8-NEXT:    buffer_wbinvl1
8251; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
8252; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
8253; GFX8-NEXT:    v_mov_b32_e32 v3, v2
8254; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
8255; GFX8-NEXT:    s_cbranch_execnz .LBB33_1
8256; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
8257; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
8258; GFX8-NEXT:    s_setpc_b64 s[30:31]
8259;
8260; GFX7-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
8261; GFX7:       ; %bb.0:
8262; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8263; GFX7-NEXT:    v_add_i32_e32 v0, vcc, 0x7fe, v0
8264; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
8265; GFX7-NEXT:    flat_load_dword v3, v[0:1]
8266; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
8267; GFX7-NEXT:    s_mov_b64 s[4:5], 0
8268; GFX7-NEXT:    v_cvt_f32_f16_e32 v4, v2
8269; GFX7-NEXT:  .LBB33_1: ; %atomicrmw.start
8270; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
8271; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8272; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v3
8273; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v3
8274; GFX7-NEXT:    v_min_f32_e32 v2, v2, v4
8275; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
8276; GFX7-NEXT:    v_or_b32_e32 v2, v5, v2
8277; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
8278; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8279; GFX7-NEXT:    buffer_wbinvl1
8280; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
8281; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
8282; GFX7-NEXT:    v_mov_b32_e32 v3, v2
8283; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
8284; GFX7-NEXT:    s_cbranch_execnz .LBB33_1
8285; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
8286; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
8287; GFX7-NEXT:    s_setpc_b64 s[30:31]
; IR under test: element index 1023 of type half is byte offset 2046, which is
; the offset:2046 / 0x7fe constant seen in the expected output above.
8288  %gep = getelementptr half, ptr %ptr, i64 1023
8289  %unused = atomicrmw fmin ptr %gep, half %val syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0
8290  ret void
8291}
8292
8293define half @flat_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr %ptr, half %val) #0 {
8294; GFX12-LABEL: flat_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
8295; GFX12:       ; %bb.0:
8296; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
8297; GFX12-NEXT:    s_wait_expcnt 0x0
8298; GFX12-NEXT:    s_wait_samplecnt 0x0
8299; GFX12-NEXT:    s_wait_bvhcnt 0x0
8300; GFX12-NEXT:    s_wait_kmcnt 0x0
8301; GFX12-NEXT:    v_add_co_u32 v3, vcc_lo, 0x7fe, v0
8302; GFX12-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
8303; GFX12-NEXT:    v_max_num_f16_e32 v2, v2, v2
8304; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
8305; GFX12-NEXT:    v_and_b32_e32 v0, -4, v3
8306; GFX12-NEXT:    v_and_b32_e32 v3, 3, v3
8307; GFX12-NEXT:    s_mov_b32 s0, 0
8308; GFX12-NEXT:    flat_load_b32 v5, v[0:1]
8309; GFX12-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
8310; GFX12-NEXT:    v_lshlrev_b32_e64 v4, v3, 0xffff
8311; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
8312; GFX12-NEXT:    v_not_b32_e32 v4, v4
8313; GFX12-NEXT:  .LBB34_1: ; %atomicrmw.start
8314; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
8315; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
8316; GFX12-NEXT:    v_mov_b32_e32 v6, v5
8317; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
8318; GFX12-NEXT:    v_lshrrev_b32_e32 v5, v3, v6
8319; GFX12-NEXT:    v_max_num_f16_e32 v5, v5, v5
8320; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
8321; GFX12-NEXT:    v_min_num_f16_e32 v5, v5, v2
8322; GFX12-NEXT:    v_and_b32_e32 v5, 0xffff, v5
8323; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
8324; GFX12-NEXT:    v_lshlrev_b32_e32 v5, v3, v5
8325; GFX12-NEXT:    v_and_or_b32 v5, v6, v4, v5
8326; GFX12-NEXT:    global_wb scope:SCOPE_SYS
8327; GFX12-NEXT:    s_wait_storecnt 0x0
8328; GFX12-NEXT:    flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
8329; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
8330; GFX12-NEXT:    global_inv scope:SCOPE_SYS
8331; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v5, v6
8332; GFX12-NEXT:    s_wait_alu 0xfffe
8333; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
8334; GFX12-NEXT:    s_wait_alu 0xfffe
8335; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
8336; GFX12-NEXT:    s_cbranch_execnz .LBB34_1
8337; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
8338; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
8339; GFX12-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
8340; GFX12-NEXT:    s_wait_alu 0xfffe
8341; GFX12-NEXT:    s_setpc_b64 s[30:31]
8342;
8343; GFX940-LABEL: flat_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
8344; GFX940:       ; %bb.0:
8345; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8346; GFX940-NEXT:    s_mov_b64 s[0:1], 0x7fe
8347; GFX940-NEXT:    v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
8348; GFX940-NEXT:    v_and_b32_e32 v0, -4, v4
8349; GFX940-NEXT:    v_mov_b32_e32 v1, v5
8350; GFX940-NEXT:    flat_load_dword v5, v[0:1]
8351; GFX940-NEXT:    v_and_b32_e32 v3, 3, v4
8352; GFX940-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
8353; GFX940-NEXT:    s_mov_b32 s0, 0xffff
8354; GFX940-NEXT:    v_lshlrev_b32_e64 v4, v3, s0
8355; GFX940-NEXT:    v_not_b32_e32 v4, v4
8356; GFX940-NEXT:    s_mov_b64 s[0:1], 0
8357; GFX940-NEXT:    v_max_f16_e32 v2, v2, v2
8358; GFX940-NEXT:  .LBB34_1: ; %atomicrmw.start
8359; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
8360; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8361; GFX940-NEXT:    v_mov_b32_e32 v7, v5
8362; GFX940-NEXT:    v_lshrrev_b32_e32 v5, v3, v7
8363; GFX940-NEXT:    v_max_f16_e32 v5, v5, v5
8364; GFX940-NEXT:    v_min_f16_e32 v5, v5, v2
8365; GFX940-NEXT:    v_lshlrev_b32_e32 v5, v3, v5
8366; GFX940-NEXT:    v_and_or_b32 v6, v7, v4, v5
8367; GFX940-NEXT:    buffer_wbl2 sc0 sc1
8368; GFX940-NEXT:    flat_atomic_cmpswap v5, v[0:1], v[6:7] sc0 sc1
8369; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8370; GFX940-NEXT:    buffer_inv sc0 sc1
8371; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v7
8372; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
8373; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
8374; GFX940-NEXT:    s_cbranch_execnz .LBB34_1
8375; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
8376; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
8377; GFX940-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
8378; GFX940-NEXT:    s_setpc_b64 s[30:31]
8379;
8380; GFX11-LABEL: flat_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
8381; GFX11:       ; %bb.0:
8382; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8383; GFX11-NEXT:    v_add_co_u32 v3, vcc_lo, 0x7fe, v0
8384; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
8385; GFX11-NEXT:    v_max_f16_e32 v2, v2, v2
8386; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
8387; GFX11-NEXT:    v_and_b32_e32 v0, -4, v3
8388; GFX11-NEXT:    v_and_b32_e32 v3, 3, v3
8389; GFX11-NEXT:    s_mov_b32 s0, 0
8390; GFX11-NEXT:    flat_load_b32 v5, v[0:1]
8391; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
8392; GFX11-NEXT:    v_lshlrev_b32_e64 v4, v3, 0xffff
8393; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
8394; GFX11-NEXT:    v_not_b32_e32 v4, v4
8395; GFX11-NEXT:  .LBB34_1: ; %atomicrmw.start
8396; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
8397; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8398; GFX11-NEXT:    v_mov_b32_e32 v6, v5
8399; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
8400; GFX11-NEXT:    v_lshrrev_b32_e32 v5, v3, v6
8401; GFX11-NEXT:    v_max_f16_e32 v5, v5, v5
8402; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
8403; GFX11-NEXT:    v_min_f16_e32 v5, v5, v2
8404; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff, v5
8405; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
8406; GFX11-NEXT:    v_lshlrev_b32_e32 v5, v3, v5
8407; GFX11-NEXT:    v_and_or_b32 v5, v6, v4, v5
8408; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
8409; GFX11-NEXT:    flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc
8410; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8411; GFX11-NEXT:    buffer_gl1_inv
8412; GFX11-NEXT:    buffer_gl0_inv
8413; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v5, v6
8414; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
8415; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
8416; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
8417; GFX11-NEXT:    s_cbranch_execnz .LBB34_1
8418; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
8419; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
8420; GFX11-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
8421; GFX11-NEXT:    s_setpc_b64 s[30:31]
8422;
8423; GFX10-LABEL: flat_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
8424; GFX10:       ; %bb.0:
8425; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8426; GFX10-NEXT:    v_add_co_u32 v3, vcc_lo, 0x7fe, v0
8427; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
8428; GFX10-NEXT:    v_max_f16_e32 v2, v2, v2
8429; GFX10-NEXT:    v_and_b32_e32 v0, -4, v3
8430; GFX10-NEXT:    v_and_b32_e32 v3, 3, v3
8431; GFX10-NEXT:    s_mov_b32 s4, 0
8432; GFX10-NEXT:    flat_load_dword v5, v[0:1]
8433; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
8434; GFX10-NEXT:    v_lshlrev_b32_e64 v4, v3, 0xffff
8435; GFX10-NEXT:    v_not_b32_e32 v4, v4
8436; GFX10-NEXT:  .LBB34_1: ; %atomicrmw.start
8437; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
8438; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8439; GFX10-NEXT:    v_mov_b32_e32 v6, v5
8440; GFX10-NEXT:    v_lshrrev_b32_e32 v5, v3, v6
8441; GFX10-NEXT:    v_max_f16_e32 v5, v5, v5
8442; GFX10-NEXT:    v_min_f16_e32 v5, v5, v2
8443; GFX10-NEXT:    v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
8444; GFX10-NEXT:    v_and_or_b32 v5, v6, v4, v5
8445; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
8446; GFX10-NEXT:    flat_atomic_cmpswap v5, v[0:1], v[5:6] glc
8447; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8448; GFX10-NEXT:    buffer_gl1_inv
8449; GFX10-NEXT:    buffer_gl0_inv
8450; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v5, v6
8451; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
8452; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
8453; GFX10-NEXT:    s_cbranch_execnz .LBB34_1
8454; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
8455; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
8456; GFX10-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
8457; GFX10-NEXT:    s_setpc_b64 s[30:31]
8458;
8459; GFX90A-LABEL: flat_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
8460; GFX90A:       ; %bb.0:
8461; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8462; GFX90A-NEXT:    v_add_co_u32_e32 v3, vcc, 0x7fe, v0
8463; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
8464; GFX90A-NEXT:    v_and_b32_e32 v0, -4, v3
8465; GFX90A-NEXT:    flat_load_dword v5, v[0:1]
8466; GFX90A-NEXT:    v_and_b32_e32 v3, 3, v3
8467; GFX90A-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
8468; GFX90A-NEXT:    s_mov_b32 s4, 0xffff
8469; GFX90A-NEXT:    v_lshlrev_b32_e64 v4, v3, s4
8470; GFX90A-NEXT:    v_not_b32_e32 v4, v4
8471; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
8472; GFX90A-NEXT:    v_max_f16_e32 v2, v2, v2
8473; GFX90A-NEXT:  .LBB34_1: ; %atomicrmw.start
8474; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
8475; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8476; GFX90A-NEXT:    v_mov_b32_e32 v7, v5
8477; GFX90A-NEXT:    v_lshrrev_b32_e32 v5, v3, v7
8478; GFX90A-NEXT:    v_max_f16_e32 v5, v5, v5
8479; GFX90A-NEXT:    v_min_f16_e32 v5, v5, v2
8480; GFX90A-NEXT:    v_lshlrev_b32_e32 v5, v3, v5
8481; GFX90A-NEXT:    v_and_or_b32 v6, v7, v4, v5
8482; GFX90A-NEXT:    buffer_wbl2
8483; GFX90A-NEXT:    flat_atomic_cmpswap v5, v[0:1], v[6:7] glc
8484; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8485; GFX90A-NEXT:    buffer_invl2
8486; GFX90A-NEXT:    buffer_wbinvl1
8487; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v7
8488; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
8489; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
8490; GFX90A-NEXT:    s_cbranch_execnz .LBB34_1
8491; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
8492; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
8493; GFX90A-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
8494; GFX90A-NEXT:    s_setpc_b64 s[30:31]
8495;
8496; GFX908-LABEL: flat_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
8497; GFX908:       ; %bb.0:
8498; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8499; GFX908-NEXT:    v_add_co_u32_e32 v3, vcc, 0x7fe, v0
8500; GFX908-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
8501; GFX908-NEXT:    v_and_b32_e32 v0, -4, v3
8502; GFX908-NEXT:    flat_load_dword v5, v[0:1]
8503; GFX908-NEXT:    v_and_b32_e32 v3, 3, v3
8504; GFX908-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
8505; GFX908-NEXT:    s_mov_b32 s4, 0xffff
8506; GFX908-NEXT:    v_lshlrev_b32_e64 v4, v3, s4
8507; GFX908-NEXT:    v_not_b32_e32 v4, v4
8508; GFX908-NEXT:    s_mov_b64 s[4:5], 0
8509; GFX908-NEXT:    v_max_f16_e32 v2, v2, v2
8510; GFX908-NEXT:  .LBB34_1: ; %atomicrmw.start
8511; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
8512; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8513; GFX908-NEXT:    v_mov_b32_e32 v6, v5
8514; GFX908-NEXT:    v_lshrrev_b32_e32 v5, v3, v6
8515; GFX908-NEXT:    v_max_f16_e32 v5, v5, v5
8516; GFX908-NEXT:    v_min_f16_e32 v5, v5, v2
8517; GFX908-NEXT:    v_lshlrev_b32_e32 v5, v3, v5
8518; GFX908-NEXT:    v_and_or_b32 v5, v6, v4, v5
8519; GFX908-NEXT:    flat_atomic_cmpswap v5, v[0:1], v[5:6] glc
8520; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8521; GFX908-NEXT:    buffer_wbinvl1
8522; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v6
8523; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
8524; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
8525; GFX908-NEXT:    s_cbranch_execnz .LBB34_1
8526; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
8527; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
8528; GFX908-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
8529; GFX908-NEXT:    s_setpc_b64 s[30:31]
8530;
8531; GFX8-LABEL: flat_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
8532; GFX8:       ; %bb.0:
8533; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8534; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x7fe, v0
8535; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
8536; GFX8-NEXT:    v_and_b32_e32 v0, -4, v3
8537; GFX8-NEXT:    flat_load_dword v5, v[0:1]
8538; GFX8-NEXT:    v_and_b32_e32 v3, 3, v3
8539; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
8540; GFX8-NEXT:    s_mov_b32 s4, 0xffff
8541; GFX8-NEXT:    v_lshlrev_b32_e64 v4, v3, s4
8542; GFX8-NEXT:    v_not_b32_e32 v4, v4
8543; GFX8-NEXT:    s_mov_b64 s[4:5], 0
8544; GFX8-NEXT:    v_max_f16_e32 v2, v2, v2
8545; GFX8-NEXT:  .LBB34_1: ; %atomicrmw.start
8546; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
8547; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8548; GFX8-NEXT:    v_mov_b32_e32 v6, v5
8549; GFX8-NEXT:    v_lshrrev_b32_e32 v5, v3, v6
8550; GFX8-NEXT:    v_max_f16_e32 v5, v5, v5
8551; GFX8-NEXT:    v_min_f16_e32 v5, v5, v2
8552; GFX8-NEXT:    v_and_b32_e32 v7, v6, v4
8553; GFX8-NEXT:    v_lshlrev_b32_e32 v5, v3, v5
8554; GFX8-NEXT:    v_or_b32_e32 v5, v7, v5
8555; GFX8-NEXT:    flat_atomic_cmpswap v5, v[0:1], v[5:6] glc
8556; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8557; GFX8-NEXT:    buffer_wbinvl1
8558; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v6
8559; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
8560; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
8561; GFX8-NEXT:    s_cbranch_execnz .LBB34_1
8562; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
8563; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
8564; GFX8-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
8565; GFX8-NEXT:    s_setpc_b64 s[30:31]
8566;
8567; GFX7-LABEL: flat_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
8568; GFX7:       ; %bb.0:
8569; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8570; GFX7-NEXT:    v_add_i32_e32 v3, vcc, 0x7fe, v0
8571; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
8572; GFX7-NEXT:    v_and_b32_e32 v0, -4, v3
8573; GFX7-NEXT:    flat_load_dword v5, v[0:1]
8574; GFX7-NEXT:    v_cvt_f16_f32_e32 v4, v2
8575; GFX7-NEXT:    v_and_b32_e32 v2, 3, v3
8576; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 3, v2
8577; GFX7-NEXT:    s_mov_b64 s[4:5], 0
8578; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v4
8579; GFX7-NEXT:    v_lshl_b32_e32 v4, 0xffff, v2
8580; GFX7-NEXT:    v_not_b32_e32 v4, v4
8581; GFX7-NEXT:  .LBB34_1: ; %atomicrmw.start
8582; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
8583; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8584; GFX7-NEXT:    v_mov_b32_e32 v6, v5
8585; GFX7-NEXT:    v_lshrrev_b32_e32 v5, v2, v6
8586; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v5
8587; GFX7-NEXT:    v_and_b32_e32 v7, v6, v4
8588; GFX7-NEXT:    v_min_f32_e32 v5, v5, v3
8589; GFX7-NEXT:    v_cvt_f16_f32_e32 v5, v5
8590; GFX7-NEXT:    v_lshlrev_b32_e32 v5, v2, v5
8591; GFX7-NEXT:    v_or_b32_e32 v5, v7, v5
8592; GFX7-NEXT:    flat_atomic_cmpswap v5, v[0:1], v[5:6] glc
8593; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8594; GFX7-NEXT:    buffer_wbinvl1
8595; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v6
8596; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
8597; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
8598; GFX7-NEXT:    s_cbranch_execnz .LBB34_1
8599; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
8600; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
8601; GFX7-NEXT:    v_lshrrev_b32_e32 v0, v2, v5
8602; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
8603; GFX7-NEXT:    s_setpc_b64 s[30:31]
8604  %gep = getelementptr half, ptr %ptr, i64 1023
8605  %result = atomicrmw fmin ptr %gep, half %val seq_cst, !amdgpu.no.fine.grained.memory !0
8606  ret half %result
8607}
8608
; Test: seq_cst atomicrmw fmin on a half located 1023 elements past %ptr, with
; the result unused and !amdgpu.no.fine.grained.memory attached. No syncscope
; is given on the atomicrmw. In every checked output below the operation is a
; compare-and-swap retry loop on the 4-byte-aligned dword that contains the
; half: the address is masked with -4, and the low two address bits times 8
; give the bit shift used to extract and re-insert the 16-bit value.
; The assertions are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
8609define void @flat_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr %ptr, half %val) #0 {
8610; GFX12-LABEL: flat_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
8611; GFX12:       ; %bb.0:
8612; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
8613; GFX12-NEXT:    s_wait_expcnt 0x0
8614; GFX12-NEXT:    s_wait_samplecnt 0x0
8615; GFX12-NEXT:    s_wait_bvhcnt 0x0
8616; GFX12-NEXT:    s_wait_kmcnt 0x0
8617; GFX12-NEXT:    v_add_co_u32 v4, vcc_lo, 0x7fe, v0
8618; GFX12-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
8619; GFX12-NEXT:    v_max_num_f16_e32 v6, v2, v2
8620; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
8621; GFX12-NEXT:    v_and_b32_e32 v0, -4, v4
8622; GFX12-NEXT:    v_and_b32_e32 v4, 3, v4
8623; GFX12-NEXT:    s_mov_b32 s0, 0
8624; GFX12-NEXT:    flat_load_b32 v3, v[0:1]
8625; GFX12-NEXT:    v_lshlrev_b32_e32 v4, 3, v4
8626; GFX12-NEXT:    v_lshlrev_b32_e64 v5, v4, 0xffff
8627; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
8628; GFX12-NEXT:    v_not_b32_e32 v5, v5
8629; GFX12-NEXT:  .LBB35_1: ; %atomicrmw.start
8630; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
8631; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
8632; GFX12-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
8633; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
8634; GFX12-NEXT:    v_max_num_f16_e32 v2, v2, v2
8635; GFX12-NEXT:    v_min_num_f16_e32 v2, v2, v6
8636; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
8637; GFX12-NEXT:    v_and_b32_e32 v2, 0xffff, v2
8638; GFX12-NEXT:    v_lshlrev_b32_e32 v2, v4, v2
8639; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
8640; GFX12-NEXT:    v_and_or_b32 v2, v3, v5, v2
8641; GFX12-NEXT:    global_wb scope:SCOPE_SYS
8642; GFX12-NEXT:    s_wait_storecnt 0x0
8643; GFX12-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
8644; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
8645; GFX12-NEXT:    global_inv scope:SCOPE_SYS
8646; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
8647; GFX12-NEXT:    v_mov_b32_e32 v3, v2
8648; GFX12-NEXT:    s_wait_alu 0xfffe
8649; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
8650; GFX12-NEXT:    s_wait_alu 0xfffe
8651; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
8652; GFX12-NEXT:    s_cbranch_execnz .LBB35_1
8653; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
8654; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
8655; GFX12-NEXT:    s_wait_alu 0xfffe
8656; GFX12-NEXT:    s_setpc_b64 s[30:31]
8657;
8658; GFX940-LABEL: flat_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
8659; GFX940:       ; %bb.0:
8660; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8661; GFX940-NEXT:    s_mov_b64 s[0:1], 0x7fe
8662; GFX940-NEXT:    v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
8663; GFX940-NEXT:    v_and_b32_e32 v0, -4, v4
8664; GFX940-NEXT:    v_mov_b32_e32 v1, v5
8665; GFX940-NEXT:    flat_load_dword v3, v[0:1]
8666; GFX940-NEXT:    v_and_b32_e32 v4, 3, v4
8667; GFX940-NEXT:    v_lshlrev_b32_e32 v4, 3, v4
8668; GFX940-NEXT:    s_mov_b32 s0, 0xffff
8669; GFX940-NEXT:    v_lshlrev_b32_e64 v5, v4, s0
8670; GFX940-NEXT:    v_not_b32_e32 v5, v5
8671; GFX940-NEXT:    s_mov_b64 s[0:1], 0
8672; GFX940-NEXT:    v_max_f16_e32 v6, v2, v2
8673; GFX940-NEXT:  .LBB35_1: ; %atomicrmw.start
8674; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
8675; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8676; GFX940-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
8677; GFX940-NEXT:    v_max_f16_e32 v2, v2, v2
8678; GFX940-NEXT:    v_min_f16_e32 v2, v2, v6
8679; GFX940-NEXT:    v_lshlrev_b32_e32 v2, v4, v2
8680; GFX940-NEXT:    v_and_or_b32 v2, v3, v5, v2
8681; GFX940-NEXT:    buffer_wbl2 sc0 sc1
8682; GFX940-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 sc1
8683; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8684; GFX940-NEXT:    buffer_inv sc0 sc1
8685; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
8686; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
8687; GFX940-NEXT:    v_mov_b32_e32 v3, v2
8688; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
8689; GFX940-NEXT:    s_cbranch_execnz .LBB35_1
8690; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
8691; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
8692; GFX940-NEXT:    s_setpc_b64 s[30:31]
8693;
8694; GFX11-LABEL: flat_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
8695; GFX11:       ; %bb.0:
8696; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8697; GFX11-NEXT:    v_add_co_u32 v4, vcc_lo, 0x7fe, v0
8698; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
8699; GFX11-NEXT:    v_max_f16_e32 v6, v2, v2
8700; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
8701; GFX11-NEXT:    v_and_b32_e32 v0, -4, v4
8702; GFX11-NEXT:    v_and_b32_e32 v4, 3, v4
8703; GFX11-NEXT:    s_mov_b32 s0, 0
8704; GFX11-NEXT:    flat_load_b32 v3, v[0:1]
8705; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 3, v4
8706; GFX11-NEXT:    v_lshlrev_b32_e64 v5, v4, 0xffff
8707; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
8708; GFX11-NEXT:    v_not_b32_e32 v5, v5
8709; GFX11-NEXT:  .LBB35_1: ; %atomicrmw.start
8710; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
8711; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8712; GFX11-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
8713; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
8714; GFX11-NEXT:    v_max_f16_e32 v2, v2, v2
8715; GFX11-NEXT:    v_min_f16_e32 v2, v2, v6
8716; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
8717; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff, v2
8718; GFX11-NEXT:    v_lshlrev_b32_e32 v2, v4, v2
8719; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
8720; GFX11-NEXT:    v_and_or_b32 v2, v3, v5, v2
8721; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
8722; GFX11-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc
8723; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8724; GFX11-NEXT:    buffer_gl1_inv
8725; GFX11-NEXT:    buffer_gl0_inv
8726; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
8727; GFX11-NEXT:    v_mov_b32_e32 v3, v2
8728; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
8729; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
8730; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
8731; GFX11-NEXT:    s_cbranch_execnz .LBB35_1
8732; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
8733; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
8734; GFX11-NEXT:    s_setpc_b64 s[30:31]
8735;
8736; GFX10-LABEL: flat_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
8737; GFX10:       ; %bb.0:
8738; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8739; GFX10-NEXT:    v_add_co_u32 v4, vcc_lo, 0x7fe, v0
8740; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
8741; GFX10-NEXT:    v_max_f16_e32 v6, v2, v2
8742; GFX10-NEXT:    v_and_b32_e32 v0, -4, v4
8743; GFX10-NEXT:    v_and_b32_e32 v4, 3, v4
8744; GFX10-NEXT:    s_mov_b32 s4, 0
8745; GFX10-NEXT:    flat_load_dword v3, v[0:1]
8746; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 3, v4
8747; GFX10-NEXT:    v_lshlrev_b32_e64 v5, v4, 0xffff
8748; GFX10-NEXT:    v_not_b32_e32 v5, v5
8749; GFX10-NEXT:  .LBB35_1: ; %atomicrmw.start
8750; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
8751; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8752; GFX10-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
8753; GFX10-NEXT:    v_max_f16_e32 v2, v2, v2
8754; GFX10-NEXT:    v_min_f16_e32 v2, v2, v6
8755; GFX10-NEXT:    v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
8756; GFX10-NEXT:    v_and_or_b32 v2, v3, v5, v2
8757; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
8758; GFX10-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
8759; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8760; GFX10-NEXT:    buffer_gl1_inv
8761; GFX10-NEXT:    buffer_gl0_inv
8762; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
8763; GFX10-NEXT:    v_mov_b32_e32 v3, v2
8764; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
8765; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
8766; GFX10-NEXT:    s_cbranch_execnz .LBB35_1
8767; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
8768; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
8769; GFX10-NEXT:    s_setpc_b64 s[30:31]
8770;
8771; GFX90A-LABEL: flat_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
8772; GFX90A:       ; %bb.0:
8773; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8774; GFX90A-NEXT:    v_add_co_u32_e32 v4, vcc, 0x7fe, v0
8775; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
8776; GFX90A-NEXT:    v_and_b32_e32 v0, -4, v4
8777; GFX90A-NEXT:    flat_load_dword v3, v[0:1]
8778; GFX90A-NEXT:    v_and_b32_e32 v4, 3, v4
8779; GFX90A-NEXT:    v_lshlrev_b32_e32 v4, 3, v4
8780; GFX90A-NEXT:    s_mov_b32 s4, 0xffff
8781; GFX90A-NEXT:    v_lshlrev_b32_e64 v5, v4, s4
8782; GFX90A-NEXT:    v_not_b32_e32 v5, v5
8783; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
8784; GFX90A-NEXT:    v_max_f16_e32 v6, v2, v2
8785; GFX90A-NEXT:  .LBB35_1: ; %atomicrmw.start
8786; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
8787; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8788; GFX90A-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
8789; GFX90A-NEXT:    v_max_f16_e32 v2, v2, v2
8790; GFX90A-NEXT:    v_min_f16_e32 v2, v2, v6
8791; GFX90A-NEXT:    v_lshlrev_b32_e32 v2, v4, v2
8792; GFX90A-NEXT:    v_and_or_b32 v2, v3, v5, v2
8793; GFX90A-NEXT:    buffer_wbl2
8794; GFX90A-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
8795; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8796; GFX90A-NEXT:    buffer_invl2
8797; GFX90A-NEXT:    buffer_wbinvl1
8798; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
8799; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
8800; GFX90A-NEXT:    v_mov_b32_e32 v3, v2
8801; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
8802; GFX90A-NEXT:    s_cbranch_execnz .LBB35_1
8803; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
8804; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
8805; GFX90A-NEXT:    s_setpc_b64 s[30:31]
8806;
8807; GFX908-LABEL: flat_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
8808; GFX908:       ; %bb.0:
8809; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8810; GFX908-NEXT:    v_add_co_u32_e32 v4, vcc, 0x7fe, v0
8811; GFX908-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
8812; GFX908-NEXT:    v_and_b32_e32 v0, -4, v4
8813; GFX908-NEXT:    flat_load_dword v3, v[0:1]
8814; GFX908-NEXT:    v_and_b32_e32 v4, 3, v4
8815; GFX908-NEXT:    v_lshlrev_b32_e32 v4, 3, v4
8816; GFX908-NEXT:    s_mov_b32 s4, 0xffff
8817; GFX908-NEXT:    v_lshlrev_b32_e64 v5, v4, s4
8818; GFX908-NEXT:    v_not_b32_e32 v5, v5
8819; GFX908-NEXT:    s_mov_b64 s[4:5], 0
8820; GFX908-NEXT:    v_max_f16_e32 v6, v2, v2
8821; GFX908-NEXT:  .LBB35_1: ; %atomicrmw.start
8822; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
8823; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8824; GFX908-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
8825; GFX908-NEXT:    v_max_f16_e32 v2, v2, v2
8826; GFX908-NEXT:    v_min_f16_e32 v2, v2, v6
8827; GFX908-NEXT:    v_lshlrev_b32_e32 v2, v4, v2
8828; GFX908-NEXT:    v_and_or_b32 v2, v3, v5, v2
8829; GFX908-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
8830; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8831; GFX908-NEXT:    buffer_wbinvl1
8832; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
8833; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
8834; GFX908-NEXT:    v_mov_b32_e32 v3, v2
8835; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
8836; GFX908-NEXT:    s_cbranch_execnz .LBB35_1
8837; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
8838; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
8839; GFX908-NEXT:    s_setpc_b64 s[30:31]
8840;
8841; GFX8-LABEL: flat_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
8842; GFX8:       ; %bb.0:
8843; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8844; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 0x7fe, v0
8845; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
8846; GFX8-NEXT:    v_and_b32_e32 v0, -4, v4
8847; GFX8-NEXT:    flat_load_dword v3, v[0:1]
8848; GFX8-NEXT:    v_and_b32_e32 v4, 3, v4
8849; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 3, v4
8850; GFX8-NEXT:    s_mov_b32 s4, 0xffff
8851; GFX8-NEXT:    v_lshlrev_b32_e64 v5, v4, s4
8852; GFX8-NEXT:    v_not_b32_e32 v5, v5
8853; GFX8-NEXT:    s_mov_b64 s[4:5], 0
8854; GFX8-NEXT:    v_max_f16_e32 v6, v2, v2
8855; GFX8-NEXT:  .LBB35_1: ; %atomicrmw.start
8856; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
8857; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8858; GFX8-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
8859; GFX8-NEXT:    v_max_f16_e32 v2, v2, v2
8860; GFX8-NEXT:    v_min_f16_e32 v2, v2, v6
8861; GFX8-NEXT:    v_and_b32_e32 v7, v3, v5
8862; GFX8-NEXT:    v_lshlrev_b32_e32 v2, v4, v2
8863; GFX8-NEXT:    v_or_b32_e32 v2, v7, v2
8864; GFX8-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
8865; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8866; GFX8-NEXT:    buffer_wbinvl1
8867; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
8868; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
8869; GFX8-NEXT:    v_mov_b32_e32 v3, v2
8870; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
8871; GFX8-NEXT:    s_cbranch_execnz .LBB35_1
8872; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
8873; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
8874; GFX8-NEXT:    s_setpc_b64 s[30:31]
8875;
8876; GFX7-LABEL: flat_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
8877; GFX7:       ; %bb.0:
8878; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8879; GFX7-NEXT:    v_add_i32_e32 v4, vcc, 0x7fe, v0
8880; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
8881; GFX7-NEXT:    v_and_b32_e32 v0, -4, v4
8882; GFX7-NEXT:    flat_load_dword v3, v[0:1]
8883; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
8884; GFX7-NEXT:    v_and_b32_e32 v4, 3, v4
8885; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 3, v4
8886; GFX7-NEXT:    s_mov_b64 s[4:5], 0
8887; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v2
8888; GFX7-NEXT:    v_lshl_b32_e32 v2, 0xffff, v4
8889; GFX7-NEXT:    v_not_b32_e32 v6, v2
8890; GFX7-NEXT:  .LBB35_1: ; %atomicrmw.start
8891; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
8892; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8893; GFX7-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
8894; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v2
8895; GFX7-NEXT:    v_and_b32_e32 v7, v3, v6
8896; GFX7-NEXT:    v_min_f32_e32 v2, v2, v5
8897; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
8898; GFX7-NEXT:    v_lshlrev_b32_e32 v2, v4, v2
8899; GFX7-NEXT:    v_or_b32_e32 v2, v7, v2
8900; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
8901; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8902; GFX7-NEXT:    buffer_wbinvl1
8903; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
8904; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
8905; GFX7-NEXT:    v_mov_b32_e32 v3, v2
8906; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
8907; GFX7-NEXT:    s_cbranch_execnz .LBB35_1
8908; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
8909; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
8910; GFX7-NEXT:    s_setpc_b64 s[30:31]
; 1023 half elements are 2046 bytes, matching the 0x7fe added to the pointer
; in each checked output above.
8911  %gep = getelementptr half, ptr %ptr, i64 1023
; %unused is never read. The checked loops still compare the cmpswap result
; against the previously loaded dword (v_cmp_eq_u32) to decide whether to retry.
8912  %unused = atomicrmw fmin ptr %gep, half %val seq_cst, !amdgpu.no.fine.grained.memory !0
8913  ret void
8914}
8915
8916; --------------------------------------------------------------------
8917; bfloat
8918; --------------------------------------------------------------------
8919
8920define bfloat @flat_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory(ptr %ptr, bfloat %val) #0 {
8921; GFX12-LABEL: flat_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory:
8922; GFX12:       ; %bb.0:
8923; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
8924; GFX12-NEXT:    s_wait_expcnt 0x0
8925; GFX12-NEXT:    s_wait_samplecnt 0x0
8926; GFX12-NEXT:    s_wait_bvhcnt 0x0
8927; GFX12-NEXT:    s_wait_kmcnt 0x0
8928; GFX12-NEXT:    v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2
8929; GFX12-NEXT:    s_mov_b32 s0, 0
8930; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
8931; GFX12-NEXT:    v_and_b32_e32 v0, -4, v3
8932; GFX12-NEXT:    v_and_b32_e32 v3, 3, v3
8933; GFX12-NEXT:    flat_load_b32 v5, v[0:1]
8934; GFX12-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
8935; GFX12-NEXT:    v_lshlrev_b32_e64 v4, v3, 0xffff
8936; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
8937; GFX12-NEXT:    v_not_b32_e32 v4, v4
8938; GFX12-NEXT:  .LBB36_1: ; %atomicrmw.start
8939; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
8940; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
8941; GFX12-NEXT:    v_mov_b32_e32 v6, v5
8942; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
8943; GFX12-NEXT:    v_lshrrev_b32_e32 v5, v3, v6
8944; GFX12-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
8945; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
8946; GFX12-NEXT:    v_min_num_f32_e32 v5, v5, v2
8947; GFX12-NEXT:    v_bfe_u32 v7, v5, 16, 1
8948; GFX12-NEXT:    v_or_b32_e32 v8, 0x400000, v5
8949; GFX12-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
8950; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
8951; GFX12-NEXT:    v_add3_u32 v7, v7, v5, 0x7fff
8952; GFX12-NEXT:    v_cndmask_b32_e32 v5, v7, v8, vcc_lo
8953; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
8954; GFX12-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
8955; GFX12-NEXT:    v_lshlrev_b32_e32 v5, v3, v5
8956; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
8957; GFX12-NEXT:    v_and_or_b32 v5, v6, v4, v5
8958; GFX12-NEXT:    s_wait_storecnt 0x0
8959; GFX12-NEXT:    flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
8960; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
8961; GFX12-NEXT:    global_inv scope:SCOPE_DEV
8962; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v5, v6
8963; GFX12-NEXT:    s_wait_alu 0xfffe
8964; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
8965; GFX12-NEXT:    s_wait_alu 0xfffe
8966; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
8967; GFX12-NEXT:    s_cbranch_execnz .LBB36_1
8968; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
8969; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
8970; GFX12-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
8971; GFX12-NEXT:    s_wait_alu 0xfffe
8972; GFX12-NEXT:    s_setpc_b64 s[30:31]
8973;
8974; GFX940-LABEL: flat_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory:
8975; GFX940:       ; %bb.0:
8976; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8977; GFX940-NEXT:    v_mov_b32_e32 v3, v0
8978; GFX940-NEXT:    v_and_b32_e32 v0, -4, v3
8979; GFX940-NEXT:    flat_load_dword v5, v[0:1]
8980; GFX940-NEXT:    v_and_b32_e32 v3, 3, v3
8981; GFX940-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
8982; GFX940-NEXT:    s_mov_b32 s0, 0xffff
8983; GFX940-NEXT:    v_lshlrev_b32_e64 v4, v3, s0
8984; GFX940-NEXT:    v_not_b32_e32 v4, v4
8985; GFX940-NEXT:    s_mov_b64 s[0:1], 0
8986; GFX940-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
8987; GFX940-NEXT:    s_movk_i32 s2, 0x7fff
8988; GFX940-NEXT:  .LBB36_1: ; %atomicrmw.start
8989; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
8990; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
8991; GFX940-NEXT:    v_mov_b32_e32 v7, v5
8992; GFX940-NEXT:    v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
8993; GFX940-NEXT:    s_nop 0
8994; GFX940-NEXT:    v_min_f32_e32 v5, v5, v2
8995; GFX940-NEXT:    v_bfe_u32 v6, v5, 16, 1
8996; GFX940-NEXT:    v_or_b32_e32 v8, 0x400000, v5
8997; GFX940-NEXT:    v_add3_u32 v6, v6, v5, s2
8998; GFX940-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
8999; GFX940-NEXT:    s_nop 1
9000; GFX940-NEXT:    v_cndmask_b32_e32 v5, v6, v8, vcc
9001; GFX940-NEXT:    v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
9002; GFX940-NEXT:    v_and_or_b32 v6, v7, v4, v5
9003; GFX940-NEXT:    buffer_wbl2 sc1
9004; GFX940-NEXT:    flat_atomic_cmpswap v5, v[0:1], v[6:7] sc0
9005; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9006; GFX940-NEXT:    buffer_inv sc1
9007; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v7
9008; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
9009; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
9010; GFX940-NEXT:    s_cbranch_execnz .LBB36_1
9011; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
9012; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
9013; GFX940-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
9014; GFX940-NEXT:    s_setpc_b64 s[30:31]
9015;
9016; GFX11-LABEL: flat_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory:
9017; GFX11:       ; %bb.0:
9018; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9019; GFX11-NEXT:    v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2
9020; GFX11-NEXT:    s_mov_b32 s0, 0
9021; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
9022; GFX11-NEXT:    v_and_b32_e32 v0, -4, v3
9023; GFX11-NEXT:    v_and_b32_e32 v3, 3, v3
9024; GFX11-NEXT:    flat_load_b32 v5, v[0:1]
9025; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
9026; GFX11-NEXT:    v_lshlrev_b32_e64 v4, v3, 0xffff
9027; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
9028; GFX11-NEXT:    v_not_b32_e32 v4, v4
9029; GFX11-NEXT:    .p2align 6
9030; GFX11-NEXT:  .LBB36_1: ; %atomicrmw.start
9031; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
9032; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9033; GFX11-NEXT:    v_mov_b32_e32 v6, v5
9034; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
9035; GFX11-NEXT:    v_lshrrev_b32_e32 v5, v3, v6
9036; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
9037; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
9038; GFX11-NEXT:    v_min_f32_e32 v5, v5, v2
9039; GFX11-NEXT:    v_bfe_u32 v7, v5, 16, 1
9040; GFX11-NEXT:    v_or_b32_e32 v8, 0x400000, v5
9041; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
9042; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
9043; GFX11-NEXT:    v_add3_u32 v7, v7, v5, 0x7fff
9044; GFX11-NEXT:    v_cndmask_b32_e32 v5, v7, v8, vcc_lo
9045; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
9046; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
9047; GFX11-NEXT:    v_lshlrev_b32_e32 v5, v3, v5
9048; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
9049; GFX11-NEXT:    v_and_or_b32 v5, v6, v4, v5
9050; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
9051; GFX11-NEXT:    flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc
9052; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9053; GFX11-NEXT:    buffer_gl1_inv
9054; GFX11-NEXT:    buffer_gl0_inv
9055; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v5, v6
9056; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
9057; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
9058; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
9059; GFX11-NEXT:    s_cbranch_execnz .LBB36_1
9060; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
9061; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
9062; GFX11-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
9063; GFX11-NEXT:    s_setpc_b64 s[30:31]
9064;
9065; GFX10-LABEL: flat_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory:
9066; GFX10:       ; %bb.0:
9067; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9068; GFX10-NEXT:    v_mov_b32_e32 v3, v0
9069; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
9070; GFX10-NEXT:    s_mov_b32 s4, 0
9071; GFX10-NEXT:    v_and_b32_e32 v0, -4, v3
9072; GFX10-NEXT:    v_and_b32_e32 v3, 3, v3
9073; GFX10-NEXT:    flat_load_dword v5, v[0:1]
9074; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
9075; GFX10-NEXT:    v_lshlrev_b32_e64 v4, v3, 0xffff
9076; GFX10-NEXT:    v_not_b32_e32 v4, v4
9077; GFX10-NEXT:  .LBB36_1: ; %atomicrmw.start
9078; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
9079; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9080; GFX10-NEXT:    v_mov_b32_e32 v6, v5
9081; GFX10-NEXT:    v_lshrrev_b32_sdwa v5, v3, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
9082; GFX10-NEXT:    v_min_f32_e32 v5, v5, v2
9083; GFX10-NEXT:    v_bfe_u32 v7, v5, 16, 1
9084; GFX10-NEXT:    v_or_b32_e32 v8, 0x400000, v5
9085; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
9086; GFX10-NEXT:    v_add3_u32 v7, v7, v5, 0x7fff
9087; GFX10-NEXT:    v_cndmask_b32_e32 v5, v7, v8, vcc_lo
9088; GFX10-NEXT:    v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
9089; GFX10-NEXT:    v_and_or_b32 v5, v6, v4, v5
9090; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
9091; GFX10-NEXT:    flat_atomic_cmpswap v5, v[0:1], v[5:6] glc
9092; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9093; GFX10-NEXT:    buffer_gl1_inv
9094; GFX10-NEXT:    buffer_gl0_inv
9095; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v5, v6
9096; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
9097; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
9098; GFX10-NEXT:    s_cbranch_execnz .LBB36_1
9099; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
9100; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
9101; GFX10-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
9102; GFX10-NEXT:    s_setpc_b64 s[30:31]
9103;
9104; GFX90A-LABEL: flat_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory:
9105; GFX90A:       ; %bb.0:
9106; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9107; GFX90A-NEXT:    v_mov_b32_e32 v3, v0
9108; GFX90A-NEXT:    v_and_b32_e32 v0, -4, v3
9109; GFX90A-NEXT:    flat_load_dword v5, v[0:1]
9110; GFX90A-NEXT:    v_and_b32_e32 v3, 3, v3
9111; GFX90A-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
9112; GFX90A-NEXT:    s_mov_b32 s4, 0xffff
9113; GFX90A-NEXT:    v_lshlrev_b32_e64 v4, v3, s4
9114; GFX90A-NEXT:    v_not_b32_e32 v4, v4
9115; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
9116; GFX90A-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
9117; GFX90A-NEXT:    s_movk_i32 s6, 0x7fff
9118; GFX90A-NEXT:  .LBB36_1: ; %atomicrmw.start
9119; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
9120; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9121; GFX90A-NEXT:    v_mov_b32_e32 v7, v5
9122; GFX90A-NEXT:    v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
9123; GFX90A-NEXT:    v_min_f32_e32 v5, v5, v2
9124; GFX90A-NEXT:    v_bfe_u32 v6, v5, 16, 1
9125; GFX90A-NEXT:    v_or_b32_e32 v8, 0x400000, v5
9126; GFX90A-NEXT:    v_add3_u32 v6, v6, v5, s6
9127; GFX90A-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
9128; GFX90A-NEXT:    v_cndmask_b32_e32 v5, v6, v8, vcc
9129; GFX90A-NEXT:    v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
9130; GFX90A-NEXT:    v_and_or_b32 v6, v7, v4, v5
9131; GFX90A-NEXT:    flat_atomic_cmpswap v5, v[0:1], v[6:7] glc
9132; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9133; GFX90A-NEXT:    buffer_wbinvl1
9134; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v7
9135; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
9136; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
9137; GFX90A-NEXT:    s_cbranch_execnz .LBB36_1
9138; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
9139; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
9140; GFX90A-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
9141; GFX90A-NEXT:    s_setpc_b64 s[30:31]
9142;
9143; GFX908-LABEL: flat_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory:
9144; GFX908:       ; %bb.0:
9145; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9146; GFX908-NEXT:    v_mov_b32_e32 v3, v0
9147; GFX908-NEXT:    v_and_b32_e32 v0, -4, v3
9148; GFX908-NEXT:    flat_load_dword v5, v[0:1]
9149; GFX908-NEXT:    v_and_b32_e32 v3, 3, v3
9150; GFX908-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
9151; GFX908-NEXT:    s_mov_b32 s4, 0xffff
9152; GFX908-NEXT:    v_lshlrev_b32_e64 v4, v3, s4
9153; GFX908-NEXT:    v_not_b32_e32 v4, v4
9154; GFX908-NEXT:    s_mov_b64 s[4:5], 0
9155; GFX908-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
9156; GFX908-NEXT:    s_movk_i32 s6, 0x7fff
9157; GFX908-NEXT:  .LBB36_1: ; %atomicrmw.start
9158; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
9159; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9160; GFX908-NEXT:    v_mov_b32_e32 v6, v5
9161; GFX908-NEXT:    v_lshrrev_b32_sdwa v5, v3, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
9162; GFX908-NEXT:    v_min_f32_e32 v5, v5, v2
9163; GFX908-NEXT:    v_bfe_u32 v7, v5, 16, 1
9164; GFX908-NEXT:    v_or_b32_e32 v8, 0x400000, v5
9165; GFX908-NEXT:    v_add3_u32 v7, v7, v5, s6
9166; GFX908-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
9167; GFX908-NEXT:    v_cndmask_b32_e32 v5, v7, v8, vcc
9168; GFX908-NEXT:    v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
9169; GFX908-NEXT:    v_and_or_b32 v5, v6, v4, v5
9170; GFX908-NEXT:    flat_atomic_cmpswap v5, v[0:1], v[5:6] glc
9171; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9172; GFX908-NEXT:    buffer_wbinvl1
9173; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v6
9174; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
9175; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
9176; GFX908-NEXT:    s_cbranch_execnz .LBB36_1
9177; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
9178; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
9179; GFX908-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
9180; GFX908-NEXT:    s_setpc_b64 s[30:31]
9181;
9182; GFX8-LABEL: flat_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory:
9183; GFX8:       ; %bb.0:
9184; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9185; GFX8-NEXT:    v_mov_b32_e32 v3, v0
9186; GFX8-NEXT:    v_and_b32_e32 v0, -4, v3
9187; GFX8-NEXT:    flat_load_dword v5, v[0:1]
9188; GFX8-NEXT:    v_and_b32_e32 v3, 3, v3
9189; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
9190; GFX8-NEXT:    s_mov_b32 s4, 0xffff
9191; GFX8-NEXT:    v_lshlrev_b32_e64 v4, v3, s4
9192; GFX8-NEXT:    v_not_b32_e32 v4, v4
9193; GFX8-NEXT:    s_mov_b64 s[4:5], 0
9194; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
9195; GFX8-NEXT:  .LBB36_1: ; %atomicrmw.start
9196; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
9197; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9198; GFX8-NEXT:    v_mov_b32_e32 v6, v5
9199; GFX8-NEXT:    v_lshrrev_b32_sdwa v5, v3, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
9200; GFX8-NEXT:    v_min_f32_e32 v5, v5, v2
9201; GFX8-NEXT:    v_bfe_u32 v8, v5, 16, 1
9202; GFX8-NEXT:    v_add_u32_e32 v8, vcc, v8, v5
9203; GFX8-NEXT:    v_add_u32_e32 v8, vcc, 0x7fff, v8
9204; GFX8-NEXT:    v_or_b32_e32 v9, 0x400000, v5
9205; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
9206; GFX8-NEXT:    v_cndmask_b32_e32 v5, v8, v9, vcc
9207; GFX8-NEXT:    v_and_b32_e32 v7, v6, v4
9208; GFX8-NEXT:    v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
9209; GFX8-NEXT:    v_or_b32_e32 v5, v7, v5
9210; GFX8-NEXT:    flat_atomic_cmpswap v5, v[0:1], v[5:6] glc
9211; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9212; GFX8-NEXT:    buffer_wbinvl1
9213; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v6
9214; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
9215; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
9216; GFX8-NEXT:    s_cbranch_execnz .LBB36_1
9217; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
9218; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
9219; GFX8-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
9220; GFX8-NEXT:    s_setpc_b64 s[30:31]
9221;
9222; GFX7-LABEL: flat_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory:
9223; GFX7:       ; %bb.0:
9224; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9225; GFX7-NEXT:    v_mov_b32_e32 v3, v0
9226; GFX7-NEXT:    v_and_b32_e32 v0, -4, v3
9227; GFX7-NEXT:    flat_load_dword v5, v[0:1]
9228; GFX7-NEXT:    v_and_b32_e32 v3, 3, v3
9229; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
9230; GFX7-NEXT:    v_lshl_b32_e32 v4, 0xffff, v3
9231; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
9232; GFX7-NEXT:    v_not_b32_e32 v4, v4
9233; GFX7-NEXT:    s_mov_b64 s[4:5], 0
9234; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
9235; GFX7-NEXT:  .LBB36_1: ; %atomicrmw.start
9236; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
9237; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9238; GFX7-NEXT:    v_mov_b32_e32 v6, v5
9239; GFX7-NEXT:    v_lshrrev_b32_e32 v5, v3, v6
9240; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
9241; GFX7-NEXT:    v_mul_f32_e32 v5, 1.0, v5
9242; GFX7-NEXT:    v_min_f32_e32 v5, v5, v2
9243; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
9244; GFX7-NEXT:    v_and_b32_e32 v7, v6, v4
9245; GFX7-NEXT:    v_lshlrev_b32_e32 v5, v3, v5
9246; GFX7-NEXT:    v_or_b32_e32 v5, v7, v5
9247; GFX7-NEXT:    flat_atomic_cmpswap v5, v[0:1], v[5:6] glc
9248; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9249; GFX7-NEXT:    buffer_wbinvl1
9250; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v6
9251; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
9252; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
9253; GFX7-NEXT:    s_cbranch_execnz .LBB36_1
9254; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
9255; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
9256; GFX7-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
9257; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
9258; GFX7-NEXT:    s_setpc_b64 s[30:31]
9259  %result = atomicrmw fmin ptr %ptr, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
9260  ret bfloat %result
9261}
9262
; Returning bf16 `atomicrmw fmin` at agent scope (seq_cst) on a flat pointer,
; tagged !amdgpu.no.fine.grained.memory, with the address advanced by 1023
; bfloat elements (the checks below show this as a 0x7fe byte add).
; For all eight checked targets the operation is emitted as a
; flat_atomic_cmpswap retry loop on the 4-byte-aligned dword (address & -4)
; that contains the value; the low two address bits pick the shift and mask
; used to splice in the new 16-bit value and to extract the returned old one.
; NOTE(review): the hawaii output additionally shifts the result left by 16
; before returning - presumably bf16 lives in the high half of the register
; on that target; confirm.
; The assertions below are autogenerated (update_llc_test_checks.py);
; regenerate them instead of editing by hand.
9263define bfloat @flat_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr %ptr, bfloat %val) #0 {
9264; GFX12-LABEL: flat_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
9265; GFX12:       ; %bb.0:
9266; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
9267; GFX12-NEXT:    s_wait_expcnt 0x0
9268; GFX12-NEXT:    s_wait_samplecnt 0x0
9269; GFX12-NEXT:    s_wait_bvhcnt 0x0
9270; GFX12-NEXT:    s_wait_kmcnt 0x0
9271; GFX12-NEXT:    v_add_co_u32 v3, vcc_lo, 0x7fe, v0
9272; GFX12-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
9273; GFX12-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
9274; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
9275; GFX12-NEXT:    v_and_b32_e32 v0, -4, v3
9276; GFX12-NEXT:    v_and_b32_e32 v3, 3, v3
9277; GFX12-NEXT:    s_mov_b32 s0, 0
9278; GFX12-NEXT:    flat_load_b32 v5, v[0:1]
9279; GFX12-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
9280; GFX12-NEXT:    v_lshlrev_b32_e64 v4, v3, 0xffff
9281; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
9282; GFX12-NEXT:    v_not_b32_e32 v4, v4
9283; GFX12-NEXT:  .LBB37_1: ; %atomicrmw.start
9284; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
9285; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
9286; GFX12-NEXT:    v_mov_b32_e32 v6, v5
9287; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
9288; GFX12-NEXT:    v_lshrrev_b32_e32 v5, v3, v6
9289; GFX12-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
9290; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
9291; GFX12-NEXT:    v_min_num_f32_e32 v5, v5, v2
9292; GFX12-NEXT:    v_bfe_u32 v7, v5, 16, 1
9293; GFX12-NEXT:    v_or_b32_e32 v8, 0x400000, v5
9294; GFX12-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
9295; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
9296; GFX12-NEXT:    v_add3_u32 v7, v7, v5, 0x7fff
9297; GFX12-NEXT:    v_cndmask_b32_e32 v5, v7, v8, vcc_lo
9298; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
9299; GFX12-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
9300; GFX12-NEXT:    v_lshlrev_b32_e32 v5, v3, v5
9301; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
9302; GFX12-NEXT:    v_and_or_b32 v5, v6, v4, v5
9303; GFX12-NEXT:    s_wait_storecnt 0x0
9304; GFX12-NEXT:    flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
9305; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
9306; GFX12-NEXT:    global_inv scope:SCOPE_DEV
9307; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v5, v6
9308; GFX12-NEXT:    s_wait_alu 0xfffe
9309; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
9310; GFX12-NEXT:    s_wait_alu 0xfffe
9311; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
9312; GFX12-NEXT:    s_cbranch_execnz .LBB37_1
9313; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
9314; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
9315; GFX12-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
9316; GFX12-NEXT:    s_wait_alu 0xfffe
9317; GFX12-NEXT:    s_setpc_b64 s[30:31]
9318;
9319; GFX940-LABEL: flat_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
9320; GFX940:       ; %bb.0:
9321; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9322; GFX940-NEXT:    s_mov_b64 s[0:1], 0x7fe
9323; GFX940-NEXT:    v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
9324; GFX940-NEXT:    v_and_b32_e32 v0, -4, v4
9325; GFX940-NEXT:    v_mov_b32_e32 v1, v5
9326; GFX940-NEXT:    flat_load_dword v5, v[0:1]
9327; GFX940-NEXT:    v_and_b32_e32 v3, 3, v4
9328; GFX940-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
9329; GFX940-NEXT:    s_mov_b32 s0, 0xffff
9330; GFX940-NEXT:    v_lshlrev_b32_e64 v4, v3, s0
9331; GFX940-NEXT:    v_not_b32_e32 v4, v4
9332; GFX940-NEXT:    s_mov_b64 s[0:1], 0
9333; GFX940-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
9334; GFX940-NEXT:    s_movk_i32 s2, 0x7fff
9335; GFX940-NEXT:  .LBB37_1: ; %atomicrmw.start
9336; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
9337; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9338; GFX940-NEXT:    v_mov_b32_e32 v7, v5
9339; GFX940-NEXT:    v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
9340; GFX940-NEXT:    s_nop 0
9341; GFX940-NEXT:    v_min_f32_e32 v5, v5, v2
9342; GFX940-NEXT:    v_bfe_u32 v6, v5, 16, 1
9343; GFX940-NEXT:    v_or_b32_e32 v8, 0x400000, v5
9344; GFX940-NEXT:    v_add3_u32 v6, v6, v5, s2
9345; GFX940-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
9346; GFX940-NEXT:    s_nop 1
9347; GFX940-NEXT:    v_cndmask_b32_e32 v5, v6, v8, vcc
9348; GFX940-NEXT:    v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
9349; GFX940-NEXT:    v_and_or_b32 v6, v7, v4, v5
9350; GFX940-NEXT:    buffer_wbl2 sc1
9351; GFX940-NEXT:    flat_atomic_cmpswap v5, v[0:1], v[6:7] sc0
9352; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9353; GFX940-NEXT:    buffer_inv sc1
9354; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v7
9355; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
9356; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
9357; GFX940-NEXT:    s_cbranch_execnz .LBB37_1
9358; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
9359; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
9360; GFX940-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
9361; GFX940-NEXT:    s_setpc_b64 s[30:31]
9362;
9363; GFX11-LABEL: flat_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
9364; GFX11:       ; %bb.0:
9365; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9366; GFX11-NEXT:    v_add_co_u32 v3, vcc_lo, 0x7fe, v0
9367; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
9368; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
9369; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
9370; GFX11-NEXT:    v_and_b32_e32 v0, -4, v3
9371; GFX11-NEXT:    v_and_b32_e32 v3, 3, v3
9372; GFX11-NEXT:    s_mov_b32 s0, 0
9373; GFX11-NEXT:    flat_load_b32 v5, v[0:1]
9374; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
9375; GFX11-NEXT:    v_lshlrev_b32_e64 v4, v3, 0xffff
9376; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
9377; GFX11-NEXT:    v_not_b32_e32 v4, v4
9378; GFX11-NEXT:    .p2align 6
9379; GFX11-NEXT:  .LBB37_1: ; %atomicrmw.start
9380; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
9381; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9382; GFX11-NEXT:    v_mov_b32_e32 v6, v5
9383; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
9384; GFX11-NEXT:    v_lshrrev_b32_e32 v5, v3, v6
9385; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
9386; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
9387; GFX11-NEXT:    v_min_f32_e32 v5, v5, v2
9388; GFX11-NEXT:    v_bfe_u32 v7, v5, 16, 1
9389; GFX11-NEXT:    v_or_b32_e32 v8, 0x400000, v5
9390; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
9391; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
9392; GFX11-NEXT:    v_add3_u32 v7, v7, v5, 0x7fff
9393; GFX11-NEXT:    v_cndmask_b32_e32 v5, v7, v8, vcc_lo
9394; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
9395; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
9396; GFX11-NEXT:    v_lshlrev_b32_e32 v5, v3, v5
9397; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
9398; GFX11-NEXT:    v_and_or_b32 v5, v6, v4, v5
9399; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
9400; GFX11-NEXT:    flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc
9401; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9402; GFX11-NEXT:    buffer_gl1_inv
9403; GFX11-NEXT:    buffer_gl0_inv
9404; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v5, v6
9405; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
9406; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
9407; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
9408; GFX11-NEXT:    s_cbranch_execnz .LBB37_1
9409; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
9410; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
9411; GFX11-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
9412; GFX11-NEXT:    s_setpc_b64 s[30:31]
9413;
9414; GFX10-LABEL: flat_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
9415; GFX10:       ; %bb.0:
9416; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9417; GFX10-NEXT:    v_add_co_u32 v3, vcc_lo, 0x7fe, v0
9418; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
9419; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
9420; GFX10-NEXT:    v_and_b32_e32 v0, -4, v3
9421; GFX10-NEXT:    v_and_b32_e32 v3, 3, v3
9422; GFX10-NEXT:    s_mov_b32 s4, 0
9423; GFX10-NEXT:    flat_load_dword v5, v[0:1]
9424; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
9425; GFX10-NEXT:    v_lshlrev_b32_e64 v4, v3, 0xffff
9426; GFX10-NEXT:    v_not_b32_e32 v4, v4
9427; GFX10-NEXT:  .LBB37_1: ; %atomicrmw.start
9428; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
9429; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9430; GFX10-NEXT:    v_mov_b32_e32 v6, v5
9431; GFX10-NEXT:    v_lshrrev_b32_sdwa v5, v3, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
9432; GFX10-NEXT:    v_min_f32_e32 v5, v5, v2
9433; GFX10-NEXT:    v_bfe_u32 v7, v5, 16, 1
9434; GFX10-NEXT:    v_or_b32_e32 v8, 0x400000, v5
9435; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
9436; GFX10-NEXT:    v_add3_u32 v7, v7, v5, 0x7fff
9437; GFX10-NEXT:    v_cndmask_b32_e32 v5, v7, v8, vcc_lo
9438; GFX10-NEXT:    v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
9439; GFX10-NEXT:    v_and_or_b32 v5, v6, v4, v5
9440; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
9441; GFX10-NEXT:    flat_atomic_cmpswap v5, v[0:1], v[5:6] glc
9442; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9443; GFX10-NEXT:    buffer_gl1_inv
9444; GFX10-NEXT:    buffer_gl0_inv
9445; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v5, v6
9446; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
9447; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
9448; GFX10-NEXT:    s_cbranch_execnz .LBB37_1
9449; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
9450; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
9451; GFX10-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
9452; GFX10-NEXT:    s_setpc_b64 s[30:31]
9453;
9454; GFX90A-LABEL: flat_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
9455; GFX90A:       ; %bb.0:
9456; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9457; GFX90A-NEXT:    v_add_co_u32_e32 v3, vcc, 0x7fe, v0
9458; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
9459; GFX90A-NEXT:    v_and_b32_e32 v0, -4, v3
9460; GFX90A-NEXT:    flat_load_dword v5, v[0:1]
9461; GFX90A-NEXT:    v_and_b32_e32 v3, 3, v3
9462; GFX90A-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
9463; GFX90A-NEXT:    s_mov_b32 s4, 0xffff
9464; GFX90A-NEXT:    v_lshlrev_b32_e64 v4, v3, s4
9465; GFX90A-NEXT:    v_not_b32_e32 v4, v4
9466; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
9467; GFX90A-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
9468; GFX90A-NEXT:    s_movk_i32 s6, 0x7fff
9469; GFX90A-NEXT:  .LBB37_1: ; %atomicrmw.start
9470; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
9471; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9472; GFX90A-NEXT:    v_mov_b32_e32 v7, v5
9473; GFX90A-NEXT:    v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
9474; GFX90A-NEXT:    v_min_f32_e32 v5, v5, v2
9475; GFX90A-NEXT:    v_bfe_u32 v6, v5, 16, 1
9476; GFX90A-NEXT:    v_or_b32_e32 v8, 0x400000, v5
9477; GFX90A-NEXT:    v_add3_u32 v6, v6, v5, s6
9478; GFX90A-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
9479; GFX90A-NEXT:    v_cndmask_b32_e32 v5, v6, v8, vcc
9480; GFX90A-NEXT:    v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
9481; GFX90A-NEXT:    v_and_or_b32 v6, v7, v4, v5
9482; GFX90A-NEXT:    flat_atomic_cmpswap v5, v[0:1], v[6:7] glc
9483; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9484; GFX90A-NEXT:    buffer_wbinvl1
9485; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v7
9486; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
9487; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
9488; GFX90A-NEXT:    s_cbranch_execnz .LBB37_1
9489; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
9490; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
9491; GFX90A-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
9492; GFX90A-NEXT:    s_setpc_b64 s[30:31]
9493;
9494; GFX908-LABEL: flat_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
9495; GFX908:       ; %bb.0:
9496; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9497; GFX908-NEXT:    v_add_co_u32_e32 v3, vcc, 0x7fe, v0
9498; GFX908-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
9499; GFX908-NEXT:    v_and_b32_e32 v0, -4, v3
9500; GFX908-NEXT:    flat_load_dword v5, v[0:1]
9501; GFX908-NEXT:    v_and_b32_e32 v3, 3, v3
9502; GFX908-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
9503; GFX908-NEXT:    s_mov_b32 s4, 0xffff
9504; GFX908-NEXT:    v_lshlrev_b32_e64 v4, v3, s4
9505; GFX908-NEXT:    v_not_b32_e32 v4, v4
9506; GFX908-NEXT:    s_mov_b64 s[4:5], 0
9507; GFX908-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
9508; GFX908-NEXT:    s_movk_i32 s6, 0x7fff
9509; GFX908-NEXT:  .LBB37_1: ; %atomicrmw.start
9510; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
9511; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9512; GFX908-NEXT:    v_mov_b32_e32 v6, v5
9513; GFX908-NEXT:    v_lshrrev_b32_sdwa v5, v3, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
9514; GFX908-NEXT:    v_min_f32_e32 v5, v5, v2
9515; GFX908-NEXT:    v_bfe_u32 v7, v5, 16, 1
9516; GFX908-NEXT:    v_or_b32_e32 v8, 0x400000, v5
9517; GFX908-NEXT:    v_add3_u32 v7, v7, v5, s6
9518; GFX908-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
9519; GFX908-NEXT:    v_cndmask_b32_e32 v5, v7, v8, vcc
9520; GFX908-NEXT:    v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
9521; GFX908-NEXT:    v_and_or_b32 v5, v6, v4, v5
9522; GFX908-NEXT:    flat_atomic_cmpswap v5, v[0:1], v[5:6] glc
9523; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9524; GFX908-NEXT:    buffer_wbinvl1
9525; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v6
9526; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
9527; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
9528; GFX908-NEXT:    s_cbranch_execnz .LBB37_1
9529; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
9530; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
9531; GFX908-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
9532; GFX908-NEXT:    s_setpc_b64 s[30:31]
9533;
9534; GFX8-LABEL: flat_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
9535; GFX8:       ; %bb.0:
9536; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9537; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x7fe, v0
9538; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
9539; GFX8-NEXT:    v_and_b32_e32 v0, -4, v3
9540; GFX8-NEXT:    flat_load_dword v5, v[0:1]
9541; GFX8-NEXT:    v_and_b32_e32 v3, 3, v3
9542; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
9543; GFX8-NEXT:    s_mov_b32 s4, 0xffff
9544; GFX8-NEXT:    v_lshlrev_b32_e64 v4, v3, s4
9545; GFX8-NEXT:    v_not_b32_e32 v4, v4
9546; GFX8-NEXT:    s_mov_b64 s[4:5], 0
9547; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
9548; GFX8-NEXT:  .LBB37_1: ; %atomicrmw.start
9549; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
9550; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9551; GFX8-NEXT:    v_mov_b32_e32 v6, v5
9552; GFX8-NEXT:    v_lshrrev_b32_sdwa v5, v3, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
9553; GFX8-NEXT:    v_min_f32_e32 v5, v5, v2
9554; GFX8-NEXT:    v_bfe_u32 v8, v5, 16, 1
9555; GFX8-NEXT:    v_add_u32_e32 v8, vcc, v8, v5
9556; GFX8-NEXT:    v_add_u32_e32 v8, vcc, 0x7fff, v8
9557; GFX8-NEXT:    v_or_b32_e32 v9, 0x400000, v5
9558; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
9559; GFX8-NEXT:    v_cndmask_b32_e32 v5, v8, v9, vcc
9560; GFX8-NEXT:    v_and_b32_e32 v7, v6, v4
9561; GFX8-NEXT:    v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
9562; GFX8-NEXT:    v_or_b32_e32 v5, v7, v5
9563; GFX8-NEXT:    flat_atomic_cmpswap v5, v[0:1], v[5:6] glc
9564; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9565; GFX8-NEXT:    buffer_wbinvl1
9566; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v6
9567; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
9568; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
9569; GFX8-NEXT:    s_cbranch_execnz .LBB37_1
9570; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
9571; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
9572; GFX8-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
9573; GFX8-NEXT:    s_setpc_b64 s[30:31]
9574;
9575; GFX7-LABEL: flat_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
9576; GFX7:       ; %bb.0:
9577; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9578; GFX7-NEXT:    v_add_i32_e32 v3, vcc, 0x7fe, v0
9579; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
9580; GFX7-NEXT:    v_and_b32_e32 v0, -4, v3
9581; GFX7-NEXT:    flat_load_dword v5, v[0:1]
9582; GFX7-NEXT:    v_and_b32_e32 v3, 3, v3
9583; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
9584; GFX7-NEXT:    v_lshl_b32_e32 v4, 0xffff, v3
9585; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
9586; GFX7-NEXT:    v_not_b32_e32 v4, v4
9587; GFX7-NEXT:    s_mov_b64 s[4:5], 0
9588; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
9589; GFX7-NEXT:  .LBB37_1: ; %atomicrmw.start
9590; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
9591; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9592; GFX7-NEXT:    v_mov_b32_e32 v6, v5
9593; GFX7-NEXT:    v_lshrrev_b32_e32 v5, v3, v6
9594; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
9595; GFX7-NEXT:    v_mul_f32_e32 v5, 1.0, v5
9596; GFX7-NEXT:    v_min_f32_e32 v5, v5, v2
9597; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
9598; GFX7-NEXT:    v_and_b32_e32 v7, v6, v4
9599; GFX7-NEXT:    v_lshlrev_b32_e32 v5, v3, v5
9600; GFX7-NEXT:    v_or_b32_e32 v5, v7, v5
9601; GFX7-NEXT:    flat_atomic_cmpswap v5, v[0:1], v[5:6] glc
9602; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9603; GFX7-NEXT:    buffer_wbinvl1
9604; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v6
9605; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
9606; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
9607; GFX7-NEXT:    s_cbranch_execnz .LBB37_1
9608; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
9609; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
9610; GFX7-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
9611; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
9612; GFX7-NEXT:    s_setpc_b64 s[30:31]
  ; Index 1023 in bfloat elements; the checks above add 0x7fe to the pointer.
9613  %gep = getelementptr bfloat, ptr %ptr, i64 1023
9614  %result = atomicrmw fmin ptr %gep, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
9615  ret bfloat %result
9616}
9617
9618define bfloat @flat_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr %ptr, bfloat %val) #0 {
9619; GFX12-LABEL: flat_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
9620; GFX12:       ; %bb.0:
9621; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
9622; GFX12-NEXT:    s_wait_expcnt 0x0
9623; GFX12-NEXT:    s_wait_samplecnt 0x0
9624; GFX12-NEXT:    s_wait_bvhcnt 0x0
9625; GFX12-NEXT:    s_wait_kmcnt 0x0
9626; GFX12-NEXT:    v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
9627; GFX12-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
9628; GFX12-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
9629; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
9630; GFX12-NEXT:    v_and_b32_e32 v0, -4, v3
9631; GFX12-NEXT:    v_and_b32_e32 v3, 3, v3
9632; GFX12-NEXT:    s_mov_b32 s0, 0
9633; GFX12-NEXT:    flat_load_b32 v5, v[0:1]
9634; GFX12-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
9635; GFX12-NEXT:    v_lshlrev_b32_e64 v4, v3, 0xffff
9636; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
9637; GFX12-NEXT:    v_not_b32_e32 v4, v4
9638; GFX12-NEXT:  .LBB38_1: ; %atomicrmw.start
9639; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
9640; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
9641; GFX12-NEXT:    v_mov_b32_e32 v6, v5
9642; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
9643; GFX12-NEXT:    v_lshrrev_b32_e32 v5, v3, v6
9644; GFX12-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
9645; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
9646; GFX12-NEXT:    v_min_num_f32_e32 v5, v5, v2
9647; GFX12-NEXT:    v_bfe_u32 v7, v5, 16, 1
9648; GFX12-NEXT:    v_or_b32_e32 v8, 0x400000, v5
9649; GFX12-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
9650; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
9651; GFX12-NEXT:    v_add3_u32 v7, v7, v5, 0x7fff
9652; GFX12-NEXT:    v_cndmask_b32_e32 v5, v7, v8, vcc_lo
9653; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
9654; GFX12-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
9655; GFX12-NEXT:    v_lshlrev_b32_e32 v5, v3, v5
9656; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
9657; GFX12-NEXT:    v_and_or_b32 v5, v6, v4, v5
9658; GFX12-NEXT:    s_wait_storecnt 0x0
9659; GFX12-NEXT:    flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
9660; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
9661; GFX12-NEXT:    global_inv scope:SCOPE_DEV
9662; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v5, v6
9663; GFX12-NEXT:    s_wait_alu 0xfffe
9664; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
9665; GFX12-NEXT:    s_wait_alu 0xfffe
9666; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
9667; GFX12-NEXT:    s_cbranch_execnz .LBB38_1
9668; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
9669; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
9670; GFX12-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
9671; GFX12-NEXT:    s_wait_alu 0xfffe
9672; GFX12-NEXT:    s_setpc_b64 s[30:31]
9673;
9674; GFX940-LABEL: flat_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
9675; GFX940:       ; %bb.0:
9676; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9677; GFX940-NEXT:    s_movk_i32 s0, 0xf800
9678; GFX940-NEXT:    s_mov_b32 s1, -1
9679; GFX940-NEXT:    v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
9680; GFX940-NEXT:    v_and_b32_e32 v0, -4, v4
9681; GFX940-NEXT:    v_mov_b32_e32 v1, v5
9682; GFX940-NEXT:    flat_load_dword v5, v[0:1]
9683; GFX940-NEXT:    v_and_b32_e32 v3, 3, v4
9684; GFX940-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
9685; GFX940-NEXT:    s_mov_b32 s0, 0xffff
9686; GFX940-NEXT:    v_lshlrev_b32_e64 v4, v3, s0
9687; GFX940-NEXT:    v_not_b32_e32 v4, v4
9688; GFX940-NEXT:    s_mov_b64 s[0:1], 0
9689; GFX940-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
9690; GFX940-NEXT:    s_movk_i32 s2, 0x7fff
9691; GFX940-NEXT:  .LBB38_1: ; %atomicrmw.start
9692; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
9693; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9694; GFX940-NEXT:    v_mov_b32_e32 v7, v5
9695; GFX940-NEXT:    v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
9696; GFX940-NEXT:    s_nop 0
9697; GFX940-NEXT:    v_min_f32_e32 v5, v5, v2
9698; GFX940-NEXT:    v_bfe_u32 v6, v5, 16, 1
9699; GFX940-NEXT:    v_or_b32_e32 v8, 0x400000, v5
9700; GFX940-NEXT:    v_add3_u32 v6, v6, v5, s2
9701; GFX940-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
9702; GFX940-NEXT:    s_nop 1
9703; GFX940-NEXT:    v_cndmask_b32_e32 v5, v6, v8, vcc
9704; GFX940-NEXT:    v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
9705; GFX940-NEXT:    v_and_or_b32 v6, v7, v4, v5
9706; GFX940-NEXT:    buffer_wbl2 sc1
9707; GFX940-NEXT:    flat_atomic_cmpswap v5, v[0:1], v[6:7] sc0
9708; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9709; GFX940-NEXT:    buffer_inv sc1
9710; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v7
9711; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
9712; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
9713; GFX940-NEXT:    s_cbranch_execnz .LBB38_1
9714; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
9715; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
9716; GFX940-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
9717; GFX940-NEXT:    s_setpc_b64 s[30:31]
9718;
9719; GFX11-LABEL: flat_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
9720; GFX11:       ; %bb.0:
9721; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9722; GFX11-NEXT:    v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
9723; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
9724; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
9725; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
9726; GFX11-NEXT:    v_and_b32_e32 v0, -4, v3
9727; GFX11-NEXT:    v_and_b32_e32 v3, 3, v3
9728; GFX11-NEXT:    s_mov_b32 s0, 0
9729; GFX11-NEXT:    flat_load_b32 v5, v[0:1]
9730; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
9731; GFX11-NEXT:    v_lshlrev_b32_e64 v4, v3, 0xffff
9732; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
9733; GFX11-NEXT:    v_not_b32_e32 v4, v4
9734; GFX11-NEXT:    .p2align 6
9735; GFX11-NEXT:  .LBB38_1: ; %atomicrmw.start
9736; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
9737; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9738; GFX11-NEXT:    v_mov_b32_e32 v6, v5
9739; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
9740; GFX11-NEXT:    v_lshrrev_b32_e32 v5, v3, v6
9741; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
9742; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
9743; GFX11-NEXT:    v_min_f32_e32 v5, v5, v2
9744; GFX11-NEXT:    v_bfe_u32 v7, v5, 16, 1
9745; GFX11-NEXT:    v_or_b32_e32 v8, 0x400000, v5
9746; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
9747; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
9748; GFX11-NEXT:    v_add3_u32 v7, v7, v5, 0x7fff
9749; GFX11-NEXT:    v_cndmask_b32_e32 v5, v7, v8, vcc_lo
9750; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
9751; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
9752; GFX11-NEXT:    v_lshlrev_b32_e32 v5, v3, v5
9753; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
9754; GFX11-NEXT:    v_and_or_b32 v5, v6, v4, v5
9755; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
9756; GFX11-NEXT:    flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc
9757; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9758; GFX11-NEXT:    buffer_gl1_inv
9759; GFX11-NEXT:    buffer_gl0_inv
9760; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v5, v6
9761; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
9762; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
9763; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
9764; GFX11-NEXT:    s_cbranch_execnz .LBB38_1
9765; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
9766; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
9767; GFX11-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
9768; GFX11-NEXT:    s_setpc_b64 s[30:31]
9769;
9770; GFX10-LABEL: flat_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
9771; GFX10:       ; %bb.0:
9772; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9773; GFX10-NEXT:    v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
9774; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
9775; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
9776; GFX10-NEXT:    v_and_b32_e32 v0, -4, v3
9777; GFX10-NEXT:    v_and_b32_e32 v3, 3, v3
9778; GFX10-NEXT:    s_mov_b32 s4, 0
9779; GFX10-NEXT:    flat_load_dword v5, v[0:1]
9780; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
9781; GFX10-NEXT:    v_lshlrev_b32_e64 v4, v3, 0xffff
9782; GFX10-NEXT:    v_not_b32_e32 v4, v4
9783; GFX10-NEXT:  .LBB38_1: ; %atomicrmw.start
9784; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
9785; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9786; GFX10-NEXT:    v_mov_b32_e32 v6, v5
9787; GFX10-NEXT:    v_lshrrev_b32_sdwa v5, v3, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
9788; GFX10-NEXT:    v_min_f32_e32 v5, v5, v2
9789; GFX10-NEXT:    v_bfe_u32 v7, v5, 16, 1
9790; GFX10-NEXT:    v_or_b32_e32 v8, 0x400000, v5
9791; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
9792; GFX10-NEXT:    v_add3_u32 v7, v7, v5, 0x7fff
9793; GFX10-NEXT:    v_cndmask_b32_e32 v5, v7, v8, vcc_lo
9794; GFX10-NEXT:    v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
9795; GFX10-NEXT:    v_and_or_b32 v5, v6, v4, v5
9796; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
9797; GFX10-NEXT:    flat_atomic_cmpswap v5, v[0:1], v[5:6] glc
9798; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9799; GFX10-NEXT:    buffer_gl1_inv
9800; GFX10-NEXT:    buffer_gl0_inv
9801; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v5, v6
9802; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
9803; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
9804; GFX10-NEXT:    s_cbranch_execnz .LBB38_1
9805; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
9806; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
9807; GFX10-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
9808; GFX10-NEXT:    s_setpc_b64 s[30:31]
9809;
9810; GFX90A-LABEL: flat_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
9811; GFX90A:       ; %bb.0:
9812; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9813; GFX90A-NEXT:    v_add_co_u32_e32 v3, vcc, 0xfffff800, v0
9814; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
9815; GFX90A-NEXT:    v_and_b32_e32 v0, -4, v3
9816; GFX90A-NEXT:    flat_load_dword v5, v[0:1]
9817; GFX90A-NEXT:    v_and_b32_e32 v3, 3, v3
9818; GFX90A-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
9819; GFX90A-NEXT:    s_mov_b32 s4, 0xffff
9820; GFX90A-NEXT:    v_lshlrev_b32_e64 v4, v3, s4
9821; GFX90A-NEXT:    v_not_b32_e32 v4, v4
9822; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
9823; GFX90A-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
9824; GFX90A-NEXT:    s_movk_i32 s6, 0x7fff
9825; GFX90A-NEXT:  .LBB38_1: ; %atomicrmw.start
9826; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
9827; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9828; GFX90A-NEXT:    v_mov_b32_e32 v7, v5
9829; GFX90A-NEXT:    v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
9830; GFX90A-NEXT:    v_min_f32_e32 v5, v5, v2
9831; GFX90A-NEXT:    v_bfe_u32 v6, v5, 16, 1
9832; GFX90A-NEXT:    v_or_b32_e32 v8, 0x400000, v5
9833; GFX90A-NEXT:    v_add3_u32 v6, v6, v5, s6
9834; GFX90A-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
9835; GFX90A-NEXT:    v_cndmask_b32_e32 v5, v6, v8, vcc
9836; GFX90A-NEXT:    v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
9837; GFX90A-NEXT:    v_and_or_b32 v6, v7, v4, v5
9838; GFX90A-NEXT:    flat_atomic_cmpswap v5, v[0:1], v[6:7] glc
9839; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9840; GFX90A-NEXT:    buffer_wbinvl1
9841; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v7
9842; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
9843; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
9844; GFX90A-NEXT:    s_cbranch_execnz .LBB38_1
9845; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
9846; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
9847; GFX90A-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
9848; GFX90A-NEXT:    s_setpc_b64 s[30:31]
9849;
9850; GFX908-LABEL: flat_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
9851; GFX908:       ; %bb.0:
9852; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9853; GFX908-NEXT:    v_add_co_u32_e32 v3, vcc, 0xfffff800, v0
9854; GFX908-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
9855; GFX908-NEXT:    v_and_b32_e32 v0, -4, v3
9856; GFX908-NEXT:    flat_load_dword v5, v[0:1]
9857; GFX908-NEXT:    v_and_b32_e32 v3, 3, v3
9858; GFX908-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
9859; GFX908-NEXT:    s_mov_b32 s4, 0xffff
9860; GFX908-NEXT:    v_lshlrev_b32_e64 v4, v3, s4
9861; GFX908-NEXT:    v_not_b32_e32 v4, v4
9862; GFX908-NEXT:    s_mov_b64 s[4:5], 0
9863; GFX908-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
9864; GFX908-NEXT:    s_movk_i32 s6, 0x7fff
9865; GFX908-NEXT:  .LBB38_1: ; %atomicrmw.start
9866; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
9867; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9868; GFX908-NEXT:    v_mov_b32_e32 v6, v5
9869; GFX908-NEXT:    v_lshrrev_b32_sdwa v5, v3, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
9870; GFX908-NEXT:    v_min_f32_e32 v5, v5, v2
9871; GFX908-NEXT:    v_bfe_u32 v7, v5, 16, 1
9872; GFX908-NEXT:    v_or_b32_e32 v8, 0x400000, v5
9873; GFX908-NEXT:    v_add3_u32 v7, v7, v5, s6
9874; GFX908-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
9875; GFX908-NEXT:    v_cndmask_b32_e32 v5, v7, v8, vcc
9876; GFX908-NEXT:    v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
9877; GFX908-NEXT:    v_and_or_b32 v5, v6, v4, v5
9878; GFX908-NEXT:    flat_atomic_cmpswap v5, v[0:1], v[5:6] glc
9879; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9880; GFX908-NEXT:    buffer_wbinvl1
9881; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v6
9882; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
9883; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
9884; GFX908-NEXT:    s_cbranch_execnz .LBB38_1
9885; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
9886; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
9887; GFX908-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
9888; GFX908-NEXT:    s_setpc_b64 s[30:31]
9889;
9890; GFX8-LABEL: flat_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
9891; GFX8:       ; %bb.0:
9892; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9893; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0xfffff800, v0
9894; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, -1, v1, vcc
9895; GFX8-NEXT:    v_and_b32_e32 v0, -4, v3
9896; GFX8-NEXT:    flat_load_dword v5, v[0:1]
9897; GFX8-NEXT:    v_and_b32_e32 v3, 3, v3
9898; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
9899; GFX8-NEXT:    s_mov_b32 s4, 0xffff
9900; GFX8-NEXT:    v_lshlrev_b32_e64 v4, v3, s4
9901; GFX8-NEXT:    v_not_b32_e32 v4, v4
9902; GFX8-NEXT:    s_mov_b64 s[4:5], 0
9903; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
9904; GFX8-NEXT:  .LBB38_1: ; %atomicrmw.start
9905; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
9906; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9907; GFX8-NEXT:    v_mov_b32_e32 v6, v5
9908; GFX8-NEXT:    v_lshrrev_b32_sdwa v5, v3, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
9909; GFX8-NEXT:    v_min_f32_e32 v5, v5, v2
9910; GFX8-NEXT:    v_bfe_u32 v8, v5, 16, 1
9911; GFX8-NEXT:    v_add_u32_e32 v8, vcc, v8, v5
9912; GFX8-NEXT:    v_add_u32_e32 v8, vcc, 0x7fff, v8
9913; GFX8-NEXT:    v_or_b32_e32 v9, 0x400000, v5
9914; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
9915; GFX8-NEXT:    v_cndmask_b32_e32 v5, v8, v9, vcc
9916; GFX8-NEXT:    v_and_b32_e32 v7, v6, v4
9917; GFX8-NEXT:    v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
9918; GFX8-NEXT:    v_or_b32_e32 v5, v7, v5
9919; GFX8-NEXT:    flat_atomic_cmpswap v5, v[0:1], v[5:6] glc
9920; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9921; GFX8-NEXT:    buffer_wbinvl1
9922; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v6
9923; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
9924; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
9925; GFX8-NEXT:    s_cbranch_execnz .LBB38_1
9926; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
9927; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
9928; GFX8-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
9929; GFX8-NEXT:    s_setpc_b64 s[30:31]
9930;
9931; GFX7-LABEL: flat_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
9932; GFX7:       ; %bb.0:
9933; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9934; GFX7-NEXT:    v_add_i32_e32 v3, vcc, 0xfffff800, v0
9935; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, -1, v1, vcc
9936; GFX7-NEXT:    v_and_b32_e32 v0, -4, v3
9937; GFX7-NEXT:    flat_load_dword v5, v[0:1]
9938; GFX7-NEXT:    v_and_b32_e32 v3, 3, v3
9939; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
9940; GFX7-NEXT:    v_lshl_b32_e32 v4, 0xffff, v3
9941; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
9942; GFX7-NEXT:    v_not_b32_e32 v4, v4
9943; GFX7-NEXT:    s_mov_b64 s[4:5], 0
9944; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
9945; GFX7-NEXT:  .LBB38_1: ; %atomicrmw.start
9946; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
9947; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9948; GFX7-NEXT:    v_mov_b32_e32 v6, v5
9949; GFX7-NEXT:    v_lshrrev_b32_e32 v5, v3, v6
9950; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
9951; GFX7-NEXT:    v_mul_f32_e32 v5, 1.0, v5
9952; GFX7-NEXT:    v_min_f32_e32 v5, v5, v2
9953; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
9954; GFX7-NEXT:    v_and_b32_e32 v7, v6, v4
9955; GFX7-NEXT:    v_lshlrev_b32_e32 v5, v3, v5
9956; GFX7-NEXT:    v_or_b32_e32 v5, v7, v5
9957; GFX7-NEXT:    flat_atomic_cmpswap v5, v[0:1], v[5:6] glc
9958; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
9959; GFX7-NEXT:    buffer_wbinvl1
9960; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v6
9961; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
9962; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
9963; GFX7-NEXT:    s_cbranch_execnz .LBB38_1
9964; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
9965; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
9966; GFX7-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
9967; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
9968; GFX7-NEXT:    s_setpc_b64 s[30:31]
9969  %gep = getelementptr bfloat, ptr %ptr, i64 -1024
9970  %result = atomicrmw fmin ptr %gep, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
9971  ret bfloat %result
9972 }
9973
; Codegen test: agent-scope, seq_cst `atomicrmw fmin` of a bfloat through a
; plain `ptr` argument with no constant offset. The instruction carries
; !amdgpu.no.fine.grained.memory and its result is never used.
;
; The check lines below are autogenerated (see the header of this file);
; regenerate them with the update script rather than editing them by hand.
;
; For each of the eight check prefixes the expected output is a
; compare-and-swap loop (%atomicrmw.start .. %atomicrmw.end) on the 32-bit
; word that contains the bfloat:
;   * Before the loop the address is rounded down with `and -4` and that word
;     is loaded; (address & 3) << 3 is the bit shift of the 16-bit field, and
;     the complement of (0xffff << shift) is kept as the mask for the
;     remaining bits of the word. %val is moved into the upper 16 bits of a
;     32-bit register.
;   * Each iteration moves the current field into the upper 16 bits of a
;     register (shift pair or SDWA WORD_1 select), applies v_min_f32
;     (v_min_num_f32 on GFX12) against %val, narrows the result back to
;     16 bits, merges it with the masked old word and issues
;     flat_atomic_cmpswap, followed by a wait and a cache invalidate.
;   * The loop repeats until the value returned by the cmpswap equals the
;     expected old word in every active lane (v_cmp_eq / s_or / s_andn2 on
;     exec / s_cbranch_execnz).
; All prefixes except GFX7 narrow with a v_bfe_u32 / add 0x7fff /
; v_or 0x400000 / v_cmp_u / v_cndmask sequence before taking the upper half
; (presumably round-to-nearest-even with NaN quieting -- confirm against the
; bf16 lowering). The GFX7 checks only shift right by 16 and additionally
; multiply the inputs by 1.0 before the min.
9974define void @flat_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory(ptr %ptr, bfloat %val) #0 {
9975; GFX12-LABEL: flat_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory:
9976; GFX12:       ; %bb.0:
9977; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
9978; GFX12-NEXT:    s_wait_expcnt 0x0
9979; GFX12-NEXT:    s_wait_samplecnt 0x0
9980; GFX12-NEXT:    s_wait_bvhcnt 0x0
9981; GFX12-NEXT:    s_wait_kmcnt 0x0
9982; GFX12-NEXT:    v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2
9983; GFX12-NEXT:    s_mov_b32 s0, 0
9984; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
9985; GFX12-NEXT:    v_and_b32_e32 v0, -4, v3
9986; GFX12-NEXT:    v_and_b32_e32 v3, 3, v3
9987; GFX12-NEXT:    flat_load_b32 v4, v[0:1]
9988; GFX12-NEXT:    v_lshlrev_b32_e32 v5, 3, v3
9989; GFX12-NEXT:    v_lshlrev_b32_e64 v3, v5, 0xffff
9990; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
9991; GFX12-NEXT:    v_not_b32_e32 v6, v3
9992; GFX12-NEXT:  .LBB39_1: ; %atomicrmw.start
9993; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
9994; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
9995; GFX12-NEXT:    v_lshrrev_b32_e32 v3, v5, v4
9996; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
9997; GFX12-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
9998; GFX12-NEXT:    v_min_num_f32_e32 v3, v3, v2
9999; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
10000; GFX12-NEXT:    v_bfe_u32 v7, v3, 16, 1
10001; GFX12-NEXT:    v_or_b32_e32 v8, 0x400000, v3
10002; GFX12-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
10003; GFX12-NEXT:    v_add3_u32 v7, v7, v3, 0x7fff
10004; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
10005; GFX12-NEXT:    v_cndmask_b32_e32 v3, v7, v8, vcc_lo
10006; GFX12-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
10007; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
10008; GFX12-NEXT:    v_lshlrev_b32_e32 v3, v5, v3
10009; GFX12-NEXT:    v_and_or_b32 v3, v4, v6, v3
10010; GFX12-NEXT:    s_wait_storecnt 0x0
10011; GFX12-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
10012; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
10013; GFX12-NEXT:    global_inv scope:SCOPE_DEV
10014; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
10015; GFX12-NEXT:    v_mov_b32_e32 v4, v3
10016; GFX12-NEXT:    s_wait_alu 0xfffe
10017; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
10018; GFX12-NEXT:    s_wait_alu 0xfffe
10019; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
10020; GFX12-NEXT:    s_cbranch_execnz .LBB39_1
10021; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
10022; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
10023; GFX12-NEXT:    s_wait_alu 0xfffe
10024; GFX12-NEXT:    s_setpc_b64 s[30:31]
10025;
10026; GFX940-LABEL: flat_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory:
10027; GFX940:       ; %bb.0:
10028; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10029; GFX940-NEXT:    v_mov_b32_e32 v3, v0
10030; GFX940-NEXT:    v_and_b32_e32 v0, -4, v3
10031; GFX940-NEXT:    flat_load_dword v5, v[0:1]
10032; GFX940-NEXT:    v_and_b32_e32 v3, 3, v3
10033; GFX940-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
10034; GFX940-NEXT:    s_mov_b32 s0, 0xffff
10035; GFX940-NEXT:    v_lshlrev_b32_e64 v4, v3, s0
10036; GFX940-NEXT:    v_not_b32_e32 v6, v4
10037; GFX940-NEXT:    s_mov_b64 s[0:1], 0
10038; GFX940-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
10039; GFX940-NEXT:    s_movk_i32 s2, 0x7fff
10040; GFX940-NEXT:  .LBB39_1: ; %atomicrmw.start
10041; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
10042; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10043; GFX940-NEXT:    v_lshrrev_b32_sdwa v4, v3, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
10044; GFX940-NEXT:    s_nop 0
10045; GFX940-NEXT:    v_min_f32_e32 v4, v4, v2
10046; GFX940-NEXT:    v_bfe_u32 v7, v4, 16, 1
10047; GFX940-NEXT:    v_or_b32_e32 v8, 0x400000, v4
10048; GFX940-NEXT:    v_add3_u32 v7, v7, v4, s2
10049; GFX940-NEXT:    v_cmp_u_f32_e32 vcc, v4, v4
10050; GFX940-NEXT:    s_nop 1
10051; GFX940-NEXT:    v_cndmask_b32_e32 v4, v7, v8, vcc
10052; GFX940-NEXT:    v_lshlrev_b32_sdwa v4, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
10053; GFX940-NEXT:    v_and_or_b32 v4, v5, v6, v4
10054; GFX940-NEXT:    buffer_wbl2 sc1
10055; GFX940-NEXT:    flat_atomic_cmpswap v4, v[0:1], v[4:5] sc0
10056; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10057; GFX940-NEXT:    buffer_inv sc1
10058; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v5
10059; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
10060; GFX940-NEXT:    v_mov_b32_e32 v5, v4
10061; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
10062; GFX940-NEXT:    s_cbranch_execnz .LBB39_1
10063; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
10064; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
10065; GFX940-NEXT:    s_setpc_b64 s[30:31]
10066;
10067; GFX11-LABEL: flat_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory:
10068; GFX11:       ; %bb.0:
10069; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10070; GFX11-NEXT:    v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2
10071; GFX11-NEXT:    s_mov_b32 s0, 0
10072; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
10073; GFX11-NEXT:    v_and_b32_e32 v0, -4, v3
10074; GFX11-NEXT:    v_and_b32_e32 v3, 3, v3
10075; GFX11-NEXT:    flat_load_b32 v4, v[0:1]
10076; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 3, v3
10077; GFX11-NEXT:    v_lshlrev_b32_e64 v3, v5, 0xffff
10078; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
10079; GFX11-NEXT:    v_not_b32_e32 v6, v3
10080; GFX11-NEXT:    .p2align 6
10081; GFX11-NEXT:  .LBB39_1: ; %atomicrmw.start
10082; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
10083; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10084; GFX11-NEXT:    v_lshrrev_b32_e32 v3, v5, v4
10085; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
10086; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
10087; GFX11-NEXT:    v_min_f32_e32 v3, v3, v2
10088; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
10089; GFX11-NEXT:    v_bfe_u32 v7, v3, 16, 1
10090; GFX11-NEXT:    v_or_b32_e32 v8, 0x400000, v3
10091; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
10092; GFX11-NEXT:    v_add3_u32 v7, v7, v3, 0x7fff
10093; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
10094; GFX11-NEXT:    v_cndmask_b32_e32 v3, v7, v8, vcc_lo
10095; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
10096; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
10097; GFX11-NEXT:    v_lshlrev_b32_e32 v3, v5, v3
10098; GFX11-NEXT:    v_and_or_b32 v3, v4, v6, v3
10099; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
10100; GFX11-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc
10101; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10102; GFX11-NEXT:    buffer_gl1_inv
10103; GFX11-NEXT:    buffer_gl0_inv
10104; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
10105; GFX11-NEXT:    v_mov_b32_e32 v4, v3
10106; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
10107; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
10108; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
10109; GFX11-NEXT:    s_cbranch_execnz .LBB39_1
10110; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
10111; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
10112; GFX11-NEXT:    s_setpc_b64 s[30:31]
10113;
10114; GFX10-LABEL: flat_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory:
10115; GFX10:       ; %bb.0:
10116; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10117; GFX10-NEXT:    v_mov_b32_e32 v3, v0
10118; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
10119; GFX10-NEXT:    s_mov_b32 s4, 0
10120; GFX10-NEXT:    v_and_b32_e32 v0, -4, v3
10121; GFX10-NEXT:    v_and_b32_e32 v3, 3, v3
10122; GFX10-NEXT:    flat_load_dword v4, v[0:1]
10123; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 3, v3
10124; GFX10-NEXT:    v_lshlrev_b32_e64 v3, v5, 0xffff
10125; GFX10-NEXT:    v_not_b32_e32 v6, v3
10126; GFX10-NEXT:  .LBB39_1: ; %atomicrmw.start
10127; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
10128; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10129; GFX10-NEXT:    v_lshrrev_b32_sdwa v3, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
10130; GFX10-NEXT:    v_min_f32_e32 v3, v3, v2
10131; GFX10-NEXT:    v_bfe_u32 v7, v3, 16, 1
10132; GFX10-NEXT:    v_or_b32_e32 v8, 0x400000, v3
10133; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
10134; GFX10-NEXT:    v_add3_u32 v7, v7, v3, 0x7fff
10135; GFX10-NEXT:    v_cndmask_b32_e32 v3, v7, v8, vcc_lo
10136; GFX10-NEXT:    v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
10137; GFX10-NEXT:    v_and_or_b32 v3, v4, v6, v3
10138; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
10139; GFX10-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
10140; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10141; GFX10-NEXT:    buffer_gl1_inv
10142; GFX10-NEXT:    buffer_gl0_inv
10143; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
10144; GFX10-NEXT:    v_mov_b32_e32 v4, v3
10145; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
10146; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
10147; GFX10-NEXT:    s_cbranch_execnz .LBB39_1
10148; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
10149; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
10150; GFX10-NEXT:    s_setpc_b64 s[30:31]
10151;
10152; GFX90A-LABEL: flat_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory:
10153; GFX90A:       ; %bb.0:
10154; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10155; GFX90A-NEXT:    v_mov_b32_e32 v3, v0
10156; GFX90A-NEXT:    v_and_b32_e32 v0, -4, v3
10157; GFX90A-NEXT:    flat_load_dword v5, v[0:1]
10158; GFX90A-NEXT:    v_and_b32_e32 v3, 3, v3
10159; GFX90A-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
10160; GFX90A-NEXT:    s_mov_b32 s4, 0xffff
10161; GFX90A-NEXT:    v_lshlrev_b32_e64 v4, v3, s4
10162; GFX90A-NEXT:    v_not_b32_e32 v6, v4
10163; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
10164; GFX90A-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
10165; GFX90A-NEXT:    s_movk_i32 s6, 0x7fff
10166; GFX90A-NEXT:  .LBB39_1: ; %atomicrmw.start
10167; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
10168; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10169; GFX90A-NEXT:    v_lshrrev_b32_sdwa v4, v3, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
10170; GFX90A-NEXT:    v_min_f32_e32 v4, v4, v2
10171; GFX90A-NEXT:    v_bfe_u32 v7, v4, 16, 1
10172; GFX90A-NEXT:    v_or_b32_e32 v8, 0x400000, v4
10173; GFX90A-NEXT:    v_add3_u32 v7, v7, v4, s6
10174; GFX90A-NEXT:    v_cmp_u_f32_e32 vcc, v4, v4
10175; GFX90A-NEXT:    v_cndmask_b32_e32 v4, v7, v8, vcc
10176; GFX90A-NEXT:    v_lshlrev_b32_sdwa v4, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
10177; GFX90A-NEXT:    v_and_or_b32 v4, v5, v6, v4
10178; GFX90A-NEXT:    flat_atomic_cmpswap v4, v[0:1], v[4:5] glc
10179; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10180; GFX90A-NEXT:    buffer_wbinvl1
10181; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v5
10182; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
10183; GFX90A-NEXT:    v_mov_b32_e32 v5, v4
10184; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
10185; GFX90A-NEXT:    s_cbranch_execnz .LBB39_1
10186; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
10187; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
10188; GFX90A-NEXT:    s_setpc_b64 s[30:31]
10189;
10190; GFX908-LABEL: flat_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory:
10191; GFX908:       ; %bb.0:
10192; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10193; GFX908-NEXT:    v_mov_b32_e32 v3, v0
10194; GFX908-NEXT:    v_and_b32_e32 v0, -4, v3
10195; GFX908-NEXT:    flat_load_dword v4, v[0:1]
10196; GFX908-NEXT:    v_and_b32_e32 v3, 3, v3
10197; GFX908-NEXT:    v_lshlrev_b32_e32 v5, 3, v3
10198; GFX908-NEXT:    s_mov_b32 s4, 0xffff
10199; GFX908-NEXT:    v_lshlrev_b32_e64 v3, v5, s4
10200; GFX908-NEXT:    v_not_b32_e32 v6, v3
10201; GFX908-NEXT:    s_mov_b64 s[4:5], 0
10202; GFX908-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
10203; GFX908-NEXT:    s_movk_i32 s6, 0x7fff
10204; GFX908-NEXT:  .LBB39_1: ; %atomicrmw.start
10205; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
10206; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10207; GFX908-NEXT:    v_lshrrev_b32_sdwa v3, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
10208; GFX908-NEXT:    v_min_f32_e32 v3, v3, v2
10209; GFX908-NEXT:    v_bfe_u32 v7, v3, 16, 1
10210; GFX908-NEXT:    v_or_b32_e32 v8, 0x400000, v3
10211; GFX908-NEXT:    v_add3_u32 v7, v7, v3, s6
10212; GFX908-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
10213; GFX908-NEXT:    v_cndmask_b32_e32 v3, v7, v8, vcc
10214; GFX908-NEXT:    v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
10215; GFX908-NEXT:    v_and_or_b32 v3, v4, v6, v3
10216; GFX908-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
10217; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10218; GFX908-NEXT:    buffer_wbinvl1
10219; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
10220; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
10221; GFX908-NEXT:    v_mov_b32_e32 v4, v3
10222; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
10223; GFX908-NEXT:    s_cbranch_execnz .LBB39_1
10224; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
10225; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
10226; GFX908-NEXT:    s_setpc_b64 s[30:31]
10227;
10228; GFX8-LABEL: flat_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory:
10229; GFX8:       ; %bb.0:
10230; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10231; GFX8-NEXT:    v_mov_b32_e32 v3, v0
10232; GFX8-NEXT:    v_and_b32_e32 v0, -4, v3
10233; GFX8-NEXT:    flat_load_dword v4, v[0:1]
10234; GFX8-NEXT:    v_and_b32_e32 v3, 3, v3
10235; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 3, v3
10236; GFX8-NEXT:    s_mov_b32 s4, 0xffff
10237; GFX8-NEXT:    v_lshlrev_b32_e64 v3, v5, s4
10238; GFX8-NEXT:    v_not_b32_e32 v6, v3
10239; GFX8-NEXT:    s_mov_b64 s[4:5], 0
10240; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
10241; GFX8-NEXT:  .LBB39_1: ; %atomicrmw.start
10242; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
10243; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10244; GFX8-NEXT:    v_lshrrev_b32_sdwa v3, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
10245; GFX8-NEXT:    v_min_f32_e32 v3, v3, v2
10246; GFX8-NEXT:    v_bfe_u32 v8, v3, 16, 1
10247; GFX8-NEXT:    v_add_u32_e32 v8, vcc, v8, v3
10248; GFX8-NEXT:    v_add_u32_e32 v8, vcc, 0x7fff, v8
10249; GFX8-NEXT:    v_or_b32_e32 v9, 0x400000, v3
10250; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
10251; GFX8-NEXT:    v_cndmask_b32_e32 v3, v8, v9, vcc
10252; GFX8-NEXT:    v_and_b32_e32 v7, v4, v6
10253; GFX8-NEXT:    v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
10254; GFX8-NEXT:    v_or_b32_e32 v3, v7, v3
10255; GFX8-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
10256; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10257; GFX8-NEXT:    buffer_wbinvl1
10258; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
10259; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
10260; GFX8-NEXT:    v_mov_b32_e32 v4, v3
10261; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
10262; GFX8-NEXT:    s_cbranch_execnz .LBB39_1
10263; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
10264; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
10265; GFX8-NEXT:    s_setpc_b64 s[30:31]
10266;
10267; GFX7-LABEL: flat_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory:
10268; GFX7:       ; %bb.0:
10269; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10270; GFX7-NEXT:    v_mov_b32_e32 v3, v0
10271; GFX7-NEXT:    v_and_b32_e32 v0, -4, v3
10272; GFX7-NEXT:    flat_load_dword v4, v[0:1]
10273; GFX7-NEXT:    v_and_b32_e32 v3, 3, v3
10274; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 3, v3
10275; GFX7-NEXT:    v_lshl_b32_e32 v3, 0xffff, v5
10276; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
10277; GFX7-NEXT:    v_not_b32_e32 v6, v3
10278; GFX7-NEXT:    s_mov_b64 s[4:5], 0
10279; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
10280; GFX7-NEXT:  .LBB39_1: ; %atomicrmw.start
10281; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
10282; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10283; GFX7-NEXT:    v_lshrrev_b32_e32 v3, v5, v4
10284; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
10285; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v3
10286; GFX7-NEXT:    v_min_f32_e32 v3, v3, v2
10287; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
10288; GFX7-NEXT:    v_and_b32_e32 v7, v4, v6
10289; GFX7-NEXT:    v_lshlrev_b32_e32 v3, v5, v3
10290; GFX7-NEXT:    v_or_b32_e32 v3, v7, v3
10291; GFX7-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
10292; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10293; GFX7-NEXT:    buffer_wbinvl1
10294; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
10295; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
10296; GFX7-NEXT:    v_mov_b32_e32 v4, v3
10297; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
10298; GFX7-NEXT:    s_cbranch_execnz .LBB39_1
10299; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
10300; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
10301; GFX7-NEXT:    s_setpc_b64 s[30:31]
  ; The atomicrmw result is bound to %unused and never read; the function
  ; returns void.
10302  %unused = atomicrmw fmin ptr %ptr, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
10303  ret void
10304}
10305
; Test case: bfloat atomicrmw fmin through a flat pointer at agent syncscope
; with seq_cst ordering, where the result is unused (the noret form) and the
; address is %ptr plus 1023 bfloat elements.
; The expected output for every checked target below adds 0x7fe (1023 * 2
; bytes) to the pointer, masks the low address bits with -4 to load the
; containing dword, and then retries a 32-bit flat cmpswap inside the
; atomicrmw.start loop until the returned dword equals the expected old dword.
; The name suggests a positive offset that fits in 12 bits - TODO confirm
; against the naming scheme used by the rest of the file.
; The check lines in this function are autogenerated by
; utils/update_llc_test_checks.py - regenerate them instead of hand-editing.
10306define void @flat_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr %ptr, bfloat %val) #0 {
10307; GFX12-LABEL: flat_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
10308; GFX12:       ; %bb.0:
10309; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
10310; GFX12-NEXT:    s_wait_expcnt 0x0
10311; GFX12-NEXT:    s_wait_samplecnt 0x0
10312; GFX12-NEXT:    s_wait_bvhcnt 0x0
10313; GFX12-NEXT:    s_wait_kmcnt 0x0
10314; GFX12-NEXT:    v_add_co_u32 v4, vcc_lo, 0x7fe, v0
10315; GFX12-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
10316; GFX12-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
10317; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
10318; GFX12-NEXT:    v_and_b32_e32 v0, -4, v4
10319; GFX12-NEXT:    v_and_b32_e32 v4, 3, v4
10320; GFX12-NEXT:    s_mov_b32 s0, 0
10321; GFX12-NEXT:    flat_load_b32 v3, v[0:1]
10322; GFX12-NEXT:    v_lshlrev_b32_e32 v4, 3, v4
10323; GFX12-NEXT:    v_lshlrev_b32_e64 v5, v4, 0xffff
10324; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
10325; GFX12-NEXT:    v_not_b32_e32 v5, v5
10326; GFX12-NEXT:  .LBB40_1: ; %atomicrmw.start
10327; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
10328; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
10329; GFX12-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
10330; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
10331; GFX12-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
10332; GFX12-NEXT:    v_min_num_f32_e32 v2, v2, v6
10333; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
10334; GFX12-NEXT:    v_bfe_u32 v7, v2, 16, 1
10335; GFX12-NEXT:    v_or_b32_e32 v8, 0x400000, v2
10336; GFX12-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
10337; GFX12-NEXT:    v_add3_u32 v7, v7, v2, 0x7fff
10338; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
10339; GFX12-NEXT:    v_cndmask_b32_e32 v2, v7, v8, vcc_lo
10340; GFX12-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
10341; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
10342; GFX12-NEXT:    v_lshlrev_b32_e32 v2, v4, v2
10343; GFX12-NEXT:    v_and_or_b32 v2, v3, v5, v2
10344; GFX12-NEXT:    s_wait_storecnt 0x0
10345; GFX12-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
10346; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
10347; GFX12-NEXT:    global_inv scope:SCOPE_DEV
10348; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
10349; GFX12-NEXT:    v_mov_b32_e32 v3, v2
10350; GFX12-NEXT:    s_wait_alu 0xfffe
10351; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
10352; GFX12-NEXT:    s_wait_alu 0xfffe
10353; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
10354; GFX12-NEXT:    s_cbranch_execnz .LBB40_1
10355; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
10356; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
10357; GFX12-NEXT:    s_wait_alu 0xfffe
10358; GFX12-NEXT:    s_setpc_b64 s[30:31]
10359;
10360; GFX940-LABEL: flat_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
10361; GFX940:       ; %bb.0:
10362; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10363; GFX940-NEXT:    s_mov_b64 s[0:1], 0x7fe
10364; GFX940-NEXT:    v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
10365; GFX940-NEXT:    v_and_b32_e32 v0, -4, v4
10366; GFX940-NEXT:    v_mov_b32_e32 v1, v5
10367; GFX940-NEXT:    flat_load_dword v3, v[0:1]
10368; GFX940-NEXT:    v_and_b32_e32 v4, 3, v4
10369; GFX940-NEXT:    v_lshlrev_b32_e32 v4, 3, v4
10370; GFX940-NEXT:    s_mov_b32 s0, 0xffff
10371; GFX940-NEXT:    v_lshlrev_b32_e64 v5, v4, s0
10372; GFX940-NEXT:    v_not_b32_e32 v5, v5
10373; GFX940-NEXT:    s_mov_b64 s[0:1], 0
10374; GFX940-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
10375; GFX940-NEXT:    s_movk_i32 s2, 0x7fff
10376; GFX940-NEXT:  .LBB40_1: ; %atomicrmw.start
10377; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
10378; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10379; GFX940-NEXT:    v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
10380; GFX940-NEXT:    s_nop 0
10381; GFX940-NEXT:    v_min_f32_e32 v2, v2, v6
10382; GFX940-NEXT:    v_bfe_u32 v7, v2, 16, 1
10383; GFX940-NEXT:    v_or_b32_e32 v8, 0x400000, v2
10384; GFX940-NEXT:    v_add3_u32 v7, v7, v2, s2
10385; GFX940-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
10386; GFX940-NEXT:    s_nop 1
10387; GFX940-NEXT:    v_cndmask_b32_e32 v2, v7, v8, vcc
10388; GFX940-NEXT:    v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
10389; GFX940-NEXT:    v_and_or_b32 v2, v3, v5, v2
10390; GFX940-NEXT:    buffer_wbl2 sc1
10391; GFX940-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0
10392; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10393; GFX940-NEXT:    buffer_inv sc1
10394; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
10395; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
10396; GFX940-NEXT:    v_mov_b32_e32 v3, v2
10397; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
10398; GFX940-NEXT:    s_cbranch_execnz .LBB40_1
10399; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
10400; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
10401; GFX940-NEXT:    s_setpc_b64 s[30:31]
10402;
10403; GFX11-LABEL: flat_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
10404; GFX11:       ; %bb.0:
10405; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10406; GFX11-NEXT:    v_add_co_u32 v4, vcc_lo, 0x7fe, v0
10407; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
10408; GFX11-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
10409; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
10410; GFX11-NEXT:    v_and_b32_e32 v0, -4, v4
10411; GFX11-NEXT:    v_and_b32_e32 v4, 3, v4
10412; GFX11-NEXT:    s_mov_b32 s0, 0
10413; GFX11-NEXT:    flat_load_b32 v3, v[0:1]
10414; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 3, v4
10415; GFX11-NEXT:    v_lshlrev_b32_e64 v5, v4, 0xffff
10416; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
10417; GFX11-NEXT:    v_not_b32_e32 v5, v5
10418; GFX11-NEXT:    .p2align 6
10419; GFX11-NEXT:  .LBB40_1: ; %atomicrmw.start
10420; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
10421; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10422; GFX11-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
10423; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
10424; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
10425; GFX11-NEXT:    v_min_f32_e32 v2, v2, v6
10426; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
10427; GFX11-NEXT:    v_bfe_u32 v7, v2, 16, 1
10428; GFX11-NEXT:    v_or_b32_e32 v8, 0x400000, v2
10429; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
10430; GFX11-NEXT:    v_add3_u32 v7, v7, v2, 0x7fff
10431; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
10432; GFX11-NEXT:    v_cndmask_b32_e32 v2, v7, v8, vcc_lo
10433; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
10434; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
10435; GFX11-NEXT:    v_lshlrev_b32_e32 v2, v4, v2
10436; GFX11-NEXT:    v_and_or_b32 v2, v3, v5, v2
10437; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
10438; GFX11-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc
10439; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10440; GFX11-NEXT:    buffer_gl1_inv
10441; GFX11-NEXT:    buffer_gl0_inv
10442; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
10443; GFX11-NEXT:    v_mov_b32_e32 v3, v2
10444; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
10445; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
10446; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
10447; GFX11-NEXT:    s_cbranch_execnz .LBB40_1
10448; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
10449; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
10450; GFX11-NEXT:    s_setpc_b64 s[30:31]
10451;
10452; GFX10-LABEL: flat_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
10453; GFX10:       ; %bb.0:
10454; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10455; GFX10-NEXT:    v_add_co_u32 v4, vcc_lo, 0x7fe, v0
10456; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
10457; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
10458; GFX10-NEXT:    v_and_b32_e32 v0, -4, v4
10459; GFX10-NEXT:    v_and_b32_e32 v4, 3, v4
10460; GFX10-NEXT:    s_mov_b32 s4, 0
10461; GFX10-NEXT:    flat_load_dword v3, v[0:1]
10462; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 3, v4
10463; GFX10-NEXT:    v_lshlrev_b32_e64 v5, v4, 0xffff
10464; GFX10-NEXT:    v_not_b32_e32 v5, v5
10465; GFX10-NEXT:  .LBB40_1: ; %atomicrmw.start
10466; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
10467; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10468; GFX10-NEXT:    v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
10469; GFX10-NEXT:    v_min_f32_e32 v2, v2, v6
10470; GFX10-NEXT:    v_bfe_u32 v7, v2, 16, 1
10471; GFX10-NEXT:    v_or_b32_e32 v8, 0x400000, v2
10472; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
10473; GFX10-NEXT:    v_add3_u32 v7, v7, v2, 0x7fff
10474; GFX10-NEXT:    v_cndmask_b32_e32 v2, v7, v8, vcc_lo
10475; GFX10-NEXT:    v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
10476; GFX10-NEXT:    v_and_or_b32 v2, v3, v5, v2
10477; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
10478; GFX10-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
10479; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10480; GFX10-NEXT:    buffer_gl1_inv
10481; GFX10-NEXT:    buffer_gl0_inv
10482; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
10483; GFX10-NEXT:    v_mov_b32_e32 v3, v2
10484; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
10485; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
10486; GFX10-NEXT:    s_cbranch_execnz .LBB40_1
10487; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
10488; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
10489; GFX10-NEXT:    s_setpc_b64 s[30:31]
10490;
10491; GFX90A-LABEL: flat_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
10492; GFX90A:       ; %bb.0:
10493; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10494; GFX90A-NEXT:    v_add_co_u32_e32 v4, vcc, 0x7fe, v0
10495; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
10496; GFX90A-NEXT:    v_and_b32_e32 v0, -4, v4
10497; GFX90A-NEXT:    flat_load_dword v3, v[0:1]
10498; GFX90A-NEXT:    v_and_b32_e32 v4, 3, v4
10499; GFX90A-NEXT:    v_lshlrev_b32_e32 v4, 3, v4
10500; GFX90A-NEXT:    s_mov_b32 s4, 0xffff
10501; GFX90A-NEXT:    v_lshlrev_b32_e64 v5, v4, s4
10502; GFX90A-NEXT:    v_not_b32_e32 v5, v5
10503; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
10504; GFX90A-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
10505; GFX90A-NEXT:    s_movk_i32 s6, 0x7fff
10506; GFX90A-NEXT:  .LBB40_1: ; %atomicrmw.start
10507; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
10508; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10509; GFX90A-NEXT:    v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
10510; GFX90A-NEXT:    v_min_f32_e32 v2, v2, v6
10511; GFX90A-NEXT:    v_bfe_u32 v7, v2, 16, 1
10512; GFX90A-NEXT:    v_or_b32_e32 v8, 0x400000, v2
10513; GFX90A-NEXT:    v_add3_u32 v7, v7, v2, s6
10514; GFX90A-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
10515; GFX90A-NEXT:    v_cndmask_b32_e32 v2, v7, v8, vcc
10516; GFX90A-NEXT:    v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
10517; GFX90A-NEXT:    v_and_or_b32 v2, v3, v5, v2
10518; GFX90A-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
10519; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10520; GFX90A-NEXT:    buffer_wbinvl1
10521; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
10522; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
10523; GFX90A-NEXT:    v_mov_b32_e32 v3, v2
10524; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
10525; GFX90A-NEXT:    s_cbranch_execnz .LBB40_1
10526; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
10527; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
10528; GFX90A-NEXT:    s_setpc_b64 s[30:31]
10529;
10530; GFX908-LABEL: flat_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
10531; GFX908:       ; %bb.0:
10532; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10533; GFX908-NEXT:    v_add_co_u32_e32 v4, vcc, 0x7fe, v0
10534; GFX908-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
10535; GFX908-NEXT:    v_and_b32_e32 v0, -4, v4
10536; GFX908-NEXT:    flat_load_dword v3, v[0:1]
10537; GFX908-NEXT:    v_and_b32_e32 v4, 3, v4
10538; GFX908-NEXT:    v_lshlrev_b32_e32 v4, 3, v4
10539; GFX908-NEXT:    s_mov_b32 s4, 0xffff
10540; GFX908-NEXT:    v_lshlrev_b32_e64 v5, v4, s4
10541; GFX908-NEXT:    v_not_b32_e32 v5, v5
10542; GFX908-NEXT:    s_mov_b64 s[4:5], 0
10543; GFX908-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
10544; GFX908-NEXT:    s_movk_i32 s6, 0x7fff
10545; GFX908-NEXT:  .LBB40_1: ; %atomicrmw.start
10546; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
10547; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10548; GFX908-NEXT:    v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
10549; GFX908-NEXT:    v_min_f32_e32 v2, v2, v6
10550; GFX908-NEXT:    v_bfe_u32 v7, v2, 16, 1
10551; GFX908-NEXT:    v_or_b32_e32 v8, 0x400000, v2
10552; GFX908-NEXT:    v_add3_u32 v7, v7, v2, s6
10553; GFX908-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
10554; GFX908-NEXT:    v_cndmask_b32_e32 v2, v7, v8, vcc
10555; GFX908-NEXT:    v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
10556; GFX908-NEXT:    v_and_or_b32 v2, v3, v5, v2
10557; GFX908-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
10558; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10559; GFX908-NEXT:    buffer_wbinvl1
10560; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
10561; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
10562; GFX908-NEXT:    v_mov_b32_e32 v3, v2
10563; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
10564; GFX908-NEXT:    s_cbranch_execnz .LBB40_1
10565; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
10566; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
10567; GFX908-NEXT:    s_setpc_b64 s[30:31]
10568;
10569; GFX8-LABEL: flat_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
10570; GFX8:       ; %bb.0:
10571; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10572; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 0x7fe, v0
10573; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
10574; GFX8-NEXT:    v_and_b32_e32 v0, -4, v4
10575; GFX8-NEXT:    flat_load_dword v3, v[0:1]
10576; GFX8-NEXT:    v_and_b32_e32 v4, 3, v4
10577; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 3, v4
10578; GFX8-NEXT:    s_mov_b32 s4, 0xffff
10579; GFX8-NEXT:    v_lshlrev_b32_e64 v5, v4, s4
10580; GFX8-NEXT:    v_not_b32_e32 v5, v5
10581; GFX8-NEXT:    s_mov_b64 s[4:5], 0
10582; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
10583; GFX8-NEXT:  .LBB40_1: ; %atomicrmw.start
10584; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
10585; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10586; GFX8-NEXT:    v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
10587; GFX8-NEXT:    v_min_f32_e32 v2, v2, v6
10588; GFX8-NEXT:    v_bfe_u32 v8, v2, 16, 1
10589; GFX8-NEXT:    v_add_u32_e32 v8, vcc, v8, v2
10590; GFX8-NEXT:    v_add_u32_e32 v8, vcc, 0x7fff, v8
10591; GFX8-NEXT:    v_or_b32_e32 v9, 0x400000, v2
10592; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
10593; GFX8-NEXT:    v_cndmask_b32_e32 v2, v8, v9, vcc
10594; GFX8-NEXT:    v_and_b32_e32 v7, v3, v5
10595; GFX8-NEXT:    v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
10596; GFX8-NEXT:    v_or_b32_e32 v2, v7, v2
10597; GFX8-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
10598; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10599; GFX8-NEXT:    buffer_wbinvl1
10600; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
10601; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
10602; GFX8-NEXT:    v_mov_b32_e32 v3, v2
10603; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
10604; GFX8-NEXT:    s_cbranch_execnz .LBB40_1
10605; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
10606; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
10607; GFX8-NEXT:    s_setpc_b64 s[30:31]
10608;
10609; GFX7-LABEL: flat_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
10610; GFX7:       ; %bb.0:
10611; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10612; GFX7-NEXT:    v_add_i32_e32 v4, vcc, 0x7fe, v0
10613; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
10614; GFX7-NEXT:    v_and_b32_e32 v0, -4, v4
10615; GFX7-NEXT:    flat_load_dword v3, v[0:1]
10616; GFX7-NEXT:    v_and_b32_e32 v4, 3, v4
10617; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 3, v4
10618; GFX7-NEXT:    v_lshl_b32_e32 v5, 0xffff, v4
10619; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
10620; GFX7-NEXT:    v_not_b32_e32 v5, v5
10621; GFX7-NEXT:    s_mov_b64 s[4:5], 0
10622; GFX7-NEXT:    v_and_b32_e32 v6, 0xffff0000, v2
10623; GFX7-NEXT:  .LBB40_1: ; %atomicrmw.start
10624; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
10625; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10626; GFX7-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
10627; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
10628; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
10629; GFX7-NEXT:    v_min_f32_e32 v2, v2, v6
10630; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
10631; GFX7-NEXT:    v_and_b32_e32 v7, v3, v5
10632; GFX7-NEXT:    v_lshlrev_b32_e32 v2, v4, v2
10633; GFX7-NEXT:    v_or_b32_e32 v2, v7, v2
10634; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
10635; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10636; GFX7-NEXT:    buffer_wbinvl1
10637; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
10638; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
10639; GFX7-NEXT:    v_mov_b32_e32 v3, v2
10640; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
10641; GFX7-NEXT:    s_cbranch_execnz .LBB40_1
10642; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
10643; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
10644; GFX7-NEXT:    s_setpc_b64 s[30:31]
; Address under test: 1023 bfloat elements past %ptr. The expected assembly
; above materializes this as the byte offset 0x7fe.
10645  %gep = getelementptr bfloat, ptr %ptr, i64 1023
; %unused is never read, so this exercises the no-return form of the atomic.
; The attribute group and the metadata node referenced by this function are
; defined outside this function - their contents are not visible here.
10646  %unused = atomicrmw fmin ptr %gep, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
10647  ret void
10648}
10649
10650define void @flat_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr %ptr, bfloat %val) #0 {
10651; GFX12-LABEL: flat_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
10652; GFX12:       ; %bb.0:
10653; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
10654; GFX12-NEXT:    s_wait_expcnt 0x0
10655; GFX12-NEXT:    s_wait_samplecnt 0x0
10656; GFX12-NEXT:    s_wait_bvhcnt 0x0
10657; GFX12-NEXT:    s_wait_kmcnt 0x0
10658; GFX12-NEXT:    v_add_co_u32 v4, vcc_lo, 0xfffff800, v0
10659; GFX12-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
10660; GFX12-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
10661; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
10662; GFX12-NEXT:    v_and_b32_e32 v0, -4, v4
10663; GFX12-NEXT:    v_and_b32_e32 v4, 3, v4
10664; GFX12-NEXT:    s_mov_b32 s0, 0
10665; GFX12-NEXT:    flat_load_b32 v3, v[0:1]
10666; GFX12-NEXT:    v_lshlrev_b32_e32 v4, 3, v4
10667; GFX12-NEXT:    v_lshlrev_b32_e64 v5, v4, 0xffff
10668; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
10669; GFX12-NEXT:    v_not_b32_e32 v5, v5
10670; GFX12-NEXT:  .LBB41_1: ; %atomicrmw.start
10671; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
10672; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
10673; GFX12-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
10674; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
10675; GFX12-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
10676; GFX12-NEXT:    v_min_num_f32_e32 v2, v2, v6
10677; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
10678; GFX12-NEXT:    v_bfe_u32 v7, v2, 16, 1
10679; GFX12-NEXT:    v_or_b32_e32 v8, 0x400000, v2
10680; GFX12-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
10681; GFX12-NEXT:    v_add3_u32 v7, v7, v2, 0x7fff
10682; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
10683; GFX12-NEXT:    v_cndmask_b32_e32 v2, v7, v8, vcc_lo
10684; GFX12-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
10685; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
10686; GFX12-NEXT:    v_lshlrev_b32_e32 v2, v4, v2
10687; GFX12-NEXT:    v_and_or_b32 v2, v3, v5, v2
10688; GFX12-NEXT:    s_wait_storecnt 0x0
10689; GFX12-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
10690; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
10691; GFX12-NEXT:    global_inv scope:SCOPE_DEV
10692; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
10693; GFX12-NEXT:    v_mov_b32_e32 v3, v2
10694; GFX12-NEXT:    s_wait_alu 0xfffe
10695; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
10696; GFX12-NEXT:    s_wait_alu 0xfffe
10697; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
10698; GFX12-NEXT:    s_cbranch_execnz .LBB41_1
10699; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
10700; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
10701; GFX12-NEXT:    s_wait_alu 0xfffe
10702; GFX12-NEXT:    s_setpc_b64 s[30:31]
10703;
10704; GFX940-LABEL: flat_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
10705; GFX940:       ; %bb.0:
10706; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10707; GFX940-NEXT:    s_movk_i32 s0, 0xf800
10708; GFX940-NEXT:    s_mov_b32 s1, -1
10709; GFX940-NEXT:    v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
10710; GFX940-NEXT:    v_and_b32_e32 v0, -4, v4
10711; GFX940-NEXT:    v_mov_b32_e32 v1, v5
10712; GFX940-NEXT:    flat_load_dword v3, v[0:1]
10713; GFX940-NEXT:    v_and_b32_e32 v4, 3, v4
10714; GFX940-NEXT:    v_lshlrev_b32_e32 v4, 3, v4
10715; GFX940-NEXT:    s_mov_b32 s0, 0xffff
10716; GFX940-NEXT:    v_lshlrev_b32_e64 v5, v4, s0
10717; GFX940-NEXT:    v_not_b32_e32 v5, v5
10718; GFX940-NEXT:    s_mov_b64 s[0:1], 0
10719; GFX940-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
10720; GFX940-NEXT:    s_movk_i32 s2, 0x7fff
10721; GFX940-NEXT:  .LBB41_1: ; %atomicrmw.start
10722; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
10723; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10724; GFX940-NEXT:    v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
10725; GFX940-NEXT:    s_nop 0
10726; GFX940-NEXT:    v_min_f32_e32 v2, v2, v6
10727; GFX940-NEXT:    v_bfe_u32 v7, v2, 16, 1
10728; GFX940-NEXT:    v_or_b32_e32 v8, 0x400000, v2
10729; GFX940-NEXT:    v_add3_u32 v7, v7, v2, s2
10730; GFX940-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
10731; GFX940-NEXT:    s_nop 1
10732; GFX940-NEXT:    v_cndmask_b32_e32 v2, v7, v8, vcc
10733; GFX940-NEXT:    v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
10734; GFX940-NEXT:    v_and_or_b32 v2, v3, v5, v2
10735; GFX940-NEXT:    buffer_wbl2 sc1
10736; GFX940-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0
10737; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10738; GFX940-NEXT:    buffer_inv sc1
10739; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
10740; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
10741; GFX940-NEXT:    v_mov_b32_e32 v3, v2
10742; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
10743; GFX940-NEXT:    s_cbranch_execnz .LBB41_1
10744; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
10745; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
10746; GFX940-NEXT:    s_setpc_b64 s[30:31]
10747;
10748; GFX11-LABEL: flat_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
10749; GFX11:       ; %bb.0:
10750; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10751; GFX11-NEXT:    v_add_co_u32 v4, vcc_lo, 0xfffff800, v0
10752; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
10753; GFX11-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
10754; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
10755; GFX11-NEXT:    v_and_b32_e32 v0, -4, v4
10756; GFX11-NEXT:    v_and_b32_e32 v4, 3, v4
10757; GFX11-NEXT:    s_mov_b32 s0, 0
10758; GFX11-NEXT:    flat_load_b32 v3, v[0:1]
10759; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 3, v4
10760; GFX11-NEXT:    v_lshlrev_b32_e64 v5, v4, 0xffff
10761; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
10762; GFX11-NEXT:    v_not_b32_e32 v5, v5
10763; GFX11-NEXT:    .p2align 6
10764; GFX11-NEXT:  .LBB41_1: ; %atomicrmw.start
10765; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
10766; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10767; GFX11-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
10768; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
10769; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
10770; GFX11-NEXT:    v_min_f32_e32 v2, v2, v6
10771; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
10772; GFX11-NEXT:    v_bfe_u32 v7, v2, 16, 1
10773; GFX11-NEXT:    v_or_b32_e32 v8, 0x400000, v2
10774; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
10775; GFX11-NEXT:    v_add3_u32 v7, v7, v2, 0x7fff
10776; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
10777; GFX11-NEXT:    v_cndmask_b32_e32 v2, v7, v8, vcc_lo
10778; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
10779; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
10780; GFX11-NEXT:    v_lshlrev_b32_e32 v2, v4, v2
10781; GFX11-NEXT:    v_and_or_b32 v2, v3, v5, v2
10782; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
10783; GFX11-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc
10784; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10785; GFX11-NEXT:    buffer_gl1_inv
10786; GFX11-NEXT:    buffer_gl0_inv
10787; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
10788; GFX11-NEXT:    v_mov_b32_e32 v3, v2
10789; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
10790; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
10791; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
10792; GFX11-NEXT:    s_cbranch_execnz .LBB41_1
10793; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
10794; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
10795; GFX11-NEXT:    s_setpc_b64 s[30:31]
10796;
10797; GFX10-LABEL: flat_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
10798; GFX10:       ; %bb.0:
10799; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10800; GFX10-NEXT:    v_add_co_u32 v4, vcc_lo, 0xfffff800, v0
10801; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
10802; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
10803; GFX10-NEXT:    v_and_b32_e32 v0, -4, v4
10804; GFX10-NEXT:    v_and_b32_e32 v4, 3, v4
10805; GFX10-NEXT:    s_mov_b32 s4, 0
10806; GFX10-NEXT:    flat_load_dword v3, v[0:1]
10807; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 3, v4
10808; GFX10-NEXT:    v_lshlrev_b32_e64 v5, v4, 0xffff
10809; GFX10-NEXT:    v_not_b32_e32 v5, v5
10810; GFX10-NEXT:  .LBB41_1: ; %atomicrmw.start
10811; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
10812; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10813; GFX10-NEXT:    v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
10814; GFX10-NEXT:    v_min_f32_e32 v2, v2, v6
10815; GFX10-NEXT:    v_bfe_u32 v7, v2, 16, 1
10816; GFX10-NEXT:    v_or_b32_e32 v8, 0x400000, v2
10817; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
10818; GFX10-NEXT:    v_add3_u32 v7, v7, v2, 0x7fff
10819; GFX10-NEXT:    v_cndmask_b32_e32 v2, v7, v8, vcc_lo
10820; GFX10-NEXT:    v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
10821; GFX10-NEXT:    v_and_or_b32 v2, v3, v5, v2
10822; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
10823; GFX10-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
10824; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10825; GFX10-NEXT:    buffer_gl1_inv
10826; GFX10-NEXT:    buffer_gl0_inv
10827; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
10828; GFX10-NEXT:    v_mov_b32_e32 v3, v2
10829; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
10830; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
10831; GFX10-NEXT:    s_cbranch_execnz .LBB41_1
10832; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
10833; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
10834; GFX10-NEXT:    s_setpc_b64 s[30:31]
10835;
10836; GFX90A-LABEL: flat_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
10837; GFX90A:       ; %bb.0:
10838; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10839; GFX90A-NEXT:    v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
10840; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
10841; GFX90A-NEXT:    v_and_b32_e32 v0, -4, v4
10842; GFX90A-NEXT:    flat_load_dword v3, v[0:1]
10843; GFX90A-NEXT:    v_and_b32_e32 v4, 3, v4
10844; GFX90A-NEXT:    v_lshlrev_b32_e32 v4, 3, v4
10845; GFX90A-NEXT:    s_mov_b32 s4, 0xffff
10846; GFX90A-NEXT:    v_lshlrev_b32_e64 v5, v4, s4
10847; GFX90A-NEXT:    v_not_b32_e32 v5, v5
10848; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
10849; GFX90A-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
10850; GFX90A-NEXT:    s_movk_i32 s6, 0x7fff
10851; GFX90A-NEXT:  .LBB41_1: ; %atomicrmw.start
10852; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
10853; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10854; GFX90A-NEXT:    v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
10855; GFX90A-NEXT:    v_min_f32_e32 v2, v2, v6
10856; GFX90A-NEXT:    v_bfe_u32 v7, v2, 16, 1
10857; GFX90A-NEXT:    v_or_b32_e32 v8, 0x400000, v2
10858; GFX90A-NEXT:    v_add3_u32 v7, v7, v2, s6
10859; GFX90A-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
10860; GFX90A-NEXT:    v_cndmask_b32_e32 v2, v7, v8, vcc
10861; GFX90A-NEXT:    v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
10862; GFX90A-NEXT:    v_and_or_b32 v2, v3, v5, v2
10863; GFX90A-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
10864; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10865; GFX90A-NEXT:    buffer_wbinvl1
10866; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
10867; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
10868; GFX90A-NEXT:    v_mov_b32_e32 v3, v2
10869; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
10870; GFX90A-NEXT:    s_cbranch_execnz .LBB41_1
10871; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
10872; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
10873; GFX90A-NEXT:    s_setpc_b64 s[30:31]
10874;
10875; GFX908-LABEL: flat_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
10876; GFX908:       ; %bb.0:
10877; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10878; GFX908-NEXT:    v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
10879; GFX908-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
10880; GFX908-NEXT:    v_and_b32_e32 v0, -4, v4
10881; GFX908-NEXT:    flat_load_dword v3, v[0:1]
10882; GFX908-NEXT:    v_and_b32_e32 v4, 3, v4
10883; GFX908-NEXT:    v_lshlrev_b32_e32 v4, 3, v4
10884; GFX908-NEXT:    s_mov_b32 s4, 0xffff
10885; GFX908-NEXT:    v_lshlrev_b32_e64 v5, v4, s4
10886; GFX908-NEXT:    v_not_b32_e32 v5, v5
10887; GFX908-NEXT:    s_mov_b64 s[4:5], 0
10888; GFX908-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
10889; GFX908-NEXT:    s_movk_i32 s6, 0x7fff
10890; GFX908-NEXT:  .LBB41_1: ; %atomicrmw.start
10891; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
10892; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10893; GFX908-NEXT:    v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
10894; GFX908-NEXT:    v_min_f32_e32 v2, v2, v6
10895; GFX908-NEXT:    v_bfe_u32 v7, v2, 16, 1
10896; GFX908-NEXT:    v_or_b32_e32 v8, 0x400000, v2
10897; GFX908-NEXT:    v_add3_u32 v7, v7, v2, s6
10898; GFX908-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
10899; GFX908-NEXT:    v_cndmask_b32_e32 v2, v7, v8, vcc
10900; GFX908-NEXT:    v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
10901; GFX908-NEXT:    v_and_or_b32 v2, v3, v5, v2
10902; GFX908-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
10903; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10904; GFX908-NEXT:    buffer_wbinvl1
10905; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
10906; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
10907; GFX908-NEXT:    v_mov_b32_e32 v3, v2
10908; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
10909; GFX908-NEXT:    s_cbranch_execnz .LBB41_1
10910; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
10911; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
10912; GFX908-NEXT:    s_setpc_b64 s[30:31]
10913;
10914; GFX8-LABEL: flat_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
10915; GFX8:       ; %bb.0:
10916; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10917; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 0xfffff800, v0
10918; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, -1, v1, vcc
10919; GFX8-NEXT:    v_and_b32_e32 v0, -4, v4
10920; GFX8-NEXT:    flat_load_dword v3, v[0:1]
10921; GFX8-NEXT:    v_and_b32_e32 v4, 3, v4
10922; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 3, v4
10923; GFX8-NEXT:    s_mov_b32 s4, 0xffff
10924; GFX8-NEXT:    v_lshlrev_b32_e64 v5, v4, s4
10925; GFX8-NEXT:    v_not_b32_e32 v5, v5
10926; GFX8-NEXT:    s_mov_b64 s[4:5], 0
10927; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
10928; GFX8-NEXT:  .LBB41_1: ; %atomicrmw.start
10929; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
10930; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10931; GFX8-NEXT:    v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
10932; GFX8-NEXT:    v_min_f32_e32 v2, v2, v6
10933; GFX8-NEXT:    v_bfe_u32 v8, v2, 16, 1
10934; GFX8-NEXT:    v_add_u32_e32 v8, vcc, v8, v2
10935; GFX8-NEXT:    v_add_u32_e32 v8, vcc, 0x7fff, v8
10936; GFX8-NEXT:    v_or_b32_e32 v9, 0x400000, v2
10937; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
10938; GFX8-NEXT:    v_cndmask_b32_e32 v2, v8, v9, vcc
10939; GFX8-NEXT:    v_and_b32_e32 v7, v3, v5
10940; GFX8-NEXT:    v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
10941; GFX8-NEXT:    v_or_b32_e32 v2, v7, v2
10942; GFX8-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
10943; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10944; GFX8-NEXT:    buffer_wbinvl1
10945; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
10946; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
10947; GFX8-NEXT:    v_mov_b32_e32 v3, v2
10948; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
10949; GFX8-NEXT:    s_cbranch_execnz .LBB41_1
10950; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
10951; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
10952; GFX8-NEXT:    s_setpc_b64 s[30:31]
10953;
10954; GFX7-LABEL: flat_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
10955; GFX7:       ; %bb.0:
10956; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10957; GFX7-NEXT:    v_add_i32_e32 v4, vcc, 0xfffff800, v0
10958; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, -1, v1, vcc
10959; GFX7-NEXT:    v_and_b32_e32 v0, -4, v4
10960; GFX7-NEXT:    flat_load_dword v3, v[0:1]
10961; GFX7-NEXT:    v_and_b32_e32 v4, 3, v4
10962; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 3, v4
10963; GFX7-NEXT:    v_lshl_b32_e32 v5, 0xffff, v4
10964; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
10965; GFX7-NEXT:    v_not_b32_e32 v5, v5
10966; GFX7-NEXT:    s_mov_b64 s[4:5], 0
10967; GFX7-NEXT:    v_and_b32_e32 v6, 0xffff0000, v2
10968; GFX7-NEXT:  .LBB41_1: ; %atomicrmw.start
10969; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
10970; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10971; GFX7-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
10972; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
10973; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
10974; GFX7-NEXT:    v_min_f32_e32 v2, v2, v6
10975; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
10976; GFX7-NEXT:    v_and_b32_e32 v7, v3, v5
10977; GFX7-NEXT:    v_lshlrev_b32_e32 v2, v4, v2
10978; GFX7-NEXT:    v_or_b32_e32 v2, v7, v2
10979; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
10980; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
10981; GFX7-NEXT:    buffer_wbinvl1
10982; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
10983; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
10984; GFX7-NEXT:    v_mov_b32_e32 v3, v2
10985; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
10986; GFX7-NEXT:    s_cbranch_execnz .LBB41_1
10987; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
10988; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
10989; GFX7-NEXT:    s_setpc_b64 s[30:31]
10990  %gep = getelementptr bfloat, ptr %ptr, i64 -1024
10991  %unused = atomicrmw fmin ptr %gep, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
10992  ret void
10993}
10994
; Test: agent-scope, seq_cst `atomicrmw fmin` on a bfloat through a flat
; pointer, at a positive offset of 1023 bfloat elements, with an explicit
; `align 4` on the atomic and !amdgpu.no.fine.grained.memory metadata. The
; value returned by the atomicrmw is the function result.
;
; What the autogenerated checks below pin down:
;  - Every run line expands the operation into a 32-bit compare-and-swap loop:
;    a dword load, then flat_atomic_cmpswap repeated until the dword it
;    returns equals the expected one.
;  - No address masking or shift-amount computation is emitted. The bf16
;    operand is taken from the low 16 bits of the loaded dword (shift left by
;    16) and the upper 16 bits are preserved with a 0xffff0000 mask
;    (presumably the effect of `align 4` -- TODO confirm against the variants
;    in this file that omit it).
;  - gfx1200, gfx940, gfx1100, gfx90a and gfx908 fold the byte offset into the
;    memory instructions (offset:2046); gfx1010, tonga and hawaii add 0x7fe to
;    the pointer before the loop.
;  - hawaii narrows the f32 minimum back to 16 bits with a plain right shift
;    by 16 rather than the bfe / add 0x7fff / NaN-select sequence used by the
;    other run lines, and shifts the final value left by 16 before returning.
;
; The check lines come from utils/update_llc_test_checks.py; regenerate them
; with that script instead of editing them by hand.
10995define bfloat @flat_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory(ptr %ptr, bfloat %val) #0 {
10996; GFX12-LABEL: flat_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
10997; GFX12:       ; %bb.0:
10998; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
10999; GFX12-NEXT:    s_wait_expcnt 0x0
11000; GFX12-NEXT:    s_wait_samplecnt 0x0
11001; GFX12-NEXT:    s_wait_bvhcnt 0x0
11002; GFX12-NEXT:    s_wait_kmcnt 0x0
11003; GFX12-NEXT:    flat_load_b32 v3, v[0:1] offset:2046
11004; GFX12-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
11005; GFX12-NEXT:    s_mov_b32 s0, 0
11006; GFX12-NEXT:  .LBB42_1: ; %atomicrmw.start
11007; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
11008; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
11009; GFX12-NEXT:    v_mov_b32_e32 v4, v3
11010; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
11011; GFX12-NEXT:    v_lshlrev_b32_e32 v3, 16, v4
11012; GFX12-NEXT:    v_min_num_f32_e32 v3, v3, v2
11013; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
11014; GFX12-NEXT:    v_bfe_u32 v5, v3, 16, 1
11015; GFX12-NEXT:    v_or_b32_e32 v6, 0x400000, v3
11016; GFX12-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
11017; GFX12-NEXT:    v_add3_u32 v5, v5, v3, 0x7fff
11018; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
11019; GFX12-NEXT:    v_cndmask_b32_e32 v3, v5, v6, vcc_lo
11020; GFX12-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
11021; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
11022; GFX12-NEXT:    v_and_or_b32 v3, 0xffff0000, v4, v3
11023; GFX12-NEXT:    s_wait_storecnt 0x0
11024; GFX12-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
11025; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
11026; GFX12-NEXT:    global_inv scope:SCOPE_DEV
11027; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
11028; GFX12-NEXT:    s_wait_alu 0xfffe
11029; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
11030; GFX12-NEXT:    s_wait_alu 0xfffe
11031; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
11032; GFX12-NEXT:    s_cbranch_execnz .LBB42_1
11033; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
11034; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
11035; GFX12-NEXT:    v_mov_b32_e32 v0, v3
11036; GFX12-NEXT:    s_wait_alu 0xfffe
11037; GFX12-NEXT:    s_setpc_b64 s[30:31]
11038;
11039; GFX940-LABEL: flat_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
11040; GFX940:       ; %bb.0:
11041; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
11042; GFX940-NEXT:    flat_load_dword v3, v[0:1] offset:2046
11043; GFX940-NEXT:    s_mov_b64 s[0:1], 0
11044; GFX940-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
11045; GFX940-NEXT:    s_movk_i32 s2, 0x7fff
11046; GFX940-NEXT:    s_mov_b32 s3, 0xffff0000
11047; GFX940-NEXT:  .LBB42_1: ; %atomicrmw.start
11048; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
11049; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11050; GFX940-NEXT:    v_mov_b32_e32 v5, v3
11051; GFX940-NEXT:    v_lshlrev_b32_e32 v3, 16, v5
11052; GFX940-NEXT:    v_min_f32_e32 v3, v3, v2
11053; GFX940-NEXT:    v_bfe_u32 v4, v3, 16, 1
11054; GFX940-NEXT:    v_or_b32_e32 v6, 0x400000, v3
11055; GFX940-NEXT:    v_add3_u32 v4, v4, v3, s2
11056; GFX940-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
11057; GFX940-NEXT:    s_nop 1
11058; GFX940-NEXT:    v_cndmask_b32_e32 v3, v4, v6, vcc
11059; GFX940-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
11060; GFX940-NEXT:    v_and_or_b32 v4, v5, s3, v3
11061; GFX940-NEXT:    buffer_wbl2 sc1
11062; GFX940-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2046 sc0
11063; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11064; GFX940-NEXT:    buffer_inv sc1
11065; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
11066; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
11067; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
11068; GFX940-NEXT:    s_cbranch_execnz .LBB42_1
11069; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
11070; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
11071; GFX940-NEXT:    v_mov_b32_e32 v0, v3
11072; GFX940-NEXT:    s_setpc_b64 s[30:31]
11073;
11074; GFX11-LABEL: flat_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
11075; GFX11:       ; %bb.0:
11076; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
11077; GFX11-NEXT:    flat_load_b32 v3, v[0:1] offset:2046
11078; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
11079; GFX11-NEXT:    s_mov_b32 s0, 0
11080; GFX11-NEXT:    .p2align 6
11081; GFX11-NEXT:  .LBB42_1: ; %atomicrmw.start
11082; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
11083; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11084; GFX11-NEXT:    v_mov_b32_e32 v4, v3
11085; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
11086; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v4
11087; GFX11-NEXT:    v_min_f32_e32 v3, v3, v2
11088; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
11089; GFX11-NEXT:    v_bfe_u32 v5, v3, 16, 1
11090; GFX11-NEXT:    v_or_b32_e32 v6, 0x400000, v3
11091; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
11092; GFX11-NEXT:    v_add3_u32 v5, v5, v3, 0x7fff
11093; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
11094; GFX11-NEXT:    v_cndmask_b32_e32 v3, v5, v6, vcc_lo
11095; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
11096; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
11097; GFX11-NEXT:    v_and_or_b32 v3, 0xffff0000, v4, v3
11098; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
11099; GFX11-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 glc
11100; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11101; GFX11-NEXT:    buffer_gl1_inv
11102; GFX11-NEXT:    buffer_gl0_inv
11103; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
11104; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
11105; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
11106; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
11107; GFX11-NEXT:    s_cbranch_execnz .LBB42_1
11108; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
11109; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
11110; GFX11-NEXT:    v_mov_b32_e32 v0, v3
11111; GFX11-NEXT:    s_setpc_b64 s[30:31]
11112;
11113; GFX10-LABEL: flat_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
11114; GFX10:       ; %bb.0:
11115; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
11116; GFX10-NEXT:    v_add_co_u32 v3, vcc_lo, 0x7fe, v0
11117; GFX10-NEXT:    v_add_co_ci_u32_e32 v4, vcc_lo, 0, v1, vcc_lo
11118; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
11119; GFX10-NEXT:    s_mov_b32 s4, 0
11120; GFX10-NEXT:    flat_load_dword v0, v[3:4]
11121; GFX10-NEXT:  .LBB42_1: ; %atomicrmw.start
11122; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
11123; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11124; GFX10-NEXT:    v_mov_b32_e32 v6, v0
11125; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v6
11126; GFX10-NEXT:    v_min_f32_e32 v0, v0, v1
11127; GFX10-NEXT:    v_bfe_u32 v2, v0, 16, 1
11128; GFX10-NEXT:    v_or_b32_e32 v5, 0x400000, v0
11129; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
11130; GFX10-NEXT:    v_add3_u32 v2, v2, v0, 0x7fff
11131; GFX10-NEXT:    v_cndmask_b32_e32 v0, v2, v5, vcc_lo
11132; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
11133; GFX10-NEXT:    v_and_or_b32 v5, 0xffff0000, v6, v0
11134; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
11135; GFX10-NEXT:    flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
11136; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11137; GFX10-NEXT:    buffer_gl1_inv
11138; GFX10-NEXT:    buffer_gl0_inv
11139; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v6
11140; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
11141; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
11142; GFX10-NEXT:    s_cbranch_execnz .LBB42_1
11143; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
11144; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
11145; GFX10-NEXT:    s_setpc_b64 s[30:31]
11146;
11147; GFX90A-LABEL: flat_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
11148; GFX90A:       ; %bb.0:
11149; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
11150; GFX90A-NEXT:    flat_load_dword v3, v[0:1] offset:2046
11151; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
11152; GFX90A-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
11153; GFX90A-NEXT:    s_movk_i32 s6, 0x7fff
11154; GFX90A-NEXT:    s_mov_b32 s7, 0xffff0000
11155; GFX90A-NEXT:  .LBB42_1: ; %atomicrmw.start
11156; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
11157; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11158; GFX90A-NEXT:    v_mov_b32_e32 v5, v3
11159; GFX90A-NEXT:    v_lshlrev_b32_e32 v3, 16, v5
11160; GFX90A-NEXT:    v_min_f32_e32 v3, v3, v2
11161; GFX90A-NEXT:    v_bfe_u32 v4, v3, 16, 1
11162; GFX90A-NEXT:    v_or_b32_e32 v6, 0x400000, v3
11163; GFX90A-NEXT:    v_add3_u32 v4, v4, v3, s6
11164; GFX90A-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
11165; GFX90A-NEXT:    v_cndmask_b32_e32 v3, v4, v6, vcc
11166; GFX90A-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
11167; GFX90A-NEXT:    v_and_or_b32 v4, v5, s7, v3
11168; GFX90A-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2046 glc
11169; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11170; GFX90A-NEXT:    buffer_wbinvl1
11171; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
11172; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
11173; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
11174; GFX90A-NEXT:    s_cbranch_execnz .LBB42_1
11175; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
11176; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
11177; GFX90A-NEXT:    v_mov_b32_e32 v0, v3
11178; GFX90A-NEXT:    s_setpc_b64 s[30:31]
11179;
11180; GFX908-LABEL: flat_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
11181; GFX908:       ; %bb.0:
11182; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
11183; GFX908-NEXT:    flat_load_dword v3, v[0:1] offset:2046
11184; GFX908-NEXT:    s_mov_b64 s[4:5], 0
11185; GFX908-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
11186; GFX908-NEXT:    s_movk_i32 s6, 0x7fff
11187; GFX908-NEXT:    s_mov_b32 s7, 0xffff0000
11188; GFX908-NEXT:  .LBB42_1: ; %atomicrmw.start
11189; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
11190; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11191; GFX908-NEXT:    v_mov_b32_e32 v4, v3
11192; GFX908-NEXT:    v_lshlrev_b32_e32 v3, 16, v4
11193; GFX908-NEXT:    v_min_f32_e32 v3, v3, v2
11194; GFX908-NEXT:    v_bfe_u32 v5, v3, 16, 1
11195; GFX908-NEXT:    v_or_b32_e32 v6, 0x400000, v3
11196; GFX908-NEXT:    v_add3_u32 v5, v5, v3, s6
11197; GFX908-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
11198; GFX908-NEXT:    v_cndmask_b32_e32 v3, v5, v6, vcc
11199; GFX908-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
11200; GFX908-NEXT:    v_and_or_b32 v3, v4, s7, v3
11201; GFX908-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2046 glc
11202; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11203; GFX908-NEXT:    buffer_wbinvl1
11204; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
11205; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
11206; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
11207; GFX908-NEXT:    s_cbranch_execnz .LBB42_1
11208; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
11209; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
11210; GFX908-NEXT:    v_mov_b32_e32 v0, v3
11211; GFX908-NEXT:    s_setpc_b64 s[30:31]
11212;
11213; GFX8-LABEL: flat_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
11214; GFX8:       ; %bb.0:
11215; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
11216; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x7fe, v0
11217; GFX8-NEXT:    v_addc_u32_e32 v4, vcc, 0, v1, vcc
11218; GFX8-NEXT:    flat_load_dword v0, v[3:4]
11219; GFX8-NEXT:    s_mov_b64 s[4:5], 0
11220; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
11221; GFX8-NEXT:  .LBB42_1: ; %atomicrmw.start
11222; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
11223; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11224; GFX8-NEXT:    v_mov_b32_e32 v6, v0
11225; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v6
11226; GFX8-NEXT:    v_min_f32_e32 v0, v0, v1
11227; GFX8-NEXT:    v_bfe_u32 v5, v0, 16, 1
11228; GFX8-NEXT:    v_add_u32_e32 v5, vcc, v5, v0
11229; GFX8-NEXT:    v_add_u32_e32 v5, vcc, 0x7fff, v5
11230; GFX8-NEXT:    v_or_b32_e32 v7, 0x400000, v0
11231; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
11232; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v6
11233; GFX8-NEXT:    v_cndmask_b32_e32 v0, v5, v7, vcc
11234; GFX8-NEXT:    v_or_b32_sdwa v5, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
11235; GFX8-NEXT:    flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
11236; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11237; GFX8-NEXT:    buffer_wbinvl1
11238; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v6
11239; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
11240; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
11241; GFX8-NEXT:    s_cbranch_execnz .LBB42_1
11242; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
11243; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
11244; GFX8-NEXT:    s_setpc_b64 s[30:31]
11245;
11246; GFX7-LABEL: flat_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
11247; GFX7:       ; %bb.0:
11248; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
11249; GFX7-NEXT:    v_add_i32_e32 v0, vcc, 0x7fe, v0
11250; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
11251; GFX7-NEXT:    flat_load_dword v3, v[0:1]
11252; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
11253; GFX7-NEXT:    s_mov_b64 s[4:5], 0
11254; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
11255; GFX7-NEXT:  .LBB42_1: ; %atomicrmw.start
11256; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
11257; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11258; GFX7-NEXT:    v_mov_b32_e32 v4, v3
11259; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v4
11260; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v3
11261; GFX7-NEXT:    v_min_f32_e32 v3, v3, v2
11262; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v4
11263; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
11264; GFX7-NEXT:    v_or_b32_e32 v3, v5, v3
11265; GFX7-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
11266; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11267; GFX7-NEXT:    buffer_wbinvl1
11268; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
11269; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
11270; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
11271; GFX7-NEXT:    s_cbranch_execnz .LBB42_1
11272; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
11273; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
11274; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v3
11275; GFX7-NEXT:    s_setpc_b64 s[30:31]
  ; Address of the element 1023 bfloats past %ptr; the checks above show this
  ; as a 2046-byte (0x7fe) offset.
11276  %gep = getelementptr bfloat, ptr %ptr, i64 1023
  ; Agent-scope, sequentially consistent fmin with an explicit `align 4`; the
  ; value produced by the atomicrmw is what the function returns.
11277  %result = atomicrmw fmin ptr %gep, bfloat %val syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0
11278  ret bfloat %result
11279}
11280
11281define void @flat_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory(ptr %ptr, bfloat %val) #0 {
11282; GFX12-LABEL: flat_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
11283; GFX12:       ; %bb.0:
11284; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
11285; GFX12-NEXT:    s_wait_expcnt 0x0
11286; GFX12-NEXT:    s_wait_samplecnt 0x0
11287; GFX12-NEXT:    s_wait_bvhcnt 0x0
11288; GFX12-NEXT:    s_wait_kmcnt 0x0
11289; GFX12-NEXT:    flat_load_b32 v3, v[0:1] offset:2046
11290; GFX12-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
11291; GFX12-NEXT:    s_mov_b32 s0, 0
11292; GFX12-NEXT:  .LBB43_1: ; %atomicrmw.start
11293; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
11294; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
11295; GFX12-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
11296; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
11297; GFX12-NEXT:    v_min_num_f32_e32 v2, v2, v4
11298; GFX12-NEXT:    v_bfe_u32 v5, v2, 16, 1
11299; GFX12-NEXT:    v_or_b32_e32 v6, 0x400000, v2
11300; GFX12-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
11301; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
11302; GFX12-NEXT:    v_add3_u32 v5, v5, v2, 0x7fff
11303; GFX12-NEXT:    v_cndmask_b32_e32 v2, v5, v6, vcc_lo
11304; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
11305; GFX12-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
11306; GFX12-NEXT:    v_and_or_b32 v2, 0xffff0000, v3, v2
11307; GFX12-NEXT:    s_wait_storecnt 0x0
11308; GFX12-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
11309; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
11310; GFX12-NEXT:    global_inv scope:SCOPE_DEV
11311; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
11312; GFX12-NEXT:    v_mov_b32_e32 v3, v2
11313; GFX12-NEXT:    s_wait_alu 0xfffe
11314; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
11315; GFX12-NEXT:    s_wait_alu 0xfffe
11316; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
11317; GFX12-NEXT:    s_cbranch_execnz .LBB43_1
11318; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
11319; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
11320; GFX12-NEXT:    s_wait_alu 0xfffe
11321; GFX12-NEXT:    s_setpc_b64 s[30:31]
11322;
11323; GFX940-LABEL: flat_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
11324; GFX940:       ; %bb.0:
11325; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
11326; GFX940-NEXT:    flat_load_dword v3, v[0:1] offset:2046
11327; GFX940-NEXT:    s_mov_b64 s[0:1], 0
11328; GFX940-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
11329; GFX940-NEXT:    s_movk_i32 s2, 0x7fff
11330; GFX940-NEXT:    s_mov_b32 s3, 0xffff0000
11331; GFX940-NEXT:  .LBB43_1: ; %atomicrmw.start
11332; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
11333; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11334; GFX940-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
11335; GFX940-NEXT:    v_min_f32_e32 v2, v2, v4
11336; GFX940-NEXT:    v_bfe_u32 v5, v2, 16, 1
11337; GFX940-NEXT:    v_or_b32_e32 v6, 0x400000, v2
11338; GFX940-NEXT:    v_add3_u32 v5, v5, v2, s2
11339; GFX940-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
11340; GFX940-NEXT:    s_nop 1
11341; GFX940-NEXT:    v_cndmask_b32_e32 v2, v5, v6, vcc
11342; GFX940-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
11343; GFX940-NEXT:    v_and_or_b32 v2, v3, s3, v2
11344; GFX940-NEXT:    buffer_wbl2 sc1
11345; GFX940-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2046 sc0
11346; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11347; GFX940-NEXT:    buffer_inv sc1
11348; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
11349; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
11350; GFX940-NEXT:    v_mov_b32_e32 v3, v2
11351; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
11352; GFX940-NEXT:    s_cbranch_execnz .LBB43_1
11353; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
11354; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
11355; GFX940-NEXT:    s_setpc_b64 s[30:31]
11356;
11357; GFX11-LABEL: flat_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
11358; GFX11:       ; %bb.0:
11359; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
11360; GFX11-NEXT:    flat_load_b32 v3, v[0:1] offset:2046
11361; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
11362; GFX11-NEXT:    s_mov_b32 s0, 0
11363; GFX11-NEXT:    .p2align 6
11364; GFX11-NEXT:  .LBB43_1: ; %atomicrmw.start
11365; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
11366; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11367; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
11368; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
11369; GFX11-NEXT:    v_min_f32_e32 v2, v2, v4
11370; GFX11-NEXT:    v_bfe_u32 v5, v2, 16, 1
11371; GFX11-NEXT:    v_or_b32_e32 v6, 0x400000, v2
11372; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
11373; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
11374; GFX11-NEXT:    v_add3_u32 v5, v5, v2, 0x7fff
11375; GFX11-NEXT:    v_cndmask_b32_e32 v2, v5, v6, vcc_lo
11376; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
11377; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
11378; GFX11-NEXT:    v_and_or_b32 v2, 0xffff0000, v3, v2
11379; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
11380; GFX11-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2046 glc
11381; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11382; GFX11-NEXT:    buffer_gl1_inv
11383; GFX11-NEXT:    buffer_gl0_inv
11384; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
11385; GFX11-NEXT:    v_mov_b32_e32 v3, v2
11386; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
11387; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
11388; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
11389; GFX11-NEXT:    s_cbranch_execnz .LBB43_1
11390; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
11391; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
11392; GFX11-NEXT:    s_setpc_b64 s[30:31]
11393;
11394; GFX10-LABEL: flat_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
11395; GFX10:       ; %bb.0:
11396; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
11397; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, 0x7fe, v0
11398; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
11399; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
11400; GFX10-NEXT:    s_mov_b32 s4, 0
11401; GFX10-NEXT:    flat_load_dword v3, v[0:1]
11402; GFX10-NEXT:  .LBB43_1: ; %atomicrmw.start
11403; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
11404; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11405; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
11406; GFX10-NEXT:    v_min_f32_e32 v2, v2, v4
11407; GFX10-NEXT:    v_bfe_u32 v5, v2, 16, 1
11408; GFX10-NEXT:    v_or_b32_e32 v6, 0x400000, v2
11409; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
11410; GFX10-NEXT:    v_add3_u32 v5, v5, v2, 0x7fff
11411; GFX10-NEXT:    v_cndmask_b32_e32 v2, v5, v6, vcc_lo
11412; GFX10-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
11413; GFX10-NEXT:    v_and_or_b32 v2, 0xffff0000, v3, v2
11414; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
11415; GFX10-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
11416; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11417; GFX10-NEXT:    buffer_gl1_inv
11418; GFX10-NEXT:    buffer_gl0_inv
11419; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
11420; GFX10-NEXT:    v_mov_b32_e32 v3, v2
11421; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
11422; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
11423; GFX10-NEXT:    s_cbranch_execnz .LBB43_1
11424; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
11425; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
11426; GFX10-NEXT:    s_setpc_b64 s[30:31]
11427;
11428; GFX90A-LABEL: flat_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
11429; GFX90A:       ; %bb.0:
11430; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
11431; GFX90A-NEXT:    flat_load_dword v3, v[0:1] offset:2046
11432; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
11433; GFX90A-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
11434; GFX90A-NEXT:    s_movk_i32 s6, 0x7fff
11435; GFX90A-NEXT:    s_mov_b32 s7, 0xffff0000
11436; GFX90A-NEXT:  .LBB43_1: ; %atomicrmw.start
11437; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
11438; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11439; GFX90A-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
11440; GFX90A-NEXT:    v_min_f32_e32 v2, v2, v4
11441; GFX90A-NEXT:    v_bfe_u32 v5, v2, 16, 1
11442; GFX90A-NEXT:    v_or_b32_e32 v6, 0x400000, v2
11443; GFX90A-NEXT:    v_add3_u32 v5, v5, v2, s6
11444; GFX90A-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
11445; GFX90A-NEXT:    v_cndmask_b32_e32 v2, v5, v6, vcc
11446; GFX90A-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
11447; GFX90A-NEXT:    v_and_or_b32 v2, v3, s7, v2
11448; GFX90A-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2046 glc
11449; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11450; GFX90A-NEXT:    buffer_wbinvl1
11451; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
11452; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
11453; GFX90A-NEXT:    v_mov_b32_e32 v3, v2
11454; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
11455; GFX90A-NEXT:    s_cbranch_execnz .LBB43_1
11456; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
11457; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
11458; GFX90A-NEXT:    s_setpc_b64 s[30:31]
11459;
11460; GFX908-LABEL: flat_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
11461; GFX908:       ; %bb.0:
11462; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
11463; GFX908-NEXT:    flat_load_dword v3, v[0:1] offset:2046
11464; GFX908-NEXT:    s_mov_b64 s[4:5], 0
11465; GFX908-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
11466; GFX908-NEXT:    s_movk_i32 s6, 0x7fff
11467; GFX908-NEXT:    s_mov_b32 s7, 0xffff0000
11468; GFX908-NEXT:  .LBB43_1: ; %atomicrmw.start
11469; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
11470; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11471; GFX908-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
11472; GFX908-NEXT:    v_min_f32_e32 v2, v2, v4
11473; GFX908-NEXT:    v_bfe_u32 v5, v2, 16, 1
11474; GFX908-NEXT:    v_or_b32_e32 v6, 0x400000, v2
11475; GFX908-NEXT:    v_add3_u32 v5, v5, v2, s6
11476; GFX908-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
11477; GFX908-NEXT:    v_cndmask_b32_e32 v2, v5, v6, vcc
11478; GFX908-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
11479; GFX908-NEXT:    v_and_or_b32 v2, v3, s7, v2
11480; GFX908-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2046 glc
11481; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11482; GFX908-NEXT:    buffer_wbinvl1
11483; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
11484; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
11485; GFX908-NEXT:    v_mov_b32_e32 v3, v2
11486; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
11487; GFX908-NEXT:    s_cbranch_execnz .LBB43_1
11488; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
11489; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
11490; GFX908-NEXT:    s_setpc_b64 s[30:31]
11491;
11492; GFX8-LABEL: flat_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
11493; GFX8:       ; %bb.0:
11494; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
11495; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 0x7fe, v0
11496; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
11497; GFX8-NEXT:    flat_load_dword v3, v[0:1]
11498; GFX8-NEXT:    s_mov_b64 s[4:5], 0
11499; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
11500; GFX8-NEXT:  .LBB43_1: ; %atomicrmw.start
11501; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
11502; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11503; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
11504; GFX8-NEXT:    v_min_f32_e32 v2, v2, v4
11505; GFX8-NEXT:    v_bfe_u32 v6, v2, 16, 1
11506; GFX8-NEXT:    v_add_u32_e32 v6, vcc, v6, v2
11507; GFX8-NEXT:    v_add_u32_e32 v6, vcc, 0x7fff, v6
11508; GFX8-NEXT:    v_or_b32_e32 v7, 0x400000, v2
11509; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
11510; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v3
11511; GFX8-NEXT:    v_cndmask_b32_e32 v2, v6, v7, vcc
11512; GFX8-NEXT:    v_or_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
11513; GFX8-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
11514; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11515; GFX8-NEXT:    buffer_wbinvl1
11516; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
11517; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
11518; GFX8-NEXT:    v_mov_b32_e32 v3, v2
11519; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
11520; GFX8-NEXT:    s_cbranch_execnz .LBB43_1
11521; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
11522; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
11523; GFX8-NEXT:    s_setpc_b64 s[30:31]
11524;
11525; GFX7-LABEL: flat_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
11526; GFX7:       ; %bb.0:
11527; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
11528; GFX7-NEXT:    v_add_i32_e32 v0, vcc, 0x7fe, v0
11529; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
11530; GFX7-NEXT:    flat_load_dword v3, v[0:1]
11531; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
11532; GFX7-NEXT:    s_mov_b64 s[4:5], 0
11533; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff0000, v2
11534; GFX7-NEXT:  .LBB43_1: ; %atomicrmw.start
11535; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
11536; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11537; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
11538; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
11539; GFX7-NEXT:    v_min_f32_e32 v2, v2, v4
11540; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v3
11541; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
11542; GFX7-NEXT:    v_or_b32_e32 v2, v5, v2
11543; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
11544; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11545; GFX7-NEXT:    buffer_wbinvl1
11546; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
11547; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
11548; GFX7-NEXT:    v_mov_b32_e32 v3, v2
11549; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
11550; GFX7-NEXT:    s_cbranch_execnz .LBB43_1
11551; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
11552; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
11553; GFX7-NEXT:    s_setpc_b64 s[30:31]
11554  %gep = getelementptr bfloat, ptr %ptr, i64 1023
11555  %unused = atomicrmw fmin ptr %gep, bfloat %val syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0
11556  ret void
11557}
11558
; Returning, seq_cst `atomicrmw fmin` on a bfloat through a flat pointer, with
; no syncscope on the instruction (the gfx1200 output uses the SCOPE_SYS
; scope). The address is %ptr advanced by 1023 bfloat elements, which every
; expected output materializes as a 0x7fe byte offset added to the pointer.
;
; The check lines below are autogenerated (see the NOTE at the top of the
; file); regenerate them with utils/update_llc_test_checks.py rather than
; editing them by hand.
;
; What the expected output shows for every run line:
;   * The atomicrmw carries no explicit alignment, and the operation is done
;     on the enclosing dword: the address is masked with -4, (address & 3) * 8
;     becomes the bit shift of the 16-bit lane, and ~(0xffff << shift) is the
;     mask that keeps the other half of the dword.
;   * %atomicrmw.start is a compare-and-swap loop: the lane is extracted from
;     the last loaded dword and moved into the upper 16 bits to form an f32,
;     an f32 min is taken against the widened %val, the result is narrowed
;     back to 16 bits, merged into the loaded dword and submitted with a
;     32-bit flat cmpswap. The loop repeats until the dword returned by the
;     cmpswap equals the expected dword.
;   * On all targets except hawaii the narrowing adds bit 16 of the result
;     plus 0x7fff before taking the upper half, and selects the result OR'ed
;     with 0x400000 when the min result compares unordered with itself.
;   * The hawaii output instead multiplies both the widened %val and the
;     widened loaded lane by 1.0, narrows with a plain 16-bit right shift, and
;     shifts the returned lane left by 16 before returning.
;   * %atomicrmw.end shifts the last returned dword right by the lane shift to
;     produce the result in v0.
;   * Cache maintenance around the cmpswap differs per target (for example
;     global_wb/global_inv with SCOPE_SYS on gfx1200, buffer_wbl2/buffer_inv
;     with sc0 sc1 on gfx940, buffer_wbinvl1 on gfx908, tonga and hawaii).
11559define bfloat @flat_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr %ptr, bfloat %val) #0 {
11560; GFX12-LABEL: flat_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
11561; GFX12:       ; %bb.0:
11562; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
11563; GFX12-NEXT:    s_wait_expcnt 0x0
11564; GFX12-NEXT:    s_wait_samplecnt 0x0
11565; GFX12-NEXT:    s_wait_bvhcnt 0x0
11566; GFX12-NEXT:    s_wait_kmcnt 0x0
11567; GFX12-NEXT:    v_add_co_u32 v3, vcc_lo, 0x7fe, v0
11568; GFX12-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
11569; GFX12-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
11570; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
11571; GFX12-NEXT:    v_and_b32_e32 v0, -4, v3
11572; GFX12-NEXT:    v_and_b32_e32 v3, 3, v3
11573; GFX12-NEXT:    s_mov_b32 s0, 0
11574; GFX12-NEXT:    flat_load_b32 v5, v[0:1]
11575; GFX12-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
11576; GFX12-NEXT:    v_lshlrev_b32_e64 v4, v3, 0xffff
11577; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
11578; GFX12-NEXT:    v_not_b32_e32 v4, v4
11579; GFX12-NEXT:  .LBB44_1: ; %atomicrmw.start
11580; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
11581; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
11582; GFX12-NEXT:    v_mov_b32_e32 v6, v5
11583; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
11584; GFX12-NEXT:    v_lshrrev_b32_e32 v5, v3, v6
11585; GFX12-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
11586; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
11587; GFX12-NEXT:    v_min_num_f32_e32 v5, v5, v2
11588; GFX12-NEXT:    v_bfe_u32 v7, v5, 16, 1
11589; GFX12-NEXT:    v_or_b32_e32 v8, 0x400000, v5
11590; GFX12-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
11591; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
11592; GFX12-NEXT:    v_add3_u32 v7, v7, v5, 0x7fff
11593; GFX12-NEXT:    v_cndmask_b32_e32 v5, v7, v8, vcc_lo
11594; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
11595; GFX12-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
11596; GFX12-NEXT:    v_lshlrev_b32_e32 v5, v3, v5
11597; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
11598; GFX12-NEXT:    v_and_or_b32 v5, v6, v4, v5
11599; GFX12-NEXT:    global_wb scope:SCOPE_SYS
11600; GFX12-NEXT:    s_wait_storecnt 0x0
11601; GFX12-NEXT:    flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
11602; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
11603; GFX12-NEXT:    global_inv scope:SCOPE_SYS
11604; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v5, v6
11605; GFX12-NEXT:    s_wait_alu 0xfffe
11606; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
11607; GFX12-NEXT:    s_wait_alu 0xfffe
11608; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
11609; GFX12-NEXT:    s_cbranch_execnz .LBB44_1
11610; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
11611; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
11612; GFX12-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
11613; GFX12-NEXT:    s_wait_alu 0xfffe
11614; GFX12-NEXT:    s_setpc_b64 s[30:31]
11615;
11616; GFX940-LABEL: flat_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
11617; GFX940:       ; %bb.0:
11618; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
11619; GFX940-NEXT:    s_mov_b64 s[0:1], 0x7fe
11620; GFX940-NEXT:    v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
11621; GFX940-NEXT:    v_and_b32_e32 v0, -4, v4
11622; GFX940-NEXT:    v_mov_b32_e32 v1, v5
11623; GFX940-NEXT:    flat_load_dword v5, v[0:1]
11624; GFX940-NEXT:    v_and_b32_e32 v3, 3, v4
11625; GFX940-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
11626; GFX940-NEXT:    s_mov_b32 s0, 0xffff
11627; GFX940-NEXT:    v_lshlrev_b32_e64 v4, v3, s0
11628; GFX940-NEXT:    v_not_b32_e32 v4, v4
11629; GFX940-NEXT:    s_mov_b64 s[0:1], 0
11630; GFX940-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
11631; GFX940-NEXT:    s_movk_i32 s2, 0x7fff
11632; GFX940-NEXT:  .LBB44_1: ; %atomicrmw.start
11633; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
11634; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11635; GFX940-NEXT:    v_mov_b32_e32 v7, v5
11636; GFX940-NEXT:    v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
11637; GFX940-NEXT:    s_nop 0
11638; GFX940-NEXT:    v_min_f32_e32 v5, v5, v2
11639; GFX940-NEXT:    v_bfe_u32 v6, v5, 16, 1
11640; GFX940-NEXT:    v_or_b32_e32 v8, 0x400000, v5
11641; GFX940-NEXT:    v_add3_u32 v6, v6, v5, s2
11642; GFX940-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
11643; GFX940-NEXT:    s_nop 1
11644; GFX940-NEXT:    v_cndmask_b32_e32 v5, v6, v8, vcc
11645; GFX940-NEXT:    v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
11646; GFX940-NEXT:    v_and_or_b32 v6, v7, v4, v5
11647; GFX940-NEXT:    buffer_wbl2 sc0 sc1
11648; GFX940-NEXT:    flat_atomic_cmpswap v5, v[0:1], v[6:7] sc0 sc1
11649; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11650; GFX940-NEXT:    buffer_inv sc0 sc1
11651; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v7
11652; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
11653; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
11654; GFX940-NEXT:    s_cbranch_execnz .LBB44_1
11655; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
11656; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
11657; GFX940-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
11658; GFX940-NEXT:    s_setpc_b64 s[30:31]
11659;
11660; GFX11-LABEL: flat_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
11661; GFX11:       ; %bb.0:
11662; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
11663; GFX11-NEXT:    v_add_co_u32 v3, vcc_lo, 0x7fe, v0
11664; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
11665; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
11666; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
11667; GFX11-NEXT:    v_and_b32_e32 v0, -4, v3
11668; GFX11-NEXT:    v_and_b32_e32 v3, 3, v3
11669; GFX11-NEXT:    s_mov_b32 s0, 0
11670; GFX11-NEXT:    flat_load_b32 v5, v[0:1]
11671; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
11672; GFX11-NEXT:    v_lshlrev_b32_e64 v4, v3, 0xffff
11673; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
11674; GFX11-NEXT:    v_not_b32_e32 v4, v4
11675; GFX11-NEXT:    .p2align 6
11676; GFX11-NEXT:  .LBB44_1: ; %atomicrmw.start
11677; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
11678; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11679; GFX11-NEXT:    v_mov_b32_e32 v6, v5
11680; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
11681; GFX11-NEXT:    v_lshrrev_b32_e32 v5, v3, v6
11682; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
11683; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
11684; GFX11-NEXT:    v_min_f32_e32 v5, v5, v2
11685; GFX11-NEXT:    v_bfe_u32 v7, v5, 16, 1
11686; GFX11-NEXT:    v_or_b32_e32 v8, 0x400000, v5
11687; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
11688; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
11689; GFX11-NEXT:    v_add3_u32 v7, v7, v5, 0x7fff
11690; GFX11-NEXT:    v_cndmask_b32_e32 v5, v7, v8, vcc_lo
11691; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
11692; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
11693; GFX11-NEXT:    v_lshlrev_b32_e32 v5, v3, v5
11694; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
11695; GFX11-NEXT:    v_and_or_b32 v5, v6, v4, v5
11696; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
11697; GFX11-NEXT:    flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc
11698; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11699; GFX11-NEXT:    buffer_gl1_inv
11700; GFX11-NEXT:    buffer_gl0_inv
11701; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v5, v6
11702; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
11703; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
11704; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
11705; GFX11-NEXT:    s_cbranch_execnz .LBB44_1
11706; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
11707; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
11708; GFX11-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
11709; GFX11-NEXT:    s_setpc_b64 s[30:31]
11710;
11711; GFX10-LABEL: flat_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
11712; GFX10:       ; %bb.0:
11713; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
11714; GFX10-NEXT:    v_add_co_u32 v3, vcc_lo, 0x7fe, v0
11715; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
11716; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
11717; GFX10-NEXT:    v_and_b32_e32 v0, -4, v3
11718; GFX10-NEXT:    v_and_b32_e32 v3, 3, v3
11719; GFX10-NEXT:    s_mov_b32 s4, 0
11720; GFX10-NEXT:    flat_load_dword v5, v[0:1]
11721; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
11722; GFX10-NEXT:    v_lshlrev_b32_e64 v4, v3, 0xffff
11723; GFX10-NEXT:    v_not_b32_e32 v4, v4
11724; GFX10-NEXT:  .LBB44_1: ; %atomicrmw.start
11725; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
11726; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11727; GFX10-NEXT:    v_mov_b32_e32 v6, v5
11728; GFX10-NEXT:    v_lshrrev_b32_sdwa v5, v3, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
11729; GFX10-NEXT:    v_min_f32_e32 v5, v5, v2
11730; GFX10-NEXT:    v_bfe_u32 v7, v5, 16, 1
11731; GFX10-NEXT:    v_or_b32_e32 v8, 0x400000, v5
11732; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
11733; GFX10-NEXT:    v_add3_u32 v7, v7, v5, 0x7fff
11734; GFX10-NEXT:    v_cndmask_b32_e32 v5, v7, v8, vcc_lo
11735; GFX10-NEXT:    v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
11736; GFX10-NEXT:    v_and_or_b32 v5, v6, v4, v5
11737; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
11738; GFX10-NEXT:    flat_atomic_cmpswap v5, v[0:1], v[5:6] glc
11739; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11740; GFX10-NEXT:    buffer_gl1_inv
11741; GFX10-NEXT:    buffer_gl0_inv
11742; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v5, v6
11743; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
11744; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
11745; GFX10-NEXT:    s_cbranch_execnz .LBB44_1
11746; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
11747; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
11748; GFX10-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
11749; GFX10-NEXT:    s_setpc_b64 s[30:31]
11750;
11751; GFX90A-LABEL: flat_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
11752; GFX90A:       ; %bb.0:
11753; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
11754; GFX90A-NEXT:    v_add_co_u32_e32 v3, vcc, 0x7fe, v0
11755; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
11756; GFX90A-NEXT:    v_and_b32_e32 v0, -4, v3
11757; GFX90A-NEXT:    flat_load_dword v5, v[0:1]
11758; GFX90A-NEXT:    v_and_b32_e32 v3, 3, v3
11759; GFX90A-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
11760; GFX90A-NEXT:    s_mov_b32 s4, 0xffff
11761; GFX90A-NEXT:    v_lshlrev_b32_e64 v4, v3, s4
11762; GFX90A-NEXT:    v_not_b32_e32 v4, v4
11763; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
11764; GFX90A-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
11765; GFX90A-NEXT:    s_movk_i32 s6, 0x7fff
11766; GFX90A-NEXT:  .LBB44_1: ; %atomicrmw.start
11767; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
11768; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11769; GFX90A-NEXT:    v_mov_b32_e32 v7, v5
11770; GFX90A-NEXT:    v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
11771; GFX90A-NEXT:    v_min_f32_e32 v5, v5, v2
11772; GFX90A-NEXT:    v_bfe_u32 v6, v5, 16, 1
11773; GFX90A-NEXT:    v_or_b32_e32 v8, 0x400000, v5
11774; GFX90A-NEXT:    v_add3_u32 v6, v6, v5, s6
11775; GFX90A-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
11776; GFX90A-NEXT:    v_cndmask_b32_e32 v5, v6, v8, vcc
11777; GFX90A-NEXT:    v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
11778; GFX90A-NEXT:    v_and_or_b32 v6, v7, v4, v5
11779; GFX90A-NEXT:    buffer_wbl2
11780; GFX90A-NEXT:    flat_atomic_cmpswap v5, v[0:1], v[6:7] glc
11781; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11782; GFX90A-NEXT:    buffer_invl2
11783; GFX90A-NEXT:    buffer_wbinvl1
11784; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v7
11785; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
11786; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
11787; GFX90A-NEXT:    s_cbranch_execnz .LBB44_1
11788; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
11789; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
11790; GFX90A-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
11791; GFX90A-NEXT:    s_setpc_b64 s[30:31]
11792;
11793; GFX908-LABEL: flat_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
11794; GFX908:       ; %bb.0:
11795; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
11796; GFX908-NEXT:    v_add_co_u32_e32 v3, vcc, 0x7fe, v0
11797; GFX908-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
11798; GFX908-NEXT:    v_and_b32_e32 v0, -4, v3
11799; GFX908-NEXT:    flat_load_dword v5, v[0:1]
11800; GFX908-NEXT:    v_and_b32_e32 v3, 3, v3
11801; GFX908-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
11802; GFX908-NEXT:    s_mov_b32 s4, 0xffff
11803; GFX908-NEXT:    v_lshlrev_b32_e64 v4, v3, s4
11804; GFX908-NEXT:    v_not_b32_e32 v4, v4
11805; GFX908-NEXT:    s_mov_b64 s[4:5], 0
11806; GFX908-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
11807; GFX908-NEXT:    s_movk_i32 s6, 0x7fff
11808; GFX908-NEXT:  .LBB44_1: ; %atomicrmw.start
11809; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
11810; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11811; GFX908-NEXT:    v_mov_b32_e32 v6, v5
11812; GFX908-NEXT:    v_lshrrev_b32_sdwa v5, v3, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
11813; GFX908-NEXT:    v_min_f32_e32 v5, v5, v2
11814; GFX908-NEXT:    v_bfe_u32 v7, v5, 16, 1
11815; GFX908-NEXT:    v_or_b32_e32 v8, 0x400000, v5
11816; GFX908-NEXT:    v_add3_u32 v7, v7, v5, s6
11817; GFX908-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
11818; GFX908-NEXT:    v_cndmask_b32_e32 v5, v7, v8, vcc
11819; GFX908-NEXT:    v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
11820; GFX908-NEXT:    v_and_or_b32 v5, v6, v4, v5
11821; GFX908-NEXT:    flat_atomic_cmpswap v5, v[0:1], v[5:6] glc
11822; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11823; GFX908-NEXT:    buffer_wbinvl1
11824; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v6
11825; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
11826; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
11827; GFX908-NEXT:    s_cbranch_execnz .LBB44_1
11828; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
11829; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
11830; GFX908-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
11831; GFX908-NEXT:    s_setpc_b64 s[30:31]
11832;
11833; GFX8-LABEL: flat_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
11834; GFX8:       ; %bb.0:
11835; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
11836; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x7fe, v0
11837; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
11838; GFX8-NEXT:    v_and_b32_e32 v0, -4, v3
11839; GFX8-NEXT:    flat_load_dword v5, v[0:1]
11840; GFX8-NEXT:    v_and_b32_e32 v3, 3, v3
11841; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
11842; GFX8-NEXT:    s_mov_b32 s4, 0xffff
11843; GFX8-NEXT:    v_lshlrev_b32_e64 v4, v3, s4
11844; GFX8-NEXT:    v_not_b32_e32 v4, v4
11845; GFX8-NEXT:    s_mov_b64 s[4:5], 0
11846; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
11847; GFX8-NEXT:  .LBB44_1: ; %atomicrmw.start
11848; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
11849; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11850; GFX8-NEXT:    v_mov_b32_e32 v6, v5
11851; GFX8-NEXT:    v_lshrrev_b32_sdwa v5, v3, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
11852; GFX8-NEXT:    v_min_f32_e32 v5, v5, v2
11853; GFX8-NEXT:    v_bfe_u32 v8, v5, 16, 1
11854; GFX8-NEXT:    v_add_u32_e32 v8, vcc, v8, v5
11855; GFX8-NEXT:    v_add_u32_e32 v8, vcc, 0x7fff, v8
11856; GFX8-NEXT:    v_or_b32_e32 v9, 0x400000, v5
11857; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
11858; GFX8-NEXT:    v_cndmask_b32_e32 v5, v8, v9, vcc
11859; GFX8-NEXT:    v_and_b32_e32 v7, v6, v4
11860; GFX8-NEXT:    v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
11861; GFX8-NEXT:    v_or_b32_e32 v5, v7, v5
11862; GFX8-NEXT:    flat_atomic_cmpswap v5, v[0:1], v[5:6] glc
11863; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11864; GFX8-NEXT:    buffer_wbinvl1
11865; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v6
11866; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
11867; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
11868; GFX8-NEXT:    s_cbranch_execnz .LBB44_1
11869; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
11870; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
11871; GFX8-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
11872; GFX8-NEXT:    s_setpc_b64 s[30:31]
11873;
11874; GFX7-LABEL: flat_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
11875; GFX7:       ; %bb.0:
11876; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
11877; GFX7-NEXT:    v_add_i32_e32 v3, vcc, 0x7fe, v0
11878; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
11879; GFX7-NEXT:    v_and_b32_e32 v0, -4, v3
11880; GFX7-NEXT:    flat_load_dword v5, v[0:1]
11881; GFX7-NEXT:    v_and_b32_e32 v3, 3, v3
11882; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
11883; GFX7-NEXT:    v_lshl_b32_e32 v4, 0xffff, v3
11884; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
11885; GFX7-NEXT:    v_not_b32_e32 v4, v4
11886; GFX7-NEXT:    s_mov_b64 s[4:5], 0
11887; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
11888; GFX7-NEXT:  .LBB44_1: ; %atomicrmw.start
11889; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
11890; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11891; GFX7-NEXT:    v_mov_b32_e32 v6, v5
11892; GFX7-NEXT:    v_lshrrev_b32_e32 v5, v3, v6
11893; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
11894; GFX7-NEXT:    v_mul_f32_e32 v5, 1.0, v5
11895; GFX7-NEXT:    v_min_f32_e32 v5, v5, v2
11896; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
11897; GFX7-NEXT:    v_and_b32_e32 v7, v6, v4
11898; GFX7-NEXT:    v_lshlrev_b32_e32 v5, v3, v5
11899; GFX7-NEXT:    v_or_b32_e32 v5, v7, v5
11900; GFX7-NEXT:    flat_atomic_cmpswap v5, v[0:1], v[5:6] glc
11901; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11902; GFX7-NEXT:    buffer_wbinvl1
11903; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v6
11904; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
11905; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
11906; GFX7-NEXT:    s_cbranch_execnz .LBB44_1
11907; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
11908; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
11909; GFX7-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
11910; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
11911; GFX7-NEXT:    s_setpc_b64 s[30:31]
  ; 1023 bfloat elements past %ptr; the expected output adds 0x7fe bytes.
11912  %gep = getelementptr bfloat, ptr %ptr, i64 1023
  ; No syncscope and no explicit align on this atomicrmw; ordering is seq_cst.
  ; The metadata node !0 is defined outside this function.
11913  %result = atomicrmw fmin ptr %gep, bfloat %val seq_cst, !amdgpu.no.fine.grained.memory !0
  ; The value returned by the atomicrmw is the function result.
11914  ret bfloat %result
11915}
11916
11917define void @flat_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr %ptr, bfloat %val) #0 {
11918; GFX12-LABEL: flat_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
11919; GFX12:       ; %bb.0:
11920; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
11921; GFX12-NEXT:    s_wait_expcnt 0x0
11922; GFX12-NEXT:    s_wait_samplecnt 0x0
11923; GFX12-NEXT:    s_wait_bvhcnt 0x0
11924; GFX12-NEXT:    s_wait_kmcnt 0x0
11925; GFX12-NEXT:    v_add_co_u32 v4, vcc_lo, 0x7fe, v0
11926; GFX12-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
11927; GFX12-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
11928; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
11929; GFX12-NEXT:    v_and_b32_e32 v0, -4, v4
11930; GFX12-NEXT:    v_and_b32_e32 v4, 3, v4
11931; GFX12-NEXT:    s_mov_b32 s0, 0
11932; GFX12-NEXT:    flat_load_b32 v3, v[0:1]
11933; GFX12-NEXT:    v_lshlrev_b32_e32 v4, 3, v4
11934; GFX12-NEXT:    v_lshlrev_b32_e64 v5, v4, 0xffff
11935; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
11936; GFX12-NEXT:    v_not_b32_e32 v5, v5
11937; GFX12-NEXT:  .LBB45_1: ; %atomicrmw.start
11938; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
11939; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
11940; GFX12-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
11941; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
11942; GFX12-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
11943; GFX12-NEXT:    v_min_num_f32_e32 v2, v2, v6
11944; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
11945; GFX12-NEXT:    v_bfe_u32 v7, v2, 16, 1
11946; GFX12-NEXT:    v_or_b32_e32 v8, 0x400000, v2
11947; GFX12-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
11948; GFX12-NEXT:    v_add3_u32 v7, v7, v2, 0x7fff
11949; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
11950; GFX12-NEXT:    v_cndmask_b32_e32 v2, v7, v8, vcc_lo
11951; GFX12-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
11952; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
11953; GFX12-NEXT:    v_lshlrev_b32_e32 v2, v4, v2
11954; GFX12-NEXT:    v_and_or_b32 v2, v3, v5, v2
11955; GFX12-NEXT:    global_wb scope:SCOPE_SYS
11956; GFX12-NEXT:    s_wait_storecnt 0x0
11957; GFX12-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
11958; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
11959; GFX12-NEXT:    global_inv scope:SCOPE_SYS
11960; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
11961; GFX12-NEXT:    v_mov_b32_e32 v3, v2
11962; GFX12-NEXT:    s_wait_alu 0xfffe
11963; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
11964; GFX12-NEXT:    s_wait_alu 0xfffe
11965; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
11966; GFX12-NEXT:    s_cbranch_execnz .LBB45_1
11967; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
11968; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
11969; GFX12-NEXT:    s_wait_alu 0xfffe
11970; GFX12-NEXT:    s_setpc_b64 s[30:31]
11971;
11972; GFX940-LABEL: flat_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
11973; GFX940:       ; %bb.0:
11974; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
11975; GFX940-NEXT:    s_mov_b64 s[0:1], 0x7fe
11976; GFX940-NEXT:    v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
11977; GFX940-NEXT:    v_and_b32_e32 v0, -4, v4
11978; GFX940-NEXT:    v_mov_b32_e32 v1, v5
11979; GFX940-NEXT:    flat_load_dword v3, v[0:1]
11980; GFX940-NEXT:    v_and_b32_e32 v4, 3, v4
11981; GFX940-NEXT:    v_lshlrev_b32_e32 v4, 3, v4
11982; GFX940-NEXT:    s_mov_b32 s0, 0xffff
11983; GFX940-NEXT:    v_lshlrev_b32_e64 v5, v4, s0
11984; GFX940-NEXT:    v_not_b32_e32 v5, v5
11985; GFX940-NEXT:    s_mov_b64 s[0:1], 0
11986; GFX940-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
11987; GFX940-NEXT:    s_movk_i32 s2, 0x7fff
11988; GFX940-NEXT:  .LBB45_1: ; %atomicrmw.start
11989; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
11990; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
11991; GFX940-NEXT:    v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
11992; GFX940-NEXT:    s_nop 0
11993; GFX940-NEXT:    v_min_f32_e32 v2, v2, v6
11994; GFX940-NEXT:    v_bfe_u32 v7, v2, 16, 1
11995; GFX940-NEXT:    v_or_b32_e32 v8, 0x400000, v2
11996; GFX940-NEXT:    v_add3_u32 v7, v7, v2, s2
11997; GFX940-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
11998; GFX940-NEXT:    s_nop 1
11999; GFX940-NEXT:    v_cndmask_b32_e32 v2, v7, v8, vcc
12000; GFX940-NEXT:    v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
12001; GFX940-NEXT:    v_and_or_b32 v2, v3, v5, v2
12002; GFX940-NEXT:    buffer_wbl2 sc0 sc1
12003; GFX940-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 sc1
12004; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12005; GFX940-NEXT:    buffer_inv sc0 sc1
12006; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
12007; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
12008; GFX940-NEXT:    v_mov_b32_e32 v3, v2
12009; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
12010; GFX940-NEXT:    s_cbranch_execnz .LBB45_1
12011; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
12012; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
12013; GFX940-NEXT:    s_setpc_b64 s[30:31]
12014;
12015; GFX11-LABEL: flat_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
12016; GFX11:       ; %bb.0:
12017; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12018; GFX11-NEXT:    v_add_co_u32 v4, vcc_lo, 0x7fe, v0
12019; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
12020; GFX11-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
12021; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
12022; GFX11-NEXT:    v_and_b32_e32 v0, -4, v4
12023; GFX11-NEXT:    v_and_b32_e32 v4, 3, v4
12024; GFX11-NEXT:    s_mov_b32 s0, 0
12025; GFX11-NEXT:    flat_load_b32 v3, v[0:1]
12026; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 3, v4
12027; GFX11-NEXT:    v_lshlrev_b32_e64 v5, v4, 0xffff
12028; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
12029; GFX11-NEXT:    v_not_b32_e32 v5, v5
12030; GFX11-NEXT:    .p2align 6
12031; GFX11-NEXT:  .LBB45_1: ; %atomicrmw.start
12032; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
12033; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12034; GFX11-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
12035; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
12036; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
12037; GFX11-NEXT:    v_min_f32_e32 v2, v2, v6
12038; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
12039; GFX11-NEXT:    v_bfe_u32 v7, v2, 16, 1
12040; GFX11-NEXT:    v_or_b32_e32 v8, 0x400000, v2
12041; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
12042; GFX11-NEXT:    v_add3_u32 v7, v7, v2, 0x7fff
12043; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
12044; GFX11-NEXT:    v_cndmask_b32_e32 v2, v7, v8, vcc_lo
12045; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
12046; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
12047; GFX11-NEXT:    v_lshlrev_b32_e32 v2, v4, v2
12048; GFX11-NEXT:    v_and_or_b32 v2, v3, v5, v2
12049; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
12050; GFX11-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc
12051; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12052; GFX11-NEXT:    buffer_gl1_inv
12053; GFX11-NEXT:    buffer_gl0_inv
12054; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
12055; GFX11-NEXT:    v_mov_b32_e32 v3, v2
12056; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
12057; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
12058; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
12059; GFX11-NEXT:    s_cbranch_execnz .LBB45_1
12060; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
12061; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
12062; GFX11-NEXT:    s_setpc_b64 s[30:31]
12063;
12064; GFX10-LABEL: flat_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
12065; GFX10:       ; %bb.0:
12066; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12067; GFX10-NEXT:    v_add_co_u32 v4, vcc_lo, 0x7fe, v0
12068; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
12069; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
12070; GFX10-NEXT:    v_and_b32_e32 v0, -4, v4
12071; GFX10-NEXT:    v_and_b32_e32 v4, 3, v4
12072; GFX10-NEXT:    s_mov_b32 s4, 0
12073; GFX10-NEXT:    flat_load_dword v3, v[0:1]
12074; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 3, v4
12075; GFX10-NEXT:    v_lshlrev_b32_e64 v5, v4, 0xffff
12076; GFX10-NEXT:    v_not_b32_e32 v5, v5
12077; GFX10-NEXT:  .LBB45_1: ; %atomicrmw.start
12078; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
12079; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12080; GFX10-NEXT:    v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
12081; GFX10-NEXT:    v_min_f32_e32 v2, v2, v6
12082; GFX10-NEXT:    v_bfe_u32 v7, v2, 16, 1
12083; GFX10-NEXT:    v_or_b32_e32 v8, 0x400000, v2
12084; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
12085; GFX10-NEXT:    v_add3_u32 v7, v7, v2, 0x7fff
12086; GFX10-NEXT:    v_cndmask_b32_e32 v2, v7, v8, vcc_lo
12087; GFX10-NEXT:    v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
12088; GFX10-NEXT:    v_and_or_b32 v2, v3, v5, v2
12089; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
12090; GFX10-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
12091; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12092; GFX10-NEXT:    buffer_gl1_inv
12093; GFX10-NEXT:    buffer_gl0_inv
12094; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
12095; GFX10-NEXT:    v_mov_b32_e32 v3, v2
12096; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
12097; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
12098; GFX10-NEXT:    s_cbranch_execnz .LBB45_1
12099; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
12100; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
12101; GFX10-NEXT:    s_setpc_b64 s[30:31]
12102;
12103; GFX90A-LABEL: flat_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
12104; GFX90A:       ; %bb.0:
12105; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12106; GFX90A-NEXT:    v_add_co_u32_e32 v4, vcc, 0x7fe, v0
12107; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
12108; GFX90A-NEXT:    v_and_b32_e32 v0, -4, v4
12109; GFX90A-NEXT:    flat_load_dword v3, v[0:1]
12110; GFX90A-NEXT:    v_and_b32_e32 v4, 3, v4
12111; GFX90A-NEXT:    v_lshlrev_b32_e32 v4, 3, v4
12112; GFX90A-NEXT:    s_mov_b32 s4, 0xffff
12113; GFX90A-NEXT:    v_lshlrev_b32_e64 v5, v4, s4
12114; GFX90A-NEXT:    v_not_b32_e32 v5, v5
12115; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
12116; GFX90A-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
12117; GFX90A-NEXT:    s_movk_i32 s6, 0x7fff
12118; GFX90A-NEXT:  .LBB45_1: ; %atomicrmw.start
12119; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
12120; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12121; GFX90A-NEXT:    v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
12122; GFX90A-NEXT:    v_min_f32_e32 v2, v2, v6
12123; GFX90A-NEXT:    v_bfe_u32 v7, v2, 16, 1
12124; GFX90A-NEXT:    v_or_b32_e32 v8, 0x400000, v2
12125; GFX90A-NEXT:    v_add3_u32 v7, v7, v2, s6
12126; GFX90A-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
12127; GFX90A-NEXT:    v_cndmask_b32_e32 v2, v7, v8, vcc
12128; GFX90A-NEXT:    v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
12129; GFX90A-NEXT:    v_and_or_b32 v2, v3, v5, v2
12130; GFX90A-NEXT:    buffer_wbl2
12131; GFX90A-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
12132; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12133; GFX90A-NEXT:    buffer_invl2
12134; GFX90A-NEXT:    buffer_wbinvl1
12135; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
12136; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
12137; GFX90A-NEXT:    v_mov_b32_e32 v3, v2
12138; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
12139; GFX90A-NEXT:    s_cbranch_execnz .LBB45_1
12140; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
12141; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
12142; GFX90A-NEXT:    s_setpc_b64 s[30:31]
12143;
12144; GFX908-LABEL: flat_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
12145; GFX908:       ; %bb.0:
12146; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12147; GFX908-NEXT:    v_add_co_u32_e32 v4, vcc, 0x7fe, v0
12148; GFX908-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
12149; GFX908-NEXT:    v_and_b32_e32 v0, -4, v4
12150; GFX908-NEXT:    flat_load_dword v3, v[0:1]
12151; GFX908-NEXT:    v_and_b32_e32 v4, 3, v4
12152; GFX908-NEXT:    v_lshlrev_b32_e32 v4, 3, v4
12153; GFX908-NEXT:    s_mov_b32 s4, 0xffff
12154; GFX908-NEXT:    v_lshlrev_b32_e64 v5, v4, s4
12155; GFX908-NEXT:    v_not_b32_e32 v5, v5
12156; GFX908-NEXT:    s_mov_b64 s[4:5], 0
12157; GFX908-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
12158; GFX908-NEXT:    s_movk_i32 s6, 0x7fff
12159; GFX908-NEXT:  .LBB45_1: ; %atomicrmw.start
12160; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
12161; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12162; GFX908-NEXT:    v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
12163; GFX908-NEXT:    v_min_f32_e32 v2, v2, v6
12164; GFX908-NEXT:    v_bfe_u32 v7, v2, 16, 1
12165; GFX908-NEXT:    v_or_b32_e32 v8, 0x400000, v2
12166; GFX908-NEXT:    v_add3_u32 v7, v7, v2, s6
12167; GFX908-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
12168; GFX908-NEXT:    v_cndmask_b32_e32 v2, v7, v8, vcc
12169; GFX908-NEXT:    v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
12170; GFX908-NEXT:    v_and_or_b32 v2, v3, v5, v2
12171; GFX908-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
12172; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12173; GFX908-NEXT:    buffer_wbinvl1
12174; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
12175; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
12176; GFX908-NEXT:    v_mov_b32_e32 v3, v2
12177; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
12178; GFX908-NEXT:    s_cbranch_execnz .LBB45_1
12179; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
12180; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
12181; GFX908-NEXT:    s_setpc_b64 s[30:31]
12182;
12183; GFX8-LABEL: flat_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
12184; GFX8:       ; %bb.0:
12185; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12186; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 0x7fe, v0
12187; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
12188; GFX8-NEXT:    v_and_b32_e32 v0, -4, v4
12189; GFX8-NEXT:    flat_load_dword v3, v[0:1]
12190; GFX8-NEXT:    v_and_b32_e32 v4, 3, v4
12191; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 3, v4
12192; GFX8-NEXT:    s_mov_b32 s4, 0xffff
12193; GFX8-NEXT:    v_lshlrev_b32_e64 v5, v4, s4
12194; GFX8-NEXT:    v_not_b32_e32 v5, v5
12195; GFX8-NEXT:    s_mov_b64 s[4:5], 0
12196; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
12197; GFX8-NEXT:  .LBB45_1: ; %atomicrmw.start
12198; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
12199; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12200; GFX8-NEXT:    v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
12201; GFX8-NEXT:    v_min_f32_e32 v2, v2, v6
12202; GFX8-NEXT:    v_bfe_u32 v8, v2, 16, 1
12203; GFX8-NEXT:    v_add_u32_e32 v8, vcc, v8, v2
12204; GFX8-NEXT:    v_add_u32_e32 v8, vcc, 0x7fff, v8
12205; GFX8-NEXT:    v_or_b32_e32 v9, 0x400000, v2
12206; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
12207; GFX8-NEXT:    v_cndmask_b32_e32 v2, v8, v9, vcc
12208; GFX8-NEXT:    v_and_b32_e32 v7, v3, v5
12209; GFX8-NEXT:    v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
12210; GFX8-NEXT:    v_or_b32_e32 v2, v7, v2
12211; GFX8-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
12212; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12213; GFX8-NEXT:    buffer_wbinvl1
12214; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
12215; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
12216; GFX8-NEXT:    v_mov_b32_e32 v3, v2
12217; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
12218; GFX8-NEXT:    s_cbranch_execnz .LBB45_1
12219; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
12220; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
12221; GFX8-NEXT:    s_setpc_b64 s[30:31]
12222;
12223; GFX7-LABEL: flat_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
12224; GFX7:       ; %bb.0:
12225; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12226; GFX7-NEXT:    v_add_i32_e32 v4, vcc, 0x7fe, v0
12227; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
12228; GFX7-NEXT:    v_and_b32_e32 v0, -4, v4
12229; GFX7-NEXT:    flat_load_dword v3, v[0:1]
12230; GFX7-NEXT:    v_and_b32_e32 v4, 3, v4
12231; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 3, v4
12232; GFX7-NEXT:    v_lshl_b32_e32 v5, 0xffff, v4
12233; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
12234; GFX7-NEXT:    v_not_b32_e32 v5, v5
12235; GFX7-NEXT:    s_mov_b64 s[4:5], 0
12236; GFX7-NEXT:    v_and_b32_e32 v6, 0xffff0000, v2
12237; GFX7-NEXT:  .LBB45_1: ; %atomicrmw.start
12238; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
12239; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12240; GFX7-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
12241; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
12242; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
12243; GFX7-NEXT:    v_min_f32_e32 v2, v2, v6
12244; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
12245; GFX7-NEXT:    v_and_b32_e32 v7, v3, v5
12246; GFX7-NEXT:    v_lshlrev_b32_e32 v2, v4, v2
12247; GFX7-NEXT:    v_or_b32_e32 v2, v7, v2
12248; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
12249; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12250; GFX7-NEXT:    buffer_wbinvl1
12251; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
12252; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
12253; GFX7-NEXT:    v_mov_b32_e32 v3, v2
12254; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
12255; GFX7-NEXT:    s_cbranch_execnz .LBB45_1
12256; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
12257; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
12258; GFX7-NEXT:    s_setpc_b64 s[30:31]
12259  %gep = getelementptr bfloat, ptr %ptr, i64 1023
12260  %unused = atomicrmw fmin ptr %gep, bfloat %val seq_cst, !amdgpu.no.fine.grained.memory !0
12261  ret void
12262}
12263
12264; --------------------------------------------------------------------
12265; <2 x half>
12266; --------------------------------------------------------------------
12267
; Returning `atomicrmw fmin` on a <2 x half> through a flat pointer with
; syncscope("agent") and seq_cst ordering, tagged with
; !amdgpu.no.fine.grained.memory; no address offset is applied. In the
; expected output every checked target loads the old value once and then
; retries a flat compare-and-swap in the %atomicrmw.start loop until the
; value read back matches the compared one. The GFX7 checks convert each
; half to f32 and use v_min_f32 instead of f16 min instructions.
12268define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_memory(ptr %ptr, <2 x half> %val) #0 {
12269; GFX12-LABEL: flat_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_memory:
12270; GFX12:       ; %bb.0:
12271; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
12272; GFX12-NEXT:    s_wait_expcnt 0x0
12273; GFX12-NEXT:    s_wait_samplecnt 0x0
12274; GFX12-NEXT:    s_wait_bvhcnt 0x0
12275; GFX12-NEXT:    s_wait_kmcnt 0x0
12276; GFX12-NEXT:    flat_load_b32 v3, v[0:1]
12277; GFX12-NEXT:    v_pk_max_num_f16 v2, v2, v2
12278; GFX12-NEXT:    s_mov_b32 s0, 0
12279; GFX12-NEXT:  .LBB46_1: ; %atomicrmw.start
12280; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
12281; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
12282; GFX12-NEXT:    v_mov_b32_e32 v4, v3
12283; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
12284; GFX12-NEXT:    v_pk_max_num_f16 v3, v4, v4
12285; GFX12-NEXT:    v_pk_min_num_f16 v3, v3, v2
12286; GFX12-NEXT:    s_wait_storecnt 0x0
12287; GFX12-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
12288; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
12289; GFX12-NEXT:    global_inv scope:SCOPE_DEV
12290; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
12291; GFX12-NEXT:    s_wait_alu 0xfffe
12292; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
12293; GFX12-NEXT:    s_wait_alu 0xfffe
12294; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
12295; GFX12-NEXT:    s_cbranch_execnz .LBB46_1
12296; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
12297; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
12298; GFX12-NEXT:    v_mov_b32_e32 v0, v3
12299; GFX12-NEXT:    s_wait_alu 0xfffe
12300; GFX12-NEXT:    s_setpc_b64 s[30:31]
12301;
12302; GFX940-LABEL: flat_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_memory:
12303; GFX940:       ; %bb.0:
12304; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12305; GFX940-NEXT:    flat_load_dword v3, v[0:1]
12306; GFX940-NEXT:    s_mov_b64 s[0:1], 0
12307; GFX940-NEXT:    v_pk_max_f16 v2, v2, v2
12308; GFX940-NEXT:  .LBB46_1: ; %atomicrmw.start
12309; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
12310; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12311; GFX940-NEXT:    v_mov_b32_e32 v5, v3
12312; GFX940-NEXT:    v_pk_max_f16 v3, v5, v5
12313; GFX940-NEXT:    s_nop 0
12314; GFX940-NEXT:    v_pk_min_f16 v4, v3, v2
12315; GFX940-NEXT:    buffer_wbl2 sc1
12316; GFX940-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0
12317; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12318; GFX940-NEXT:    buffer_inv sc1
12319; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
12320; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
12321; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
12322; GFX940-NEXT:    s_cbranch_execnz .LBB46_1
12323; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
12324; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
12325; GFX940-NEXT:    v_mov_b32_e32 v0, v3
12326; GFX940-NEXT:    s_setpc_b64 s[30:31]
12327;
12328; GFX11-LABEL: flat_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_memory:
12329; GFX11:       ; %bb.0:
12330; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12331; GFX11-NEXT:    flat_load_b32 v3, v[0:1]
12332; GFX11-NEXT:    v_pk_max_f16 v2, v2, v2
12333; GFX11-NEXT:    s_mov_b32 s0, 0
12334; GFX11-NEXT:  .LBB46_1: ; %atomicrmw.start
12335; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
12336; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12337; GFX11-NEXT:    v_mov_b32_e32 v4, v3
12338; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
12339; GFX11-NEXT:    v_pk_max_f16 v3, v4, v4
12340; GFX11-NEXT:    v_pk_min_f16 v3, v3, v2
12341; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
12342; GFX11-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc
12343; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12344; GFX11-NEXT:    buffer_gl1_inv
12345; GFX11-NEXT:    buffer_gl0_inv
12346; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
12347; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
12348; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
12349; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
12350; GFX11-NEXT:    s_cbranch_execnz .LBB46_1
12351; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
12352; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
12353; GFX11-NEXT:    v_mov_b32_e32 v0, v3
12354; GFX11-NEXT:    s_setpc_b64 s[30:31]
12355;
12356; GFX10-LABEL: flat_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_memory:
12357; GFX10:       ; %bb.0:
12358; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12359; GFX10-NEXT:    flat_load_dword v3, v[0:1]
12360; GFX10-NEXT:    v_pk_max_f16 v2, v2, v2
12361; GFX10-NEXT:    s_mov_b32 s4, 0
12362; GFX10-NEXT:  .LBB46_1: ; %atomicrmw.start
12363; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
12364; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12365; GFX10-NEXT:    v_mov_b32_e32 v4, v3
12366; GFX10-NEXT:    v_pk_max_f16 v3, v4, v4
12367; GFX10-NEXT:    v_pk_min_f16 v3, v3, v2
12368; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
12369; GFX10-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
12370; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12371; GFX10-NEXT:    buffer_gl1_inv
12372; GFX10-NEXT:    buffer_gl0_inv
12373; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
12374; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
12375; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
12376; GFX10-NEXT:    s_cbranch_execnz .LBB46_1
12377; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
12378; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
12379; GFX10-NEXT:    v_mov_b32_e32 v0, v3
12380; GFX10-NEXT:    s_setpc_b64 s[30:31]
12381;
12382; GFX90A-LABEL: flat_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_memory:
12383; GFX90A:       ; %bb.0:
12384; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12385; GFX90A-NEXT:    flat_load_dword v3, v[0:1]
12386; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
12387; GFX90A-NEXT:    v_pk_max_f16 v2, v2, v2
12388; GFX90A-NEXT:  .LBB46_1: ; %atomicrmw.start
12389; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
12390; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12391; GFX90A-NEXT:    v_mov_b32_e32 v5, v3
12392; GFX90A-NEXT:    v_pk_max_f16 v3, v5, v5
12393; GFX90A-NEXT:    v_pk_min_f16 v4, v3, v2
12394; GFX90A-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[4:5] glc
12395; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12396; GFX90A-NEXT:    buffer_wbinvl1
12397; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
12398; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
12399; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
12400; GFX90A-NEXT:    s_cbranch_execnz .LBB46_1
12401; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
12402; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
12403; GFX90A-NEXT:    v_mov_b32_e32 v0, v3
12404; GFX90A-NEXT:    s_setpc_b64 s[30:31]
12405;
12406; GFX908-LABEL: flat_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_memory:
12407; GFX908:       ; %bb.0:
12408; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12409; GFX908-NEXT:    flat_load_dword v3, v[0:1]
12410; GFX908-NEXT:    s_mov_b64 s[4:5], 0
12411; GFX908-NEXT:    v_pk_max_f16 v2, v2, v2
12412; GFX908-NEXT:  .LBB46_1: ; %atomicrmw.start
12413; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
12414; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12415; GFX908-NEXT:    v_mov_b32_e32 v4, v3
12416; GFX908-NEXT:    v_pk_max_f16 v3, v4, v4
12417; GFX908-NEXT:    v_pk_min_f16 v3, v3, v2
12418; GFX908-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
12419; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12420; GFX908-NEXT:    buffer_wbinvl1
12421; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
12422; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
12423; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
12424; GFX908-NEXT:    s_cbranch_execnz .LBB46_1
12425; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
12426; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
12427; GFX908-NEXT:    v_mov_b32_e32 v0, v3
12428; GFX908-NEXT:    s_setpc_b64 s[30:31]
12429;
12430; GFX8-LABEL: flat_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_memory:
12431; GFX8:       ; %bb.0:
12432; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12433; GFX8-NEXT:    flat_load_dword v3, v[0:1]
12434; GFX8-NEXT:    s_mov_b64 s[4:5], 0
12435; GFX8-NEXT:    v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
12436; GFX8-NEXT:    v_max_f16_e32 v2, v2, v2
12437; GFX8-NEXT:  .LBB46_1: ; %atomicrmw.start
12438; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
12439; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12440; GFX8-NEXT:    v_mov_b32_e32 v6, v3
12441; GFX8-NEXT:    v_max_f16_sdwa v3, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
12442; GFX8-NEXT:    v_max_f16_e32 v5, v6, v6
12443; GFX8-NEXT:    v_min_f16_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
12444; GFX8-NEXT:    v_min_f16_e32 v5, v5, v2
12445; GFX8-NEXT:    v_or_b32_e32 v5, v5, v3
12446; GFX8-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[5:6] glc
12447; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12448; GFX8-NEXT:    buffer_wbinvl1
12449; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v6
12450; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
12451; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
12452; GFX8-NEXT:    s_cbranch_execnz .LBB46_1
12453; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
12454; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
12455; GFX8-NEXT:    v_mov_b32_e32 v0, v3
12456; GFX8-NEXT:    s_setpc_b64 s[30:31]
12457;
12458; GFX7-LABEL: flat_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_memory:
12459; GFX7:       ; %bb.0:
12460; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12461; GFX7-NEXT:    flat_load_dword v5, v[0:1]
12462; GFX7-NEXT:    v_cvt_f16_f32_e32 v3, v3
12463; GFX7-NEXT:    v_cvt_f16_f32_e32 v6, v2
12464; GFX7-NEXT:    s_mov_b64 s[4:5], 0
12465; GFX7-NEXT:    v_cvt_f32_f16_e32 v4, v3
12466; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12467; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v5
12468; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v5
12469; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v3
12470; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v6
12471; GFX7-NEXT:  .LBB46_1: ; %atomicrmw.start
12472; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
12473; GFX7-NEXT:    v_cvt_f16_f32_e32 v3, v3
12474; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
12475; GFX7-NEXT:    v_cvt_f32_f16_e32 v6, v3
12476; GFX7-NEXT:    v_cvt_f32_f16_e32 v7, v2
12477; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
12478; GFX7-NEXT:    v_min_f32_e32 v6, v6, v4
12479; GFX7-NEXT:    v_min_f32_e32 v7, v7, v5
12480; GFX7-NEXT:    v_cvt_f16_f32_e32 v6, v6
12481; GFX7-NEXT:    v_cvt_f16_f32_e32 v8, v7
12482; GFX7-NEXT:    v_or_b32_e32 v7, v2, v3
12483; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v6
12484; GFX7-NEXT:    v_or_b32_e32 v6, v8, v2
12485; GFX7-NEXT:    flat_atomic_cmpswap v6, v[0:1], v[6:7] glc
12486; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12487; GFX7-NEXT:    buffer_wbinvl1
12488; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v6
12489; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v6
12490; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v3
12491; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v6, v7
12492; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
12493; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
12494; GFX7-NEXT:    s_cbranch_execnz .LBB46_1
12495; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
12496; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
12497; GFX7-NEXT:    v_mov_b32_e32 v0, v2
12498; GFX7-NEXT:    v_mov_b32_e32 v1, v3
12499; GFX7-NEXT:    s_setpc_b64 s[30:31]
12500  %result = atomicrmw fmin ptr %ptr, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
12501  ret <2 x half> %result
12502}
12503
; Returning `atomicrmw fmin` on a <2 x half> with syncscope("agent") and
; seq_cst ordering, tagged with !amdgpu.no.fine.grained.memory, addressed
; 511 elements (2044 bytes, 0x7fc) past %ptr. In the expected output the
; GFX12, GFX940, GFX11, GFX90A and GFX908 sequences carry the displacement
; as offset:2044 on the flat load and flat compare-and-swap, while the
; GFX10, GFX8 and GFX7 sequences add 0x7fc to the 64-bit address with an
; explicit add/add-with-carry pair first. Every target retries a flat
; compare-and-swap in the %atomicrmw.start loop.
12504define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr %ptr, <2 x half> %val) #0 {
12505; GFX12-LABEL: flat_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
12506; GFX12:       ; %bb.0:
12507; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
12508; GFX12-NEXT:    s_wait_expcnt 0x0
12509; GFX12-NEXT:    s_wait_samplecnt 0x0
12510; GFX12-NEXT:    s_wait_bvhcnt 0x0
12511; GFX12-NEXT:    s_wait_kmcnt 0x0
12512; GFX12-NEXT:    flat_load_b32 v3, v[0:1] offset:2044
12513; GFX12-NEXT:    v_pk_max_num_f16 v2, v2, v2
12514; GFX12-NEXT:    s_mov_b32 s0, 0
12515; GFX12-NEXT:  .LBB47_1: ; %atomicrmw.start
12516; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
12517; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
12518; GFX12-NEXT:    v_mov_b32_e32 v4, v3
12519; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
12520; GFX12-NEXT:    v_pk_max_num_f16 v3, v4, v4
12521; GFX12-NEXT:    v_pk_min_num_f16 v3, v3, v2
12522; GFX12-NEXT:    s_wait_storecnt 0x0
12523; GFX12-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
12524; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
12525; GFX12-NEXT:    global_inv scope:SCOPE_DEV
12526; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
12527; GFX12-NEXT:    s_wait_alu 0xfffe
12528; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
12529; GFX12-NEXT:    s_wait_alu 0xfffe
12530; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
12531; GFX12-NEXT:    s_cbranch_execnz .LBB47_1
12532; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
12533; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
12534; GFX12-NEXT:    v_mov_b32_e32 v0, v3
12535; GFX12-NEXT:    s_wait_alu 0xfffe
12536; GFX12-NEXT:    s_setpc_b64 s[30:31]
12537;
12538; GFX940-LABEL: flat_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
12539; GFX940:       ; %bb.0:
12540; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12541; GFX940-NEXT:    flat_load_dword v3, v[0:1] offset:2044
12542; GFX940-NEXT:    s_mov_b64 s[0:1], 0
12543; GFX940-NEXT:    v_pk_max_f16 v2, v2, v2
12544; GFX940-NEXT:  .LBB47_1: ; %atomicrmw.start
12545; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
12546; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12547; GFX940-NEXT:    v_mov_b32_e32 v5, v3
12548; GFX940-NEXT:    v_pk_max_f16 v3, v5, v5
12549; GFX940-NEXT:    s_nop 0
12550; GFX940-NEXT:    v_pk_min_f16 v4, v3, v2
12551; GFX940-NEXT:    buffer_wbl2 sc1
12552; GFX940-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0
12553; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12554; GFX940-NEXT:    buffer_inv sc1
12555; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
12556; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
12557; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
12558; GFX940-NEXT:    s_cbranch_execnz .LBB47_1
12559; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
12560; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
12561; GFX940-NEXT:    v_mov_b32_e32 v0, v3
12562; GFX940-NEXT:    s_setpc_b64 s[30:31]
12563;
12564; GFX11-LABEL: flat_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
12565; GFX11:       ; %bb.0:
12566; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12567; GFX11-NEXT:    flat_load_b32 v3, v[0:1] offset:2044
12568; GFX11-NEXT:    v_pk_max_f16 v2, v2, v2
12569; GFX11-NEXT:    s_mov_b32 s0, 0
12570; GFX11-NEXT:  .LBB47_1: ; %atomicrmw.start
12571; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
12572; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12573; GFX11-NEXT:    v_mov_b32_e32 v4, v3
12574; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
12575; GFX11-NEXT:    v_pk_max_f16 v3, v4, v4
12576; GFX11-NEXT:    v_pk_min_f16 v3, v3, v2
12577; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
12578; GFX11-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc
12579; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12580; GFX11-NEXT:    buffer_gl1_inv
12581; GFX11-NEXT:    buffer_gl0_inv
12582; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
12583; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
12584; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
12585; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
12586; GFX11-NEXT:    s_cbranch_execnz .LBB47_1
12587; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
12588; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
12589; GFX11-NEXT:    v_mov_b32_e32 v0, v3
12590; GFX11-NEXT:    s_setpc_b64 s[30:31]
12591;
12592; GFX10-LABEL: flat_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
12593; GFX10:       ; %bb.0:
12594; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12595; GFX10-NEXT:    v_add_co_u32 v3, vcc_lo, 0x7fc, v0
12596; GFX10-NEXT:    v_add_co_ci_u32_e32 v4, vcc_lo, 0, v1, vcc_lo
12597; GFX10-NEXT:    v_pk_max_f16 v1, v2, v2
12598; GFX10-NEXT:    s_mov_b32 s4, 0
12599; GFX10-NEXT:    flat_load_dword v0, v[3:4]
12600; GFX10-NEXT:  .LBB47_1: ; %atomicrmw.start
12601; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
12602; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12603; GFX10-NEXT:    v_mov_b32_e32 v6, v0
12604; GFX10-NEXT:    v_pk_max_f16 v0, v6, v6
12605; GFX10-NEXT:    v_pk_min_f16 v5, v0, v1
12606; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
12607; GFX10-NEXT:    flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
12608; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12609; GFX10-NEXT:    buffer_gl1_inv
12610; GFX10-NEXT:    buffer_gl0_inv
12611; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v6
12612; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
12613; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
12614; GFX10-NEXT:    s_cbranch_execnz .LBB47_1
12615; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
12616; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
12617; GFX10-NEXT:    s_setpc_b64 s[30:31]
12618;
12619; GFX90A-LABEL: flat_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
12620; GFX90A:       ; %bb.0:
12621; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12622; GFX90A-NEXT:    flat_load_dword v3, v[0:1] offset:2044
12623; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
12624; GFX90A-NEXT:    v_pk_max_f16 v2, v2, v2
12625; GFX90A-NEXT:  .LBB47_1: ; %atomicrmw.start
12626; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
12627; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12628; GFX90A-NEXT:    v_mov_b32_e32 v5, v3
12629; GFX90A-NEXT:    v_pk_max_f16 v3, v5, v5
12630; GFX90A-NEXT:    v_pk_min_f16 v4, v3, v2
12631; GFX90A-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc
12632; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12633; GFX90A-NEXT:    buffer_wbinvl1
12634; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
12635; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
12636; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
12637; GFX90A-NEXT:    s_cbranch_execnz .LBB47_1
12638; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
12639; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
12640; GFX90A-NEXT:    v_mov_b32_e32 v0, v3
12641; GFX90A-NEXT:    s_setpc_b64 s[30:31]
12642;
12643; GFX908-LABEL: flat_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
12644; GFX908:       ; %bb.0:
12645; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12646; GFX908-NEXT:    flat_load_dword v3, v[0:1] offset:2044
12647; GFX908-NEXT:    s_mov_b64 s[4:5], 0
12648; GFX908-NEXT:    v_pk_max_f16 v2, v2, v2
12649; GFX908-NEXT:  .LBB47_1: ; %atomicrmw.start
12650; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
12651; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12652; GFX908-NEXT:    v_mov_b32_e32 v4, v3
12653; GFX908-NEXT:    v_pk_max_f16 v3, v4, v4
12654; GFX908-NEXT:    v_pk_min_f16 v3, v3, v2
12655; GFX908-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc
12656; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12657; GFX908-NEXT:    buffer_wbinvl1
12658; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
12659; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
12660; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
12661; GFX908-NEXT:    s_cbranch_execnz .LBB47_1
12662; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
12663; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
12664; GFX908-NEXT:    v_mov_b32_e32 v0, v3
12665; GFX908-NEXT:    s_setpc_b64 s[30:31]
12666;
12667; GFX8-LABEL: flat_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
12668; GFX8:       ; %bb.0:
12669; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12670; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x7fc, v0
12671; GFX8-NEXT:    v_addc_u32_e32 v4, vcc, 0, v1, vcc
12672; GFX8-NEXT:    flat_load_dword v0, v[3:4]
12673; GFX8-NEXT:    s_mov_b64 s[4:5], 0
12674; GFX8-NEXT:    v_max_f16_sdwa v1, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
12675; GFX8-NEXT:    v_max_f16_e32 v2, v2, v2
12676; GFX8-NEXT:  .LBB47_1: ; %atomicrmw.start
12677; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
12678; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12679; GFX8-NEXT:    v_mov_b32_e32 v6, v0
12680; GFX8-NEXT:    v_max_f16_sdwa v0, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
12681; GFX8-NEXT:    v_max_f16_e32 v5, v6, v6
12682; GFX8-NEXT:    v_min_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
12683; GFX8-NEXT:    v_min_f16_e32 v5, v5, v2
12684; GFX8-NEXT:    v_or_b32_e32 v5, v5, v0
12685; GFX8-NEXT:    flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
12686; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12687; GFX8-NEXT:    buffer_wbinvl1
12688; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v6
12689; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
12690; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
12691; GFX8-NEXT:    s_cbranch_execnz .LBB47_1
12692; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
12693; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
12694; GFX8-NEXT:    s_setpc_b64 s[30:31]
12695;
12696; GFX7-LABEL: flat_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
12697; GFX7:       ; %bb.0:
12698; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12699; GFX7-NEXT:    v_add_i32_e32 v4, vcc, 0x7fc, v0
12700; GFX7-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
12701; GFX7-NEXT:    flat_load_dword v1, v[4:5]
12702; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v3
12703; GFX7-NEXT:    v_cvt_f16_f32_e32 v3, v2
12704; GFX7-NEXT:    s_mov_b64 s[4:5], 0
12705; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v0
12706; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v3
12707; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12708; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v1
12709; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
12710; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
12711; GFX7-NEXT:  .LBB47_1: ; %atomicrmw.start
12712; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
12713; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
12714; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
12715; GFX7-NEXT:    v_cvt_f32_f16_e32 v6, v1
12716; GFX7-NEXT:    v_cvt_f32_f16_e32 v7, v0
12717; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
12718; GFX7-NEXT:    v_min_f32_e32 v6, v6, v2
12719; GFX7-NEXT:    v_min_f32_e32 v7, v7, v3
12720; GFX7-NEXT:    v_cvt_f16_f32_e32 v6, v6
12721; GFX7-NEXT:    v_cvt_f16_f32_e32 v8, v7
12722; GFX7-NEXT:    v_or_b32_e32 v7, v0, v1
12723; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v6
12724; GFX7-NEXT:    v_or_b32_e32 v6, v8, v0
12725; GFX7-NEXT:    flat_atomic_cmpswap v6, v[4:5], v[6:7] glc
12726; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12727; GFX7-NEXT:    buffer_wbinvl1
12728; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v6
12729; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v6
12730; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
12731; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v6, v7
12732; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
12733; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
12734; GFX7-NEXT:    s_cbranch_execnz .LBB47_1
12735; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
12736; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
12737; GFX7-NEXT:    s_setpc_b64 s[30:31]
; Index 511 of a 4-byte <2 x half> element gives byte offset 2044, which is
; the offset:2044 / 0x7fc constant seen in the expected assembly.
12738  %gep = getelementptr <2 x half>, ptr %ptr, i64 511
12739  %result = atomicrmw fmin ptr %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
12740  ret <2 x half> %result
12741}
12742
; Returning <2 x half> atomic fmin at agent scope, seq_cst, on a flat pointer
; with a negative offset: the GEP steps back 512 <2 x half> elements
; (-2048 bytes) and the previous memory value is returned to the caller.
; In the checks below every target expands the operation into a
; flat_atomic_cmpswap retry loop. Only the gfx1200 run keeps the offset as an
; instruction immediate (offset:-2048); the other runs first add
; 0xfffff800 / -1 to the 64-bit pointer.
; The check lines are autogenerated (see the note on the first line of this
; file); regenerate them with utils/update_llc_test_checks.py rather than
; editing them by hand.
12743define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr %ptr, <2 x half> %val) #0 {
12744; GFX12-LABEL: flat_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory:
12745; GFX12:       ; %bb.0:
12746; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
12747; GFX12-NEXT:    s_wait_expcnt 0x0
12748; GFX12-NEXT:    s_wait_samplecnt 0x0
12749; GFX12-NEXT:    s_wait_bvhcnt 0x0
12750; GFX12-NEXT:    s_wait_kmcnt 0x0
12751; GFX12-NEXT:    flat_load_b32 v3, v[0:1] offset:-2048
12752; GFX12-NEXT:    v_pk_max_num_f16 v2, v2, v2
12753; GFX12-NEXT:    s_mov_b32 s0, 0
12754; GFX12-NEXT:  .LBB48_1: ; %atomicrmw.start
12755; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
12756; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
12757; GFX12-NEXT:    v_mov_b32_e32 v4, v3
12758; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
12759; GFX12-NEXT:    v_pk_max_num_f16 v3, v4, v4
12760; GFX12-NEXT:    v_pk_min_num_f16 v3, v3, v2
12761; GFX12-NEXT:    s_wait_storecnt 0x0
12762; GFX12-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
12763; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
12764; GFX12-NEXT:    global_inv scope:SCOPE_DEV
12765; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
12766; GFX12-NEXT:    s_wait_alu 0xfffe
12767; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
12768; GFX12-NEXT:    s_wait_alu 0xfffe
12769; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
12770; GFX12-NEXT:    s_cbranch_execnz .LBB48_1
12771; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
12772; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
12773; GFX12-NEXT:    v_mov_b32_e32 v0, v3
12774; GFX12-NEXT:    s_wait_alu 0xfffe
12775; GFX12-NEXT:    s_setpc_b64 s[30:31]
12776;
12777; GFX940-LABEL: flat_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory:
12778; GFX940:       ; %bb.0:
12779; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12780; GFX940-NEXT:    v_mov_b32_e32 v4, v0
12781; GFX940-NEXT:    v_mov_b32_e32 v5, v1
12782; GFX940-NEXT:    v_add_co_u32_e32 v0, vcc, 0xfffff800, v4
12783; GFX940-NEXT:    s_movk_i32 s0, 0xf800
12784; GFX940-NEXT:    s_nop 0
12785; GFX940-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v5, vcc
12786; GFX940-NEXT:    flat_load_dword v0, v[0:1]
12787; GFX940-NEXT:    s_mov_b32 s1, -1
12788; GFX940-NEXT:    v_lshl_add_u64 v[4:5], v[4:5], 0, s[0:1]
12789; GFX940-NEXT:    s_mov_b64 s[0:1], 0
12790; GFX940-NEXT:    v_pk_max_f16 v1, v2, v2
12791; GFX940-NEXT:  .LBB48_1: ; %atomicrmw.start
12792; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
12793; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12794; GFX940-NEXT:    v_mov_b32_e32 v3, v0
12795; GFX940-NEXT:    v_pk_max_f16 v0, v3, v3
12796; GFX940-NEXT:    s_nop 0
12797; GFX940-NEXT:    v_pk_min_f16 v2, v0, v1
12798; GFX940-NEXT:    buffer_wbl2 sc1
12799; GFX940-NEXT:    flat_atomic_cmpswap v0, v[4:5], v[2:3] sc0
12800; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12801; GFX940-NEXT:    buffer_inv sc1
12802; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v3
12803; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
12804; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
12805; GFX940-NEXT:    s_cbranch_execnz .LBB48_1
12806; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
12807; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
12808; GFX940-NEXT:    s_setpc_b64 s[30:31]
12809;
12810; GFX11-LABEL: flat_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory:
12811; GFX11:       ; %bb.0:
12812; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12813; GFX11-NEXT:    v_mov_b32_e32 v3, v0
12814; GFX11-NEXT:    s_mov_b32 s0, 0
12815; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
12816; GFX11-NEXT:    v_add_co_u32 v4, vcc_lo, 0xfffff800, v3
12817; GFX11-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, -1, v1, vcc_lo
12818; GFX11-NEXT:    v_add_co_u32 v3, vcc_lo, 0xfffff800, v3
12819; GFX11-NEXT:    flat_load_b32 v0, v[4:5]
12820; GFX11-NEXT:    v_add_co_ci_u32_e32 v4, vcc_lo, -1, v1, vcc_lo
12821; GFX11-NEXT:    v_pk_max_f16 v1, v2, v2
12822; GFX11-NEXT:  .LBB48_1: ; %atomicrmw.start
12823; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
12824; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12825; GFX11-NEXT:    v_mov_b32_e32 v6, v0
12826; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
12827; GFX11-NEXT:    v_pk_max_f16 v0, v6, v6
12828; GFX11-NEXT:    v_pk_min_f16 v5, v0, v1
12829; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
12830; GFX11-NEXT:    flat_atomic_cmpswap_b32 v0, v[3:4], v[5:6] glc
12831; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12832; GFX11-NEXT:    buffer_gl1_inv
12833; GFX11-NEXT:    buffer_gl0_inv
12834; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v6
12835; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
12836; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
12837; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
12838; GFX11-NEXT:    s_cbranch_execnz .LBB48_1
12839; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
12840; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
12841; GFX11-NEXT:    s_setpc_b64 s[30:31]
12842;
12843; GFX10-LABEL: flat_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory:
12844; GFX10:       ; %bb.0:
12845; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12846; GFX10-NEXT:    v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
12847; GFX10-NEXT:    v_add_co_ci_u32_e32 v4, vcc_lo, -1, v1, vcc_lo
12848; GFX10-NEXT:    v_pk_max_f16 v1, v2, v2
12849; GFX10-NEXT:    s_mov_b32 s4, 0
12850; GFX10-NEXT:    flat_load_dword v0, v[3:4]
12851; GFX10-NEXT:  .LBB48_1: ; %atomicrmw.start
12852; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
12853; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12854; GFX10-NEXT:    v_mov_b32_e32 v6, v0
12855; GFX10-NEXT:    v_pk_max_f16 v0, v6, v6
12856; GFX10-NEXT:    v_pk_min_f16 v5, v0, v1
12857; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
12858; GFX10-NEXT:    flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
12859; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12860; GFX10-NEXT:    buffer_gl1_inv
12861; GFX10-NEXT:    buffer_gl0_inv
12862; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v6
12863; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
12864; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
12865; GFX10-NEXT:    s_cbranch_execnz .LBB48_1
12866; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
12867; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
12868; GFX10-NEXT:    s_setpc_b64 s[30:31]
12869;
12870; GFX90A-LABEL: flat_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory:
12871; GFX90A:       ; %bb.0:
12872; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12873; GFX90A-NEXT:    v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
12874; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, -1, v1, vcc
12875; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
12876; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
12877; GFX90A-NEXT:    flat_load_dword v0, v[0:1]
12878; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
12879; GFX90A-NEXT:    v_pk_max_f16 v1, v2, v2
12880; GFX90A-NEXT:  .LBB48_1: ; %atomicrmw.start
12881; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
12882; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12883; GFX90A-NEXT:    v_mov_b32_e32 v3, v0
12884; GFX90A-NEXT:    v_pk_max_f16 v0, v3, v3
12885; GFX90A-NEXT:    v_pk_min_f16 v2, v0, v1
12886; GFX90A-NEXT:    flat_atomic_cmpswap v0, v[4:5], v[2:3] glc
12887; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12888; GFX90A-NEXT:    buffer_wbinvl1
12889; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v3
12890; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
12891; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
12892; GFX90A-NEXT:    s_cbranch_execnz .LBB48_1
12893; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
12894; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
12895; GFX90A-NEXT:    s_setpc_b64 s[30:31]
12896;
12897; GFX908-LABEL: flat_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory:
12898; GFX908:       ; %bb.0:
12899; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12900; GFX908-NEXT:    v_add_co_u32_e32 v3, vcc, 0xfffff800, v0
12901; GFX908-NEXT:    v_addc_co_u32_e32 v4, vcc, -1, v1, vcc
12902; GFX908-NEXT:    v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
12903; GFX908-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
12904; GFX908-NEXT:    flat_load_dword v0, v[0:1]
12905; GFX908-NEXT:    s_mov_b64 s[4:5], 0
12906; GFX908-NEXT:    v_pk_max_f16 v1, v2, v2
12907; GFX908-NEXT:  .LBB48_1: ; %atomicrmw.start
12908; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
12909; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12910; GFX908-NEXT:    v_mov_b32_e32 v6, v0
12911; GFX908-NEXT:    v_pk_max_f16 v0, v6, v6
12912; GFX908-NEXT:    v_pk_min_f16 v5, v0, v1
12913; GFX908-NEXT:    flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
12914; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12915; GFX908-NEXT:    buffer_wbinvl1
12916; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v6
12917; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
12918; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
12919; GFX908-NEXT:    s_cbranch_execnz .LBB48_1
12920; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
12921; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
12922; GFX908-NEXT:    s_setpc_b64 s[30:31]
12923;
12924; GFX8-LABEL: flat_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory:
12925; GFX8:       ; %bb.0:
12926; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12927; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0xfffff800, v0
12928; GFX8-NEXT:    v_addc_u32_e32 v4, vcc, -1, v1, vcc
12929; GFX8-NEXT:    flat_load_dword v0, v[3:4]
12930; GFX8-NEXT:    s_mov_b64 s[4:5], 0
12931; GFX8-NEXT:    v_max_f16_sdwa v1, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
12932; GFX8-NEXT:    v_max_f16_e32 v2, v2, v2
12933; GFX8-NEXT:  .LBB48_1: ; %atomicrmw.start
12934; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
12935; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12936; GFX8-NEXT:    v_mov_b32_e32 v6, v0
12937; GFX8-NEXT:    v_max_f16_sdwa v0, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
12938; GFX8-NEXT:    v_max_f16_e32 v5, v6, v6
12939; GFX8-NEXT:    v_min_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
12940; GFX8-NEXT:    v_min_f16_e32 v5, v5, v2
12941; GFX8-NEXT:    v_or_b32_e32 v5, v5, v0
12942; GFX8-NEXT:    flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
12943; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12944; GFX8-NEXT:    buffer_wbinvl1
12945; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v6
12946; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
12947; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
12948; GFX8-NEXT:    s_cbranch_execnz .LBB48_1
12949; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
12950; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
12951; GFX8-NEXT:    s_setpc_b64 s[30:31]
12952;
12953; GFX7-LABEL: flat_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory:
12954; GFX7:       ; %bb.0:
12955; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12956; GFX7-NEXT:    v_add_i32_e32 v4, vcc, 0xfffff800, v0
12957; GFX7-NEXT:    v_addc_u32_e32 v5, vcc, -1, v1, vcc
12958; GFX7-NEXT:    flat_load_dword v1, v[4:5]
12959; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v3
12960; GFX7-NEXT:    v_cvt_f16_f32_e32 v3, v2
12961; GFX7-NEXT:    s_mov_b64 s[4:5], 0
12962; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v0
12963; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v3
12964; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12965; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v1
12966; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
12967; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
12968; GFX7-NEXT:  .LBB48_1: ; %atomicrmw.start
12969; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
12970; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
12971; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
12972; GFX7-NEXT:    v_cvt_f32_f16_e32 v6, v1
12973; GFX7-NEXT:    v_cvt_f32_f16_e32 v7, v0
12974; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
12975; GFX7-NEXT:    v_min_f32_e32 v6, v6, v2
12976; GFX7-NEXT:    v_min_f32_e32 v7, v7, v3
12977; GFX7-NEXT:    v_cvt_f16_f32_e32 v6, v6
12978; GFX7-NEXT:    v_cvt_f16_f32_e32 v8, v7
12979; GFX7-NEXT:    v_or_b32_e32 v7, v0, v1
12980; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v6
12981; GFX7-NEXT:    v_or_b32_e32 v6, v8, v0
12982; GFX7-NEXT:    flat_atomic_cmpswap v6, v[4:5], v[6:7] glc
12983; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
12984; GFX7-NEXT:    buffer_wbinvl1
12985; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v6
12986; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v6
12987; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
12988; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v6, v7
12989; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
12990; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
12991; GFX7-NEXT:    s_cbranch_execnz .LBB48_1
12992; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
12993; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
12994; GFX7-NEXT:    s_setpc_b64 s[30:31]
12995  %gep = getelementptr <2 x half>, ptr %ptr, i64 -512
12996  %result = atomicrmw fmin ptr %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
12997  ret <2 x half> %result
12998}
12999
; Non-returning <2 x half> atomic fmin at agent scope, seq_cst, applied
; directly to %ptr (no offset); the atomicrmw result is unused and the function
; returns void.
; In the checks below every target still expands the operation into a
; flat_atomic_cmpswap retry loop, and the value returned by the cmpswap is
; compared with the expected value (v_cmp_eq_u32) to decide whether to retry.
; The check lines are autogenerated (see the note on the first line of this
; file); regenerate them with utils/update_llc_test_checks.py rather than
; editing them by hand.
13000define void @flat_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory(ptr %ptr, <2 x half> %val) #0 {
13001; GFX12-LABEL: flat_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory:
13002; GFX12:       ; %bb.0:
13003; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
13004; GFX12-NEXT:    s_wait_expcnt 0x0
13005; GFX12-NEXT:    s_wait_samplecnt 0x0
13006; GFX12-NEXT:    s_wait_bvhcnt 0x0
13007; GFX12-NEXT:    s_wait_kmcnt 0x0
13008; GFX12-NEXT:    flat_load_b32 v3, v[0:1]
13009; GFX12-NEXT:    v_pk_max_num_f16 v4, v2, v2
13010; GFX12-NEXT:    s_mov_b32 s0, 0
13011; GFX12-NEXT:  .LBB49_1: ; %atomicrmw.start
13012; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
13013; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
13014; GFX12-NEXT:    v_pk_max_num_f16 v2, v3, v3
13015; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
13016; GFX12-NEXT:    v_pk_min_num_f16 v2, v2, v4
13017; GFX12-NEXT:    s_wait_storecnt 0x0
13018; GFX12-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
13019; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
13020; GFX12-NEXT:    global_inv scope:SCOPE_DEV
13021; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
13022; GFX12-NEXT:    v_mov_b32_e32 v3, v2
13023; GFX12-NEXT:    s_wait_alu 0xfffe
13024; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
13025; GFX12-NEXT:    s_wait_alu 0xfffe
13026; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
13027; GFX12-NEXT:    s_cbranch_execnz .LBB49_1
13028; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
13029; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
13030; GFX12-NEXT:    s_wait_alu 0xfffe
13031; GFX12-NEXT:    s_setpc_b64 s[30:31]
13032;
13033; GFX940-LABEL: flat_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory:
13034; GFX940:       ; %bb.0:
13035; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13036; GFX940-NEXT:    flat_load_dword v3, v[0:1]
13037; GFX940-NEXT:    s_mov_b64 s[0:1], 0
13038; GFX940-NEXT:    v_pk_max_f16 v4, v2, v2
13039; GFX940-NEXT:  .LBB49_1: ; %atomicrmw.start
13040; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
13041; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
13042; GFX940-NEXT:    v_pk_max_f16 v2, v3, v3
13043; GFX940-NEXT:    s_nop 0
13044; GFX940-NEXT:    v_pk_min_f16 v2, v2, v4
13045; GFX940-NEXT:    buffer_wbl2 sc1
13046; GFX940-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0
13047; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
13048; GFX940-NEXT:    buffer_inv sc1
13049; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
13050; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
13051; GFX940-NEXT:    v_mov_b32_e32 v3, v2
13052; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
13053; GFX940-NEXT:    s_cbranch_execnz .LBB49_1
13054; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
13055; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
13056; GFX940-NEXT:    s_setpc_b64 s[30:31]
13057;
13058; GFX11-LABEL: flat_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory:
13059; GFX11:       ; %bb.0:
13060; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13061; GFX11-NEXT:    flat_load_b32 v3, v[0:1]
13062; GFX11-NEXT:    v_pk_max_f16 v4, v2, v2
13063; GFX11-NEXT:    s_mov_b32 s0, 0
13064; GFX11-NEXT:  .LBB49_1: ; %atomicrmw.start
13065; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
13066; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
13067; GFX11-NEXT:    v_pk_max_f16 v2, v3, v3
13068; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
13069; GFX11-NEXT:    v_pk_min_f16 v2, v2, v4
13070; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
13071; GFX11-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc
13072; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
13073; GFX11-NEXT:    buffer_gl1_inv
13074; GFX11-NEXT:    buffer_gl0_inv
13075; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
13076; GFX11-NEXT:    v_mov_b32_e32 v3, v2
13077; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
13078; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
13079; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
13080; GFX11-NEXT:    s_cbranch_execnz .LBB49_1
13081; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
13082; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
13083; GFX11-NEXT:    s_setpc_b64 s[30:31]
13084;
13085; GFX10-LABEL: flat_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory:
13086; GFX10:       ; %bb.0:
13087; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13088; GFX10-NEXT:    flat_load_dword v3, v[0:1]
13089; GFX10-NEXT:    v_pk_max_f16 v4, v2, v2
13090; GFX10-NEXT:    s_mov_b32 s4, 0
13091; GFX10-NEXT:  .LBB49_1: ; %atomicrmw.start
13092; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
13093; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
13094; GFX10-NEXT:    v_pk_max_f16 v2, v3, v3
13095; GFX10-NEXT:    v_pk_min_f16 v2, v2, v4
13096; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
13097; GFX10-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
13098; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
13099; GFX10-NEXT:    buffer_gl1_inv
13100; GFX10-NEXT:    buffer_gl0_inv
13101; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
13102; GFX10-NEXT:    v_mov_b32_e32 v3, v2
13103; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
13104; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
13105; GFX10-NEXT:    s_cbranch_execnz .LBB49_1
13106; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
13107; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
13108; GFX10-NEXT:    s_setpc_b64 s[30:31]
13109;
13110; GFX90A-LABEL: flat_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory:
13111; GFX90A:       ; %bb.0:
13112; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13113; GFX90A-NEXT:    flat_load_dword v3, v[0:1]
13114; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
13115; GFX90A-NEXT:    v_pk_max_f16 v4, v2, v2
13116; GFX90A-NEXT:  .LBB49_1: ; %atomicrmw.start
13117; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
13118; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
13119; GFX90A-NEXT:    v_pk_max_f16 v2, v3, v3
13120; GFX90A-NEXT:    v_pk_min_f16 v2, v2, v4
13121; GFX90A-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
13122; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
13123; GFX90A-NEXT:    buffer_wbinvl1
13124; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
13125; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
13126; GFX90A-NEXT:    v_mov_b32_e32 v3, v2
13127; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
13128; GFX90A-NEXT:    s_cbranch_execnz .LBB49_1
13129; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
13130; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
13131; GFX90A-NEXT:    s_setpc_b64 s[30:31]
13132;
13133; GFX908-LABEL: flat_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory:
13134; GFX908:       ; %bb.0:
13135; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13136; GFX908-NEXT:    flat_load_dword v3, v[0:1]
13137; GFX908-NEXT:    s_mov_b64 s[4:5], 0
13138; GFX908-NEXT:    v_pk_max_f16 v4, v2, v2
13139; GFX908-NEXT:  .LBB49_1: ; %atomicrmw.start
13140; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
13141; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
13142; GFX908-NEXT:    v_pk_max_f16 v2, v3, v3
13143; GFX908-NEXT:    v_pk_min_f16 v2, v2, v4
13144; GFX908-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
13145; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
13146; GFX908-NEXT:    buffer_wbinvl1
13147; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
13148; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
13149; GFX908-NEXT:    v_mov_b32_e32 v3, v2
13150; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
13151; GFX908-NEXT:    s_cbranch_execnz .LBB49_1
13152; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
13153; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
13154; GFX908-NEXT:    s_setpc_b64 s[30:31]
13155;
13156; GFX8-LABEL: flat_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory:
13157; GFX8:       ; %bb.0:
13158; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13159; GFX8-NEXT:    flat_load_dword v3, v[0:1]
13160; GFX8-NEXT:    s_mov_b64 s[4:5], 0
13161; GFX8-NEXT:    v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
13162; GFX8-NEXT:    v_max_f16_e32 v5, v2, v2
13163; GFX8-NEXT:  .LBB49_1: ; %atomicrmw.start
13164; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
13165; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
13166; GFX8-NEXT:    v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
13167; GFX8-NEXT:    v_max_f16_e32 v6, v3, v3
13168; GFX8-NEXT:    v_min_f16_sdwa v2, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
13169; GFX8-NEXT:    v_min_f16_e32 v6, v6, v5
13170; GFX8-NEXT:    v_or_b32_e32 v2, v6, v2
13171; GFX8-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
13172; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
13173; GFX8-NEXT:    buffer_wbinvl1
13174; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
13175; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
13176; GFX8-NEXT:    v_mov_b32_e32 v3, v2
13177; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
13178; GFX8-NEXT:    s_cbranch_execnz .LBB49_1
13179; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
13180; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
13181; GFX8-NEXT:    s_setpc_b64 s[30:31]
13182;
13183; GFX7-LABEL: flat_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory:
13184; GFX7:       ; %bb.0:
13185; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13186; GFX7-NEXT:    flat_load_dword v5, v[0:1]
13187; GFX7-NEXT:    v_cvt_f16_f32_e32 v3, v3
13188; GFX7-NEXT:    v_cvt_f16_f32_e32 v6, v2
13189; GFX7-NEXT:    s_mov_b64 s[4:5], 0
13190; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v3
13191; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
13192; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v5
13193; GFX7-NEXT:    v_cvt_f32_f16_e32 v4, v5
13194; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v3
13195; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v6
13196; GFX7-NEXT:  .LBB49_1: ; %atomicrmw.start
13197; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
13198; GFX7-NEXT:    v_cvt_f16_f32_e32 v5, v5
13199; GFX7-NEXT:    v_cvt_f16_f32_e32 v4, v4
13200; GFX7-NEXT:    v_cvt_f32_f16_e32 v6, v5
13201; GFX7-NEXT:    v_cvt_f32_f16_e32 v7, v4
13202; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
13203; GFX7-NEXT:    v_min_f32_e32 v6, v6, v2
13204; GFX7-NEXT:    v_min_f32_e32 v7, v7, v3
13205; GFX7-NEXT:    v_cvt_f16_f32_e32 v8, v6
13206; GFX7-NEXT:    v_cvt_f16_f32_e32 v7, v7
13207; GFX7-NEXT:    v_or_b32_e32 v6, v4, v5
13208; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v8
13209; GFX7-NEXT:    v_or_b32_e32 v5, v7, v4
13210; GFX7-NEXT:    flat_atomic_cmpswap v7, v[0:1], v[5:6] glc
13211; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
13212; GFX7-NEXT:    buffer_wbinvl1
13213; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 16, v7
13214; GFX7-NEXT:    v_cvt_f32_f16_e32 v4, v7
13215; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v5
13216; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v7, v6
13217; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
13218; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
13219; GFX7-NEXT:    s_cbranch_execnz .LBB49_1
13220; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
13221; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
13222; GFX7-NEXT:    s_setpc_b64 s[30:31]
13223  %unused = atomicrmw fmin ptr %ptr, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
13224  ret void
13225}
13226
13227define void @flat_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr %ptr, <2 x half> %val) #0 {
13228; GFX12-LABEL: flat_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
13229; GFX12:       ; %bb.0:
13230; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
13231; GFX12-NEXT:    s_wait_expcnt 0x0
13232; GFX12-NEXT:    s_wait_samplecnt 0x0
13233; GFX12-NEXT:    s_wait_bvhcnt 0x0
13234; GFX12-NEXT:    s_wait_kmcnt 0x0
13235; GFX12-NEXT:    flat_load_b32 v3, v[0:1] offset:2044
13236; GFX12-NEXT:    v_pk_max_num_f16 v4, v2, v2
13237; GFX12-NEXT:    s_mov_b32 s0, 0
13238; GFX12-NEXT:  .LBB50_1: ; %atomicrmw.start
13239; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
13240; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
13241; GFX12-NEXT:    v_pk_max_num_f16 v2, v3, v3
13242; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
13243; GFX12-NEXT:    v_pk_min_num_f16 v2, v2, v4
13244; GFX12-NEXT:    s_wait_storecnt 0x0
13245; GFX12-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
13246; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
13247; GFX12-NEXT:    global_inv scope:SCOPE_DEV
13248; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
13249; GFX12-NEXT:    v_mov_b32_e32 v3, v2
13250; GFX12-NEXT:    s_wait_alu 0xfffe
13251; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
13252; GFX12-NEXT:    s_wait_alu 0xfffe
13253; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
13254; GFX12-NEXT:    s_cbranch_execnz .LBB50_1
13255; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
13256; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
13257; GFX12-NEXT:    s_wait_alu 0xfffe
13258; GFX12-NEXT:    s_setpc_b64 s[30:31]
13259;
13260; GFX940-LABEL: flat_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
13261; GFX940:       ; %bb.0:
13262; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13263; GFX940-NEXT:    flat_load_dword v3, v[0:1] offset:2044
13264; GFX940-NEXT:    s_mov_b64 s[0:1], 0
13265; GFX940-NEXT:    v_pk_max_f16 v4, v2, v2
13266; GFX940-NEXT:  .LBB50_1: ; %atomicrmw.start
13267; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
13268; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
13269; GFX940-NEXT:    v_pk_max_f16 v2, v3, v3
13270; GFX940-NEXT:    s_nop 0
13271; GFX940-NEXT:    v_pk_min_f16 v2, v2, v4
13272; GFX940-NEXT:    buffer_wbl2 sc1
13273; GFX940-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0
13274; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
13275; GFX940-NEXT:    buffer_inv sc1
13276; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
13277; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
13278; GFX940-NEXT:    v_mov_b32_e32 v3, v2
13279; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
13280; GFX940-NEXT:    s_cbranch_execnz .LBB50_1
13281; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
13282; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
13283; GFX940-NEXT:    s_setpc_b64 s[30:31]
13284;
13285; GFX11-LABEL: flat_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
13286; GFX11:       ; %bb.0:
13287; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13288; GFX11-NEXT:    flat_load_b32 v3, v[0:1] offset:2044
13289; GFX11-NEXT:    v_pk_max_f16 v4, v2, v2
13290; GFX11-NEXT:    s_mov_b32 s0, 0
13291; GFX11-NEXT:  .LBB50_1: ; %atomicrmw.start
13292; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
13293; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
13294; GFX11-NEXT:    v_pk_max_f16 v2, v3, v3
13295; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
13296; GFX11-NEXT:    v_pk_min_f16 v2, v2, v4
13297; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
13298; GFX11-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 glc
13299; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
13300; GFX11-NEXT:    buffer_gl1_inv
13301; GFX11-NEXT:    buffer_gl0_inv
13302; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
13303; GFX11-NEXT:    v_mov_b32_e32 v3, v2
13304; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
13305; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
13306; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
13307; GFX11-NEXT:    s_cbranch_execnz .LBB50_1
13308; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
13309; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
13310; GFX11-NEXT:    s_setpc_b64 s[30:31]
13311;
13312; GFX10-LABEL: flat_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
13313; GFX10:       ; %bb.0:
13314; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13315; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, 0x7fc, v0
13316; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
13317; GFX10-NEXT:    v_pk_max_f16 v4, v2, v2
13318; GFX10-NEXT:    s_mov_b32 s4, 0
13319; GFX10-NEXT:    flat_load_dword v3, v[0:1]
13320; GFX10-NEXT:  .LBB50_1: ; %atomicrmw.start
13321; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
13322; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
13323; GFX10-NEXT:    v_pk_max_f16 v2, v3, v3
13324; GFX10-NEXT:    v_pk_min_f16 v2, v2, v4
13325; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
13326; GFX10-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
13327; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
13328; GFX10-NEXT:    buffer_gl1_inv
13329; GFX10-NEXT:    buffer_gl0_inv
13330; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
13331; GFX10-NEXT:    v_mov_b32_e32 v3, v2
13332; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
13333; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
13334; GFX10-NEXT:    s_cbranch_execnz .LBB50_1
13335; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
13336; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
13337; GFX10-NEXT:    s_setpc_b64 s[30:31]
13338;
13339; GFX90A-LABEL: flat_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
13340; GFX90A:       ; %bb.0:
13341; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13342; GFX90A-NEXT:    flat_load_dword v3, v[0:1] offset:2044
13343; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
13344; GFX90A-NEXT:    v_pk_max_f16 v4, v2, v2
13345; GFX90A-NEXT:  .LBB50_1: ; %atomicrmw.start
13346; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
13347; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
13348; GFX90A-NEXT:    v_pk_max_f16 v2, v3, v3
13349; GFX90A-NEXT:    v_pk_min_f16 v2, v2, v4
13350; GFX90A-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc
13351; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
13352; GFX90A-NEXT:    buffer_wbinvl1
13353; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
13354; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
13355; GFX90A-NEXT:    v_mov_b32_e32 v3, v2
13356; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
13357; GFX90A-NEXT:    s_cbranch_execnz .LBB50_1
13358; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
13359; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
13360; GFX90A-NEXT:    s_setpc_b64 s[30:31]
13361;
13362; GFX908-LABEL: flat_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
13363; GFX908:       ; %bb.0:
13364; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13365; GFX908-NEXT:    flat_load_dword v3, v[0:1] offset:2044
13366; GFX908-NEXT:    s_mov_b64 s[4:5], 0
13367; GFX908-NEXT:    v_pk_max_f16 v4, v2, v2
13368; GFX908-NEXT:  .LBB50_1: ; %atomicrmw.start
13369; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
13370; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
13371; GFX908-NEXT:    v_pk_max_f16 v2, v3, v3
13372; GFX908-NEXT:    v_pk_min_f16 v2, v2, v4
13373; GFX908-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc
13374; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
13375; GFX908-NEXT:    buffer_wbinvl1
13376; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
13377; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
13378; GFX908-NEXT:    v_mov_b32_e32 v3, v2
13379; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
13380; GFX908-NEXT:    s_cbranch_execnz .LBB50_1
13381; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
13382; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
13383; GFX908-NEXT:    s_setpc_b64 s[30:31]
13384;
13385; GFX8-LABEL: flat_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
13386; GFX8:       ; %bb.0:
13387; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13388; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 0x7fc, v0
13389; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
13390; GFX8-NEXT:    flat_load_dword v3, v[0:1]
13391; GFX8-NEXT:    s_mov_b64 s[4:5], 0
13392; GFX8-NEXT:    v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
13393; GFX8-NEXT:    v_max_f16_e32 v5, v2, v2
13394; GFX8-NEXT:  .LBB50_1: ; %atomicrmw.start
13395; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
13396; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
13397; GFX8-NEXT:    v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
13398; GFX8-NEXT:    v_max_f16_e32 v6, v3, v3
13399; GFX8-NEXT:    v_min_f16_sdwa v2, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
13400; GFX8-NEXT:    v_min_f16_e32 v6, v6, v5
13401; GFX8-NEXT:    v_or_b32_e32 v2, v6, v2
13402; GFX8-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
13403; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
13404; GFX8-NEXT:    buffer_wbinvl1
13405; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
13406; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
13407; GFX8-NEXT:    v_mov_b32_e32 v3, v2
13408; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
13409; GFX8-NEXT:    s_cbranch_execnz .LBB50_1
13410; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
13411; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
13412; GFX8-NEXT:    s_setpc_b64 s[30:31]
13413;
13414; GFX7-LABEL: flat_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
13415; GFX7:       ; %bb.0:
13416; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13417; GFX7-NEXT:    v_add_i32_e32 v0, vcc, 0x7fc, v0
13418; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
13419; GFX7-NEXT:    flat_load_dword v5, v[0:1]
13420; GFX7-NEXT:    v_cvt_f16_f32_e32 v3, v3
13421; GFX7-NEXT:    v_cvt_f16_f32_e32 v6, v2
13422; GFX7-NEXT:    s_mov_b64 s[4:5], 0
13423; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v3
13424; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
13425; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v5
13426; GFX7-NEXT:    v_cvt_f32_f16_e32 v4, v5
13427; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v3
13428; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v6
13429; GFX7-NEXT:  .LBB50_1: ; %atomicrmw.start
13430; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
13431; GFX7-NEXT:    v_cvt_f16_f32_e32 v5, v5
13432; GFX7-NEXT:    v_cvt_f16_f32_e32 v4, v4
13433; GFX7-NEXT:    v_cvt_f32_f16_e32 v6, v5
13434; GFX7-NEXT:    v_cvt_f32_f16_e32 v7, v4
13435; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
13436; GFX7-NEXT:    v_min_f32_e32 v6, v6, v2
13437; GFX7-NEXT:    v_min_f32_e32 v7, v7, v3
13438; GFX7-NEXT:    v_cvt_f16_f32_e32 v8, v6
13439; GFX7-NEXT:    v_cvt_f16_f32_e32 v7, v7
13440; GFX7-NEXT:    v_or_b32_e32 v6, v4, v5
13441; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v8
13442; GFX7-NEXT:    v_or_b32_e32 v5, v7, v4
13443; GFX7-NEXT:    flat_atomic_cmpswap v7, v[0:1], v[5:6] glc
13444; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
13445; GFX7-NEXT:    buffer_wbinvl1
13446; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 16, v7
13447; GFX7-NEXT:    v_cvt_f32_f16_e32 v4, v7
13448; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v5
13449; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v7, v6
13450; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
13451; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
13452; GFX7-NEXT:    s_cbranch_execnz .LBB50_1
13453; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
13454; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
13455; GFX7-NEXT:    s_setpc_b64 s[30:31]
13456  %gep = getelementptr <2 x half>, ptr %ptr, i64 511
13457  %unused = atomicrmw fmin ptr %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
13458  ret void
13459}
13460
; Agent-scope, seq_cst `atomicrmw fmin` on <2 x half> through a flat pointer,
; with the result unused and !amdgpu.no.fine.grained.memory attached. The
; address is %ptr minus 512 elements of 4 bytes each, i.e. -2048 bytes.
;
; Per the checks below, every run line lowers this to a flat_atomic_cmpswap
; retry loop (%atomicrmw.start). Only the GFX12 run encodes the -2048
; displacement in the instruction's offset field; the other runs add
; 0xfffff800 / -1 to the 64-bit address first. The GFX7 run converts to f32
; and uses v_min_f32, the GFX8 run uses per-half v_min_f16 (SDWA for the high
; half), and the remaining runs use a packed f16 min.
; NOTE: the assertions are autogenerated (see the NOTE on the first line of
; this file); regenerate them with utils/update_llc_test_checks.py instead of
; editing them by hand.
13461define void @flat_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr %ptr, <2 x half> %val) #0 {
13462; GFX12-LABEL: flat_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory:
13463; GFX12:       ; %bb.0:
13464; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
13465; GFX12-NEXT:    s_wait_expcnt 0x0
13466; GFX12-NEXT:    s_wait_samplecnt 0x0
13467; GFX12-NEXT:    s_wait_bvhcnt 0x0
13468; GFX12-NEXT:    s_wait_kmcnt 0x0
13469; GFX12-NEXT:    flat_load_b32 v3, v[0:1] offset:-2048
13470; GFX12-NEXT:    v_pk_max_num_f16 v4, v2, v2
13471; GFX12-NEXT:    s_mov_b32 s0, 0
13472; GFX12-NEXT:  .LBB51_1: ; %atomicrmw.start
13473; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
13474; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
13475; GFX12-NEXT:    v_pk_max_num_f16 v2, v3, v3
13476; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
13477; GFX12-NEXT:    v_pk_min_num_f16 v2, v2, v4
13478; GFX12-NEXT:    s_wait_storecnt 0x0
13479; GFX12-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
13480; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
13481; GFX12-NEXT:    global_inv scope:SCOPE_DEV
13482; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
13483; GFX12-NEXT:    v_mov_b32_e32 v3, v2
13484; GFX12-NEXT:    s_wait_alu 0xfffe
13485; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
13486; GFX12-NEXT:    s_wait_alu 0xfffe
13487; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
13488; GFX12-NEXT:    s_cbranch_execnz .LBB51_1
13489; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
13490; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
13491; GFX12-NEXT:    s_wait_alu 0xfffe
13492; GFX12-NEXT:    s_setpc_b64 s[30:31]
13493;
13494; GFX940-LABEL: flat_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory:
13495; GFX940:       ; %bb.0:
13496; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13497; GFX940-NEXT:    v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
13498; GFX940-NEXT:    s_movk_i32 s0, 0xf800
13499; GFX940-NEXT:    s_nop 0
13500; GFX940-NEXT:    v_addc_co_u32_e32 v5, vcc, -1, v1, vcc
13501; GFX940-NEXT:    flat_load_dword v3, v[4:5]
13502; GFX940-NEXT:    s_mov_b32 s1, -1
13503; GFX940-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1]
13504; GFX940-NEXT:    s_mov_b64 s[0:1], 0
13505; GFX940-NEXT:    v_pk_max_f16 v4, v2, v2
13506; GFX940-NEXT:  .LBB51_1: ; %atomicrmw.start
13507; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
13508; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
13509; GFX940-NEXT:    v_pk_max_f16 v2, v3, v3
13510; GFX940-NEXT:    s_nop 0
13511; GFX940-NEXT:    v_pk_min_f16 v2, v2, v4
13512; GFX940-NEXT:    buffer_wbl2 sc1
13513; GFX940-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0
13514; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
13515; GFX940-NEXT:    buffer_inv sc1
13516; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
13517; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
13518; GFX940-NEXT:    v_mov_b32_e32 v3, v2
13519; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
13520; GFX940-NEXT:    s_cbranch_execnz .LBB51_1
13521; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
13522; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
13523; GFX940-NEXT:    s_setpc_b64 s[30:31]
13524;
13525; GFX11-LABEL: flat_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory:
13526; GFX11:       ; %bb.0:
13527; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13528; GFX11-NEXT:    v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
13529; GFX11-NEXT:    v_add_co_ci_u32_e32 v4, vcc_lo, -1, v1, vcc_lo
13530; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
13531; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
13532; GFX11-NEXT:    flat_load_b32 v3, v[3:4]
13533; GFX11-NEXT:    v_pk_max_f16 v4, v2, v2
13534; GFX11-NEXT:    s_mov_b32 s0, 0
13535; GFX11-NEXT:  .LBB51_1: ; %atomicrmw.start
13536; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
13537; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
13538; GFX11-NEXT:    v_pk_max_f16 v2, v3, v3
13539; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
13540; GFX11-NEXT:    v_pk_min_f16 v2, v2, v4
13541; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
13542; GFX11-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc
13543; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
13544; GFX11-NEXT:    buffer_gl1_inv
13545; GFX11-NEXT:    buffer_gl0_inv
13546; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
13547; GFX11-NEXT:    v_mov_b32_e32 v3, v2
13548; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
13549; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
13550; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
13551; GFX11-NEXT:    s_cbranch_execnz .LBB51_1
13552; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
13553; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
13554; GFX11-NEXT:    s_setpc_b64 s[30:31]
13555;
13556; GFX10-LABEL: flat_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory:
13557; GFX10:       ; %bb.0:
13558; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13559; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
13560; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
13561; GFX10-NEXT:    v_pk_max_f16 v4, v2, v2
13562; GFX10-NEXT:    s_mov_b32 s4, 0
13563; GFX10-NEXT:    flat_load_dword v3, v[0:1]
13564; GFX10-NEXT:  .LBB51_1: ; %atomicrmw.start
13565; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
13566; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
13567; GFX10-NEXT:    v_pk_max_f16 v2, v3, v3
13568; GFX10-NEXT:    v_pk_min_f16 v2, v2, v4
13569; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
13570; GFX10-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
13571; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
13572; GFX10-NEXT:    buffer_gl1_inv
13573; GFX10-NEXT:    buffer_gl0_inv
13574; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
13575; GFX10-NEXT:    v_mov_b32_e32 v3, v2
13576; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
13577; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
13578; GFX10-NEXT:    s_cbranch_execnz .LBB51_1
13579; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
13580; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
13581; GFX10-NEXT:    s_setpc_b64 s[30:31]
13582;
13583; GFX90A-LABEL: flat_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory:
13584; GFX90A:       ; %bb.0:
13585; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13586; GFX90A-NEXT:    v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
13587; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, -1, v1, vcc
13588; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
13589; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
13590; GFX90A-NEXT:    flat_load_dword v1, v[0:1]
13591; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
13592; GFX90A-NEXT:    v_pk_max_f16 v2, v2, v2
13593; GFX90A-NEXT:  .LBB51_1: ; %atomicrmw.start
13594; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
13595; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
13596; GFX90A-NEXT:    v_pk_max_f16 v0, v1, v1
13597; GFX90A-NEXT:    v_pk_min_f16 v0, v0, v2
13598; GFX90A-NEXT:    flat_atomic_cmpswap v0, v[4:5], v[0:1] glc
13599; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
13600; GFX90A-NEXT:    buffer_wbinvl1
13601; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
13602; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
13603; GFX90A-NEXT:    v_mov_b32_e32 v1, v0
13604; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
13605; GFX90A-NEXT:    s_cbranch_execnz .LBB51_1
13606; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
13607; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
13608; GFX90A-NEXT:    s_setpc_b64 s[30:31]
13609;
13610; GFX908-LABEL: flat_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory:
13611; GFX908:       ; %bb.0:
13612; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13613; GFX908-NEXT:    v_add_co_u32_e32 v3, vcc, 0xfffff800, v0
13614; GFX908-NEXT:    v_addc_co_u32_e32 v4, vcc, -1, v1, vcc
13615; GFX908-NEXT:    v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
13616; GFX908-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
13617; GFX908-NEXT:    flat_load_dword v1, v[0:1]
13618; GFX908-NEXT:    s_mov_b64 s[4:5], 0
13619; GFX908-NEXT:    v_pk_max_f16 v2, v2, v2
13620; GFX908-NEXT:  .LBB51_1: ; %atomicrmw.start
13621; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
13622; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
13623; GFX908-NEXT:    v_pk_max_f16 v0, v1, v1
13624; GFX908-NEXT:    v_pk_min_f16 v0, v0, v2
13625; GFX908-NEXT:    flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
13626; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
13627; GFX908-NEXT:    buffer_wbinvl1
13628; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
13629; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
13630; GFX908-NEXT:    v_mov_b32_e32 v1, v0
13631; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
13632; GFX908-NEXT:    s_cbranch_execnz .LBB51_1
13633; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
13634; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
13635; GFX908-NEXT:    s_setpc_b64 s[30:31]
13636;
13637; GFX8-LABEL: flat_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory:
13638; GFX8:       ; %bb.0:
13639; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13640; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 0xfffff800, v0
13641; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, -1, v1, vcc
13642; GFX8-NEXT:    flat_load_dword v3, v[0:1]
13643; GFX8-NEXT:    s_mov_b64 s[4:5], 0
13644; GFX8-NEXT:    v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
13645; GFX8-NEXT:    v_max_f16_e32 v5, v2, v2
13646; GFX8-NEXT:  .LBB51_1: ; %atomicrmw.start
13647; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
13648; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
13649; GFX8-NEXT:    v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
13650; GFX8-NEXT:    v_max_f16_e32 v6, v3, v3
13651; GFX8-NEXT:    v_min_f16_sdwa v2, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
13652; GFX8-NEXT:    v_min_f16_e32 v6, v6, v5
13653; GFX8-NEXT:    v_or_b32_e32 v2, v6, v2
13654; GFX8-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
13655; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
13656; GFX8-NEXT:    buffer_wbinvl1
13657; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
13658; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
13659; GFX8-NEXT:    v_mov_b32_e32 v3, v2
13660; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
13661; GFX8-NEXT:    s_cbranch_execnz .LBB51_1
13662; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
13663; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
13664; GFX8-NEXT:    s_setpc_b64 s[30:31]
13665;
13666; GFX7-LABEL: flat_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory:
13667; GFX7:       ; %bb.0:
13668; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13669; GFX7-NEXT:    v_add_i32_e32 v0, vcc, 0xfffff800, v0
13670; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, -1, v1, vcc
13671; GFX7-NEXT:    flat_load_dword v5, v[0:1]
13672; GFX7-NEXT:    v_cvt_f16_f32_e32 v3, v3
13673; GFX7-NEXT:    v_cvt_f16_f32_e32 v6, v2
13674; GFX7-NEXT:    s_mov_b64 s[4:5], 0
13675; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v3
13676; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
13677; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v5
13678; GFX7-NEXT:    v_cvt_f32_f16_e32 v4, v5
13679; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v3
13680; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v6
13681; GFX7-NEXT:  .LBB51_1: ; %atomicrmw.start
13682; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
13683; GFX7-NEXT:    v_cvt_f16_f32_e32 v5, v5
13684; GFX7-NEXT:    v_cvt_f16_f32_e32 v4, v4
13685; GFX7-NEXT:    v_cvt_f32_f16_e32 v6, v5
13686; GFX7-NEXT:    v_cvt_f32_f16_e32 v7, v4
13687; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
13688; GFX7-NEXT:    v_min_f32_e32 v6, v6, v2
13689; GFX7-NEXT:    v_min_f32_e32 v7, v7, v3
13690; GFX7-NEXT:    v_cvt_f16_f32_e32 v8, v6
13691; GFX7-NEXT:    v_cvt_f16_f32_e32 v7, v7
13692; GFX7-NEXT:    v_or_b32_e32 v6, v4, v5
13693; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v8
13694; GFX7-NEXT:    v_or_b32_e32 v5, v7, v4
13695; GFX7-NEXT:    flat_atomic_cmpswap v7, v[0:1], v[5:6] glc
13696; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
13697; GFX7-NEXT:    buffer_wbinvl1
13698; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 16, v7
13699; GFX7-NEXT:    v_cvt_f32_f16_e32 v4, v7
13700; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v5
13701; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v7, v6
13702; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
13703; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
13704; GFX7-NEXT:    s_cbranch_execnz .LBB51_1
13705; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
13706; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
13707; GFX7-NEXT:    s_setpc_b64 s[30:31]
  ; -512 elements of <2 x half> (4 bytes each) is a -2048 byte displacement.
13708  %gep = getelementptr <2 x half>, ptr %ptr, i64 -512
13709  %unused = atomicrmw fmin ptr %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
13710  ret void
13711}
13712
; seq_cst `atomicrmw fmin` on <2 x half> through a flat pointer with no
; syncscope given (the GFX12 checks show scope:SCOPE_SYS), returning the
; atomicrmw result. !amdgpu.no.fine.grained.memory is attached. The address
; is %ptr plus 511 elements of 4 bytes each, i.e. +2044 bytes.
;
; Per the checks below, every run line lowers this to a flat_atomic_cmpswap
; retry loop (%atomicrmw.start). The GFX12, GFX940, GFX11, GFX90A and GFX908
; runs encode the 2044 displacement in the instruction's offset field; the
; GFX10, GFX8 and GFX7 runs add 0x7fc to the 64-bit address first. The GFX7
; run converts to f32 and uses v_min_f32, the GFX8 run uses per-half
; v_min_f16 (SDWA for the high half), and the remaining runs use a packed
; f16 min.
; NOTE: the assertions are autogenerated (see the NOTE on the first line of
; this file); regenerate them with utils/update_llc_test_checks.py instead of
; editing them by hand.
13713define <2 x half> @flat_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr %ptr, <2 x half> %val) #0 {
13714; GFX12-LABEL: flat_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
13715; GFX12:       ; %bb.0:
13716; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
13717; GFX12-NEXT:    s_wait_expcnt 0x0
13718; GFX12-NEXT:    s_wait_samplecnt 0x0
13719; GFX12-NEXT:    s_wait_bvhcnt 0x0
13720; GFX12-NEXT:    s_wait_kmcnt 0x0
13721; GFX12-NEXT:    flat_load_b32 v3, v[0:1] offset:2044
13722; GFX12-NEXT:    v_pk_max_num_f16 v2, v2, v2
13723; GFX12-NEXT:    s_mov_b32 s0, 0
13724; GFX12-NEXT:  .LBB52_1: ; %atomicrmw.start
13725; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
13726; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
13727; GFX12-NEXT:    v_mov_b32_e32 v4, v3
13728; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
13729; GFX12-NEXT:    v_pk_max_num_f16 v3, v4, v4
13730; GFX12-NEXT:    v_pk_min_num_f16 v3, v3, v2
13731; GFX12-NEXT:    global_wb scope:SCOPE_SYS
13732; GFX12-NEXT:    s_wait_storecnt 0x0
13733; GFX12-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
13734; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
13735; GFX12-NEXT:    global_inv scope:SCOPE_SYS
13736; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
13737; GFX12-NEXT:    s_wait_alu 0xfffe
13738; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
13739; GFX12-NEXT:    s_wait_alu 0xfffe
13740; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
13741; GFX12-NEXT:    s_cbranch_execnz .LBB52_1
13742; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
13743; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
13744; GFX12-NEXT:    v_mov_b32_e32 v0, v3
13745; GFX12-NEXT:    s_wait_alu 0xfffe
13746; GFX12-NEXT:    s_setpc_b64 s[30:31]
13747;
13748; GFX940-LABEL: flat_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
13749; GFX940:       ; %bb.0:
13750; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13751; GFX940-NEXT:    flat_load_dword v3, v[0:1] offset:2044
13752; GFX940-NEXT:    s_mov_b64 s[0:1], 0
13753; GFX940-NEXT:    v_pk_max_f16 v2, v2, v2
13754; GFX940-NEXT:  .LBB52_1: ; %atomicrmw.start
13755; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
13756; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
13757; GFX940-NEXT:    v_mov_b32_e32 v5, v3
13758; GFX940-NEXT:    v_pk_max_f16 v3, v5, v5
13759; GFX940-NEXT:    s_nop 0
13760; GFX940-NEXT:    v_pk_min_f16 v4, v3, v2
13761; GFX940-NEXT:    buffer_wbl2 sc0 sc1
13762; GFX940-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 sc1
13763; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
13764; GFX940-NEXT:    buffer_inv sc0 sc1
13765; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
13766; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
13767; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
13768; GFX940-NEXT:    s_cbranch_execnz .LBB52_1
13769; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
13770; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
13771; GFX940-NEXT:    v_mov_b32_e32 v0, v3
13772; GFX940-NEXT:    s_setpc_b64 s[30:31]
13773;
13774; GFX11-LABEL: flat_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
13775; GFX11:       ; %bb.0:
13776; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13777; GFX11-NEXT:    flat_load_b32 v3, v[0:1] offset:2044
13778; GFX11-NEXT:    v_pk_max_f16 v2, v2, v2
13779; GFX11-NEXT:    s_mov_b32 s0, 0
13780; GFX11-NEXT:  .LBB52_1: ; %atomicrmw.start
13781; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
13782; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
13783; GFX11-NEXT:    v_mov_b32_e32 v4, v3
13784; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
13785; GFX11-NEXT:    v_pk_max_f16 v3, v4, v4
13786; GFX11-NEXT:    v_pk_min_f16 v3, v3, v2
13787; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
13788; GFX11-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc
13789; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
13790; GFX11-NEXT:    buffer_gl1_inv
13791; GFX11-NEXT:    buffer_gl0_inv
13792; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
13793; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
13794; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
13795; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
13796; GFX11-NEXT:    s_cbranch_execnz .LBB52_1
13797; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
13798; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
13799; GFX11-NEXT:    v_mov_b32_e32 v0, v3
13800; GFX11-NEXT:    s_setpc_b64 s[30:31]
13801;
13802; GFX10-LABEL: flat_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
13803; GFX10:       ; %bb.0:
13804; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13805; GFX10-NEXT:    v_add_co_u32 v3, vcc_lo, 0x7fc, v0
13806; GFX10-NEXT:    v_add_co_ci_u32_e32 v4, vcc_lo, 0, v1, vcc_lo
13807; GFX10-NEXT:    v_pk_max_f16 v1, v2, v2
13808; GFX10-NEXT:    s_mov_b32 s4, 0
13809; GFX10-NEXT:    flat_load_dword v0, v[3:4]
13810; GFX10-NEXT:  .LBB52_1: ; %atomicrmw.start
13811; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
13812; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
13813; GFX10-NEXT:    v_mov_b32_e32 v6, v0
13814; GFX10-NEXT:    v_pk_max_f16 v0, v6, v6
13815; GFX10-NEXT:    v_pk_min_f16 v5, v0, v1
13816; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
13817; GFX10-NEXT:    flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
13818; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
13819; GFX10-NEXT:    buffer_gl1_inv
13820; GFX10-NEXT:    buffer_gl0_inv
13821; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v6
13822; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
13823; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
13824; GFX10-NEXT:    s_cbranch_execnz .LBB52_1
13825; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
13826; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
13827; GFX10-NEXT:    s_setpc_b64 s[30:31]
13828;
13829; GFX90A-LABEL: flat_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
13830; GFX90A:       ; %bb.0:
13831; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13832; GFX90A-NEXT:    flat_load_dword v3, v[0:1] offset:2044
13833; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
13834; GFX90A-NEXT:    v_pk_max_f16 v2, v2, v2
13835; GFX90A-NEXT:  .LBB52_1: ; %atomicrmw.start
13836; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
13837; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
13838; GFX90A-NEXT:    v_mov_b32_e32 v5, v3
13839; GFX90A-NEXT:    v_pk_max_f16 v3, v5, v5
13840; GFX90A-NEXT:    v_pk_min_f16 v4, v3, v2
13841; GFX90A-NEXT:    buffer_wbl2
13842; GFX90A-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc
13843; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
13844; GFX90A-NEXT:    buffer_invl2
13845; GFX90A-NEXT:    buffer_wbinvl1
13846; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
13847; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
13848; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
13849; GFX90A-NEXT:    s_cbranch_execnz .LBB52_1
13850; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
13851; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
13852; GFX90A-NEXT:    v_mov_b32_e32 v0, v3
13853; GFX90A-NEXT:    s_setpc_b64 s[30:31]
13854;
13855; GFX908-LABEL: flat_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
13856; GFX908:       ; %bb.0:
13857; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13858; GFX908-NEXT:    flat_load_dword v3, v[0:1] offset:2044
13859; GFX908-NEXT:    s_mov_b64 s[4:5], 0
13860; GFX908-NEXT:    v_pk_max_f16 v2, v2, v2
13861; GFX908-NEXT:  .LBB52_1: ; %atomicrmw.start
13862; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
13863; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
13864; GFX908-NEXT:    v_mov_b32_e32 v4, v3
13865; GFX908-NEXT:    v_pk_max_f16 v3, v4, v4
13866; GFX908-NEXT:    v_pk_min_f16 v3, v3, v2
13867; GFX908-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc
13868; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
13869; GFX908-NEXT:    buffer_wbinvl1
13870; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
13871; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
13872; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
13873; GFX908-NEXT:    s_cbranch_execnz .LBB52_1
13874; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
13875; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
13876; GFX908-NEXT:    v_mov_b32_e32 v0, v3
13877; GFX908-NEXT:    s_setpc_b64 s[30:31]
13878;
13879; GFX8-LABEL: flat_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
13880; GFX8:       ; %bb.0:
13881; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13882; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x7fc, v0
13883; GFX8-NEXT:    v_addc_u32_e32 v4, vcc, 0, v1, vcc
13884; GFX8-NEXT:    flat_load_dword v0, v[3:4]
13885; GFX8-NEXT:    s_mov_b64 s[4:5], 0
13886; GFX8-NEXT:    v_max_f16_sdwa v1, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
13887; GFX8-NEXT:    v_max_f16_e32 v2, v2, v2
13888; GFX8-NEXT:  .LBB52_1: ; %atomicrmw.start
13889; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
13890; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
13891; GFX8-NEXT:    v_mov_b32_e32 v6, v0
13892; GFX8-NEXT:    v_max_f16_sdwa v0, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
13893; GFX8-NEXT:    v_max_f16_e32 v5, v6, v6
13894; GFX8-NEXT:    v_min_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
13895; GFX8-NEXT:    v_min_f16_e32 v5, v5, v2
13896; GFX8-NEXT:    v_or_b32_e32 v5, v5, v0
13897; GFX8-NEXT:    flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
13898; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
13899; GFX8-NEXT:    buffer_wbinvl1
13900; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v6
13901; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
13902; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
13903; GFX8-NEXT:    s_cbranch_execnz .LBB52_1
13904; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
13905; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
13906; GFX8-NEXT:    s_setpc_b64 s[30:31]
13907;
13908; GFX7-LABEL: flat_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
13909; GFX7:       ; %bb.0:
13910; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13911; GFX7-NEXT:    v_add_i32_e32 v4, vcc, 0x7fc, v0
13912; GFX7-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
13913; GFX7-NEXT:    flat_load_dword v1, v[4:5]
13914; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v3
13915; GFX7-NEXT:    v_cvt_f16_f32_e32 v3, v2
13916; GFX7-NEXT:    s_mov_b64 s[4:5], 0
13917; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v0
13918; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v3
13919; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
13920; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v1
13921; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
13922; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
13923; GFX7-NEXT:  .LBB52_1: ; %atomicrmw.start
13924; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
13925; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
13926; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
13927; GFX7-NEXT:    v_cvt_f32_f16_e32 v6, v1
13928; GFX7-NEXT:    v_cvt_f32_f16_e32 v7, v0
13929; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
13930; GFX7-NEXT:    v_min_f32_e32 v6, v6, v2
13931; GFX7-NEXT:    v_min_f32_e32 v7, v7, v3
13932; GFX7-NEXT:    v_cvt_f16_f32_e32 v6, v6
13933; GFX7-NEXT:    v_cvt_f16_f32_e32 v8, v7
13934; GFX7-NEXT:    v_or_b32_e32 v7, v0, v1
13935; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v6
13936; GFX7-NEXT:    v_or_b32_e32 v6, v8, v0
13937; GFX7-NEXT:    flat_atomic_cmpswap v6, v[4:5], v[6:7] glc
13938; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
13939; GFX7-NEXT:    buffer_wbinvl1
13940; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v6
13941; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v6
13942; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
13943; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v6, v7
13944; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
13945; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
13946; GFX7-NEXT:    s_cbranch_execnz .LBB52_1
13947; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
13948; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
13949; GFX7-NEXT:    s_setpc_b64 s[30:31]
  ; 511 elements of <2 x half> (4 bytes each) is a +2044 byte displacement.
13950  %gep = getelementptr <2 x half>, ptr %ptr, i64 511
13951  %result = atomicrmw fmin ptr %gep, <2 x half> %val seq_cst, !amdgpu.no.fine.grained.memory !0
13952  ret <2 x half> %result
13953}
13954
13955define void @flat_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr %ptr, <2 x half> %val) #0 {
13956; GFX12-LABEL: flat_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
13957; GFX12:       ; %bb.0:
13958; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
13959; GFX12-NEXT:    s_wait_expcnt 0x0
13960; GFX12-NEXT:    s_wait_samplecnt 0x0
13961; GFX12-NEXT:    s_wait_bvhcnt 0x0
13962; GFX12-NEXT:    s_wait_kmcnt 0x0
13963; GFX12-NEXT:    flat_load_b32 v3, v[0:1] offset:2044
13964; GFX12-NEXT:    v_pk_max_num_f16 v4, v2, v2
13965; GFX12-NEXT:    s_mov_b32 s0, 0
13966; GFX12-NEXT:  .LBB53_1: ; %atomicrmw.start
13967; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
13968; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
13969; GFX12-NEXT:    v_pk_max_num_f16 v2, v3, v3
13970; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
13971; GFX12-NEXT:    v_pk_min_num_f16 v2, v2, v4
13972; GFX12-NEXT:    global_wb scope:SCOPE_SYS
13973; GFX12-NEXT:    s_wait_storecnt 0x0
13974; GFX12-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
13975; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
13976; GFX12-NEXT:    global_inv scope:SCOPE_SYS
13977; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
13978; GFX12-NEXT:    v_mov_b32_e32 v3, v2
13979; GFX12-NEXT:    s_wait_alu 0xfffe
13980; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
13981; GFX12-NEXT:    s_wait_alu 0xfffe
13982; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
13983; GFX12-NEXT:    s_cbranch_execnz .LBB53_1
13984; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
13985; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
13986; GFX12-NEXT:    s_wait_alu 0xfffe
13987; GFX12-NEXT:    s_setpc_b64 s[30:31]
13988;
13989; GFX940-LABEL: flat_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
13990; GFX940:       ; %bb.0:
13991; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13992; GFX940-NEXT:    flat_load_dword v3, v[0:1] offset:2044
13993; GFX940-NEXT:    s_mov_b64 s[0:1], 0
13994; GFX940-NEXT:    v_pk_max_f16 v4, v2, v2
13995; GFX940-NEXT:  .LBB53_1: ; %atomicrmw.start
13996; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
13997; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
13998; GFX940-NEXT:    v_pk_max_f16 v2, v3, v3
13999; GFX940-NEXT:    s_nop 0
14000; GFX940-NEXT:    v_pk_min_f16 v2, v2, v4
14001; GFX940-NEXT:    buffer_wbl2 sc0 sc1
14002; GFX940-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 sc1
14003; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
14004; GFX940-NEXT:    buffer_inv sc0 sc1
14005; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
14006; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
14007; GFX940-NEXT:    v_mov_b32_e32 v3, v2
14008; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
14009; GFX940-NEXT:    s_cbranch_execnz .LBB53_1
14010; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
14011; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
14012; GFX940-NEXT:    s_setpc_b64 s[30:31]
14013;
14014; GFX11-LABEL: flat_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
14015; GFX11:       ; %bb.0:
14016; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
14017; GFX11-NEXT:    flat_load_b32 v3, v[0:1] offset:2044
14018; GFX11-NEXT:    v_pk_max_f16 v4, v2, v2
14019; GFX11-NEXT:    s_mov_b32 s0, 0
14020; GFX11-NEXT:  .LBB53_1: ; %atomicrmw.start
14021; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
14022; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
14023; GFX11-NEXT:    v_pk_max_f16 v2, v3, v3
14024; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
14025; GFX11-NEXT:    v_pk_min_f16 v2, v2, v4
14026; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
14027; GFX11-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 glc
14028; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
14029; GFX11-NEXT:    buffer_gl1_inv
14030; GFX11-NEXT:    buffer_gl0_inv
14031; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
14032; GFX11-NEXT:    v_mov_b32_e32 v3, v2
14033; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
14034; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
14035; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
14036; GFX11-NEXT:    s_cbranch_execnz .LBB53_1
14037; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
14038; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
14039; GFX11-NEXT:    s_setpc_b64 s[30:31]
14040;
14041; GFX10-LABEL: flat_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
14042; GFX10:       ; %bb.0:
14043; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
14044; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, 0x7fc, v0
14045; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
14046; GFX10-NEXT:    v_pk_max_f16 v4, v2, v2
14047; GFX10-NEXT:    s_mov_b32 s4, 0
14048; GFX10-NEXT:    flat_load_dword v3, v[0:1]
14049; GFX10-NEXT:  .LBB53_1: ; %atomicrmw.start
14050; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
14051; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
14052; GFX10-NEXT:    v_pk_max_f16 v2, v3, v3
14053; GFX10-NEXT:    v_pk_min_f16 v2, v2, v4
14054; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
14055; GFX10-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
14056; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
14057; GFX10-NEXT:    buffer_gl1_inv
14058; GFX10-NEXT:    buffer_gl0_inv
14059; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
14060; GFX10-NEXT:    v_mov_b32_e32 v3, v2
14061; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
14062; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
14063; GFX10-NEXT:    s_cbranch_execnz .LBB53_1
14064; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
14065; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
14066; GFX10-NEXT:    s_setpc_b64 s[30:31]
14067;
14068; GFX90A-LABEL: flat_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
14069; GFX90A:       ; %bb.0:
14070; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
14071; GFX90A-NEXT:    flat_load_dword v3, v[0:1] offset:2044
14072; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
14073; GFX90A-NEXT:    v_pk_max_f16 v4, v2, v2
14074; GFX90A-NEXT:  .LBB53_1: ; %atomicrmw.start
14075; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
14076; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
14077; GFX90A-NEXT:    v_pk_max_f16 v2, v3, v3
14078; GFX90A-NEXT:    v_pk_min_f16 v2, v2, v4
14079; GFX90A-NEXT:    buffer_wbl2
14080; GFX90A-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc
14081; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
14082; GFX90A-NEXT:    buffer_invl2
14083; GFX90A-NEXT:    buffer_wbinvl1
14084; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
14085; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
14086; GFX90A-NEXT:    v_mov_b32_e32 v3, v2
14087; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
14088; GFX90A-NEXT:    s_cbranch_execnz .LBB53_1
14089; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
14090; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
14091; GFX90A-NEXT:    s_setpc_b64 s[30:31]
14092;
14093; GFX908-LABEL: flat_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
14094; GFX908:       ; %bb.0:
14095; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
14096; GFX908-NEXT:    flat_load_dword v3, v[0:1] offset:2044
14097; GFX908-NEXT:    s_mov_b64 s[4:5], 0
14098; GFX908-NEXT:    v_pk_max_f16 v4, v2, v2
14099; GFX908-NEXT:  .LBB53_1: ; %atomicrmw.start
14100; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
14101; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
14102; GFX908-NEXT:    v_pk_max_f16 v2, v3, v3
14103; GFX908-NEXT:    v_pk_min_f16 v2, v2, v4
14104; GFX908-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc
14105; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
14106; GFX908-NEXT:    buffer_wbinvl1
14107; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
14108; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
14109; GFX908-NEXT:    v_mov_b32_e32 v3, v2
14110; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
14111; GFX908-NEXT:    s_cbranch_execnz .LBB53_1
14112; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
14113; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
14114; GFX908-NEXT:    s_setpc_b64 s[30:31]
14115;
14116; GFX8-LABEL: flat_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
14117; GFX8:       ; %bb.0:
14118; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
14119; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 0x7fc, v0
14120; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
14121; GFX8-NEXT:    flat_load_dword v3, v[0:1]
14122; GFX8-NEXT:    s_mov_b64 s[4:5], 0
14123; GFX8-NEXT:    v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
14124; GFX8-NEXT:    v_max_f16_e32 v5, v2, v2
14125; GFX8-NEXT:  .LBB53_1: ; %atomicrmw.start
14126; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
14127; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
14128; GFX8-NEXT:    v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
14129; GFX8-NEXT:    v_max_f16_e32 v6, v3, v3
14130; GFX8-NEXT:    v_min_f16_sdwa v2, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
14131; GFX8-NEXT:    v_min_f16_e32 v6, v6, v5
14132; GFX8-NEXT:    v_or_b32_e32 v2, v6, v2
14133; GFX8-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
14134; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
14135; GFX8-NEXT:    buffer_wbinvl1
14136; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
14137; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
14138; GFX8-NEXT:    v_mov_b32_e32 v3, v2
14139; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
14140; GFX8-NEXT:    s_cbranch_execnz .LBB53_1
14141; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
14142; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
14143; GFX8-NEXT:    s_setpc_b64 s[30:31]
14144;
14145; GFX7-LABEL: flat_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
14146; GFX7:       ; %bb.0:
14147; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
14148; GFX7-NEXT:    v_add_i32_e32 v0, vcc, 0x7fc, v0
14149; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
14150; GFX7-NEXT:    flat_load_dword v5, v[0:1]
14151; GFX7-NEXT:    v_cvt_f16_f32_e32 v3, v3
14152; GFX7-NEXT:    v_cvt_f16_f32_e32 v6, v2
14153; GFX7-NEXT:    s_mov_b64 s[4:5], 0
14154; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v3
14155; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
14156; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v5
14157; GFX7-NEXT:    v_cvt_f32_f16_e32 v4, v5
14158; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v3
14159; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v6
14160; GFX7-NEXT:  .LBB53_1: ; %atomicrmw.start
14161; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
14162; GFX7-NEXT:    v_cvt_f16_f32_e32 v5, v5
14163; GFX7-NEXT:    v_cvt_f16_f32_e32 v4, v4
14164; GFX7-NEXT:    v_cvt_f32_f16_e32 v6, v5
14165; GFX7-NEXT:    v_cvt_f32_f16_e32 v7, v4
14166; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
14167; GFX7-NEXT:    v_min_f32_e32 v6, v6, v2
14168; GFX7-NEXT:    v_min_f32_e32 v7, v7, v3
14169; GFX7-NEXT:    v_cvt_f16_f32_e32 v8, v6
14170; GFX7-NEXT:    v_cvt_f16_f32_e32 v7, v7
14171; GFX7-NEXT:    v_or_b32_e32 v6, v4, v5
14172; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v8
14173; GFX7-NEXT:    v_or_b32_e32 v5, v7, v4
14174; GFX7-NEXT:    flat_atomic_cmpswap v7, v[0:1], v[5:6] glc
14175; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
14176; GFX7-NEXT:    buffer_wbinvl1
14177; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 16, v7
14178; GFX7-NEXT:    v_cvt_f32_f16_e32 v4, v7
14179; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v5
14180; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v7, v6
14181; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
14182; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
14183; GFX7-NEXT:    s_cbranch_execnz .LBB53_1
14184; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
14185; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
14186; GFX7-NEXT:    s_setpc_b64 s[30:31]
14187  %gep = getelementptr <2 x half>, ptr %ptr, i64 511
14188  %unused = atomicrmw fmin ptr %gep, <2 x half> %val seq_cst, !amdgpu.no.fine.grained.memory !0
14189  ret void
14190}
14191
14192; --------------------------------------------------------------------
14193; <2 x bfloat>
14194; --------------------------------------------------------------------
14195
; Agent-scope, seq_cst `atomicrmw fmin` of a <2 x bfloat> through a flat
; pointer with no address offset, tagged !amdgpu.no.fine.grained.memory; the
; value returned by the atomicrmw is the function result.
;
; What the assertions below pin down, per run line in the file header:
;  * Every target expands the operation into an initial 32-bit flat load
;    followed by a flat_atomic_cmpswap retry loop (%atomicrmw.start up to
;    %atomicrmw.end); no run emits a single packed-bf16 min atomic.
;  * In the loop each bf16 element is moved to the top half of a dword
;    (v_lshlrev_b32 by 16 / v_and_b32 0xffff0000) and combined with an f32 min
;    (v_min_num_f32 for gfx1200, v_min_f32 elsewhere).
;  * The tonga and newer runs then narrow with a v_bfe_u32 bit-16 / +0x7fff
;    sequence, selecting the 0x400000-OR'd value when v_cmp_u_f32 is true
;    (presumably round-to-nearest-even with NaN quieting) before repacking.
;    The hawaii run has no such sequence; it repacks the top halves with
;    v_lshrrev_b32 / v_alignbit_b32, reads the two input elements from v2 and
;    v3, and hands the result back in v0 and v1 instead of a single packed v0.
;  * The loop accumulates the v_cmp_eq_u32 result (cmpswap return value versus
;    the expected value) into an SGPR mask, clears those bits from exec, and
;    branches back while exec is non-zero.
;
; The assertions are autogenerated (see the header of this file); regenerate
; them with utils/update_llc_test_checks.py rather than editing by hand.
14196define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_memory(ptr %ptr, <2 x bfloat> %val) #0 {
14197; GFX12-LABEL: flat_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_memory:
14198; GFX12:       ; %bb.0:
14199; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
14200; GFX12-NEXT:    s_wait_expcnt 0x0
14201; GFX12-NEXT:    s_wait_samplecnt 0x0
14202; GFX12-NEXT:    s_wait_bvhcnt 0x0
14203; GFX12-NEXT:    s_wait_kmcnt 0x0
14204; GFX12-NEXT:    flat_load_b32 v3, v[0:1]
14205; GFX12-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
14206; GFX12-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
14207; GFX12-NEXT:    s_mov_b32 s1, 0
14208; GFX12-NEXT:  .LBB54_1: ; %atomicrmw.start
14209; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
14210; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
14211; GFX12-NEXT:    v_mov_b32_e32 v6, v3
14212; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
14213; GFX12-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
14214; GFX12-NEXT:    v_min_num_f32_e32 v5, v5, v2
14215; GFX12-NEXT:    v_lshlrev_b32_e32 v3, 16, v6
14216; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
14217; GFX12-NEXT:    v_bfe_u32 v8, v5, 16, 1
14218; GFX12-NEXT:    v_min_num_f32_e32 v3, v3, v4
14219; GFX12-NEXT:    v_or_b32_e32 v10, 0x400000, v5
14220; GFX12-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
14221; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
14222; GFX12-NEXT:    v_add3_u32 v8, v8, v5, 0x7fff
14223; GFX12-NEXT:    v_bfe_u32 v7, v3, 16, 1
14224; GFX12-NEXT:    v_or_b32_e32 v9, 0x400000, v3
14225; GFX12-NEXT:    v_cmp_u_f32_e64 s0, v3, v3
14226; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
14227; GFX12-NEXT:    v_cndmask_b32_e32 v5, v8, v10, vcc_lo
14228; GFX12-NEXT:    v_add3_u32 v7, v7, v3, 0x7fff
14229; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
14230; GFX12-NEXT:    v_cndmask_b32_e64 v3, v7, v9, s0
14231; GFX12-NEXT:    v_perm_b32 v5, v5, v3, 0x7060302
14232; GFX12-NEXT:    s_wait_storecnt 0x0
14233; GFX12-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
14234; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
14235; GFX12-NEXT:    global_inv scope:SCOPE_DEV
14236; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v6
14237; GFX12-NEXT:    s_wait_alu 0xfffe
14238; GFX12-NEXT:    s_or_b32 s1, vcc_lo, s1
14239; GFX12-NEXT:    s_wait_alu 0xfffe
14240; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
14241; GFX12-NEXT:    s_cbranch_execnz .LBB54_1
14242; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
14243; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s1
14244; GFX12-NEXT:    v_mov_b32_e32 v0, v3
14245; GFX12-NEXT:    s_wait_alu 0xfffe
14246; GFX12-NEXT:    s_setpc_b64 s[30:31]
14247;
14248; GFX940-LABEL: flat_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_memory:
14249; GFX940:       ; %bb.0:
14250; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
14251; GFX940-NEXT:    flat_load_dword v3, v[0:1]
14252; GFX940-NEXT:    s_mov_b64 s[2:3], 0
14253; GFX940-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
14254; GFX940-NEXT:    s_movk_i32 s4, 0x7fff
14255; GFX940-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
14256; GFX940-NEXT:    s_mov_b32 s5, 0x7060302
14257; GFX940-NEXT:  .LBB54_1: ; %atomicrmw.start
14258; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
14259; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
14260; GFX940-NEXT:    v_mov_b32_e32 v7, v3
14261; GFX940-NEXT:    v_lshlrev_b32_e32 v3, 16, v7
14262; GFX940-NEXT:    v_and_b32_e32 v5, 0xffff0000, v7
14263; GFX940-NEXT:    v_min_f32_e32 v3, v3, v4
14264; GFX940-NEXT:    v_min_f32_e32 v5, v5, v2
14265; GFX940-NEXT:    v_bfe_u32 v6, v3, 16, 1
14266; GFX940-NEXT:    v_bfe_u32 v9, v5, 16, 1
14267; GFX940-NEXT:    v_or_b32_e32 v8, 0x400000, v3
14268; GFX940-NEXT:    v_or_b32_e32 v10, 0x400000, v5
14269; GFX940-NEXT:    v_add3_u32 v6, v6, v3, s4
14270; GFX940-NEXT:    v_add3_u32 v9, v9, v5, s4
14271; GFX940-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
14272; GFX940-NEXT:    v_cmp_u_f32_e64 s[0:1], v3, v3
14273; GFX940-NEXT:    s_nop 0
14274; GFX940-NEXT:    v_cndmask_b32_e32 v5, v9, v10, vcc
14275; GFX940-NEXT:    v_cndmask_b32_e64 v3, v6, v8, s[0:1]
14276; GFX940-NEXT:    v_perm_b32 v6, v5, v3, s5
14277; GFX940-NEXT:    buffer_wbl2 sc1
14278; GFX940-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[6:7] sc0
14279; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
14280; GFX940-NEXT:    buffer_inv sc1
14281; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v7
14282; GFX940-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
14283; GFX940-NEXT:    s_andn2_b64 exec, exec, s[2:3]
14284; GFX940-NEXT:    s_cbranch_execnz .LBB54_1
14285; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
14286; GFX940-NEXT:    s_or_b64 exec, exec, s[2:3]
14287; GFX940-NEXT:    v_mov_b32_e32 v0, v3
14288; GFX940-NEXT:    s_setpc_b64 s[30:31]
14289;
14290; GFX11-LABEL: flat_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_memory:
14291; GFX11:       ; %bb.0:
14292; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
14293; GFX11-NEXT:    flat_load_b32 v3, v[0:1]
14294; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
14295; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
14296; GFX11-NEXT:    s_mov_b32 s1, 0
14297; GFX11-NEXT:    s_set_inst_prefetch_distance 0x1
14298; GFX11-NEXT:    .p2align 6
14299; GFX11-NEXT:  .LBB54_1: ; %atomicrmw.start
14300; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
14301; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
14302; GFX11-NEXT:    v_mov_b32_e32 v6, v3
14303; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
14304; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
14305; GFX11-NEXT:    v_min_f32_e32 v5, v5, v2
14306; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v6
14307; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
14308; GFX11-NEXT:    v_bfe_u32 v8, v5, 16, 1
14309; GFX11-NEXT:    v_min_f32_e32 v3, v3, v4
14310; GFX11-NEXT:    v_or_b32_e32 v10, 0x400000, v5
14311; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
14312; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
14313; GFX11-NEXT:    v_add3_u32 v8, v8, v5, 0x7fff
14314; GFX11-NEXT:    v_bfe_u32 v7, v3, 16, 1
14315; GFX11-NEXT:    v_or_b32_e32 v9, 0x400000, v3
14316; GFX11-NEXT:    v_cmp_u_f32_e64 s0, v3, v3
14317; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
14318; GFX11-NEXT:    v_cndmask_b32_e32 v5, v8, v10, vcc_lo
14319; GFX11-NEXT:    v_add3_u32 v7, v7, v3, 0x7fff
14320; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
14321; GFX11-NEXT:    v_cndmask_b32_e64 v3, v7, v9, s0
14322; GFX11-NEXT:    v_perm_b32 v5, v5, v3, 0x7060302
14323; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
14324; GFX11-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] glc
14325; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
14326; GFX11-NEXT:    buffer_gl1_inv
14327; GFX11-NEXT:    buffer_gl0_inv
14328; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v6
14329; GFX11-NEXT:    s_or_b32 s1, vcc_lo, s1
14330; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
14331; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
14332; GFX11-NEXT:    s_cbranch_execnz .LBB54_1
14333; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
14334; GFX11-NEXT:    s_set_inst_prefetch_distance 0x2
14335; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s1
14336; GFX11-NEXT:    v_mov_b32_e32 v0, v3
14337; GFX11-NEXT:    s_setpc_b64 s[30:31]
14338;
14339; GFX10-LABEL: flat_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_memory:
14340; GFX10:       ; %bb.0:
14341; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
14342; GFX10-NEXT:    flat_load_dword v3, v[0:1]
14343; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
14344; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
14345; GFX10-NEXT:    s_mov_b32 s5, 0
14346; GFX10-NEXT:  .LBB54_1: ; %atomicrmw.start
14347; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
14348; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
14349; GFX10-NEXT:    v_mov_b32_e32 v6, v3
14350; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v6
14351; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
14352; GFX10-NEXT:    v_min_f32_e32 v3, v3, v4
14353; GFX10-NEXT:    v_min_f32_e32 v5, v5, v2
14354; GFX10-NEXT:    v_bfe_u32 v7, v3, 16, 1
14355; GFX10-NEXT:    v_bfe_u32 v8, v5, 16, 1
14356; GFX10-NEXT:    v_or_b32_e32 v9, 0x400000, v3
14357; GFX10-NEXT:    v_or_b32_e32 v10, 0x400000, v5
14358; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
14359; GFX10-NEXT:    v_add3_u32 v7, v7, v3, 0x7fff
14360; GFX10-NEXT:    v_add3_u32 v8, v8, v5, 0x7fff
14361; GFX10-NEXT:    v_cmp_u_f32_e64 s4, v3, v3
14362; GFX10-NEXT:    v_cndmask_b32_e32 v5, v8, v10, vcc_lo
14363; GFX10-NEXT:    v_cndmask_b32_e64 v3, v7, v9, s4
14364; GFX10-NEXT:    v_perm_b32 v5, v5, v3, 0x7060302
14365; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
14366; GFX10-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[5:6] glc
14367; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
14368; GFX10-NEXT:    buffer_gl1_inv
14369; GFX10-NEXT:    buffer_gl0_inv
14370; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v6
14371; GFX10-NEXT:    s_or_b32 s5, vcc_lo, s5
14372; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s5
14373; GFX10-NEXT:    s_cbranch_execnz .LBB54_1
14374; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
14375; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s5
14376; GFX10-NEXT:    v_mov_b32_e32 v0, v3
14377; GFX10-NEXT:    s_setpc_b64 s[30:31]
14378;
14379; GFX90A-LABEL: flat_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_memory:
14380; GFX90A:       ; %bb.0:
14381; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
14382; GFX90A-NEXT:    flat_load_dword v3, v[0:1]
14383; GFX90A-NEXT:    s_mov_b64 s[6:7], 0
14384; GFX90A-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
14385; GFX90A-NEXT:    s_movk_i32 s8, 0x7fff
14386; GFX90A-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
14387; GFX90A-NEXT:    s_mov_b32 s9, 0x7060302
14388; GFX90A-NEXT:  .LBB54_1: ; %atomicrmw.start
14389; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
14390; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
14391; GFX90A-NEXT:    v_mov_b32_e32 v7, v3
14392; GFX90A-NEXT:    v_lshlrev_b32_e32 v3, 16, v7
14393; GFX90A-NEXT:    v_and_b32_e32 v5, 0xffff0000, v7
14394; GFX90A-NEXT:    v_min_f32_e32 v3, v3, v4
14395; GFX90A-NEXT:    v_min_f32_e32 v5, v5, v2
14396; GFX90A-NEXT:    v_bfe_u32 v6, v3, 16, 1
14397; GFX90A-NEXT:    v_bfe_u32 v9, v5, 16, 1
14398; GFX90A-NEXT:    v_or_b32_e32 v8, 0x400000, v3
14399; GFX90A-NEXT:    v_or_b32_e32 v10, 0x400000, v5
14400; GFX90A-NEXT:    v_add3_u32 v6, v6, v3, s8
14401; GFX90A-NEXT:    v_add3_u32 v9, v9, v5, s8
14402; GFX90A-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
14403; GFX90A-NEXT:    v_cmp_u_f32_e64 s[4:5], v3, v3
14404; GFX90A-NEXT:    v_cndmask_b32_e64 v3, v6, v8, s[4:5]
14405; GFX90A-NEXT:    v_cndmask_b32_e32 v5, v9, v10, vcc
14406; GFX90A-NEXT:    v_perm_b32 v6, v5, v3, s9
14407; GFX90A-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[6:7] glc
14408; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
14409; GFX90A-NEXT:    buffer_wbinvl1
14410; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v7
14411; GFX90A-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
14412; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[6:7]
14413; GFX90A-NEXT:    s_cbranch_execnz .LBB54_1
14414; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
14415; GFX90A-NEXT:    s_or_b64 exec, exec, s[6:7]
14416; GFX90A-NEXT:    v_mov_b32_e32 v0, v3
14417; GFX90A-NEXT:    s_setpc_b64 s[30:31]
14418;
14419; GFX908-LABEL: flat_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_memory:
14420; GFX908:       ; %bb.0:
14421; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
14422; GFX908-NEXT:    flat_load_dword v3, v[0:1]
14423; GFX908-NEXT:    s_mov_b64 s[6:7], 0
14424; GFX908-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
14425; GFX908-NEXT:    s_movk_i32 s8, 0x7fff
14426; GFX908-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
14427; GFX908-NEXT:    s_mov_b32 s9, 0x7060302
14428; GFX908-NEXT:  .LBB54_1: ; %atomicrmw.start
14429; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
14430; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
14431; GFX908-NEXT:    v_mov_b32_e32 v6, v3
14432; GFX908-NEXT:    v_lshlrev_b32_e32 v3, 16, v6
14433; GFX908-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
14434; GFX908-NEXT:    v_min_f32_e32 v3, v3, v4
14435; GFX908-NEXT:    v_min_f32_e32 v5, v5, v2
14436; GFX908-NEXT:    v_bfe_u32 v7, v3, 16, 1
14437; GFX908-NEXT:    v_bfe_u32 v9, v5, 16, 1
14438; GFX908-NEXT:    v_or_b32_e32 v8, 0x400000, v3
14439; GFX908-NEXT:    v_or_b32_e32 v10, 0x400000, v5
14440; GFX908-NEXT:    v_add3_u32 v7, v7, v3, s8
14441; GFX908-NEXT:    v_add3_u32 v9, v9, v5, s8
14442; GFX908-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
14443; GFX908-NEXT:    v_cmp_u_f32_e64 s[4:5], v3, v3
14444; GFX908-NEXT:    v_cndmask_b32_e64 v3, v7, v8, s[4:5]
14445; GFX908-NEXT:    v_cndmask_b32_e32 v5, v9, v10, vcc
14446; GFX908-NEXT:    v_perm_b32 v5, v5, v3, s9
14447; GFX908-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[5:6] glc
14448; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
14449; GFX908-NEXT:    buffer_wbinvl1
14450; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v6
14451; GFX908-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
14452; GFX908-NEXT:    s_andn2_b64 exec, exec, s[6:7]
14453; GFX908-NEXT:    s_cbranch_execnz .LBB54_1
14454; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
14455; GFX908-NEXT:    s_or_b64 exec, exec, s[6:7]
14456; GFX908-NEXT:    v_mov_b32_e32 v0, v3
14457; GFX908-NEXT:    s_setpc_b64 s[30:31]
14458;
14459; GFX8-LABEL: flat_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_memory:
14460; GFX8:       ; %bb.0:
14461; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
14462; GFX8-NEXT:    flat_load_dword v3, v[0:1]
14463; GFX8-NEXT:    s_mov_b64 s[6:7], 0
14464; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
14465; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
14466; GFX8-NEXT:  .LBB54_1: ; %atomicrmw.start
14467; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
14468; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
14469; GFX8-NEXT:    v_mov_b32_e32 v6, v3
14470; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v6
14471; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
14472; GFX8-NEXT:    v_min_f32_e32 v3, v3, v4
14473; GFX8-NEXT:    v_min_f32_e32 v5, v5, v2
14474; GFX8-NEXT:    v_bfe_u32 v7, v3, 16, 1
14475; GFX8-NEXT:    v_bfe_u32 v9, v5, 16, 1
14476; GFX8-NEXT:    v_add_u32_e32 v7, vcc, v7, v3
14477; GFX8-NEXT:    v_add_u32_e32 v9, vcc, v9, v5
14478; GFX8-NEXT:    v_add_u32_e32 v7, vcc, 0x7fff, v7
14479; GFX8-NEXT:    v_add_u32_e32 v9, vcc, 0x7fff, v9
14480; GFX8-NEXT:    v_or_b32_e32 v10, 0x400000, v5
14481; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
14482; GFX8-NEXT:    v_or_b32_e32 v8, 0x400000, v3
14483; GFX8-NEXT:    v_cmp_u_f32_e64 s[4:5], v3, v3
14484; GFX8-NEXT:    v_cndmask_b32_e32 v5, v9, v10, vcc
14485; GFX8-NEXT:    v_cndmask_b32_e64 v3, v7, v8, s[4:5]
14486; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
14487; GFX8-NEXT:    v_alignbit_b32 v5, v5, v3, 16
14488; GFX8-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[5:6] glc
14489; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
14490; GFX8-NEXT:    buffer_wbinvl1
14491; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v6
14492; GFX8-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
14493; GFX8-NEXT:    s_andn2_b64 exec, exec, s[6:7]
14494; GFX8-NEXT:    s_cbranch_execnz .LBB54_1
14495; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
14496; GFX8-NEXT:    s_or_b64 exec, exec, s[6:7]
14497; GFX8-NEXT:    v_mov_b32_e32 v0, v3
14498; GFX8-NEXT:    s_setpc_b64 s[30:31]
14499;
14500; GFX7-LABEL: flat_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_memory:
14501; GFX7:       ; %bb.0:
14502; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
14503; GFX7-NEXT:    flat_load_dword v5, v[0:1]
14504; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v3
14505; GFX7-NEXT:    v_mul_f32_e32 v6, 1.0, v2
14506; GFX7-NEXT:    s_mov_b64 s[4:5], 0
14507; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff0000, v3
14508; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
14509; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v5
14510; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v5
14511; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
14512; GFX7-NEXT:  .LBB54_1: ; %atomicrmw.start
14513; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
14514; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
14515; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v3
14516; GFX7-NEXT:    v_and_b32_e32 v6, 0xffff0000, v2
14517; GFX7-NEXT:    v_and_b32_e32 v7, 0xffff0000, v3
14518; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
14519; GFX7-NEXT:    v_min_f32_e32 v6, v6, v4
14520; GFX7-NEXT:    v_min_f32_e32 v7, v7, v5
14521; GFX7-NEXT:    v_alignbit_b32 v3, v2, v3, 16
14522; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 16, v6
14523; GFX7-NEXT:    v_alignbit_b32 v2, v2, v7, 16
14524; GFX7-NEXT:    flat_atomic_cmpswap v6, v[0:1], v[2:3] glc
14525; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
14526; GFX7-NEXT:    buffer_wbinvl1
14527; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v6, v3
14528; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v6
14529; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
14530; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v6
14531; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
14532; GFX7-NEXT:    s_cbranch_execnz .LBB54_1
14533; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
14534; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
14535; GFX7-NEXT:    v_mov_b32_e32 v0, v3
14536; GFX7-NEXT:    v_mov_b32_e32 v1, v2
14537; GFX7-NEXT:    s_setpc_b64 s[30:31]
14538  %result = atomicrmw fmin ptr %ptr, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
14539  ret <2 x bfloat> %result
14540}
14541
14542define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr %ptr, <2 x bfloat> %val) #0 {
14543; GFX12-LABEL: flat_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
14544; GFX12:       ; %bb.0:
14545; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
14546; GFX12-NEXT:    s_wait_expcnt 0x0
14547; GFX12-NEXT:    s_wait_samplecnt 0x0
14548; GFX12-NEXT:    s_wait_bvhcnt 0x0
14549; GFX12-NEXT:    s_wait_kmcnt 0x0
14550; GFX12-NEXT:    flat_load_b32 v3, v[0:1] offset:2044
14551; GFX12-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
14552; GFX12-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
14553; GFX12-NEXT:    s_mov_b32 s1, 0
14554; GFX12-NEXT:  .LBB55_1: ; %atomicrmw.start
14555; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
14556; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
14557; GFX12-NEXT:    v_mov_b32_e32 v6, v3
14558; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
14559; GFX12-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
14560; GFX12-NEXT:    v_min_num_f32_e32 v5, v5, v2
14561; GFX12-NEXT:    v_lshlrev_b32_e32 v3, 16, v6
14562; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
14563; GFX12-NEXT:    v_bfe_u32 v8, v5, 16, 1
14564; GFX12-NEXT:    v_min_num_f32_e32 v3, v3, v4
14565; GFX12-NEXT:    v_or_b32_e32 v10, 0x400000, v5
14566; GFX12-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
14567; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
14568; GFX12-NEXT:    v_add3_u32 v8, v8, v5, 0x7fff
14569; GFX12-NEXT:    v_bfe_u32 v7, v3, 16, 1
14570; GFX12-NEXT:    v_or_b32_e32 v9, 0x400000, v3
14571; GFX12-NEXT:    v_cmp_u_f32_e64 s0, v3, v3
14572; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
14573; GFX12-NEXT:    v_cndmask_b32_e32 v5, v8, v10, vcc_lo
14574; GFX12-NEXT:    v_add3_u32 v7, v7, v3, 0x7fff
14575; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
14576; GFX12-NEXT:    v_cndmask_b32_e64 v3, v7, v9, s0
14577; GFX12-NEXT:    v_perm_b32 v5, v5, v3, 0x7060302
14578; GFX12-NEXT:    s_wait_storecnt 0x0
14579; GFX12-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
14580; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
14581; GFX12-NEXT:    global_inv scope:SCOPE_DEV
14582; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v6
14583; GFX12-NEXT:    s_wait_alu 0xfffe
14584; GFX12-NEXT:    s_or_b32 s1, vcc_lo, s1
14585; GFX12-NEXT:    s_wait_alu 0xfffe
14586; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
14587; GFX12-NEXT:    s_cbranch_execnz .LBB55_1
14588; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
14589; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s1
14590; GFX12-NEXT:    v_mov_b32_e32 v0, v3
14591; GFX12-NEXT:    s_wait_alu 0xfffe
14592; GFX12-NEXT:    s_setpc_b64 s[30:31]
14593;
14594; GFX940-LABEL: flat_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
14595; GFX940:       ; %bb.0:
14596; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
14597; GFX940-NEXT:    flat_load_dword v3, v[0:1] offset:2044
14598; GFX940-NEXT:    s_mov_b64 s[2:3], 0
14599; GFX940-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
14600; GFX940-NEXT:    s_movk_i32 s4, 0x7fff
14601; GFX940-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
14602; GFX940-NEXT:    s_mov_b32 s5, 0x7060302
14603; GFX940-NEXT:  .LBB55_1: ; %atomicrmw.start
14604; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
14605; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
14606; GFX940-NEXT:    v_mov_b32_e32 v7, v3
14607; GFX940-NEXT:    v_lshlrev_b32_e32 v3, 16, v7
14608; GFX940-NEXT:    v_and_b32_e32 v5, 0xffff0000, v7
14609; GFX940-NEXT:    v_min_f32_e32 v3, v3, v4
14610; GFX940-NEXT:    v_min_f32_e32 v5, v5, v2
14611; GFX940-NEXT:    v_bfe_u32 v6, v3, 16, 1
14612; GFX940-NEXT:    v_bfe_u32 v9, v5, 16, 1
14613; GFX940-NEXT:    v_or_b32_e32 v8, 0x400000, v3
14614; GFX940-NEXT:    v_or_b32_e32 v10, 0x400000, v5
14615; GFX940-NEXT:    v_add3_u32 v6, v6, v3, s4
14616; GFX940-NEXT:    v_add3_u32 v9, v9, v5, s4
14617; GFX940-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
14618; GFX940-NEXT:    v_cmp_u_f32_e64 s[0:1], v3, v3
14619; GFX940-NEXT:    s_nop 0
14620; GFX940-NEXT:    v_cndmask_b32_e32 v5, v9, v10, vcc
14621; GFX940-NEXT:    v_cndmask_b32_e64 v3, v6, v8, s[0:1]
14622; GFX940-NEXT:    v_perm_b32 v6, v5, v3, s5
14623; GFX940-NEXT:    buffer_wbl2 sc1
14624; GFX940-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[6:7] offset:2044 sc0
14625; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
14626; GFX940-NEXT:    buffer_inv sc1
14627; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v7
14628; GFX940-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
14629; GFX940-NEXT:    s_andn2_b64 exec, exec, s[2:3]
14630; GFX940-NEXT:    s_cbranch_execnz .LBB55_1
14631; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
14632; GFX940-NEXT:    s_or_b64 exec, exec, s[2:3]
14633; GFX940-NEXT:    v_mov_b32_e32 v0, v3
14634; GFX940-NEXT:    s_setpc_b64 s[30:31]
14635;
14636; GFX11-LABEL: flat_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
14637; GFX11:       ; %bb.0:
14638; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
14639; GFX11-NEXT:    flat_load_b32 v3, v[0:1] offset:2044
14640; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
14641; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
14642; GFX11-NEXT:    s_mov_b32 s1, 0
14643; GFX11-NEXT:    s_set_inst_prefetch_distance 0x1
14644; GFX11-NEXT:    .p2align 6
14645; GFX11-NEXT:  .LBB55_1: ; %atomicrmw.start
14646; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
14647; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
14648; GFX11-NEXT:    v_mov_b32_e32 v6, v3
14649; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
14650; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
14651; GFX11-NEXT:    v_min_f32_e32 v5, v5, v2
14652; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v6
14653; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
14654; GFX11-NEXT:    v_bfe_u32 v8, v5, 16, 1
14655; GFX11-NEXT:    v_min_f32_e32 v3, v3, v4
14656; GFX11-NEXT:    v_or_b32_e32 v10, 0x400000, v5
14657; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
14658; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
14659; GFX11-NEXT:    v_add3_u32 v8, v8, v5, 0x7fff
14660; GFX11-NEXT:    v_bfe_u32 v7, v3, 16, 1
14661; GFX11-NEXT:    v_or_b32_e32 v9, 0x400000, v3
14662; GFX11-NEXT:    v_cmp_u_f32_e64 s0, v3, v3
14663; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
14664; GFX11-NEXT:    v_cndmask_b32_e32 v5, v8, v10, vcc_lo
14665; GFX11-NEXT:    v_add3_u32 v7, v7, v3, 0x7fff
14666; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
14667; GFX11-NEXT:    v_cndmask_b32_e64 v3, v7, v9, s0
14668; GFX11-NEXT:    v_perm_b32 v5, v5, v3, 0x7060302
14669; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
14670; GFX11-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 glc
14671; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
14672; GFX11-NEXT:    buffer_gl1_inv
14673; GFX11-NEXT:    buffer_gl0_inv
14674; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v6
14675; GFX11-NEXT:    s_or_b32 s1, vcc_lo, s1
14676; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
14677; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
14678; GFX11-NEXT:    s_cbranch_execnz .LBB55_1
14679; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
14680; GFX11-NEXT:    s_set_inst_prefetch_distance 0x2
14681; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s1
14682; GFX11-NEXT:    v_mov_b32_e32 v0, v3
14683; GFX11-NEXT:    s_setpc_b64 s[30:31]
14684;
14685; GFX10-LABEL: flat_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
14686; GFX10:       ; %bb.0:
14687; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
14688; GFX10-NEXT:    v_add_co_u32 v3, vcc_lo, 0x7fc, v0
14689; GFX10-NEXT:    v_add_co_ci_u32_e32 v4, vcc_lo, 0, v1, vcc_lo
14690; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
14691; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
14692; GFX10-NEXT:    s_mov_b32 s5, 0
14693; GFX10-NEXT:    flat_load_dword v0, v[3:4]
14694; GFX10-NEXT:  .LBB55_1: ; %atomicrmw.start
14695; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
14696; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
14697; GFX10-NEXT:    v_mov_b32_e32 v6, v0
14698; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v6
14699; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
14700; GFX10-NEXT:    v_min_f32_e32 v0, v0, v1
14701; GFX10-NEXT:    v_min_f32_e32 v5, v5, v2
14702; GFX10-NEXT:    v_bfe_u32 v7, v0, 16, 1
14703; GFX10-NEXT:    v_bfe_u32 v8, v5, 16, 1
14704; GFX10-NEXT:    v_or_b32_e32 v9, 0x400000, v0
14705; GFX10-NEXT:    v_or_b32_e32 v10, 0x400000, v5
14706; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
14707; GFX10-NEXT:    v_add3_u32 v7, v7, v0, 0x7fff
14708; GFX10-NEXT:    v_add3_u32 v8, v8, v5, 0x7fff
14709; GFX10-NEXT:    v_cmp_u_f32_e64 s4, v0, v0
14710; GFX10-NEXT:    v_cndmask_b32_e32 v5, v8, v10, vcc_lo
14711; GFX10-NEXT:    v_cndmask_b32_e64 v0, v7, v9, s4
14712; GFX10-NEXT:    v_perm_b32 v5, v5, v0, 0x7060302
14713; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
14714; GFX10-NEXT:    flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
14715; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
14716; GFX10-NEXT:    buffer_gl1_inv
14717; GFX10-NEXT:    buffer_gl0_inv
14718; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v6
14719; GFX10-NEXT:    s_or_b32 s5, vcc_lo, s5
14720; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s5
14721; GFX10-NEXT:    s_cbranch_execnz .LBB55_1
14722; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
14723; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s5
14724; GFX10-NEXT:    s_setpc_b64 s[30:31]
14725;
14726; GFX90A-LABEL: flat_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
14727; GFX90A:       ; %bb.0:
14728; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
14729; GFX90A-NEXT:    flat_load_dword v3, v[0:1] offset:2044
14730; GFX90A-NEXT:    s_mov_b64 s[6:7], 0
14731; GFX90A-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
14732; GFX90A-NEXT:    s_movk_i32 s8, 0x7fff
14733; GFX90A-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
14734; GFX90A-NEXT:    s_mov_b32 s9, 0x7060302
14735; GFX90A-NEXT:  .LBB55_1: ; %atomicrmw.start
14736; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
14737; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
14738; GFX90A-NEXT:    v_mov_b32_e32 v7, v3
14739; GFX90A-NEXT:    v_lshlrev_b32_e32 v3, 16, v7
14740; GFX90A-NEXT:    v_and_b32_e32 v5, 0xffff0000, v7
14741; GFX90A-NEXT:    v_min_f32_e32 v3, v3, v4
14742; GFX90A-NEXT:    v_min_f32_e32 v5, v5, v2
14743; GFX90A-NEXT:    v_bfe_u32 v6, v3, 16, 1
14744; GFX90A-NEXT:    v_bfe_u32 v9, v5, 16, 1
14745; GFX90A-NEXT:    v_or_b32_e32 v8, 0x400000, v3
14746; GFX90A-NEXT:    v_or_b32_e32 v10, 0x400000, v5
14747; GFX90A-NEXT:    v_add3_u32 v6, v6, v3, s8
14748; GFX90A-NEXT:    v_add3_u32 v9, v9, v5, s8
14749; GFX90A-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
14750; GFX90A-NEXT:    v_cmp_u_f32_e64 s[4:5], v3, v3
14751; GFX90A-NEXT:    v_cndmask_b32_e64 v3, v6, v8, s[4:5]
14752; GFX90A-NEXT:    v_cndmask_b32_e32 v5, v9, v10, vcc
14753; GFX90A-NEXT:    v_perm_b32 v6, v5, v3, s9
14754; GFX90A-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[6:7] offset:2044 glc
14755; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
14756; GFX90A-NEXT:    buffer_wbinvl1
14757; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v7
14758; GFX90A-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
14759; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[6:7]
14760; GFX90A-NEXT:    s_cbranch_execnz .LBB55_1
14761; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
14762; GFX90A-NEXT:    s_or_b64 exec, exec, s[6:7]
14763; GFX90A-NEXT:    v_mov_b32_e32 v0, v3
14764; GFX90A-NEXT:    s_setpc_b64 s[30:31]
14765;
14766; GFX908-LABEL: flat_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
14767; GFX908:       ; %bb.0:
14768; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
14769; GFX908-NEXT:    flat_load_dword v3, v[0:1] offset:2044
14770; GFX908-NEXT:    s_mov_b64 s[6:7], 0
14771; GFX908-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
14772; GFX908-NEXT:    s_movk_i32 s8, 0x7fff
14773; GFX908-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
14774; GFX908-NEXT:    s_mov_b32 s9, 0x7060302
14775; GFX908-NEXT:  .LBB55_1: ; %atomicrmw.start
14776; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
14777; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
14778; GFX908-NEXT:    v_mov_b32_e32 v6, v3
14779; GFX908-NEXT:    v_lshlrev_b32_e32 v3, 16, v6
14780; GFX908-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
14781; GFX908-NEXT:    v_min_f32_e32 v3, v3, v4
14782; GFX908-NEXT:    v_min_f32_e32 v5, v5, v2
14783; GFX908-NEXT:    v_bfe_u32 v7, v3, 16, 1
14784; GFX908-NEXT:    v_bfe_u32 v9, v5, 16, 1
14785; GFX908-NEXT:    v_or_b32_e32 v8, 0x400000, v3
14786; GFX908-NEXT:    v_or_b32_e32 v10, 0x400000, v5
14787; GFX908-NEXT:    v_add3_u32 v7, v7, v3, s8
14788; GFX908-NEXT:    v_add3_u32 v9, v9, v5, s8
14789; GFX908-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
14790; GFX908-NEXT:    v_cmp_u_f32_e64 s[4:5], v3, v3
14791; GFX908-NEXT:    v_cndmask_b32_e64 v3, v7, v8, s[4:5]
14792; GFX908-NEXT:    v_cndmask_b32_e32 v5, v9, v10, vcc
14793; GFX908-NEXT:    v_perm_b32 v5, v5, v3, s9
14794; GFX908-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[5:6] offset:2044 glc
14795; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
14796; GFX908-NEXT:    buffer_wbinvl1
14797; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v6
14798; GFX908-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
14799; GFX908-NEXT:    s_andn2_b64 exec, exec, s[6:7]
14800; GFX908-NEXT:    s_cbranch_execnz .LBB55_1
14801; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
14802; GFX908-NEXT:    s_or_b64 exec, exec, s[6:7]
14803; GFX908-NEXT:    v_mov_b32_e32 v0, v3
14804; GFX908-NEXT:    s_setpc_b64 s[30:31]
14805;
14806; GFX8-LABEL: flat_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
14807; GFX8:       ; %bb.0:
14808; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
14809; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x7fc, v0
14810; GFX8-NEXT:    v_addc_u32_e32 v4, vcc, 0, v1, vcc
14811; GFX8-NEXT:    flat_load_dword v0, v[3:4]
14812; GFX8-NEXT:    s_mov_b64 s[6:7], 0
14813; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
14814; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
14815; GFX8-NEXT:  .LBB55_1: ; %atomicrmw.start
14816; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
14817; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
14818; GFX8-NEXT:    v_mov_b32_e32 v6, v0
14819; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v6
14820; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
14821; GFX8-NEXT:    v_min_f32_e32 v0, v0, v1
14822; GFX8-NEXT:    v_min_f32_e32 v5, v5, v2
14823; GFX8-NEXT:    v_bfe_u32 v7, v0, 16, 1
14824; GFX8-NEXT:    v_bfe_u32 v9, v5, 16, 1
14825; GFX8-NEXT:    v_add_u32_e32 v7, vcc, v7, v0
14826; GFX8-NEXT:    v_add_u32_e32 v9, vcc, v9, v5
14827; GFX8-NEXT:    v_add_u32_e32 v7, vcc, 0x7fff, v7
14828; GFX8-NEXT:    v_add_u32_e32 v9, vcc, 0x7fff, v9
14829; GFX8-NEXT:    v_or_b32_e32 v10, 0x400000, v5
14830; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
14831; GFX8-NEXT:    v_or_b32_e32 v8, 0x400000, v0
14832; GFX8-NEXT:    v_cmp_u_f32_e64 s[4:5], v0, v0
14833; GFX8-NEXT:    v_cndmask_b32_e32 v5, v9, v10, vcc
14834; GFX8-NEXT:    v_cndmask_b32_e64 v0, v7, v8, s[4:5]
14835; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
14836; GFX8-NEXT:    v_alignbit_b32 v5, v5, v0, 16
14837; GFX8-NEXT:    flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
14838; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
14839; GFX8-NEXT:    buffer_wbinvl1
14840; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v6
14841; GFX8-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
14842; GFX8-NEXT:    s_andn2_b64 exec, exec, s[6:7]
14843; GFX8-NEXT:    s_cbranch_execnz .LBB55_1
14844; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
14845; GFX8-NEXT:    s_or_b64 exec, exec, s[6:7]
14846; GFX8-NEXT:    s_setpc_b64 s[30:31]
14847;
14848; GFX7-LABEL: flat_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
14849; GFX7:       ; %bb.0:
14850; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
14851; GFX7-NEXT:    v_add_i32_e32 v4, vcc, 0x7fc, v0
14852; GFX7-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
14853; GFX7-NEXT:    flat_load_dword v0, v[4:5]
14854; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v3
14855; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v2
14856; GFX7-NEXT:    s_mov_b64 s[4:5], 0
14857; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v1
14858; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
14859; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
14860; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v0
14861; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
14862; GFX7-NEXT:  .LBB55_1: ; %atomicrmw.start
14863; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
14864; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
14865; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
14866; GFX7-NEXT:    v_and_b32_e32 v6, 0xffff0000, v1
14867; GFX7-NEXT:    v_and_b32_e32 v7, 0xffff0000, v0
14868; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
14869; GFX7-NEXT:    v_min_f32_e32 v6, v6, v2
14870; GFX7-NEXT:    v_min_f32_e32 v7, v7, v3
14871; GFX7-NEXT:    v_alignbit_b32 v1, v1, v0, 16
14872; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 16, v6
14873; GFX7-NEXT:    v_alignbit_b32 v0, v0, v7, 16
14874; GFX7-NEXT:    flat_atomic_cmpswap v0, v[4:5], v[0:1] glc
14875; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
14876; GFX7-NEXT:    buffer_wbinvl1
14877; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
14878; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v0
14879; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
14880; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
14881; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
14882; GFX7-NEXT:    s_cbranch_execnz .LBB55_1
14883; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
14884; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
14885; GFX7-NEXT:    s_setpc_b64 s[30:31]
14886  %gep = getelementptr <2 x bfloat>, ptr %ptr, i64 511
14887  %result = atomicrmw fmin ptr %gep, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
14888  ret <2 x bfloat> %result
14889}
14890
; Value-returning <2 x bfloat> atomicrmw fmin at agent scope through a flat
; pointer whose address is %ptr displaced by -512 <2 x bfloat> elements
; (-2048 bytes; this shows up as offset:-2048 or 0xfffff800 in the checks).
; In the expected output below every run lowers the operation to an
; %atomicrmw.start loop around a flat cmpswap. Only the gfx1200 run encodes the
; negative displacement in the instruction offset field; the other runs first
; compute the address with explicit adds.
; The check lines are autogenerated (see the NOTE on the first line of the
; file); regenerate them with utils/update_llc_test_checks.py instead of
; editing them by hand.
14891define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr %ptr, <2 x bfloat> %val) #0 {
14892; GFX12-LABEL: flat_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
14893; GFX12:       ; %bb.0:
14894; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
14895; GFX12-NEXT:    s_wait_expcnt 0x0
14896; GFX12-NEXT:    s_wait_samplecnt 0x0
14897; GFX12-NEXT:    s_wait_bvhcnt 0x0
14898; GFX12-NEXT:    s_wait_kmcnt 0x0
14899; GFX12-NEXT:    flat_load_b32 v3, v[0:1] offset:-2048
14900; GFX12-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
14901; GFX12-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
14902; GFX12-NEXT:    s_mov_b32 s1, 0
14903; GFX12-NEXT:  .LBB56_1: ; %atomicrmw.start
14904; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
14905; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
14906; GFX12-NEXT:    v_mov_b32_e32 v6, v3
14907; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
14908; GFX12-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
14909; GFX12-NEXT:    v_min_num_f32_e32 v5, v5, v2
14910; GFX12-NEXT:    v_lshlrev_b32_e32 v3, 16, v6
14911; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
14912; GFX12-NEXT:    v_bfe_u32 v8, v5, 16, 1
14913; GFX12-NEXT:    v_min_num_f32_e32 v3, v3, v4
14914; GFX12-NEXT:    v_or_b32_e32 v10, 0x400000, v5
14915; GFX12-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
14916; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
14917; GFX12-NEXT:    v_add3_u32 v8, v8, v5, 0x7fff
14918; GFX12-NEXT:    v_bfe_u32 v7, v3, 16, 1
14919; GFX12-NEXT:    v_or_b32_e32 v9, 0x400000, v3
14920; GFX12-NEXT:    v_cmp_u_f32_e64 s0, v3, v3
14921; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
14922; GFX12-NEXT:    v_cndmask_b32_e32 v5, v8, v10, vcc_lo
14923; GFX12-NEXT:    v_add3_u32 v7, v7, v3, 0x7fff
14924; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
14925; GFX12-NEXT:    v_cndmask_b32_e64 v3, v7, v9, s0
14926; GFX12-NEXT:    v_perm_b32 v5, v5, v3, 0x7060302
14927; GFX12-NEXT:    s_wait_storecnt 0x0
14928; GFX12-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
14929; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
14930; GFX12-NEXT:    global_inv scope:SCOPE_DEV
14931; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v6
14932; GFX12-NEXT:    s_wait_alu 0xfffe
14933; GFX12-NEXT:    s_or_b32 s1, vcc_lo, s1
14934; GFX12-NEXT:    s_wait_alu 0xfffe
14935; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
14936; GFX12-NEXT:    s_cbranch_execnz .LBB56_1
14937; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
14938; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s1
14939; GFX12-NEXT:    v_mov_b32_e32 v0, v3
14940; GFX12-NEXT:    s_wait_alu 0xfffe
14941; GFX12-NEXT:    s_setpc_b64 s[30:31]
14942;
14943; GFX940-LABEL: flat_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
14944; GFX940:       ; %bb.0:
14945; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
14946; GFX940-NEXT:    v_mov_b32_e32 v4, v0
14947; GFX940-NEXT:    v_mov_b32_e32 v5, v1
14948; GFX940-NEXT:    v_add_co_u32_e32 v0, vcc, 0xfffff800, v4
14949; GFX940-NEXT:    s_movk_i32 s0, 0xf800
14950; GFX940-NEXT:    s_nop 0
14951; GFX940-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v5, vcc
14952; GFX940-NEXT:    flat_load_dword v0, v[0:1]
14953; GFX940-NEXT:    s_mov_b32 s1, -1
14954; GFX940-NEXT:    v_lshl_add_u64 v[4:5], v[4:5], 0, s[0:1]
14955; GFX940-NEXT:    s_mov_b64 s[2:3], 0
14956; GFX940-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
14957; GFX940-NEXT:    s_movk_i32 s4, 0x7fff
14958; GFX940-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
14959; GFX940-NEXT:    s_mov_b32 s5, 0x7060302
14960; GFX940-NEXT:  .LBB56_1: ; %atomicrmw.start
14961; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
14962; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
14963; GFX940-NEXT:    v_mov_b32_e32 v7, v0
14964; GFX940-NEXT:    v_lshlrev_b32_e32 v0, 16, v7
14965; GFX940-NEXT:    v_and_b32_e32 v3, 0xffff0000, v7
14966; GFX940-NEXT:    v_min_f32_e32 v0, v0, v1
14967; GFX940-NEXT:    v_min_f32_e32 v3, v3, v2
14968; GFX940-NEXT:    v_bfe_u32 v6, v0, 16, 1
14969; GFX940-NEXT:    v_bfe_u32 v9, v3, 16, 1
14970; GFX940-NEXT:    v_or_b32_e32 v8, 0x400000, v0
14971; GFX940-NEXT:    v_or_b32_e32 v10, 0x400000, v3
14972; GFX940-NEXT:    v_add3_u32 v6, v6, v0, s4
14973; GFX940-NEXT:    v_add3_u32 v9, v9, v3, s4
14974; GFX940-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
14975; GFX940-NEXT:    v_cmp_u_f32_e64 s[0:1], v0, v0
14976; GFX940-NEXT:    s_nop 0
14977; GFX940-NEXT:    v_cndmask_b32_e32 v3, v9, v10, vcc
14978; GFX940-NEXT:    v_cndmask_b32_e64 v0, v6, v8, s[0:1]
14979; GFX940-NEXT:    v_perm_b32 v6, v3, v0, s5
14980; GFX940-NEXT:    buffer_wbl2 sc1
14981; GFX940-NEXT:    flat_atomic_cmpswap v0, v[4:5], v[6:7] sc0
14982; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
14983; GFX940-NEXT:    buffer_inv sc1
14984; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v7
14985; GFX940-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
14986; GFX940-NEXT:    s_andn2_b64 exec, exec, s[2:3]
14987; GFX940-NEXT:    s_cbranch_execnz .LBB56_1
14988; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
14989; GFX940-NEXT:    s_or_b64 exec, exec, s[2:3]
14990; GFX940-NEXT:    s_setpc_b64 s[30:31]
14991;
14992; GFX11-LABEL: flat_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
14993; GFX11:       ; %bb.0:
14994; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
14995; GFX11-NEXT:    v_mov_b32_e32 v3, v0
14996; GFX11-NEXT:    s_mov_b32 s1, 0
14997; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
14998; GFX11-NEXT:    v_add_co_u32 v4, vcc_lo, 0xfffff800, v3
14999; GFX11-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, -1, v1, vcc_lo
15000; GFX11-NEXT:    v_add_co_u32 v3, vcc_lo, 0xfffff800, v3
15001; GFX11-NEXT:    flat_load_b32 v0, v[4:5]
15002; GFX11-NEXT:    v_add_co_ci_u32_e32 v4, vcc_lo, -1, v1, vcc_lo
15003; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
15004; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
15005; GFX11-NEXT:    s_set_inst_prefetch_distance 0x1
15006; GFX11-NEXT:    .p2align 6
15007; GFX11-NEXT:  .LBB56_1: ; %atomicrmw.start
15008; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
15009; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
15010; GFX11-NEXT:    v_mov_b32_e32 v6, v0
15011; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
15012; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
15013; GFX11-NEXT:    v_min_f32_e32 v5, v5, v2
15014; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v6
15015; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
15016; GFX11-NEXT:    v_bfe_u32 v8, v5, 16, 1
15017; GFX11-NEXT:    v_min_f32_e32 v0, v0, v1
15018; GFX11-NEXT:    v_or_b32_e32 v10, 0x400000, v5
15019; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
15020; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
15021; GFX11-NEXT:    v_add3_u32 v8, v8, v5, 0x7fff
15022; GFX11-NEXT:    v_bfe_u32 v7, v0, 16, 1
15023; GFX11-NEXT:    v_or_b32_e32 v9, 0x400000, v0
15024; GFX11-NEXT:    v_cmp_u_f32_e64 s0, v0, v0
15025; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
15026; GFX11-NEXT:    v_cndmask_b32_e32 v5, v8, v10, vcc_lo
15027; GFX11-NEXT:    v_add3_u32 v7, v7, v0, 0x7fff
15028; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
15029; GFX11-NEXT:    v_cndmask_b32_e64 v0, v7, v9, s0
15030; GFX11-NEXT:    v_perm_b32 v5, v5, v0, 0x7060302
15031; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
15032; GFX11-NEXT:    flat_atomic_cmpswap_b32 v0, v[3:4], v[5:6] glc
15033; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
15034; GFX11-NEXT:    buffer_gl1_inv
15035; GFX11-NEXT:    buffer_gl0_inv
15036; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v6
15037; GFX11-NEXT:    s_or_b32 s1, vcc_lo, s1
15038; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
15039; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
15040; GFX11-NEXT:    s_cbranch_execnz .LBB56_1
15041; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
15042; GFX11-NEXT:    s_set_inst_prefetch_distance 0x2
15043; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s1
15044; GFX11-NEXT:    s_setpc_b64 s[30:31]
15045;
15046; GFX10-LABEL: flat_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
15047; GFX10:       ; %bb.0:
15048; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
15049; GFX10-NEXT:    v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
15050; GFX10-NEXT:    v_add_co_ci_u32_e32 v4, vcc_lo, -1, v1, vcc_lo
15051; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
15052; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
15053; GFX10-NEXT:    s_mov_b32 s5, 0
15054; GFX10-NEXT:    flat_load_dword v0, v[3:4]
15055; GFX10-NEXT:  .LBB56_1: ; %atomicrmw.start
15056; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
15057; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
15058; GFX10-NEXT:    v_mov_b32_e32 v6, v0
15059; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v6
15060; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
15061; GFX10-NEXT:    v_min_f32_e32 v0, v0, v1
15062; GFX10-NEXT:    v_min_f32_e32 v5, v5, v2
15063; GFX10-NEXT:    v_bfe_u32 v7, v0, 16, 1
15064; GFX10-NEXT:    v_bfe_u32 v8, v5, 16, 1
15065; GFX10-NEXT:    v_or_b32_e32 v9, 0x400000, v0
15066; GFX10-NEXT:    v_or_b32_e32 v10, 0x400000, v5
15067; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
15068; GFX10-NEXT:    v_add3_u32 v7, v7, v0, 0x7fff
15069; GFX10-NEXT:    v_add3_u32 v8, v8, v5, 0x7fff
15070; GFX10-NEXT:    v_cmp_u_f32_e64 s4, v0, v0
15071; GFX10-NEXT:    v_cndmask_b32_e32 v5, v8, v10, vcc_lo
15072; GFX10-NEXT:    v_cndmask_b32_e64 v0, v7, v9, s4
15073; GFX10-NEXT:    v_perm_b32 v5, v5, v0, 0x7060302
15074; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
15075; GFX10-NEXT:    flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
15076; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
15077; GFX10-NEXT:    buffer_gl1_inv
15078; GFX10-NEXT:    buffer_gl0_inv
15079; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v6
15080; GFX10-NEXT:    s_or_b32 s5, vcc_lo, s5
15081; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s5
15082; GFX10-NEXT:    s_cbranch_execnz .LBB56_1
15083; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
15084; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s5
15085; GFX10-NEXT:    s_setpc_b64 s[30:31]
15086;
15087; GFX90A-LABEL: flat_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
15088; GFX90A:       ; %bb.0:
15089; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
15090; GFX90A-NEXT:    v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
15091; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, -1, v1, vcc
15092; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
15093; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
15094; GFX90A-NEXT:    flat_load_dword v0, v[0:1]
15095; GFX90A-NEXT:    s_mov_b64 s[6:7], 0
15096; GFX90A-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
15097; GFX90A-NEXT:    s_movk_i32 s8, 0x7fff
15098; GFX90A-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
15099; GFX90A-NEXT:    s_mov_b32 s9, 0x7060302
15100; GFX90A-NEXT:  .LBB56_1: ; %atomicrmw.start
15101; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
15102; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
15103; GFX90A-NEXT:    v_mov_b32_e32 v7, v0
15104; GFX90A-NEXT:    v_lshlrev_b32_e32 v0, 16, v7
15105; GFX90A-NEXT:    v_and_b32_e32 v3, 0xffff0000, v7
15106; GFX90A-NEXT:    v_min_f32_e32 v0, v0, v1
15107; GFX90A-NEXT:    v_min_f32_e32 v3, v3, v2
15108; GFX90A-NEXT:    v_bfe_u32 v6, v0, 16, 1
15109; GFX90A-NEXT:    v_bfe_u32 v9, v3, 16, 1
15110; GFX90A-NEXT:    v_or_b32_e32 v8, 0x400000, v0
15111; GFX90A-NEXT:    v_or_b32_e32 v10, 0x400000, v3
15112; GFX90A-NEXT:    v_add3_u32 v6, v6, v0, s8
15113; GFX90A-NEXT:    v_add3_u32 v9, v9, v3, s8
15114; GFX90A-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
15115; GFX90A-NEXT:    v_cmp_u_f32_e64 s[4:5], v0, v0
15116; GFX90A-NEXT:    v_cndmask_b32_e64 v0, v6, v8, s[4:5]
15117; GFX90A-NEXT:    v_cndmask_b32_e32 v3, v9, v10, vcc
15118; GFX90A-NEXT:    v_perm_b32 v6, v3, v0, s9
15119; GFX90A-NEXT:    flat_atomic_cmpswap v0, v[4:5], v[6:7] glc
15120; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
15121; GFX90A-NEXT:    buffer_wbinvl1
15122; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v7
15123; GFX90A-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
15124; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[6:7]
15125; GFX90A-NEXT:    s_cbranch_execnz .LBB56_1
15126; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
15127; GFX90A-NEXT:    s_or_b64 exec, exec, s[6:7]
15128; GFX90A-NEXT:    s_setpc_b64 s[30:31]
15129;
15130; GFX908-LABEL: flat_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
15131; GFX908:       ; %bb.0:
15132; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
15133; GFX908-NEXT:    v_add_co_u32_e32 v3, vcc, 0xfffff800, v0
15134; GFX908-NEXT:    v_addc_co_u32_e32 v4, vcc, -1, v1, vcc
15135; GFX908-NEXT:    v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
15136; GFX908-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
15137; GFX908-NEXT:    flat_load_dword v0, v[0:1]
15138; GFX908-NEXT:    s_mov_b64 s[6:7], 0
15139; GFX908-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
15140; GFX908-NEXT:    s_movk_i32 s8, 0x7fff
15141; GFX908-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
15142; GFX908-NEXT:    s_mov_b32 s9, 0x7060302
15143; GFX908-NEXT:  .LBB56_1: ; %atomicrmw.start
15144; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
15145; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
15146; GFX908-NEXT:    v_mov_b32_e32 v6, v0
15147; GFX908-NEXT:    v_lshlrev_b32_e32 v0, 16, v6
15148; GFX908-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
15149; GFX908-NEXT:    v_min_f32_e32 v0, v0, v1
15150; GFX908-NEXT:    v_min_f32_e32 v5, v5, v2
15151; GFX908-NEXT:    v_bfe_u32 v7, v0, 16, 1
15152; GFX908-NEXT:    v_bfe_u32 v9, v5, 16, 1
15153; GFX908-NEXT:    v_or_b32_e32 v8, 0x400000, v0
15154; GFX908-NEXT:    v_or_b32_e32 v10, 0x400000, v5
15155; GFX908-NEXT:    v_add3_u32 v7, v7, v0, s8
15156; GFX908-NEXT:    v_add3_u32 v9, v9, v5, s8
15157; GFX908-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
15158; GFX908-NEXT:    v_cmp_u_f32_e64 s[4:5], v0, v0
15159; GFX908-NEXT:    v_cndmask_b32_e64 v0, v7, v8, s[4:5]
15160; GFX908-NEXT:    v_cndmask_b32_e32 v5, v9, v10, vcc
15161; GFX908-NEXT:    v_perm_b32 v5, v5, v0, s9
15162; GFX908-NEXT:    flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
15163; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
15164; GFX908-NEXT:    buffer_wbinvl1
15165; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v6
15166; GFX908-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
15167; GFX908-NEXT:    s_andn2_b64 exec, exec, s[6:7]
15168; GFX908-NEXT:    s_cbranch_execnz .LBB56_1
15169; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
15170; GFX908-NEXT:    s_or_b64 exec, exec, s[6:7]
15171; GFX908-NEXT:    s_setpc_b64 s[30:31]
15172;
15173; GFX8-LABEL: flat_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
15174; GFX8:       ; %bb.0:
15175; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
15176; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0xfffff800, v0
15177; GFX8-NEXT:    v_addc_u32_e32 v4, vcc, -1, v1, vcc
15178; GFX8-NEXT:    flat_load_dword v0, v[3:4]
15179; GFX8-NEXT:    s_mov_b64 s[6:7], 0
15180; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
15181; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
15182; GFX8-NEXT:  .LBB56_1: ; %atomicrmw.start
15183; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
15184; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
15185; GFX8-NEXT:    v_mov_b32_e32 v6, v0
15186; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v6
15187; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
15188; GFX8-NEXT:    v_min_f32_e32 v0, v0, v1
15189; GFX8-NEXT:    v_min_f32_e32 v5, v5, v2
15190; GFX8-NEXT:    v_bfe_u32 v7, v0, 16, 1
15191; GFX8-NEXT:    v_bfe_u32 v9, v5, 16, 1
15192; GFX8-NEXT:    v_add_u32_e32 v7, vcc, v7, v0
15193; GFX8-NEXT:    v_add_u32_e32 v9, vcc, v9, v5
15194; GFX8-NEXT:    v_add_u32_e32 v7, vcc, 0x7fff, v7
15195; GFX8-NEXT:    v_add_u32_e32 v9, vcc, 0x7fff, v9
15196; GFX8-NEXT:    v_or_b32_e32 v10, 0x400000, v5
15197; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
15198; GFX8-NEXT:    v_or_b32_e32 v8, 0x400000, v0
15199; GFX8-NEXT:    v_cmp_u_f32_e64 s[4:5], v0, v0
15200; GFX8-NEXT:    v_cndmask_b32_e32 v5, v9, v10, vcc
15201; GFX8-NEXT:    v_cndmask_b32_e64 v0, v7, v8, s[4:5]
15202; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
15203; GFX8-NEXT:    v_alignbit_b32 v5, v5, v0, 16
15204; GFX8-NEXT:    flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
15205; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
15206; GFX8-NEXT:    buffer_wbinvl1
15207; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v6
15208; GFX8-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
15209; GFX8-NEXT:    s_andn2_b64 exec, exec, s[6:7]
15210; GFX8-NEXT:    s_cbranch_execnz .LBB56_1
15211; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
15212; GFX8-NEXT:    s_or_b64 exec, exec, s[6:7]
15213; GFX8-NEXT:    s_setpc_b64 s[30:31]
15214;
15215; GFX7-LABEL: flat_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
15216; GFX7:       ; %bb.0:
15217; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
15218; GFX7-NEXT:    v_add_i32_e32 v4, vcc, 0xfffff800, v0
15219; GFX7-NEXT:    v_addc_u32_e32 v5, vcc, -1, v1, vcc
15220; GFX7-NEXT:    flat_load_dword v0, v[4:5]
15221; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v3
15222; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v2
15223; GFX7-NEXT:    s_mov_b64 s[4:5], 0
15224; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v1
15225; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
15226; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
15227; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v0
15228; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
15229; GFX7-NEXT:  .LBB56_1: ; %atomicrmw.start
15230; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
15231; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
15232; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
15233; GFX7-NEXT:    v_and_b32_e32 v6, 0xffff0000, v1
15234; GFX7-NEXT:    v_and_b32_e32 v7, 0xffff0000, v0
15235; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
15236; GFX7-NEXT:    v_min_f32_e32 v6, v6, v2
15237; GFX7-NEXT:    v_min_f32_e32 v7, v7, v3
15238; GFX7-NEXT:    v_alignbit_b32 v1, v1, v0, 16
15239; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 16, v6
15240; GFX7-NEXT:    v_alignbit_b32 v0, v0, v7, 16
15241; GFX7-NEXT:    flat_atomic_cmpswap v0, v[4:5], v[0:1] glc
15242; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
15243; GFX7-NEXT:    buffer_wbinvl1
15244; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
15245; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v0
15246; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
15247; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
15248; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
15249; GFX7-NEXT:    s_cbranch_execnz .LBB56_1
15250; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
15251; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
15252; GFX7-NEXT:    s_setpc_b64 s[30:31]
  ; Test input: address %ptr by -512 <2 x bfloat> elements, then perform the
  ; agent-scope seq_cst fmin tagged with !amdgpu.no.fine.grained.memory and
  ; return the atomicrmw result.
15253  %gep = getelementptr <2 x bfloat>, ptr %ptr, i64 -512
15254  %result = atomicrmw fmin ptr %gep, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
15255  ret <2 x bfloat> %result
15256}
15257
; Test case: atomicrmw fmin on <2 x bfloat> through a generic (flat) pointer,
; syncscope("agent"), seq_cst, no address offset, result unused, with
; !amdgpu.no.fine.grained.memory attached.
;
; For every RUN target the expected output below is a compare-and-swap retry
; loop (flat_atomic_cmpswap / flat_atomic_cmpswap_b32) rather than a single
; atomic min instruction: each 16-bit half of the loaded dword is moved into
; the high half of a 32-bit register (shift left by 16 / mask 0xffff0000),
; combined with v_min_f32 (v_min_num_f32 on gfx1200), repacked into one dword,
; and the swap is retried until the returned value equals the expected one.
; All targets except gfx7 (hawaii) show a bfe / add 0x7fff / or 0x400000 /
; cndmask sequence before repacking; the gfx7 checks repack with
; lshrrev + alignbit only.
; NOTE(review): that sequence presumably implements round-to-nearest-even with
; NaN quieting for the f32 -> bf16 conversion - confirm against the backend.
;
; The check lines are autogenerated by utils/update_llc_test_checks.py;
; regenerate them with that script instead of editing them by hand.
15258define void @flat_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory(ptr %ptr, <2 x bfloat> %val) #0 {
15259; GFX12-LABEL: flat_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory:
15260; GFX12:       ; %bb.0:
15261; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
15262; GFX12-NEXT:    s_wait_expcnt 0x0
15263; GFX12-NEXT:    s_wait_samplecnt 0x0
15264; GFX12-NEXT:    s_wait_bvhcnt 0x0
15265; GFX12-NEXT:    s_wait_kmcnt 0x0
15266; GFX12-NEXT:    flat_load_b32 v3, v[0:1]
15267; GFX12-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
15268; GFX12-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
15269; GFX12-NEXT:    s_mov_b32 s1, 0
15270; GFX12-NEXT:  .LBB57_1: ; %atomicrmw.start
15271; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
15272; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
15273; GFX12-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
15274; GFX12-NEXT:    v_and_b32_e32 v6, 0xffff0000, v3
15275; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
15276; GFX12-NEXT:    v_min_num_f32_e32 v2, v2, v4
15277; GFX12-NEXT:    v_min_num_f32_e32 v6, v6, v5
15278; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
15279; GFX12-NEXT:    v_bfe_u32 v7, v2, 16, 1
15280; GFX12-NEXT:    v_bfe_u32 v8, v6, 16, 1
15281; GFX12-NEXT:    v_or_b32_e32 v9, 0x400000, v2
15282; GFX12-NEXT:    v_or_b32_e32 v10, 0x400000, v6
15283; GFX12-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
15284; GFX12-NEXT:    v_add3_u32 v7, v7, v2, 0x7fff
15285; GFX12-NEXT:    v_add3_u32 v8, v8, v6, 0x7fff
15286; GFX12-NEXT:    v_cmp_u_f32_e64 s0, v2, v2
15287; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
15288; GFX12-NEXT:    v_cndmask_b32_e32 v6, v8, v10, vcc_lo
15289; GFX12-NEXT:    v_cndmask_b32_e64 v2, v7, v9, s0
15290; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
15291; GFX12-NEXT:    v_perm_b32 v2, v6, v2, 0x7060302
15292; GFX12-NEXT:    s_wait_storecnt 0x0
15293; GFX12-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
15294; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
15295; GFX12-NEXT:    global_inv scope:SCOPE_DEV
15296; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
15297; GFX12-NEXT:    v_mov_b32_e32 v3, v2
15298; GFX12-NEXT:    s_wait_alu 0xfffe
15299; GFX12-NEXT:    s_or_b32 s1, vcc_lo, s1
15300; GFX12-NEXT:    s_wait_alu 0xfffe
15301; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
15302; GFX12-NEXT:    s_cbranch_execnz .LBB57_1
15303; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
15304; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s1
15305; GFX12-NEXT:    s_wait_alu 0xfffe
15306; GFX12-NEXT:    s_setpc_b64 s[30:31]
15307;
15308; GFX940-LABEL: flat_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory:
15309; GFX940:       ; %bb.0:
15310; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
15311; GFX940-NEXT:    flat_load_dword v3, v[0:1]
15312; GFX940-NEXT:    s_mov_b64 s[2:3], 0
15313; GFX940-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
15314; GFX940-NEXT:    s_movk_i32 s4, 0x7fff
15315; GFX940-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
15316; GFX940-NEXT:    s_mov_b32 s5, 0x7060302
15317; GFX940-NEXT:  .LBB57_1: ; %atomicrmw.start
15318; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
15319; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
15320; GFX940-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
15321; GFX940-NEXT:    v_and_b32_e32 v6, 0xffff0000, v3
15322; GFX940-NEXT:    v_min_f32_e32 v2, v2, v4
15323; GFX940-NEXT:    v_min_f32_e32 v6, v6, v5
15324; GFX940-NEXT:    v_bfe_u32 v7, v2, 16, 1
15325; GFX940-NEXT:    v_bfe_u32 v9, v6, 16, 1
15326; GFX940-NEXT:    v_or_b32_e32 v8, 0x400000, v2
15327; GFX940-NEXT:    v_or_b32_e32 v10, 0x400000, v6
15328; GFX940-NEXT:    v_add3_u32 v7, v7, v2, s4
15329; GFX940-NEXT:    v_add3_u32 v9, v9, v6, s4
15330; GFX940-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
15331; GFX940-NEXT:    v_cmp_u_f32_e64 s[0:1], v2, v2
15332; GFX940-NEXT:    s_nop 0
15333; GFX940-NEXT:    v_cndmask_b32_e32 v6, v9, v10, vcc
15334; GFX940-NEXT:    v_cndmask_b32_e64 v2, v7, v8, s[0:1]
15335; GFX940-NEXT:    v_perm_b32 v2, v6, v2, s5
15336; GFX940-NEXT:    buffer_wbl2 sc1
15337; GFX940-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0
15338; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
15339; GFX940-NEXT:    buffer_inv sc1
15340; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
15341; GFX940-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
15342; GFX940-NEXT:    v_mov_b32_e32 v3, v2
15343; GFX940-NEXT:    s_andn2_b64 exec, exec, s[2:3]
15344; GFX940-NEXT:    s_cbranch_execnz .LBB57_1
15345; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
15346; GFX940-NEXT:    s_or_b64 exec, exec, s[2:3]
15347; GFX940-NEXT:    s_setpc_b64 s[30:31]
15348;
15349; GFX11-LABEL: flat_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory:
15350; GFX11:       ; %bb.0:
15351; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
15352; GFX11-NEXT:    flat_load_b32 v3, v[0:1]
15353; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
15354; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
15355; GFX11-NEXT:    s_mov_b32 s1, 0
15356; GFX11-NEXT:    s_set_inst_prefetch_distance 0x1
15357; GFX11-NEXT:    .p2align 6
15358; GFX11-NEXT:  .LBB57_1: ; %atomicrmw.start
15359; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
15360; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
15361; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
15362; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff0000, v3
15363; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
15364; GFX11-NEXT:    v_min_f32_e32 v2, v2, v4
15365; GFX11-NEXT:    v_min_f32_e32 v6, v6, v5
15366; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
15367; GFX11-NEXT:    v_bfe_u32 v7, v2, 16, 1
15368; GFX11-NEXT:    v_bfe_u32 v8, v6, 16, 1
15369; GFX11-NEXT:    v_or_b32_e32 v9, 0x400000, v2
15370; GFX11-NEXT:    v_or_b32_e32 v10, 0x400000, v6
15371; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
15372; GFX11-NEXT:    v_add3_u32 v7, v7, v2, 0x7fff
15373; GFX11-NEXT:    v_add3_u32 v8, v8, v6, 0x7fff
15374; GFX11-NEXT:    v_cmp_u_f32_e64 s0, v2, v2
15375; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
15376; GFX11-NEXT:    v_cndmask_b32_e32 v6, v8, v10, vcc_lo
15377; GFX11-NEXT:    v_cndmask_b32_e64 v2, v7, v9, s0
15378; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
15379; GFX11-NEXT:    v_perm_b32 v2, v6, v2, 0x7060302
15380; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
15381; GFX11-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc
15382; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
15383; GFX11-NEXT:    buffer_gl1_inv
15384; GFX11-NEXT:    buffer_gl0_inv
15385; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
15386; GFX11-NEXT:    v_mov_b32_e32 v3, v2
15387; GFX11-NEXT:    s_or_b32 s1, vcc_lo, s1
15388; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
15389; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
15390; GFX11-NEXT:    s_cbranch_execnz .LBB57_1
15391; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
15392; GFX11-NEXT:    s_set_inst_prefetch_distance 0x2
15393; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s1
15394; GFX11-NEXT:    s_setpc_b64 s[30:31]
15395;
15396; GFX10-LABEL: flat_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory:
15397; GFX10:       ; %bb.0:
15398; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
15399; GFX10-NEXT:    flat_load_dword v3, v[0:1]
15400; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
15401; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
15402; GFX10-NEXT:    s_mov_b32 s5, 0
15403; GFX10-NEXT:  .LBB57_1: ; %atomicrmw.start
15404; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
15405; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
15406; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
15407; GFX10-NEXT:    v_and_b32_e32 v6, 0xffff0000, v3
15408; GFX10-NEXT:    v_min_f32_e32 v2, v2, v4
15409; GFX10-NEXT:    v_min_f32_e32 v6, v6, v5
15410; GFX10-NEXT:    v_bfe_u32 v7, v2, 16, 1
15411; GFX10-NEXT:    v_bfe_u32 v8, v6, 16, 1
15412; GFX10-NEXT:    v_or_b32_e32 v9, 0x400000, v2
15413; GFX10-NEXT:    v_or_b32_e32 v10, 0x400000, v6
15414; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
15415; GFX10-NEXT:    v_add3_u32 v7, v7, v2, 0x7fff
15416; GFX10-NEXT:    v_add3_u32 v8, v8, v6, 0x7fff
15417; GFX10-NEXT:    v_cmp_u_f32_e64 s4, v2, v2
15418; GFX10-NEXT:    v_cndmask_b32_e32 v6, v8, v10, vcc_lo
15419; GFX10-NEXT:    v_cndmask_b32_e64 v2, v7, v9, s4
15420; GFX10-NEXT:    v_perm_b32 v2, v6, v2, 0x7060302
15421; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
15422; GFX10-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
15423; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
15424; GFX10-NEXT:    buffer_gl1_inv
15425; GFX10-NEXT:    buffer_gl0_inv
15426; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
15427; GFX10-NEXT:    v_mov_b32_e32 v3, v2
15428; GFX10-NEXT:    s_or_b32 s5, vcc_lo, s5
15429; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s5
15430; GFX10-NEXT:    s_cbranch_execnz .LBB57_1
15431; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
15432; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s5
15433; GFX10-NEXT:    s_setpc_b64 s[30:31]
15434;
15435; GFX90A-LABEL: flat_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory:
15436; GFX90A:       ; %bb.0:
15437; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
15438; GFX90A-NEXT:    flat_load_dword v3, v[0:1]
15439; GFX90A-NEXT:    s_mov_b64 s[6:7], 0
15440; GFX90A-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
15441; GFX90A-NEXT:    s_movk_i32 s8, 0x7fff
15442; GFX90A-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
15443; GFX90A-NEXT:    s_mov_b32 s9, 0x7060302
15444; GFX90A-NEXT:  .LBB57_1: ; %atomicrmw.start
15445; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
15446; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
15447; GFX90A-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
15448; GFX90A-NEXT:    v_and_b32_e32 v6, 0xffff0000, v3
15449; GFX90A-NEXT:    v_min_f32_e32 v2, v2, v4
15450; GFX90A-NEXT:    v_min_f32_e32 v6, v6, v5
15451; GFX90A-NEXT:    v_bfe_u32 v7, v2, 16, 1
15452; GFX90A-NEXT:    v_bfe_u32 v9, v6, 16, 1
15453; GFX90A-NEXT:    v_or_b32_e32 v8, 0x400000, v2
15454; GFX90A-NEXT:    v_or_b32_e32 v10, 0x400000, v6
15455; GFX90A-NEXT:    v_add3_u32 v7, v7, v2, s8
15456; GFX90A-NEXT:    v_add3_u32 v9, v9, v6, s8
15457; GFX90A-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
15458; GFX90A-NEXT:    v_cmp_u_f32_e64 s[4:5], v2, v2
15459; GFX90A-NEXT:    v_cndmask_b32_e64 v2, v7, v8, s[4:5]
15460; GFX90A-NEXT:    v_cndmask_b32_e32 v6, v9, v10, vcc
15461; GFX90A-NEXT:    v_perm_b32 v2, v6, v2, s9
15462; GFX90A-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
15463; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
15464; GFX90A-NEXT:    buffer_wbinvl1
15465; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
15466; GFX90A-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
15467; GFX90A-NEXT:    v_mov_b32_e32 v3, v2
15468; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[6:7]
15469; GFX90A-NEXT:    s_cbranch_execnz .LBB57_1
15470; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
15471; GFX90A-NEXT:    s_or_b64 exec, exec, s[6:7]
15472; GFX90A-NEXT:    s_setpc_b64 s[30:31]
15473;
15474; GFX908-LABEL: flat_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory:
15475; GFX908:       ; %bb.0:
15476; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
15477; GFX908-NEXT:    flat_load_dword v3, v[0:1]
15478; GFX908-NEXT:    s_mov_b64 s[6:7], 0
15479; GFX908-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
15480; GFX908-NEXT:    s_movk_i32 s8, 0x7fff
15481; GFX908-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
15482; GFX908-NEXT:    s_mov_b32 s9, 0x7060302
15483; GFX908-NEXT:  .LBB57_1: ; %atomicrmw.start
15484; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
15485; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
15486; GFX908-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
15487; GFX908-NEXT:    v_and_b32_e32 v6, 0xffff0000, v3
15488; GFX908-NEXT:    v_min_f32_e32 v2, v2, v4
15489; GFX908-NEXT:    v_min_f32_e32 v6, v6, v5
15490; GFX908-NEXT:    v_bfe_u32 v7, v2, 16, 1
15491; GFX908-NEXT:    v_bfe_u32 v9, v6, 16, 1
15492; GFX908-NEXT:    v_or_b32_e32 v8, 0x400000, v2
15493; GFX908-NEXT:    v_or_b32_e32 v10, 0x400000, v6
15494; GFX908-NEXT:    v_add3_u32 v7, v7, v2, s8
15495; GFX908-NEXT:    v_add3_u32 v9, v9, v6, s8
15496; GFX908-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
15497; GFX908-NEXT:    v_cmp_u_f32_e64 s[4:5], v2, v2
15498; GFX908-NEXT:    v_cndmask_b32_e64 v2, v7, v8, s[4:5]
15499; GFX908-NEXT:    v_cndmask_b32_e32 v6, v9, v10, vcc
15500; GFX908-NEXT:    v_perm_b32 v2, v6, v2, s9
15501; GFX908-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
15502; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
15503; GFX908-NEXT:    buffer_wbinvl1
15504; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
15505; GFX908-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
15506; GFX908-NEXT:    v_mov_b32_e32 v3, v2
15507; GFX908-NEXT:    s_andn2_b64 exec, exec, s[6:7]
15508; GFX908-NEXT:    s_cbranch_execnz .LBB57_1
15509; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
15510; GFX908-NEXT:    s_or_b64 exec, exec, s[6:7]
15511; GFX908-NEXT:    s_setpc_b64 s[30:31]
15512;
15513; GFX8-LABEL: flat_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory:
15514; GFX8:       ; %bb.0:
15515; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
15516; GFX8-NEXT:    flat_load_dword v3, v[0:1]
15517; GFX8-NEXT:    s_mov_b64 s[6:7], 0
15518; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
15519; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
15520; GFX8-NEXT:  .LBB57_1: ; %atomicrmw.start
15521; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
15522; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
15523; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
15524; GFX8-NEXT:    v_and_b32_e32 v6, 0xffff0000, v3
15525; GFX8-NEXT:    v_min_f32_e32 v2, v2, v4
15526; GFX8-NEXT:    v_min_f32_e32 v6, v6, v5
15527; GFX8-NEXT:    v_bfe_u32 v7, v2, 16, 1
15528; GFX8-NEXT:    v_bfe_u32 v9, v6, 16, 1
15529; GFX8-NEXT:    v_add_u32_e32 v7, vcc, v7, v2
15530; GFX8-NEXT:    v_add_u32_e32 v9, vcc, v9, v6
15531; GFX8-NEXT:    v_add_u32_e32 v7, vcc, 0x7fff, v7
15532; GFX8-NEXT:    v_add_u32_e32 v9, vcc, 0x7fff, v9
15533; GFX8-NEXT:    v_or_b32_e32 v10, 0x400000, v6
15534; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
15535; GFX8-NEXT:    v_or_b32_e32 v8, 0x400000, v2
15536; GFX8-NEXT:    v_cmp_u_f32_e64 s[4:5], v2, v2
15537; GFX8-NEXT:    v_cndmask_b32_e32 v6, v9, v10, vcc
15538; GFX8-NEXT:    v_cndmask_b32_e64 v2, v7, v8, s[4:5]
15539; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
15540; GFX8-NEXT:    v_alignbit_b32 v2, v6, v2, 16
15541; GFX8-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
15542; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
15543; GFX8-NEXT:    buffer_wbinvl1
15544; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
15545; GFX8-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
15546; GFX8-NEXT:    v_mov_b32_e32 v3, v2
15547; GFX8-NEXT:    s_andn2_b64 exec, exec, s[6:7]
15548; GFX8-NEXT:    s_cbranch_execnz .LBB57_1
15549; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
15550; GFX8-NEXT:    s_or_b64 exec, exec, s[6:7]
15551; GFX8-NEXT:    s_setpc_b64 s[30:31]
15552;
15553; GFX7-LABEL: flat_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory:
15554; GFX7:       ; %bb.0:
15555; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
15556; GFX7-NEXT:    flat_load_dword v4, v[0:1]
15557; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v3
15558; GFX7-NEXT:    v_mul_f32_e32 v6, 1.0, v2
15559; GFX7-NEXT:    s_mov_b64 s[4:5], 0
15560; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v3
15561; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v6
15562; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
15563; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v4
15564; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
15565; GFX7-NEXT:  .LBB57_1: ; %atomicrmw.start
15566; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
15567; GFX7-NEXT:    v_mul_f32_e32 v5, 1.0, v5
15568; GFX7-NEXT:    v_mul_f32_e32 v4, 1.0, v4
15569; GFX7-NEXT:    v_and_b32_e32 v6, 0xffff0000, v5
15570; GFX7-NEXT:    v_and_b32_e32 v7, 0xffff0000, v4
15571; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
15572; GFX7-NEXT:    v_min_f32_e32 v6, v6, v2
15573; GFX7-NEXT:    v_min_f32_e32 v7, v7, v3
15574; GFX7-NEXT:    v_alignbit_b32 v5, v5, v4, 16
15575; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 16, v6
15576; GFX7-NEXT:    v_alignbit_b32 v4, v4, v7, 16
15577; GFX7-NEXT:    flat_atomic_cmpswap v4, v[0:1], v[4:5] glc
15578; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
15579; GFX7-NEXT:    buffer_wbinvl1
15580; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v5
15581; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v4
15582; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
15583; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
15584; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
15585; GFX7-NEXT:    s_cbranch_execnz .LBB57_1
15586; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
15587; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
15588; GFX7-NEXT:    s_setpc_b64 s[30:31]
  ; The atomicrmw result is intentionally left unused (%unused has no users and
  ; the function returns void), which is what makes this the "noret" variant.
15589  %unused = atomicrmw fmin ptr %ptr, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
15590  ret void
15591}
15592
15593define void @flat_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr %ptr, <2 x bfloat> %val) #0 {
15594; GFX12-LABEL: flat_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
15595; GFX12:       ; %bb.0:
15596; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
15597; GFX12-NEXT:    s_wait_expcnt 0x0
15598; GFX12-NEXT:    s_wait_samplecnt 0x0
15599; GFX12-NEXT:    s_wait_bvhcnt 0x0
15600; GFX12-NEXT:    s_wait_kmcnt 0x0
15601; GFX12-NEXT:    flat_load_b32 v3, v[0:1] offset:2044
15602; GFX12-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
15603; GFX12-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
15604; GFX12-NEXT:    s_mov_b32 s1, 0
15605; GFX12-NEXT:  .LBB58_1: ; %atomicrmw.start
15606; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
15607; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
15608; GFX12-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
15609; GFX12-NEXT:    v_and_b32_e32 v6, 0xffff0000, v3
15610; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
15611; GFX12-NEXT:    v_min_num_f32_e32 v2, v2, v4
15612; GFX12-NEXT:    v_min_num_f32_e32 v6, v6, v5
15613; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
15614; GFX12-NEXT:    v_bfe_u32 v7, v2, 16, 1
15615; GFX12-NEXT:    v_bfe_u32 v8, v6, 16, 1
15616; GFX12-NEXT:    v_or_b32_e32 v9, 0x400000, v2
15617; GFX12-NEXT:    v_or_b32_e32 v10, 0x400000, v6
15618; GFX12-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
15619; GFX12-NEXT:    v_add3_u32 v7, v7, v2, 0x7fff
15620; GFX12-NEXT:    v_add3_u32 v8, v8, v6, 0x7fff
15621; GFX12-NEXT:    v_cmp_u_f32_e64 s0, v2, v2
15622; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
15623; GFX12-NEXT:    v_cndmask_b32_e32 v6, v8, v10, vcc_lo
15624; GFX12-NEXT:    v_cndmask_b32_e64 v2, v7, v9, s0
15625; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
15626; GFX12-NEXT:    v_perm_b32 v2, v6, v2, 0x7060302
15627; GFX12-NEXT:    s_wait_storecnt 0x0
15628; GFX12-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
15629; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
15630; GFX12-NEXT:    global_inv scope:SCOPE_DEV
15631; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
15632; GFX12-NEXT:    v_mov_b32_e32 v3, v2
15633; GFX12-NEXT:    s_wait_alu 0xfffe
15634; GFX12-NEXT:    s_or_b32 s1, vcc_lo, s1
15635; GFX12-NEXT:    s_wait_alu 0xfffe
15636; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
15637; GFX12-NEXT:    s_cbranch_execnz .LBB58_1
15638; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
15639; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s1
15640; GFX12-NEXT:    s_wait_alu 0xfffe
15641; GFX12-NEXT:    s_setpc_b64 s[30:31]
15642;
15643; GFX940-LABEL: flat_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
15644; GFX940:       ; %bb.0:
15645; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
15646; GFX940-NEXT:    flat_load_dword v3, v[0:1] offset:2044
15647; GFX940-NEXT:    s_mov_b64 s[2:3], 0
15648; GFX940-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
15649; GFX940-NEXT:    s_movk_i32 s4, 0x7fff
15650; GFX940-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
15651; GFX940-NEXT:    s_mov_b32 s5, 0x7060302
15652; GFX940-NEXT:  .LBB58_1: ; %atomicrmw.start
15653; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
15654; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
15655; GFX940-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
15656; GFX940-NEXT:    v_and_b32_e32 v6, 0xffff0000, v3
15657; GFX940-NEXT:    v_min_f32_e32 v2, v2, v4
15658; GFX940-NEXT:    v_min_f32_e32 v6, v6, v5
15659; GFX940-NEXT:    v_bfe_u32 v7, v2, 16, 1
15660; GFX940-NEXT:    v_bfe_u32 v9, v6, 16, 1
15661; GFX940-NEXT:    v_or_b32_e32 v8, 0x400000, v2
15662; GFX940-NEXT:    v_or_b32_e32 v10, 0x400000, v6
15663; GFX940-NEXT:    v_add3_u32 v7, v7, v2, s4
15664; GFX940-NEXT:    v_add3_u32 v9, v9, v6, s4
15665; GFX940-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
15666; GFX940-NEXT:    v_cmp_u_f32_e64 s[0:1], v2, v2
15667; GFX940-NEXT:    s_nop 0
15668; GFX940-NEXT:    v_cndmask_b32_e32 v6, v9, v10, vcc
15669; GFX940-NEXT:    v_cndmask_b32_e64 v2, v7, v8, s[0:1]
15670; GFX940-NEXT:    v_perm_b32 v2, v6, v2, s5
15671; GFX940-NEXT:    buffer_wbl2 sc1
15672; GFX940-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0
15673; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
15674; GFX940-NEXT:    buffer_inv sc1
15675; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
15676; GFX940-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
15677; GFX940-NEXT:    v_mov_b32_e32 v3, v2
15678; GFX940-NEXT:    s_andn2_b64 exec, exec, s[2:3]
15679; GFX940-NEXT:    s_cbranch_execnz .LBB58_1
15680; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
15681; GFX940-NEXT:    s_or_b64 exec, exec, s[2:3]
15682; GFX940-NEXT:    s_setpc_b64 s[30:31]
15683;
15684; GFX11-LABEL: flat_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
15685; GFX11:       ; %bb.0:
15686; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
15687; GFX11-NEXT:    flat_load_b32 v3, v[0:1] offset:2044
15688; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
15689; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
15690; GFX11-NEXT:    s_mov_b32 s1, 0
15691; GFX11-NEXT:    s_set_inst_prefetch_distance 0x1
15692; GFX11-NEXT:    .p2align 6
15693; GFX11-NEXT:  .LBB58_1: ; %atomicrmw.start
15694; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
15695; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
15696; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
15697; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff0000, v3
15698; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
15699; GFX11-NEXT:    v_min_f32_e32 v2, v2, v4
15700; GFX11-NEXT:    v_min_f32_e32 v6, v6, v5
15701; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
15702; GFX11-NEXT:    v_bfe_u32 v7, v2, 16, 1
15703; GFX11-NEXT:    v_bfe_u32 v8, v6, 16, 1
15704; GFX11-NEXT:    v_or_b32_e32 v9, 0x400000, v2
15705; GFX11-NEXT:    v_or_b32_e32 v10, 0x400000, v6
15706; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
15707; GFX11-NEXT:    v_add3_u32 v7, v7, v2, 0x7fff
15708; GFX11-NEXT:    v_add3_u32 v8, v8, v6, 0x7fff
15709; GFX11-NEXT:    v_cmp_u_f32_e64 s0, v2, v2
15710; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
15711; GFX11-NEXT:    v_cndmask_b32_e32 v6, v8, v10, vcc_lo
15712; GFX11-NEXT:    v_cndmask_b32_e64 v2, v7, v9, s0
15713; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
15714; GFX11-NEXT:    v_perm_b32 v2, v6, v2, 0x7060302
15715; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
15716; GFX11-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 glc
15717; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
15718; GFX11-NEXT:    buffer_gl1_inv
15719; GFX11-NEXT:    buffer_gl0_inv
15720; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
15721; GFX11-NEXT:    v_mov_b32_e32 v3, v2
15722; GFX11-NEXT:    s_or_b32 s1, vcc_lo, s1
15723; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
15724; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
15725; GFX11-NEXT:    s_cbranch_execnz .LBB58_1
15726; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
15727; GFX11-NEXT:    s_set_inst_prefetch_distance 0x2
15728; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s1
15729; GFX11-NEXT:    s_setpc_b64 s[30:31]
15730;
15731; GFX10-LABEL: flat_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
15732; GFX10:       ; %bb.0:
15733; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
15734; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, 0x7fc, v0
15735; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
15736; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
15737; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
15738; GFX10-NEXT:    s_mov_b32 s5, 0
15739; GFX10-NEXT:    flat_load_dword v3, v[0:1]
15740; GFX10-NEXT:  .LBB58_1: ; %atomicrmw.start
15741; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
15742; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
15743; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
15744; GFX10-NEXT:    v_and_b32_e32 v6, 0xffff0000, v3
15745; GFX10-NEXT:    v_min_f32_e32 v2, v2, v4
15746; GFX10-NEXT:    v_min_f32_e32 v6, v6, v5
15747; GFX10-NEXT:    v_bfe_u32 v7, v2, 16, 1
15748; GFX10-NEXT:    v_bfe_u32 v8, v6, 16, 1
15749; GFX10-NEXT:    v_or_b32_e32 v9, 0x400000, v2
15750; GFX10-NEXT:    v_or_b32_e32 v10, 0x400000, v6
15751; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
15752; GFX10-NEXT:    v_add3_u32 v7, v7, v2, 0x7fff
15753; GFX10-NEXT:    v_add3_u32 v8, v8, v6, 0x7fff
15754; GFX10-NEXT:    v_cmp_u_f32_e64 s4, v2, v2
15755; GFX10-NEXT:    v_cndmask_b32_e32 v6, v8, v10, vcc_lo
15756; GFX10-NEXT:    v_cndmask_b32_e64 v2, v7, v9, s4
15757; GFX10-NEXT:    v_perm_b32 v2, v6, v2, 0x7060302
15758; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
15759; GFX10-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
15760; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
15761; GFX10-NEXT:    buffer_gl1_inv
15762; GFX10-NEXT:    buffer_gl0_inv
15763; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
15764; GFX10-NEXT:    v_mov_b32_e32 v3, v2
15765; GFX10-NEXT:    s_or_b32 s5, vcc_lo, s5
15766; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s5
15767; GFX10-NEXT:    s_cbranch_execnz .LBB58_1
15768; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
15769; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s5
15770; GFX10-NEXT:    s_setpc_b64 s[30:31]
15771;
15772; GFX90A-LABEL: flat_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
15773; GFX90A:       ; %bb.0:
15774; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
15775; GFX90A-NEXT:    flat_load_dword v3, v[0:1] offset:2044
15776; GFX90A-NEXT:    s_mov_b64 s[6:7], 0
15777; GFX90A-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
15778; GFX90A-NEXT:    s_movk_i32 s8, 0x7fff
15779; GFX90A-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
15780; GFX90A-NEXT:    s_mov_b32 s9, 0x7060302
15781; GFX90A-NEXT:  .LBB58_1: ; %atomicrmw.start
15782; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
15783; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
15784; GFX90A-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
15785; GFX90A-NEXT:    v_and_b32_e32 v6, 0xffff0000, v3
15786; GFX90A-NEXT:    v_min_f32_e32 v2, v2, v4
15787; GFX90A-NEXT:    v_min_f32_e32 v6, v6, v5
15788; GFX90A-NEXT:    v_bfe_u32 v7, v2, 16, 1
15789; GFX90A-NEXT:    v_bfe_u32 v9, v6, 16, 1
15790; GFX90A-NEXT:    v_or_b32_e32 v8, 0x400000, v2
15791; GFX90A-NEXT:    v_or_b32_e32 v10, 0x400000, v6
15792; GFX90A-NEXT:    v_add3_u32 v7, v7, v2, s8
15793; GFX90A-NEXT:    v_add3_u32 v9, v9, v6, s8
15794; GFX90A-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
15795; GFX90A-NEXT:    v_cmp_u_f32_e64 s[4:5], v2, v2
15796; GFX90A-NEXT:    v_cndmask_b32_e64 v2, v7, v8, s[4:5]
15797; GFX90A-NEXT:    v_cndmask_b32_e32 v6, v9, v10, vcc
15798; GFX90A-NEXT:    v_perm_b32 v2, v6, v2, s9
15799; GFX90A-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc
15800; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
15801; GFX90A-NEXT:    buffer_wbinvl1
15802; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
15803; GFX90A-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
15804; GFX90A-NEXT:    v_mov_b32_e32 v3, v2
15805; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[6:7]
15806; GFX90A-NEXT:    s_cbranch_execnz .LBB58_1
15807; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
15808; GFX90A-NEXT:    s_or_b64 exec, exec, s[6:7]
15809; GFX90A-NEXT:    s_setpc_b64 s[30:31]
15810;
15811; GFX908-LABEL: flat_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
15812; GFX908:       ; %bb.0:
15813; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
15814; GFX908-NEXT:    flat_load_dword v3, v[0:1] offset:2044
15815; GFX908-NEXT:    s_mov_b64 s[6:7], 0
15816; GFX908-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
15817; GFX908-NEXT:    s_movk_i32 s8, 0x7fff
15818; GFX908-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
15819; GFX908-NEXT:    s_mov_b32 s9, 0x7060302
15820; GFX908-NEXT:  .LBB58_1: ; %atomicrmw.start
15821; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
15822; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
15823; GFX908-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
15824; GFX908-NEXT:    v_and_b32_e32 v6, 0xffff0000, v3
15825; GFX908-NEXT:    v_min_f32_e32 v2, v2, v4
15826; GFX908-NEXT:    v_min_f32_e32 v6, v6, v5
15827; GFX908-NEXT:    v_bfe_u32 v7, v2, 16, 1
15828; GFX908-NEXT:    v_bfe_u32 v9, v6, 16, 1
15829; GFX908-NEXT:    v_or_b32_e32 v8, 0x400000, v2
15830; GFX908-NEXT:    v_or_b32_e32 v10, 0x400000, v6
15831; GFX908-NEXT:    v_add3_u32 v7, v7, v2, s8
15832; GFX908-NEXT:    v_add3_u32 v9, v9, v6, s8
15833; GFX908-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
15834; GFX908-NEXT:    v_cmp_u_f32_e64 s[4:5], v2, v2
15835; GFX908-NEXT:    v_cndmask_b32_e64 v2, v7, v8, s[4:5]
15836; GFX908-NEXT:    v_cndmask_b32_e32 v6, v9, v10, vcc
15837; GFX908-NEXT:    v_perm_b32 v2, v6, v2, s9
15838; GFX908-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc
15839; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
15840; GFX908-NEXT:    buffer_wbinvl1
15841; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
15842; GFX908-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
15843; GFX908-NEXT:    v_mov_b32_e32 v3, v2
15844; GFX908-NEXT:    s_andn2_b64 exec, exec, s[6:7]
15845; GFX908-NEXT:    s_cbranch_execnz .LBB58_1
15846; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
15847; GFX908-NEXT:    s_or_b64 exec, exec, s[6:7]
15848; GFX908-NEXT:    s_setpc_b64 s[30:31]
15849;
15850; GFX8-LABEL: flat_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
15851; GFX8:       ; %bb.0:
15852; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
15853; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 0x7fc, v0
15854; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
15855; GFX8-NEXT:    flat_load_dword v3, v[0:1]
15856; GFX8-NEXT:    s_mov_b64 s[6:7], 0
15857; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
15858; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
15859; GFX8-NEXT:  .LBB58_1: ; %atomicrmw.start
15860; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
15861; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
15862; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
15863; GFX8-NEXT:    v_and_b32_e32 v6, 0xffff0000, v3
15864; GFX8-NEXT:    v_min_f32_e32 v2, v2, v4
15865; GFX8-NEXT:    v_min_f32_e32 v6, v6, v5
15866; GFX8-NEXT:    v_bfe_u32 v7, v2, 16, 1
15867; GFX8-NEXT:    v_bfe_u32 v9, v6, 16, 1
15868; GFX8-NEXT:    v_add_u32_e32 v7, vcc, v7, v2
15869; GFX8-NEXT:    v_add_u32_e32 v9, vcc, v9, v6
15870; GFX8-NEXT:    v_add_u32_e32 v7, vcc, 0x7fff, v7
15871; GFX8-NEXT:    v_add_u32_e32 v9, vcc, 0x7fff, v9
15872; GFX8-NEXT:    v_or_b32_e32 v10, 0x400000, v6
15873; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
15874; GFX8-NEXT:    v_or_b32_e32 v8, 0x400000, v2
15875; GFX8-NEXT:    v_cmp_u_f32_e64 s[4:5], v2, v2
15876; GFX8-NEXT:    v_cndmask_b32_e32 v6, v9, v10, vcc
15877; GFX8-NEXT:    v_cndmask_b32_e64 v2, v7, v8, s[4:5]
15878; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
15879; GFX8-NEXT:    v_alignbit_b32 v2, v6, v2, 16
15880; GFX8-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
15881; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
15882; GFX8-NEXT:    buffer_wbinvl1
15883; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
15884; GFX8-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
15885; GFX8-NEXT:    v_mov_b32_e32 v3, v2
15886; GFX8-NEXT:    s_andn2_b64 exec, exec, s[6:7]
15887; GFX8-NEXT:    s_cbranch_execnz .LBB58_1
15888; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
15889; GFX8-NEXT:    s_or_b64 exec, exec, s[6:7]
15890; GFX8-NEXT:    s_setpc_b64 s[30:31]
15891;
15892; GFX7-LABEL: flat_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
15893; GFX7:       ; %bb.0:
15894; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
15895; GFX7-NEXT:    v_add_i32_e32 v0, vcc, 0x7fc, v0
15896; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
15897; GFX7-NEXT:    flat_load_dword v4, v[0:1]
15898; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v3
15899; GFX7-NEXT:    v_mul_f32_e32 v6, 1.0, v2
15900; GFX7-NEXT:    s_mov_b64 s[4:5], 0
15901; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v3
15902; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v6
15903; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
15904; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v4
15905; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
15906; GFX7-NEXT:  .LBB58_1: ; %atomicrmw.start
15907; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
15908; GFX7-NEXT:    v_mul_f32_e32 v5, 1.0, v5
15909; GFX7-NEXT:    v_mul_f32_e32 v4, 1.0, v4
15910; GFX7-NEXT:    v_and_b32_e32 v6, 0xffff0000, v5
15911; GFX7-NEXT:    v_and_b32_e32 v7, 0xffff0000, v4
15912; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
15913; GFX7-NEXT:    v_min_f32_e32 v6, v6, v2
15914; GFX7-NEXT:    v_min_f32_e32 v7, v7, v3
15915; GFX7-NEXT:    v_alignbit_b32 v5, v5, v4, 16
15916; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 16, v6
15917; GFX7-NEXT:    v_alignbit_b32 v4, v4, v7, 16
15918; GFX7-NEXT:    flat_atomic_cmpswap v4, v[0:1], v[4:5] glc
15919; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
15920; GFX7-NEXT:    buffer_wbinvl1
15921; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v5
15922; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v4
15923; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
15924; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
15925; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
15926; GFX7-NEXT:    s_cbranch_execnz .LBB58_1
15927; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
15928; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
15929; GFX7-NEXT:    s_setpc_b64 s[30:31]
15930  %gep = getelementptr <2 x bfloat>, ptr %ptr, i64 511
15931  %unused = atomicrmw fmin ptr %gep, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
15932  ret void
15933}
15934
; Negative-offset case: a non-returning, agent-scope, seq_cst `atomicrmw fmin`
; on the <2 x bfloat> located 512 elements (2048 bytes) below %ptr.
; For every target, the expected output below is a flat_atomic_cmpswap retry
; loop (%atomicrmw.start) rather than a single fmin atomic instruction.
; Only the gfx1200 output keeps the displacement as an instruction immediate
; (offset:-2048); the other targets first add 0xfffff800 / -1 to the pointer.
; The assertions are autogenerated (see the first line of this file);
; regenerate them with utils/update_llc_test_checks.py instead of hand-editing.
15935define void @flat_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr %ptr, <2 x bfloat> %val) #0 {
15936; GFX12-LABEL: flat_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
15937; GFX12:       ; %bb.0:
15938; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
15939; GFX12-NEXT:    s_wait_expcnt 0x0
15940; GFX12-NEXT:    s_wait_samplecnt 0x0
15941; GFX12-NEXT:    s_wait_bvhcnt 0x0
15942; GFX12-NEXT:    s_wait_kmcnt 0x0
15943; GFX12-NEXT:    flat_load_b32 v3, v[0:1] offset:-2048
15944; GFX12-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
15945; GFX12-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
15946; GFX12-NEXT:    s_mov_b32 s1, 0
15947; GFX12-NEXT:  .LBB59_1: ; %atomicrmw.start
15948; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
15949; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
15950; GFX12-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
15951; GFX12-NEXT:    v_and_b32_e32 v6, 0xffff0000, v3
15952; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
15953; GFX12-NEXT:    v_min_num_f32_e32 v2, v2, v4
15954; GFX12-NEXT:    v_min_num_f32_e32 v6, v6, v5
15955; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
15956; GFX12-NEXT:    v_bfe_u32 v7, v2, 16, 1
15957; GFX12-NEXT:    v_bfe_u32 v8, v6, 16, 1
15958; GFX12-NEXT:    v_or_b32_e32 v9, 0x400000, v2
15959; GFX12-NEXT:    v_or_b32_e32 v10, 0x400000, v6
15960; GFX12-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
15961; GFX12-NEXT:    v_add3_u32 v7, v7, v2, 0x7fff
15962; GFX12-NEXT:    v_add3_u32 v8, v8, v6, 0x7fff
15963; GFX12-NEXT:    v_cmp_u_f32_e64 s0, v2, v2
15964; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
15965; GFX12-NEXT:    v_cndmask_b32_e32 v6, v8, v10, vcc_lo
15966; GFX12-NEXT:    v_cndmask_b32_e64 v2, v7, v9, s0
15967; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
15968; GFX12-NEXT:    v_perm_b32 v2, v6, v2, 0x7060302
15969; GFX12-NEXT:    s_wait_storecnt 0x0
15970; GFX12-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
15971; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
15972; GFX12-NEXT:    global_inv scope:SCOPE_DEV
15973; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
15974; GFX12-NEXT:    v_mov_b32_e32 v3, v2
15975; GFX12-NEXT:    s_wait_alu 0xfffe
15976; GFX12-NEXT:    s_or_b32 s1, vcc_lo, s1
15977; GFX12-NEXT:    s_wait_alu 0xfffe
15978; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
15979; GFX12-NEXT:    s_cbranch_execnz .LBB59_1
15980; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
15981; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s1
15982; GFX12-NEXT:    s_wait_alu 0xfffe
15983; GFX12-NEXT:    s_setpc_b64 s[30:31]
15984;
15985; GFX940-LABEL: flat_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
15986; GFX940:       ; %bb.0:
15987; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
15988; GFX940-NEXT:    v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
15989; GFX940-NEXT:    s_movk_i32 s0, 0xf800
15990; GFX940-NEXT:    s_nop 0
15991; GFX940-NEXT:    v_addc_co_u32_e32 v5, vcc, -1, v1, vcc
15992; GFX940-NEXT:    flat_load_dword v3, v[4:5]
15993; GFX940-NEXT:    s_mov_b32 s1, -1
15994; GFX940-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1]
15995; GFX940-NEXT:    s_mov_b64 s[2:3], 0
15996; GFX940-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
15997; GFX940-NEXT:    s_movk_i32 s4, 0x7fff
15998; GFX940-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
15999; GFX940-NEXT:    s_mov_b32 s5, 0x7060302
16000; GFX940-NEXT:  .LBB59_1: ; %atomicrmw.start
16001; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
16002; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
16003; GFX940-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
16004; GFX940-NEXT:    v_and_b32_e32 v6, 0xffff0000, v3
16005; GFX940-NEXT:    v_min_f32_e32 v2, v2, v4
16006; GFX940-NEXT:    v_min_f32_e32 v6, v6, v5
16007; GFX940-NEXT:    v_bfe_u32 v7, v2, 16, 1
16008; GFX940-NEXT:    v_bfe_u32 v9, v6, 16, 1
16009; GFX940-NEXT:    v_or_b32_e32 v8, 0x400000, v2
16010; GFX940-NEXT:    v_or_b32_e32 v10, 0x400000, v6
16011; GFX940-NEXT:    v_add3_u32 v7, v7, v2, s4
16012; GFX940-NEXT:    v_add3_u32 v9, v9, v6, s4
16013; GFX940-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
16014; GFX940-NEXT:    v_cmp_u_f32_e64 s[0:1], v2, v2
16015; GFX940-NEXT:    s_nop 0
16016; GFX940-NEXT:    v_cndmask_b32_e32 v6, v9, v10, vcc
16017; GFX940-NEXT:    v_cndmask_b32_e64 v2, v7, v8, s[0:1]
16018; GFX940-NEXT:    v_perm_b32 v2, v6, v2, s5
16019; GFX940-NEXT:    buffer_wbl2 sc1
16020; GFX940-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0
16021; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
16022; GFX940-NEXT:    buffer_inv sc1
16023; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
16024; GFX940-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
16025; GFX940-NEXT:    v_mov_b32_e32 v3, v2
16026; GFX940-NEXT:    s_andn2_b64 exec, exec, s[2:3]
16027; GFX940-NEXT:    s_cbranch_execnz .LBB59_1
16028; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
16029; GFX940-NEXT:    s_or_b64 exec, exec, s[2:3]
16030; GFX940-NEXT:    s_setpc_b64 s[30:31]
16031;
16032; GFX11-LABEL: flat_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
16033; GFX11:       ; %bb.0:
16034; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
16035; GFX11-NEXT:    v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
16036; GFX11-NEXT:    v_add_co_ci_u32_e32 v4, vcc_lo, -1, v1, vcc_lo
16037; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
16038; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
16039; GFX11-NEXT:    flat_load_b32 v3, v[3:4]
16040; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
16041; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
16042; GFX11-NEXT:    s_mov_b32 s1, 0
16043; GFX11-NEXT:    s_set_inst_prefetch_distance 0x1
16044; GFX11-NEXT:    .p2align 6
16045; GFX11-NEXT:  .LBB59_1: ; %atomicrmw.start
16046; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
16047; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
16048; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
16049; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff0000, v3
16050; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
16051; GFX11-NEXT:    v_min_f32_e32 v2, v2, v4
16052; GFX11-NEXT:    v_min_f32_e32 v6, v6, v5
16053; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
16054; GFX11-NEXT:    v_bfe_u32 v7, v2, 16, 1
16055; GFX11-NEXT:    v_bfe_u32 v8, v6, 16, 1
16056; GFX11-NEXT:    v_or_b32_e32 v9, 0x400000, v2
16057; GFX11-NEXT:    v_or_b32_e32 v10, 0x400000, v6
16058; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
16059; GFX11-NEXT:    v_add3_u32 v7, v7, v2, 0x7fff
16060; GFX11-NEXT:    v_add3_u32 v8, v8, v6, 0x7fff
16061; GFX11-NEXT:    v_cmp_u_f32_e64 s0, v2, v2
16062; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
16063; GFX11-NEXT:    v_cndmask_b32_e32 v6, v8, v10, vcc_lo
16064; GFX11-NEXT:    v_cndmask_b32_e64 v2, v7, v9, s0
16065; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
16066; GFX11-NEXT:    v_perm_b32 v2, v6, v2, 0x7060302
16067; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
16068; GFX11-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc
16069; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
16070; GFX11-NEXT:    buffer_gl1_inv
16071; GFX11-NEXT:    buffer_gl0_inv
16072; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
16073; GFX11-NEXT:    v_mov_b32_e32 v3, v2
16074; GFX11-NEXT:    s_or_b32 s1, vcc_lo, s1
16075; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
16076; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
16077; GFX11-NEXT:    s_cbranch_execnz .LBB59_1
16078; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
16079; GFX11-NEXT:    s_set_inst_prefetch_distance 0x2
16080; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s1
16081; GFX11-NEXT:    s_setpc_b64 s[30:31]
16082;
16083; GFX10-LABEL: flat_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
16084; GFX10:       ; %bb.0:
16085; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
16086; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
16087; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
16088; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
16089; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
16090; GFX10-NEXT:    s_mov_b32 s5, 0
16091; GFX10-NEXT:    flat_load_dword v3, v[0:1]
16092; GFX10-NEXT:  .LBB59_1: ; %atomicrmw.start
16093; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
16094; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
16095; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
16096; GFX10-NEXT:    v_and_b32_e32 v6, 0xffff0000, v3
16097; GFX10-NEXT:    v_min_f32_e32 v2, v2, v4
16098; GFX10-NEXT:    v_min_f32_e32 v6, v6, v5
16099; GFX10-NEXT:    v_bfe_u32 v7, v2, 16, 1
16100; GFX10-NEXT:    v_bfe_u32 v8, v6, 16, 1
16101; GFX10-NEXT:    v_or_b32_e32 v9, 0x400000, v2
16102; GFX10-NEXT:    v_or_b32_e32 v10, 0x400000, v6
16103; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
16104; GFX10-NEXT:    v_add3_u32 v7, v7, v2, 0x7fff
16105; GFX10-NEXT:    v_add3_u32 v8, v8, v6, 0x7fff
16106; GFX10-NEXT:    v_cmp_u_f32_e64 s4, v2, v2
16107; GFX10-NEXT:    v_cndmask_b32_e32 v6, v8, v10, vcc_lo
16108; GFX10-NEXT:    v_cndmask_b32_e64 v2, v7, v9, s4
16109; GFX10-NEXT:    v_perm_b32 v2, v6, v2, 0x7060302
16110; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
16111; GFX10-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
16112; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
16113; GFX10-NEXT:    buffer_gl1_inv
16114; GFX10-NEXT:    buffer_gl0_inv
16115; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
16116; GFX10-NEXT:    v_mov_b32_e32 v3, v2
16117; GFX10-NEXT:    s_or_b32 s5, vcc_lo, s5
16118; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s5
16119; GFX10-NEXT:    s_cbranch_execnz .LBB59_1
16120; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
16121; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s5
16122; GFX10-NEXT:    s_setpc_b64 s[30:31]
16123;
16124; GFX90A-LABEL: flat_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
16125; GFX90A:       ; %bb.0:
16126; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
16127; GFX90A-NEXT:    v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
16128; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, -1, v1, vcc
16129; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
16130; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
16131; GFX90A-NEXT:    flat_load_dword v1, v[0:1]
16132; GFX90A-NEXT:    s_mov_b64 s[6:7], 0
16133; GFX90A-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
16134; GFX90A-NEXT:    s_movk_i32 s8, 0x7fff
16135; GFX90A-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
16136; GFX90A-NEXT:    s_mov_b32 s9, 0x7060302
16137; GFX90A-NEXT:  .LBB59_1: ; %atomicrmw.start
16138; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
16139; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
16140; GFX90A-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
16141; GFX90A-NEXT:    v_and_b32_e32 v6, 0xffff0000, v1
16142; GFX90A-NEXT:    v_min_f32_e32 v0, v0, v3
16143; GFX90A-NEXT:    v_min_f32_e32 v6, v6, v2
16144; GFX90A-NEXT:    v_bfe_u32 v7, v0, 16, 1
16145; GFX90A-NEXT:    v_bfe_u32 v9, v6, 16, 1
16146; GFX90A-NEXT:    v_or_b32_e32 v8, 0x400000, v0
16147; GFX90A-NEXT:    v_or_b32_e32 v10, 0x400000, v6
16148; GFX90A-NEXT:    v_add3_u32 v7, v7, v0, s8
16149; GFX90A-NEXT:    v_add3_u32 v9, v9, v6, s8
16150; GFX90A-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
16151; GFX90A-NEXT:    v_cmp_u_f32_e64 s[4:5], v0, v0
16152; GFX90A-NEXT:    v_cndmask_b32_e64 v0, v7, v8, s[4:5]
16153; GFX90A-NEXT:    v_cndmask_b32_e32 v6, v9, v10, vcc
16154; GFX90A-NEXT:    v_perm_b32 v0, v6, v0, s9
16155; GFX90A-NEXT:    flat_atomic_cmpswap v0, v[4:5], v[0:1] glc
16156; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
16157; GFX90A-NEXT:    buffer_wbinvl1
16158; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
16159; GFX90A-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
16160; GFX90A-NEXT:    v_mov_b32_e32 v1, v0
16161; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[6:7]
16162; GFX90A-NEXT:    s_cbranch_execnz .LBB59_1
16163; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
16164; GFX90A-NEXT:    s_or_b64 exec, exec, s[6:7]
16165; GFX90A-NEXT:    s_setpc_b64 s[30:31]
16166;
16167; GFX908-LABEL: flat_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
16168; GFX908:       ; %bb.0:
16169; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
16170; GFX908-NEXT:    v_add_co_u32_e32 v3, vcc, 0xfffff800, v0
16171; GFX908-NEXT:    v_addc_co_u32_e32 v4, vcc, -1, v1, vcc
16172; GFX908-NEXT:    v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
16173; GFX908-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
16174; GFX908-NEXT:    flat_load_dword v1, v[0:1]
16175; GFX908-NEXT:    s_mov_b64 s[6:7], 0
16176; GFX908-NEXT:    v_lshlrev_b32_e32 v5, 16, v2
16177; GFX908-NEXT:    s_movk_i32 s8, 0x7fff
16178; GFX908-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
16179; GFX908-NEXT:    s_mov_b32 s9, 0x7060302
16180; GFX908-NEXT:  .LBB59_1: ; %atomicrmw.start
16181; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
16182; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
16183; GFX908-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
16184; GFX908-NEXT:    v_and_b32_e32 v6, 0xffff0000, v1
16185; GFX908-NEXT:    v_min_f32_e32 v0, v0, v5
16186; GFX908-NEXT:    v_min_f32_e32 v6, v6, v2
16187; GFX908-NEXT:    v_bfe_u32 v7, v0, 16, 1
16188; GFX908-NEXT:    v_bfe_u32 v9, v6, 16, 1
16189; GFX908-NEXT:    v_or_b32_e32 v8, 0x400000, v0
16190; GFX908-NEXT:    v_or_b32_e32 v10, 0x400000, v6
16191; GFX908-NEXT:    v_add3_u32 v7, v7, v0, s8
16192; GFX908-NEXT:    v_add3_u32 v9, v9, v6, s8
16193; GFX908-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
16194; GFX908-NEXT:    v_cmp_u_f32_e64 s[4:5], v0, v0
16195; GFX908-NEXT:    v_cndmask_b32_e64 v0, v7, v8, s[4:5]
16196; GFX908-NEXT:    v_cndmask_b32_e32 v6, v9, v10, vcc
16197; GFX908-NEXT:    v_perm_b32 v0, v6, v0, s9
16198; GFX908-NEXT:    flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
16199; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
16200; GFX908-NEXT:    buffer_wbinvl1
16201; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
16202; GFX908-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
16203; GFX908-NEXT:    v_mov_b32_e32 v1, v0
16204; GFX908-NEXT:    s_andn2_b64 exec, exec, s[6:7]
16205; GFX908-NEXT:    s_cbranch_execnz .LBB59_1
16206; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
16207; GFX908-NEXT:    s_or_b64 exec, exec, s[6:7]
16208; GFX908-NEXT:    s_setpc_b64 s[30:31]
16209;
16210; GFX8-LABEL: flat_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
16211; GFX8:       ; %bb.0:
16212; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
16213; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 0xfffff800, v0
16214; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, -1, v1, vcc
16215; GFX8-NEXT:    flat_load_dword v3, v[0:1]
16216; GFX8-NEXT:    s_mov_b64 s[6:7], 0
16217; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
16218; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
16219; GFX8-NEXT:  .LBB59_1: ; %atomicrmw.start
16220; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
16221; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
16222; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
16223; GFX8-NEXT:    v_and_b32_e32 v6, 0xffff0000, v3
16224; GFX8-NEXT:    v_min_f32_e32 v2, v2, v4
16225; GFX8-NEXT:    v_min_f32_e32 v6, v6, v5
16226; GFX8-NEXT:    v_bfe_u32 v7, v2, 16, 1
16227; GFX8-NEXT:    v_bfe_u32 v9, v6, 16, 1
16228; GFX8-NEXT:    v_add_u32_e32 v7, vcc, v7, v2
16229; GFX8-NEXT:    v_add_u32_e32 v9, vcc, v9, v6
16230; GFX8-NEXT:    v_add_u32_e32 v7, vcc, 0x7fff, v7
16231; GFX8-NEXT:    v_add_u32_e32 v9, vcc, 0x7fff, v9
16232; GFX8-NEXT:    v_or_b32_e32 v10, 0x400000, v6
16233; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
16234; GFX8-NEXT:    v_or_b32_e32 v8, 0x400000, v2
16235; GFX8-NEXT:    v_cmp_u_f32_e64 s[4:5], v2, v2
16236; GFX8-NEXT:    v_cndmask_b32_e32 v6, v9, v10, vcc
16237; GFX8-NEXT:    v_cndmask_b32_e64 v2, v7, v8, s[4:5]
16238; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
16239; GFX8-NEXT:    v_alignbit_b32 v2, v6, v2, 16
16240; GFX8-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
16241; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
16242; GFX8-NEXT:    buffer_wbinvl1
16243; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
16244; GFX8-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
16245; GFX8-NEXT:    v_mov_b32_e32 v3, v2
16246; GFX8-NEXT:    s_andn2_b64 exec, exec, s[6:7]
16247; GFX8-NEXT:    s_cbranch_execnz .LBB59_1
16248; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
16249; GFX8-NEXT:    s_or_b64 exec, exec, s[6:7]
16250; GFX8-NEXT:    s_setpc_b64 s[30:31]
16251;
16252; GFX7-LABEL: flat_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
16253; GFX7:       ; %bb.0:
16254; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
16255; GFX7-NEXT:    v_add_i32_e32 v0, vcc, 0xfffff800, v0
16256; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, -1, v1, vcc
16257; GFX7-NEXT:    flat_load_dword v4, v[0:1]
16258; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v3
16259; GFX7-NEXT:    v_mul_f32_e32 v6, 1.0, v2
16260; GFX7-NEXT:    s_mov_b64 s[4:5], 0
16261; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v3
16262; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v6
16263; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
16264; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v4
16265; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
16266; GFX7-NEXT:  .LBB59_1: ; %atomicrmw.start
16267; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
16268; GFX7-NEXT:    v_mul_f32_e32 v5, 1.0, v5
16269; GFX7-NEXT:    v_mul_f32_e32 v4, 1.0, v4
16270; GFX7-NEXT:    v_and_b32_e32 v6, 0xffff0000, v5
16271; GFX7-NEXT:    v_and_b32_e32 v7, 0xffff0000, v4
16272; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
16273; GFX7-NEXT:    v_min_f32_e32 v6, v6, v2
16274; GFX7-NEXT:    v_min_f32_e32 v7, v7, v3
16275; GFX7-NEXT:    v_alignbit_b32 v5, v5, v4, 16
16276; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 16, v6
16277; GFX7-NEXT:    v_alignbit_b32 v4, v4, v7, 16
16278; GFX7-NEXT:    flat_atomic_cmpswap v4, v[0:1], v[4:5] glc
16279; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
16280; GFX7-NEXT:    buffer_wbinvl1
16281; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v5
16282; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v4
16283; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
16284; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
16285; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
16286; GFX7-NEXT:    s_cbranch_execnz .LBB59_1
16287; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
16288; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
16289; GFX7-NEXT:    s_setpc_b64 s[30:31]
; Index -512 on <2 x bfloat> elements is the -2048 byte displacement
; (0xfffff800 / offset:-2048) that appears in the expected output above.
16290  %gep = getelementptr <2 x bfloat>, ptr %ptr, i64 -512
; The atomicrmw result is never read and the function returns void, so this
; exercises the no-return form of the operation.
16291  %unused = atomicrmw fmin ptr %gep, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
16292  ret void
16293}
16294
16295define <2 x bfloat> @flat_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr %ptr, <2 x bfloat> %val) #0 {
16296; GFX12-LABEL: flat_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
16297; GFX12:       ; %bb.0:
16298; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
16299; GFX12-NEXT:    s_wait_expcnt 0x0
16300; GFX12-NEXT:    s_wait_samplecnt 0x0
16301; GFX12-NEXT:    s_wait_bvhcnt 0x0
16302; GFX12-NEXT:    s_wait_kmcnt 0x0
16303; GFX12-NEXT:    flat_load_b32 v3, v[0:1] offset:2044
16304; GFX12-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
16305; GFX12-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
16306; GFX12-NEXT:    s_mov_b32 s1, 0
16307; GFX12-NEXT:  .LBB60_1: ; %atomicrmw.start
16308; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
16309; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
16310; GFX12-NEXT:    v_mov_b32_e32 v6, v3
16311; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
16312; GFX12-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
16313; GFX12-NEXT:    v_min_num_f32_e32 v5, v5, v2
16314; GFX12-NEXT:    v_lshlrev_b32_e32 v3, 16, v6
16315; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
16316; GFX12-NEXT:    v_bfe_u32 v8, v5, 16, 1
16317; GFX12-NEXT:    v_min_num_f32_e32 v3, v3, v4
16318; GFX12-NEXT:    v_or_b32_e32 v10, 0x400000, v5
16319; GFX12-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
16320; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
16321; GFX12-NEXT:    v_add3_u32 v8, v8, v5, 0x7fff
16322; GFX12-NEXT:    v_bfe_u32 v7, v3, 16, 1
16323; GFX12-NEXT:    v_or_b32_e32 v9, 0x400000, v3
16324; GFX12-NEXT:    v_cmp_u_f32_e64 s0, v3, v3
16325; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
16326; GFX12-NEXT:    v_cndmask_b32_e32 v5, v8, v10, vcc_lo
16327; GFX12-NEXT:    v_add3_u32 v7, v7, v3, 0x7fff
16328; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
16329; GFX12-NEXT:    v_cndmask_b32_e64 v3, v7, v9, s0
16330; GFX12-NEXT:    v_perm_b32 v5, v5, v3, 0x7060302
16331; GFX12-NEXT:    global_wb scope:SCOPE_SYS
16332; GFX12-NEXT:    s_wait_storecnt 0x0
16333; GFX12-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
16334; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
16335; GFX12-NEXT:    global_inv scope:SCOPE_SYS
16336; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v6
16337; GFX12-NEXT:    s_wait_alu 0xfffe
16338; GFX12-NEXT:    s_or_b32 s1, vcc_lo, s1
16339; GFX12-NEXT:    s_wait_alu 0xfffe
16340; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
16341; GFX12-NEXT:    s_cbranch_execnz .LBB60_1
16342; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
16343; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s1
16344; GFX12-NEXT:    v_mov_b32_e32 v0, v3
16345; GFX12-NEXT:    s_wait_alu 0xfffe
16346; GFX12-NEXT:    s_setpc_b64 s[30:31]
16347;
16348; GFX940-LABEL: flat_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
16349; GFX940:       ; %bb.0:
16350; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
16351; GFX940-NEXT:    flat_load_dword v3, v[0:1] offset:2044
16352; GFX940-NEXT:    s_mov_b64 s[2:3], 0
16353; GFX940-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
16354; GFX940-NEXT:    s_movk_i32 s4, 0x7fff
16355; GFX940-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
16356; GFX940-NEXT:    s_mov_b32 s5, 0x7060302
16357; GFX940-NEXT:  .LBB60_1: ; %atomicrmw.start
16358; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
16359; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
16360; GFX940-NEXT:    v_mov_b32_e32 v7, v3
16361; GFX940-NEXT:    v_lshlrev_b32_e32 v3, 16, v7
16362; GFX940-NEXT:    v_and_b32_e32 v5, 0xffff0000, v7
16363; GFX940-NEXT:    v_min_f32_e32 v3, v3, v4
16364; GFX940-NEXT:    v_min_f32_e32 v5, v5, v2
16365; GFX940-NEXT:    v_bfe_u32 v6, v3, 16, 1
16366; GFX940-NEXT:    v_bfe_u32 v9, v5, 16, 1
16367; GFX940-NEXT:    v_or_b32_e32 v8, 0x400000, v3
16368; GFX940-NEXT:    v_or_b32_e32 v10, 0x400000, v5
16369; GFX940-NEXT:    v_add3_u32 v6, v6, v3, s4
16370; GFX940-NEXT:    v_add3_u32 v9, v9, v5, s4
16371; GFX940-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
16372; GFX940-NEXT:    v_cmp_u_f32_e64 s[0:1], v3, v3
16373; GFX940-NEXT:    s_nop 0
16374; GFX940-NEXT:    v_cndmask_b32_e32 v5, v9, v10, vcc
16375; GFX940-NEXT:    v_cndmask_b32_e64 v3, v6, v8, s[0:1]
16376; GFX940-NEXT:    v_perm_b32 v6, v5, v3, s5
16377; GFX940-NEXT:    buffer_wbl2 sc0 sc1
16378; GFX940-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[6:7] offset:2044 sc0 sc1
16379; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
16380; GFX940-NEXT:    buffer_inv sc0 sc1
16381; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v7
16382; GFX940-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
16383; GFX940-NEXT:    s_andn2_b64 exec, exec, s[2:3]
16384; GFX940-NEXT:    s_cbranch_execnz .LBB60_1
16385; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
16386; GFX940-NEXT:    s_or_b64 exec, exec, s[2:3]
16387; GFX940-NEXT:    v_mov_b32_e32 v0, v3
16388; GFX940-NEXT:    s_setpc_b64 s[30:31]
16389;
16390; GFX11-LABEL: flat_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
16391; GFX11:       ; %bb.0:
16392; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
16393; GFX11-NEXT:    flat_load_b32 v3, v[0:1] offset:2044
16394; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
16395; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
16396; GFX11-NEXT:    s_mov_b32 s1, 0
16397; GFX11-NEXT:    s_set_inst_prefetch_distance 0x1
16398; GFX11-NEXT:    .p2align 6
16399; GFX11-NEXT:  .LBB60_1: ; %atomicrmw.start
16400; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
16401; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
16402; GFX11-NEXT:    v_mov_b32_e32 v6, v3
16403; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
16404; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
16405; GFX11-NEXT:    v_min_f32_e32 v5, v5, v2
16406; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v6
16407; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
16408; GFX11-NEXT:    v_bfe_u32 v8, v5, 16, 1
16409; GFX11-NEXT:    v_min_f32_e32 v3, v3, v4
16410; GFX11-NEXT:    v_or_b32_e32 v10, 0x400000, v5
16411; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
16412; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
16413; GFX11-NEXT:    v_add3_u32 v8, v8, v5, 0x7fff
16414; GFX11-NEXT:    v_bfe_u32 v7, v3, 16, 1
16415; GFX11-NEXT:    v_or_b32_e32 v9, 0x400000, v3
16416; GFX11-NEXT:    v_cmp_u_f32_e64 s0, v3, v3
16417; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
16418; GFX11-NEXT:    v_cndmask_b32_e32 v5, v8, v10, vcc_lo
16419; GFX11-NEXT:    v_add3_u32 v7, v7, v3, 0x7fff
16420; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
16421; GFX11-NEXT:    v_cndmask_b32_e64 v3, v7, v9, s0
16422; GFX11-NEXT:    v_perm_b32 v5, v5, v3, 0x7060302
16423; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
16424; GFX11-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 glc
16425; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
16426; GFX11-NEXT:    buffer_gl1_inv
16427; GFX11-NEXT:    buffer_gl0_inv
16428; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v6
16429; GFX11-NEXT:    s_or_b32 s1, vcc_lo, s1
16430; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
16431; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
16432; GFX11-NEXT:    s_cbranch_execnz .LBB60_1
16433; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
16434; GFX11-NEXT:    s_set_inst_prefetch_distance 0x2
16435; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s1
16436; GFX11-NEXT:    v_mov_b32_e32 v0, v3
16437; GFX11-NEXT:    s_setpc_b64 s[30:31]
16438;
16439; GFX10-LABEL: flat_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
16440; GFX10:       ; %bb.0:
16441; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
16442; GFX10-NEXT:    v_add_co_u32 v3, vcc_lo, 0x7fc, v0
16443; GFX10-NEXT:    v_add_co_ci_u32_e32 v4, vcc_lo, 0, v1, vcc_lo
16444; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
16445; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
16446; GFX10-NEXT:    s_mov_b32 s5, 0
16447; GFX10-NEXT:    flat_load_dword v0, v[3:4]
16448; GFX10-NEXT:  .LBB60_1: ; %atomicrmw.start
16449; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
16450; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
16451; GFX10-NEXT:    v_mov_b32_e32 v6, v0
16452; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v6
16453; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
16454; GFX10-NEXT:    v_min_f32_e32 v0, v0, v1
16455; GFX10-NEXT:    v_min_f32_e32 v5, v5, v2
16456; GFX10-NEXT:    v_bfe_u32 v7, v0, 16, 1
16457; GFX10-NEXT:    v_bfe_u32 v8, v5, 16, 1
16458; GFX10-NEXT:    v_or_b32_e32 v9, 0x400000, v0
16459; GFX10-NEXT:    v_or_b32_e32 v10, 0x400000, v5
16460; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
16461; GFX10-NEXT:    v_add3_u32 v7, v7, v0, 0x7fff
16462; GFX10-NEXT:    v_add3_u32 v8, v8, v5, 0x7fff
16463; GFX10-NEXT:    v_cmp_u_f32_e64 s4, v0, v0
16464; GFX10-NEXT:    v_cndmask_b32_e32 v5, v8, v10, vcc_lo
16465; GFX10-NEXT:    v_cndmask_b32_e64 v0, v7, v9, s4
16466; GFX10-NEXT:    v_perm_b32 v5, v5, v0, 0x7060302
16467; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
16468; GFX10-NEXT:    flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
16469; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
16470; GFX10-NEXT:    buffer_gl1_inv
16471; GFX10-NEXT:    buffer_gl0_inv
16472; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v6
16473; GFX10-NEXT:    s_or_b32 s5, vcc_lo, s5
16474; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s5
16475; GFX10-NEXT:    s_cbranch_execnz .LBB60_1
16476; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
16477; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s5
16478; GFX10-NEXT:    s_setpc_b64 s[30:31]
16479;
16480; GFX90A-LABEL: flat_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
16481; GFX90A:       ; %bb.0:
16482; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
16483; GFX90A-NEXT:    flat_load_dword v3, v[0:1] offset:2044
16484; GFX90A-NEXT:    s_mov_b64 s[6:7], 0
16485; GFX90A-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
16486; GFX90A-NEXT:    s_movk_i32 s8, 0x7fff
16487; GFX90A-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
16488; GFX90A-NEXT:    s_mov_b32 s9, 0x7060302
16489; GFX90A-NEXT:  .LBB60_1: ; %atomicrmw.start
16490; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
16491; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
16492; GFX90A-NEXT:    v_mov_b32_e32 v7, v3
16493; GFX90A-NEXT:    v_lshlrev_b32_e32 v3, 16, v7
16494; GFX90A-NEXT:    v_and_b32_e32 v5, 0xffff0000, v7
16495; GFX90A-NEXT:    v_min_f32_e32 v3, v3, v4
16496; GFX90A-NEXT:    v_min_f32_e32 v5, v5, v2
16497; GFX90A-NEXT:    v_bfe_u32 v6, v3, 16, 1
16498; GFX90A-NEXT:    v_bfe_u32 v9, v5, 16, 1
16499; GFX90A-NEXT:    v_or_b32_e32 v8, 0x400000, v3
16500; GFX90A-NEXT:    v_or_b32_e32 v10, 0x400000, v5
16501; GFX90A-NEXT:    v_add3_u32 v6, v6, v3, s8
16502; GFX90A-NEXT:    v_add3_u32 v9, v9, v5, s8
16503; GFX90A-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
16504; GFX90A-NEXT:    v_cmp_u_f32_e64 s[4:5], v3, v3
16505; GFX90A-NEXT:    v_cndmask_b32_e64 v3, v6, v8, s[4:5]
16506; GFX90A-NEXT:    v_cndmask_b32_e32 v5, v9, v10, vcc
16507; GFX90A-NEXT:    v_perm_b32 v6, v5, v3, s9
16508; GFX90A-NEXT:    buffer_wbl2
16509; GFX90A-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[6:7] offset:2044 glc
16510; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
16511; GFX90A-NEXT:    buffer_invl2
16512; GFX90A-NEXT:    buffer_wbinvl1
16513; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v7
16514; GFX90A-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
16515; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[6:7]
16516; GFX90A-NEXT:    s_cbranch_execnz .LBB60_1
16517; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
16518; GFX90A-NEXT:    s_or_b64 exec, exec, s[6:7]
16519; GFX90A-NEXT:    v_mov_b32_e32 v0, v3
16520; GFX90A-NEXT:    s_setpc_b64 s[30:31]
16521;
16522; GFX908-LABEL: flat_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
16523; GFX908:       ; %bb.0:
16524; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
16525; GFX908-NEXT:    flat_load_dword v3, v[0:1] offset:2044
16526; GFX908-NEXT:    s_mov_b64 s[6:7], 0
16527; GFX908-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
16528; GFX908-NEXT:    s_movk_i32 s8, 0x7fff
16529; GFX908-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
16530; GFX908-NEXT:    s_mov_b32 s9, 0x7060302
16531; GFX908-NEXT:  .LBB60_1: ; %atomicrmw.start
16532; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
16533; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
16534; GFX908-NEXT:    v_mov_b32_e32 v6, v3
16535; GFX908-NEXT:    v_lshlrev_b32_e32 v3, 16, v6
16536; GFX908-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
16537; GFX908-NEXT:    v_min_f32_e32 v3, v3, v4
16538; GFX908-NEXT:    v_min_f32_e32 v5, v5, v2
16539; GFX908-NEXT:    v_bfe_u32 v7, v3, 16, 1
16540; GFX908-NEXT:    v_bfe_u32 v9, v5, 16, 1
16541; GFX908-NEXT:    v_or_b32_e32 v8, 0x400000, v3
16542; GFX908-NEXT:    v_or_b32_e32 v10, 0x400000, v5
16543; GFX908-NEXT:    v_add3_u32 v7, v7, v3, s8
16544; GFX908-NEXT:    v_add3_u32 v9, v9, v5, s8
16545; GFX908-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
16546; GFX908-NEXT:    v_cmp_u_f32_e64 s[4:5], v3, v3
16547; GFX908-NEXT:    v_cndmask_b32_e64 v3, v7, v8, s[4:5]
16548; GFX908-NEXT:    v_cndmask_b32_e32 v5, v9, v10, vcc
16549; GFX908-NEXT:    v_perm_b32 v5, v5, v3, s9
16550; GFX908-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[5:6] offset:2044 glc
16551; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
16552; GFX908-NEXT:    buffer_wbinvl1
16553; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v6
16554; GFX908-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
16555; GFX908-NEXT:    s_andn2_b64 exec, exec, s[6:7]
16556; GFX908-NEXT:    s_cbranch_execnz .LBB60_1
16557; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
16558; GFX908-NEXT:    s_or_b64 exec, exec, s[6:7]
16559; GFX908-NEXT:    v_mov_b32_e32 v0, v3
16560; GFX908-NEXT:    s_setpc_b64 s[30:31]
16561;
16562; GFX8-LABEL: flat_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
16563; GFX8:       ; %bb.0:
16564; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
16565; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x7fc, v0
16566; GFX8-NEXT:    v_addc_u32_e32 v4, vcc, 0, v1, vcc
16567; GFX8-NEXT:    flat_load_dword v0, v[3:4]
16568; GFX8-NEXT:    s_mov_b64 s[6:7], 0
16569; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
16570; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
16571; GFX8-NEXT:  .LBB60_1: ; %atomicrmw.start
16572; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
16573; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
16574; GFX8-NEXT:    v_mov_b32_e32 v6, v0
16575; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v6
16576; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
16577; GFX8-NEXT:    v_min_f32_e32 v0, v0, v1
16578; GFX8-NEXT:    v_min_f32_e32 v5, v5, v2
16579; GFX8-NEXT:    v_bfe_u32 v7, v0, 16, 1
16580; GFX8-NEXT:    v_bfe_u32 v9, v5, 16, 1
16581; GFX8-NEXT:    v_add_u32_e32 v7, vcc, v7, v0
16582; GFX8-NEXT:    v_add_u32_e32 v9, vcc, v9, v5
16583; GFX8-NEXT:    v_add_u32_e32 v7, vcc, 0x7fff, v7
16584; GFX8-NEXT:    v_add_u32_e32 v9, vcc, 0x7fff, v9
16585; GFX8-NEXT:    v_or_b32_e32 v10, 0x400000, v5
16586; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
16587; GFX8-NEXT:    v_or_b32_e32 v8, 0x400000, v0
16588; GFX8-NEXT:    v_cmp_u_f32_e64 s[4:5], v0, v0
16589; GFX8-NEXT:    v_cndmask_b32_e32 v5, v9, v10, vcc
16590; GFX8-NEXT:    v_cndmask_b32_e64 v0, v7, v8, s[4:5]
16591; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
16592; GFX8-NEXT:    v_alignbit_b32 v5, v5, v0, 16
16593; GFX8-NEXT:    flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
16594; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
16595; GFX8-NEXT:    buffer_wbinvl1
16596; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v6
16597; GFX8-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
16598; GFX8-NEXT:    s_andn2_b64 exec, exec, s[6:7]
16599; GFX8-NEXT:    s_cbranch_execnz .LBB60_1
16600; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
16601; GFX8-NEXT:    s_or_b64 exec, exec, s[6:7]
16602; GFX8-NEXT:    s_setpc_b64 s[30:31]
16603;
16604; GFX7-LABEL: flat_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
16605; GFX7:       ; %bb.0:
16606; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
16607; GFX7-NEXT:    v_add_i32_e32 v4, vcc, 0x7fc, v0
16608; GFX7-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
16609; GFX7-NEXT:    flat_load_dword v0, v[4:5]
16610; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v3
16611; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v2
16612; GFX7-NEXT:    s_mov_b64 s[4:5], 0
16613; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v1
16614; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
16615; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
16616; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v0
16617; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
16618; GFX7-NEXT:  .LBB60_1: ; %atomicrmw.start
16619; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
16620; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
16621; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
16622; GFX7-NEXT:    v_and_b32_e32 v6, 0xffff0000, v1
16623; GFX7-NEXT:    v_and_b32_e32 v7, 0xffff0000, v0
16624; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
16625; GFX7-NEXT:    v_min_f32_e32 v6, v6, v2
16626; GFX7-NEXT:    v_min_f32_e32 v7, v7, v3
16627; GFX7-NEXT:    v_alignbit_b32 v1, v1, v0, 16
16628; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 16, v6
16629; GFX7-NEXT:    v_alignbit_b32 v0, v0, v7, 16
16630; GFX7-NEXT:    flat_atomic_cmpswap v0, v[4:5], v[0:1] glc
16631; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
16632; GFX7-NEXT:    buffer_wbinvl1
16633; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
16634; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v0
16635; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
16636; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
16637; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
16638; GFX7-NEXT:    s_cbranch_execnz .LBB60_1
16639; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
16640; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
16641; GFX7-NEXT:    s_setpc_b64 s[30:31]
16642  %gep = getelementptr <2 x bfloat>, ptr %ptr, i64 511
16643  %result = atomicrmw fmin ptr %gep, <2 x bfloat> %val seq_cst, !amdgpu.no.fine.grained.memory !0
16644  ret <2 x bfloat> %result
16645}
16646
; No-return variant: the atomicrmw result (%unused) is discarded and the function returns void.
; For every run line, the checks below expect the <2 x bfloat> fmin to be emitted as a
; flat_atomic_cmpswap retry loop (%atomicrmw.start / %atomicrmw.end), not a single atomic-min instruction.
16647define void @flat_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr %ptr, <2 x bfloat> %val) #0 {
16648; GFX12-LABEL: flat_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
16649; GFX12:       ; %bb.0:
16650; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
16651; GFX12-NEXT:    s_wait_expcnt 0x0
16652; GFX12-NEXT:    s_wait_samplecnt 0x0
16653; GFX12-NEXT:    s_wait_bvhcnt 0x0
16654; GFX12-NEXT:    s_wait_kmcnt 0x0
16655; GFX12-NEXT:    flat_load_b32 v3, v[0:1] offset:2044
16656; GFX12-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
16657; GFX12-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
16658; GFX12-NEXT:    s_mov_b32 s1, 0
16659; GFX12-NEXT:  .LBB61_1: ; %atomicrmw.start
16660; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
16661; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
16662; GFX12-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
16663; GFX12-NEXT:    v_and_b32_e32 v6, 0xffff0000, v3
16664; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
16665; GFX12-NEXT:    v_min_num_f32_e32 v2, v2, v4
16666; GFX12-NEXT:    v_min_num_f32_e32 v6, v6, v5
16667; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
16668; GFX12-NEXT:    v_bfe_u32 v7, v2, 16, 1
16669; GFX12-NEXT:    v_bfe_u32 v8, v6, 16, 1
16670; GFX12-NEXT:    v_or_b32_e32 v9, 0x400000, v2
16671; GFX12-NEXT:    v_or_b32_e32 v10, 0x400000, v6
16672; GFX12-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
16673; GFX12-NEXT:    v_add3_u32 v7, v7, v2, 0x7fff
16674; GFX12-NEXT:    v_add3_u32 v8, v8, v6, 0x7fff
16675; GFX12-NEXT:    v_cmp_u_f32_e64 s0, v2, v2
16676; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
16677; GFX12-NEXT:    v_cndmask_b32_e32 v6, v8, v10, vcc_lo
16678; GFX12-NEXT:    v_cndmask_b32_e64 v2, v7, v9, s0
16679; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
16680; GFX12-NEXT:    v_perm_b32 v2, v6, v2, 0x7060302
16681; GFX12-NEXT:    global_wb scope:SCOPE_SYS
16682; GFX12-NEXT:    s_wait_storecnt 0x0
16683; GFX12-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
16684; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
16685; GFX12-NEXT:    global_inv scope:SCOPE_SYS
16686; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
16687; GFX12-NEXT:    v_mov_b32_e32 v3, v2
16688; GFX12-NEXT:    s_wait_alu 0xfffe
16689; GFX12-NEXT:    s_or_b32 s1, vcc_lo, s1
16690; GFX12-NEXT:    s_wait_alu 0xfffe
16691; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
16692; GFX12-NEXT:    s_cbranch_execnz .LBB61_1
16693; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
16694; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s1
16695; GFX12-NEXT:    s_wait_alu 0xfffe
16696; GFX12-NEXT:    s_setpc_b64 s[30:31]
16697;
16698; GFX940-LABEL: flat_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
16699; GFX940:       ; %bb.0:
16700; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
16701; GFX940-NEXT:    flat_load_dword v3, v[0:1] offset:2044
16702; GFX940-NEXT:    s_mov_b64 s[2:3], 0
16703; GFX940-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
16704; GFX940-NEXT:    s_movk_i32 s4, 0x7fff
16705; GFX940-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
16706; GFX940-NEXT:    s_mov_b32 s5, 0x7060302
16707; GFX940-NEXT:  .LBB61_1: ; %atomicrmw.start
16708; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
16709; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
16710; GFX940-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
16711; GFX940-NEXT:    v_and_b32_e32 v6, 0xffff0000, v3
16712; GFX940-NEXT:    v_min_f32_e32 v2, v2, v4
16713; GFX940-NEXT:    v_min_f32_e32 v6, v6, v5
16714; GFX940-NEXT:    v_bfe_u32 v7, v2, 16, 1
16715; GFX940-NEXT:    v_bfe_u32 v9, v6, 16, 1
16716; GFX940-NEXT:    v_or_b32_e32 v8, 0x400000, v2
16717; GFX940-NEXT:    v_or_b32_e32 v10, 0x400000, v6
16718; GFX940-NEXT:    v_add3_u32 v7, v7, v2, s4
16719; GFX940-NEXT:    v_add3_u32 v9, v9, v6, s4
16720; GFX940-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
16721; GFX940-NEXT:    v_cmp_u_f32_e64 s[0:1], v2, v2
16722; GFX940-NEXT:    s_nop 0
16723; GFX940-NEXT:    v_cndmask_b32_e32 v6, v9, v10, vcc
16724; GFX940-NEXT:    v_cndmask_b32_e64 v2, v7, v8, s[0:1]
16725; GFX940-NEXT:    v_perm_b32 v2, v6, v2, s5
16726; GFX940-NEXT:    buffer_wbl2 sc0 sc1
16727; GFX940-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 sc1
16728; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
16729; GFX940-NEXT:    buffer_inv sc0 sc1
16730; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
16731; GFX940-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
16732; GFX940-NEXT:    v_mov_b32_e32 v3, v2
16733; GFX940-NEXT:    s_andn2_b64 exec, exec, s[2:3]
16734; GFX940-NEXT:    s_cbranch_execnz .LBB61_1
16735; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
16736; GFX940-NEXT:    s_or_b64 exec, exec, s[2:3]
16737; GFX940-NEXT:    s_setpc_b64 s[30:31]
16738;
16739; GFX11-LABEL: flat_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
16740; GFX11:       ; %bb.0:
16741; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
16742; GFX11-NEXT:    flat_load_b32 v3, v[0:1] offset:2044
16743; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
16744; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
16745; GFX11-NEXT:    s_mov_b32 s1, 0
16746; GFX11-NEXT:    s_set_inst_prefetch_distance 0x1
16747; GFX11-NEXT:    .p2align 6
16748; GFX11-NEXT:  .LBB61_1: ; %atomicrmw.start
16749; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
16750; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
16751; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
16752; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff0000, v3
16753; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
16754; GFX11-NEXT:    v_min_f32_e32 v2, v2, v4
16755; GFX11-NEXT:    v_min_f32_e32 v6, v6, v5
16756; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
16757; GFX11-NEXT:    v_bfe_u32 v7, v2, 16, 1
16758; GFX11-NEXT:    v_bfe_u32 v8, v6, 16, 1
16759; GFX11-NEXT:    v_or_b32_e32 v9, 0x400000, v2
16760; GFX11-NEXT:    v_or_b32_e32 v10, 0x400000, v6
16761; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
16762; GFX11-NEXT:    v_add3_u32 v7, v7, v2, 0x7fff
16763; GFX11-NEXT:    v_add3_u32 v8, v8, v6, 0x7fff
16764; GFX11-NEXT:    v_cmp_u_f32_e64 s0, v2, v2
16765; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
16766; GFX11-NEXT:    v_cndmask_b32_e32 v6, v8, v10, vcc_lo
16767; GFX11-NEXT:    v_cndmask_b32_e64 v2, v7, v9, s0
16768; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
16769; GFX11-NEXT:    v_perm_b32 v2, v6, v2, 0x7060302
16770; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
16771; GFX11-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 glc
16772; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
16773; GFX11-NEXT:    buffer_gl1_inv
16774; GFX11-NEXT:    buffer_gl0_inv
16775; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
16776; GFX11-NEXT:    v_mov_b32_e32 v3, v2
16777; GFX11-NEXT:    s_or_b32 s1, vcc_lo, s1
16778; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
16779; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
16780; GFX11-NEXT:    s_cbranch_execnz .LBB61_1
16781; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
16782; GFX11-NEXT:    s_set_inst_prefetch_distance 0x2
16783; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s1
16784; GFX11-NEXT:    s_setpc_b64 s[30:31]
16785;
16786; GFX10-LABEL: flat_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
16787; GFX10:       ; %bb.0:
16788; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
16789; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, 0x7fc, v0
16790; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
16791; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
16792; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
16793; GFX10-NEXT:    s_mov_b32 s5, 0
16794; GFX10-NEXT:    flat_load_dword v3, v[0:1]
16795; GFX10-NEXT:  .LBB61_1: ; %atomicrmw.start
16796; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
16797; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
16798; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
16799; GFX10-NEXT:    v_and_b32_e32 v6, 0xffff0000, v3
16800; GFX10-NEXT:    v_min_f32_e32 v2, v2, v4
16801; GFX10-NEXT:    v_min_f32_e32 v6, v6, v5
16802; GFX10-NEXT:    v_bfe_u32 v7, v2, 16, 1
16803; GFX10-NEXT:    v_bfe_u32 v8, v6, 16, 1
16804; GFX10-NEXT:    v_or_b32_e32 v9, 0x400000, v2
16805; GFX10-NEXT:    v_or_b32_e32 v10, 0x400000, v6
16806; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
16807; GFX10-NEXT:    v_add3_u32 v7, v7, v2, 0x7fff
16808; GFX10-NEXT:    v_add3_u32 v8, v8, v6, 0x7fff
16809; GFX10-NEXT:    v_cmp_u_f32_e64 s4, v2, v2
16810; GFX10-NEXT:    v_cndmask_b32_e32 v6, v8, v10, vcc_lo
16811; GFX10-NEXT:    v_cndmask_b32_e64 v2, v7, v9, s4
16812; GFX10-NEXT:    v_perm_b32 v2, v6, v2, 0x7060302
16813; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
16814; GFX10-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
16815; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
16816; GFX10-NEXT:    buffer_gl1_inv
16817; GFX10-NEXT:    buffer_gl0_inv
16818; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
16819; GFX10-NEXT:    v_mov_b32_e32 v3, v2
16820; GFX10-NEXT:    s_or_b32 s5, vcc_lo, s5
16821; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s5
16822; GFX10-NEXT:    s_cbranch_execnz .LBB61_1
16823; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
16824; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s5
16825; GFX10-NEXT:    s_setpc_b64 s[30:31]
16826;
16827; GFX90A-LABEL: flat_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
16828; GFX90A:       ; %bb.0:
16829; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
16830; GFX90A-NEXT:    flat_load_dword v3, v[0:1] offset:2044
16831; GFX90A-NEXT:    s_mov_b64 s[6:7], 0
16832; GFX90A-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
16833; GFX90A-NEXT:    s_movk_i32 s8, 0x7fff
16834; GFX90A-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
16835; GFX90A-NEXT:    s_mov_b32 s9, 0x7060302
16836; GFX90A-NEXT:  .LBB61_1: ; %atomicrmw.start
16837; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
16838; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
16839; GFX90A-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
16840; GFX90A-NEXT:    v_and_b32_e32 v6, 0xffff0000, v3
16841; GFX90A-NEXT:    v_min_f32_e32 v2, v2, v4
16842; GFX90A-NEXT:    v_min_f32_e32 v6, v6, v5
16843; GFX90A-NEXT:    v_bfe_u32 v7, v2, 16, 1
16844; GFX90A-NEXT:    v_bfe_u32 v9, v6, 16, 1
16845; GFX90A-NEXT:    v_or_b32_e32 v8, 0x400000, v2
16846; GFX90A-NEXT:    v_or_b32_e32 v10, 0x400000, v6
16847; GFX90A-NEXT:    v_add3_u32 v7, v7, v2, s8
16848; GFX90A-NEXT:    v_add3_u32 v9, v9, v6, s8
16849; GFX90A-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
16850; GFX90A-NEXT:    v_cmp_u_f32_e64 s[4:5], v2, v2
16851; GFX90A-NEXT:    v_cndmask_b32_e64 v2, v7, v8, s[4:5]
16852; GFX90A-NEXT:    v_cndmask_b32_e32 v6, v9, v10, vcc
16853; GFX90A-NEXT:    v_perm_b32 v2, v6, v2, s9
16854; GFX90A-NEXT:    buffer_wbl2
16855; GFX90A-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc
16856; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
16857; GFX90A-NEXT:    buffer_invl2
16858; GFX90A-NEXT:    buffer_wbinvl1
16859; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
16860; GFX90A-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
16861; GFX90A-NEXT:    v_mov_b32_e32 v3, v2
16862; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[6:7]
16863; GFX90A-NEXT:    s_cbranch_execnz .LBB61_1
16864; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
16865; GFX90A-NEXT:    s_or_b64 exec, exec, s[6:7]
16866; GFX90A-NEXT:    s_setpc_b64 s[30:31]
16867;
16868; GFX908-LABEL: flat_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
16869; GFX908:       ; %bb.0:
16870; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
16871; GFX908-NEXT:    flat_load_dword v3, v[0:1] offset:2044
16872; GFX908-NEXT:    s_mov_b64 s[6:7], 0
16873; GFX908-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
16874; GFX908-NEXT:    s_movk_i32 s8, 0x7fff
16875; GFX908-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
16876; GFX908-NEXT:    s_mov_b32 s9, 0x7060302
16877; GFX908-NEXT:  .LBB61_1: ; %atomicrmw.start
16878; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
16879; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
16880; GFX908-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
16881; GFX908-NEXT:    v_and_b32_e32 v6, 0xffff0000, v3
16882; GFX908-NEXT:    v_min_f32_e32 v2, v2, v4
16883; GFX908-NEXT:    v_min_f32_e32 v6, v6, v5
16884; GFX908-NEXT:    v_bfe_u32 v7, v2, 16, 1
16885; GFX908-NEXT:    v_bfe_u32 v9, v6, 16, 1
16886; GFX908-NEXT:    v_or_b32_e32 v8, 0x400000, v2
16887; GFX908-NEXT:    v_or_b32_e32 v10, 0x400000, v6
16888; GFX908-NEXT:    v_add3_u32 v7, v7, v2, s8
16889; GFX908-NEXT:    v_add3_u32 v9, v9, v6, s8
16890; GFX908-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
16891; GFX908-NEXT:    v_cmp_u_f32_e64 s[4:5], v2, v2
16892; GFX908-NEXT:    v_cndmask_b32_e64 v2, v7, v8, s[4:5]
16893; GFX908-NEXT:    v_cndmask_b32_e32 v6, v9, v10, vcc
16894; GFX908-NEXT:    v_perm_b32 v2, v6, v2, s9
16895; GFX908-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc
16896; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
16897; GFX908-NEXT:    buffer_wbinvl1
16898; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
16899; GFX908-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
16900; GFX908-NEXT:    v_mov_b32_e32 v3, v2
16901; GFX908-NEXT:    s_andn2_b64 exec, exec, s[6:7]
16902; GFX908-NEXT:    s_cbranch_execnz .LBB61_1
16903; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
16904; GFX908-NEXT:    s_or_b64 exec, exec, s[6:7]
16905; GFX908-NEXT:    s_setpc_b64 s[30:31]
16906;
16907; GFX8-LABEL: flat_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
16908; GFX8:       ; %bb.0:
16909; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
16910; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 0x7fc, v0
16911; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
16912; GFX8-NEXT:    flat_load_dword v3, v[0:1]
16913; GFX8-NEXT:    s_mov_b64 s[6:7], 0
16914; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
16915; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
16916; GFX8-NEXT:  .LBB61_1: ; %atomicrmw.start
16917; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
16918; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
16919; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
16920; GFX8-NEXT:    v_and_b32_e32 v6, 0xffff0000, v3
16921; GFX8-NEXT:    v_min_f32_e32 v2, v2, v4
16922; GFX8-NEXT:    v_min_f32_e32 v6, v6, v5
16923; GFX8-NEXT:    v_bfe_u32 v7, v2, 16, 1
16924; GFX8-NEXT:    v_bfe_u32 v9, v6, 16, 1
16925; GFX8-NEXT:    v_add_u32_e32 v7, vcc, v7, v2
16926; GFX8-NEXT:    v_add_u32_e32 v9, vcc, v9, v6
16927; GFX8-NEXT:    v_add_u32_e32 v7, vcc, 0x7fff, v7
16928; GFX8-NEXT:    v_add_u32_e32 v9, vcc, 0x7fff, v9
16929; GFX8-NEXT:    v_or_b32_e32 v10, 0x400000, v6
16930; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
16931; GFX8-NEXT:    v_or_b32_e32 v8, 0x400000, v2
16932; GFX8-NEXT:    v_cmp_u_f32_e64 s[4:5], v2, v2
16933; GFX8-NEXT:    v_cndmask_b32_e32 v6, v9, v10, vcc
16934; GFX8-NEXT:    v_cndmask_b32_e64 v2, v7, v8, s[4:5]
16935; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
16936; GFX8-NEXT:    v_alignbit_b32 v2, v6, v2, 16
16937; GFX8-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
16938; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
16939; GFX8-NEXT:    buffer_wbinvl1
16940; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
16941; GFX8-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
16942; GFX8-NEXT:    v_mov_b32_e32 v3, v2
16943; GFX8-NEXT:    s_andn2_b64 exec, exec, s[6:7]
16944; GFX8-NEXT:    s_cbranch_execnz .LBB61_1
16945; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
16946; GFX8-NEXT:    s_or_b64 exec, exec, s[6:7]
16947; GFX8-NEXT:    s_setpc_b64 s[30:31]
16948;
16949; GFX7-LABEL: flat_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
16950; GFX7:       ; %bb.0:
16951; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
16952; GFX7-NEXT:    v_add_i32_e32 v0, vcc, 0x7fc, v0
16953; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
16954; GFX7-NEXT:    flat_load_dword v4, v[0:1]
16955; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v3
16956; GFX7-NEXT:    v_mul_f32_e32 v6, 1.0, v2
16957; GFX7-NEXT:    s_mov_b64 s[4:5], 0
16958; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v3
16959; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v6
16960; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
16961; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v4
16962; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
16963; GFX7-NEXT:  .LBB61_1: ; %atomicrmw.start
16964; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
16965; GFX7-NEXT:    v_mul_f32_e32 v5, 1.0, v5
16966; GFX7-NEXT:    v_mul_f32_e32 v4, 1.0, v4
16967; GFX7-NEXT:    v_and_b32_e32 v6, 0xffff0000, v5
16968; GFX7-NEXT:    v_and_b32_e32 v7, 0xffff0000, v4
16969; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
16970; GFX7-NEXT:    v_min_f32_e32 v6, v6, v2
16971; GFX7-NEXT:    v_min_f32_e32 v7, v7, v3
16972; GFX7-NEXT:    v_alignbit_b32 v5, v5, v4, 16
16973; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 16, v6
16974; GFX7-NEXT:    v_alignbit_b32 v4, v4, v7, 16
16975; GFX7-NEXT:    flat_atomic_cmpswap v4, v[0:1], v[4:5] glc
16976; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
16977; GFX7-NEXT:    buffer_wbinvl1
16978; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v5
16979; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v4
16980; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
16981; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
16982; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
16983; GFX7-NEXT:    s_cbranch_execnz .LBB61_1
16984; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
16985; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
16986; GFX7-NEXT:    s_setpc_b64 s[30:31]
; Index 511 over 4-byte <2 x bfloat> elements is byte offset 2044 (0x7fc). The checks above
; show it either folded into the memory instructions as offset:2044, or applied up front
; with a 64-bit add of 0x7fc to the pointer (the GFX10, GFX8 and GFX7 runs).
16987  %gep = getelementptr <2 x bfloat>, ptr %ptr, i64 511
; seq_cst fmin with no syncscope; the !amdgpu.no.fine.grained.memory annotation is attached via !0.
16988  %unused = atomicrmw fmin ptr %gep, <2 x bfloat> %val seq_cst, !amdgpu.no.fine.grained.memory !0
16989  ret void
16990}
16991
; Attribute group referenced as #0 by the test functions visible in this part of the file.
16992attributes #0 = { nounwind }
; NOTE(review): #1 adds an f32 denormal mode of preserve-sign (the value lists the output mode,
; then the input mode); its users are not visible in this part of the file - confirm it is still referenced.
16993attributes #1 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" }
16994
; Empty metadata node used as the operand of !amdgpu.no.fine.grained.memory on the atomicrmw instructions.
16995!0 = !{}
16996