xref: /llvm-project/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll (revision eeac0ffaf46cf9f9b0f680b9940cc4b68a0286d8)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
2; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX12 %s
3; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx940 < %s | FileCheck -check-prefix=GFX940 %s
4; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11 %s
5; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s
6; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx90a < %s | FileCheck -check-prefix=GFX90A %s
7; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx908 < %s | FileCheck -check-prefix=GFX908 %s
8; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=tonga < %s | FileCheck -check-prefix=GFX8 %s
9; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=hawaii < %s | FileCheck -check-prefix=GFX7 %s
10
11; TODO: Delete this file once the *-atomicrmw-fmax.ll tests have run lines that cover GlobalISel.
12
; Returning f32 fmax atomicrmw on an addrspace(3) pointer with seq_cst
; ordering; the previous memory value is handed back to the caller.
; The autogenerated assertions below expect a single returning DS max
; instruction on every target (ds_max_num_rtn_f32 for gfx1200, ds_max_rtn_f32
; for the others). tonga and hawaii additionally write -1 to m0 before the DS
; access.
13define float @local_atomic_fmax_ret_f32(ptr addrspace(3) %ptr, float %val) {
14; GFX12-LABEL: local_atomic_fmax_ret_f32:
15; GFX12:       ; %bb.0:
16; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
17; GFX12-NEXT:    s_wait_expcnt 0x0
18; GFX12-NEXT:    s_wait_samplecnt 0x0
19; GFX12-NEXT:    s_wait_bvhcnt 0x0
20; GFX12-NEXT:    s_wait_kmcnt 0x0
21; GFX12-NEXT:    s_wait_storecnt 0x0
22; GFX12-NEXT:    ds_max_num_rtn_f32 v0, v0, v1
23; GFX12-NEXT:    s_wait_dscnt 0x0
24; GFX12-NEXT:    global_inv scope:SCOPE_SE
25; GFX12-NEXT:    s_setpc_b64 s[30:31]
26;
27; GFX940-LABEL: local_atomic_fmax_ret_f32:
28; GFX940:       ; %bb.0:
29; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
30; GFX940-NEXT:    ds_max_rtn_f32 v0, v0, v1
31; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
32; GFX940-NEXT:    s_setpc_b64 s[30:31]
33;
34; GFX11-LABEL: local_atomic_fmax_ret_f32:
35; GFX11:       ; %bb.0:
36; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
37; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
38; GFX11-NEXT:    ds_max_rtn_f32 v0, v0, v1
39; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
40; GFX11-NEXT:    buffer_gl0_inv
41; GFX11-NEXT:    s_setpc_b64 s[30:31]
42;
43; GFX10-LABEL: local_atomic_fmax_ret_f32:
44; GFX10:       ; %bb.0:
45; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
46; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
47; GFX10-NEXT:    ds_max_rtn_f32 v0, v0, v1
48; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
49; GFX10-NEXT:    buffer_gl0_inv
50; GFX10-NEXT:    s_setpc_b64 s[30:31]
51;
52; GFX90A-LABEL: local_atomic_fmax_ret_f32:
53; GFX90A:       ; %bb.0:
54; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
55; GFX90A-NEXT:    ds_max_rtn_f32 v0, v0, v1
56; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
57; GFX90A-NEXT:    s_setpc_b64 s[30:31]
58;
59; GFX908-LABEL: local_atomic_fmax_ret_f32:
60; GFX908:       ; %bb.0:
61; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
62; GFX908-NEXT:    ds_max_rtn_f32 v0, v0, v1
63; GFX908-NEXT:    s_waitcnt lgkmcnt(0)
64; GFX908-NEXT:    s_setpc_b64 s[30:31]
65;
66; GFX8-LABEL: local_atomic_fmax_ret_f32:
67; GFX8:       ; %bb.0:
68; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
69; GFX8-NEXT:    s_mov_b32 m0, -1
70; GFX8-NEXT:    ds_max_rtn_f32 v0, v0, v1
71; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
72; GFX8-NEXT:    s_setpc_b64 s[30:31]
73;
74; GFX7-LABEL: local_atomic_fmax_ret_f32:
75; GFX7:       ; %bb.0:
76; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
77; GFX7-NEXT:    s_mov_b32 m0, -1
78; GFX7-NEXT:    ds_max_rtn_f32 v0, v0, v1
79; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
80; GFX7-NEXT:    s_setpc_b64 s[30:31]
81  %result = atomicrmw fmax ptr addrspace(3) %ptr, float %val seq_cst
82  ret float %result
83}
84
; f32 fmax atomicrmw on an addrspace(3) pointer with seq_cst ordering whose
; result is never used (the function returns void).
; The autogenerated assertions below expect the non-returning DS encodings:
; ds_max_num_f32 for gfx1200 and ds_max_f32 for the other targets. tonga and
; hawaii additionally write -1 to m0 before the DS access.
85define void @local_atomic_fmax_noret_f32(ptr addrspace(3) %ptr, float %val) {
86; GFX12-LABEL: local_atomic_fmax_noret_f32:
87; GFX12:       ; %bb.0:
88; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
89; GFX12-NEXT:    s_wait_expcnt 0x0
90; GFX12-NEXT:    s_wait_samplecnt 0x0
91; GFX12-NEXT:    s_wait_bvhcnt 0x0
92; GFX12-NEXT:    s_wait_kmcnt 0x0
93; GFX12-NEXT:    s_wait_storecnt 0x0
94; GFX12-NEXT:    ds_max_num_f32 v0, v1
95; GFX12-NEXT:    s_wait_dscnt 0x0
96; GFX12-NEXT:    global_inv scope:SCOPE_SE
97; GFX12-NEXT:    s_setpc_b64 s[30:31]
98;
99; GFX940-LABEL: local_atomic_fmax_noret_f32:
100; GFX940:       ; %bb.0:
101; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
102; GFX940-NEXT:    ds_max_f32 v0, v1
103; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
104; GFX940-NEXT:    s_setpc_b64 s[30:31]
105;
106; GFX11-LABEL: local_atomic_fmax_noret_f32:
107; GFX11:       ; %bb.0:
108; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
109; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
110; GFX11-NEXT:    ds_max_f32 v0, v1
111; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
112; GFX11-NEXT:    buffer_gl0_inv
113; GFX11-NEXT:    s_setpc_b64 s[30:31]
114;
115; GFX10-LABEL: local_atomic_fmax_noret_f32:
116; GFX10:       ; %bb.0:
117; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
118; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
119; GFX10-NEXT:    ds_max_f32 v0, v1
120; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
121; GFX10-NEXT:    buffer_gl0_inv
122; GFX10-NEXT:    s_setpc_b64 s[30:31]
123;
124; GFX90A-LABEL: local_atomic_fmax_noret_f32:
125; GFX90A:       ; %bb.0:
126; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
127; GFX90A-NEXT:    ds_max_f32 v0, v1
128; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
129; GFX90A-NEXT:    s_setpc_b64 s[30:31]
130;
131; GFX908-LABEL: local_atomic_fmax_noret_f32:
132; GFX908:       ; %bb.0:
133; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
134; GFX908-NEXT:    ds_max_f32 v0, v1
135; GFX908-NEXT:    s_waitcnt lgkmcnt(0)
136; GFX908-NEXT:    s_setpc_b64 s[30:31]
137;
138; GFX8-LABEL: local_atomic_fmax_noret_f32:
139; GFX8:       ; %bb.0:
140; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
141; GFX8-NEXT:    s_mov_b32 m0, -1
142; GFX8-NEXT:    ds_max_f32 v0, v1
143; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
144; GFX8-NEXT:    s_setpc_b64 s[30:31]
145;
146; GFX7-LABEL: local_atomic_fmax_noret_f32:
147; GFX7:       ; %bb.0:
148; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
149; GFX7-NEXT:    s_mov_b32 m0, -1
150; GFX7-NEXT:    ds_max_f32 v0, v1
151; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
152; GFX7-NEXT:    s_setpc_b64 s[30:31]
153  %unused = atomicrmw fmax ptr addrspace(3) %ptr, float %val seq_cst
154  ret void
155}
156
; Returning f64 fmax atomicrmw on an addrspace(3) pointer with seq_cst
; ordering; the previous memory value is handed back to the caller.
; The autogenerated assertions below expect a single returning DS max
; instruction on every target (ds_max_num_rtn_f64 for gfx1200, ds_max_rtn_f64
; for the others). For gfx940 and gfx90a the value operand is first copied
; from v[1:2] into v[4:5].
; NOTE(review): the copy is presumably needed because those targets want an
; even-aligned 64-bit register pair -- confirm against the target docs.
157define double @local_atomic_fmax_ret_f64(ptr addrspace(3) %ptr, double %val) {
158; GFX12-LABEL: local_atomic_fmax_ret_f64:
159; GFX12:       ; %bb.0:
160; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
161; GFX12-NEXT:    s_wait_expcnt 0x0
162; GFX12-NEXT:    s_wait_samplecnt 0x0
163; GFX12-NEXT:    s_wait_bvhcnt 0x0
164; GFX12-NEXT:    s_wait_kmcnt 0x0
165; GFX12-NEXT:    s_wait_storecnt 0x0
166; GFX12-NEXT:    ds_max_num_rtn_f64 v[0:1], v0, v[1:2]
167; GFX12-NEXT:    s_wait_dscnt 0x0
168; GFX12-NEXT:    global_inv scope:SCOPE_SE
169; GFX12-NEXT:    s_setpc_b64 s[30:31]
170;
171; GFX940-LABEL: local_atomic_fmax_ret_f64:
172; GFX940:       ; %bb.0:
173; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
174; GFX940-NEXT:    v_mov_b32_e32 v4, v1
175; GFX940-NEXT:    v_mov_b32_e32 v5, v2
176; GFX940-NEXT:    ds_max_rtn_f64 v[0:1], v0, v[4:5]
177; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
178; GFX940-NEXT:    s_setpc_b64 s[30:31]
179;
180; GFX11-LABEL: local_atomic_fmax_ret_f64:
181; GFX11:       ; %bb.0:
182; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
183; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
184; GFX11-NEXT:    ds_max_rtn_f64 v[0:1], v0, v[1:2]
185; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
186; GFX11-NEXT:    buffer_gl0_inv
187; GFX11-NEXT:    s_setpc_b64 s[30:31]
188;
189; GFX10-LABEL: local_atomic_fmax_ret_f64:
190; GFX10:       ; %bb.0:
191; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
192; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
193; GFX10-NEXT:    ds_max_rtn_f64 v[0:1], v0, v[1:2]
194; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
195; GFX10-NEXT:    buffer_gl0_inv
196; GFX10-NEXT:    s_setpc_b64 s[30:31]
197;
198; GFX90A-LABEL: local_atomic_fmax_ret_f64:
199; GFX90A:       ; %bb.0:
200; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
201; GFX90A-NEXT:    v_mov_b32_e32 v4, v1
202; GFX90A-NEXT:    v_mov_b32_e32 v5, v2
203; GFX90A-NEXT:    ds_max_rtn_f64 v[0:1], v0, v[4:5]
204; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
205; GFX90A-NEXT:    s_setpc_b64 s[30:31]
206;
207; GFX908-LABEL: local_atomic_fmax_ret_f64:
208; GFX908:       ; %bb.0:
209; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
210; GFX908-NEXT:    ds_max_rtn_f64 v[0:1], v0, v[1:2]
211; GFX908-NEXT:    s_waitcnt lgkmcnt(0)
212; GFX908-NEXT:    s_setpc_b64 s[30:31]
213;
214; GFX8-LABEL: local_atomic_fmax_ret_f64:
215; GFX8:       ; %bb.0:
216; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
217; GFX8-NEXT:    s_mov_b32 m0, -1
218; GFX8-NEXT:    ds_max_rtn_f64 v[0:1], v0, v[1:2]
219; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
220; GFX8-NEXT:    s_setpc_b64 s[30:31]
221;
222; GFX7-LABEL: local_atomic_fmax_ret_f64:
223; GFX7:       ; %bb.0:
224; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
225; GFX7-NEXT:    s_mov_b32 m0, -1
226; GFX7-NEXT:    ds_max_rtn_f64 v[0:1], v0, v[1:2]
227; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
228; GFX7-NEXT:    s_setpc_b64 s[30:31]
229  %result = atomicrmw fmax ptr addrspace(3) %ptr, double %val seq_cst
230  ret double %result
231}
232
; f64 fmax atomicrmw on an addrspace(3) pointer with seq_cst ordering whose
; result is never used (the function returns void).
; The autogenerated assertions below expect the non-returning DS encodings:
; ds_max_num_f64 for gfx1200 and ds_max_f64 for the other targets. For gfx940
; and gfx90a the value operand is first copied from v[1:2] into v[4:5].
; NOTE(review): the copy is presumably needed because those targets want an
; even-aligned 64-bit register pair -- confirm against the target docs.
233define void @local_atomic_fmax_noret_f64(ptr addrspace(3) %ptr, double %val) {
234; GFX12-LABEL: local_atomic_fmax_noret_f64:
235; GFX12:       ; %bb.0:
236; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
237; GFX12-NEXT:    s_wait_expcnt 0x0
238; GFX12-NEXT:    s_wait_samplecnt 0x0
239; GFX12-NEXT:    s_wait_bvhcnt 0x0
240; GFX12-NEXT:    s_wait_kmcnt 0x0
241; GFX12-NEXT:    s_wait_storecnt 0x0
242; GFX12-NEXT:    ds_max_num_f64 v0, v[1:2]
243; GFX12-NEXT:    s_wait_dscnt 0x0
244; GFX12-NEXT:    global_inv scope:SCOPE_SE
245; GFX12-NEXT:    s_setpc_b64 s[30:31]
246;
247; GFX940-LABEL: local_atomic_fmax_noret_f64:
248; GFX940:       ; %bb.0:
249; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
250; GFX940-NEXT:    v_mov_b32_e32 v4, v1
251; GFX940-NEXT:    v_mov_b32_e32 v5, v2
252; GFX940-NEXT:    ds_max_f64 v0, v[4:5]
253; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
254; GFX940-NEXT:    s_setpc_b64 s[30:31]
255;
256; GFX11-LABEL: local_atomic_fmax_noret_f64:
257; GFX11:       ; %bb.0:
258; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
259; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
260; GFX11-NEXT:    ds_max_f64 v0, v[1:2]
261; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
262; GFX11-NEXT:    buffer_gl0_inv
263; GFX11-NEXT:    s_setpc_b64 s[30:31]
264;
265; GFX10-LABEL: local_atomic_fmax_noret_f64:
266; GFX10:       ; %bb.0:
267; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
268; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
269; GFX10-NEXT:    ds_max_f64 v0, v[1:2]
270; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
271; GFX10-NEXT:    buffer_gl0_inv
272; GFX10-NEXT:    s_setpc_b64 s[30:31]
273;
274; GFX90A-LABEL: local_atomic_fmax_noret_f64:
275; GFX90A:       ; %bb.0:
276; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
277; GFX90A-NEXT:    v_mov_b32_e32 v4, v1
278; GFX90A-NEXT:    v_mov_b32_e32 v5, v2
279; GFX90A-NEXT:    ds_max_f64 v0, v[4:5]
280; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
281; GFX90A-NEXT:    s_setpc_b64 s[30:31]
282;
283; GFX908-LABEL: local_atomic_fmax_noret_f64:
284; GFX908:       ; %bb.0:
285; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
286; GFX908-NEXT:    ds_max_f64 v0, v[1:2]
287; GFX908-NEXT:    s_waitcnt lgkmcnt(0)
288; GFX908-NEXT:    s_setpc_b64 s[30:31]
289;
290; GFX8-LABEL: local_atomic_fmax_noret_f64:
291; GFX8:       ; %bb.0:
292; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
293; GFX8-NEXT:    s_mov_b32 m0, -1
294; GFX8-NEXT:    ds_max_f64 v0, v[1:2]
295; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
296; GFX8-NEXT:    s_setpc_b64 s[30:31]
297;
298; GFX7-LABEL: local_atomic_fmax_noret_f64:
299; GFX7:       ; %bb.0:
300; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
301; GFX7-NEXT:    s_mov_b32 m0, -1
302; GFX7-NEXT:    ds_max_f64 v0, v[1:2]
303; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
304; GFX7-NEXT:    s_setpc_b64 s[30:31]
305  %unused = atomicrmw fmax ptr addrspace(3) %ptr, double %val seq_cst
306  ret void
307}
308
; Returning f32 fmax atomicrmw on an addrspace(1) pointer at
; syncscope("agent") with seq_cst ordering, tagged with
; !amdgpu.no.fine.grained.memory (the !0 node is defined outside this
; function). The previous memory value is returned.
; The autogenerated assertions below expect two different shapes:
;   * gfx1200, gfx1100, gfx1010 and hawaii: one returning hardware atomic
;     (global_atomic_max_num_f32, global_atomic_max_f32, global_atomic_fmax
;     and buffer_atomic_fmax with addr64, respectively).
;   * gfx940, gfx90a, gfx908 and tonga: an initial load followed by an
;     %atomicrmw.start loop that recomputes the max and retries a cmpswap
;     until the value it returns equals the value that was expected. tonga
;     uses the flat_ load/cmpswap forms instead of the global_ ones.
309define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) {
310; GFX12-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory:
311; GFX12:       ; %bb.0:
312; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
313; GFX12-NEXT:    s_wait_expcnt 0x0
314; GFX12-NEXT:    s_wait_samplecnt 0x0
315; GFX12-NEXT:    s_wait_bvhcnt 0x0
316; GFX12-NEXT:    s_wait_kmcnt 0x0
317; GFX12-NEXT:    s_wait_storecnt 0x0
318; GFX12-NEXT:    global_atomic_max_num_f32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
319; GFX12-NEXT:    s_wait_loadcnt 0x0
320; GFX12-NEXT:    global_inv scope:SCOPE_DEV
321; GFX12-NEXT:    s_setpc_b64 s[30:31]
322;
323; GFX940-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory:
324; GFX940:       ; %bb.0:
325; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
326; GFX940-NEXT:    global_load_dword v3, v[0:1], off
327; GFX940-NEXT:    s_mov_b64 s[0:1], 0
328; GFX940-NEXT:    v_max_f32_e32 v2, v2, v2
329; GFX940-NEXT:  .LBB4_1: ; %atomicrmw.start
330; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
331; GFX940-NEXT:    s_waitcnt vmcnt(0)
332; GFX940-NEXT:    v_mov_b32_e32 v5, v3
333; GFX940-NEXT:    v_max_f32_e32 v3, v5, v5
334; GFX940-NEXT:    v_max_f32_e32 v4, v3, v2
335; GFX940-NEXT:    buffer_wbl2 sc1
336; GFX940-NEXT:    global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0
337; GFX940-NEXT:    s_waitcnt vmcnt(0)
338; GFX940-NEXT:    buffer_inv sc1
339; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
340; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
341; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
342; GFX940-NEXT:    s_cbranch_execnz .LBB4_1
343; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
344; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
345; GFX940-NEXT:    v_mov_b32_e32 v0, v3
346; GFX940-NEXT:    s_setpc_b64 s[30:31]
347;
348; GFX11-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory:
349; GFX11:       ; %bb.0:
350; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
351; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
352; GFX11-NEXT:    global_atomic_max_f32 v0, v[0:1], v2, off glc
353; GFX11-NEXT:    s_waitcnt vmcnt(0)
354; GFX11-NEXT:    buffer_gl1_inv
355; GFX11-NEXT:    buffer_gl0_inv
356; GFX11-NEXT:    s_setpc_b64 s[30:31]
357;
358; GFX10-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory:
359; GFX10:       ; %bb.0:
360; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
361; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
362; GFX10-NEXT:    global_atomic_fmax v0, v[0:1], v2, off glc
363; GFX10-NEXT:    s_waitcnt vmcnt(0)
364; GFX10-NEXT:    buffer_gl1_inv
365; GFX10-NEXT:    buffer_gl0_inv
366; GFX10-NEXT:    s_setpc_b64 s[30:31]
367;
368; GFX90A-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory:
369; GFX90A:       ; %bb.0:
370; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
371; GFX90A-NEXT:    global_load_dword v3, v[0:1], off
372; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
373; GFX90A-NEXT:    v_max_f32_e32 v2, v2, v2
374; GFX90A-NEXT:  .LBB4_1: ; %atomicrmw.start
375; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
376; GFX90A-NEXT:    s_waitcnt vmcnt(0)
377; GFX90A-NEXT:    v_mov_b32_e32 v5, v3
378; GFX90A-NEXT:    v_max_f32_e32 v3, v5, v5
379; GFX90A-NEXT:    v_max_f32_e32 v4, v3, v2
380; GFX90A-NEXT:    global_atomic_cmpswap v3, v[0:1], v[4:5], off glc
381; GFX90A-NEXT:    s_waitcnt vmcnt(0)
382; GFX90A-NEXT:    buffer_wbinvl1
383; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
384; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
385; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
386; GFX90A-NEXT:    s_cbranch_execnz .LBB4_1
387; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
388; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
389; GFX90A-NEXT:    v_mov_b32_e32 v0, v3
390; GFX90A-NEXT:    s_setpc_b64 s[30:31]
391;
392; GFX908-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory:
393; GFX908:       ; %bb.0:
394; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
395; GFX908-NEXT:    global_load_dword v3, v[0:1], off
396; GFX908-NEXT:    s_mov_b64 s[4:5], 0
397; GFX908-NEXT:    v_max_f32_e32 v2, v2, v2
398; GFX908-NEXT:  .LBB4_1: ; %atomicrmw.start
399; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
400; GFX908-NEXT:    s_waitcnt vmcnt(0)
401; GFX908-NEXT:    v_mov_b32_e32 v4, v3
402; GFX908-NEXT:    v_max_f32_e32 v3, v4, v4
403; GFX908-NEXT:    v_max_f32_e32 v3, v3, v2
404; GFX908-NEXT:    global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
405; GFX908-NEXT:    s_waitcnt vmcnt(0)
406; GFX908-NEXT:    buffer_wbinvl1
407; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
408; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
409; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
410; GFX908-NEXT:    s_cbranch_execnz .LBB4_1
411; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
412; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
413; GFX908-NEXT:    v_mov_b32_e32 v0, v3
414; GFX908-NEXT:    s_setpc_b64 s[30:31]
415;
416; GFX8-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory:
417; GFX8:       ; %bb.0:
418; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
419; GFX8-NEXT:    flat_load_dword v3, v[0:1]
420; GFX8-NEXT:    s_mov_b64 s[4:5], 0
421; GFX8-NEXT:    v_mul_f32_e32 v2, 1.0, v2
422; GFX8-NEXT:  .LBB4_1: ; %atomicrmw.start
423; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
424; GFX8-NEXT:    s_waitcnt vmcnt(0)
425; GFX8-NEXT:    v_mov_b32_e32 v4, v3
426; GFX8-NEXT:    v_mul_f32_e32 v3, 1.0, v4
427; GFX8-NEXT:    v_max_f32_e32 v3, v3, v2
428; GFX8-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
429; GFX8-NEXT:    s_waitcnt vmcnt(0)
430; GFX8-NEXT:    buffer_wbinvl1
431; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
432; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
433; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
434; GFX8-NEXT:    s_cbranch_execnz .LBB4_1
435; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
436; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
437; GFX8-NEXT:    v_mov_b32_e32 v0, v3
438; GFX8-NEXT:    s_setpc_b64 s[30:31]
439;
440; GFX7-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory:
441; GFX7:       ; %bb.0:
442; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
443; GFX7-NEXT:    s_mov_b32 s6, 0
444; GFX7-NEXT:    s_mov_b32 s7, 0xf000
445; GFX7-NEXT:    s_mov_b64 s[4:5], 0
446; GFX7-NEXT:    buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64 glc
447; GFX7-NEXT:    s_waitcnt vmcnt(0)
448; GFX7-NEXT:    buffer_wbinvl1
449; GFX7-NEXT:    v_mov_b32_e32 v0, v2
450; GFX7-NEXT:    s_setpc_b64 s[30:31]
451  %result = atomicrmw fmax ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
452  ret float %result
453}
454
; f32 fmax atomicrmw on an addrspace(1) pointer at syncscope("agent") with
; seq_cst ordering, tagged with !amdgpu.no.fine.grained.memory (the !0 node is
; defined outside this function). The result is never used; the function
; returns void.
; The autogenerated assertions below expect two different shapes:
;   * gfx1200, gfx1100, gfx1010 and hawaii: one non-returning hardware atomic
;     (global_atomic_max_num_f32, global_atomic_max_f32, global_atomic_fmax
;     and buffer_atomic_fmax with addr64, respectively).
;   * gfx940, gfx90a, gfx908 and tonga: an initial load followed by an
;     %atomicrmw.start loop that recomputes the max and retries a cmpswap
;     until the value it returns equals the value that was expected. tonga
;     uses the flat_ load/cmpswap forms instead of the global_ ones.
455define void @global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) {
456; GFX12-LABEL: global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory:
457; GFX12:       ; %bb.0:
458; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
459; GFX12-NEXT:    s_wait_expcnt 0x0
460; GFX12-NEXT:    s_wait_samplecnt 0x0
461; GFX12-NEXT:    s_wait_bvhcnt 0x0
462; GFX12-NEXT:    s_wait_kmcnt 0x0
463; GFX12-NEXT:    s_wait_storecnt 0x0
464; GFX12-NEXT:    global_atomic_max_num_f32 v[0:1], v2, off scope:SCOPE_DEV
465; GFX12-NEXT:    s_wait_storecnt 0x0
466; GFX12-NEXT:    global_inv scope:SCOPE_DEV
467; GFX12-NEXT:    s_setpc_b64 s[30:31]
468;
469; GFX940-LABEL: global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory:
470; GFX940:       ; %bb.0:
471; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
472; GFX940-NEXT:    global_load_dword v3, v[0:1], off
473; GFX940-NEXT:    s_mov_b64 s[0:1], 0
474; GFX940-NEXT:    v_max_f32_e32 v4, v2, v2
475; GFX940-NEXT:  .LBB5_1: ; %atomicrmw.start
476; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
477; GFX940-NEXT:    s_waitcnt vmcnt(0)
478; GFX940-NEXT:    v_max_f32_e32 v2, v3, v3
479; GFX940-NEXT:    v_max_f32_e32 v2, v2, v4
480; GFX940-NEXT:    buffer_wbl2 sc1
481; GFX940-NEXT:    global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0
482; GFX940-NEXT:    s_waitcnt vmcnt(0)
483; GFX940-NEXT:    buffer_inv sc1
484; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
485; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
486; GFX940-NEXT:    v_mov_b32_e32 v3, v2
487; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
488; GFX940-NEXT:    s_cbranch_execnz .LBB5_1
489; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
490; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
491; GFX940-NEXT:    s_setpc_b64 s[30:31]
492;
493; GFX11-LABEL: global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory:
494; GFX11:       ; %bb.0:
495; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
496; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
497; GFX11-NEXT:    global_atomic_max_f32 v[0:1], v2, off
498; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
499; GFX11-NEXT:    buffer_gl1_inv
500; GFX11-NEXT:    buffer_gl0_inv
501; GFX11-NEXT:    s_setpc_b64 s[30:31]
502;
503; GFX10-LABEL: global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory:
504; GFX10:       ; %bb.0:
505; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
506; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
507; GFX10-NEXT:    global_atomic_fmax v[0:1], v2, off
508; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
509; GFX10-NEXT:    buffer_gl1_inv
510; GFX10-NEXT:    buffer_gl0_inv
511; GFX10-NEXT:    s_setpc_b64 s[30:31]
512;
513; GFX90A-LABEL: global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory:
514; GFX90A:       ; %bb.0:
515; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
516; GFX90A-NEXT:    global_load_dword v3, v[0:1], off
517; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
518; GFX90A-NEXT:    v_max_f32_e32 v4, v2, v2
519; GFX90A-NEXT:  .LBB5_1: ; %atomicrmw.start
520; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
521; GFX90A-NEXT:    s_waitcnt vmcnt(0)
522; GFX90A-NEXT:    v_max_f32_e32 v2, v3, v3
523; GFX90A-NEXT:    v_max_f32_e32 v2, v2, v4
524; GFX90A-NEXT:    global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
525; GFX90A-NEXT:    s_waitcnt vmcnt(0)
526; GFX90A-NEXT:    buffer_wbinvl1
527; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
528; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
529; GFX90A-NEXT:    v_mov_b32_e32 v3, v2
530; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
531; GFX90A-NEXT:    s_cbranch_execnz .LBB5_1
532; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
533; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
534; GFX90A-NEXT:    s_setpc_b64 s[30:31]
535;
536; GFX908-LABEL: global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory:
537; GFX908:       ; %bb.0:
538; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
539; GFX908-NEXT:    global_load_dword v3, v[0:1], off
540; GFX908-NEXT:    s_mov_b64 s[4:5], 0
541; GFX908-NEXT:    v_max_f32_e32 v4, v2, v2
542; GFX908-NEXT:  .LBB5_1: ; %atomicrmw.start
543; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
544; GFX908-NEXT:    s_waitcnt vmcnt(0)
545; GFX908-NEXT:    v_max_f32_e32 v2, v3, v3
546; GFX908-NEXT:    v_max_f32_e32 v2, v2, v4
547; GFX908-NEXT:    global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
548; GFX908-NEXT:    s_waitcnt vmcnt(0)
549; GFX908-NEXT:    buffer_wbinvl1
550; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
551; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
552; GFX908-NEXT:    v_mov_b32_e32 v3, v2
553; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
554; GFX908-NEXT:    s_cbranch_execnz .LBB5_1
555; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
556; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
557; GFX908-NEXT:    s_setpc_b64 s[30:31]
558;
559; GFX8-LABEL: global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory:
560; GFX8:       ; %bb.0:
561; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
562; GFX8-NEXT:    flat_load_dword v3, v[0:1]
563; GFX8-NEXT:    s_mov_b64 s[4:5], 0
564; GFX8-NEXT:    v_mul_f32_e32 v4, 1.0, v2
565; GFX8-NEXT:  .LBB5_1: ; %atomicrmw.start
566; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
567; GFX8-NEXT:    s_waitcnt vmcnt(0)
568; GFX8-NEXT:    v_mul_f32_e32 v2, 1.0, v3
569; GFX8-NEXT:    v_max_f32_e32 v2, v2, v4
570; GFX8-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
571; GFX8-NEXT:    s_waitcnt vmcnt(0)
572; GFX8-NEXT:    buffer_wbinvl1
573; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
574; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
575; GFX8-NEXT:    v_mov_b32_e32 v3, v2
576; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
577; GFX8-NEXT:    s_cbranch_execnz .LBB5_1
578; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
579; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
580; GFX8-NEXT:    s_setpc_b64 s[30:31]
581;
582; GFX7-LABEL: global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory:
583; GFX7:       ; %bb.0:
584; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
585; GFX7-NEXT:    s_mov_b32 s6, 0
586; GFX7-NEXT:    s_mov_b32 s7, 0xf000
587; GFX7-NEXT:    s_mov_b64 s[4:5], 0
588; GFX7-NEXT:    buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64
589; GFX7-NEXT:    s_waitcnt vmcnt(0)
590; GFX7-NEXT:    buffer_wbinvl1
591; GFX7-NEXT:    s_setpc_b64 s[30:31]
592  %unused = atomicrmw fmax ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
593  ret void
594}
595
; Returning f64 fmax atomicrmw on an addrspace(1) pointer at
; syncscope("agent") with seq_cst ordering, tagged with
; !amdgpu.no.fine.grained.memory (the !0 node is defined outside this
; function). The previous memory value is returned.
; The autogenerated assertions below expect two different shapes, and the
; split between targets differs from the f32 variants:
;   * gfx940, gfx1010, gfx90a and hawaii: one returning hardware atomic
;     (global_atomic_max_f64, global_atomic_fmax_x2, global_atomic_max_f64
;     and buffer_atomic_fmax_x2 with addr64, respectively).
;   * gfx1200, gfx1100, gfx908 and tonga: an initial 64-bit load followed by
;     an %atomicrmw.start loop that recomputes the max and retries a 64-bit
;     cmpswap until the value it returns equals the value that was expected.
;     tonga uses the flat_ load/cmpswap forms instead of the global_ ones.
596define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, double %val) {
597; GFX12-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory:
598; GFX12:       ; %bb.0:
599; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
600; GFX12-NEXT:    s_wait_expcnt 0x0
601; GFX12-NEXT:    s_wait_samplecnt 0x0
602; GFX12-NEXT:    s_wait_bvhcnt 0x0
603; GFX12-NEXT:    s_wait_kmcnt 0x0
604; GFX12-NEXT:    global_load_b64 v[4:5], v[0:1], off
605; GFX12-NEXT:    v_max_num_f64_e32 v[2:3], v[2:3], v[2:3]
606; GFX12-NEXT:    s_mov_b32 s0, 0
607; GFX12-NEXT:  .LBB6_1: ; %atomicrmw.start
608; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
609; GFX12-NEXT:    s_wait_loadcnt 0x0
610; GFX12-NEXT:    v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
611; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
612; GFX12-NEXT:    v_max_num_f64_e32 v[4:5], v[6:7], v[6:7]
613; GFX12-NEXT:    v_max_num_f64_e32 v[4:5], v[4:5], v[2:3]
614; GFX12-NEXT:    s_wait_storecnt 0x0
615; GFX12-NEXT:    global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
616; GFX12-NEXT:    s_wait_loadcnt 0x0
617; GFX12-NEXT:    global_inv scope:SCOPE_DEV
618; GFX12-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
619; GFX12-NEXT:    s_wait_alu 0xfffe
620; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
621; GFX12-NEXT:    s_wait_alu 0xfffe
622; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
623; GFX12-NEXT:    s_cbranch_execnz .LBB6_1
624; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
625; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
626; GFX12-NEXT:    v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
627; GFX12-NEXT:    s_wait_alu 0xfffe
628; GFX12-NEXT:    s_setpc_b64 s[30:31]
629;
630; GFX940-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory:
631; GFX940:       ; %bb.0:
632; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
633; GFX940-NEXT:    buffer_wbl2 sc1
634; GFX940-NEXT:    global_atomic_max_f64 v[0:1], v[0:1], v[2:3], off sc0
635; GFX940-NEXT:    s_waitcnt vmcnt(0)
636; GFX940-NEXT:    buffer_inv sc1
637; GFX940-NEXT:    s_setpc_b64 s[30:31]
638;
639; GFX11-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory:
640; GFX11:       ; %bb.0:
641; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
642; GFX11-NEXT:    global_load_b64 v[4:5], v[0:1], off
643; GFX11-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
644; GFX11-NEXT:    s_mov_b32 s0, 0
645; GFX11-NEXT:  .LBB6_1: ; %atomicrmw.start
646; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
647; GFX11-NEXT:    s_waitcnt vmcnt(0)
648; GFX11-NEXT:    v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
649; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
650; GFX11-NEXT:    v_max_f64 v[4:5], v[6:7], v[6:7]
651; GFX11-NEXT:    v_max_f64 v[4:5], v[4:5], v[2:3]
652; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
653; GFX11-NEXT:    global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off glc
654; GFX11-NEXT:    s_waitcnt vmcnt(0)
655; GFX11-NEXT:    buffer_gl1_inv
656; GFX11-NEXT:    buffer_gl0_inv
657; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
658; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
659; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
660; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
661; GFX11-NEXT:    s_cbranch_execnz .LBB6_1
662; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
663; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
664; GFX11-NEXT:    v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
665; GFX11-NEXT:    s_setpc_b64 s[30:31]
666;
667; GFX10-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory:
668; GFX10:       ; %bb.0:
669; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
670; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
671; GFX10-NEXT:    global_atomic_fmax_x2 v[0:1], v[0:1], v[2:3], off glc
672; GFX10-NEXT:    s_waitcnt vmcnt(0)
673; GFX10-NEXT:    buffer_gl1_inv
674; GFX10-NEXT:    buffer_gl0_inv
675; GFX10-NEXT:    s_setpc_b64 s[30:31]
676;
677; GFX90A-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory:
678; GFX90A:       ; %bb.0:
679; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
680; GFX90A-NEXT:    global_atomic_max_f64 v[0:1], v[0:1], v[2:3], off glc
681; GFX90A-NEXT:    s_waitcnt vmcnt(0)
682; GFX90A-NEXT:    buffer_wbinvl1
683; GFX90A-NEXT:    s_setpc_b64 s[30:31]
684;
685; GFX908-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory:
686; GFX908:       ; %bb.0:
687; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
688; GFX908-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off
689; GFX908-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
690; GFX908-NEXT:    s_mov_b64 s[4:5], 0
691; GFX908-NEXT:  .LBB6_1: ; %atomicrmw.start
692; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
693; GFX908-NEXT:    s_waitcnt vmcnt(0)
694; GFX908-NEXT:    v_mov_b32_e32 v7, v5
695; GFX908-NEXT:    v_mov_b32_e32 v6, v4
696; GFX908-NEXT:    v_max_f64 v[4:5], v[6:7], v[6:7]
697; GFX908-NEXT:    v_max_f64 v[4:5], v[4:5], v[2:3]
698; GFX908-NEXT:    global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc
699; GFX908-NEXT:    s_waitcnt vmcnt(0)
700; GFX908-NEXT:    buffer_wbinvl1
701; GFX908-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
702; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
703; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
704; GFX908-NEXT:    s_cbranch_execnz .LBB6_1
705; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
706; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
707; GFX908-NEXT:    v_mov_b32_e32 v0, v4
708; GFX908-NEXT:    v_mov_b32_e32 v1, v5
709; GFX908-NEXT:    s_setpc_b64 s[30:31]
710;
711; GFX8-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory:
712; GFX8:       ; %bb.0:
713; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
714; GFX8-NEXT:    flat_load_dwordx2 v[4:5], v[0:1]
715; GFX8-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
716; GFX8-NEXT:    s_mov_b64 s[4:5], 0
717; GFX8-NEXT:  .LBB6_1: ; %atomicrmw.start
718; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
719; GFX8-NEXT:    s_waitcnt vmcnt(0)
720; GFX8-NEXT:    v_mov_b32_e32 v7, v5
721; GFX8-NEXT:    v_mov_b32_e32 v6, v4
722; GFX8-NEXT:    v_max_f64 v[4:5], v[6:7], v[6:7]
723; GFX8-NEXT:    v_max_f64 v[4:5], v[4:5], v[2:3]
724; GFX8-NEXT:    flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
725; GFX8-NEXT:    s_waitcnt vmcnt(0)
726; GFX8-NEXT:    buffer_wbinvl1
727; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
728; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
729; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
730; GFX8-NEXT:    s_cbranch_execnz .LBB6_1
731; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
732; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
733; GFX8-NEXT:    v_mov_b32_e32 v0, v4
734; GFX8-NEXT:    v_mov_b32_e32 v1, v5
735; GFX8-NEXT:    s_setpc_b64 s[30:31]
736;
737; GFX7-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory:
738; GFX7:       ; %bb.0:
739; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
740; GFX7-NEXT:    s_mov_b32 s6, 0
741; GFX7-NEXT:    s_mov_b32 s7, 0xf000
742; GFX7-NEXT:    s_mov_b64 s[4:5], 0
743; GFX7-NEXT:    buffer_atomic_fmax_x2 v[2:3], v[0:1], s[4:7], 0 addr64 glc
744; GFX7-NEXT:    s_waitcnt vmcnt(0)
745; GFX7-NEXT:    buffer_wbinvl1
746; GFX7-NEXT:    v_mov_b32_e32 v0, v2
747; GFX7-NEXT:    v_mov_b32_e32 v1, v3
748; GFX7-NEXT:    s_setpc_b64 s[30:31]
749  %result = atomicrmw fmax ptr addrspace(1) %ptr, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
750  ret double %result
751}
752
; Agent-scope seq_cst atomicrmw fmax on a double in global memory (addrspace 1)
; whose result is unused, tagged with !amdgpu.no.fine.grained.memory.
; Per the autogenerated checks in this function, the gfx940, gfx1010, gfx90a
; and hawaii runs select a single native f64 max atomic, while the gfx1200,
; gfx1100, gfx908 and tonga runs expand to a load + cmpswap retry loop.
753define void @global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, double %val) {
754; GFX12-LABEL: global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory:
755; GFX12:       ; %bb.0:
756; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
757; GFX12-NEXT:    s_wait_expcnt 0x0
758; GFX12-NEXT:    s_wait_samplecnt 0x0
759; GFX12-NEXT:    s_wait_bvhcnt 0x0
760; GFX12-NEXT:    s_wait_kmcnt 0x0
761; GFX12-NEXT:    global_load_b64 v[4:5], v[0:1], off
762; GFX12-NEXT:    v_max_num_f64_e32 v[6:7], v[2:3], v[2:3]
763; GFX12-NEXT:    s_mov_b32 s0, 0
764; GFX12-NEXT:  .LBB7_1: ; %atomicrmw.start
765; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
766; GFX12-NEXT:    s_wait_loadcnt 0x0
767; GFX12-NEXT:    v_max_num_f64_e32 v[2:3], v[4:5], v[4:5]
768; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
769; GFX12-NEXT:    v_max_num_f64_e32 v[2:3], v[2:3], v[6:7]
770; GFX12-NEXT:    s_wait_storecnt 0x0
771; GFX12-NEXT:    global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
772; GFX12-NEXT:    s_wait_loadcnt 0x0
773; GFX12-NEXT:    global_inv scope:SCOPE_DEV
774; GFX12-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
775; GFX12-NEXT:    v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
776; GFX12-NEXT:    s_wait_alu 0xfffe
777; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
778; GFX12-NEXT:    s_wait_alu 0xfffe
779; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
780; GFX12-NEXT:    s_cbranch_execnz .LBB7_1
781; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
782; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
783; GFX12-NEXT:    s_wait_alu 0xfffe
784; GFX12-NEXT:    s_setpc_b64 s[30:31]
785;
786; GFX940-LABEL: global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory:
787; GFX940:       ; %bb.0:
788; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
789; GFX940-NEXT:    buffer_wbl2 sc1
790; GFX940-NEXT:    global_atomic_max_f64 v[0:1], v[2:3], off
791; GFX940-NEXT:    s_waitcnt vmcnt(0)
792; GFX940-NEXT:    buffer_inv sc1
793; GFX940-NEXT:    s_setpc_b64 s[30:31]
794;
795; GFX11-LABEL: global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory:
796; GFX11:       ; %bb.0:
797; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
798; GFX11-NEXT:    global_load_b64 v[4:5], v[0:1], off
799; GFX11-NEXT:    v_max_f64 v[6:7], v[2:3], v[2:3]
800; GFX11-NEXT:    s_mov_b32 s0, 0
801; GFX11-NEXT:  .LBB7_1: ; %atomicrmw.start
802; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
803; GFX11-NEXT:    s_waitcnt vmcnt(0)
804; GFX11-NEXT:    v_max_f64 v[2:3], v[4:5], v[4:5]
805; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
806; GFX11-NEXT:    v_max_f64 v[2:3], v[2:3], v[6:7]
807; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
808; GFX11-NEXT:    global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off glc
809; GFX11-NEXT:    s_waitcnt vmcnt(0)
810; GFX11-NEXT:    buffer_gl1_inv
811; GFX11-NEXT:    buffer_gl0_inv
812; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
813; GFX11-NEXT:    v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
814; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
815; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
816; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
817; GFX11-NEXT:    s_cbranch_execnz .LBB7_1
818; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
819; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
820; GFX11-NEXT:    s_setpc_b64 s[30:31]
821;
822; GFX10-LABEL: global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory:
823; GFX10:       ; %bb.0:
824; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
825; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
826; GFX10-NEXT:    global_atomic_fmax_x2 v[0:1], v[2:3], off
827; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
828; GFX10-NEXT:    buffer_gl1_inv
829; GFX10-NEXT:    buffer_gl0_inv
830; GFX10-NEXT:    s_setpc_b64 s[30:31]
831;
832; GFX90A-LABEL: global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory:
833; GFX90A:       ; %bb.0:
834; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
835; GFX90A-NEXT:    global_atomic_max_f64 v[0:1], v[2:3], off
836; GFX90A-NEXT:    s_waitcnt vmcnt(0)
837; GFX90A-NEXT:    buffer_wbinvl1
838; GFX90A-NEXT:    s_setpc_b64 s[30:31]
839;
840; GFX908-LABEL: global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory:
841; GFX908:       ; %bb.0:
842; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
843; GFX908-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off
844; GFX908-NEXT:    v_max_f64 v[6:7], v[2:3], v[2:3]
845; GFX908-NEXT:    s_mov_b64 s[4:5], 0
846; GFX908-NEXT:  .LBB7_1: ; %atomicrmw.start
847; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
848; GFX908-NEXT:    s_waitcnt vmcnt(0)
849; GFX908-NEXT:    v_max_f64 v[2:3], v[4:5], v[4:5]
850; GFX908-NEXT:    v_max_f64 v[2:3], v[2:3], v[6:7]
851; GFX908-NEXT:    global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off glc
852; GFX908-NEXT:    s_waitcnt vmcnt(0)
853; GFX908-NEXT:    buffer_wbinvl1
854; GFX908-NEXT:    v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
855; GFX908-NEXT:    v_mov_b32_e32 v5, v3
856; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
857; GFX908-NEXT:    v_mov_b32_e32 v4, v2
858; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
859; GFX908-NEXT:    s_cbranch_execnz .LBB7_1
860; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
861; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
862; GFX908-NEXT:    s_setpc_b64 s[30:31]
863;
864; GFX8-LABEL: global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory:
865; GFX8:       ; %bb.0:
866; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
867; GFX8-NEXT:    flat_load_dwordx2 v[4:5], v[0:1]
868; GFX8-NEXT:    v_max_f64 v[6:7], v[2:3], v[2:3]
869; GFX8-NEXT:    s_mov_b64 s[4:5], 0
870; GFX8-NEXT:  .LBB7_1: ; %atomicrmw.start
871; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
872; GFX8-NEXT:    s_waitcnt vmcnt(0)
873; GFX8-NEXT:    v_max_f64 v[2:3], v[4:5], v[4:5]
874; GFX8-NEXT:    v_max_f64 v[2:3], v[2:3], v[6:7]
875; GFX8-NEXT:    flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc
876; GFX8-NEXT:    s_waitcnt vmcnt(0)
877; GFX8-NEXT:    buffer_wbinvl1
878; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
879; GFX8-NEXT:    v_mov_b32_e32 v5, v3
880; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
881; GFX8-NEXT:    v_mov_b32_e32 v4, v2
882; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
883; GFX8-NEXT:    s_cbranch_execnz .LBB7_1
884; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
885; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
886; GFX8-NEXT:    s_setpc_b64 s[30:31]
887;
888; GFX7-LABEL: global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory:
889; GFX7:       ; %bb.0:
890; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
891; GFX7-NEXT:    s_mov_b32 s6, 0
892; GFX7-NEXT:    s_mov_b32 s7, 0xf000
893; GFX7-NEXT:    s_mov_b64 s[4:5], 0
894; GFX7-NEXT:    buffer_atomic_fmax_x2 v[2:3], v[0:1], s[4:7], 0 addr64
895; GFX7-NEXT:    s_waitcnt vmcnt(0)
896; GFX7-NEXT:    buffer_wbinvl1
897; GFX7-NEXT:    s_setpc_b64 s[30:31]
898  %unused = atomicrmw fmax ptr addrspace(1) %ptr, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
899  ret void
900}
901
; Agent-scope seq_cst atomicrmw fmax on a float through a flat (generic) pointer
; that returns the previous value, tagged with !amdgpu.no.fine.grained.memory.
; Per the autogenerated checks in this function, the gfx1200, gfx1100, gfx1010
; and hawaii runs select a single returning flat f32 max atomic, while the
; gfx940, gfx90a, gfx908 and tonga runs expand to a load + cmpswap retry loop.
; In the loop form, tonga uses v_mul_f32 by 1.0 where the other three targets
; use v_max_f32 of a value with itself.
902define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(ptr %ptr, float %val) {
903; GFX12-LABEL: flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory:
904; GFX12:       ; %bb.0:
905; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
906; GFX12-NEXT:    s_wait_expcnt 0x0
907; GFX12-NEXT:    s_wait_samplecnt 0x0
908; GFX12-NEXT:    s_wait_bvhcnt 0x0
909; GFX12-NEXT:    s_wait_kmcnt 0x0
910; GFX12-NEXT:    s_wait_storecnt 0x0
911; GFX12-NEXT:    flat_atomic_max_num_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
912; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
913; GFX12-NEXT:    global_inv scope:SCOPE_DEV
914; GFX12-NEXT:    s_setpc_b64 s[30:31]
915;
916; GFX940-LABEL: flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory:
917; GFX940:       ; %bb.0:
918; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
919; GFX940-NEXT:    flat_load_dword v3, v[0:1]
920; GFX940-NEXT:    s_mov_b64 s[0:1], 0
921; GFX940-NEXT:    v_max_f32_e32 v2, v2, v2
922; GFX940-NEXT:  .LBB8_1: ; %atomicrmw.start
923; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
924; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
925; GFX940-NEXT:    v_mov_b32_e32 v5, v3
926; GFX940-NEXT:    v_max_f32_e32 v3, v5, v5
927; GFX940-NEXT:    v_max_f32_e32 v4, v3, v2
928; GFX940-NEXT:    buffer_wbl2 sc1
929; GFX940-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0
930; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
931; GFX940-NEXT:    buffer_inv sc1
932; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
933; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
934; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
935; GFX940-NEXT:    s_cbranch_execnz .LBB8_1
936; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
937; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
938; GFX940-NEXT:    v_mov_b32_e32 v0, v3
939; GFX940-NEXT:    s_setpc_b64 s[30:31]
940;
941; GFX11-LABEL: flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory:
942; GFX11:       ; %bb.0:
943; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
944; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
945; GFX11-NEXT:    flat_atomic_max_f32 v0, v[0:1], v2 glc
946; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
947; GFX11-NEXT:    buffer_gl1_inv
948; GFX11-NEXT:    buffer_gl0_inv
949; GFX11-NEXT:    s_setpc_b64 s[30:31]
950;
951; GFX10-LABEL: flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory:
952; GFX10:       ; %bb.0:
953; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
954; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
955; GFX10-NEXT:    flat_atomic_fmax v0, v[0:1], v2 glc
956; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
957; GFX10-NEXT:    buffer_gl1_inv
958; GFX10-NEXT:    buffer_gl0_inv
959; GFX10-NEXT:    s_setpc_b64 s[30:31]
960;
961; GFX90A-LABEL: flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory:
962; GFX90A:       ; %bb.0:
963; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
964; GFX90A-NEXT:    flat_load_dword v3, v[0:1]
965; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
966; GFX90A-NEXT:    v_max_f32_e32 v2, v2, v2
967; GFX90A-NEXT:  .LBB8_1: ; %atomicrmw.start
968; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
969; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
970; GFX90A-NEXT:    v_mov_b32_e32 v5, v3
971; GFX90A-NEXT:    v_max_f32_e32 v3, v5, v5
972; GFX90A-NEXT:    v_max_f32_e32 v4, v3, v2
973; GFX90A-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[4:5] glc
974; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
975; GFX90A-NEXT:    buffer_wbinvl1
976; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
977; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
978; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
979; GFX90A-NEXT:    s_cbranch_execnz .LBB8_1
980; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
981; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
982; GFX90A-NEXT:    v_mov_b32_e32 v0, v3
983; GFX90A-NEXT:    s_setpc_b64 s[30:31]
984;
985; GFX908-LABEL: flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory:
986; GFX908:       ; %bb.0:
987; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
988; GFX908-NEXT:    flat_load_dword v3, v[0:1]
989; GFX908-NEXT:    s_mov_b64 s[4:5], 0
990; GFX908-NEXT:    v_max_f32_e32 v2, v2, v2
991; GFX908-NEXT:  .LBB8_1: ; %atomicrmw.start
992; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
993; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
994; GFX908-NEXT:    v_mov_b32_e32 v4, v3
995; GFX908-NEXT:    v_max_f32_e32 v3, v4, v4
996; GFX908-NEXT:    v_max_f32_e32 v3, v3, v2
997; GFX908-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
998; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
999; GFX908-NEXT:    buffer_wbinvl1
1000; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
1001; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
1002; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
1003; GFX908-NEXT:    s_cbranch_execnz .LBB8_1
1004; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
1005; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
1006; GFX908-NEXT:    v_mov_b32_e32 v0, v3
1007; GFX908-NEXT:    s_setpc_b64 s[30:31]
1008;
1009; GFX8-LABEL: flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory:
1010; GFX8:       ; %bb.0:
1011; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1012; GFX8-NEXT:    flat_load_dword v3, v[0:1]
1013; GFX8-NEXT:    s_mov_b64 s[4:5], 0
1014; GFX8-NEXT:    v_mul_f32_e32 v2, 1.0, v2
1015; GFX8-NEXT:  .LBB8_1: ; %atomicrmw.start
1016; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
1017; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1018; GFX8-NEXT:    v_mov_b32_e32 v4, v3
1019; GFX8-NEXT:    v_mul_f32_e32 v3, 1.0, v4
1020; GFX8-NEXT:    v_max_f32_e32 v3, v3, v2
1021; GFX8-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
1022; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1023; GFX8-NEXT:    buffer_wbinvl1
1024; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
1025; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
1026; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
1027; GFX8-NEXT:    s_cbranch_execnz .LBB8_1
1028; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
1029; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
1030; GFX8-NEXT:    v_mov_b32_e32 v0, v3
1031; GFX8-NEXT:    s_setpc_b64 s[30:31]
1032;
1033; GFX7-LABEL: flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory:
1034; GFX7:       ; %bb.0:
1035; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1036; GFX7-NEXT:    flat_atomic_fmax v0, v[0:1], v2 glc
1037; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1038; GFX7-NEXT:    buffer_wbinvl1
1039; GFX7-NEXT:    s_setpc_b64 s[30:31]
1040  %result = atomicrmw fmax ptr %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
1041  ret float %result
1042}
1043
; Agent-scope seq_cst atomicrmw fmax on a float through a flat (generic) pointer
; whose result is unused, tagged with !amdgpu.no.fine.grained.memory.
; Per the autogenerated checks in this function, the gfx1200, gfx1100, gfx1010
; and hawaii runs select a single non-returning flat f32 max atomic, while the
; gfx940, gfx90a, gfx908 and tonga runs expand to a load + cmpswap retry loop.
; In the loop form, tonga uses v_mul_f32 by 1.0 where the other three targets
; use v_max_f32 of a value with itself.
1044define void @flat_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory(ptr %ptr, float %val) {
1045; GFX12-LABEL: flat_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory:
1046; GFX12:       ; %bb.0:
1047; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
1048; GFX12-NEXT:    s_wait_expcnt 0x0
1049; GFX12-NEXT:    s_wait_samplecnt 0x0
1050; GFX12-NEXT:    s_wait_bvhcnt 0x0
1051; GFX12-NEXT:    s_wait_kmcnt 0x0
1052; GFX12-NEXT:    s_wait_storecnt 0x0
1053; GFX12-NEXT:    flat_atomic_max_num_f32 v[0:1], v2 scope:SCOPE_DEV
1054; GFX12-NEXT:    s_wait_storecnt_dscnt 0x0
1055; GFX12-NEXT:    global_inv scope:SCOPE_DEV
1056; GFX12-NEXT:    s_setpc_b64 s[30:31]
1057;
1058; GFX940-LABEL: flat_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory:
1059; GFX940:       ; %bb.0:
1060; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1061; GFX940-NEXT:    flat_load_dword v3, v[0:1]
1062; GFX940-NEXT:    s_mov_b64 s[0:1], 0
1063; GFX940-NEXT:    v_max_f32_e32 v4, v2, v2
1064; GFX940-NEXT:  .LBB9_1: ; %atomicrmw.start
1065; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
1066; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1067; GFX940-NEXT:    v_max_f32_e32 v2, v3, v3
1068; GFX940-NEXT:    v_max_f32_e32 v2, v2, v4
1069; GFX940-NEXT:    buffer_wbl2 sc1
1070; GFX940-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0
1071; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1072; GFX940-NEXT:    buffer_inv sc1
1073; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
1074; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
1075; GFX940-NEXT:    v_mov_b32_e32 v3, v2
1076; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
1077; GFX940-NEXT:    s_cbranch_execnz .LBB9_1
1078; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
1079; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
1080; GFX940-NEXT:    s_setpc_b64 s[30:31]
1081;
1082; GFX11-LABEL: flat_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory:
1083; GFX11:       ; %bb.0:
1084; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1085; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
1086; GFX11-NEXT:    flat_atomic_max_f32 v[0:1], v2
1087; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1088; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
1089; GFX11-NEXT:    buffer_gl1_inv
1090; GFX11-NEXT:    buffer_gl0_inv
1091; GFX11-NEXT:    s_setpc_b64 s[30:31]
1092;
1093; GFX10-LABEL: flat_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory:
1094; GFX10:       ; %bb.0:
1095; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1096; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1097; GFX10-NEXT:    flat_atomic_fmax v[0:1], v2
1098; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1099; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1100; GFX10-NEXT:    buffer_gl1_inv
1101; GFX10-NEXT:    buffer_gl0_inv
1102; GFX10-NEXT:    s_setpc_b64 s[30:31]
1103;
1104; GFX90A-LABEL: flat_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory:
1105; GFX90A:       ; %bb.0:
1106; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1107; GFX90A-NEXT:    flat_load_dword v3, v[0:1]
1108; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
1109; GFX90A-NEXT:    v_max_f32_e32 v4, v2, v2
1110; GFX90A-NEXT:  .LBB9_1: ; %atomicrmw.start
1111; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
1112; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1113; GFX90A-NEXT:    v_max_f32_e32 v2, v3, v3
1114; GFX90A-NEXT:    v_max_f32_e32 v2, v2, v4
1115; GFX90A-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
1116; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1117; GFX90A-NEXT:    buffer_wbinvl1
1118; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
1119; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
1120; GFX90A-NEXT:    v_mov_b32_e32 v3, v2
1121; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
1122; GFX90A-NEXT:    s_cbranch_execnz .LBB9_1
1123; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
1124; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
1125; GFX90A-NEXT:    s_setpc_b64 s[30:31]
1126;
1127; GFX908-LABEL: flat_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory:
1128; GFX908:       ; %bb.0:
1129; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1130; GFX908-NEXT:    flat_load_dword v3, v[0:1]
1131; GFX908-NEXT:    s_mov_b64 s[4:5], 0
1132; GFX908-NEXT:    v_max_f32_e32 v4, v2, v2
1133; GFX908-NEXT:  .LBB9_1: ; %atomicrmw.start
1134; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
1135; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1136; GFX908-NEXT:    v_max_f32_e32 v2, v3, v3
1137; GFX908-NEXT:    v_max_f32_e32 v2, v2, v4
1138; GFX908-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
1139; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1140; GFX908-NEXT:    buffer_wbinvl1
1141; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
1142; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
1143; GFX908-NEXT:    v_mov_b32_e32 v3, v2
1144; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
1145; GFX908-NEXT:    s_cbranch_execnz .LBB9_1
1146; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
1147; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
1148; GFX908-NEXT:    s_setpc_b64 s[30:31]
1149;
1150; GFX8-LABEL: flat_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory:
1151; GFX8:       ; %bb.0:
1152; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1153; GFX8-NEXT:    flat_load_dword v3, v[0:1]
1154; GFX8-NEXT:    s_mov_b64 s[4:5], 0
1155; GFX8-NEXT:    v_mul_f32_e32 v4, 1.0, v2
1156; GFX8-NEXT:  .LBB9_1: ; %atomicrmw.start
1157; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
1158; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1159; GFX8-NEXT:    v_mul_f32_e32 v2, 1.0, v3
1160; GFX8-NEXT:    v_max_f32_e32 v2, v2, v4
1161; GFX8-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
1162; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1163; GFX8-NEXT:    buffer_wbinvl1
1164; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
1165; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
1166; GFX8-NEXT:    v_mov_b32_e32 v3, v2
1167; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
1168; GFX8-NEXT:    s_cbranch_execnz .LBB9_1
1169; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
1170; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
1171; GFX8-NEXT:    s_setpc_b64 s[30:31]
1172;
1173; GFX7-LABEL: flat_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory:
1174; GFX7:       ; %bb.0:
1175; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1176; GFX7-NEXT:    flat_atomic_fmax v[0:1], v2
1177; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1178; GFX7-NEXT:    buffer_wbinvl1
1179; GFX7-NEXT:    s_setpc_b64 s[30:31]
1180  %unused = atomicrmw fmax ptr %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
1181  ret void
1182}
1183
; Agent-scope seq_cst atomicrmw fmax on a double through a flat (generic)
; pointer that returns the previous value. The atomicrmw carries both
; !amdgpu.no.fine.grained.memory !0 and !noalias.addrspace !1.
; NOTE(review): the definition of !1 is outside this section; presumably it
; excludes an address space (e.g. private) for the flat access - confirm
; against the metadata at the end of the file.
; Per the autogenerated checks in this function, the gfx940, gfx1010, gfx90a
; and hawaii runs select a single returning flat f64 max atomic, while the
; gfx1200, gfx1100, gfx908 and tonga runs expand to a load + cmpswap retry
; loop. On tonga the initial value is read with two flat_load_dword
; instructions (at the pointer and at the pointer plus 4).
1184define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(ptr %ptr, double %val) {
1185; GFX12-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory:
1186; GFX12:       ; %bb.0:
1187; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
1188; GFX12-NEXT:    s_wait_expcnt 0x0
1189; GFX12-NEXT:    s_wait_samplecnt 0x0
1190; GFX12-NEXT:    s_wait_bvhcnt 0x0
1191; GFX12-NEXT:    s_wait_kmcnt 0x0
1192; GFX12-NEXT:    flat_load_b64 v[4:5], v[0:1]
1193; GFX12-NEXT:    v_max_num_f64_e32 v[2:3], v[2:3], v[2:3]
1194; GFX12-NEXT:    s_mov_b32 s0, 0
1195; GFX12-NEXT:  .LBB10_1: ; %atomicrmw.start
1196; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
1197; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
1198; GFX12-NEXT:    v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
1199; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1200; GFX12-NEXT:    v_max_num_f64_e32 v[4:5], v[6:7], v[6:7]
1201; GFX12-NEXT:    v_max_num_f64_e32 v[4:5], v[4:5], v[2:3]
1202; GFX12-NEXT:    s_wait_storecnt 0x0
1203; GFX12-NEXT:    flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
1204; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
1205; GFX12-NEXT:    global_inv scope:SCOPE_DEV
1206; GFX12-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
1207; GFX12-NEXT:    s_wait_alu 0xfffe
1208; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
1209; GFX12-NEXT:    s_wait_alu 0xfffe
1210; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
1211; GFX12-NEXT:    s_cbranch_execnz .LBB10_1
1212; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
1213; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
1214; GFX12-NEXT:    v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
1215; GFX12-NEXT:    s_wait_alu 0xfffe
1216; GFX12-NEXT:    s_setpc_b64 s[30:31]
1217;
1218; GFX940-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory:
1219; GFX940:       ; %bb.0:
1220; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1221; GFX940-NEXT:    buffer_wbl2 sc1
1222; GFX940-NEXT:    flat_atomic_max_f64 v[0:1], v[0:1], v[2:3] sc0
1223; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1224; GFX940-NEXT:    buffer_inv sc1
1225; GFX940-NEXT:    s_setpc_b64 s[30:31]
1226;
1227; GFX11-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory:
1228; GFX11:       ; %bb.0:
1229; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1230; GFX11-NEXT:    flat_load_b64 v[4:5], v[0:1]
1231; GFX11-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
1232; GFX11-NEXT:    s_mov_b32 s0, 0
1233; GFX11-NEXT:  .LBB10_1: ; %atomicrmw.start
1234; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
1235; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1236; GFX11-NEXT:    v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
1237; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1238; GFX11-NEXT:    v_max_f64 v[4:5], v[6:7], v[6:7]
1239; GFX11-NEXT:    v_max_f64 v[4:5], v[4:5], v[2:3]
1240; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
1241; GFX11-NEXT:    flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] glc
1242; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1243; GFX11-NEXT:    buffer_gl1_inv
1244; GFX11-NEXT:    buffer_gl0_inv
1245; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
1246; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
1247; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
1248; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
1249; GFX11-NEXT:    s_cbranch_execnz .LBB10_1
1250; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
1251; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
1252; GFX11-NEXT:    v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
1253; GFX11-NEXT:    s_setpc_b64 s[30:31]
1254;
1255; GFX10-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory:
1256; GFX10:       ; %bb.0:
1257; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1258; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1259; GFX10-NEXT:    flat_atomic_fmax_x2 v[0:1], v[0:1], v[2:3] glc
1260; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1261; GFX10-NEXT:    buffer_gl1_inv
1262; GFX10-NEXT:    buffer_gl0_inv
1263; GFX10-NEXT:    s_setpc_b64 s[30:31]
1264;
1265; GFX90A-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory:
1266; GFX90A:       ; %bb.0:
1267; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1268; GFX90A-NEXT:    flat_atomic_max_f64 v[0:1], v[0:1], v[2:3] glc
1269; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1270; GFX90A-NEXT:    buffer_wbinvl1
1271; GFX90A-NEXT:    s_setpc_b64 s[30:31]
1272;
1273; GFX908-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory:
1274; GFX908:       ; %bb.0:
1275; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1276; GFX908-NEXT:    flat_load_dwordx2 v[4:5], v[0:1]
1277; GFX908-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
1278; GFX908-NEXT:    s_mov_b64 s[4:5], 0
1279; GFX908-NEXT:  .LBB10_1: ; %atomicrmw.start
1280; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
1281; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1282; GFX908-NEXT:    v_mov_b32_e32 v7, v5
1283; GFX908-NEXT:    v_mov_b32_e32 v6, v4
1284; GFX908-NEXT:    v_max_f64 v[4:5], v[6:7], v[6:7]
1285; GFX908-NEXT:    v_max_f64 v[4:5], v[4:5], v[2:3]
1286; GFX908-NEXT:    flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
1287; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1288; GFX908-NEXT:    buffer_wbinvl1
1289; GFX908-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
1290; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
1291; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
1292; GFX908-NEXT:    s_cbranch_execnz .LBB10_1
1293; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
1294; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
1295; GFX908-NEXT:    v_mov_b32_e32 v0, v4
1296; GFX908-NEXT:    v_mov_b32_e32 v1, v5
1297; GFX908-NEXT:    s_setpc_b64 s[30:31]
1298;
1299; GFX8-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory:
1300; GFX8:       ; %bb.0:
1301; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1302; GFX8-NEXT:    v_add_u32_e32 v5, vcc, 4, v0
1303; GFX8-NEXT:    v_addc_u32_e32 v6, vcc, 0, v1, vcc
1304; GFX8-NEXT:    flat_load_dword v4, v[0:1]
1305; GFX8-NEXT:    flat_load_dword v5, v[5:6]
1306; GFX8-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
1307; GFX8-NEXT:    s_mov_b64 s[4:5], 0
1308; GFX8-NEXT:  .LBB10_1: ; %atomicrmw.start
1309; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
1310; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1311; GFX8-NEXT:    v_mov_b32_e32 v7, v5
1312; GFX8-NEXT:    v_mov_b32_e32 v6, v4
1313; GFX8-NEXT:    v_max_f64 v[4:5], v[6:7], v[6:7]
1314; GFX8-NEXT:    v_max_f64 v[4:5], v[4:5], v[2:3]
1315; GFX8-NEXT:    flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
1316; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1317; GFX8-NEXT:    buffer_wbinvl1
1318; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
1319; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
1320; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
1321; GFX8-NEXT:    s_cbranch_execnz .LBB10_1
1322; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
1323; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
1324; GFX8-NEXT:    v_mov_b32_e32 v0, v4
1325; GFX8-NEXT:    v_mov_b32_e32 v1, v5
1326; GFX8-NEXT:    s_setpc_b64 s[30:31]
1327;
1328; GFX7-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory:
1329; GFX7:       ; %bb.0:
1330; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1331; GFX7-NEXT:    flat_atomic_fmax_x2 v[0:1], v[0:1], v[2:3] glc
1332; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1333; GFX7-NEXT:    buffer_wbinvl1
1334; GFX7-NEXT:    s_setpc_b64 s[30:31]
1335  %result = atomicrmw fmax ptr %ptr, double %val syncscope("agent") seq_cst, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0
1336  ret double %result
1337}
1338
1339define void @flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(ptr %ptr, double %val) {
1340; GFX12-LABEL: flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory:
1341; GFX12:       ; %bb.0:
1342; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
1343; GFX12-NEXT:    s_wait_expcnt 0x0
1344; GFX12-NEXT:    s_wait_samplecnt 0x0
1345; GFX12-NEXT:    s_wait_bvhcnt 0x0
1346; GFX12-NEXT:    s_wait_kmcnt 0x0
1347; GFX12-NEXT:    flat_load_b64 v[4:5], v[0:1]
1348; GFX12-NEXT:    v_max_num_f64_e32 v[6:7], v[2:3], v[2:3]
1349; GFX12-NEXT:    s_mov_b32 s0, 0
1350; GFX12-NEXT:  .LBB11_1: ; %atomicrmw.start
1351; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
1352; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
1353; GFX12-NEXT:    v_max_num_f64_e32 v[2:3], v[4:5], v[4:5]
1354; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1355; GFX12-NEXT:    v_max_num_f64_e32 v[2:3], v[2:3], v[6:7]
1356; GFX12-NEXT:    s_wait_storecnt 0x0
1357; GFX12-NEXT:    flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
1358; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
1359; GFX12-NEXT:    global_inv scope:SCOPE_DEV
1360; GFX12-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
1361; GFX12-NEXT:    v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
1362; GFX12-NEXT:    s_wait_alu 0xfffe
1363; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
1364; GFX12-NEXT:    s_wait_alu 0xfffe
1365; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
1366; GFX12-NEXT:    s_cbranch_execnz .LBB11_1
1367; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
1368; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
1369; GFX12-NEXT:    s_wait_alu 0xfffe
1370; GFX12-NEXT:    s_setpc_b64 s[30:31]
1371;
1372; GFX940-LABEL: flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory:
1373; GFX940:       ; %bb.0:
1374; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1375; GFX940-NEXT:    buffer_wbl2 sc1
1376; GFX940-NEXT:    flat_atomic_max_f64 v[0:1], v[2:3]
1377; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1378; GFX940-NEXT:    buffer_inv sc1
1379; GFX940-NEXT:    s_setpc_b64 s[30:31]
1380;
1381; GFX11-LABEL: flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory:
1382; GFX11:       ; %bb.0:
1383; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1384; GFX11-NEXT:    flat_load_b64 v[4:5], v[0:1]
1385; GFX11-NEXT:    v_max_f64 v[6:7], v[2:3], v[2:3]
1386; GFX11-NEXT:    s_mov_b32 s0, 0
1387; GFX11-NEXT:  .LBB11_1: ; %atomicrmw.start
1388; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
1389; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1390; GFX11-NEXT:    v_max_f64 v[2:3], v[4:5], v[4:5]
1391; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1392; GFX11-NEXT:    v_max_f64 v[2:3], v[2:3], v[6:7]
1393; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
1394; GFX11-NEXT:    flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5] glc
1395; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1396; GFX11-NEXT:    buffer_gl1_inv
1397; GFX11-NEXT:    buffer_gl0_inv
1398; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
1399; GFX11-NEXT:    v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
1400; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
1401; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
1402; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
1403; GFX11-NEXT:    s_cbranch_execnz .LBB11_1
1404; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
1405; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
1406; GFX11-NEXT:    s_setpc_b64 s[30:31]
1407;
1408; GFX10-LABEL: flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory:
1409; GFX10:       ; %bb.0:
1410; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1411; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1412; GFX10-NEXT:    flat_atomic_fmax_x2 v[0:1], v[2:3]
1413; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1414; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1415; GFX10-NEXT:    buffer_gl1_inv
1416; GFX10-NEXT:    buffer_gl0_inv
1417; GFX10-NEXT:    s_setpc_b64 s[30:31]
1418;
1419; GFX90A-LABEL: flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory:
1420; GFX90A:       ; %bb.0:
1421; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1422; GFX90A-NEXT:    flat_atomic_max_f64 v[0:1], v[2:3]
1423; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1424; GFX90A-NEXT:    buffer_wbinvl1
1425; GFX90A-NEXT:    s_setpc_b64 s[30:31]
1426;
1427; GFX908-LABEL: flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory:
1428; GFX908:       ; %bb.0:
1429; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1430; GFX908-NEXT:    flat_load_dwordx2 v[4:5], v[0:1]
1431; GFX908-NEXT:    v_max_f64 v[6:7], v[2:3], v[2:3]
1432; GFX908-NEXT:    s_mov_b64 s[4:5], 0
1433; GFX908-NEXT:  .LBB11_1: ; %atomicrmw.start
1434; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
1435; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1436; GFX908-NEXT:    v_max_f64 v[2:3], v[4:5], v[4:5]
1437; GFX908-NEXT:    v_max_f64 v[2:3], v[2:3], v[6:7]
1438; GFX908-NEXT:    flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc
1439; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1440; GFX908-NEXT:    buffer_wbinvl1
1441; GFX908-NEXT:    v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
1442; GFX908-NEXT:    v_mov_b32_e32 v5, v3
1443; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
1444; GFX908-NEXT:    v_mov_b32_e32 v4, v2
1445; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
1446; GFX908-NEXT:    s_cbranch_execnz .LBB11_1
1447; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
1448; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
1449; GFX908-NEXT:    s_setpc_b64 s[30:31]
1450;
1451; GFX8-LABEL: flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory:
1452; GFX8:       ; %bb.0:
1453; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1454; GFX8-NEXT:    v_add_u32_e32 v5, vcc, 4, v0
1455; GFX8-NEXT:    v_addc_u32_e32 v6, vcc, 0, v1, vcc
1456; GFX8-NEXT:    flat_load_dword v4, v[0:1]
1457; GFX8-NEXT:    flat_load_dword v5, v[5:6]
1458; GFX8-NEXT:    v_max_f64 v[6:7], v[2:3], v[2:3]
1459; GFX8-NEXT:    s_mov_b64 s[4:5], 0
1460; GFX8-NEXT:  .LBB11_1: ; %atomicrmw.start
1461; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
1462; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1463; GFX8-NEXT:    v_max_f64 v[2:3], v[4:5], v[4:5]
1464; GFX8-NEXT:    v_max_f64 v[2:3], v[2:3], v[6:7]
1465; GFX8-NEXT:    flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc
1466; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1467; GFX8-NEXT:    buffer_wbinvl1
1468; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
1469; GFX8-NEXT:    v_mov_b32_e32 v5, v3
1470; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
1471; GFX8-NEXT:    v_mov_b32_e32 v4, v2
1472; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
1473; GFX8-NEXT:    s_cbranch_execnz .LBB11_1
1474; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
1475; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
1476; GFX8-NEXT:    s_setpc_b64 s[30:31]
1477;
1478; GFX7-LABEL: flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory:
1479; GFX7:       ; %bb.0:
1480; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1481; GFX7-NEXT:    flat_atomic_fmax_x2 v[0:1], v[2:3]
1482; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1483; GFX7-NEXT:    buffer_wbinvl1
1484; GFX7-NEXT:    s_setpc_b64 s[30:31]
1485  %unused = atomicrmw fmax ptr %ptr, double %val syncscope("agent") seq_cst, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0
1486  ret void
1487}
1488
; Returning f32 case: atomicrmw fmax through a buffer fat pointer
; (ptr addrspace(7), passed inreg) at syncscope("agent") with seq_cst ordering
; and !amdgpu.no.fine.grained.memory attached; the atomicrmw result is returned.
; Per the autogenerated checks below, gfx1200, gfx1100, gfx1010 and hawaii
; select a single buffer float-max atomic, while gfx940, gfx90a, gfx908 and
; tonga expand to a buffer_atomic_cmpswap retry loop.
; The check lines are tool output; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
1489define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, float %val) {
1490; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory:
1491; GFX12:       ; %bb.0:
1492; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
1493; GFX12-NEXT:    s_wait_expcnt 0x0
1494; GFX12-NEXT:    s_wait_samplecnt 0x0
1495; GFX12-NEXT:    s_wait_bvhcnt 0x0
1496; GFX12-NEXT:    s_wait_kmcnt 0x0
1497; GFX12-NEXT:    v_mov_b32_e32 v1, s16
1498; GFX12-NEXT:    s_wait_storecnt 0x0
1499; GFX12-NEXT:    buffer_atomic_max_num_f32 v0, v1, s[0:3], null offen th:TH_ATOMIC_RETURN
1500; GFX12-NEXT:    s_wait_loadcnt 0x0
1501; GFX12-NEXT:    global_inv scope:SCOPE_DEV
1502; GFX12-NEXT:    s_setpc_b64 s[30:31]
1503;
1504; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory:
1505; GFX940:       ; %bb.0:
1506; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1507; GFX940-NEXT:    v_mov_b32_e32 v2, s16
1508; GFX940-NEXT:    v_mov_b32_e32 v1, v0
1509; GFX940-NEXT:    buffer_load_dword v0, v2, s[0:3], 0 offen
1510; GFX940-NEXT:    s_mov_b64 s[4:5], 0
1511; GFX940-NEXT:    v_max_f32_e32 v3, v1, v1
1512; GFX940-NEXT:  .LBB12_1: ; %atomicrmw.start
1513; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
1514; GFX940-NEXT:    s_waitcnt vmcnt(0)
1515; GFX940-NEXT:    v_mov_b32_e32 v5, v0
1516; GFX940-NEXT:    v_max_f32_e32 v0, v5, v5
1517; GFX940-NEXT:    v_max_f32_e32 v4, v0, v3
1518; GFX940-NEXT:    v_mov_b64_e32 v[0:1], v[4:5]
1519; GFX940-NEXT:    buffer_wbl2 sc1
1520; GFX940-NEXT:    buffer_atomic_cmpswap v[0:1], v2, s[0:3], 0 offen sc0
1521; GFX940-NEXT:    s_waitcnt vmcnt(0)
1522; GFX940-NEXT:    buffer_inv sc1
1523; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
1524; GFX940-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
1525; GFX940-NEXT:    s_andn2_b64 exec, exec, s[4:5]
1526; GFX940-NEXT:    s_cbranch_execnz .LBB12_1
1527; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
1528; GFX940-NEXT:    s_or_b64 exec, exec, s[4:5]
1529; GFX940-NEXT:    s_setpc_b64 s[30:31]
1530;
1531; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory:
1532; GFX11:       ; %bb.0:
1533; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1534; GFX11-NEXT:    v_mov_b32_e32 v1, s16
1535; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
1536; GFX11-NEXT:    buffer_atomic_max_f32 v0, v1, s[0:3], 0 offen glc
1537; GFX11-NEXT:    s_waitcnt vmcnt(0)
1538; GFX11-NEXT:    buffer_gl1_inv
1539; GFX11-NEXT:    buffer_gl0_inv
1540; GFX11-NEXT:    s_setpc_b64 s[30:31]
1541;
1542; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory:
1543; GFX10:       ; %bb.0:
1544; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1545; GFX10-NEXT:    v_mov_b32_e32 v1, s20
1546; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1547; GFX10-NEXT:    buffer_atomic_fmax v0, v1, s[16:19], 0 offen glc
1548; GFX10-NEXT:    s_waitcnt vmcnt(0)
1549; GFX10-NEXT:    buffer_gl1_inv
1550; GFX10-NEXT:    buffer_gl0_inv
1551; GFX10-NEXT:    s_setpc_b64 s[30:31]
1552;
1553; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory:
1554; GFX90A:       ; %bb.0:
1555; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1556; GFX90A-NEXT:    v_mov_b32_e32 v2, s20
1557; GFX90A-NEXT:    v_mov_b32_e32 v1, v0
1558; GFX90A-NEXT:    buffer_load_dword v0, v2, s[16:19], 0 offen
1559; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
1560; GFX90A-NEXT:    v_max_f32_e32 v3, v1, v1
1561; GFX90A-NEXT:  .LBB12_1: ; %atomicrmw.start
1562; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
1563; GFX90A-NEXT:    s_waitcnt vmcnt(0)
1564; GFX90A-NEXT:    v_mov_b32_e32 v5, v0
1565; GFX90A-NEXT:    v_max_f32_e32 v0, v5, v5
1566; GFX90A-NEXT:    v_max_f32_e32 v4, v0, v3
1567; GFX90A-NEXT:    v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1]
1568; GFX90A-NEXT:    buffer_atomic_cmpswap v[0:1], v2, s[16:19], 0 offen glc
1569; GFX90A-NEXT:    s_waitcnt vmcnt(0)
1570; GFX90A-NEXT:    buffer_wbinvl1
1571; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
1572; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
1573; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
1574; GFX90A-NEXT:    s_cbranch_execnz .LBB12_1
1575; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
1576; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
1577; GFX90A-NEXT:    s_setpc_b64 s[30:31]
1578;
1579; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory:
1580; GFX908:       ; %bb.0:
1581; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1582; GFX908-NEXT:    v_mov_b32_e32 v2, s20
1583; GFX908-NEXT:    v_mov_b32_e32 v1, v0
1584; GFX908-NEXT:    buffer_load_dword v0, v2, s[16:19], 0 offen
1585; GFX908-NEXT:    s_mov_b64 s[4:5], 0
1586; GFX908-NEXT:    v_max_f32_e32 v3, v1, v1
1587; GFX908-NEXT:  .LBB12_1: ; %atomicrmw.start
1588; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
1589; GFX908-NEXT:    s_waitcnt vmcnt(0)
1590; GFX908-NEXT:    v_mov_b32_e32 v5, v0
1591; GFX908-NEXT:    v_max_f32_e32 v0, v5, v5
1592; GFX908-NEXT:    v_max_f32_e32 v4, v0, v3
1593; GFX908-NEXT:    v_mov_b32_e32 v0, v4
1594; GFX908-NEXT:    v_mov_b32_e32 v1, v5
1595; GFX908-NEXT:    buffer_atomic_cmpswap v[0:1], v2, s[16:19], 0 offen glc
1596; GFX908-NEXT:    s_waitcnt vmcnt(0)
1597; GFX908-NEXT:    buffer_wbinvl1
1598; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
1599; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
1600; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
1601; GFX908-NEXT:    s_cbranch_execnz .LBB12_1
1602; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
1603; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
1604; GFX908-NEXT:    s_setpc_b64 s[30:31]
1605;
1606; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory:
1607; GFX8:       ; %bb.0:
1608; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1609; GFX8-NEXT:    v_mov_b32_e32 v2, s20
1610; GFX8-NEXT:    v_mov_b32_e32 v1, v0
1611; GFX8-NEXT:    buffer_load_dword v0, v2, s[16:19], 0 offen
1612; GFX8-NEXT:    s_mov_b64 s[4:5], 0
1613; GFX8-NEXT:    v_mul_f32_e32 v3, 1.0, v1
1614; GFX8-NEXT:  .LBB12_1: ; %atomicrmw.start
1615; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
1616; GFX8-NEXT:    s_waitcnt vmcnt(0)
1617; GFX8-NEXT:    v_mov_b32_e32 v5, v0
1618; GFX8-NEXT:    v_mul_f32_e32 v0, 1.0, v5
1619; GFX8-NEXT:    v_max_f32_e32 v4, v0, v3
1620; GFX8-NEXT:    v_mov_b32_e32 v0, v4
1621; GFX8-NEXT:    v_mov_b32_e32 v1, v5
1622; GFX8-NEXT:    buffer_atomic_cmpswap v[0:1], v2, s[16:19], 0 offen glc
1623; GFX8-NEXT:    s_waitcnt vmcnt(0)
1624; GFX8-NEXT:    buffer_wbinvl1
1625; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
1626; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
1627; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
1628; GFX8-NEXT:    s_cbranch_execnz .LBB12_1
1629; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
1630; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
1631; GFX8-NEXT:    s_setpc_b64 s[30:31]
1632;
1633; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory:
1634; GFX7:       ; %bb.0:
1635; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1636; GFX7-NEXT:    v_mov_b32_e32 v1, s20
1637; GFX7-NEXT:    buffer_atomic_fmax v0, v1, s[16:19], 0 offen glc
1638; GFX7-NEXT:    s_waitcnt vmcnt(0)
1639; GFX7-NEXT:    buffer_wbinvl1
1640; GFX7-NEXT:    s_setpc_b64 s[30:31]
; IR under test: the atomicrmw result is used, so the returning form is exercised.
1641  %result = atomicrmw fmax ptr addrspace(7) %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
1642  ret float %result
1643}
1644
; No-return f32 case: atomicrmw fmax through a buffer fat pointer
; (ptr addrspace(7), passed inreg) at syncscope("agent") with seq_cst ordering
; and !amdgpu.no.fine.grained.memory attached; the atomicrmw result is unused.
; Per the autogenerated checks below, gfx1200, gfx1100, gfx1010 and hawaii
; select a single buffer float-max atomic without a return modifier, while
; gfx940, gfx90a, gfx908 and tonga expand to a buffer_atomic_cmpswap retry loop.
; The check lines are tool output; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
1645define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, float %val) {
1646; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory:
1647; GFX12:       ; %bb.0:
1648; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
1649; GFX12-NEXT:    s_wait_expcnt 0x0
1650; GFX12-NEXT:    s_wait_samplecnt 0x0
1651; GFX12-NEXT:    s_wait_bvhcnt 0x0
1652; GFX12-NEXT:    s_wait_kmcnt 0x0
1653; GFX12-NEXT:    v_mov_b32_e32 v1, s16
1654; GFX12-NEXT:    s_wait_storecnt 0x0
1655; GFX12-NEXT:    buffer_atomic_max_num_f32 v0, v1, s[0:3], null offen
1656; GFX12-NEXT:    s_wait_storecnt 0x0
1657; GFX12-NEXT:    global_inv scope:SCOPE_DEV
1658; GFX12-NEXT:    s_setpc_b64 s[30:31]
1659;
1660; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory:
1661; GFX940:       ; %bb.0:
1662; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1663; GFX940-NEXT:    v_mov_b32_e32 v2, s16
1664; GFX940-NEXT:    buffer_load_dword v1, v2, s[0:3], 0 offen
1665; GFX940-NEXT:    s_mov_b64 s[4:5], 0
1666; GFX940-NEXT:    v_max_f32_e32 v3, v0, v0
1667; GFX940-NEXT:  .LBB13_1: ; %atomicrmw.start
1668; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
1669; GFX940-NEXT:    s_waitcnt vmcnt(0)
1670; GFX940-NEXT:    v_max_f32_e32 v0, v1, v1
1671; GFX940-NEXT:    v_max_f32_e32 v0, v0, v3
1672; GFX940-NEXT:    v_mov_b64_e32 v[4:5], v[0:1]
1673; GFX940-NEXT:    buffer_wbl2 sc1
1674; GFX940-NEXT:    buffer_atomic_cmpswap v[4:5], v2, s[0:3], 0 offen sc0
1675; GFX940-NEXT:    s_waitcnt vmcnt(0)
1676; GFX940-NEXT:    buffer_inv sc1
1677; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v1
1678; GFX940-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
1679; GFX940-NEXT:    v_mov_b32_e32 v1, v4
1680; GFX940-NEXT:    s_andn2_b64 exec, exec, s[4:5]
1681; GFX940-NEXT:    s_cbranch_execnz .LBB13_1
1682; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
1683; GFX940-NEXT:    s_or_b64 exec, exec, s[4:5]
1684; GFX940-NEXT:    s_setpc_b64 s[30:31]
1685;
1686; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory:
1687; GFX11:       ; %bb.0:
1688; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1689; GFX11-NEXT:    v_mov_b32_e32 v1, s16
1690; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
1691; GFX11-NEXT:    buffer_atomic_max_f32 v0, v1, s[0:3], 0 offen
1692; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
1693; GFX11-NEXT:    buffer_gl1_inv
1694; GFX11-NEXT:    buffer_gl0_inv
1695; GFX11-NEXT:    s_setpc_b64 s[30:31]
1696;
1697; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory:
1698; GFX10:       ; %bb.0:
1699; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1700; GFX10-NEXT:    v_mov_b32_e32 v1, s20
1701; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1702; GFX10-NEXT:    buffer_atomic_fmax v0, v1, s[16:19], 0 offen
1703; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1704; GFX10-NEXT:    buffer_gl1_inv
1705; GFX10-NEXT:    buffer_gl0_inv
1706; GFX10-NEXT:    s_setpc_b64 s[30:31]
1707;
1708; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory:
1709; GFX90A:       ; %bb.0:
1710; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1711; GFX90A-NEXT:    v_mov_b32_e32 v2, s20
1712; GFX90A-NEXT:    buffer_load_dword v1, v2, s[16:19], 0 offen
1713; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
1714; GFX90A-NEXT:    v_max_f32_e32 v3, v0, v0
1715; GFX90A-NEXT:  .LBB13_1: ; %atomicrmw.start
1716; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
1717; GFX90A-NEXT:    s_waitcnt vmcnt(0)
1718; GFX90A-NEXT:    v_max_f32_e32 v0, v1, v1
1719; GFX90A-NEXT:    v_max_f32_e32 v0, v0, v3
1720; GFX90A-NEXT:    v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[0,1]
1721; GFX90A-NEXT:    buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc
1722; GFX90A-NEXT:    s_waitcnt vmcnt(0)
1723; GFX90A-NEXT:    buffer_wbinvl1
1724; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v1
1725; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
1726; GFX90A-NEXT:    v_mov_b32_e32 v1, v4
1727; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
1728; GFX90A-NEXT:    s_cbranch_execnz .LBB13_1
1729; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
1730; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
1731; GFX90A-NEXT:    s_setpc_b64 s[30:31]
1732;
1733; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory:
1734; GFX908:       ; %bb.0:
1735; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1736; GFX908-NEXT:    v_mov_b32_e32 v2, s20
1737; GFX908-NEXT:    buffer_load_dword v1, v2, s[16:19], 0 offen
1738; GFX908-NEXT:    s_mov_b64 s[4:5], 0
1739; GFX908-NEXT:    v_max_f32_e32 v3, v0, v0
1740; GFX908-NEXT:  .LBB13_1: ; %atomicrmw.start
1741; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
1742; GFX908-NEXT:    s_waitcnt vmcnt(0)
1743; GFX908-NEXT:    v_max_f32_e32 v0, v1, v1
1744; GFX908-NEXT:    v_max_f32_e32 v0, v0, v3
1745; GFX908-NEXT:    v_mov_b32_e32 v5, v1
1746; GFX908-NEXT:    v_mov_b32_e32 v4, v0
1747; GFX908-NEXT:    buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc
1748; GFX908-NEXT:    s_waitcnt vmcnt(0)
1749; GFX908-NEXT:    buffer_wbinvl1
1750; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v1
1751; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
1752; GFX908-NEXT:    v_mov_b32_e32 v1, v4
1753; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
1754; GFX908-NEXT:    s_cbranch_execnz .LBB13_1
1755; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
1756; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
1757; GFX908-NEXT:    s_setpc_b64 s[30:31]
1758;
1759; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory:
1760; GFX8:       ; %bb.0:
1761; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1762; GFX8-NEXT:    v_mov_b32_e32 v2, s20
1763; GFX8-NEXT:    buffer_load_dword v1, v2, s[16:19], 0 offen
1764; GFX8-NEXT:    s_mov_b64 s[4:5], 0
1765; GFX8-NEXT:    v_mul_f32_e32 v3, 1.0, v0
1766; GFX8-NEXT:  .LBB13_1: ; %atomicrmw.start
1767; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
1768; GFX8-NEXT:    s_waitcnt vmcnt(0)
1769; GFX8-NEXT:    v_mul_f32_e32 v0, 1.0, v1
1770; GFX8-NEXT:    v_max_f32_e32 v0, v0, v3
1771; GFX8-NEXT:    v_mov_b32_e32 v5, v1
1772; GFX8-NEXT:    v_mov_b32_e32 v4, v0
1773; GFX8-NEXT:    buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc
1774; GFX8-NEXT:    s_waitcnt vmcnt(0)
1775; GFX8-NEXT:    buffer_wbinvl1
1776; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v1
1777; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
1778; GFX8-NEXT:    v_mov_b32_e32 v1, v4
1779; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
1780; GFX8-NEXT:    s_cbranch_execnz .LBB13_1
1781; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
1782; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
1783; GFX8-NEXT:    s_setpc_b64 s[30:31]
1784;
1785; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory:
1786; GFX7:       ; %bb.0:
1787; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1788; GFX7-NEXT:    v_mov_b32_e32 v1, s20
1789; GFX7-NEXT:    buffer_atomic_fmax v0, v1, s[16:19], 0 offen
1790; GFX7-NEXT:    s_waitcnt vmcnt(0)
1791; GFX7-NEXT:    buffer_wbinvl1
1792; GFX7-NEXT:    s_setpc_b64 s[30:31]
; IR under test: %unused is never read, so the no-return form is exercised.
1793  %unused = atomicrmw fmax ptr addrspace(7) %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
1794  ret void
1795}
1796
; Returning f64 case: atomicrmw fmax through a buffer fat pointer
; (ptr addrspace(7), passed inreg) at syncscope("agent") with seq_cst ordering
; and !amdgpu.no.fine.grained.memory attached; the atomicrmw result is returned.
; Per the autogenerated checks below, gfx940, gfx1010, gfx90a and hawaii
; select a single 64-bit buffer float-max atomic, while gfx1200, gfx1100,
; gfx908 and tonga expand to a 64-bit buffer cmpswap retry loop.
; The check lines are tool output; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
1797define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, double %val) {
1798; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory:
1799; GFX12:       ; %bb.0:
1800; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
1801; GFX12-NEXT:    s_wait_expcnt 0x0
1802; GFX12-NEXT:    s_wait_samplecnt 0x0
1803; GFX12-NEXT:    s_wait_bvhcnt 0x0
1804; GFX12-NEXT:    s_wait_kmcnt 0x0
1805; GFX12-NEXT:    v_mov_b32_e32 v6, s16
1806; GFX12-NEXT:    v_dual_mov_b32 v2, v0 :: v_dual_mov_b32 v3, v1
1807; GFX12-NEXT:    s_mov_b32 s4, 0
1808; GFX12-NEXT:    buffer_load_b64 v[0:1], v6, s[0:3], null offen
1809; GFX12-NEXT:    v_max_num_f64_e32 v[4:5], v[2:3], v[2:3]
1810; GFX12-NEXT:  .LBB14_1: ; %atomicrmw.start
1811; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
1812; GFX12-NEXT:    s_wait_loadcnt 0x0
1813; GFX12-NEXT:    v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0
1814; GFX12-NEXT:    s_wait_storecnt 0x0
1815; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1816; GFX12-NEXT:    v_max_num_f64_e32 v[0:1], v[9:10], v[9:10]
1817; GFX12-NEXT:    v_max_num_f64_e32 v[7:8], v[0:1], v[4:5]
1818; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1819; GFX12-NEXT:    v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8
1820; GFX12-NEXT:    v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10
1821; GFX12-NEXT:    buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen th:TH_ATOMIC_RETURN
1822; GFX12-NEXT:    s_wait_loadcnt 0x0
1823; GFX12-NEXT:    global_inv scope:SCOPE_DEV
1824; GFX12-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10]
1825; GFX12-NEXT:    s_wait_alu 0xfffe
1826; GFX12-NEXT:    s_or_b32 s4, vcc_lo, s4
1827; GFX12-NEXT:    s_wait_alu 0xfffe
1828; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
1829; GFX12-NEXT:    s_cbranch_execnz .LBB14_1
1830; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
1831; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s4
1832; GFX12-NEXT:    s_wait_alu 0xfffe
1833; GFX12-NEXT:    s_setpc_b64 s[30:31]
1834;
1835; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory:
1836; GFX940:       ; %bb.0:
1837; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1838; GFX940-NEXT:    v_mov_b32_e32 v2, s16
1839; GFX940-NEXT:    buffer_wbl2 sc1
1840; GFX940-NEXT:    buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 offen sc0
1841; GFX940-NEXT:    s_waitcnt vmcnt(0)
1842; GFX940-NEXT:    buffer_inv sc1
1843; GFX940-NEXT:    s_setpc_b64 s[30:31]
1844;
1845; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory:
1846; GFX11:       ; %bb.0:
1847; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1848; GFX11-NEXT:    v_mov_b32_e32 v6, s16
1849; GFX11-NEXT:    v_dual_mov_b32 v2, v0 :: v_dual_mov_b32 v3, v1
1850; GFX11-NEXT:    s_mov_b32 s4, 0
1851; GFX11-NEXT:    buffer_load_b64 v[0:1], v6, s[0:3], 0 offen
1852; GFX11-NEXT:    v_max_f64 v[4:5], v[2:3], v[2:3]
1853; GFX11-NEXT:  .LBB14_1: ; %atomicrmw.start
1854; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
1855; GFX11-NEXT:    s_waitcnt vmcnt(0)
1856; GFX11-NEXT:    v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0
1857; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
1858; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1859; GFX11-NEXT:    v_max_f64 v[0:1], v[9:10], v[9:10]
1860; GFX11-NEXT:    v_max_f64 v[7:8], v[0:1], v[4:5]
1861; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1862; GFX11-NEXT:    v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8
1863; GFX11-NEXT:    v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10
1864; GFX11-NEXT:    buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], 0 offen glc
1865; GFX11-NEXT:    s_waitcnt vmcnt(0)
1866; GFX11-NEXT:    buffer_gl1_inv
1867; GFX11-NEXT:    buffer_gl0_inv
1868; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10]
1869; GFX11-NEXT:    s_or_b32 s4, vcc_lo, s4
1870; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
1871; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
1872; GFX11-NEXT:    s_cbranch_execnz .LBB14_1
1873; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
1874; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s4
1875; GFX11-NEXT:    s_setpc_b64 s[30:31]
1876;
1877; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory:
1878; GFX10:       ; %bb.0:
1879; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1880; GFX10-NEXT:    v_mov_b32_e32 v2, s20
1881; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1882; GFX10-NEXT:    buffer_atomic_fmax_x2 v[0:1], v2, s[16:19], 0 offen glc
1883; GFX10-NEXT:    s_waitcnt vmcnt(0)
1884; GFX10-NEXT:    buffer_gl1_inv
1885; GFX10-NEXT:    buffer_gl0_inv
1886; GFX10-NEXT:    s_setpc_b64 s[30:31]
1887;
1888; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory:
1889; GFX90A:       ; %bb.0:
1890; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1891; GFX90A-NEXT:    v_mov_b32_e32 v2, s20
1892; GFX90A-NEXT:    buffer_atomic_max_f64 v[0:1], v2, s[16:19], 0 offen glc
1893; GFX90A-NEXT:    s_waitcnt vmcnt(0)
1894; GFX90A-NEXT:    buffer_wbinvl1
1895; GFX90A-NEXT:    s_setpc_b64 s[30:31]
1896;
1897; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory:
1898; GFX908:       ; %bb.0:
1899; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1900; GFX908-NEXT:    v_mov_b32_e32 v6, s20
1901; GFX908-NEXT:    v_mov_b32_e32 v2, v0
1902; GFX908-NEXT:    v_mov_b32_e32 v3, v1
1903; GFX908-NEXT:    buffer_load_dwordx2 v[0:1], v6, s[16:19], 0 offen
1904; GFX908-NEXT:    v_max_f64 v[4:5], v[2:3], v[2:3]
1905; GFX908-NEXT:    s_mov_b64 s[4:5], 0
1906; GFX908-NEXT:  .LBB14_1: ; %atomicrmw.start
1907; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
1908; GFX908-NEXT:    s_waitcnt vmcnt(0)
1909; GFX908-NEXT:    v_mov_b32_e32 v10, v1
1910; GFX908-NEXT:    v_mov_b32_e32 v9, v0
1911; GFX908-NEXT:    v_max_f64 v[0:1], v[9:10], v[9:10]
1912; GFX908-NEXT:    v_max_f64 v[7:8], v[0:1], v[4:5]
1913; GFX908-NEXT:    v_mov_b32_e32 v0, v7
1914; GFX908-NEXT:    v_mov_b32_e32 v1, v8
1915; GFX908-NEXT:    v_mov_b32_e32 v2, v9
1916; GFX908-NEXT:    v_mov_b32_e32 v3, v10
1917; GFX908-NEXT:    buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc
1918; GFX908-NEXT:    s_waitcnt vmcnt(0)
1919; GFX908-NEXT:    buffer_wbinvl1
1920; GFX908-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10]
1921; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
1922; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
1923; GFX908-NEXT:    s_cbranch_execnz .LBB14_1
1924; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
1925; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
1926; GFX908-NEXT:    s_setpc_b64 s[30:31]
1927;
1928; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory:
1929; GFX8:       ; %bb.0:
1930; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1931; GFX8-NEXT:    v_mov_b32_e32 v6, s20
1932; GFX8-NEXT:    v_mov_b32_e32 v2, v0
1933; GFX8-NEXT:    v_mov_b32_e32 v3, v1
1934; GFX8-NEXT:    buffer_load_dwordx2 v[0:1], v6, s[16:19], 0 offen
1935; GFX8-NEXT:    v_max_f64 v[4:5], v[2:3], v[2:3]
1936; GFX8-NEXT:    s_mov_b64 s[4:5], 0
1937; GFX8-NEXT:  .LBB14_1: ; %atomicrmw.start
1938; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
1939; GFX8-NEXT:    s_waitcnt vmcnt(0)
1940; GFX8-NEXT:    v_mov_b32_e32 v10, v1
1941; GFX8-NEXT:    v_mov_b32_e32 v9, v0
1942; GFX8-NEXT:    v_max_f64 v[0:1], v[9:10], v[9:10]
1943; GFX8-NEXT:    v_max_f64 v[7:8], v[0:1], v[4:5]
1944; GFX8-NEXT:    v_mov_b32_e32 v0, v7
1945; GFX8-NEXT:    v_mov_b32_e32 v1, v8
1946; GFX8-NEXT:    v_mov_b32_e32 v2, v9
1947; GFX8-NEXT:    v_mov_b32_e32 v3, v10
1948; GFX8-NEXT:    buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc
1949; GFX8-NEXT:    s_waitcnt vmcnt(0)
1950; GFX8-NEXT:    buffer_wbinvl1
1951; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10]
1952; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
1953; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
1954; GFX8-NEXT:    s_cbranch_execnz .LBB14_1
1955; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
1956; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
1957; GFX8-NEXT:    s_setpc_b64 s[30:31]
1958;
1959; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory:
1960; GFX7:       ; %bb.0:
1961; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1962; GFX7-NEXT:    v_mov_b32_e32 v2, s20
1963; GFX7-NEXT:    buffer_atomic_fmax_x2 v[0:1], v2, s[16:19], 0 offen glc
1964; GFX7-NEXT:    s_waitcnt vmcnt(0)
1965; GFX7-NEXT:    buffer_wbinvl1
1966; GFX7-NEXT:    s_setpc_b64 s[30:31]
; IR under test: the atomicrmw result is used, so the returning form is exercised.
1967  %result = atomicrmw fmax ptr addrspace(7) %ptr, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
1968  ret double %result
1969}
1970
1971define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, double %val) {
1972; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory:
1973; GFX12:       ; %bb.0:
1974; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
1975; GFX12-NEXT:    s_wait_expcnt 0x0
1976; GFX12-NEXT:    s_wait_samplecnt 0x0
1977; GFX12-NEXT:    s_wait_bvhcnt 0x0
1978; GFX12-NEXT:    s_wait_kmcnt 0x0
1979; GFX12-NEXT:    v_mov_b32_e32 v6, s16
1980; GFX12-NEXT:    v_max_num_f64_e32 v[4:5], v[0:1], v[0:1]
1981; GFX12-NEXT:    s_mov_b32 s4, 0
1982; GFX12-NEXT:    buffer_load_b64 v[2:3], v6, s[0:3], null offen
1983; GFX12-NEXT:  .LBB15_1: ; %atomicrmw.start
1984; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
1985; GFX12-NEXT:    s_wait_loadcnt 0x0
1986; GFX12-NEXT:    v_max_num_f64_e32 v[0:1], v[2:3], v[2:3]
1987; GFX12-NEXT:    s_wait_storecnt 0x0
1988; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
1989; GFX12-NEXT:    v_max_num_f64_e32 v[0:1], v[0:1], v[4:5]
1990; GFX12-NEXT:    v_dual_mov_b32 v10, v3 :: v_dual_mov_b32 v9, v2
1991; GFX12-NEXT:    v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v7, v0
1992; GFX12-NEXT:    buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], null offen th:TH_ATOMIC_RETURN
1993; GFX12-NEXT:    s_wait_loadcnt 0x0
1994; GFX12-NEXT:    global_inv scope:SCOPE_DEV
1995; GFX12-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[2:3]
1996; GFX12-NEXT:    v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v3, v8
1997; GFX12-NEXT:    s_wait_alu 0xfffe
1998; GFX12-NEXT:    s_or_b32 s4, vcc_lo, s4
1999; GFX12-NEXT:    s_wait_alu 0xfffe
2000; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
2001; GFX12-NEXT:    s_cbranch_execnz .LBB15_1
2002; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
2003; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s4
2004; GFX12-NEXT:    s_wait_alu 0xfffe
2005; GFX12-NEXT:    s_setpc_b64 s[30:31]
2006;
2007; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory:
2008; GFX940:       ; %bb.0:
2009; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2010; GFX940-NEXT:    v_mov_b32_e32 v2, s16
2011; GFX940-NEXT:    buffer_wbl2 sc1
2012; GFX940-NEXT:    buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 offen
2013; GFX940-NEXT:    s_waitcnt vmcnt(0)
2014; GFX940-NEXT:    buffer_inv sc1
2015; GFX940-NEXT:    s_setpc_b64 s[30:31]
2016;
2017; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory:
2018; GFX11:       ; %bb.0:
2019; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2020; GFX11-NEXT:    v_mov_b32_e32 v6, s16
2021; GFX11-NEXT:    v_max_f64 v[4:5], v[0:1], v[0:1]
2022; GFX11-NEXT:    s_mov_b32 s4, 0
2023; GFX11-NEXT:    buffer_load_b64 v[2:3], v6, s[0:3], 0 offen
2024; GFX11-NEXT:  .LBB15_1: ; %atomicrmw.start
2025; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
2026; GFX11-NEXT:    s_waitcnt vmcnt(0)
2027; GFX11-NEXT:    v_max_f64 v[0:1], v[2:3], v[2:3]
2028; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
2029; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
2030; GFX11-NEXT:    v_max_f64 v[0:1], v[0:1], v[4:5]
2031; GFX11-NEXT:    v_dual_mov_b32 v10, v3 :: v_dual_mov_b32 v9, v2
2032; GFX11-NEXT:    v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v7, v0
2033; GFX11-NEXT:    buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], 0 offen glc
2034; GFX11-NEXT:    s_waitcnt vmcnt(0)
2035; GFX11-NEXT:    buffer_gl1_inv
2036; GFX11-NEXT:    buffer_gl0_inv
2037; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[2:3]
2038; GFX11-NEXT:    v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v3, v8
2039; GFX11-NEXT:    s_or_b32 s4, vcc_lo, s4
2040; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
2041; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
2042; GFX11-NEXT:    s_cbranch_execnz .LBB15_1
2043; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
2044; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s4
2045; GFX11-NEXT:    s_setpc_b64 s[30:31]
2046;
2047; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory:
2048; GFX10:       ; %bb.0:
2049; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2050; GFX10-NEXT:    v_mov_b32_e32 v2, s20
2051; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
2052; GFX10-NEXT:    buffer_atomic_fmax_x2 v[0:1], v2, s[16:19], 0 offen
2053; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
2054; GFX10-NEXT:    buffer_gl1_inv
2055; GFX10-NEXT:    buffer_gl0_inv
2056; GFX10-NEXT:    s_setpc_b64 s[30:31]
2057;
2058; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory:
2059; GFX90A:       ; %bb.0:
2060; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2061; GFX90A-NEXT:    v_mov_b32_e32 v2, s20
2062; GFX90A-NEXT:    buffer_atomic_max_f64 v[0:1], v2, s[16:19], 0 offen
2063; GFX90A-NEXT:    s_waitcnt vmcnt(0)
2064; GFX90A-NEXT:    buffer_wbinvl1
2065; GFX90A-NEXT:    s_setpc_b64 s[30:31]
2066;
2067; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory:
2068; GFX908:       ; %bb.0:
2069; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2070; GFX908-NEXT:    v_mov_b32_e32 v6, s20
2071; GFX908-NEXT:    buffer_load_dwordx2 v[2:3], v6, s[16:19], 0 offen
2072; GFX908-NEXT:    v_max_f64 v[4:5], v[0:1], v[0:1]
2073; GFX908-NEXT:    s_mov_b64 s[4:5], 0
2074; GFX908-NEXT:  .LBB15_1: ; %atomicrmw.start
2075; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
2076; GFX908-NEXT:    s_waitcnt vmcnt(0)
2077; GFX908-NEXT:    v_max_f64 v[0:1], v[2:3], v[2:3]
2078; GFX908-NEXT:    v_max_f64 v[0:1], v[0:1], v[4:5]
2079; GFX908-NEXT:    v_mov_b32_e32 v10, v3
2080; GFX908-NEXT:    v_mov_b32_e32 v9, v2
2081; GFX908-NEXT:    v_mov_b32_e32 v8, v1
2082; GFX908-NEXT:    v_mov_b32_e32 v7, v0
2083; GFX908-NEXT:    buffer_atomic_cmpswap_x2 v[7:10], v6, s[16:19], 0 offen glc
2084; GFX908-NEXT:    s_waitcnt vmcnt(0)
2085; GFX908-NEXT:    buffer_wbinvl1
2086; GFX908-NEXT:    v_cmp_eq_u64_e32 vcc, v[7:8], v[2:3]
2087; GFX908-NEXT:    v_mov_b32_e32 v2, v7
2088; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
2089; GFX908-NEXT:    v_mov_b32_e32 v3, v8
2090; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
2091; GFX908-NEXT:    s_cbranch_execnz .LBB15_1
2092; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
2093; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
2094; GFX908-NEXT:    s_setpc_b64 s[30:31]
2095;
2096; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory:
2097; GFX8:       ; %bb.0:
2098; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2099; GFX8-NEXT:    v_mov_b32_e32 v6, s20
2100; GFX8-NEXT:    buffer_load_dwordx2 v[2:3], v6, s[16:19], 0 offen
2101; GFX8-NEXT:    v_max_f64 v[4:5], v[0:1], v[0:1]
2102; GFX8-NEXT:    s_mov_b64 s[4:5], 0
2103; GFX8-NEXT:  .LBB15_1: ; %atomicrmw.start
2104; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
2105; GFX8-NEXT:    s_waitcnt vmcnt(0)
2106; GFX8-NEXT:    v_max_f64 v[0:1], v[2:3], v[2:3]
2107; GFX8-NEXT:    v_max_f64 v[0:1], v[0:1], v[4:5]
2108; GFX8-NEXT:    v_mov_b32_e32 v10, v3
2109; GFX8-NEXT:    v_mov_b32_e32 v9, v2
2110; GFX8-NEXT:    v_mov_b32_e32 v8, v1
2111; GFX8-NEXT:    v_mov_b32_e32 v7, v0
2112; GFX8-NEXT:    buffer_atomic_cmpswap_x2 v[7:10], v6, s[16:19], 0 offen glc
2113; GFX8-NEXT:    s_waitcnt vmcnt(0)
2114; GFX8-NEXT:    buffer_wbinvl1
2115; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, v[7:8], v[2:3]
2116; GFX8-NEXT:    v_mov_b32_e32 v2, v7
2117; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
2118; GFX8-NEXT:    v_mov_b32_e32 v3, v8
2119; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
2120; GFX8-NEXT:    s_cbranch_execnz .LBB15_1
2121; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
2122; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
2123; GFX8-NEXT:    s_setpc_b64 s[30:31]
2124;
2125; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory:
2126; GFX7:       ; %bb.0:
2127; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2128; GFX7-NEXT:    v_mov_b32_e32 v2, s20
2129; GFX7-NEXT:    buffer_atomic_fmax_x2 v[0:1], v2, s[16:19], 0 offen
2130; GFX7-NEXT:    s_waitcnt vmcnt(0)
2131; GFX7-NEXT:    buffer_wbinvl1
2132; GFX7-NEXT:    s_setpc_b64 s[30:31]
2133  %unused = atomicrmw fmax ptr addrspace(7) %ptr, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
2134  ret void
2135}
2136
; Module-level metadata nodes.
; !0 is the empty node; the atomicrmw in the function above attaches it as
; !amdgpu.no.fine.grained.memory.
2137!0 = !{}
; !1 is a node holding the i32 pair (5, 6).
; NOTE(review): no use of !1 is visible in this part of the file -- confirm
; which attachment (if any) still consumes it.
2138!1 = !{i32 5, i32 6}
2139