xref: /llvm-project/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f32-agent.ll (revision 9cc298108a7bb34d0126028a529ecdcb0ade9baf)
1; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
2; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -passes=atomic-expand %s | FileCheck -check-prefixes=COMMON,GFX803 %s
3; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -passes=atomic-expand %s | FileCheck -check-prefixes=COMMON,GFX906 %s
4; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -passes=atomic-expand %s | FileCheck -check-prefixes=COMMON,GFX908 %s
5; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -passes=atomic-expand %s | FileCheck -check-prefixes=COMMON,GFX90A %s
6; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -passes=atomic-expand %s | FileCheck -check-prefixes=COMMON,GFX940 %s
7; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 -passes=atomic-expand %s | FileCheck -check-prefixes=COMMON,GFX10 %s
8; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -passes=atomic-expand %s | FileCheck -check-prefixes=COMMON,GFX11 %s
9; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -passes=atomic-expand %s | FileCheck -check-prefixes=COMMON,GFX12 %s
10
11;---------------------------------------------------------------------
12; atomicrmw xchg
13;---------------------------------------------------------------------
14
15; xchg is supported over PCIe, so no expansion is necessary
16define float @test_atomicrmw_xchg_f32_global_agent(ptr addrspace(1) %ptr, float %value) {
17; COMMON-LABEL: define float @test_atomicrmw_xchg_f32_global_agent(
18; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] {
19; COMMON-NEXT:    [[RES:%.*]] = atomicrmw xchg ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4
20; COMMON-NEXT:    ret float [[RES]]
21;
22  %res = atomicrmw xchg ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst
23  ret float %res
24}
25
26; xchg is supported over PCIe, so no expansion is necessary. Metadata should be ignored.
27define float @test_atomicrmw_xchg_f32_global_agent__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %value) {
28; COMMON-LABEL: define float @test_atomicrmw_xchg_f32_global_agent__amdgpu_no_fine_grained_memory(
29; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
30; COMMON-NEXT:    [[RES:%.*]] = atomicrmw xchg ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0:![0-9]+]]
31; COMMON-NEXT:    ret float [[RES]]
32;
33  %res = atomicrmw xchg ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
34  ret float %res
35}
36
37; xchg is supported over PCIe, so no expansion is necessary. Metadata should be ignored.
38define float @test_atomicrmw_xchg_f32_global_agent__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %value) {
39; COMMON-LABEL: define float @test_atomicrmw_xchg_f32_global_agent__amdgpu_no_remote_memory(
40; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
41; COMMON-NEXT:    [[RES:%.*]] = atomicrmw xchg ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
42; COMMON-NEXT:    ret float [[RES]]
43;
44  %res = atomicrmw xchg ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
45  ret float %res
46}
47
48; xchg is supported over PCIe, so no expansion is necessary. Metadata should be ignored.
49define float @test_atomicrmw_xchg_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %value) {
50; COMMON-LABEL: define float @test_atomicrmw_xchg_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
51; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
52; COMMON-NEXT:    [[RES:%.*]] = atomicrmw xchg ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
53; COMMON-NEXT:    ret float [[RES]]
54;
55  %res = atomicrmw xchg ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
56  ret float %res
57}
58
59;---------------------------------------------------------------------
60; atomicrmw fadd
61;---------------------------------------------------------------------
62
63define float @test_atomicrmw_fadd_f32_global_agent(ptr addrspace(1) %ptr, float %value) {
64; GFX803-LABEL: define float @test_atomicrmw_fadd_f32_global_agent(
65; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
66; GFX803-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
67; GFX803-NEXT:    br label [[ATOMICRMW_START:%.*]]
68; GFX803:       atomicrmw.start:
69; GFX803-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
70; GFX803-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
71; GFX803-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
72; GFX803-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
73; GFX803-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
74; GFX803-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
75; GFX803-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
76; GFX803-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
77; GFX803-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
78; GFX803:       atomicrmw.end:
79; GFX803-NEXT:    ret float [[TMP5]]
80;
81; GFX906-LABEL: define float @test_atomicrmw_fadd_f32_global_agent(
82; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
83; GFX906-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
84; GFX906-NEXT:    br label [[ATOMICRMW_START:%.*]]
85; GFX906:       atomicrmw.start:
86; GFX906-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
87; GFX906-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
88; GFX906-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
89; GFX906-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
90; GFX906-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
91; GFX906-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
92; GFX906-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
93; GFX906-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
94; GFX906-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
95; GFX906:       atomicrmw.end:
96; GFX906-NEXT:    ret float [[TMP5]]
97;
98; GFX908-LABEL: define float @test_atomicrmw_fadd_f32_global_agent(
99; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
100; GFX908-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
101; GFX908-NEXT:    br label [[ATOMICRMW_START:%.*]]
102; GFX908:       atomicrmw.start:
103; GFX908-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
104; GFX908-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
105; GFX908-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
106; GFX908-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
107; GFX908-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
108; GFX908-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
109; GFX908-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
110; GFX908-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
111; GFX908-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
112; GFX908:       atomicrmw.end:
113; GFX908-NEXT:    ret float [[TMP5]]
114;
115; GFX90A-LABEL: define float @test_atomicrmw_fadd_f32_global_agent(
116; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
117; GFX90A-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
118; GFX90A-NEXT:    br label [[ATOMICRMW_START:%.*]]
119; GFX90A:       atomicrmw.start:
120; GFX90A-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
121; GFX90A-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
122; GFX90A-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
123; GFX90A-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
124; GFX90A-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
125; GFX90A-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
126; GFX90A-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
127; GFX90A-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
128; GFX90A-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
129; GFX90A:       atomicrmw.end:
130; GFX90A-NEXT:    ret float [[TMP5]]
131;
132; GFX940-LABEL: define float @test_atomicrmw_fadd_f32_global_agent(
133; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
134; GFX940-NEXT:    [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4
135; GFX940-NEXT:    ret float [[RES]]
136;
137; GFX10-LABEL: define float @test_atomicrmw_fadd_f32_global_agent(
138; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
139; GFX10-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
140; GFX10-NEXT:    br label [[ATOMICRMW_START:%.*]]
141; GFX10:       atomicrmw.start:
142; GFX10-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
143; GFX10-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
144; GFX10-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
145; GFX10-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
146; GFX10-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
147; GFX10-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
148; GFX10-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
149; GFX10-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
150; GFX10-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
151; GFX10:       atomicrmw.end:
152; GFX10-NEXT:    ret float [[TMP5]]
153;
154; GFX11-LABEL: define float @test_atomicrmw_fadd_f32_global_agent(
155; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
156; GFX11-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
157; GFX11-NEXT:    br label [[ATOMICRMW_START:%.*]]
158; GFX11:       atomicrmw.start:
159; GFX11-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
160; GFX11-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
161; GFX11-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
162; GFX11-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
163; GFX11-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
164; GFX11-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
165; GFX11-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
166; GFX11-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
167; GFX11-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
168; GFX11:       atomicrmw.end:
169; GFX11-NEXT:    ret float [[TMP5]]
170;
171; GFX12-LABEL: define float @test_atomicrmw_fadd_f32_global_agent(
172; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
173; GFX12-NEXT:    [[TMP5:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4
174; GFX12-NEXT:    ret float [[TMP5]]
175;
176  %res = atomicrmw fadd ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst
177  ret float %res
178}
179
180define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %value) {
181; GFX803-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_fine_grained_memory(
182; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
183; GFX803-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
184; GFX803-NEXT:    br label [[ATOMICRMW_START:%.*]]
185; GFX803:       atomicrmw.start:
186; GFX803-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
187; GFX803-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
188; GFX803-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
189; GFX803-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
190; GFX803-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
191; GFX803-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
192; GFX803-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
193; GFX803-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
194; GFX803-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
195; GFX803:       atomicrmw.end:
196; GFX803-NEXT:    ret float [[TMP5]]
197;
198; GFX906-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_fine_grained_memory(
199; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
200; GFX906-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
201; GFX906-NEXT:    br label [[ATOMICRMW_START:%.*]]
202; GFX906:       atomicrmw.start:
203; GFX906-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
204; GFX906-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
205; GFX906-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
206; GFX906-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
207; GFX906-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
208; GFX906-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
209; GFX906-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
210; GFX906-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
211; GFX906-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
212; GFX906:       atomicrmw.end:
213; GFX906-NEXT:    ret float [[TMP5]]
214;
215; GFX908-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_fine_grained_memory(
216; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
217; GFX908-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
218; GFX908-NEXT:    br label [[ATOMICRMW_START:%.*]]
219; GFX908:       atomicrmw.start:
220; GFX908-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
221; GFX908-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
222; GFX908-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
223; GFX908-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
224; GFX908-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
225; GFX908-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
226; GFX908-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
227; GFX908-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
228; GFX908-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
229; GFX908:       atomicrmw.end:
230; GFX908-NEXT:    ret float [[TMP5]]
231;
232; GFX90A-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_fine_grained_memory(
233; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
234; GFX90A-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
235; GFX90A-NEXT:    br label [[ATOMICRMW_START:%.*]]
236; GFX90A:       atomicrmw.start:
237; GFX90A-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
238; GFX90A-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
239; GFX90A-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
240; GFX90A-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
241; GFX90A-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
242; GFX90A-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
243; GFX90A-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
244; GFX90A-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
245; GFX90A-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
246; GFX90A:       atomicrmw.end:
247; GFX90A-NEXT:    ret float [[TMP5]]
248;
249; GFX940-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_fine_grained_memory(
250; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
251; GFX940-NEXT:    [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
252; GFX940-NEXT:    ret float [[RES]]
253;
254; GFX10-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_fine_grained_memory(
255; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
256; GFX10-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
257; GFX10-NEXT:    br label [[ATOMICRMW_START:%.*]]
258; GFX10:       atomicrmw.start:
259; GFX10-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
260; GFX10-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
261; GFX10-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
262; GFX10-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
263; GFX10-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
264; GFX10-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
265; GFX10-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
266; GFX10-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
267; GFX10-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
268; GFX10:       atomicrmw.end:
269; GFX10-NEXT:    ret float [[TMP5]]
270;
271; GFX11-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_fine_grained_memory(
272; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
273; GFX11-NEXT:    [[TMP5:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
274; GFX11-NEXT:    ret float [[TMP5]]
275;
276; GFX12-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_fine_grained_memory(
277; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
278; GFX12-NEXT:    [[TMP5:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
279; GFX12-NEXT:    ret float [[TMP5]]
280;
281  %res = atomicrmw fadd ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
282  ret float %res
283}
284
285define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %value) {
286; GFX803-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_remote_memory(
287; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
288; GFX803-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
289; GFX803-NEXT:    br label [[ATOMICRMW_START:%.*]]
290; GFX803:       atomicrmw.start:
291; GFX803-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
292; GFX803-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
293; GFX803-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
294; GFX803-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
295; GFX803-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
296; GFX803-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
297; GFX803-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
298; GFX803-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
299; GFX803-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
300; GFX803:       atomicrmw.end:
301; GFX803-NEXT:    ret float [[TMP5]]
302;
303; GFX906-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_remote_memory(
304; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
305; GFX906-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
306; GFX906-NEXT:    br label [[ATOMICRMW_START:%.*]]
307; GFX906:       atomicrmw.start:
308; GFX906-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
309; GFX906-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
310; GFX906-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
311; GFX906-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
312; GFX906-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
313; GFX906-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
314; GFX906-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
315; GFX906-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
316; GFX906-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
317; GFX906:       atomicrmw.end:
318; GFX906-NEXT:    ret float [[TMP5]]
319;
320; GFX908-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_remote_memory(
321; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
322; GFX908-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
323; GFX908-NEXT:    br label [[ATOMICRMW_START:%.*]]
324; GFX908:       atomicrmw.start:
325; GFX908-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
326; GFX908-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
327; GFX908-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
328; GFX908-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
329; GFX908-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
330; GFX908-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
331; GFX908-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
332; GFX908-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
333; GFX908-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
334; GFX908:       atomicrmw.end:
335; GFX908-NEXT:    ret float [[TMP5]]
336;
337; GFX90A-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_remote_memory(
338; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
339; GFX90A-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
340; GFX90A-NEXT:    br label [[ATOMICRMW_START:%.*]]
341; GFX90A:       atomicrmw.start:
342; GFX90A-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
343; GFX90A-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
344; GFX90A-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
345; GFX90A-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
346; GFX90A-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
347; GFX90A-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
348; GFX90A-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
349; GFX90A-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
350; GFX90A-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
351; GFX90A:       atomicrmw.end:
352; GFX90A-NEXT:    ret float [[TMP5]]
353;
354; GFX940-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_remote_memory(
355; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
356; GFX940-NEXT:    [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
357; GFX940-NEXT:    ret float [[RES]]
358;
359; GFX10-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_remote_memory(
360; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
361; GFX10-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
362; GFX10-NEXT:    br label [[ATOMICRMW_START:%.*]]
363; GFX10:       atomicrmw.start:
364; GFX10-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
365; GFX10-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
366; GFX10-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
367; GFX10-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
368; GFX10-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
369; GFX10-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
370; GFX10-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
371; GFX10-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
372; GFX10-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
373; GFX10:       atomicrmw.end:
374; GFX10-NEXT:    ret float [[TMP5]]
375;
376; GFX11-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_remote_memory(
377; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
378; GFX11-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
379; GFX11-NEXT:    br label [[ATOMICRMW_START:%.*]]
380; GFX11:       atomicrmw.start:
381; GFX11-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
382; GFX11-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
383; GFX11-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
384; GFX11-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
385; GFX11-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
386; GFX11-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
387; GFX11-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
388; GFX11-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
389; GFX11-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
390; GFX11:       atomicrmw.end:
391; GFX11-NEXT:    ret float [[TMP5]]
392;
393; GFX12-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_remote_memory(
394; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
395; GFX12-NEXT:    [[TMP5:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
396; GFX12-NEXT:    ret float [[TMP5]]
397;
398  %res = atomicrmw fadd ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
399  ret float %res
400}
401
402define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %value) {
403; GFX803-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
404; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
405; GFX803-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
406; GFX803-NEXT:    br label [[ATOMICRMW_START:%.*]]
407; GFX803:       atomicrmw.start:
408; GFX803-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
409; GFX803-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
410; GFX803-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
411; GFX803-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
412; GFX803-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
413; GFX803-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
414; GFX803-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
415; GFX803-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
416; GFX803-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
417; GFX803:       atomicrmw.end:
418; GFX803-NEXT:    ret float [[TMP5]]
419;
420; GFX906-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
421; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
422; GFX906-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
423; GFX906-NEXT:    br label [[ATOMICRMW_START:%.*]]
424; GFX906:       atomicrmw.start:
425; GFX906-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
426; GFX906-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
427; GFX906-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
428; GFX906-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
429; GFX906-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
430; GFX906-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
431; GFX906-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
432; GFX906-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
433; GFX906-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
434; GFX906:       atomicrmw.end:
435; GFX906-NEXT:    ret float [[TMP5]]
436;
437; GFX908-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
438; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
439; GFX908-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
440; GFX908-NEXT:    br label [[ATOMICRMW_START:%.*]]
441; GFX908:       atomicrmw.start:
442; GFX908-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
443; GFX908-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
444; GFX908-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
445; GFX908-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
446; GFX908-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
447; GFX908-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
448; GFX908-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
449; GFX908-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
450; GFX908-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
451; GFX908:       atomicrmw.end:
452; GFX908-NEXT:    ret float [[TMP5]]
453;
454; GFX90A-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
455; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
456; GFX90A-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
457; GFX90A-NEXT:    br label [[ATOMICRMW_START:%.*]]
458; GFX90A:       atomicrmw.start:
459; GFX90A-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
460; GFX90A-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
461; GFX90A-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
462; GFX90A-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
463; GFX90A-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
464; GFX90A-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
465; GFX90A-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
466; GFX90A-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
467; GFX90A-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
468; GFX90A:       atomicrmw.end:
469; GFX90A-NEXT:    ret float [[TMP5]]
470;
471; GFX940-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
472; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
473; GFX940-NEXT:    [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
474; GFX940-NEXT:    ret float [[RES]]
475;
476; GFX10-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
477; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
478; GFX10-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
479; GFX10-NEXT:    br label [[ATOMICRMW_START:%.*]]
480; GFX10:       atomicrmw.start:
481; GFX10-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
482; GFX10-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
483; GFX10-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
484; GFX10-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
485; GFX10-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
486; GFX10-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
487; GFX10-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
488; GFX10-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
489; GFX10-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
490; GFX10:       atomicrmw.end:
491; GFX10-NEXT:    ret float [[TMP5]]
492;
493; GFX11-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
494; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
495; GFX11-NEXT:    [[TMP5:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
496; GFX11-NEXT:    ret float [[TMP5]]
497;
498; GFX12-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
499; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
500; GFX12-NEXT:    [[TMP5:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
501; GFX12-NEXT:    ret float [[TMP5]]
502;
503  %res = atomicrmw fadd ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
504  ret float %res
505}
506
507define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f32_daz(ptr addrspace(1) %ptr, float %value) #0 {
508; GFX803-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f32_daz(
509; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] {
510; GFX803-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
511; GFX803-NEXT:    br label [[ATOMICRMW_START:%.*]]
512; GFX803:       atomicrmw.start:
513; GFX803-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
514; GFX803-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
515; GFX803-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
516; GFX803-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
517; GFX803-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
518; GFX803-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
519; GFX803-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
520; GFX803-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
521; GFX803-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
522; GFX803:       atomicrmw.end:
523; GFX803-NEXT:    ret float [[TMP5]]
524;
525; GFX906-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f32_daz(
526; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] {
527; GFX906-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
528; GFX906-NEXT:    br label [[ATOMICRMW_START:%.*]]
529; GFX906:       atomicrmw.start:
530; GFX906-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
531; GFX906-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
532; GFX906-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
533; GFX906-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
534; GFX906-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
535; GFX906-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
536; GFX906-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
537; GFX906-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
538; GFX906-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
539; GFX906:       atomicrmw.end:
540; GFX906-NEXT:    ret float [[TMP5]]
541;
542; GFX908-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f32_daz(
543; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] {
544; GFX908-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
545; GFX908-NEXT:    br label [[ATOMICRMW_START:%.*]]
546; GFX908:       atomicrmw.start:
547; GFX908-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
548; GFX908-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
549; GFX908-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
550; GFX908-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
551; GFX908-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
552; GFX908-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
553; GFX908-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
554; GFX908-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
555; GFX908-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
556; GFX908:       atomicrmw.end:
557; GFX908-NEXT:    ret float [[TMP5]]
558;
559; GFX90A-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f32_daz(
560; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] {
561; GFX90A-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
562; GFX90A-NEXT:    br label [[ATOMICRMW_START:%.*]]
563; GFX90A:       atomicrmw.start:
564; GFX90A-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
565; GFX90A-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
566; GFX90A-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
567; GFX90A-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
568; GFX90A-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
569; GFX90A-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
570; GFX90A-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
571; GFX90A-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
572; GFX90A-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
573; GFX90A:       atomicrmw.end:
574; GFX90A-NEXT:    ret float [[TMP5]]
575;
576; GFX940-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f32_daz(
577; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] {
578; GFX940-NEXT:    [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
579; GFX940-NEXT:    ret float [[RES]]
580;
581; GFX10-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f32_daz(
582; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] {
583; GFX10-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
584; GFX10-NEXT:    br label [[ATOMICRMW_START:%.*]]
585; GFX10:       atomicrmw.start:
586; GFX10-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
587; GFX10-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
588; GFX10-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
589; GFX10-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
590; GFX10-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
591; GFX10-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
592; GFX10-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
593; GFX10-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
594; GFX10-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
595; GFX10:       atomicrmw.end:
596; GFX10-NEXT:    ret float [[TMP5]]
597;
598; GFX11-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f32_daz(
599; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] {
600; GFX11-NEXT:    [[TMP5:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
601; GFX11-NEXT:    ret float [[TMP5]]
602;
603; GFX12-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f32_daz(
604; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] {
605; GFX12-NEXT:    [[TMP5:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
606; GFX12-NEXT:    ret float [[TMP5]]
607;
608  %res = atomicrmw fadd ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
609  ret float %res
610}
611
612define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f32_dynamic(ptr addrspace(1) %ptr, float %value) #1 {
613; GFX803-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f32_dynamic(
614; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] {
615; GFX803-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
616; GFX803-NEXT:    br label [[ATOMICRMW_START:%.*]]
617; GFX803:       atomicrmw.start:
618; GFX803-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
619; GFX803-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
620; GFX803-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
621; GFX803-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
622; GFX803-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
623; GFX803-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
624; GFX803-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
625; GFX803-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
626; GFX803-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
627; GFX803:       atomicrmw.end:
628; GFX803-NEXT:    ret float [[TMP5]]
629;
630; GFX906-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f32_dynamic(
631; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] {
632; GFX906-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
633; GFX906-NEXT:    br label [[ATOMICRMW_START:%.*]]
634; GFX906:       atomicrmw.start:
635; GFX906-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
636; GFX906-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
637; GFX906-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
638; GFX906-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
639; GFX906-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
640; GFX906-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
641; GFX906-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
642; GFX906-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
643; GFX906-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
644; GFX906:       atomicrmw.end:
645; GFX906-NEXT:    ret float [[TMP5]]
646;
647; GFX908-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f32_dynamic(
648; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] {
649; GFX908-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
650; GFX908-NEXT:    br label [[ATOMICRMW_START:%.*]]
651; GFX908:       atomicrmw.start:
652; GFX908-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
653; GFX908-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
654; GFX908-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
655; GFX908-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
656; GFX908-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
657; GFX908-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
658; GFX908-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
659; GFX908-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
660; GFX908-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
661; GFX908:       atomicrmw.end:
662; GFX908-NEXT:    ret float [[TMP5]]
663;
664; GFX90A-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f32_dynamic(
665; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] {
666; GFX90A-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
667; GFX90A-NEXT:    br label [[ATOMICRMW_START:%.*]]
668; GFX90A:       atomicrmw.start:
669; GFX90A-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
670; GFX90A-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
671; GFX90A-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
672; GFX90A-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
673; GFX90A-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
674; GFX90A-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
675; GFX90A-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
676; GFX90A-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
677; GFX90A-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
678; GFX90A:       atomicrmw.end:
679; GFX90A-NEXT:    ret float [[TMP5]]
680;
681; GFX940-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f32_dynamic(
682; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] {
683; GFX940-NEXT:    [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
684; GFX940-NEXT:    ret float [[RES]]
685;
686; GFX10-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f32_dynamic(
687; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] {
688; GFX10-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
689; GFX10-NEXT:    br label [[ATOMICRMW_START:%.*]]
690; GFX10:       atomicrmw.start:
691; GFX10-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
692; GFX10-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
693; GFX10-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
694; GFX10-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
695; GFX10-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
696; GFX10-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
697; GFX10-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
698; GFX10-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
699; GFX10-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
700; GFX10:       atomicrmw.end:
701; GFX10-NEXT:    ret float [[TMP5]]
702;
703; GFX11-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f32_dynamic(
704; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] {
705; GFX11-NEXT:    [[TMP5:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
706; GFX11-NEXT:    ret float [[TMP5]]
707;
708; GFX12-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f32_dynamic(
709; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] {
710; GFX12-NEXT:    [[TMP5:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
711; GFX12-NEXT:    ret float [[TMP5]]
712;
713  %res = atomicrmw fadd ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
714  ret float %res
715}
716
717define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode(ptr addrspace(1) %ptr, float %value) {
718; GFX803-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode(
719; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
720; GFX803-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
721; GFX803-NEXT:    br label [[ATOMICRMW_START:%.*]]
722; GFX803:       atomicrmw.start:
723; GFX803-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
724; GFX803-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
725; GFX803-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
726; GFX803-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
727; GFX803-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
728; GFX803-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
729; GFX803-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
730; GFX803-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
731; GFX803-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
732; GFX803:       atomicrmw.end:
733; GFX803-NEXT:    ret float [[TMP5]]
734;
735; GFX906-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode(
736; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
737; GFX906-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
738; GFX906-NEXT:    br label [[ATOMICRMW_START:%.*]]
739; GFX906:       atomicrmw.start:
740; GFX906-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
741; GFX906-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
742; GFX906-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
743; GFX906-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
744; GFX906-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
745; GFX906-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
746; GFX906-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
747; GFX906-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
748; GFX906-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
749; GFX906:       atomicrmw.end:
750; GFX906-NEXT:    ret float [[TMP5]]
751;
752; GFX908-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode(
753; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
754; GFX908-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
755; GFX908-NEXT:    br label [[ATOMICRMW_START:%.*]]
756; GFX908:       atomicrmw.start:
757; GFX908-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
758; GFX908-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
759; GFX908-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
760; GFX908-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
761; GFX908-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
762; GFX908-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
763; GFX908-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
764; GFX908-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
765; GFX908-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
766; GFX908:       atomicrmw.end:
767; GFX908-NEXT:    ret float [[TMP5]]
768;
769; GFX90A-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode(
770; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
771; GFX90A-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
772; GFX90A-NEXT:    br label [[ATOMICRMW_START:%.*]]
773; GFX90A:       atomicrmw.start:
774; GFX90A-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
775; GFX90A-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
776; GFX90A-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
777; GFX90A-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
778; GFX90A-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
779; GFX90A-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
780; GFX90A-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
781; GFX90A-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
782; GFX90A-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
783; GFX90A:       atomicrmw.end:
784; GFX90A-NEXT:    ret float [[TMP5]]
785;
786; GFX940-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode(
787; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
788; GFX940-NEXT:    [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.ignore.denormal.mode [[META0]]
789; GFX940-NEXT:    ret float [[RES]]
790;
791; GFX10-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode(
792; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
793; GFX10-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
794; GFX10-NEXT:    br label [[ATOMICRMW_START:%.*]]
795; GFX10:       atomicrmw.start:
796; GFX10-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
797; GFX10-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
798; GFX10-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
799; GFX10-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
800; GFX10-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
801; GFX10-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
802; GFX10-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
803; GFX10-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
804; GFX10-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
805; GFX10:       atomicrmw.end:
806; GFX10-NEXT:    ret float [[TMP5]]
807;
808; GFX11-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode(
809; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
810; GFX11-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
811; GFX11-NEXT:    br label [[ATOMICRMW_START:%.*]]
812; GFX11:       atomicrmw.start:
813; GFX11-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
814; GFX11-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
815; GFX11-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
816; GFX11-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
817; GFX11-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
818; GFX11-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
819; GFX11-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
820; GFX11-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
821; GFX11-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
822; GFX11:       atomicrmw.end:
823; GFX11-NEXT:    ret float [[TMP5]]
824;
825; GFX12-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode(
826; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
827; GFX12-NEXT:    [[TMP5:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.ignore.denormal.mode [[META0]]
828; GFX12-NEXT:    ret float [[TMP5]]
829;
830  %res = atomicrmw fadd ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst, align 4, !amdgpu.ignore.denormal.mode !0
831  ret float %res
832}
833
834define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %value) {
835; GFX803-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
836; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
837; GFX803-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
838; GFX803-NEXT:    br label [[ATOMICRMW_START:%.*]]
839; GFX803:       atomicrmw.start:
840; GFX803-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
841; GFX803-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
842; GFX803-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
843; GFX803-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
844; GFX803-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
845; GFX803-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
846; GFX803-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
847; GFX803-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
848; GFX803-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
849; GFX803:       atomicrmw.end:
850; GFX803-NEXT:    ret float [[TMP5]]
851;
852; GFX906-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
853; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
854; GFX906-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
855; GFX906-NEXT:    br label [[ATOMICRMW_START:%.*]]
856; GFX906:       atomicrmw.start:
857; GFX906-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
858; GFX906-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
859; GFX906-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
860; GFX906-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
861; GFX906-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
862; GFX906-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
863; GFX906-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
864; GFX906-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
865; GFX906-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
866; GFX906:       atomicrmw.end:
867; GFX906-NEXT:    ret float [[TMP5]]
868;
869; GFX908-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
870; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
871; GFX908-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
872; GFX908-NEXT:    br label [[ATOMICRMW_START:%.*]]
873; GFX908:       atomicrmw.start:
874; GFX908-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
875; GFX908-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
876; GFX908-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
877; GFX908-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
878; GFX908-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
879; GFX908-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
880; GFX908-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
881; GFX908-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
882; GFX908-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
883; GFX908:       atomicrmw.end:
884; GFX908-NEXT:    ret float [[TMP5]]
885;
886; GFX90A-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
887; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
888; GFX90A-NEXT:    [[TMP5:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
889; GFX90A-NEXT:    ret float [[TMP5]]
890;
891; GFX940-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
892; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
893; GFX940-NEXT:    [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
894; GFX940-NEXT:    ret float [[RES]]
895;
896; GFX10-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
897; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
898; GFX10-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
899; GFX10-NEXT:    br label [[ATOMICRMW_START:%.*]]
900; GFX10:       atomicrmw.start:
901; GFX10-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
902; GFX10-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
903; GFX10-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
904; GFX10-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
905; GFX10-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
906; GFX10-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
907; GFX10-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
908; GFX10-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
909; GFX10-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
910; GFX10:       atomicrmw.end:
911; GFX10-NEXT:    ret float [[TMP5]]
912;
913; GFX11-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
914; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
915; GFX11-NEXT:    [[TMP5:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
916; GFX11-NEXT:    ret float [[TMP5]]
917;
918; GFX12-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
919; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
920; GFX12-NEXT:    [[TMP5:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
921; GFX12-NEXT:    ret float [[TMP5]]
922;
923  %res = atomicrmw fadd ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0
924  ret float %res
925}
926
927define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %value) {
928; GFX803-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
929; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
930; GFX803-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
931; GFX803-NEXT:    br label [[ATOMICRMW_START:%.*]]
932; GFX803:       atomicrmw.start:
933; GFX803-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
934; GFX803-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
935; GFX803-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
936; GFX803-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
937; GFX803-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
938; GFX803-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
939; GFX803-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
940; GFX803-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
941; GFX803-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
942; GFX803:       atomicrmw.end:
943; GFX803-NEXT:    ret float [[TMP5]]
944;
945; GFX906-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
946; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
947; GFX906-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
948; GFX906-NEXT:    br label [[ATOMICRMW_START:%.*]]
949; GFX906:       atomicrmw.start:
950; GFX906-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
951; GFX906-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
952; GFX906-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
953; GFX906-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
954; GFX906-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
955; GFX906-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
956; GFX906-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
957; GFX906-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
958; GFX906-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
959; GFX906:       atomicrmw.end:
960; GFX906-NEXT:    ret float [[TMP5]]
961;
962; GFX908-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
963; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
964; GFX908-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
965; GFX908-NEXT:    br label [[ATOMICRMW_START:%.*]]
966; GFX908:       atomicrmw.start:
967; GFX908-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
968; GFX908-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
969; GFX908-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
970; GFX908-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
971; GFX908-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
972; GFX908-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
973; GFX908-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
974; GFX908-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
975; GFX908-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
976; GFX908:       atomicrmw.end:
977; GFX908-NEXT:    ret float [[TMP5]]
978;
979; GFX90A-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
980; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
981; GFX90A-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
982; GFX90A-NEXT:    br label [[ATOMICRMW_START:%.*]]
983; GFX90A:       atomicrmw.start:
984; GFX90A-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
985; GFX90A-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
986; GFX90A-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
987; GFX90A-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
988; GFX90A-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
989; GFX90A-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
990; GFX90A-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
991; GFX90A-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
992; GFX90A-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
993; GFX90A:       atomicrmw.end:
994; GFX90A-NEXT:    ret float [[TMP5]]
995;
996; GFX940-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
997; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
998; GFX940-NEXT:    [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
999; GFX940-NEXT:    ret float [[RES]]
1000;
1001; GFX10-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
1002; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
1003; GFX10-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
1004; GFX10-NEXT:    br label [[ATOMICRMW_START:%.*]]
1005; GFX10:       atomicrmw.start:
1006; GFX10-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
1007; GFX10-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
1008; GFX10-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
1009; GFX10-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
1010; GFX10-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
1011; GFX10-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
1012; GFX10-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
1013; GFX10-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
1014; GFX10-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
1015; GFX10:       atomicrmw.end:
1016; GFX10-NEXT:    ret float [[TMP5]]
1017;
1018; GFX11-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
1019; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
1020; GFX11-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
1021; GFX11-NEXT:    br label [[ATOMICRMW_START:%.*]]
1022; GFX11:       atomicrmw.start:
1023; GFX11-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
1024; GFX11-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
1025; GFX11-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
1026; GFX11-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
1027; GFX11-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
1028; GFX11-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
1029; GFX11-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
1030; GFX11-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
1031; GFX11-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
1032; GFX11:       atomicrmw.end:
1033; GFX11-NEXT:    ret float [[TMP5]]
1034;
1035; GFX12-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
1036; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
1037; GFX12-NEXT:    [[TMP5:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
1038; GFX12-NEXT:    ret float [[TMP5]]
1039;
1040  %res = atomicrmw fadd ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0
1041  ret float %res
1042}
1043
1044define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %value) {
1045; GFX803-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
1046; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
1047; GFX803-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
1048; GFX803-NEXT:    br label [[ATOMICRMW_START:%.*]]
1049; GFX803:       atomicrmw.start:
1050; GFX803-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
1051; GFX803-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
1052; GFX803-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
1053; GFX803-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
1054; GFX803-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
1055; GFX803-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
1056; GFX803-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
1057; GFX803-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
1058; GFX803-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
1059; GFX803:       atomicrmw.end:
1060; GFX803-NEXT:    ret float [[TMP5]]
1061;
1062; GFX906-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
1063; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
1064; GFX906-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
1065; GFX906-NEXT:    br label [[ATOMICRMW_START:%.*]]
1066; GFX906:       atomicrmw.start:
1067; GFX906-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
1068; GFX906-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
1069; GFX906-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
1070; GFX906-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
1071; GFX906-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
1072; GFX906-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
1073; GFX906-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
1074; GFX906-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
1075; GFX906-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
1076; GFX906:       atomicrmw.end:
1077; GFX906-NEXT:    ret float [[TMP5]]
1078;
1079; GFX908-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
1080; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
1081; GFX908-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
1082; GFX908-NEXT:    br label [[ATOMICRMW_START:%.*]]
1083; GFX908:       atomicrmw.start:
1084; GFX908-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
1085; GFX908-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
1086; GFX908-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
1087; GFX908-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
1088; GFX908-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
1089; GFX908-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
1090; GFX908-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
1091; GFX908-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
1092; GFX908-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
1093; GFX908:       atomicrmw.end:
1094; GFX908-NEXT:    ret float [[TMP5]]
1095;
1096; GFX90A-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
1097; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
1098; GFX90A-NEXT:    [[TMP5:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
1099; GFX90A-NEXT:    ret float [[TMP5]]
1100;
1101; GFX940-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
1102; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
1103; GFX940-NEXT:    [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
1104; GFX940-NEXT:    ret float [[RES]]
1105;
1106; GFX10-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
1107; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
1108; GFX10-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
1109; GFX10-NEXT:    br label [[ATOMICRMW_START:%.*]]
1110; GFX10:       atomicrmw.start:
1111; GFX10-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
1112; GFX10-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
1113; GFX10-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
1114; GFX10-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
1115; GFX10-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
1116; GFX10-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
1117; GFX10-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
1118; GFX10-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
1119; GFX10-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
1120; GFX10:       atomicrmw.end:
1121; GFX10-NEXT:    ret float [[TMP5]]
1122;
1123; GFX11-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
1124; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
1125; GFX11-NEXT:    [[TMP5:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
1126; GFX11-NEXT:    ret float [[TMP5]]
1127;
1128; GFX12-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
1129; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
1130; GFX12-NEXT:    [[TMP5:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
1131; GFX12-NEXT:    ret float [[TMP5]]
1132;
1133  %res = atomicrmw fadd ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0
1134  ret float %res
1135}
1136
1137define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz(ptr addrspace(1) %ptr, float %value) #0 {
1138; GFX803-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz(
1139; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1]] {
1140; GFX803-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
1141; GFX803-NEXT:    br label [[ATOMICRMW_START:%.*]]
1142; GFX803:       atomicrmw.start:
1143; GFX803-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
1144; GFX803-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
1145; GFX803-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
1146; GFX803-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
1147; GFX803-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
1148; GFX803-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
1149; GFX803-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
1150; GFX803-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
1151; GFX803-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
1152; GFX803:       atomicrmw.end:
1153; GFX803-NEXT:    ret float [[TMP5]]
1154;
1155; GFX906-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz(
1156; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1]] {
1157; GFX906-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
1158; GFX906-NEXT:    br label [[ATOMICRMW_START:%.*]]
1159; GFX906:       atomicrmw.start:
1160; GFX906-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
1161; GFX906-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
1162; GFX906-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
1163; GFX906-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
1164; GFX906-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
1165; GFX906-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
1166; GFX906-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
1167; GFX906-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
1168; GFX906-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
1169; GFX906:       atomicrmw.end:
1170; GFX906-NEXT:    ret float [[TMP5]]
1171;
1172; GFX908-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz(
1173; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1]] {
1174; GFX908-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
1175; GFX908-NEXT:    br label [[ATOMICRMW_START:%.*]]
1176; GFX908:       atomicrmw.start:
1177; GFX908-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
1178; GFX908-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
1179; GFX908-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
1180; GFX908-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
1181; GFX908-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
1182; GFX908-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
1183; GFX908-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
1184; GFX908-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
1185; GFX908-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
1186; GFX908:       atomicrmw.end:
1187; GFX908-NEXT:    ret float [[TMP5]]
1188;
1189; GFX90A-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz(
1190; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1]] {
1191; GFX90A-NEXT:    [[TMP5:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
1192; GFX90A-NEXT:    ret float [[TMP5]]
1193;
1194; GFX940-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz(
1195; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1]] {
1196; GFX940-NEXT:    [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
1197; GFX940-NEXT:    ret float [[RES]]
1198;
1199; GFX10-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz(
1200; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1]] {
1201; GFX10-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
1202; GFX10-NEXT:    br label [[ATOMICRMW_START:%.*]]
1203; GFX10:       atomicrmw.start:
1204; GFX10-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
1205; GFX10-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
1206; GFX10-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
1207; GFX10-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
1208; GFX10-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
1209; GFX10-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
1210; GFX10-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
1211; GFX10-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
1212; GFX10-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
1213; GFX10:       atomicrmw.end:
1214; GFX10-NEXT:    ret float [[TMP5]]
1215;
1216; GFX11-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz(
1217; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1]] {
1218; GFX11-NEXT:    [[TMP5:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
1219; GFX11-NEXT:    ret float [[TMP5]]
1220;
1221; GFX12-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz(
1222; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1]] {
1223; GFX12-NEXT:    [[TMP5:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
1224; GFX12-NEXT:    ret float [[TMP5]]
1225;
1226  %res = atomicrmw fadd ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0
1227  ret float %res
1228}
1229
1230define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic(ptr addrspace(1) %ptr, float %value) #1 {
1231; GFX803-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic(
1232; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2]] {
1233; GFX803-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
1234; GFX803-NEXT:    br label [[ATOMICRMW_START:%.*]]
1235; GFX803:       atomicrmw.start:
1236; GFX803-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
1237; GFX803-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
1238; GFX803-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
1239; GFX803-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
1240; GFX803-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
1241; GFX803-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
1242; GFX803-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
1243; GFX803-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
1244; GFX803-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
1245; GFX803:       atomicrmw.end:
1246; GFX803-NEXT:    ret float [[TMP5]]
1247;
1248; GFX906-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic(
1249; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2]] {
1250; GFX906-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
1251; GFX906-NEXT:    br label [[ATOMICRMW_START:%.*]]
1252; GFX906:       atomicrmw.start:
1253; GFX906-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
1254; GFX906-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
1255; GFX906-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
1256; GFX906-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
1257; GFX906-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
1258; GFX906-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
1259; GFX906-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
1260; GFX906-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
1261; GFX906-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
1262; GFX906:       atomicrmw.end:
1263; GFX906-NEXT:    ret float [[TMP5]]
1264;
1265; GFX908-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic(
1266; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2]] {
1267; GFX908-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
1268; GFX908-NEXT:    br label [[ATOMICRMW_START:%.*]]
1269; GFX908:       atomicrmw.start:
1270; GFX908-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
1271; GFX908-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
1272; GFX908-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
1273; GFX908-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
1274; GFX908-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
1275; GFX908-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
1276; GFX908-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
1277; GFX908-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
1278; GFX908-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
1279; GFX908:       atomicrmw.end:
1280; GFX908-NEXT:    ret float [[TMP5]]
1281;
1282; GFX90A-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic(
1283; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2]] {
1284; GFX90A-NEXT:    [[TMP5:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
1285; GFX90A-NEXT:    ret float [[TMP5]]
1286;
1287; GFX940-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic(
1288; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2]] {
1289; GFX940-NEXT:    [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
1290; GFX940-NEXT:    ret float [[RES]]
1291;
1292; GFX10-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic(
1293; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2]] {
1294; GFX10-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
1295; GFX10-NEXT:    br label [[ATOMICRMW_START:%.*]]
1296; GFX10:       atomicrmw.start:
1297; GFX10-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
1298; GFX10-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
1299; GFX10-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
1300; GFX10-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
1301; GFX10-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
1302; GFX10-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
1303; GFX10-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
1304; GFX10-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
1305; GFX10-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
1306; GFX10:       atomicrmw.end:
1307; GFX10-NEXT:    ret float [[TMP5]]
1308;
1309; GFX11-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic(
1310; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2]] {
1311; GFX11-NEXT:    [[TMP5:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
1312; GFX11-NEXT:    ret float [[TMP5]]
1313;
1314; GFX12-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic(
1315; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2]] {
1316; GFX12-NEXT:    [[TMP5:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
1317; GFX12-NEXT:    ret float [[TMP5]]
1318;
1319  %res = atomicrmw fadd ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0
1320  ret float %res
1321}
1322
1323;---------------------------------------------------------------------
1324; atomicrmw fadd (no return)
1325;---------------------------------------------------------------------
1326
1327define void @test_atomicrmw_fadd_noret_f32_global_agent(ptr addrspace(1) %ptr, float %value) {
1328; GFX803-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent(
1329; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
1330; GFX803-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
1331; GFX803-NEXT:    br label [[ATOMICRMW_START:%.*]]
1332; GFX803:       atomicrmw.start:
1333; GFX803-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
1334; GFX803-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
1335; GFX803-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
1336; GFX803-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
1337; GFX803-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
1338; GFX803-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
1339; GFX803-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
1340; GFX803-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
1341; GFX803-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
1342; GFX803:       atomicrmw.end:
1343; GFX803-NEXT:    ret void
1344;
1345; GFX906-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent(
1346; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
1347; GFX906-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
1348; GFX906-NEXT:    br label [[ATOMICRMW_START:%.*]]
1349; GFX906:       atomicrmw.start:
1350; GFX906-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
1351; GFX906-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
1352; GFX906-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
1353; GFX906-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
1354; GFX906-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
1355; GFX906-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
1356; GFX906-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
1357; GFX906-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
1358; GFX906-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
1359; GFX906:       atomicrmw.end:
1360; GFX906-NEXT:    ret void
1361;
1362; GFX908-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent(
1363; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
1364; GFX908-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
1365; GFX908-NEXT:    br label [[ATOMICRMW_START:%.*]]
1366; GFX908:       atomicrmw.start:
1367; GFX908-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
1368; GFX908-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
1369; GFX908-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
1370; GFX908-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
1371; GFX908-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
1372; GFX908-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
1373; GFX908-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
1374; GFX908-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
1375; GFX908-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
1376; GFX908:       atomicrmw.end:
1377; GFX908-NEXT:    ret void
1378;
1379; GFX90A-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent(
1380; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
1381; GFX90A-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
1382; GFX90A-NEXT:    br label [[ATOMICRMW_START:%.*]]
1383; GFX90A:       atomicrmw.start:
1384; GFX90A-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
1385; GFX90A-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
1386; GFX90A-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
1387; GFX90A-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
1388; GFX90A-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
1389; GFX90A-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
1390; GFX90A-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
1391; GFX90A-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
1392; GFX90A-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
1393; GFX90A:       atomicrmw.end:
1394; GFX90A-NEXT:    ret void
1395;
1396; GFX940-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent(
1397; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
1398; GFX940-NEXT:    [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4
1399; GFX940-NEXT:    ret void
1400;
1401; GFX10-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent(
1402; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
1403; GFX10-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
1404; GFX10-NEXT:    br label [[ATOMICRMW_START:%.*]]
1405; GFX10:       atomicrmw.start:
1406; GFX10-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
1407; GFX10-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
1408; GFX10-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
1409; GFX10-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
1410; GFX10-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
1411; GFX10-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
1412; GFX10-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
1413; GFX10-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
1414; GFX10-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
1415; GFX10:       atomicrmw.end:
1416; GFX10-NEXT:    ret void
1417;
1418; GFX11-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent(
1419; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
1420; GFX11-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
1421; GFX11-NEXT:    br label [[ATOMICRMW_START:%.*]]
1422; GFX11:       atomicrmw.start:
1423; GFX11-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
1424; GFX11-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
1425; GFX11-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
1426; GFX11-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
1427; GFX11-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
1428; GFX11-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
1429; GFX11-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
1430; GFX11-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
1431; GFX11-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
1432; GFX11:       atomicrmw.end:
1433; GFX11-NEXT:    ret void
1434;
1435; GFX12-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent(
1436; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
1437; GFX12-NEXT:    [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4
1438; GFX12-NEXT:    ret void
1439;
1440  %res = atomicrmw fadd ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst
1441  ret void
1442}
1443
1444define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %value) {
1445; GFX803-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_fine_grained_memory(
1446; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
1447; GFX803-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
1448; GFX803-NEXT:    br label [[ATOMICRMW_START:%.*]]
1449; GFX803:       atomicrmw.start:
1450; GFX803-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
1451; GFX803-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
1452; GFX803-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
1453; GFX803-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
1454; GFX803-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
1455; GFX803-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
1456; GFX803-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
1457; GFX803-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
1458; GFX803-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
1459; GFX803:       atomicrmw.end:
1460; GFX803-NEXT:    ret void
1461;
1462; GFX906-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_fine_grained_memory(
1463; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
1464; GFX906-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
1465; GFX906-NEXT:    br label [[ATOMICRMW_START:%.*]]
1466; GFX906:       atomicrmw.start:
1467; GFX906-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
1468; GFX906-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
1469; GFX906-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
1470; GFX906-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
1471; GFX906-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
1472; GFX906-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
1473; GFX906-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
1474; GFX906-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
1475; GFX906-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
1476; GFX906:       atomicrmw.end:
1477; GFX906-NEXT:    ret void
1478;
1479; GFX908-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_fine_grained_memory(
1480; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
1481; GFX908-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
1482; GFX908-NEXT:    br label [[ATOMICRMW_START:%.*]]
1483; GFX908:       atomicrmw.start:
1484; GFX908-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
1485; GFX908-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
1486; GFX908-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
1487; GFX908-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
1488; GFX908-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
1489; GFX908-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
1490; GFX908-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
1491; GFX908-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
1492; GFX908-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
1493; GFX908:       atomicrmw.end:
1494; GFX908-NEXT:    ret void
1495;
1496; GFX90A-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_fine_grained_memory(
1497; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
1498; GFX90A-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
1499; GFX90A-NEXT:    br label [[ATOMICRMW_START:%.*]]
1500; GFX90A:       atomicrmw.start:
1501; GFX90A-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
1502; GFX90A-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
1503; GFX90A-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
1504; GFX90A-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
1505; GFX90A-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
1506; GFX90A-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
1507; GFX90A-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
1508; GFX90A-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
1509; GFX90A-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
1510; GFX90A:       atomicrmw.end:
1511; GFX90A-NEXT:    ret void
1512;
1513; GFX940-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_fine_grained_memory(
1514; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
1515; GFX940-NEXT:    [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
1516; GFX940-NEXT:    ret void
1517;
1518; GFX10-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_fine_grained_memory(
1519; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
1520; GFX10-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
1521; GFX10-NEXT:    br label [[ATOMICRMW_START:%.*]]
1522; GFX10:       atomicrmw.start:
1523; GFX10-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
1524; GFX10-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
1525; GFX10-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
1526; GFX10-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
1527; GFX10-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
1528; GFX10-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
1529; GFX10-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
1530; GFX10-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
1531; GFX10-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
1532; GFX10:       atomicrmw.end:
1533; GFX10-NEXT:    ret void
1534;
1535; GFX11-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_fine_grained_memory(
1536; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
1537; GFX11-NEXT:    [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
1538; GFX11-NEXT:    ret void
1539;
1540; GFX12-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_fine_grained_memory(
1541; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
1542; GFX12-NEXT:    [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
1543; GFX12-NEXT:    ret void
1544;
1545  %res = atomicrmw fadd ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
1546  ret void
1547}
1548
1549define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %value) {
1550; GFX803-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_remote_memory(
1551; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
1552; GFX803-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
1553; GFX803-NEXT:    br label [[ATOMICRMW_START:%.*]]
1554; GFX803:       atomicrmw.start:
1555; GFX803-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
1556; GFX803-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
1557; GFX803-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
1558; GFX803-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
1559; GFX803-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
1560; GFX803-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
1561; GFX803-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
1562; GFX803-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
1563; GFX803-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
1564; GFX803:       atomicrmw.end:
1565; GFX803-NEXT:    ret void
1566;
1567; GFX906-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_remote_memory(
1568; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
1569; GFX906-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
1570; GFX906-NEXT:    br label [[ATOMICRMW_START:%.*]]
1571; GFX906:       atomicrmw.start:
1572; GFX906-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
1573; GFX906-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
1574; GFX906-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
1575; GFX906-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
1576; GFX906-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
1577; GFX906-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
1578; GFX906-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
1579; GFX906-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
1580; GFX906-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
1581; GFX906:       atomicrmw.end:
1582; GFX906-NEXT:    ret void
1583;
1584; GFX908-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_remote_memory(
1585; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
1586; GFX908-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
1587; GFX908-NEXT:    br label [[ATOMICRMW_START:%.*]]
1588; GFX908:       atomicrmw.start:
1589; GFX908-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
1590; GFX908-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
1591; GFX908-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
1592; GFX908-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
1593; GFX908-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
1594; GFX908-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
1595; GFX908-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
1596; GFX908-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
1597; GFX908-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
1598; GFX908:       atomicrmw.end:
1599; GFX908-NEXT:    ret void
1600;
1601; GFX90A-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_remote_memory(
1602; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
1603; GFX90A-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
1604; GFX90A-NEXT:    br label [[ATOMICRMW_START:%.*]]
1605; GFX90A:       atomicrmw.start:
1606; GFX90A-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
1607; GFX90A-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
1608; GFX90A-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
1609; GFX90A-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
1610; GFX90A-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
1611; GFX90A-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
1612; GFX90A-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
1613; GFX90A-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
1614; GFX90A-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
1615; GFX90A:       atomicrmw.end:
1616; GFX90A-NEXT:    ret void
1617;
1618; GFX940-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_remote_memory(
1619; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
1620; GFX940-NEXT:    [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
1621; GFX940-NEXT:    ret void
1622;
1623; GFX10-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_remote_memory(
1624; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
1625; GFX10-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
1626; GFX10-NEXT:    br label [[ATOMICRMW_START:%.*]]
1627; GFX10:       atomicrmw.start:
1628; GFX10-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
1629; GFX10-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
1630; GFX10-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
1631; GFX10-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
1632; GFX10-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
1633; GFX10-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
1634; GFX10-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
1635; GFX10-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
1636; GFX10-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
1637; GFX10:       atomicrmw.end:
1638; GFX10-NEXT:    ret void
1639;
1640; GFX11-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_remote_memory(
1641; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
1642; GFX11-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
1643; GFX11-NEXT:    br label [[ATOMICRMW_START:%.*]]
1644; GFX11:       atomicrmw.start:
1645; GFX11-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
1646; GFX11-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
1647; GFX11-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
1648; GFX11-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
1649; GFX11-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
1650; GFX11-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
1651; GFX11-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
1652; GFX11-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
1653; GFX11-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
1654; GFX11:       atomicrmw.end:
1655; GFX11-NEXT:    ret void
1656;
1657; GFX12-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_remote_memory(
1658; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
1659; GFX12-NEXT:    [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
1660; GFX12-NEXT:    ret void
1661;
1662  %res = atomicrmw fadd ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
1663  ret void
1664}
1665
1666define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %value) {
1667; GFX803-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
1668; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
1669; GFX803-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
1670; GFX803-NEXT:    br label [[ATOMICRMW_START:%.*]]
1671; GFX803:       atomicrmw.start:
1672; GFX803-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
1673; GFX803-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
1674; GFX803-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
1675; GFX803-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
1676; GFX803-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
1677; GFX803-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
1678; GFX803-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
1679; GFX803-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
1680; GFX803-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
1681; GFX803:       atomicrmw.end:
1682; GFX803-NEXT:    ret void
1683;
1684; GFX906-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
1685; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
1686; GFX906-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
1687; GFX906-NEXT:    br label [[ATOMICRMW_START:%.*]]
1688; GFX906:       atomicrmw.start:
1689; GFX906-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
1690; GFX906-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
1691; GFX906-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
1692; GFX906-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
1693; GFX906-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
1694; GFX906-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
1695; GFX906-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
1696; GFX906-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
1697; GFX906-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
1698; GFX906:       atomicrmw.end:
1699; GFX906-NEXT:    ret void
1700;
1701; GFX908-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
1702; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
1703; GFX908-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
1704; GFX908-NEXT:    br label [[ATOMICRMW_START:%.*]]
1705; GFX908:       atomicrmw.start:
1706; GFX908-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
1707; GFX908-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
1708; GFX908-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
1709; GFX908-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
1710; GFX908-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
1711; GFX908-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
1712; GFX908-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
1713; GFX908-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
1714; GFX908-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
1715; GFX908:       atomicrmw.end:
1716; GFX908-NEXT:    ret void
1717;
1718; GFX90A-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
1719; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
1720; GFX90A-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
1721; GFX90A-NEXT:    br label [[ATOMICRMW_START:%.*]]
1722; GFX90A:       atomicrmw.start:
1723; GFX90A-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
1724; GFX90A-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
1725; GFX90A-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
1726; GFX90A-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
1727; GFX90A-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
1728; GFX90A-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
1729; GFX90A-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
1730; GFX90A-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
1731; GFX90A-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
1732; GFX90A:       atomicrmw.end:
1733; GFX90A-NEXT:    ret void
1734;
1735; GFX940-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
1736; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
1737; GFX940-NEXT:    [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
1738; GFX940-NEXT:    ret void
1739;
1740; GFX10-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
1741; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
1742; GFX10-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
1743; GFX10-NEXT:    br label [[ATOMICRMW_START:%.*]]
1744; GFX10:       atomicrmw.start:
1745; GFX10-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
1746; GFX10-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
1747; GFX10-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
1748; GFX10-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
1749; GFX10-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
1750; GFX10-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
1751; GFX10-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
1752; GFX10-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
1753; GFX10-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
1754; GFX10:       atomicrmw.end:
1755; GFX10-NEXT:    ret void
1756;
1757; GFX11-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
1758; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
1759; GFX11-NEXT:    [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
1760; GFX11-NEXT:    ret void
1761;
1762; GFX12-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
1763; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
1764; GFX12-NEXT:    [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
1765; GFX12-NEXT:    ret void
1766;
1767  %res = atomicrmw fadd ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
1768  ret void
1769}
1770
1771define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f32_daz(ptr addrspace(1) %ptr, float %value) #0 {
1772; GFX803-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f32_daz(
1773; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1]] {
1774; GFX803-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
1775; GFX803-NEXT:    br label [[ATOMICRMW_START:%.*]]
1776; GFX803:       atomicrmw.start:
1777; GFX803-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
1778; GFX803-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
1779; GFX803-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
1780; GFX803-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
1781; GFX803-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
1782; GFX803-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
1783; GFX803-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
1784; GFX803-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
1785; GFX803-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
1786; GFX803:       atomicrmw.end:
1787; GFX803-NEXT:    ret void
1788;
1789; GFX906-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f32_daz(
1790; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1]] {
1791; GFX906-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
1792; GFX906-NEXT:    br label [[ATOMICRMW_START:%.*]]
1793; GFX906:       atomicrmw.start:
1794; GFX906-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
1795; GFX906-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
1796; GFX906-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
1797; GFX906-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
1798; GFX906-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
1799; GFX906-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
1800; GFX906-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
1801; GFX906-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
1802; GFX906-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
1803; GFX906:       atomicrmw.end:
1804; GFX906-NEXT:    ret void
1805;
1806; GFX908-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f32_daz(
1807; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1]] {
1808; GFX908-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
1809; GFX908-NEXT:    br label [[ATOMICRMW_START:%.*]]
1810; GFX908:       atomicrmw.start:
1811; GFX908-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
1812; GFX908-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
1813; GFX908-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
1814; GFX908-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
1815; GFX908-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
1816; GFX908-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
1817; GFX908-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
1818; GFX908-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
1819; GFX908-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
1820; GFX908:       atomicrmw.end:
1821; GFX908-NEXT:    ret void
1822;
1823; GFX90A-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f32_daz(
1824; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1]] {
1825; GFX90A-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
1826; GFX90A-NEXT:    br label [[ATOMICRMW_START:%.*]]
1827; GFX90A:       atomicrmw.start:
1828; GFX90A-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
1829; GFX90A-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
1830; GFX90A-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
1831; GFX90A-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
1832; GFX90A-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
1833; GFX90A-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
1834; GFX90A-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
1835; GFX90A-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
1836; GFX90A-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
1837; GFX90A:       atomicrmw.end:
1838; GFX90A-NEXT:    ret void
1839;
1840; GFX940-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f32_daz(
1841; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1]] {
1842; GFX940-NEXT:    [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
1843; GFX940-NEXT:    ret void
1844;
1845; GFX10-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f32_daz(
1846; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1]] {
1847; GFX10-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
1848; GFX10-NEXT:    br label [[ATOMICRMW_START:%.*]]
1849; GFX10:       atomicrmw.start:
1850; GFX10-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
1851; GFX10-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
1852; GFX10-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
1853; GFX10-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
1854; GFX10-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
1855; GFX10-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
1856; GFX10-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
1857; GFX10-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
1858; GFX10-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
1859; GFX10:       atomicrmw.end:
1860; GFX10-NEXT:    ret void
1861;
1862; GFX11-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f32_daz(
1863; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1]] {
1864; GFX11-NEXT:    [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
1865; GFX11-NEXT:    ret void
1866;
1867; GFX12-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f32_daz(
1868; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1]] {
1869; GFX12-NEXT:    [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
1870; GFX12-NEXT:    ret void
1871;
1872  %res = atomicrmw fadd ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
1873  ret void
1874}
1875
1876define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f32_dynamic(ptr addrspace(1) %ptr, float %value) #1 {
1877; GFX803-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f32_dynamic(
1878; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2]] {
1879; GFX803-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
1880; GFX803-NEXT:    br label [[ATOMICRMW_START:%.*]]
1881; GFX803:       atomicrmw.start:
1882; GFX803-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
1883; GFX803-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
1884; GFX803-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
1885; GFX803-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
1886; GFX803-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
1887; GFX803-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
1888; GFX803-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
1889; GFX803-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
1890; GFX803-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
1891; GFX803:       atomicrmw.end:
1892; GFX803-NEXT:    ret void
1893;
1894; GFX906-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f32_dynamic(
1895; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2]] {
1896; GFX906-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
1897; GFX906-NEXT:    br label [[ATOMICRMW_START:%.*]]
1898; GFX906:       atomicrmw.start:
1899; GFX906-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
1900; GFX906-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
1901; GFX906-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
1902; GFX906-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
1903; GFX906-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
1904; GFX906-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
1905; GFX906-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
1906; GFX906-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
1907; GFX906-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
1908; GFX906:       atomicrmw.end:
1909; GFX906-NEXT:    ret void
1910;
1911; GFX908-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f32_dynamic(
1912; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2]] {
1913; GFX908-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
1914; GFX908-NEXT:    br label [[ATOMICRMW_START:%.*]]
1915; GFX908:       atomicrmw.start:
1916; GFX908-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
1917; GFX908-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
1918; GFX908-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
1919; GFX908-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
1920; GFX908-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
1921; GFX908-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
1922; GFX908-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
1923; GFX908-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
1924; GFX908-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
1925; GFX908:       atomicrmw.end:
1926; GFX908-NEXT:    ret void
1927;
1928; GFX90A-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f32_dynamic(
1929; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2]] {
1930; GFX90A-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
1931; GFX90A-NEXT:    br label [[ATOMICRMW_START:%.*]]
1932; GFX90A:       atomicrmw.start:
1933; GFX90A-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
1934; GFX90A-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
1935; GFX90A-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
1936; GFX90A-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
1937; GFX90A-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
1938; GFX90A-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
1939; GFX90A-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
1940; GFX90A-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
1941; GFX90A-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
1942; GFX90A:       atomicrmw.end:
1943; GFX90A-NEXT:    ret void
1944;
1945; GFX940-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f32_dynamic(
1946; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2]] {
1947; GFX940-NEXT:    [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
1948; GFX940-NEXT:    ret void
1949;
1950; GFX10-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f32_dynamic(
1951; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2]] {
1952; GFX10-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
1953; GFX10-NEXT:    br label [[ATOMICRMW_START:%.*]]
1954; GFX10:       atomicrmw.start:
1955; GFX10-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
1956; GFX10-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
1957; GFX10-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
1958; GFX10-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
1959; GFX10-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
1960; GFX10-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
1961; GFX10-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
1962; GFX10-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
1963; GFX10-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
1964; GFX10:       atomicrmw.end:
1965; GFX10-NEXT:    ret void
1966;
1967; GFX11-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f32_dynamic(
1968; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2]] {
1969; GFX11-NEXT:    [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
1970; GFX11-NEXT:    ret void
1971;
1972; GFX12-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f32_dynamic(
1973; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2]] {
1974; GFX12-NEXT:    [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
1975; GFX12-NEXT:    ret void
1976;
1977  %res = atomicrmw fadd ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
1978  ret void
1979}
1980
1981define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_mode(ptr addrspace(1) %ptr, float %value) {
1982; GFX803-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_mode(
1983; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
1984; GFX803-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
1985; GFX803-NEXT:    br label [[ATOMICRMW_START:%.*]]
1986; GFX803:       atomicrmw.start:
1987; GFX803-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
1988; GFX803-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
1989; GFX803-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
1990; GFX803-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
1991; GFX803-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
1992; GFX803-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
1993; GFX803-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
1994; GFX803-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
1995; GFX803-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
1996; GFX803:       atomicrmw.end:
1997; GFX803-NEXT:    ret void
1998;
1999; GFX906-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_mode(
2000; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
2001; GFX906-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
2002; GFX906-NEXT:    br label [[ATOMICRMW_START:%.*]]
2003; GFX906:       atomicrmw.start:
2004; GFX906-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
2005; GFX906-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
2006; GFX906-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
2007; GFX906-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
2008; GFX906-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
2009; GFX906-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
2010; GFX906-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
2011; GFX906-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
2012; GFX906-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
2013; GFX906:       atomicrmw.end:
2014; GFX906-NEXT:    ret void
2015;
2016; GFX908-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_mode(
2017; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
2018; GFX908-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
2019; GFX908-NEXT:    br label [[ATOMICRMW_START:%.*]]
2020; GFX908:       atomicrmw.start:
2021; GFX908-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
2022; GFX908-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
2023; GFX908-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
2024; GFX908-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
2025; GFX908-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
2026; GFX908-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
2027; GFX908-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
2028; GFX908-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
2029; GFX908-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
2030; GFX908:       atomicrmw.end:
2031; GFX908-NEXT:    ret void
2032;
2033; GFX90A-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_mode(
2034; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
2035; GFX90A-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
2036; GFX90A-NEXT:    br label [[ATOMICRMW_START:%.*]]
2037; GFX90A:       atomicrmw.start:
2038; GFX90A-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
2039; GFX90A-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
2040; GFX90A-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
2041; GFX90A-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
2042; GFX90A-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
2043; GFX90A-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
2044; GFX90A-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
2045; GFX90A-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
2046; GFX90A-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
2047; GFX90A:       atomicrmw.end:
2048; GFX90A-NEXT:    ret void
2049;
2050; GFX940-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_mode(
2051; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
2052; GFX940-NEXT:    [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.ignore.denormal.mode [[META0]]
2053; GFX940-NEXT:    ret void
2054;
2055; GFX10-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_mode(
2056; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
2057; GFX10-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
2058; GFX10-NEXT:    br label [[ATOMICRMW_START:%.*]]
2059; GFX10:       atomicrmw.start:
2060; GFX10-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
2061; GFX10-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
2062; GFX10-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
2063; GFX10-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
2064; GFX10-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
2065; GFX10-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
2066; GFX10-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
2067; GFX10-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
2068; GFX10-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
2069; GFX10:       atomicrmw.end:
2070; GFX10-NEXT:    ret void
2071;
2072; GFX11-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_mode(
2073; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
2074; GFX11-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
2075; GFX11-NEXT:    br label [[ATOMICRMW_START:%.*]]
2076; GFX11:       atomicrmw.start:
2077; GFX11-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
2078; GFX11-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
2079; GFX11-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
2080; GFX11-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
2081; GFX11-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
2082; GFX11-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
2083; GFX11-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
2084; GFX11-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
2085; GFX11-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
2086; GFX11:       atomicrmw.end:
2087; GFX11-NEXT:    ret void
2088;
2089; GFX12-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_mode(
2090; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
2091; GFX12-NEXT:    [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.ignore.denormal.mode [[META0]]
2092; GFX12-NEXT:    ret void
2093;
2094  %res = atomicrmw fadd ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst, align 4, !amdgpu.ignore.denormal.mode !0
2095  ret void
2096}
2097
2098define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %value) {
2099; GFX803-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
2100; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
2101; GFX803-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
2102; GFX803-NEXT:    br label [[ATOMICRMW_START:%.*]]
2103; GFX803:       atomicrmw.start:
2104; GFX803-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
2105; GFX803-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
2106; GFX803-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
2107; GFX803-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
2108; GFX803-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
2109; GFX803-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
2110; GFX803-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
2111; GFX803-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
2112; GFX803-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
2113; GFX803:       atomicrmw.end:
2114; GFX803-NEXT:    ret void
2115;
2116; GFX906-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
2117; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
2118; GFX906-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
2119; GFX906-NEXT:    br label [[ATOMICRMW_START:%.*]]
2120; GFX906:       atomicrmw.start:
2121; GFX906-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
2122; GFX906-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
2123; GFX906-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
2124; GFX906-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
2125; GFX906-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
2126; GFX906-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
2127; GFX906-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
2128; GFX906-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
2129; GFX906-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
2130; GFX906:       atomicrmw.end:
2131; GFX906-NEXT:    ret void
2132;
2133; GFX908-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
2134; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
2135; GFX908-NEXT:    [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
2136; GFX908-NEXT:    ret void
2137;
2138; GFX90A-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
2139; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
2140; GFX90A-NEXT:    [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
2141; GFX90A-NEXT:    ret void
2142;
2143; GFX940-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
2144; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
2145; GFX940-NEXT:    [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
2146; GFX940-NEXT:    ret void
2147;
2148; GFX10-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
2149; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
2150; GFX10-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
2151; GFX10-NEXT:    br label [[ATOMICRMW_START:%.*]]
2152; GFX10:       atomicrmw.start:
2153; GFX10-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
2154; GFX10-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
2155; GFX10-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
2156; GFX10-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
2157; GFX10-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
2158; GFX10-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
2159; GFX10-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
2160; GFX10-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
2161; GFX10-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
2162; GFX10:       atomicrmw.end:
2163; GFX10-NEXT:    ret void
2164;
2165; GFX11-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
2166; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
2167; GFX11-NEXT:    [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
2168; GFX11-NEXT:    ret void
2169;
2170; GFX12-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
2171; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
2172; GFX12-NEXT:    [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
2173; GFX12-NEXT:    ret void
2174;
2175  %res = atomicrmw fadd ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0
2176  ret void
2177}
2178
2179define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %value) {
2180; GFX803-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
2181; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
2182; GFX803-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
2183; GFX803-NEXT:    br label [[ATOMICRMW_START:%.*]]
2184; GFX803:       atomicrmw.start:
2185; GFX803-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
2186; GFX803-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
2187; GFX803-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
2188; GFX803-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
2189; GFX803-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
2190; GFX803-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
2191; GFX803-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
2192; GFX803-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
2193; GFX803-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
2194; GFX803:       atomicrmw.end:
2195; GFX803-NEXT:    ret void
2196;
2197; GFX906-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
2198; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
2199; GFX906-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
2200; GFX906-NEXT:    br label [[ATOMICRMW_START:%.*]]
2201; GFX906:       atomicrmw.start:
2202; GFX906-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
2203; GFX906-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
2204; GFX906-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
2205; GFX906-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
2206; GFX906-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
2207; GFX906-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
2208; GFX906-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
2209; GFX906-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
2210; GFX906-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
2211; GFX906:       atomicrmw.end:
2212; GFX906-NEXT:    ret void
2213;
2214; GFX908-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
2215; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
2216; GFX908-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
2217; GFX908-NEXT:    br label [[ATOMICRMW_START:%.*]]
2218; GFX908:       atomicrmw.start:
2219; GFX908-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
2220; GFX908-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
2221; GFX908-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
2222; GFX908-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
2223; GFX908-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
2224; GFX908-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
2225; GFX908-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
2226; GFX908-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
2227; GFX908-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
2228; GFX908:       atomicrmw.end:
2229; GFX908-NEXT:    ret void
2230;
2231; GFX90A-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
2232; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
2233; GFX90A-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
2234; GFX90A-NEXT:    br label [[ATOMICRMW_START:%.*]]
2235; GFX90A:       atomicrmw.start:
2236; GFX90A-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
2237; GFX90A-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
2238; GFX90A-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
2239; GFX90A-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
2240; GFX90A-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
2241; GFX90A-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
2242; GFX90A-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
2243; GFX90A-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
2244; GFX90A-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
2245; GFX90A:       atomicrmw.end:
2246; GFX90A-NEXT:    ret void
2247;
2248; GFX940-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
2249; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
2250; GFX940-NEXT:    [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
2251; GFX940-NEXT:    ret void
2252;
2253; GFX10-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
2254; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
2255; GFX10-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
2256; GFX10-NEXT:    br label [[ATOMICRMW_START:%.*]]
2257; GFX10:       atomicrmw.start:
2258; GFX10-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
2259; GFX10-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
2260; GFX10-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
2261; GFX10-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
2262; GFX10-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
2263; GFX10-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
2264; GFX10-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
2265; GFX10-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
2266; GFX10-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
2267; GFX10:       atomicrmw.end:
2268; GFX10-NEXT:    ret void
2269;
2270; GFX11-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
2271; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
2272; GFX11-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
2273; GFX11-NEXT:    br label [[ATOMICRMW_START:%.*]]
2274; GFX11:       atomicrmw.start:
2275; GFX11-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
2276; GFX11-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
2277; GFX11-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
2278; GFX11-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
2279; GFX11-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
2280; GFX11-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
2281; GFX11-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
2282; GFX11-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
2283; GFX11-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
2284; GFX11:       atomicrmw.end:
2285; GFX11-NEXT:    ret void
2286;
2287; GFX12-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
2288; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
2289; GFX12-NEXT:    [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
2290; GFX12-NEXT:    ret void
2291;
2292  %res = atomicrmw fadd ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0
2293  ret void
2294}
2295
2296define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %value) {
2297; GFX803-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
2298; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
2299; GFX803-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
2300; GFX803-NEXT:    br label [[ATOMICRMW_START:%.*]]
2301; GFX803:       atomicrmw.start:
2302; GFX803-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
2303; GFX803-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
2304; GFX803-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
2305; GFX803-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
2306; GFX803-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
2307; GFX803-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
2308; GFX803-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
2309; GFX803-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
2310; GFX803-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
2311; GFX803:       atomicrmw.end:
2312; GFX803-NEXT:    ret void
2313;
2314; GFX906-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
2315; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
2316; GFX906-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
2317; GFX906-NEXT:    br label [[ATOMICRMW_START:%.*]]
2318; GFX906:       atomicrmw.start:
2319; GFX906-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
2320; GFX906-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
2321; GFX906-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
2322; GFX906-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
2323; GFX906-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
2324; GFX906-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
2325; GFX906-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
2326; GFX906-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
2327; GFX906-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
2328; GFX906:       atomicrmw.end:
2329; GFX906-NEXT:    ret void
2330;
2331; GFX908-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
2332; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
2333; GFX908-NEXT:    [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
2334; GFX908-NEXT:    ret void
2335;
2336; GFX90A-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
2337; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
2338; GFX90A-NEXT:    [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
2339; GFX90A-NEXT:    ret void
2340;
2341; GFX940-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
2342; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
2343; GFX940-NEXT:    [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
2344; GFX940-NEXT:    ret void
2345;
2346; GFX10-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
2347; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
2348; GFX10-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
2349; GFX10-NEXT:    br label [[ATOMICRMW_START:%.*]]
2350; GFX10:       atomicrmw.start:
2351; GFX10-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
2352; GFX10-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
2353; GFX10-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
2354; GFX10-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
2355; GFX10-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
2356; GFX10-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
2357; GFX10-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
2358; GFX10-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
2359; GFX10-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
2360; GFX10:       atomicrmw.end:
2361; GFX10-NEXT:    ret void
2362;
2363; GFX11-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
2364; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
2365; GFX11-NEXT:    [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
2366; GFX11-NEXT:    ret void
2367;
2368; GFX12-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
2369; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
2370; GFX12-NEXT:    [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
2371; GFX12-NEXT:    ret void
2372;
2373  %res = atomicrmw fadd ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0
2374  ret void
2375}
2376
2377define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz(ptr addrspace(1) %ptr, float %value) #0 {
2378; GFX803-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz(
2379; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1]] {
2380; GFX803-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
2381; GFX803-NEXT:    br label [[ATOMICRMW_START:%.*]]
2382; GFX803:       atomicrmw.start:
2383; GFX803-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
2384; GFX803-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
2385; GFX803-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
2386; GFX803-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
2387; GFX803-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
2388; GFX803-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
2389; GFX803-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
2390; GFX803-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
2391; GFX803-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
2392; GFX803:       atomicrmw.end:
2393; GFX803-NEXT:    ret void
2394;
2395; GFX906-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz(
2396; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1]] {
2397; GFX906-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
2398; GFX906-NEXT:    br label [[ATOMICRMW_START:%.*]]
2399; GFX906:       atomicrmw.start:
2400; GFX906-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
2401; GFX906-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
2402; GFX906-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
2403; GFX906-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
2404; GFX906-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
2405; GFX906-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
2406; GFX906-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
2407; GFX906-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
2408; GFX906-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
2409; GFX906:       atomicrmw.end:
2410; GFX906-NEXT:    ret void
2411;
2412; GFX908-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz(
2413; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1]] {
2414; GFX908-NEXT:    [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
2415; GFX908-NEXT:    ret void
2416;
2417; GFX90A-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz(
2418; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1]] {
2419; GFX90A-NEXT:    [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
2420; GFX90A-NEXT:    ret void
2421;
2422; GFX940-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz(
2423; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1]] {
2424; GFX940-NEXT:    [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
2425; GFX940-NEXT:    ret void
2426;
2427; GFX10-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz(
2428; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1]] {
2429; GFX10-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
2430; GFX10-NEXT:    br label [[ATOMICRMW_START:%.*]]
2431; GFX10:       atomicrmw.start:
2432; GFX10-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
2433; GFX10-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
2434; GFX10-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
2435; GFX10-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
2436; GFX10-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
2437; GFX10-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
2438; GFX10-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
2439; GFX10-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
2440; GFX10-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
2441; GFX10:       atomicrmw.end:
2442; GFX10-NEXT:    ret void
2443;
2444; GFX11-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz(
2445; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1]] {
2446; GFX11-NEXT:    [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
2447; GFX11-NEXT:    ret void
2448;
2449; GFX12-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz(
2450; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1]] {
2451; GFX12-NEXT:    [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
2452; GFX12-NEXT:    ret void
2453;
2454  %res = atomicrmw fadd ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0
2455  ret void
2456}
2457
2458define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic(ptr addrspace(1) %ptr, float %value) #1 {
2459; GFX803-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic(
2460; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2]] {
2461; GFX803-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
2462; GFX803-NEXT:    br label [[ATOMICRMW_START:%.*]]
2463; GFX803:       atomicrmw.start:
2464; GFX803-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
2465; GFX803-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
2466; GFX803-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
2467; GFX803-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
2468; GFX803-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
2469; GFX803-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
2470; GFX803-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
2471; GFX803-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
2472; GFX803-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
2473; GFX803:       atomicrmw.end:
2474; GFX803-NEXT:    ret void
2475;
2476; GFX906-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic(
2477; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2]] {
2478; GFX906-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
2479; GFX906-NEXT:    br label [[ATOMICRMW_START:%.*]]
2480; GFX906:       atomicrmw.start:
2481; GFX906-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
2482; GFX906-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
2483; GFX906-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
2484; GFX906-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
2485; GFX906-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
2486; GFX906-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
2487; GFX906-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
2488; GFX906-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
2489; GFX906-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
2490; GFX906:       atomicrmw.end:
2491; GFX906-NEXT:    ret void
2492;
2493; GFX908-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic(
2494; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2]] {
2495; GFX908-NEXT:    [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
2496; GFX908-NEXT:    ret void
2497;
2498; GFX90A-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic(
2499; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2]] {
2500; GFX90A-NEXT:    [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
2501; GFX90A-NEXT:    ret void
2502;
2503; GFX940-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic(
2504; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2]] {
2505; GFX940-NEXT:    [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
2506; GFX940-NEXT:    ret void
2507;
2508; GFX10-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic(
2509; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2]] {
2510; GFX10-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
2511; GFX10-NEXT:    br label [[ATOMICRMW_START:%.*]]
2512; GFX10:       atomicrmw.start:
2513; GFX10-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
2514; GFX10-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
2515; GFX10-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
2516; GFX10-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
2517; GFX10-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
2518; GFX10-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
2519; GFX10-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
2520; GFX10-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
2521; GFX10-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
2522; GFX10:       atomicrmw.end:
2523; GFX10-NEXT:    ret void
2524;
2525; GFX11-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic(
2526; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2]] {
2527; GFX11-NEXT:    [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
2528; GFX11-NEXT:    ret void
2529;
2530; GFX12-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic(
2531; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2]] {
2532; GFX12-NEXT:    [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
2533; GFX12-NEXT:    ret void
2534;
2535  %res = atomicrmw fadd ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0
2536  ret void
2537}
2538
2539;---------------------------------------------------------------------
2540; atomicrmw fsub
2541;---------------------------------------------------------------------
2542
2543define float @test_atomicrmw_fsub_f32_global_agent(ptr addrspace(1) %ptr, float %value) {
2544; COMMON-LABEL: define float @test_atomicrmw_fsub_f32_global_agent(
2545; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
2546; COMMON-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
2547; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
2548; COMMON:       atomicrmw.start:
2549; COMMON-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
2550; COMMON-NEXT:    [[NEW:%.*]] = fsub float [[LOADED]], [[VALUE]]
2551; COMMON-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
2552; COMMON-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
2553; COMMON-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
2554; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
2555; COMMON-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
2556; COMMON-NEXT:    [[RES]] = bitcast i32 [[NEWLOADED]] to float
2557; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
2558; COMMON:       atomicrmw.end:
2559; COMMON-NEXT:    ret float [[RES]]
2560;
2561  %res = atomicrmw fsub ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst
2562  ret float %res
2563}
2564
2565define float @test_atomicrmw_fsub_f32_global_agent__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %value) {
2566; COMMON-LABEL: define float @test_atomicrmw_fsub_f32_global_agent__amdgpu_no_fine_grained_memory(
2567; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
2568; COMMON-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
2569; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
2570; COMMON:       atomicrmw.start:
2571; COMMON-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
2572; COMMON-NEXT:    [[NEW:%.*]] = fsub float [[LOADED]], [[VALUE]]
2573; COMMON-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
2574; COMMON-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
2575; COMMON-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
2576; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
2577; COMMON-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
2578; COMMON-NEXT:    [[RES]] = bitcast i32 [[NEWLOADED]] to float
2579; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
2580; COMMON:       atomicrmw.end:
2581; COMMON-NEXT:    ret float [[RES]]
2582;
2583  %res = atomicrmw fsub ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
2584  ret float %res
2585}
2586
2587define float @test_atomicrmw_fsub_f32_global_agent__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %value) {
2588; COMMON-LABEL: define float @test_atomicrmw_fsub_f32_global_agent__amdgpu_no_remote_memory(
2589; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
2590; COMMON-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
2591; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
2592; COMMON:       atomicrmw.start:
2593; COMMON-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
2594; COMMON-NEXT:    [[NEW:%.*]] = fsub float [[LOADED]], [[VALUE]]
2595; COMMON-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
2596; COMMON-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
2597; COMMON-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
2598; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
2599; COMMON-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
2600; COMMON-NEXT:    [[RES]] = bitcast i32 [[NEWLOADED]] to float
2601; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
2602; COMMON:       atomicrmw.end:
2603; COMMON-NEXT:    ret float [[RES]]
2604;
2605  %res = atomicrmw fsub ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
2606  ret float %res
2607}
2608
2609define float @test_atomicrmw_fsub_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %value) {
2610; COMMON-LABEL: define float @test_atomicrmw_fsub_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
2611; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
2612; COMMON-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
2613; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
2614; COMMON:       atomicrmw.start:
2615; COMMON-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
2616; COMMON-NEXT:    [[NEW:%.*]] = fsub float [[LOADED]], [[VALUE]]
2617; COMMON-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
2618; COMMON-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
2619; COMMON-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
2620; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
2621; COMMON-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
2622; COMMON-NEXT:    [[RES]] = bitcast i32 [[NEWLOADED]] to float
2623; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
2624; COMMON:       atomicrmw.end:
2625; COMMON-NEXT:    ret float [[RES]]
2626;
2627  %res = atomicrmw fsub ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
2628  ret float %res
2629}
2630
2631define float @test_atomicrmw_fsub_f32_global_agent__amdgpu_ignore_denormal_mode(ptr addrspace(1) %ptr, float %value) {
2632; COMMON-LABEL: define float @test_atomicrmw_fsub_f32_global_agent__amdgpu_ignore_denormal_mode(
2633; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
2634; COMMON-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
2635; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
2636; COMMON:       atomicrmw.start:
2637; COMMON-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
2638; COMMON-NEXT:    [[NEW:%.*]] = fsub float [[LOADED]], [[VALUE]]
2639; COMMON-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
2640; COMMON-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
2641; COMMON-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
2642; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
2643; COMMON-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
2644; COMMON-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
2645; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
2646; COMMON:       atomicrmw.end:
2647; COMMON-NEXT:    ret float [[TMP5]]
2648;
2649  %res = atomicrmw fsub ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst, align 4, !amdgpu.ignore.denormal.mode !0
2650  ret float %res
2651}
2652
2653define float @test_atomicrmw_fsub_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %value) {
2654; COMMON-LABEL: define float @test_atomicrmw_fsub_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
2655; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
2656; COMMON-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
2657; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
2658; COMMON:       atomicrmw.start:
2659; COMMON-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
2660; COMMON-NEXT:    [[NEW:%.*]] = fsub float [[LOADED]], [[VALUE]]
2661; COMMON-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
2662; COMMON-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
2663; COMMON-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
2664; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
2665; COMMON-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
2666; COMMON-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
2667; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
2668; COMMON:       atomicrmw.end:
2669; COMMON-NEXT:    ret float [[TMP5]]
2670;
2671  %res = atomicrmw fsub ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0
2672  ret float %res
2673}
2674
2675define float @test_atomicrmw_fsub_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %value) {
2676; COMMON-LABEL: define float @test_atomicrmw_fsub_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
2677; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
2678; COMMON-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
2679; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
2680; COMMON:       atomicrmw.start:
2681; COMMON-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
2682; COMMON-NEXT:    [[NEW:%.*]] = fsub float [[LOADED]], [[VALUE]]
2683; COMMON-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
2684; COMMON-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
2685; COMMON-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
2686; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
2687; COMMON-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
2688; COMMON-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
2689; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
2690; COMMON:       atomicrmw.end:
2691; COMMON-NEXT:    ret float [[TMP5]]
2692;
2693  %res = atomicrmw fsub ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0
2694  ret float %res
2695}
2696
2697define float @test_atomicrmw_fsub_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %value) {
2698; COMMON-LABEL: define float @test_atomicrmw_fsub_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
2699; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
2700; COMMON-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
2701; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
2702; COMMON:       atomicrmw.start:
2703; COMMON-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
2704; COMMON-NEXT:    [[NEW:%.*]] = fsub float [[LOADED]], [[VALUE]]
2705; COMMON-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
2706; COMMON-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
2707; COMMON-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
2708; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
2709; COMMON-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
2710; COMMON-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
2711; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
2712; COMMON:       atomicrmw.end:
2713; COMMON-NEXT:    ret float [[TMP5]]
2714;
2715  %res = atomicrmw fsub ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0
2716  ret float %res
2717}
2718
2719;---------------------------------------------------------------------
2720; atomicrmw fmax
2721;---------------------------------------------------------------------
2722
2723define float @test_atomicrmw_fmax_f32_global_agent(ptr addrspace(1) %ptr, float %value) {
2724; GFX803-LABEL: define float @test_atomicrmw_fmax_f32_global_agent(
2725; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
2726; GFX803-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
2727; GFX803-NEXT:    br label [[ATOMICRMW_START:%.*]]
2728; GFX803:       atomicrmw.start:
2729; GFX803-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
2730; GFX803-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
2731; GFX803-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
2732; GFX803-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
2733; GFX803-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
2734; GFX803-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
2735; GFX803-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
2736; GFX803-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
2737; GFX803-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
2738; GFX803:       atomicrmw.end:
2739; GFX803-NEXT:    ret float [[TMP6]]
2740;
2741; GFX906-LABEL: define float @test_atomicrmw_fmax_f32_global_agent(
2742; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
2743; GFX906-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
2744; GFX906-NEXT:    br label [[ATOMICRMW_START:%.*]]
2745; GFX906:       atomicrmw.start:
2746; GFX906-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
2747; GFX906-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
2748; GFX906-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
2749; GFX906-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
2750; GFX906-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
2751; GFX906-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
2752; GFX906-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
2753; GFX906-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
2754; GFX906-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
2755; GFX906:       atomicrmw.end:
2756; GFX906-NEXT:    ret float [[TMP6]]
2757;
2758; GFX908-LABEL: define float @test_atomicrmw_fmax_f32_global_agent(
2759; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
2760; GFX908-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
2761; GFX908-NEXT:    br label [[ATOMICRMW_START:%.*]]
2762; GFX908:       atomicrmw.start:
2763; GFX908-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
2764; GFX908-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
2765; GFX908-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
2766; GFX908-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
2767; GFX908-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
2768; GFX908-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
2769; GFX908-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
2770; GFX908-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
2771; GFX908-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
2772; GFX908:       atomicrmw.end:
2773; GFX908-NEXT:    ret float [[TMP6]]
2774;
2775; GFX90A-LABEL: define float @test_atomicrmw_fmax_f32_global_agent(
2776; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
2777; GFX90A-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
2778; GFX90A-NEXT:    br label [[ATOMICRMW_START:%.*]]
2779; GFX90A:       atomicrmw.start:
2780; GFX90A-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
2781; GFX90A-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
2782; GFX90A-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
2783; GFX90A-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
2784; GFX90A-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
2785; GFX90A-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
2786; GFX90A-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
2787; GFX90A-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
2788; GFX90A-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
2789; GFX90A:       atomicrmw.end:
2790; GFX90A-NEXT:    ret float [[TMP6]]
2791;
2792; GFX940-LABEL: define float @test_atomicrmw_fmax_f32_global_agent(
2793; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
2794; GFX940-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
2795; GFX940-NEXT:    br label [[ATOMICRMW_START:%.*]]
2796; GFX940:       atomicrmw.start:
2797; GFX940-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
2798; GFX940-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
2799; GFX940-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
2800; GFX940-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
2801; GFX940-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
2802; GFX940-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
2803; GFX940-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
2804; GFX940-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
2805; GFX940-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
2806; GFX940:       atomicrmw.end:
2807; GFX940-NEXT:    ret float [[TMP6]]
2808;
2809; GFX10-LABEL: define float @test_atomicrmw_fmax_f32_global_agent(
2810; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
2811; GFX10-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
2812; GFX10-NEXT:    br label [[ATOMICRMW_START:%.*]]
2813; GFX10:       atomicrmw.start:
2814; GFX10-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
2815; GFX10-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
2816; GFX10-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
2817; GFX10-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
2818; GFX10-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
2819; GFX10-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
2820; GFX10-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
2821; GFX10-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
2822; GFX10-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
2823; GFX10:       atomicrmw.end:
2824; GFX10-NEXT:    ret float [[TMP6]]
2825;
2826; GFX11-LABEL: define float @test_atomicrmw_fmax_f32_global_agent(
2827; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
2828; GFX11-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
2829; GFX11-NEXT:    br label [[ATOMICRMW_START:%.*]]
2830; GFX11:       atomicrmw.start:
2831; GFX11-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
2832; GFX11-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
2833; GFX11-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
2834; GFX11-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
2835; GFX11-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
2836; GFX11-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
2837; GFX11-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
2838; GFX11-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
2839; GFX11-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
2840; GFX11:       atomicrmw.end:
2841; GFX11-NEXT:    ret float [[TMP6]]
2842;
2843; GFX12-LABEL: define float @test_atomicrmw_fmax_f32_global_agent(
2844; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
2845; GFX12-NEXT:    [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4
2846; GFX12-NEXT:    ret float [[RES]]
2847;
2848  %res = atomicrmw fmax ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst
2849  ret float %res
2850}
2851
2852define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %value) {
2853; GFX803-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_fine_grained_memory(
2854; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
2855; GFX803-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
2856; GFX803-NEXT:    br label [[ATOMICRMW_START:%.*]]
2857; GFX803:       atomicrmw.start:
2858; GFX803-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
2859; GFX803-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
2860; GFX803-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
2861; GFX803-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
2862; GFX803-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
2863; GFX803-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
2864; GFX803-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
2865; GFX803-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
2866; GFX803-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
2867; GFX803:       atomicrmw.end:
2868; GFX803-NEXT:    ret float [[TMP6]]
2869;
2870; GFX906-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_fine_grained_memory(
2871; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
2872; GFX906-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
2873; GFX906-NEXT:    br label [[ATOMICRMW_START:%.*]]
2874; GFX906:       atomicrmw.start:
2875; GFX906-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
2876; GFX906-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
2877; GFX906-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
2878; GFX906-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
2879; GFX906-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
2880; GFX906-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
2881; GFX906-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
2882; GFX906-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
2883; GFX906-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
2884; GFX906:       atomicrmw.end:
2885; GFX906-NEXT:    ret float [[TMP6]]
2886;
2887; GFX908-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_fine_grained_memory(
2888; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
2889; GFX908-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
2890; GFX908-NEXT:    br label [[ATOMICRMW_START:%.*]]
2891; GFX908:       atomicrmw.start:
2892; GFX908-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
2893; GFX908-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
2894; GFX908-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
2895; GFX908-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
2896; GFX908-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
2897; GFX908-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
2898; GFX908-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
2899; GFX908-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
2900; GFX908-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
2901; GFX908:       atomicrmw.end:
2902; GFX908-NEXT:    ret float [[TMP6]]
2903;
2904; GFX90A-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_fine_grained_memory(
2905; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
2906; GFX90A-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
2907; GFX90A-NEXT:    br label [[ATOMICRMW_START:%.*]]
2908; GFX90A:       atomicrmw.start:
2909; GFX90A-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
2910; GFX90A-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
2911; GFX90A-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
2912; GFX90A-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
2913; GFX90A-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
2914; GFX90A-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
2915; GFX90A-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
2916; GFX90A-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
2917; GFX90A-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
2918; GFX90A:       atomicrmw.end:
2919; GFX90A-NEXT:    ret float [[TMP6]]
2920;
2921; GFX940-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_fine_grained_memory(
2922; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
2923; GFX940-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
2924; GFX940-NEXT:    br label [[ATOMICRMW_START:%.*]]
2925; GFX940:       atomicrmw.start:
2926; GFX940-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
2927; GFX940-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
2928; GFX940-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
2929; GFX940-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
2930; GFX940-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
2931; GFX940-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
2932; GFX940-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
2933; GFX940-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
2934; GFX940-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
2935; GFX940:       atomicrmw.end:
2936; GFX940-NEXT:    ret float [[TMP6]]
2937;
2938; GFX10-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_fine_grained_memory(
2939; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
2940; GFX10-NEXT:    [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
2941; GFX10-NEXT:    ret float [[RES]]
2942;
2943; GFX11-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_fine_grained_memory(
2944; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
2945; GFX11-NEXT:    [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
2946; GFX11-NEXT:    ret float [[RES]]
2947;
2948; GFX12-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_fine_grained_memory(
2949; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
2950; GFX12-NEXT:    [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
2951; GFX12-NEXT:    ret float [[RES]]
2952;
2953  %res = atomicrmw fmax ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
2954  ret float %res
2955}
2956
2957define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %value) {
2958; GFX803-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_remote_memory(
2959; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
2960; GFX803-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
2961; GFX803-NEXT:    br label [[ATOMICRMW_START:%.*]]
2962; GFX803:       atomicrmw.start:
2963; GFX803-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
2964; GFX803-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
2965; GFX803-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
2966; GFX803-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
2967; GFX803-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
2968; GFX803-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
2969; GFX803-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
2970; GFX803-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
2971; GFX803-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
2972; GFX803:       atomicrmw.end:
2973; GFX803-NEXT:    ret float [[TMP6]]
2974;
2975; GFX906-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_remote_memory(
2976; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
2977; GFX906-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
2978; GFX906-NEXT:    br label [[ATOMICRMW_START:%.*]]
2979; GFX906:       atomicrmw.start:
2980; GFX906-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
2981; GFX906-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
2982; GFX906-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
2983; GFX906-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
2984; GFX906-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
2985; GFX906-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
2986; GFX906-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
2987; GFX906-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
2988; GFX906-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
2989; GFX906:       atomicrmw.end:
2990; GFX906-NEXT:    ret float [[TMP6]]
2991;
2992; GFX908-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_remote_memory(
2993; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
2994; GFX908-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
2995; GFX908-NEXT:    br label [[ATOMICRMW_START:%.*]]
2996; GFX908:       atomicrmw.start:
2997; GFX908-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
2998; GFX908-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
2999; GFX908-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
3000; GFX908-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
3001; GFX908-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
3002; GFX908-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
3003; GFX908-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
3004; GFX908-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
3005; GFX908-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
3006; GFX908:       atomicrmw.end:
3007; GFX908-NEXT:    ret float [[TMP6]]
3008;
3009; GFX90A-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_remote_memory(
3010; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
3011; GFX90A-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
3012; GFX90A-NEXT:    br label [[ATOMICRMW_START:%.*]]
3013; GFX90A:       atomicrmw.start:
3014; GFX90A-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
3015; GFX90A-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
3016; GFX90A-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
3017; GFX90A-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
3018; GFX90A-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
3019; GFX90A-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
3020; GFX90A-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
3021; GFX90A-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
3022; GFX90A-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
3023; GFX90A:       atomicrmw.end:
3024; GFX90A-NEXT:    ret float [[TMP6]]
3025;
3026; GFX940-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_remote_memory(
3027; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
3028; GFX940-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
3029; GFX940-NEXT:    br label [[ATOMICRMW_START:%.*]]
3030; GFX940:       atomicrmw.start:
3031; GFX940-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
3032; GFX940-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
3033; GFX940-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
3034; GFX940-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
3035; GFX940-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
3036; GFX940-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
3037; GFX940-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
3038; GFX940-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
3039; GFX940-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
3040; GFX940:       atomicrmw.end:
3041; GFX940-NEXT:    ret float [[TMP6]]
3042;
3043; GFX10-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_remote_memory(
3044; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
3045; GFX10-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
3046; GFX10-NEXT:    br label [[ATOMICRMW_START:%.*]]
3047; GFX10:       atomicrmw.start:
3048; GFX10-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
3049; GFX10-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
3050; GFX10-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
3051; GFX10-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
3052; GFX10-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
3053; GFX10-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
3054; GFX10-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
3055; GFX10-NEXT:    [[RES]] = bitcast i32 [[NEWLOADED]] to float
3056; GFX10-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
3057; GFX10:       atomicrmw.end:
3058; GFX10-NEXT:    ret float [[RES]]
3059;
3060; GFX11-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_remote_memory(
3061; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
3062; GFX11-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
3063; GFX11-NEXT:    br label [[ATOMICRMW_START:%.*]]
3064; GFX11:       atomicrmw.start:
3065; GFX11-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
3066; GFX11-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
3067; GFX11-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
3068; GFX11-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
3069; GFX11-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
3070; GFX11-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
3071; GFX11-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
3072; GFX11-NEXT:    [[RES]] = bitcast i32 [[NEWLOADED]] to float
3073; GFX11-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
3074; GFX11:       atomicrmw.end:
3075; GFX11-NEXT:    ret float [[RES]]
3076;
3077; GFX12-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_remote_memory(
3078; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
3079; GFX12-NEXT:    [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
3080; GFX12-NEXT:    ret float [[RES]]
3081;
3082  %res = atomicrmw fmax ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
3083  ret float %res
3084}
3085
3086define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %value) {
3087; GFX803-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
3088; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
3089; GFX803-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
3090; GFX803-NEXT:    br label [[ATOMICRMW_START:%.*]]
3091; GFX803:       atomicrmw.start:
3092; GFX803-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
3093; GFX803-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
3094; GFX803-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
3095; GFX803-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
3096; GFX803-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
3097; GFX803-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
3098; GFX803-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
3099; GFX803-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
3100; GFX803-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
3101; GFX803:       atomicrmw.end:
3102; GFX803-NEXT:    ret float [[TMP6]]
3103;
3104; GFX906-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
3105; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
3106; GFX906-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
3107; GFX906-NEXT:    br label [[ATOMICRMW_START:%.*]]
3108; GFX906:       atomicrmw.start:
3109; GFX906-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
3110; GFX906-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
3111; GFX906-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
3112; GFX906-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
3113; GFX906-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
3114; GFX906-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
3115; GFX906-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
3116; GFX906-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
3117; GFX906-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
3118; GFX906:       atomicrmw.end:
3119; GFX906-NEXT:    ret float [[TMP6]]
3120;
3121; GFX908-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
3122; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
3123; GFX908-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
3124; GFX908-NEXT:    br label [[ATOMICRMW_START:%.*]]
3125; GFX908:       atomicrmw.start:
3126; GFX908-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
3127; GFX908-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
3128; GFX908-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
3129; GFX908-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
3130; GFX908-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
3131; GFX908-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
3132; GFX908-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
3133; GFX908-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
3134; GFX908-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
3135; GFX908:       atomicrmw.end:
3136; GFX908-NEXT:    ret float [[TMP6]]
3137;
3138; GFX90A-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
3139; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
3140; GFX90A-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
3141; GFX90A-NEXT:    br label [[ATOMICRMW_START:%.*]]
3142; GFX90A:       atomicrmw.start:
3143; GFX90A-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
3144; GFX90A-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
3145; GFX90A-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
3146; GFX90A-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
3147; GFX90A-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
3148; GFX90A-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
3149; GFX90A-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
3150; GFX90A-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
3151; GFX90A-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
3152; GFX90A:       atomicrmw.end:
3153; GFX90A-NEXT:    ret float [[TMP6]]
3154;
3155; GFX940-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
3156; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
3157; GFX940-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
3158; GFX940-NEXT:    br label [[ATOMICRMW_START:%.*]]
3159; GFX940:       atomicrmw.start:
3160; GFX940-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
3161; GFX940-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
3162; GFX940-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
3163; GFX940-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
3164; GFX940-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
3165; GFX940-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
3166; GFX940-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
3167; GFX940-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
3168; GFX940-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
3169; GFX940:       atomicrmw.end:
3170; GFX940-NEXT:    ret float [[TMP6]]
3171;
3172; GFX10-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
3173; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
3174; GFX10-NEXT:    [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
3175; GFX10-NEXT:    ret float [[RES]]
3176;
3177; GFX11-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
3178; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
3179; GFX11-NEXT:    [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
3180; GFX11-NEXT:    ret float [[RES]]
3181;
3182; GFX12-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
3183; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
3184; GFX12-NEXT:    [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
3185; GFX12-NEXT:    ret float [[RES]]
3186;
3187  %res = atomicrmw fmax ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
3188  ret float %res
3189}
3190
3191define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode(ptr addrspace(1) %ptr, float %value) {
3192; GFX803-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode(
3193; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
3194; GFX803-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
3195; GFX803-NEXT:    br label [[ATOMICRMW_START:%.*]]
3196; GFX803:       atomicrmw.start:
3197; GFX803-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
3198; GFX803-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
3199; GFX803-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
3200; GFX803-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
3201; GFX803-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
3202; GFX803-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
3203; GFX803-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
3204; GFX803-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
3205; GFX803-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
3206; GFX803:       atomicrmw.end:
3207; GFX803-NEXT:    ret float [[TMP6]]
3208;
3209; GFX906-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode(
3210; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
3211; GFX906-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
3212; GFX906-NEXT:    br label [[ATOMICRMW_START:%.*]]
3213; GFX906:       atomicrmw.start:
3214; GFX906-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
3215; GFX906-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
3216; GFX906-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
3217; GFX906-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
3218; GFX906-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
3219; GFX906-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
3220; GFX906-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
3221; GFX906-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
3222; GFX906-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
3223; GFX906:       atomicrmw.end:
3224; GFX906-NEXT:    ret float [[TMP6]]
3225;
3226; GFX908-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode(
3227; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
3228; GFX908-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
3229; GFX908-NEXT:    br label [[ATOMICRMW_START:%.*]]
3230; GFX908:       atomicrmw.start:
3231; GFX908-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
3232; GFX908-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
3233; GFX908-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
3234; GFX908-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
3235; GFX908-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
3236; GFX908-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
3237; GFX908-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
3238; GFX908-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
3239; GFX908-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
3240; GFX908:       atomicrmw.end:
3241; GFX908-NEXT:    ret float [[TMP6]]
3242;
3243; GFX90A-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode(
3244; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
3245; GFX90A-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
3246; GFX90A-NEXT:    br label [[ATOMICRMW_START:%.*]]
3247; GFX90A:       atomicrmw.start:
3248; GFX90A-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
3249; GFX90A-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
3250; GFX90A-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
3251; GFX90A-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
3252; GFX90A-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
3253; GFX90A-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
3254; GFX90A-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
3255; GFX90A-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
3256; GFX90A-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
3257; GFX90A:       atomicrmw.end:
3258; GFX90A-NEXT:    ret float [[TMP6]]
3259;
3260; GFX940-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode(
3261; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
3262; GFX940-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
3263; GFX940-NEXT:    br label [[ATOMICRMW_START:%.*]]
3264; GFX940:       atomicrmw.start:
3265; GFX940-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
3266; GFX940-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
3267; GFX940-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
3268; GFX940-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
3269; GFX940-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
3270; GFX940-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
3271; GFX940-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
3272; GFX940-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
3273; GFX940-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
3274; GFX940:       atomicrmw.end:
3275; GFX940-NEXT:    ret float [[TMP6]]
3276;
3277; GFX10-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode(
3278; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
3279; GFX10-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
3280; GFX10-NEXT:    br label [[ATOMICRMW_START:%.*]]
3281; GFX10:       atomicrmw.start:
3282; GFX10-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
3283; GFX10-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
3284; GFX10-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
3285; GFX10-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
3286; GFX10-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
3287; GFX10-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
3288; GFX10-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
3289; GFX10-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
3290; GFX10-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
3291; GFX10:       atomicrmw.end:
3292; GFX10-NEXT:    ret float [[TMP6]]
3293;
3294; GFX11-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode(
3295; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
3296; GFX11-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
3297; GFX11-NEXT:    br label [[ATOMICRMW_START:%.*]]
3298; GFX11:       atomicrmw.start:
3299; GFX11-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
3300; GFX11-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
3301; GFX11-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
3302; GFX11-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
3303; GFX11-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
3304; GFX11-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
3305; GFX11-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
3306; GFX11-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
3307; GFX11-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
3308; GFX11:       atomicrmw.end:
3309; GFX11-NEXT:    ret float [[TMP6]]
3310;
3311; GFX12-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode(
3312; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
3313; GFX12-NEXT:    [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.ignore.denormal.mode [[META0]]
3314; GFX12-NEXT:    ret float [[RES]]
3315;
3316  %res = atomicrmw fmax ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst, align 4, !amdgpu.ignore.denormal.mode !0
3317  ret float %res
3318}
3319
3320define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %value) {
3321; GFX803-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
3322; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
3323; GFX803-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
3324; GFX803-NEXT:    br label [[ATOMICRMW_START:%.*]]
3325; GFX803:       atomicrmw.start:
3326; GFX803-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
3327; GFX803-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
3328; GFX803-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
3329; GFX803-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
3330; GFX803-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
3331; GFX803-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
3332; GFX803-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
3333; GFX803-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
3334; GFX803-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
3335; GFX803:       atomicrmw.end:
3336; GFX803-NEXT:    ret float [[TMP6]]
3337;
3338; GFX906-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
3339; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
3340; GFX906-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
3341; GFX906-NEXT:    br label [[ATOMICRMW_START:%.*]]
3342; GFX906:       atomicrmw.start:
3343; GFX906-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
3344; GFX906-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
3345; GFX906-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
3346; GFX906-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
3347; GFX906-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
3348; GFX906-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
3349; GFX906-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
3350; GFX906-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
3351; GFX906-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
3352; GFX906:       atomicrmw.end:
3353; GFX906-NEXT:    ret float [[TMP6]]
3354;
3355; GFX908-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
3356; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
3357; GFX908-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
3358; GFX908-NEXT:    br label [[ATOMICRMW_START:%.*]]
3359; GFX908:       atomicrmw.start:
3360; GFX908-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
3361; GFX908-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
3362; GFX908-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
3363; GFX908-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
3364; GFX908-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
3365; GFX908-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
3366; GFX908-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
3367; GFX908-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
3368; GFX908-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
3369; GFX908:       atomicrmw.end:
3370; GFX908-NEXT:    ret float [[TMP6]]
3371;
3372; GFX90A-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
3373; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
3374; GFX90A-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
3375; GFX90A-NEXT:    br label [[ATOMICRMW_START:%.*]]
3376; GFX90A:       atomicrmw.start:
3377; GFX90A-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
3378; GFX90A-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
3379; GFX90A-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
3380; GFX90A-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
3381; GFX90A-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
3382; GFX90A-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
3383; GFX90A-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
3384; GFX90A-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
3385; GFX90A-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
3386; GFX90A:       atomicrmw.end:
3387; GFX90A-NEXT:    ret float [[TMP6]]
3388;
3389; GFX940-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
3390; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
3391; GFX940-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
3392; GFX940-NEXT:    br label [[ATOMICRMW_START:%.*]]
3393; GFX940:       atomicrmw.start:
3394; GFX940-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
3395; GFX940-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
3396; GFX940-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
3397; GFX940-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
3398; GFX940-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
3399; GFX940-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
3400; GFX940-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
3401; GFX940-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
3402; GFX940-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
3403; GFX940:       atomicrmw.end:
3404; GFX940-NEXT:    ret float [[TMP6]]
3405;
3406; GFX10-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
3407; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
3408; GFX10-NEXT:    [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
3409; GFX10-NEXT:    ret float [[RES]]
3410;
3411; GFX11-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
3412; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
3413; GFX11-NEXT:    [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
3414; GFX11-NEXT:    ret float [[RES]]
3415;
3416; GFX12-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
3417; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
3418; GFX12-NEXT:    [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
3419; GFX12-NEXT:    ret float [[RES]]
3420;
3421  %res = atomicrmw fmax ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0
3422  ret float %res
3423}
3424
3425define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %value) {
3426; GFX803-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
3427; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
3428; GFX803-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
3429; GFX803-NEXT:    br label [[ATOMICRMW_START:%.*]]
3430; GFX803:       atomicrmw.start:
3431; GFX803-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
3432; GFX803-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
3433; GFX803-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
3434; GFX803-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
3435; GFX803-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
3436; GFX803-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
3437; GFX803-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
3438; GFX803-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
3439; GFX803-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
3440; GFX803:       atomicrmw.end:
3441; GFX803-NEXT:    ret float [[TMP6]]
3442;
3443; GFX906-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
3444; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
3445; GFX906-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
3446; GFX906-NEXT:    br label [[ATOMICRMW_START:%.*]]
3447; GFX906:       atomicrmw.start:
3448; GFX906-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
3449; GFX906-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
3450; GFX906-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
3451; GFX906-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
3452; GFX906-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
3453; GFX906-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
3454; GFX906-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
3455; GFX906-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
3456; GFX906-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
3457; GFX906:       atomicrmw.end:
3458; GFX906-NEXT:    ret float [[TMP6]]
3459;
3460; GFX908-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
3461; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
3462; GFX908-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
3463; GFX908-NEXT:    br label [[ATOMICRMW_START:%.*]]
3464; GFX908:       atomicrmw.start:
3465; GFX908-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
3466; GFX908-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
3467; GFX908-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
3468; GFX908-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
3469; GFX908-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
3470; GFX908-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
3471; GFX908-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
3472; GFX908-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
3473; GFX908-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
3474; GFX908:       atomicrmw.end:
3475; GFX908-NEXT:    ret float [[TMP6]]
3476;
3477; GFX90A-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
3478; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
3479; GFX90A-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
3480; GFX90A-NEXT:    br label [[ATOMICRMW_START:%.*]]
3481; GFX90A:       atomicrmw.start:
3482; GFX90A-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
3483; GFX90A-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
3484; GFX90A-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
3485; GFX90A-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
3486; GFX90A-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
3487; GFX90A-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
3488; GFX90A-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
3489; GFX90A-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
3490; GFX90A-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
3491; GFX90A:       atomicrmw.end:
3492; GFX90A-NEXT:    ret float [[TMP6]]
3493;
3494; GFX940-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
3495; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
3496; GFX940-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
3497; GFX940-NEXT:    br label [[ATOMICRMW_START:%.*]]
3498; GFX940:       atomicrmw.start:
3499; GFX940-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
3500; GFX940-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
3501; GFX940-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
3502; GFX940-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
3503; GFX940-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
3504; GFX940-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
3505; GFX940-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
3506; GFX940-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
3507; GFX940-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
3508; GFX940:       atomicrmw.end:
3509; GFX940-NEXT:    ret float [[TMP6]]
3510;
3511; GFX10-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
3512; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
3513; GFX10-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
3514; GFX10-NEXT:    br label [[ATOMICRMW_START:%.*]]
3515; GFX10:       atomicrmw.start:
3516; GFX10-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
3517; GFX10-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
3518; GFX10-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
3519; GFX10-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
3520; GFX10-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
3521; GFX10-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
3522; GFX10-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
3523; GFX10-NEXT:    [[RES]] = bitcast i32 [[NEWLOADED]] to float
3524; GFX10-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
3525; GFX10:       atomicrmw.end:
3526; GFX10-NEXT:    ret float [[RES]]
3527;
3528; GFX11-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
3529; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
3530; GFX11-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
3531; GFX11-NEXT:    br label [[ATOMICRMW_START:%.*]]
3532; GFX11:       atomicrmw.start:
3533; GFX11-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
3534; GFX11-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
3535; GFX11-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
3536; GFX11-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
3537; GFX11-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
3538; GFX11-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
3539; GFX11-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
3540; GFX11-NEXT:    [[RES]] = bitcast i32 [[NEWLOADED]] to float
3541; GFX11-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
3542; GFX11:       atomicrmw.end:
3543; GFX11-NEXT:    ret float [[RES]]
3544;
3545; GFX12-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
3546; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
3547; GFX12-NEXT:    [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
3548; GFX12-NEXT:    ret float [[RES]]
3549;
3550  %res = atomicrmw fmax ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0
3551  ret float %res
3552}
3553
3554define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %value) {
3555; GFX803-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
3556; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
3557; GFX803-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
3558; GFX803-NEXT:    br label [[ATOMICRMW_START:%.*]]
3559; GFX803:       atomicrmw.start:
3560; GFX803-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
3561; GFX803-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
3562; GFX803-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
3563; GFX803-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
3564; GFX803-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
3565; GFX803-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
3566; GFX803-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
3567; GFX803-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
3568; GFX803-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
3569; GFX803:       atomicrmw.end:
3570; GFX803-NEXT:    ret float [[TMP6]]
3571;
3572; GFX906-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
3573; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
3574; GFX906-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
3575; GFX906-NEXT:    br label [[ATOMICRMW_START:%.*]]
3576; GFX906:       atomicrmw.start:
3577; GFX906-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
3578; GFX906-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
3579; GFX906-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
3580; GFX906-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
3581; GFX906-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
3582; GFX906-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
3583; GFX906-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
3584; GFX906-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
3585; GFX906-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
3586; GFX906:       atomicrmw.end:
3587; GFX906-NEXT:    ret float [[TMP6]]
3588;
3589; GFX908-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
3590; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
3591; GFX908-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
3592; GFX908-NEXT:    br label [[ATOMICRMW_START:%.*]]
3593; GFX908:       atomicrmw.start:
3594; GFX908-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
3595; GFX908-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
3596; GFX908-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
3597; GFX908-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
3598; GFX908-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
3599; GFX908-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
3600; GFX908-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
3601; GFX908-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
3602; GFX908-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
3603; GFX908:       atomicrmw.end:
3604; GFX908-NEXT:    ret float [[TMP6]]
3605;
3606; GFX90A-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
3607; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
3608; GFX90A-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
3609; GFX90A-NEXT:    br label [[ATOMICRMW_START:%.*]]
3610; GFX90A:       atomicrmw.start:
3611; GFX90A-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
3612; GFX90A-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
3613; GFX90A-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
3614; GFX90A-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
3615; GFX90A-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
3616; GFX90A-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
3617; GFX90A-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
3618; GFX90A-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
3619; GFX90A-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
3620; GFX90A:       atomicrmw.end:
3621; GFX90A-NEXT:    ret float [[TMP6]]
3622;
3623; GFX940-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
3624; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
3625; GFX940-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
3626; GFX940-NEXT:    br label [[ATOMICRMW_START:%.*]]
3627; GFX940:       atomicrmw.start:
3628; GFX940-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
3629; GFX940-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
3630; GFX940-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
3631; GFX940-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
3632; GFX940-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
3633; GFX940-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
3634; GFX940-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
3635; GFX940-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
3636; GFX940-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
3637; GFX940:       atomicrmw.end:
3638; GFX940-NEXT:    ret float [[TMP6]]
3639;
3640; GFX10-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
3641; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
3642; GFX10-NEXT:    [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
3643; GFX10-NEXT:    ret float [[RES]]
3644;
3645; GFX11-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
3646; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
3647; GFX11-NEXT:    [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
3648; GFX11-NEXT:    ret float [[RES]]
3649;
3650; GFX12-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
3651; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
3652; GFX12-NEXT:    [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
3653; GFX12-NEXT:    ret float [[RES]]
3654;
3655  %res = atomicrmw fmax ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0
3656  ret float %res
3657}
3658
3659;---------------------------------------------------------------------
3660; atomicrmw fmin
3661;---------------------------------------------------------------------
3662
3663define float @test_atomicrmw_fmin_f32_global_agent(ptr addrspace(1) %ptr, float %value) {
3664; GFX803-LABEL: define float @test_atomicrmw_fmin_f32_global_agent(
3665; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
3666; GFX803-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
3667; GFX803-NEXT:    br label [[ATOMICRMW_START:%.*]]
3668; GFX803:       atomicrmw.start:
3669; GFX803-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
3670; GFX803-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
3671; GFX803-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
3672; GFX803-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
3673; GFX803-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
3674; GFX803-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
3675; GFX803-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
3676; GFX803-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
3677; GFX803-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
3678; GFX803:       atomicrmw.end:
3679; GFX803-NEXT:    ret float [[TMP6]]
3680;
3681; GFX906-LABEL: define float @test_atomicrmw_fmin_f32_global_agent(
3682; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
3683; GFX906-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
3684; GFX906-NEXT:    br label [[ATOMICRMW_START:%.*]]
3685; GFX906:       atomicrmw.start:
3686; GFX906-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
3687; GFX906-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
3688; GFX906-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
3689; GFX906-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
3690; GFX906-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
3691; GFX906-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
3692; GFX906-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
3693; GFX906-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
3694; GFX906-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
3695; GFX906:       atomicrmw.end:
3696; GFX906-NEXT:    ret float [[TMP6]]
3697;
3698; GFX908-LABEL: define float @test_atomicrmw_fmin_f32_global_agent(
3699; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
3700; GFX908-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
3701; GFX908-NEXT:    br label [[ATOMICRMW_START:%.*]]
3702; GFX908:       atomicrmw.start:
3703; GFX908-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
3704; GFX908-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
3705; GFX908-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
3706; GFX908-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
3707; GFX908-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
3708; GFX908-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
3709; GFX908-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
3710; GFX908-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
3711; GFX908-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
3712; GFX908:       atomicrmw.end:
3713; GFX908-NEXT:    ret float [[TMP6]]
3714;
3715; GFX90A-LABEL: define float @test_atomicrmw_fmin_f32_global_agent(
3716; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
3717; GFX90A-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
3718; GFX90A-NEXT:    br label [[ATOMICRMW_START:%.*]]
3719; GFX90A:       atomicrmw.start:
3720; GFX90A-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
3721; GFX90A-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
3722; GFX90A-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
3723; GFX90A-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
3724; GFX90A-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
3725; GFX90A-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
3726; GFX90A-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
3727; GFX90A-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
3728; GFX90A-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
3729; GFX90A:       atomicrmw.end:
3730; GFX90A-NEXT:    ret float [[TMP6]]
3731;
3732; GFX940-LABEL: define float @test_atomicrmw_fmin_f32_global_agent(
3733; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
3734; GFX940-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
3735; GFX940-NEXT:    br label [[ATOMICRMW_START:%.*]]
3736; GFX940:       atomicrmw.start:
3737; GFX940-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
3738; GFX940-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
3739; GFX940-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
3740; GFX940-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
3741; GFX940-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
3742; GFX940-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
3743; GFX940-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
3744; GFX940-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
3745; GFX940-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
3746; GFX940:       atomicrmw.end:
3747; GFX940-NEXT:    ret float [[TMP6]]
3748;
3749; GFX10-LABEL: define float @test_atomicrmw_fmin_f32_global_agent(
3750; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
3751; GFX10-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
3752; GFX10-NEXT:    br label [[ATOMICRMW_START:%.*]]
3753; GFX10:       atomicrmw.start:
3754; GFX10-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
3755; GFX10-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
3756; GFX10-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
3757; GFX10-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
3758; GFX10-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
3759; GFX10-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
3760; GFX10-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
3761; GFX10-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
3762; GFX10-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
3763; GFX10:       atomicrmw.end:
3764; GFX10-NEXT:    ret float [[TMP6]]
3765;
3766; GFX11-LABEL: define float @test_atomicrmw_fmin_f32_global_agent(
3767; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
3768; GFX11-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
3769; GFX11-NEXT:    br label [[ATOMICRMW_START:%.*]]
3770; GFX11:       atomicrmw.start:
3771; GFX11-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
3772; GFX11-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
3773; GFX11-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
3774; GFX11-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
3775; GFX11-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
3776; GFX11-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
3777; GFX11-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
3778; GFX11-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
3779; GFX11-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
3780; GFX11:       atomicrmw.end:
3781; GFX11-NEXT:    ret float [[TMP6]]
3782;
3783; GFX12-LABEL: define float @test_atomicrmw_fmin_f32_global_agent(
3784; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
3785; GFX12-NEXT:    [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4
3786; GFX12-NEXT:    ret float [[RES]]
3787;
3788  %res = atomicrmw fmin ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst
3789  ret float %res
3790}
3791
3792define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %value) {
3793; GFX803-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_fine_grained_memory(
3794; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
3795; GFX803-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
3796; GFX803-NEXT:    br label [[ATOMICRMW_START:%.*]]
3797; GFX803:       atomicrmw.start:
3798; GFX803-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
3799; GFX803-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
3800; GFX803-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
3801; GFX803-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
3802; GFX803-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
3803; GFX803-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
3804; GFX803-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
3805; GFX803-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
3806; GFX803-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
3807; GFX803:       atomicrmw.end:
3808; GFX803-NEXT:    ret float [[TMP6]]
3809;
3810; GFX906-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_fine_grained_memory(
3811; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
3812; GFX906-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
3813; GFX906-NEXT:    br label [[ATOMICRMW_START:%.*]]
3814; GFX906:       atomicrmw.start:
3815; GFX906-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
3816; GFX906-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
3817; GFX906-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
3818; GFX906-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
3819; GFX906-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
3820; GFX906-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
3821; GFX906-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
3822; GFX906-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
3823; GFX906-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
3824; GFX906:       atomicrmw.end:
3825; GFX906-NEXT:    ret float [[TMP6]]
3826;
3827; GFX908-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_fine_grained_memory(
3828; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
3829; GFX908-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
3830; GFX908-NEXT:    br label [[ATOMICRMW_START:%.*]]
3831; GFX908:       atomicrmw.start:
3832; GFX908-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
3833; GFX908-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
3834; GFX908-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
3835; GFX908-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
3836; GFX908-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
3837; GFX908-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
3838; GFX908-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
3839; GFX908-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
3840; GFX908-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
3841; GFX908:       atomicrmw.end:
3842; GFX908-NEXT:    ret float [[TMP6]]
3843;
3844; GFX90A-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_fine_grained_memory(
3845; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
3846; GFX90A-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
3847; GFX90A-NEXT:    br label [[ATOMICRMW_START:%.*]]
3848; GFX90A:       atomicrmw.start:
3849; GFX90A-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
3850; GFX90A-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
3851; GFX90A-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
3852; GFX90A-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
3853; GFX90A-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
3854; GFX90A-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
3855; GFX90A-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
3856; GFX90A-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
3857; GFX90A-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
3858; GFX90A:       atomicrmw.end:
3859; GFX90A-NEXT:    ret float [[TMP6]]
3860;
3861; GFX940-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_fine_grained_memory(
3862; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
3863; GFX940-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
3864; GFX940-NEXT:    br label [[ATOMICRMW_START:%.*]]
3865; GFX940:       atomicrmw.start:
3866; GFX940-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
3867; GFX940-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
3868; GFX940-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
3869; GFX940-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
3870; GFX940-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
3871; GFX940-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
3872; GFX940-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
3873; GFX940-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
3874; GFX940-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
3875; GFX940:       atomicrmw.end:
3876; GFX940-NEXT:    ret float [[TMP6]]
3877;
3878; GFX10-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_fine_grained_memory(
3879; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
3880; GFX10-NEXT:    [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
3881; GFX10-NEXT:    ret float [[RES]]
3882;
3883; GFX11-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_fine_grained_memory(
3884; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
3885; GFX11-NEXT:    [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
3886; GFX11-NEXT:    ret float [[RES]]
3887;
3888; GFX12-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_fine_grained_memory(
3889; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
3890; GFX12-NEXT:    [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
3891; GFX12-NEXT:    ret float [[RES]]
3892;
3893  %res = atomicrmw fmin ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
3894  ret float %res
3895}
3896
3897define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %value) {
3898; GFX803-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_remote_memory(
3899; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
3900; GFX803-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
3901; GFX803-NEXT:    br label [[ATOMICRMW_START:%.*]]
3902; GFX803:       atomicrmw.start:
3903; GFX803-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
3904; GFX803-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
3905; GFX803-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
3906; GFX803-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
3907; GFX803-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
3908; GFX803-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
3909; GFX803-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
3910; GFX803-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
3911; GFX803-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
3912; GFX803:       atomicrmw.end:
3913; GFX803-NEXT:    ret float [[TMP6]]
3914;
3915; GFX906-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_remote_memory(
3916; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
3917; GFX906-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
3918; GFX906-NEXT:    br label [[ATOMICRMW_START:%.*]]
3919; GFX906:       atomicrmw.start:
3920; GFX906-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
3921; GFX906-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
3922; GFX906-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
3923; GFX906-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
3924; GFX906-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
3925; GFX906-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
3926; GFX906-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
3927; GFX906-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
3928; GFX906-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
3929; GFX906:       atomicrmw.end:
3930; GFX906-NEXT:    ret float [[TMP6]]
3931;
3932; GFX908-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_remote_memory(
3933; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
3934; GFX908-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
3935; GFX908-NEXT:    br label [[ATOMICRMW_START:%.*]]
3936; GFX908:       atomicrmw.start:
3937; GFX908-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
3938; GFX908-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
3939; GFX908-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
3940; GFX908-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
3941; GFX908-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
3942; GFX908-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
3943; GFX908-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
3944; GFX908-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
3945; GFX908-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
3946; GFX908:       atomicrmw.end:
3947; GFX908-NEXT:    ret float [[TMP6]]
3948;
3949; GFX90A-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_remote_memory(
3950; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
3951; GFX90A-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
3952; GFX90A-NEXT:    br label [[ATOMICRMW_START:%.*]]
3953; GFX90A:       atomicrmw.start:
3954; GFX90A-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
3955; GFX90A-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
3956; GFX90A-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
3957; GFX90A-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
3958; GFX90A-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
3959; GFX90A-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
3960; GFX90A-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
3961; GFX90A-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
3962; GFX90A-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
3963; GFX90A:       atomicrmw.end:
3964; GFX90A-NEXT:    ret float [[TMP6]]
3965;
3966; GFX940-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_remote_memory(
3967; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
3968; GFX940-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
3969; GFX940-NEXT:    br label [[ATOMICRMW_START:%.*]]
3970; GFX940:       atomicrmw.start:
3971; GFX940-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
3972; GFX940-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
3973; GFX940-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
3974; GFX940-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
3975; GFX940-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
3976; GFX940-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
3977; GFX940-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
3978; GFX940-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
3979; GFX940-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
3980; GFX940:       atomicrmw.end:
3981; GFX940-NEXT:    ret float [[TMP6]]
3982;
3983; GFX10-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_remote_memory(
3984; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
3985; GFX10-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
3986; GFX10-NEXT:    br label [[ATOMICRMW_START:%.*]]
3987; GFX10:       atomicrmw.start:
3988; GFX10-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
3989; GFX10-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
3990; GFX10-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
3991; GFX10-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
3992; GFX10-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
3993; GFX10-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
3994; GFX10-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
3995; GFX10-NEXT:    [[RES]] = bitcast i32 [[NEWLOADED]] to float
3996; GFX10-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
3997; GFX10:       atomicrmw.end:
3998; GFX10-NEXT:    ret float [[RES]]
3999;
4000; GFX11-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_remote_memory(
4001; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
4002; GFX11-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
4003; GFX11-NEXT:    br label [[ATOMICRMW_START:%.*]]
4004; GFX11:       atomicrmw.start:
4005; GFX11-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
4006; GFX11-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
4007; GFX11-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
4008; GFX11-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
4009; GFX11-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
4010; GFX11-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
4011; GFX11-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
4012; GFX11-NEXT:    [[RES]] = bitcast i32 [[NEWLOADED]] to float
4013; GFX11-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
4014; GFX11:       atomicrmw.end:
4015; GFX11-NEXT:    ret float [[RES]]
4016;
4017; GFX12-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_remote_memory(
4018; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
4019; GFX12-NEXT:    [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
4020; GFX12-NEXT:    ret float [[RES]]
4021;
4022  %res = atomicrmw fmin ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
4023  ret float %res
4024}
4025
4026define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %value) {
4027; GFX803-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
4028; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
4029; GFX803-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
4030; GFX803-NEXT:    br label [[ATOMICRMW_START:%.*]]
4031; GFX803:       atomicrmw.start:
4032; GFX803-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
4033; GFX803-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
4034; GFX803-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
4035; GFX803-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
4036; GFX803-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
4037; GFX803-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
4038; GFX803-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
4039; GFX803-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
4040; GFX803-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
4041; GFX803:       atomicrmw.end:
4042; GFX803-NEXT:    ret float [[TMP6]]
4043;
4044; GFX906-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
4045; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
4046; GFX906-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
4047; GFX906-NEXT:    br label [[ATOMICRMW_START:%.*]]
4048; GFX906:       atomicrmw.start:
4049; GFX906-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
4050; GFX906-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
4051; GFX906-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
4052; GFX906-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
4053; GFX906-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
4054; GFX906-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
4055; GFX906-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
4056; GFX906-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
4057; GFX906-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
4058; GFX906:       atomicrmw.end:
4059; GFX906-NEXT:    ret float [[TMP6]]
4060;
4061; GFX908-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
4062; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
4063; GFX908-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
4064; GFX908-NEXT:    br label [[ATOMICRMW_START:%.*]]
4065; GFX908:       atomicrmw.start:
4066; GFX908-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
4067; GFX908-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
4068; GFX908-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
4069; GFX908-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
4070; GFX908-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
4071; GFX908-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
4072; GFX908-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
4073; GFX908-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
4074; GFX908-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
4075; GFX908:       atomicrmw.end:
4076; GFX908-NEXT:    ret float [[TMP6]]
4077;
4078; GFX90A-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
4079; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
4080; GFX90A-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
4081; GFX90A-NEXT:    br label [[ATOMICRMW_START:%.*]]
4082; GFX90A:       atomicrmw.start:
4083; GFX90A-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
4084; GFX90A-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
4085; GFX90A-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
4086; GFX90A-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
4087; GFX90A-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
4088; GFX90A-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
4089; GFX90A-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
4090; GFX90A-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
4091; GFX90A-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
4092; GFX90A:       atomicrmw.end:
4093; GFX90A-NEXT:    ret float [[TMP6]]
4094;
4095; GFX940-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
4096; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
4097; GFX940-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
4098; GFX940-NEXT:    br label [[ATOMICRMW_START:%.*]]
4099; GFX940:       atomicrmw.start:
4100; GFX940-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
4101; GFX940-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
4102; GFX940-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
4103; GFX940-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
4104; GFX940-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
4105; GFX940-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
4106; GFX940-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
4107; GFX940-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
4108; GFX940-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
4109; GFX940:       atomicrmw.end:
4110; GFX940-NEXT:    ret float [[TMP6]]
4111;
4112; GFX10-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
4113; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
4114; GFX10-NEXT:    [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
4115; GFX10-NEXT:    ret float [[RES]]
4116;
4117; GFX11-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
4118; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
4119; GFX11-NEXT:    [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
4120; GFX11-NEXT:    ret float [[RES]]
4121;
4122; GFX12-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
4123; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
4124; GFX12-NEXT:    [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
4125; GFX12-NEXT:    ret float [[RES]]
4126;
4127  %res = atomicrmw fmin ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
4128  ret float %res
4129}
4130
4131define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode(ptr addrspace(1) %ptr, float %value) {
4132; GFX803-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode(
4133; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
4134; GFX803-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
4135; GFX803-NEXT:    br label [[ATOMICRMW_START:%.*]]
4136; GFX803:       atomicrmw.start:
4137; GFX803-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
4138; GFX803-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
4139; GFX803-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
4140; GFX803-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
4141; GFX803-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
4142; GFX803-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
4143; GFX803-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
4144; GFX803-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
4145; GFX803-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
4146; GFX803:       atomicrmw.end:
4147; GFX803-NEXT:    ret float [[TMP6]]
4148;
4149; GFX906-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode(
4150; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
4151; GFX906-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
4152; GFX906-NEXT:    br label [[ATOMICRMW_START:%.*]]
4153; GFX906:       atomicrmw.start:
4154; GFX906-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
4155; GFX906-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
4156; GFX906-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
4157; GFX906-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
4158; GFX906-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
4159; GFX906-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
4160; GFX906-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
4161; GFX906-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
4162; GFX906-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
4163; GFX906:       atomicrmw.end:
4164; GFX906-NEXT:    ret float [[TMP6]]
4165;
4166; GFX908-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode(
4167; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
4168; GFX908-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
4169; GFX908-NEXT:    br label [[ATOMICRMW_START:%.*]]
4170; GFX908:       atomicrmw.start:
4171; GFX908-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
4172; GFX908-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
4173; GFX908-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
4174; GFX908-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
4175; GFX908-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
4176; GFX908-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
4177; GFX908-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
4178; GFX908-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
4179; GFX908-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
4180; GFX908:       atomicrmw.end:
4181; GFX908-NEXT:    ret float [[TMP6]]
4182;
4183; GFX90A-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode(
4184; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
4185; GFX90A-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
4186; GFX90A-NEXT:    br label [[ATOMICRMW_START:%.*]]
4187; GFX90A:       atomicrmw.start:
4188; GFX90A-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
4189; GFX90A-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
4190; GFX90A-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
4191; GFX90A-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
4192; GFX90A-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
4193; GFX90A-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
4194; GFX90A-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
4195; GFX90A-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
4196; GFX90A-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
4197; GFX90A:       atomicrmw.end:
4198; GFX90A-NEXT:    ret float [[TMP6]]
4199;
4200; GFX940-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode(
4201; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
4202; GFX940-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
4203; GFX940-NEXT:    br label [[ATOMICRMW_START:%.*]]
4204; GFX940:       atomicrmw.start:
4205; GFX940-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
4206; GFX940-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
4207; GFX940-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
4208; GFX940-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
4209; GFX940-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
4210; GFX940-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
4211; GFX940-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
4212; GFX940-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
4213; GFX940-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
4214; GFX940:       atomicrmw.end:
4215; GFX940-NEXT:    ret float [[TMP6]]
4216;
4217; GFX10-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode(
4218; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
4219; GFX10-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
4220; GFX10-NEXT:    br label [[ATOMICRMW_START:%.*]]
4221; GFX10:       atomicrmw.start:
4222; GFX10-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
4223; GFX10-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
4224; GFX10-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
4225; GFX10-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
4226; GFX10-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
4227; GFX10-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
4228; GFX10-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
4229; GFX10-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
4230; GFX10-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
4231; GFX10:       atomicrmw.end:
4232; GFX10-NEXT:    ret float [[TMP6]]
4233;
4234; GFX11-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode(
4235; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
4236; GFX11-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
4237; GFX11-NEXT:    br label [[ATOMICRMW_START:%.*]]
4238; GFX11:       atomicrmw.start:
4239; GFX11-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
4240; GFX11-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
4241; GFX11-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
4242; GFX11-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
4243; GFX11-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
4244; GFX11-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
4245; GFX11-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
4246; GFX11-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
4247; GFX11-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
4248; GFX11:       atomicrmw.end:
4249; GFX11-NEXT:    ret float [[TMP6]]
4250;
4251; GFX12-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode(
4252; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
4253; GFX12-NEXT:    [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.ignore.denormal.mode [[META0]]
4254; GFX12-NEXT:    ret float [[RES]]
4255;
4256  %res = atomicrmw fmin ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst, align 4, !amdgpu.ignore.denormal.mode !0
4257  ret float %res
4258}
4259
4260define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %value) {
4261; GFX803-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
4262; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
4263; GFX803-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
4264; GFX803-NEXT:    br label [[ATOMICRMW_START:%.*]]
4265; GFX803:       atomicrmw.start:
4266; GFX803-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
4267; GFX803-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
4268; GFX803-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
4269; GFX803-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
4270; GFX803-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
4271; GFX803-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
4272; GFX803-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
4273; GFX803-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
4274; GFX803-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
4275; GFX803:       atomicrmw.end:
4276; GFX803-NEXT:    ret float [[TMP6]]
4277;
4278; GFX906-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
4279; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
4280; GFX906-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
4281; GFX906-NEXT:    br label [[ATOMICRMW_START:%.*]]
4282; GFX906:       atomicrmw.start:
4283; GFX906-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
4284; GFX906-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
4285; GFX906-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
4286; GFX906-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
4287; GFX906-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
4288; GFX906-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
4289; GFX906-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
4290; GFX906-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
4291; GFX906-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
4292; GFX906:       atomicrmw.end:
4293; GFX906-NEXT:    ret float [[TMP6]]
4294;
4295; GFX908-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
4296; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
4297; GFX908-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
4298; GFX908-NEXT:    br label [[ATOMICRMW_START:%.*]]
4299; GFX908:       atomicrmw.start:
4300; GFX908-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
4301; GFX908-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
4302; GFX908-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
4303; GFX908-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
4304; GFX908-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
4305; GFX908-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
4306; GFX908-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
4307; GFX908-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
4308; GFX908-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
4309; GFX908:       atomicrmw.end:
4310; GFX908-NEXT:    ret float [[TMP6]]
4311;
4312; GFX90A-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
4313; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
4314; GFX90A-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
4315; GFX90A-NEXT:    br label [[ATOMICRMW_START:%.*]]
4316; GFX90A:       atomicrmw.start:
4317; GFX90A-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
4318; GFX90A-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
4319; GFX90A-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
4320; GFX90A-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
4321; GFX90A-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
4322; GFX90A-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
4323; GFX90A-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
4324; GFX90A-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
4325; GFX90A-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
4326; GFX90A:       atomicrmw.end:
4327; GFX90A-NEXT:    ret float [[TMP6]]
4328;
4329; GFX940-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
4330; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
4331; GFX940-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
4332; GFX940-NEXT:    br label [[ATOMICRMW_START:%.*]]
4333; GFX940:       atomicrmw.start:
4334; GFX940-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
4335; GFX940-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
4336; GFX940-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
4337; GFX940-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
4338; GFX940-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
4339; GFX940-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
4340; GFX940-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
4341; GFX940-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
4342; GFX940-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
4343; GFX940:       atomicrmw.end:
4344; GFX940-NEXT:    ret float [[TMP6]]
4345;
4346; GFX10-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
4347; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
4348; GFX10-NEXT:    [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
4349; GFX10-NEXT:    ret float [[RES]]
4350;
4351; GFX11-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
4352; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
4353; GFX11-NEXT:    [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
4354; GFX11-NEXT:    ret float [[RES]]
4355;
4356; GFX12-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
4357; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
4358; GFX12-NEXT:    [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
4359; GFX12-NEXT:    ret float [[RES]]
4360;
4361  %res = atomicrmw fmin ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0
4362  ret float %res
4363}
4364
4365define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %value) {
4366; GFX803-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
4367; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
4368; GFX803-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
4369; GFX803-NEXT:    br label [[ATOMICRMW_START:%.*]]
4370; GFX803:       atomicrmw.start:
4371; GFX803-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
4372; GFX803-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
4373; GFX803-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
4374; GFX803-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
4375; GFX803-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
4376; GFX803-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
4377; GFX803-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
4378; GFX803-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
4379; GFX803-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
4380; GFX803:       atomicrmw.end:
4381; GFX803-NEXT:    ret float [[TMP6]]
4382;
4383; GFX906-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
4384; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
4385; GFX906-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
4386; GFX906-NEXT:    br label [[ATOMICRMW_START:%.*]]
4387; GFX906:       atomicrmw.start:
4388; GFX906-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
4389; GFX906-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
4390; GFX906-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
4391; GFX906-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
4392; GFX906-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
4393; GFX906-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
4394; GFX906-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
4395; GFX906-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
4396; GFX906-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
4397; GFX906:       atomicrmw.end:
4398; GFX906-NEXT:    ret float [[TMP6]]
4399;
4400; GFX908-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
4401; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
4402; GFX908-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
4403; GFX908-NEXT:    br label [[ATOMICRMW_START:%.*]]
4404; GFX908:       atomicrmw.start:
4405; GFX908-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
4406; GFX908-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
4407; GFX908-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
4408; GFX908-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
4409; GFX908-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
4410; GFX908-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
4411; GFX908-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
4412; GFX908-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
4413; GFX908-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
4414; GFX908:       atomicrmw.end:
4415; GFX908-NEXT:    ret float [[TMP6]]
4416;
4417; GFX90A-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
4418; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
4419; GFX90A-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
4420; GFX90A-NEXT:    br label [[ATOMICRMW_START:%.*]]
4421; GFX90A:       atomicrmw.start:
4422; GFX90A-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
4423; GFX90A-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
4424; GFX90A-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
4425; GFX90A-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
4426; GFX90A-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
4427; GFX90A-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
4428; GFX90A-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
4429; GFX90A-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
4430; GFX90A-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
4431; GFX90A:       atomicrmw.end:
4432; GFX90A-NEXT:    ret float [[TMP6]]
4433;
4434; GFX940-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
4435; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
4436; GFX940-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
4437; GFX940-NEXT:    br label [[ATOMICRMW_START:%.*]]
4438; GFX940:       atomicrmw.start:
4439; GFX940-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
4440; GFX940-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
4441; GFX940-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
4442; GFX940-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
4443; GFX940-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
4444; GFX940-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
4445; GFX940-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
4446; GFX940-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
4447; GFX940-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
4448; GFX940:       atomicrmw.end:
4449; GFX940-NEXT:    ret float [[TMP6]]
4450;
4451; GFX10-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
4452; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
4453; GFX10-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
4454; GFX10-NEXT:    br label [[ATOMICRMW_START:%.*]]
4455; GFX10:       atomicrmw.start:
4456; GFX10-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
4457; GFX10-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
4458; GFX10-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
4459; GFX10-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
4460; GFX10-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
4461; GFX10-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
4462; GFX10-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
4463; GFX10-NEXT:    [[RES]] = bitcast i32 [[NEWLOADED]] to float
4464; GFX10-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
4465; GFX10:       atomicrmw.end:
4466; GFX10-NEXT:    ret float [[RES]]
4467;
4468; GFX11-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
4469; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
4470; GFX11-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
4471; GFX11-NEXT:    br label [[ATOMICRMW_START:%.*]]
4472; GFX11:       atomicrmw.start:
4473; GFX11-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
4474; GFX11-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
4475; GFX11-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
4476; GFX11-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
4477; GFX11-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
4478; GFX11-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
4479; GFX11-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
4480; GFX11-NEXT:    [[RES]] = bitcast i32 [[NEWLOADED]] to float
4481; GFX11-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
4482; GFX11:       atomicrmw.end:
4483; GFX11-NEXT:    ret float [[RES]]
4484;
4485; GFX12-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
4486; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
4487; GFX12-NEXT:    [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
4488; GFX12-NEXT:    ret float [[RES]]
4489;
4490  %res = atomicrmw fmin ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0
4491  ret float %res
4492}
4493
4494define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %value) {
4495; GFX803-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
4496; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
4497; GFX803-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
4498; GFX803-NEXT:    br label [[ATOMICRMW_START:%.*]]
4499; GFX803:       atomicrmw.start:
4500; GFX803-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
4501; GFX803-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
4502; GFX803-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
4503; GFX803-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
4504; GFX803-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
4505; GFX803-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
4506; GFX803-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
4507; GFX803-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
4508; GFX803-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
4509; GFX803:       atomicrmw.end:
4510; GFX803-NEXT:    ret float [[TMP6]]
4511;
4512; GFX906-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
4513; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
4514; GFX906-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
4515; GFX906-NEXT:    br label [[ATOMICRMW_START:%.*]]
4516; GFX906:       atomicrmw.start:
4517; GFX906-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
4518; GFX906-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
4519; GFX906-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
4520; GFX906-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
4521; GFX906-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
4522; GFX906-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
4523; GFX906-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
4524; GFX906-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
4525; GFX906-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
4526; GFX906:       atomicrmw.end:
4527; GFX906-NEXT:    ret float [[TMP6]]
4528;
4529; GFX908-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
4530; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
4531; GFX908-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
4532; GFX908-NEXT:    br label [[ATOMICRMW_START:%.*]]
4533; GFX908:       atomicrmw.start:
4534; GFX908-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
4535; GFX908-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
4536; GFX908-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
4537; GFX908-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
4538; GFX908-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
4539; GFX908-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
4540; GFX908-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
4541; GFX908-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
4542; GFX908-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
4543; GFX908:       atomicrmw.end:
4544; GFX908-NEXT:    ret float [[TMP6]]
4545;
4546; GFX90A-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
4547; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
4548; GFX90A-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
4549; GFX90A-NEXT:    br label [[ATOMICRMW_START:%.*]]
4550; GFX90A:       atomicrmw.start:
4551; GFX90A-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
4552; GFX90A-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
4553; GFX90A-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
4554; GFX90A-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
4555; GFX90A-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
4556; GFX90A-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
4557; GFX90A-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
4558; GFX90A-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
4559; GFX90A-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
4560; GFX90A:       atomicrmw.end:
4561; GFX90A-NEXT:    ret float [[TMP6]]
4562;
4563; GFX940-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
4564; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
4565; GFX940-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
4566; GFX940-NEXT:    br label [[ATOMICRMW_START:%.*]]
4567; GFX940:       atomicrmw.start:
4568; GFX940-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
4569; GFX940-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
4570; GFX940-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
4571; GFX940-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
4572; GFX940-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
4573; GFX940-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
4574; GFX940-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
4575; GFX940-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
4576; GFX940-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
4577; GFX940:       atomicrmw.end:
4578; GFX940-NEXT:    ret float [[TMP6]]
4579;
4580; GFX10-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
4581; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
4582; GFX10-NEXT:    [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
4583; GFX10-NEXT:    ret float [[RES]]
4584;
4585; GFX11-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
4586; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
4587; GFX11-NEXT:    [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
4588; GFX11-NEXT:    ret float [[RES]]
4589;
4590; GFX12-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
4591; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
4592; GFX12-NEXT:    [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
4593; GFX12-NEXT:    ret float [[RES]]
4594;
4595  %res = atomicrmw fmin ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0
4596  ret float %res
4597}
4598
4599attributes #0 = { "denormal-fp-mode-f32"="preserve-sign,preserve-sign" }
4600attributes #1 = { "denormal-fp-mode-f32"="dynamic,dynamic" }
4601
4602!0 = !{}
4603;.
4604; GFX803: [[META0]] = !{}
4605;.
4606; GFX906: [[META0]] = !{}
4607;.
4608; GFX908: [[META0]] = !{}
4609;.
4610; GFX90A: [[META0]] = !{}
4611;.
4612; GFX940: [[META0]] = !{}
4613;.
4614; GFX10: [[META0]] = !{}
4615;.
4616; GFX11: [[META0]] = !{}
4617;.
4618; GFX12: [[META0]] = !{}
4619;.
4620