xref: /llvm-project/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomicrmw-fp-vector.ll (revision dfda9c5b9ef24b489f4fff30a6ebc0b57b4e7961)
1; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
2; RUN: opt -mtriple=amdgcn-amd-amdhsa -S -passes=atomic-expand -mcpu=gfx900 %s | FileCheck -check-prefixes=CHECK,GFX900 %s
3; RUN: opt -mtriple=amdgcn-amd-amdhsa -S -passes=atomic-expand -mcpu=gfx90a %s | FileCheck -check-prefixes=CHECK,GFX90A %s
4; RUN: opt -mtriple=amdgcn-amd-amdhsa -S -passes=atomic-expand -mcpu=gfx940 %s | FileCheck -check-prefixes=CHECK,GFX940 %s
5
6;---------------------------------------------------------------------
7; atomicrmw fadd
8;---------------------------------------------------------------------
9
10define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent_align2(ptr addrspace(1) %ptr, <2 x half> %value) {
11; CHECK-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent_align2(
12; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] {
13; CHECK-NEXT:    [[TMP1:%.*]] = alloca <2 x half>, align 4, addrspace(5)
14; CHECK-NEXT:    [[TMP2:%.*]] = alloca <2 x half>, align 4, addrspace(5)
15; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 2
16; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]]
17; CHECK:       atomicrmw.start:
18; CHECK-NEXT:    [[LOADED:%.*]] = phi <2 x half> [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
19; CHECK-NEXT:    [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
20; CHECK-NEXT:    [[TMP4:%.*]] = addrspacecast ptr addrspace(1) [[PTR]] to ptr
21; CHECK-NEXT:    call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[TMP1]])
22; CHECK-NEXT:    store <2 x half> [[LOADED]], ptr addrspace(5) [[TMP1]], align 4
23; CHECK-NEXT:    call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[TMP2]])
24; CHECK-NEXT:    store <2 x half> [[NEW]], ptr addrspace(5) [[TMP2]], align 4
25; CHECK-NEXT:    [[TMP5:%.*]] = call zeroext i1 @__atomic_compare_exchange(i64 4, ptr [[TMP4]], ptr addrspace(5) [[TMP1]], ptr addrspace(5) [[TMP2]], i32 5, i32 5)
26; CHECK-NEXT:    call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[TMP2]])
27; CHECK-NEXT:    [[TMP6:%.*]] = load <2 x half>, ptr addrspace(5) [[TMP1]], align 4
28; CHECK-NEXT:    call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[TMP1]])
29; CHECK-NEXT:    [[TMP7:%.*]] = insertvalue { <2 x half>, i1 } poison, <2 x half> [[TMP6]], 0
30; CHECK-NEXT:    [[TMP8:%.*]] = insertvalue { <2 x half>, i1 } [[TMP7]], i1 [[TMP5]], 1
31; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { <2 x half>, i1 } [[TMP8]], 1
32; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { <2 x half>, i1 } [[TMP8]], 0
33; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
34; CHECK:       atomicrmw.end:
35; CHECK-NEXT:    ret <2 x half> [[NEWLOADED]]
36;
37  %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value syncscope("agent") seq_cst, align 2
38  ret <2 x half> %res
39}
40
41define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent_align2(ptr addrspace(1) %ptr, <2 x bfloat> %value) {
42; CHECK-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent_align2(
43; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
44; CHECK-NEXT:    [[TMP1:%.*]] = alloca <2 x bfloat>, align 4, addrspace(5)
45; CHECK-NEXT:    [[TMP2:%.*]] = alloca <2 x bfloat>, align 4, addrspace(5)
46; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 2
47; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]]
48; CHECK:       atomicrmw.start:
49; CHECK-NEXT:    [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
50; CHECK-NEXT:    [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
51; CHECK-NEXT:    [[TMP4:%.*]] = addrspacecast ptr addrspace(1) [[PTR]] to ptr
52; CHECK-NEXT:    call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[TMP1]])
53; CHECK-NEXT:    store <2 x bfloat> [[LOADED]], ptr addrspace(5) [[TMP1]], align 4
54; CHECK-NEXT:    call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[TMP2]])
55; CHECK-NEXT:    store <2 x bfloat> [[NEW]], ptr addrspace(5) [[TMP2]], align 4
56; CHECK-NEXT:    [[TMP5:%.*]] = call zeroext i1 @__atomic_compare_exchange(i64 4, ptr [[TMP4]], ptr addrspace(5) [[TMP1]], ptr addrspace(5) [[TMP2]], i32 5, i32 5)
57; CHECK-NEXT:    call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[TMP2]])
58; CHECK-NEXT:    [[TMP6:%.*]] = load <2 x bfloat>, ptr addrspace(5) [[TMP1]], align 4
59; CHECK-NEXT:    call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[TMP1]])
60; CHECK-NEXT:    [[TMP7:%.*]] = insertvalue { <2 x bfloat>, i1 } poison, <2 x bfloat> [[TMP6]], 0
61; CHECK-NEXT:    [[TMP8:%.*]] = insertvalue { <2 x bfloat>, i1 } [[TMP7]], i1 [[TMP5]], 1
62; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { <2 x bfloat>, i1 } [[TMP8]], 1
63; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { <2 x bfloat>, i1 } [[TMP8]], 0
64; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
65; CHECK:       atomicrmw.end:
66; CHECK-NEXT:    ret <2 x bfloat> [[NEWLOADED]]
67;
68  %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %value syncscope("agent") seq_cst, align 2
69  ret <2 x bfloat> %res
70}
71
72define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent_align4(ptr addrspace(1) %ptr, <2 x half> %value) {
73; GFX900-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent_align4(
74; GFX900-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
75; GFX900-NEXT:    [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
76; GFX900-NEXT:    br label [[ATOMICRMW_START:%.*]]
77; GFX900:       atomicrmw.start:
78; GFX900-NEXT:    [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
79; GFX900-NEXT:    [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
80; GFX900-NEXT:    [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
81; GFX900-NEXT:    [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
82; GFX900-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
83; GFX900-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
84; GFX900-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
85; GFX900-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
86; GFX900-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
87; GFX900:       atomicrmw.end:
88; GFX900-NEXT:    ret <2 x half> [[TMP5]]
89;
90; GFX90A-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent_align4(
91; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
92; GFX90A-NEXT:    [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
93; GFX90A-NEXT:    br label [[ATOMICRMW_START:%.*]]
94; GFX90A:       atomicrmw.start:
95; GFX90A-NEXT:    [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
96; GFX90A-NEXT:    [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
97; GFX90A-NEXT:    [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
98; GFX90A-NEXT:    [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
99; GFX90A-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
100; GFX90A-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
101; GFX90A-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
102; GFX90A-NEXT:    [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x half>
103; GFX90A-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
104; GFX90A:       atomicrmw.end:
105; GFX90A-NEXT:    ret <2 x half> [[RES]]
106;
107; GFX940-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent_align4(
108; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
109; GFX940-NEXT:    [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4
110; GFX940-NEXT:    ret <2 x half> [[RES]]
111;
112  %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value syncscope("agent") seq_cst, align 4
113  ret <2 x half> %res
114}
115
116define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent_align4(ptr addrspace(1) %ptr, <2 x bfloat> %value) {
117; GFX900-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent_align4(
118; GFX900-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
119; GFX900-NEXT:    [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
120; GFX900-NEXT:    br label [[ATOMICRMW_START:%.*]]
121; GFX900:       atomicrmw.start:
122; GFX900-NEXT:    [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
123; GFX900-NEXT:    [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
124; GFX900-NEXT:    [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
125; GFX900-NEXT:    [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
126; GFX900-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
127; GFX900-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
128; GFX900-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
129; GFX900-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
130; GFX900-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
131; GFX900:       atomicrmw.end:
132; GFX900-NEXT:    ret <2 x bfloat> [[TMP5]]
133;
134; GFX90A-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent_align4(
135; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
136; GFX90A-NEXT:    [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
137; GFX90A-NEXT:    br label [[ATOMICRMW_START:%.*]]
138; GFX90A:       atomicrmw.start:
139; GFX90A-NEXT:    [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
140; GFX90A-NEXT:    [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
141; GFX90A-NEXT:    [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
142; GFX90A-NEXT:    [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
143; GFX90A-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
144; GFX90A-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
145; GFX90A-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
146; GFX90A-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
147; GFX90A-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
148; GFX90A:       atomicrmw.end:
149; GFX90A-NEXT:    ret <2 x bfloat> [[TMP5]]
150;
151; GFX940-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent_align4(
152; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
153; GFX940-NEXT:    [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] syncscope("agent") seq_cst, align 4
154; GFX940-NEXT:    ret <2 x bfloat> [[RES]]
155;
156  %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %value syncscope("agent") seq_cst, align 4
157  ret <2 x bfloat> %res
158}
159
160define <4 x half> @test_atomicrmw_fadd_v4f16_global_agent_align2(ptr addrspace(1) %ptr, <4 x half> %value) {
161; CHECK-LABEL: define <4 x half> @test_atomicrmw_fadd_v4f16_global_agent_align2(
162; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]], <4 x half> [[VALUE:%.*]]) #[[ATTR0]] {
163; CHECK-NEXT:    [[TMP1:%.*]] = alloca <4 x half>, align 8, addrspace(5)
164; CHECK-NEXT:    [[TMP2:%.*]] = alloca <4 x half>, align 8, addrspace(5)
165; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x half>, ptr addrspace(1) [[PTR]], align 2
166; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]]
167; CHECK:       atomicrmw.start:
168; CHECK-NEXT:    [[LOADED:%.*]] = phi <4 x half> [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
169; CHECK-NEXT:    [[NEW:%.*]] = fadd <4 x half> [[LOADED]], [[VALUE]]
170; CHECK-NEXT:    [[TMP4:%.*]] = addrspacecast ptr addrspace(1) [[PTR]] to ptr
171; CHECK-NEXT:    call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[TMP1]])
172; CHECK-NEXT:    store <4 x half> [[LOADED]], ptr addrspace(5) [[TMP1]], align 8
173; CHECK-NEXT:    call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[TMP2]])
174; CHECK-NEXT:    store <4 x half> [[NEW]], ptr addrspace(5) [[TMP2]], align 8
175; CHECK-NEXT:    [[TMP5:%.*]] = call zeroext i1 @__atomic_compare_exchange(i64 8, ptr [[TMP4]], ptr addrspace(5) [[TMP1]], ptr addrspace(5) [[TMP2]], i32 5, i32 5)
176; CHECK-NEXT:    call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[TMP2]])
177; CHECK-NEXT:    [[TMP6:%.*]] = load <4 x half>, ptr addrspace(5) [[TMP1]], align 8
178; CHECK-NEXT:    call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[TMP1]])
179; CHECK-NEXT:    [[TMP7:%.*]] = insertvalue { <4 x half>, i1 } poison, <4 x half> [[TMP6]], 0
180; CHECK-NEXT:    [[TMP8:%.*]] = insertvalue { <4 x half>, i1 } [[TMP7]], i1 [[TMP5]], 1
181; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { <4 x half>, i1 } [[TMP8]], 1
182; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { <4 x half>, i1 } [[TMP8]], 0
183; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
184; CHECK:       atomicrmw.end:
185; CHECK-NEXT:    ret <4 x half> [[NEWLOADED]]
186;
187  %res = atomicrmw fadd ptr addrspace(1) %ptr, <4 x half> %value syncscope("agent") seq_cst, align 2
188  ret <4 x half> %res
189}
190
191define <4 x bfloat> @test_atomicrmw_fadd_v4bf16_global_agent_align2(ptr addrspace(1) %ptr, <4 x bfloat> %value) {
192; CHECK-LABEL: define <4 x bfloat> @test_atomicrmw_fadd_v4bf16_global_agent_align2(
193; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]], <4 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
194; CHECK-NEXT:    [[TMP1:%.*]] = alloca <4 x bfloat>, align 8, addrspace(5)
195; CHECK-NEXT:    [[TMP2:%.*]] = alloca <4 x bfloat>, align 8, addrspace(5)
196; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x bfloat>, ptr addrspace(1) [[PTR]], align 2
197; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]]
198; CHECK:       atomicrmw.start:
199; CHECK-NEXT:    [[LOADED:%.*]] = phi <4 x bfloat> [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
200; CHECK-NEXT:    [[NEW:%.*]] = fadd <4 x bfloat> [[LOADED]], [[VALUE]]
201; CHECK-NEXT:    [[TMP4:%.*]] = addrspacecast ptr addrspace(1) [[PTR]] to ptr
202; CHECK-NEXT:    call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[TMP1]])
203; CHECK-NEXT:    store <4 x bfloat> [[LOADED]], ptr addrspace(5) [[TMP1]], align 8
204; CHECK-NEXT:    call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[TMP2]])
205; CHECK-NEXT:    store <4 x bfloat> [[NEW]], ptr addrspace(5) [[TMP2]], align 8
206; CHECK-NEXT:    [[TMP5:%.*]] = call zeroext i1 @__atomic_compare_exchange(i64 8, ptr [[TMP4]], ptr addrspace(5) [[TMP1]], ptr addrspace(5) [[TMP2]], i32 5, i32 5)
207; CHECK-NEXT:    call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[TMP2]])
208; CHECK-NEXT:    [[TMP6:%.*]] = load <4 x bfloat>, ptr addrspace(5) [[TMP1]], align 8
209; CHECK-NEXT:    call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[TMP1]])
210; CHECK-NEXT:    [[TMP7:%.*]] = insertvalue { <4 x bfloat>, i1 } poison, <4 x bfloat> [[TMP6]], 0
211; CHECK-NEXT:    [[TMP8:%.*]] = insertvalue { <4 x bfloat>, i1 } [[TMP7]], i1 [[TMP5]], 1
212; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { <4 x bfloat>, i1 } [[TMP8]], 1
213; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { <4 x bfloat>, i1 } [[TMP8]], 0
214; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
215; CHECK:       atomicrmw.end:
216; CHECK-NEXT:    ret <4 x bfloat> [[NEWLOADED]]
217;
218  %res = atomicrmw fadd ptr addrspace(1) %ptr, <4 x bfloat> %value syncscope("agent") seq_cst, align 2
219  ret <4 x bfloat> %res
220}
221
222define <4 x half> @test_atomicrmw_fadd_v4f16_global_agent_align4(ptr addrspace(1) %ptr, <4 x half> %value) {
223; CHECK-LABEL: define <4 x half> @test_atomicrmw_fadd_v4f16_global_agent_align4(
224; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]], <4 x half> [[VALUE:%.*]]) #[[ATTR0]] {
225; CHECK-NEXT:    [[TMP1:%.*]] = alloca <4 x half>, align 8, addrspace(5)
226; CHECK-NEXT:    [[TMP2:%.*]] = alloca <4 x half>, align 8, addrspace(5)
227; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x half>, ptr addrspace(1) [[PTR]], align 4
228; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]]
229; CHECK:       atomicrmw.start:
230; CHECK-NEXT:    [[LOADED:%.*]] = phi <4 x half> [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
231; CHECK-NEXT:    [[NEW:%.*]] = fadd <4 x half> [[LOADED]], [[VALUE]]
232; CHECK-NEXT:    [[TMP4:%.*]] = addrspacecast ptr addrspace(1) [[PTR]] to ptr
233; CHECK-NEXT:    call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[TMP1]])
234; CHECK-NEXT:    store <4 x half> [[LOADED]], ptr addrspace(5) [[TMP1]], align 8
235; CHECK-NEXT:    call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[TMP2]])
236; CHECK-NEXT:    store <4 x half> [[NEW]], ptr addrspace(5) [[TMP2]], align 8
237; CHECK-NEXT:    [[TMP5:%.*]] = call zeroext i1 @__atomic_compare_exchange(i64 8, ptr [[TMP4]], ptr addrspace(5) [[TMP1]], ptr addrspace(5) [[TMP2]], i32 5, i32 5)
238; CHECK-NEXT:    call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[TMP2]])
239; CHECK-NEXT:    [[TMP6:%.*]] = load <4 x half>, ptr addrspace(5) [[TMP1]], align 8
240; CHECK-NEXT:    call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[TMP1]])
241; CHECK-NEXT:    [[TMP7:%.*]] = insertvalue { <4 x half>, i1 } poison, <4 x half> [[TMP6]], 0
242; CHECK-NEXT:    [[TMP8:%.*]] = insertvalue { <4 x half>, i1 } [[TMP7]], i1 [[TMP5]], 1
243; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { <4 x half>, i1 } [[TMP8]], 1
244; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { <4 x half>, i1 } [[TMP8]], 0
245; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
246; CHECK:       atomicrmw.end:
247; CHECK-NEXT:    ret <4 x half> [[NEWLOADED]]
248;
249  %res = atomicrmw fadd ptr addrspace(1) %ptr, <4 x half> %value syncscope("agent") seq_cst, align 4
250  ret <4 x half> %res
251}
252
253define <4 x bfloat> @test_atomicrmw_fadd_v4bf16_global_agent_align4(ptr addrspace(1) %ptr, <4 x bfloat> %value) {
254; CHECK-LABEL: define <4 x bfloat> @test_atomicrmw_fadd_v4bf16_global_agent_align4(
255; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]], <4 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
256; CHECK-NEXT:    [[TMP1:%.*]] = alloca <4 x bfloat>, align 8, addrspace(5)
257; CHECK-NEXT:    [[TMP2:%.*]] = alloca <4 x bfloat>, align 8, addrspace(5)
258; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x bfloat>, ptr addrspace(1) [[PTR]], align 4
259; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]]
260; CHECK:       atomicrmw.start:
261; CHECK-NEXT:    [[LOADED:%.*]] = phi <4 x bfloat> [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
262; CHECK-NEXT:    [[NEW:%.*]] = fadd <4 x bfloat> [[LOADED]], [[VALUE]]
263; CHECK-NEXT:    [[TMP4:%.*]] = addrspacecast ptr addrspace(1) [[PTR]] to ptr
264; CHECK-NEXT:    call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[TMP1]])
265; CHECK-NEXT:    store <4 x bfloat> [[LOADED]], ptr addrspace(5) [[TMP1]], align 8
266; CHECK-NEXT:    call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[TMP2]])
267; CHECK-NEXT:    store <4 x bfloat> [[NEW]], ptr addrspace(5) [[TMP2]], align 8
268; CHECK-NEXT:    [[TMP5:%.*]] = call zeroext i1 @__atomic_compare_exchange(i64 8, ptr [[TMP4]], ptr addrspace(5) [[TMP1]], ptr addrspace(5) [[TMP2]], i32 5, i32 5)
269; CHECK-NEXT:    call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[TMP2]])
270; CHECK-NEXT:    [[TMP6:%.*]] = load <4 x bfloat>, ptr addrspace(5) [[TMP1]], align 8
271; CHECK-NEXT:    call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[TMP1]])
272; CHECK-NEXT:    [[TMP7:%.*]] = insertvalue { <4 x bfloat>, i1 } poison, <4 x bfloat> [[TMP6]], 0
273; CHECK-NEXT:    [[TMP8:%.*]] = insertvalue { <4 x bfloat>, i1 } [[TMP7]], i1 [[TMP5]], 1
274; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { <4 x bfloat>, i1 } [[TMP8]], 1
275; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { <4 x bfloat>, i1 } [[TMP8]], 0
276; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
277; CHECK:       atomicrmw.end:
278; CHECK-NEXT:    ret <4 x bfloat> [[NEWLOADED]]
279;
280  %res = atomicrmw fadd ptr addrspace(1) %ptr, <4 x bfloat> %value syncscope("agent") seq_cst, align 4
281  ret <4 x bfloat> %res
282}
283
284define <4 x half> @test_atomicrmw_fadd_v4f16_global_agent_align8(ptr addrspace(1) %ptr, <4 x half> %value) {
285; CHECK-LABEL: define <4 x half> @test_atomicrmw_fadd_v4f16_global_agent_align8(
286; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]], <4 x half> [[VALUE:%.*]]) #[[ATTR0]] {
287; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x half>, ptr addrspace(1) [[PTR]], align 8
288; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]]
289; CHECK:       atomicrmw.start:
290; CHECK-NEXT:    [[LOADED:%.*]] = phi <4 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
291; CHECK-NEXT:    [[NEW:%.*]] = fadd <4 x half> [[LOADED]], [[VALUE]]
292; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x half> [[NEW]] to i64
293; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x half> [[LOADED]] to i64
294; CHECK-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8
295; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
296; CHECK-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
297; CHECK-NEXT:    [[TMP5]] = bitcast i64 [[NEWLOADED]] to <4 x half>
298; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
299; CHECK:       atomicrmw.end:
300; CHECK-NEXT:    ret <4 x half> [[TMP5]]
301;
302  %res = atomicrmw fadd ptr addrspace(1) %ptr, <4 x half> %value syncscope("agent") seq_cst, align 8
303  ret <4 x half> %res
304}
305
306define <4 x bfloat> @test_atomicrmw_fadd_v4bf16_global_agent_align8(ptr addrspace(1) %ptr, <4 x bfloat> %value) {
307; CHECK-LABEL: define <4 x bfloat> @test_atomicrmw_fadd_v4bf16_global_agent_align8(
308; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]], <4 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
309; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x bfloat>, ptr addrspace(1) [[PTR]], align 8
310; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]]
311; CHECK:       atomicrmw.start:
312; CHECK-NEXT:    [[LOADED:%.*]] = phi <4 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
313; CHECK-NEXT:    [[NEW:%.*]] = fadd <4 x bfloat> [[LOADED]], [[VALUE]]
314; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x bfloat> [[NEW]] to i64
315; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x bfloat> [[LOADED]] to i64
316; CHECK-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8
317; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
318; CHECK-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
319; CHECK-NEXT:    [[TMP5]] = bitcast i64 [[NEWLOADED]] to <4 x bfloat>
320; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
321; CHECK:       atomicrmw.end:
322; CHECK-NEXT:    ret <4 x bfloat> [[TMP5]]
323;
324  %res = atomicrmw fadd ptr addrspace(1) %ptr, <4 x bfloat> %value syncscope("agent") seq_cst, align 8
325  ret <4 x bfloat> %res
326}
327
328define <2 x float> @test_atomicrmw_fadd_v2f32_global_agent_align8(ptr addrspace(1) %ptr, <2 x float> %value) {
329; CHECK-LABEL: define <2 x float> @test_atomicrmw_fadd_v2f32_global_agent_align8(
330; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x float> [[VALUE:%.*]]) #[[ATTR0]] {
331; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x float>, ptr addrspace(1) [[PTR]], align 8
332; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]]
333; CHECK:       atomicrmw.start:
334; CHECK-NEXT:    [[LOADED:%.*]] = phi <2 x float> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
335; CHECK-NEXT:    [[NEW:%.*]] = fadd <2 x float> [[LOADED]], [[VALUE]]
336; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x float> [[NEW]] to i64
337; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x float> [[LOADED]] to i64
338; CHECK-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8
339; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
340; CHECK-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
341; CHECK-NEXT:    [[TMP5]] = bitcast i64 [[NEWLOADED]] to <2 x float>
342; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
343; CHECK:       atomicrmw.end:
344; CHECK-NEXT:    ret <2 x float> [[TMP5]]
345;
346  %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x float> %value syncscope("agent") seq_cst, align 8
347  ret <2 x float> %res
348}
349
350;---------------------------------------------------------------------
351; atomicrmw fsub
352;---------------------------------------------------------------------
353
354define <2 x half> @test_atomicrmw_fsub_v2f16_global_agent_align2(ptr addrspace(1) %ptr, <2 x half> %value) {
355; CHECK-LABEL: define <2 x half> @test_atomicrmw_fsub_v2f16_global_agent_align2(
356; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
357; CHECK-NEXT:    [[TMP1:%.*]] = alloca <2 x half>, align 4, addrspace(5)
358; CHECK-NEXT:    [[TMP2:%.*]] = alloca <2 x half>, align 4, addrspace(5)
359; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 2
360; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]]
361; CHECK:       atomicrmw.start:
362; CHECK-NEXT:    [[LOADED:%.*]] = phi <2 x half> [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
363; CHECK-NEXT:    [[NEW:%.*]] = fsub <2 x half> [[LOADED]], [[VALUE]]
364; CHECK-NEXT:    [[TMP4:%.*]] = addrspacecast ptr addrspace(1) [[PTR]] to ptr
365; CHECK-NEXT:    call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[TMP1]])
366; CHECK-NEXT:    store <2 x half> [[LOADED]], ptr addrspace(5) [[TMP1]], align 4
367; CHECK-NEXT:    call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[TMP2]])
368; CHECK-NEXT:    store <2 x half> [[NEW]], ptr addrspace(5) [[TMP2]], align 4
369; CHECK-NEXT:    [[TMP5:%.*]] = call zeroext i1 @__atomic_compare_exchange(i64 4, ptr [[TMP4]], ptr addrspace(5) [[TMP1]], ptr addrspace(5) [[TMP2]], i32 5, i32 5)
370; CHECK-NEXT:    call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[TMP2]])
371; CHECK-NEXT:    [[TMP6:%.*]] = load <2 x half>, ptr addrspace(5) [[TMP1]], align 4
372; CHECK-NEXT:    call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[TMP1]])
373; CHECK-NEXT:    [[TMP7:%.*]] = insertvalue { <2 x half>, i1 } poison, <2 x half> [[TMP6]], 0
374; CHECK-NEXT:    [[TMP8:%.*]] = insertvalue { <2 x half>, i1 } [[TMP7]], i1 [[TMP5]], 1
375; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { <2 x half>, i1 } [[TMP8]], 1
376; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { <2 x half>, i1 } [[TMP8]], 0
377; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
378; CHECK:       atomicrmw.end:
379; CHECK-NEXT:    ret <2 x half> [[NEWLOADED]]
380;
381  %res = atomicrmw fsub ptr addrspace(1) %ptr, <2 x half> %value syncscope("agent") seq_cst, align 2
382  ret <2 x half> %res
383}
384
385define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_agent_align2(ptr addrspace(1) %ptr, <2 x bfloat> %value) {
386; CHECK-LABEL: define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_agent_align2(
387; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
388; CHECK-NEXT:    [[TMP1:%.*]] = alloca <2 x bfloat>, align 4, addrspace(5)
389; CHECK-NEXT:    [[TMP2:%.*]] = alloca <2 x bfloat>, align 4, addrspace(5)
390; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 2
391; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]]
392; CHECK:       atomicrmw.start:
393; CHECK-NEXT:    [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
394; CHECK-NEXT:    [[NEW:%.*]] = fsub <2 x bfloat> [[LOADED]], [[VALUE]]
395; CHECK-NEXT:    [[TMP4:%.*]] = addrspacecast ptr addrspace(1) [[PTR]] to ptr
396; CHECK-NEXT:    call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[TMP1]])
397; CHECK-NEXT:    store <2 x bfloat> [[LOADED]], ptr addrspace(5) [[TMP1]], align 4
398; CHECK-NEXT:    call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[TMP2]])
399; CHECK-NEXT:    store <2 x bfloat> [[NEW]], ptr addrspace(5) [[TMP2]], align 4
400; CHECK-NEXT:    [[TMP5:%.*]] = call zeroext i1 @__atomic_compare_exchange(i64 4, ptr [[TMP4]], ptr addrspace(5) [[TMP1]], ptr addrspace(5) [[TMP2]], i32 5, i32 5)
401; CHECK-NEXT:    call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[TMP2]])
402; CHECK-NEXT:    [[TMP6:%.*]] = load <2 x bfloat>, ptr addrspace(5) [[TMP1]], align 4
403; CHECK-NEXT:    call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[TMP1]])
404; CHECK-NEXT:    [[TMP7:%.*]] = insertvalue { <2 x bfloat>, i1 } poison, <2 x bfloat> [[TMP6]], 0
405; CHECK-NEXT:    [[TMP8:%.*]] = insertvalue { <2 x bfloat>, i1 } [[TMP7]], i1 [[TMP5]], 1
406; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { <2 x bfloat>, i1 } [[TMP8]], 1
407; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { <2 x bfloat>, i1 } [[TMP8]], 0
408; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
409; CHECK:       atomicrmw.end:
410; CHECK-NEXT:    ret <2 x bfloat> [[NEWLOADED]]
411;
412  %res = atomicrmw fsub ptr addrspace(1) %ptr, <2 x bfloat> %value syncscope("agent") seq_cst, align 2
413  ret <2 x bfloat> %res
414}
415
416define <2 x half> @test_atomicrmw_fsub_v2f16_global_agent_align4(ptr addrspace(1) %ptr, <2 x half> %value) {
417; CHECK-LABEL: define <2 x half> @test_atomicrmw_fsub_v2f16_global_agent_align4(
418; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
419; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
420; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]]
421; CHECK:       atomicrmw.start:
422; CHECK-NEXT:    [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
423; CHECK-NEXT:    [[NEW:%.*]] = fsub <2 x half> [[LOADED]], [[VALUE]]
424; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
425; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
426; CHECK-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
427; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
428; CHECK-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
429; CHECK-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
430; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
431; CHECK:       atomicrmw.end:
432; CHECK-NEXT:    ret <2 x half> [[TMP5]]
433;
434  %res = atomicrmw fsub ptr addrspace(1) %ptr, <2 x half> %value syncscope("agent") seq_cst, align 4
435  ret <2 x half> %res
436}
437
438define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_agent_align4(ptr addrspace(1) %ptr, <2 x bfloat> %value) {
439; CHECK-LABEL: define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_agent_align4(
440; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
441; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
442; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]]
443; CHECK:       atomicrmw.start:
444; CHECK-NEXT:    [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
445; CHECK-NEXT:    [[NEW:%.*]] = fsub <2 x bfloat> [[LOADED]], [[VALUE]]
446; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
447; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
448; CHECK-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
449; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
450; CHECK-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
451; CHECK-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
452; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
453; CHECK:       atomicrmw.end:
454; CHECK-NEXT:    ret <2 x bfloat> [[TMP5]]
455;
456  %res = atomicrmw fsub ptr addrspace(1) %ptr, <2 x bfloat> %value syncscope("agent") seq_cst, align 4
457  ret <2 x bfloat> %res
458}
459
460define <4 x half> @test_atomicrmw_fsub_v4f16_global_agent_align2(ptr addrspace(1) %ptr, <4 x half> %value) {
461; CHECK-LABEL: define <4 x half> @test_atomicrmw_fsub_v4f16_global_agent_align2(
462; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]], <4 x half> [[VALUE:%.*]]) #[[ATTR0]] {
463; CHECK-NEXT:    [[TMP1:%.*]] = alloca <4 x half>, align 8, addrspace(5)
464; CHECK-NEXT:    [[TMP2:%.*]] = alloca <4 x half>, align 8, addrspace(5)
465; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x half>, ptr addrspace(1) [[PTR]], align 2
466; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]]
467; CHECK:       atomicrmw.start:
468; CHECK-NEXT:    [[LOADED:%.*]] = phi <4 x half> [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
469; CHECK-NEXT:    [[NEW:%.*]] = fsub <4 x half> [[LOADED]], [[VALUE]]
470; CHECK-NEXT:    [[TMP4:%.*]] = addrspacecast ptr addrspace(1) [[PTR]] to ptr
471; CHECK-NEXT:    call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[TMP1]])
472; CHECK-NEXT:    store <4 x half> [[LOADED]], ptr addrspace(5) [[TMP1]], align 8
473; CHECK-NEXT:    call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[TMP2]])
474; CHECK-NEXT:    store <4 x half> [[NEW]], ptr addrspace(5) [[TMP2]], align 8
475; CHECK-NEXT:    [[TMP5:%.*]] = call zeroext i1 @__atomic_compare_exchange(i64 8, ptr [[TMP4]], ptr addrspace(5) [[TMP1]], ptr addrspace(5) [[TMP2]], i32 5, i32 5)
476; CHECK-NEXT:    call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[TMP2]])
477; CHECK-NEXT:    [[TMP6:%.*]] = load <4 x half>, ptr addrspace(5) [[TMP1]], align 8
478; CHECK-NEXT:    call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[TMP1]])
479; CHECK-NEXT:    [[TMP7:%.*]] = insertvalue { <4 x half>, i1 } poison, <4 x half> [[TMP6]], 0
480; CHECK-NEXT:    [[TMP8:%.*]] = insertvalue { <4 x half>, i1 } [[TMP7]], i1 [[TMP5]], 1
481; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { <4 x half>, i1 } [[TMP8]], 1
482; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { <4 x half>, i1 } [[TMP8]], 0
483; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
484; CHECK:       atomicrmw.end:
485; CHECK-NEXT:    ret <4 x half> [[NEWLOADED]]
486;
487  %res = atomicrmw fsub ptr addrspace(1) %ptr, <4 x half> %value syncscope("agent") seq_cst, align 2
488  ret <4 x half> %res
489}
490
491define <4 x bfloat> @test_atomicrmw_fsub_v4bf16_global_agent_align2(ptr addrspace(1) %ptr, <4 x bfloat> %value) {
492; CHECK-LABEL: define <4 x bfloat> @test_atomicrmw_fsub_v4bf16_global_agent_align2(
493; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]], <4 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
494; CHECK-NEXT:    [[TMP1:%.*]] = alloca <4 x bfloat>, align 8, addrspace(5)
495; CHECK-NEXT:    [[TMP2:%.*]] = alloca <4 x bfloat>, align 8, addrspace(5)
496; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x bfloat>, ptr addrspace(1) [[PTR]], align 2
497; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]]
498; CHECK:       atomicrmw.start:
499; CHECK-NEXT:    [[LOADED:%.*]] = phi <4 x bfloat> [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
500; CHECK-NEXT:    [[NEW:%.*]] = fsub <4 x bfloat> [[LOADED]], [[VALUE]]
501; CHECK-NEXT:    [[TMP4:%.*]] = addrspacecast ptr addrspace(1) [[PTR]] to ptr
502; CHECK-NEXT:    call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[TMP1]])
503; CHECK-NEXT:    store <4 x bfloat> [[LOADED]], ptr addrspace(5) [[TMP1]], align 8
504; CHECK-NEXT:    call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[TMP2]])
505; CHECK-NEXT:    store <4 x bfloat> [[NEW]], ptr addrspace(5) [[TMP2]], align 8
506; CHECK-NEXT:    [[TMP5:%.*]] = call zeroext i1 @__atomic_compare_exchange(i64 8, ptr [[TMP4]], ptr addrspace(5) [[TMP1]], ptr addrspace(5) [[TMP2]], i32 5, i32 5)
507; CHECK-NEXT:    call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[TMP2]])
508; CHECK-NEXT:    [[TMP6:%.*]] = load <4 x bfloat>, ptr addrspace(5) [[TMP1]], align 8
509; CHECK-NEXT:    call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[TMP1]])
510; CHECK-NEXT:    [[TMP7:%.*]] = insertvalue { <4 x bfloat>, i1 } poison, <4 x bfloat> [[TMP6]], 0
511; CHECK-NEXT:    [[TMP8:%.*]] = insertvalue { <4 x bfloat>, i1 } [[TMP7]], i1 [[TMP5]], 1
512; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { <4 x bfloat>, i1 } [[TMP8]], 1
513; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { <4 x bfloat>, i1 } [[TMP8]], 0
514; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
515; CHECK:       atomicrmw.end:
516; CHECK-NEXT:    ret <4 x bfloat> [[NEWLOADED]]
517;
518  %res = atomicrmw fsub ptr addrspace(1) %ptr, <4 x bfloat> %value syncscope("agent") seq_cst, align 2
519  ret <4 x bfloat> %res
520}
521
522define <4 x half> @test_atomicrmw_fsub_v4f16_global_agent_align4(ptr addrspace(1) %ptr, <4 x half> %value) {
523; CHECK-LABEL: define <4 x half> @test_atomicrmw_fsub_v4f16_global_agent_align4(
524; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]], <4 x half> [[VALUE:%.*]]) #[[ATTR0]] {
525; CHECK-NEXT:    [[TMP1:%.*]] = alloca <4 x half>, align 8, addrspace(5)
526; CHECK-NEXT:    [[TMP2:%.*]] = alloca <4 x half>, align 8, addrspace(5)
527; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x half>, ptr addrspace(1) [[PTR]], align 4
528; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]]
529; CHECK:       atomicrmw.start:
530; CHECK-NEXT:    [[LOADED:%.*]] = phi <4 x half> [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
531; CHECK-NEXT:    [[NEW:%.*]] = fsub <4 x half> [[LOADED]], [[VALUE]]
532; CHECK-NEXT:    [[TMP4:%.*]] = addrspacecast ptr addrspace(1) [[PTR]] to ptr
533; CHECK-NEXT:    call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[TMP1]])
534; CHECK-NEXT:    store <4 x half> [[LOADED]], ptr addrspace(5) [[TMP1]], align 8
535; CHECK-NEXT:    call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[TMP2]])
536; CHECK-NEXT:    store <4 x half> [[NEW]], ptr addrspace(5) [[TMP2]], align 8
537; CHECK-NEXT:    [[TMP5:%.*]] = call zeroext i1 @__atomic_compare_exchange(i64 8, ptr [[TMP4]], ptr addrspace(5) [[TMP1]], ptr addrspace(5) [[TMP2]], i32 5, i32 5)
538; CHECK-NEXT:    call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[TMP2]])
539; CHECK-NEXT:    [[TMP6:%.*]] = load <4 x half>, ptr addrspace(5) [[TMP1]], align 8
540; CHECK-NEXT:    call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[TMP1]])
541; CHECK-NEXT:    [[TMP7:%.*]] = insertvalue { <4 x half>, i1 } poison, <4 x half> [[TMP6]], 0
542; CHECK-NEXT:    [[TMP8:%.*]] = insertvalue { <4 x half>, i1 } [[TMP7]], i1 [[TMP5]], 1
543; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { <4 x half>, i1 } [[TMP8]], 1
544; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { <4 x half>, i1 } [[TMP8]], 0
545; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
546; CHECK:       atomicrmw.end:
547; CHECK-NEXT:    ret <4 x half> [[NEWLOADED]]
548;
549  %res = atomicrmw fsub ptr addrspace(1) %ptr, <4 x half> %value syncscope("agent") seq_cst, align 4
550  ret <4 x half> %res
551}
552
553define <4 x bfloat> @test_atomicrmw_fsub_v4bf16_global_agent_align4(ptr addrspace(1) %ptr, <4 x bfloat> %value) {
554; CHECK-LABEL: define <4 x bfloat> @test_atomicrmw_fsub_v4bf16_global_agent_align4(
555; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]], <4 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
556; CHECK-NEXT:    [[TMP1:%.*]] = alloca <4 x bfloat>, align 8, addrspace(5)
557; CHECK-NEXT:    [[TMP2:%.*]] = alloca <4 x bfloat>, align 8, addrspace(5)
558; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x bfloat>, ptr addrspace(1) [[PTR]], align 4
559; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]]
560; CHECK:       atomicrmw.start:
561; CHECK-NEXT:    [[LOADED:%.*]] = phi <4 x bfloat> [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
562; CHECK-NEXT:    [[NEW:%.*]] = fsub <4 x bfloat> [[LOADED]], [[VALUE]]
563; CHECK-NEXT:    [[TMP4:%.*]] = addrspacecast ptr addrspace(1) [[PTR]] to ptr
564; CHECK-NEXT:    call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[TMP1]])
565; CHECK-NEXT:    store <4 x bfloat> [[LOADED]], ptr addrspace(5) [[TMP1]], align 8
566; CHECK-NEXT:    call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[TMP2]])
567; CHECK-NEXT:    store <4 x bfloat> [[NEW]], ptr addrspace(5) [[TMP2]], align 8
568; CHECK-NEXT:    [[TMP5:%.*]] = call zeroext i1 @__atomic_compare_exchange(i64 8, ptr [[TMP4]], ptr addrspace(5) [[TMP1]], ptr addrspace(5) [[TMP2]], i32 5, i32 5)
569; CHECK-NEXT:    call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[TMP2]])
570; CHECK-NEXT:    [[TMP6:%.*]] = load <4 x bfloat>, ptr addrspace(5) [[TMP1]], align 8
571; CHECK-NEXT:    call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[TMP1]])
572; CHECK-NEXT:    [[TMP7:%.*]] = insertvalue { <4 x bfloat>, i1 } poison, <4 x bfloat> [[TMP6]], 0
573; CHECK-NEXT:    [[TMP8:%.*]] = insertvalue { <4 x bfloat>, i1 } [[TMP7]], i1 [[TMP5]], 1
574; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { <4 x bfloat>, i1 } [[TMP8]], 1
575; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { <4 x bfloat>, i1 } [[TMP8]], 0
576; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
577; CHECK:       atomicrmw.end:
578; CHECK-NEXT:    ret <4 x bfloat> [[NEWLOADED]]
579;
580  %res = atomicrmw fsub ptr addrspace(1) %ptr, <4 x bfloat> %value syncscope("agent") seq_cst, align 4
581  ret <4 x bfloat> %res
582}
583
584define <4 x half> @test_atomicrmw_fsub_v4f16_global_agent_align8(ptr addrspace(1) %ptr, <4 x half> %value) {
585; CHECK-LABEL: define <4 x half> @test_atomicrmw_fsub_v4f16_global_agent_align8(
586; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]], <4 x half> [[VALUE:%.*]]) #[[ATTR0]] {
587; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x half>, ptr addrspace(1) [[PTR]], align 8
588; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]]
589; CHECK:       atomicrmw.start:
590; CHECK-NEXT:    [[LOADED:%.*]] = phi <4 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
591; CHECK-NEXT:    [[NEW:%.*]] = fsub <4 x half> [[LOADED]], [[VALUE]]
592; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x half> [[NEW]] to i64
593; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x half> [[LOADED]] to i64
594; CHECK-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8
595; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
596; CHECK-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
597; CHECK-NEXT:    [[TMP5]] = bitcast i64 [[NEWLOADED]] to <4 x half>
598; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
599; CHECK:       atomicrmw.end:
600; CHECK-NEXT:    ret <4 x half> [[TMP5]]
601;
602  %res = atomicrmw fsub ptr addrspace(1) %ptr, <4 x half> %value syncscope("agent") seq_cst, align 8
603  ret <4 x half> %res
604}
605
606define <4 x bfloat> @test_atomicrmw_fsub_v4bf16_global_agent_align8(ptr addrspace(1) %ptr, <4 x bfloat> %value) {
607; CHECK-LABEL: define <4 x bfloat> @test_atomicrmw_fsub_v4bf16_global_agent_align8(
608; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]], <4 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
609; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x bfloat>, ptr addrspace(1) [[PTR]], align 8
610; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]]
611; CHECK:       atomicrmw.start:
612; CHECK-NEXT:    [[LOADED:%.*]] = phi <4 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
613; CHECK-NEXT:    [[NEW:%.*]] = fsub <4 x bfloat> [[LOADED]], [[VALUE]]
614; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x bfloat> [[NEW]] to i64
615; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x bfloat> [[LOADED]] to i64
616; CHECK-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8
617; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
618; CHECK-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
619; CHECK-NEXT:    [[TMP5]] = bitcast i64 [[NEWLOADED]] to <4 x bfloat>
620; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
621; CHECK:       atomicrmw.end:
622; CHECK-NEXT:    ret <4 x bfloat> [[TMP5]]
623;
624  %res = atomicrmw fsub ptr addrspace(1) %ptr, <4 x bfloat> %value syncscope("agent") seq_cst, align 8
625  ret <4 x bfloat> %res
626}
627
628define <2 x float> @test_atomicrmw_fsub_v2f32_global_agent_align8(ptr addrspace(1) %ptr, <2 x float> %value) {
629; CHECK-LABEL: define <2 x float> @test_atomicrmw_fsub_v2f32_global_agent_align8(
630; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x float> [[VALUE:%.*]]) #[[ATTR0]] {
631; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x float>, ptr addrspace(1) [[PTR]], align 8
632; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]]
633; CHECK:       atomicrmw.start:
634; CHECK-NEXT:    [[LOADED:%.*]] = phi <2 x float> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
635; CHECK-NEXT:    [[NEW:%.*]] = fsub <2 x float> [[LOADED]], [[VALUE]]
636; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x float> [[NEW]] to i64
637; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x float> [[LOADED]] to i64
638; CHECK-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8
639; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
640; CHECK-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
641; CHECK-NEXT:    [[TMP5]] = bitcast i64 [[NEWLOADED]] to <2 x float>
642; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
643; CHECK:       atomicrmw.end:
644; CHECK-NEXT:    ret <2 x float> [[TMP5]]
645;
646  %res = atomicrmw fsub ptr addrspace(1) %ptr, <2 x float> %value syncscope("agent") seq_cst, align 8
647  ret <2 x float> %res
648}
649
650;---------------------------------------------------------------------
651; atomicrmw fmin
652;---------------------------------------------------------------------
653
654define <2 x half> @test_atomicrmw_fmin_v2f16_global_agent_align2(ptr addrspace(1) %ptr, <2 x half> %value) {
655; CHECK-LABEL: define <2 x half> @test_atomicrmw_fmin_v2f16_global_agent_align2(
656; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
657; CHECK-NEXT:    [[TMP1:%.*]] = alloca <2 x half>, align 4, addrspace(5)
658; CHECK-NEXT:    [[TMP2:%.*]] = alloca <2 x half>, align 4, addrspace(5)
659; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 2
660; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]]
661; CHECK:       atomicrmw.start:
662; CHECK-NEXT:    [[LOADED:%.*]] = phi <2 x half> [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
663; CHECK-NEXT:    [[TMP4:%.*]] = call <2 x half> @llvm.minnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
664; CHECK-NEXT:    [[TMP5:%.*]] = addrspacecast ptr addrspace(1) [[PTR]] to ptr
665; CHECK-NEXT:    call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[TMP1]])
666; CHECK-NEXT:    store <2 x half> [[LOADED]], ptr addrspace(5) [[TMP1]], align 4
667; CHECK-NEXT:    call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[TMP2]])
668; CHECK-NEXT:    store <2 x half> [[TMP4]], ptr addrspace(5) [[TMP2]], align 4
669; CHECK-NEXT:    [[TMP6:%.*]] = call zeroext i1 @__atomic_compare_exchange(i64 4, ptr [[TMP5]], ptr addrspace(5) [[TMP1]], ptr addrspace(5) [[TMP2]], i32 5, i32 5)
670; CHECK-NEXT:    call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[TMP2]])
671; CHECK-NEXT:    [[TMP7:%.*]] = load <2 x half>, ptr addrspace(5) [[TMP1]], align 4
672; CHECK-NEXT:    call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[TMP1]])
673; CHECK-NEXT:    [[TMP8:%.*]] = insertvalue { <2 x half>, i1 } poison, <2 x half> [[TMP7]], 0
674; CHECK-NEXT:    [[TMP9:%.*]] = insertvalue { <2 x half>, i1 } [[TMP8]], i1 [[TMP6]], 1
675; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { <2 x half>, i1 } [[TMP9]], 1
676; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { <2 x half>, i1 } [[TMP9]], 0
677; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
678; CHECK:       atomicrmw.end:
679; CHECK-NEXT:    ret <2 x half> [[NEWLOADED]]
680;
681  %res = atomicrmw fmin ptr addrspace(1) %ptr, <2 x half> %value syncscope("agent") seq_cst, align 2
682  ret <2 x half> %res
683}
684
685define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_agent_align2(ptr addrspace(1) %ptr, <2 x bfloat> %value) {
686; CHECK-LABEL: define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_agent_align2(
687; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
688; CHECK-NEXT:    [[TMP1:%.*]] = alloca <2 x bfloat>, align 4, addrspace(5)
689; CHECK-NEXT:    [[TMP2:%.*]] = alloca <2 x bfloat>, align 4, addrspace(5)
690; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 2
691; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]]
692; CHECK:       atomicrmw.start:
693; CHECK-NEXT:    [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
694; CHECK-NEXT:    [[TMP4:%.*]] = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
695; CHECK-NEXT:    [[TMP5:%.*]] = addrspacecast ptr addrspace(1) [[PTR]] to ptr
696; CHECK-NEXT:    call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[TMP1]])
697; CHECK-NEXT:    store <2 x bfloat> [[LOADED]], ptr addrspace(5) [[TMP1]], align 4
698; CHECK-NEXT:    call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[TMP2]])
699; CHECK-NEXT:    store <2 x bfloat> [[TMP4]], ptr addrspace(5) [[TMP2]], align 4
700; CHECK-NEXT:    [[TMP6:%.*]] = call zeroext i1 @__atomic_compare_exchange(i64 4, ptr [[TMP5]], ptr addrspace(5) [[TMP1]], ptr addrspace(5) [[TMP2]], i32 5, i32 5)
701; CHECK-NEXT:    call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[TMP2]])
702; CHECK-NEXT:    [[TMP7:%.*]] = load <2 x bfloat>, ptr addrspace(5) [[TMP1]], align 4
703; CHECK-NEXT:    call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[TMP1]])
704; CHECK-NEXT:    [[TMP8:%.*]] = insertvalue { <2 x bfloat>, i1 } poison, <2 x bfloat> [[TMP7]], 0
705; CHECK-NEXT:    [[TMP9:%.*]] = insertvalue { <2 x bfloat>, i1 } [[TMP8]], i1 [[TMP6]], 1
706; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { <2 x bfloat>, i1 } [[TMP9]], 1
707; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { <2 x bfloat>, i1 } [[TMP9]], 0
708; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
709; CHECK:       atomicrmw.end:
710; CHECK-NEXT:    ret <2 x bfloat> [[NEWLOADED]]
711;
712  %res = atomicrmw fmin ptr addrspace(1) %ptr, <2 x bfloat> %value syncscope("agent") seq_cst, align 2
713  ret <2 x bfloat> %res
714}
715
716define <2 x half> @test_atomicrmw_fmin_v2f16_global_agent_align4(ptr addrspace(1) %ptr, <2 x half> %value) {
717; CHECK-LABEL: define <2 x half> @test_atomicrmw_fmin_v2f16_global_agent_align4(
718; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
719; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
720; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]]
721; CHECK:       atomicrmw.start:
722; CHECK-NEXT:    [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
723; CHECK-NEXT:    [[TMP2:%.*]] = call <2 x half> @llvm.minnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
724; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
725; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
726; CHECK-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
727; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
728; CHECK-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
729; CHECK-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half>
730; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
731; CHECK:       atomicrmw.end:
732; CHECK-NEXT:    ret <2 x half> [[TMP6]]
733;
734  %res = atomicrmw fmin ptr addrspace(1) %ptr, <2 x half> %value syncscope("agent") seq_cst, align 4
735  ret <2 x half> %res
736}
737
738define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_agent_align4(ptr addrspace(1) %ptr, <2 x bfloat> %value) {
739; CHECK-LABEL: define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_agent_align4(
740; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
741; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
742; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]]
743; CHECK:       atomicrmw.start:
744; CHECK-NEXT:    [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
745; CHECK-NEXT:    [[TMP2:%.*]] = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
746; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
747; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
748; CHECK-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
749; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
750; CHECK-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
751; CHECK-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
752; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
753; CHECK:       atomicrmw.end:
754; CHECK-NEXT:    ret <2 x bfloat> [[TMP6]]
755;
756  %res = atomicrmw fmin ptr addrspace(1) %ptr, <2 x bfloat> %value syncscope("agent") seq_cst, align 4
757  ret <2 x bfloat> %res
758}
759
760define <4 x half> @test_atomicrmw_fmin_v4f16_global_agent_align2(ptr addrspace(1) %ptr, <4 x half> %value) {
761; CHECK-LABEL: define <4 x half> @test_atomicrmw_fmin_v4f16_global_agent_align2(
762; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]], <4 x half> [[VALUE:%.*]]) #[[ATTR0]] {
763; CHECK-NEXT:    [[TMP1:%.*]] = alloca <4 x half>, align 8, addrspace(5)
764; CHECK-NEXT:    [[TMP2:%.*]] = alloca <4 x half>, align 8, addrspace(5)
765; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x half>, ptr addrspace(1) [[PTR]], align 2
766; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]]
767; CHECK:       atomicrmw.start:
768; CHECK-NEXT:    [[LOADED:%.*]] = phi <4 x half> [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
769; CHECK-NEXT:    [[TMP4:%.*]] = call <4 x half> @llvm.minnum.v4f16(<4 x half> [[LOADED]], <4 x half> [[VALUE]])
770; CHECK-NEXT:    [[TMP5:%.*]] = addrspacecast ptr addrspace(1) [[PTR]] to ptr
771; CHECK-NEXT:    call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[TMP1]])
772; CHECK-NEXT:    store <4 x half> [[LOADED]], ptr addrspace(5) [[TMP1]], align 8
773; CHECK-NEXT:    call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[TMP2]])
774; CHECK-NEXT:    store <4 x half> [[TMP4]], ptr addrspace(5) [[TMP2]], align 8
775; CHECK-NEXT:    [[TMP6:%.*]] = call zeroext i1 @__atomic_compare_exchange(i64 8, ptr [[TMP5]], ptr addrspace(5) [[TMP1]], ptr addrspace(5) [[TMP2]], i32 5, i32 5)
776; CHECK-NEXT:    call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[TMP2]])
777; CHECK-NEXT:    [[TMP7:%.*]] = load <4 x half>, ptr addrspace(5) [[TMP1]], align 8
778; CHECK-NEXT:    call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[TMP1]])
779; CHECK-NEXT:    [[TMP8:%.*]] = insertvalue { <4 x half>, i1 } poison, <4 x half> [[TMP7]], 0
780; CHECK-NEXT:    [[TMP9:%.*]] = insertvalue { <4 x half>, i1 } [[TMP8]], i1 [[TMP6]], 1
781; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { <4 x half>, i1 } [[TMP9]], 1
782; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { <4 x half>, i1 } [[TMP9]], 0
783; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
784; CHECK:       atomicrmw.end:
785; CHECK-NEXT:    ret <4 x half> [[NEWLOADED]]
786;
787  %res = atomicrmw fmin ptr addrspace(1) %ptr, <4 x half> %value syncscope("agent") seq_cst, align 2
788  ret <4 x half> %res
789}
790
791define <4 x bfloat> @test_atomicrmw_fmin_v4bf16_global_agent_align2(ptr addrspace(1) %ptr, <4 x bfloat> %value) {
792; CHECK-LABEL: define <4 x bfloat> @test_atomicrmw_fmin_v4bf16_global_agent_align2(
793; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]], <4 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
794; CHECK-NEXT:    [[TMP1:%.*]] = alloca <4 x bfloat>, align 8, addrspace(5)
795; CHECK-NEXT:    [[TMP2:%.*]] = alloca <4 x bfloat>, align 8, addrspace(5)
796; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x bfloat>, ptr addrspace(1) [[PTR]], align 2
797; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]]
798; CHECK:       atomicrmw.start:
799; CHECK-NEXT:    [[LOADED:%.*]] = phi <4 x bfloat> [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
800; CHECK-NEXT:    [[TMP4:%.*]] = call <4 x bfloat> @llvm.minnum.v4bf16(<4 x bfloat> [[LOADED]], <4 x bfloat> [[VALUE]])
801; CHECK-NEXT:    [[TMP5:%.*]] = addrspacecast ptr addrspace(1) [[PTR]] to ptr
802; CHECK-NEXT:    call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[TMP1]])
803; CHECK-NEXT:    store <4 x bfloat> [[LOADED]], ptr addrspace(5) [[TMP1]], align 8
804; CHECK-NEXT:    call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[TMP2]])
805; CHECK-NEXT:    store <4 x bfloat> [[TMP4]], ptr addrspace(5) [[TMP2]], align 8
806; CHECK-NEXT:    [[TMP6:%.*]] = call zeroext i1 @__atomic_compare_exchange(i64 8, ptr [[TMP5]], ptr addrspace(5) [[TMP1]], ptr addrspace(5) [[TMP2]], i32 5, i32 5)
807; CHECK-NEXT:    call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[TMP2]])
808; CHECK-NEXT:    [[TMP7:%.*]] = load <4 x bfloat>, ptr addrspace(5) [[TMP1]], align 8
809; CHECK-NEXT:    call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[TMP1]])
810; CHECK-NEXT:    [[TMP8:%.*]] = insertvalue { <4 x bfloat>, i1 } poison, <4 x bfloat> [[TMP7]], 0
811; CHECK-NEXT:    [[TMP9:%.*]] = insertvalue { <4 x bfloat>, i1 } [[TMP8]], i1 [[TMP6]], 1
812; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { <4 x bfloat>, i1 } [[TMP9]], 1
813; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { <4 x bfloat>, i1 } [[TMP9]], 0
814; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
815; CHECK:       atomicrmw.end:
816; CHECK-NEXT:    ret <4 x bfloat> [[NEWLOADED]]
817;
818  %res = atomicrmw fmin ptr addrspace(1) %ptr, <4 x bfloat> %value syncscope("agent") seq_cst, align 2
819  ret <4 x bfloat> %res
820}
821
822define <4 x half> @test_atomicrmw_fmin_v4f16_global_agent_align4(ptr addrspace(1) %ptr, <4 x half> %value) {
823; CHECK-LABEL: define <4 x half> @test_atomicrmw_fmin_v4f16_global_agent_align4(
824; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]], <4 x half> [[VALUE:%.*]]) #[[ATTR0]] {
825; CHECK-NEXT:    [[TMP1:%.*]] = alloca <4 x half>, align 8, addrspace(5)
826; CHECK-NEXT:    [[TMP2:%.*]] = alloca <4 x half>, align 8, addrspace(5)
827; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x half>, ptr addrspace(1) [[PTR]], align 4
828; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]]
829; CHECK:       atomicrmw.start:
830; CHECK-NEXT:    [[LOADED:%.*]] = phi <4 x half> [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
831; CHECK-NEXT:    [[TMP4:%.*]] = call <4 x half> @llvm.minnum.v4f16(<4 x half> [[LOADED]], <4 x half> [[VALUE]])
832; CHECK-NEXT:    [[TMP5:%.*]] = addrspacecast ptr addrspace(1) [[PTR]] to ptr
833; CHECK-NEXT:    call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[TMP1]])
834; CHECK-NEXT:    store <4 x half> [[LOADED]], ptr addrspace(5) [[TMP1]], align 8
835; CHECK-NEXT:    call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[TMP2]])
836; CHECK-NEXT:    store <4 x half> [[TMP4]], ptr addrspace(5) [[TMP2]], align 8
837; CHECK-NEXT:    [[TMP6:%.*]] = call zeroext i1 @__atomic_compare_exchange(i64 8, ptr [[TMP5]], ptr addrspace(5) [[TMP1]], ptr addrspace(5) [[TMP2]], i32 5, i32 5)
838; CHECK-NEXT:    call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[TMP2]])
839; CHECK-NEXT:    [[TMP7:%.*]] = load <4 x half>, ptr addrspace(5) [[TMP1]], align 8
840; CHECK-NEXT:    call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[TMP1]])
841; CHECK-NEXT:    [[TMP8:%.*]] = insertvalue { <4 x half>, i1 } poison, <4 x half> [[TMP7]], 0
842; CHECK-NEXT:    [[TMP9:%.*]] = insertvalue { <4 x half>, i1 } [[TMP8]], i1 [[TMP6]], 1
843; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { <4 x half>, i1 } [[TMP9]], 1
844; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { <4 x half>, i1 } [[TMP9]], 0
845; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
846; CHECK:       atomicrmw.end:
847; CHECK-NEXT:    ret <4 x half> [[NEWLOADED]]
848;
849  %res = atomicrmw fmin ptr addrspace(1) %ptr, <4 x half> %value syncscope("agent") seq_cst, align 4
850  ret <4 x half> %res
851}
852
853define <4 x bfloat> @test_atomicrmw_fmin_v4bf16_global_agent_align4(ptr addrspace(1) %ptr, <4 x bfloat> %value) {
854; CHECK-LABEL: define <4 x bfloat> @test_atomicrmw_fmin_v4bf16_global_agent_align4(
855; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]], <4 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
856; CHECK-NEXT:    [[TMP1:%.*]] = alloca <4 x bfloat>, align 8, addrspace(5)
857; CHECK-NEXT:    [[TMP2:%.*]] = alloca <4 x bfloat>, align 8, addrspace(5)
858; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x bfloat>, ptr addrspace(1) [[PTR]], align 4
859; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]]
860; CHECK:       atomicrmw.start:
861; CHECK-NEXT:    [[LOADED:%.*]] = phi <4 x bfloat> [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
862; CHECK-NEXT:    [[TMP4:%.*]] = call <4 x bfloat> @llvm.minnum.v4bf16(<4 x bfloat> [[LOADED]], <4 x bfloat> [[VALUE]])
863; CHECK-NEXT:    [[TMP5:%.*]] = addrspacecast ptr addrspace(1) [[PTR]] to ptr
864; CHECK-NEXT:    call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[TMP1]])
865; CHECK-NEXT:    store <4 x bfloat> [[LOADED]], ptr addrspace(5) [[TMP1]], align 8
866; CHECK-NEXT:    call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[TMP2]])
867; CHECK-NEXT:    store <4 x bfloat> [[TMP4]], ptr addrspace(5) [[TMP2]], align 8
868; CHECK-NEXT:    [[TMP6:%.*]] = call zeroext i1 @__atomic_compare_exchange(i64 8, ptr [[TMP5]], ptr addrspace(5) [[TMP1]], ptr addrspace(5) [[TMP2]], i32 5, i32 5)
869; CHECK-NEXT:    call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[TMP2]])
870; CHECK-NEXT:    [[TMP7:%.*]] = load <4 x bfloat>, ptr addrspace(5) [[TMP1]], align 8
871; CHECK-NEXT:    call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[TMP1]])
872; CHECK-NEXT:    [[TMP8:%.*]] = insertvalue { <4 x bfloat>, i1 } poison, <4 x bfloat> [[TMP7]], 0
873; CHECK-NEXT:    [[TMP9:%.*]] = insertvalue { <4 x bfloat>, i1 } [[TMP8]], i1 [[TMP6]], 1
874; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { <4 x bfloat>, i1 } [[TMP9]], 1
875; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { <4 x bfloat>, i1 } [[TMP9]], 0
876; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
877; CHECK:       atomicrmw.end:
878; CHECK-NEXT:    ret <4 x bfloat> [[NEWLOADED]]
879;
880  %res = atomicrmw fmin ptr addrspace(1) %ptr, <4 x bfloat> %value syncscope("agent") seq_cst, align 4
881  ret <4 x bfloat> %res
882}
883
884define <4 x half> @test_atomicrmw_fmin_v4f16_global_agent_align8(ptr addrspace(1) %ptr, <4 x half> %value) {
885; CHECK-LABEL: define <4 x half> @test_atomicrmw_fmin_v4f16_global_agent_align8(
886; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]], <4 x half> [[VALUE:%.*]]) #[[ATTR0]] {
887; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x half>, ptr addrspace(1) [[PTR]], align 8
888; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]]
889; CHECK:       atomicrmw.start:
890; CHECK-NEXT:    [[LOADED:%.*]] = phi <4 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
891; CHECK-NEXT:    [[TMP2:%.*]] = call <4 x half> @llvm.minnum.v4f16(<4 x half> [[LOADED]], <4 x half> [[VALUE]])
892; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x half> [[TMP2]] to i64
893; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x half> [[LOADED]] to i64
894; CHECK-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
895; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
896; CHECK-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
897; CHECK-NEXT:    [[TMP6]] = bitcast i64 [[NEWLOADED]] to <4 x half>
898; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
899; CHECK:       atomicrmw.end:
900; CHECK-NEXT:    ret <4 x half> [[TMP6]]
901;
902  %res = atomicrmw fmin ptr addrspace(1) %ptr, <4 x half> %value syncscope("agent") seq_cst, align 8
903  ret <4 x half> %res
904}
905
906define <4 x bfloat> @test_atomicrmw_fmin_v4bf16_global_agent_align8(ptr addrspace(1) %ptr, <4 x bfloat> %value) {
907; CHECK-LABEL: define <4 x bfloat> @test_atomicrmw_fmin_v4bf16_global_agent_align8(
908; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]], <4 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
909; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x bfloat>, ptr addrspace(1) [[PTR]], align 8
910; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]]
911; CHECK:       atomicrmw.start:
912; CHECK-NEXT:    [[LOADED:%.*]] = phi <4 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
913; CHECK-NEXT:    [[TMP2:%.*]] = call <4 x bfloat> @llvm.minnum.v4bf16(<4 x bfloat> [[LOADED]], <4 x bfloat> [[VALUE]])
914; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x bfloat> [[TMP2]] to i64
915; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x bfloat> [[LOADED]] to i64
916; CHECK-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
917; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
918; CHECK-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
919; CHECK-NEXT:    [[TMP6]] = bitcast i64 [[NEWLOADED]] to <4 x bfloat>
920; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
921; CHECK:       atomicrmw.end:
922; CHECK-NEXT:    ret <4 x bfloat> [[TMP6]]
923;
924  %res = atomicrmw fmin ptr addrspace(1) %ptr, <4 x bfloat> %value syncscope("agent") seq_cst, align 8
925  ret <4 x bfloat> %res
926}
927
928define <2 x float> @test_atomicrmw_fmin_v2f32_global_agent_align8(ptr addrspace(1) %ptr, <2 x float> %value) {
929; CHECK-LABEL: define <2 x float> @test_atomicrmw_fmin_v2f32_global_agent_align8(
930; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x float> [[VALUE:%.*]]) #[[ATTR0]] {
931; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x float>, ptr addrspace(1) [[PTR]], align 8
932; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]]
933; CHECK:       atomicrmw.start:
934; CHECK-NEXT:    [[LOADED:%.*]] = phi <2 x float> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
935; CHECK-NEXT:    [[TMP2:%.*]] = call <2 x float> @llvm.minnum.v2f32(<2 x float> [[LOADED]], <2 x float> [[VALUE]])
936; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x float> [[TMP2]] to i64
937; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x float> [[LOADED]] to i64
938; CHECK-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
939; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
940; CHECK-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
941; CHECK-NEXT:    [[TMP6]] = bitcast i64 [[NEWLOADED]] to <2 x float>
942; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
943; CHECK:       atomicrmw.end:
944; CHECK-NEXT:    ret <2 x float> [[TMP6]]
945;
946  %res = atomicrmw fmin ptr addrspace(1) %ptr, <2 x float> %value syncscope("agent") seq_cst, align 8
947  ret <2 x float> %res
948}
949
950;---------------------------------------------------------------------
951; atomicrmw fmax
952;---------------------------------------------------------------------
953
954define <2 x half> @test_atomicrmw_fmax_v2f16_global_agent_align2(ptr addrspace(1) %ptr, <2 x half> %value) {
955; CHECK-LABEL: define <2 x half> @test_atomicrmw_fmax_v2f16_global_agent_align2(
956; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
957; CHECK-NEXT:    [[TMP1:%.*]] = alloca <2 x half>, align 4, addrspace(5)
958; CHECK-NEXT:    [[TMP2:%.*]] = alloca <2 x half>, align 4, addrspace(5)
959; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 2
960; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]]
961; CHECK:       atomicrmw.start:
962; CHECK-NEXT:    [[LOADED:%.*]] = phi <2 x half> [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
963; CHECK-NEXT:    [[TMP4:%.*]] = call <2 x half> @llvm.maxnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
964; CHECK-NEXT:    [[TMP5:%.*]] = addrspacecast ptr addrspace(1) [[PTR]] to ptr
965; CHECK-NEXT:    call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[TMP1]])
966; CHECK-NEXT:    store <2 x half> [[LOADED]], ptr addrspace(5) [[TMP1]], align 4
967; CHECK-NEXT:    call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[TMP2]])
968; CHECK-NEXT:    store <2 x half> [[TMP4]], ptr addrspace(5) [[TMP2]], align 4
969; CHECK-NEXT:    [[TMP6:%.*]] = call zeroext i1 @__atomic_compare_exchange(i64 4, ptr [[TMP5]], ptr addrspace(5) [[TMP1]], ptr addrspace(5) [[TMP2]], i32 5, i32 5)
970; CHECK-NEXT:    call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[TMP2]])
971; CHECK-NEXT:    [[TMP7:%.*]] = load <2 x half>, ptr addrspace(5) [[TMP1]], align 4
972; CHECK-NEXT:    call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[TMP1]])
973; CHECK-NEXT:    [[TMP8:%.*]] = insertvalue { <2 x half>, i1 } poison, <2 x half> [[TMP7]], 0
974; CHECK-NEXT:    [[TMP9:%.*]] = insertvalue { <2 x half>, i1 } [[TMP8]], i1 [[TMP6]], 1
975; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { <2 x half>, i1 } [[TMP9]], 1
976; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { <2 x half>, i1 } [[TMP9]], 0
977; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
978; CHECK:       atomicrmw.end:
979; CHECK-NEXT:    ret <2 x half> [[NEWLOADED]]
980;
981  %res = atomicrmw fmax ptr addrspace(1) %ptr, <2 x half> %value syncscope("agent") seq_cst, align 2
982  ret <2 x half> %res
983}
984
985define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_agent_align2(ptr addrspace(1) %ptr, <2 x bfloat> %value) {
986; CHECK-LABEL: define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_agent_align2(
987; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
988; CHECK-NEXT:    [[TMP1:%.*]] = alloca <2 x bfloat>, align 4, addrspace(5)
989; CHECK-NEXT:    [[TMP2:%.*]] = alloca <2 x bfloat>, align 4, addrspace(5)
990; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 2
991; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]]
992; CHECK:       atomicrmw.start:
993; CHECK-NEXT:    [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
994; CHECK-NEXT:    [[TMP4:%.*]] = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
995; CHECK-NEXT:    [[TMP5:%.*]] = addrspacecast ptr addrspace(1) [[PTR]] to ptr
996; CHECK-NEXT:    call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[TMP1]])
997; CHECK-NEXT:    store <2 x bfloat> [[LOADED]], ptr addrspace(5) [[TMP1]], align 4
998; CHECK-NEXT:    call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[TMP2]])
999; CHECK-NEXT:    store <2 x bfloat> [[TMP4]], ptr addrspace(5) [[TMP2]], align 4
1000; CHECK-NEXT:    [[TMP6:%.*]] = call zeroext i1 @__atomic_compare_exchange(i64 4, ptr [[TMP5]], ptr addrspace(5) [[TMP1]], ptr addrspace(5) [[TMP2]], i32 5, i32 5)
1001; CHECK-NEXT:    call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[TMP2]])
1002; CHECK-NEXT:    [[TMP7:%.*]] = load <2 x bfloat>, ptr addrspace(5) [[TMP1]], align 4
1003; CHECK-NEXT:    call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[TMP1]])
1004; CHECK-NEXT:    [[TMP8:%.*]] = insertvalue { <2 x bfloat>, i1 } poison, <2 x bfloat> [[TMP7]], 0
1005; CHECK-NEXT:    [[TMP9:%.*]] = insertvalue { <2 x bfloat>, i1 } [[TMP8]], i1 [[TMP6]], 1
1006; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { <2 x bfloat>, i1 } [[TMP9]], 1
1007; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { <2 x bfloat>, i1 } [[TMP9]], 0
1008; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
1009; CHECK:       atomicrmw.end:
1010; CHECK-NEXT:    ret <2 x bfloat> [[NEWLOADED]]
1011;
1012  %res = atomicrmw fmax ptr addrspace(1) %ptr, <2 x bfloat> %value syncscope("agent") seq_cst, align 2
1013  ret <2 x bfloat> %res
1014}
1015
1016define <2 x half> @test_atomicrmw_fmax_v2f16_global_agent_align4(ptr addrspace(1) %ptr, <2 x half> %value) {
1017; CHECK-LABEL: define <2 x half> @test_atomicrmw_fmax_v2f16_global_agent_align4(
1018; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
1019; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
1020; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]]
1021; CHECK:       atomicrmw.start:
1022; CHECK-NEXT:    [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
1023; CHECK-NEXT:    [[TMP2:%.*]] = call <2 x half> @llvm.maxnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
1024; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
1025; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
1026; CHECK-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
1027; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
1028; CHECK-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
1029; CHECK-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half>
1030; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
1031; CHECK:       atomicrmw.end:
1032; CHECK-NEXT:    ret <2 x half> [[TMP6]]
1033;
1034  %res = atomicrmw fmax ptr addrspace(1) %ptr, <2 x half> %value syncscope("agent") seq_cst, align 4
1035  ret <2 x half> %res
1036}
1037
1038define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_agent_align4(ptr addrspace(1) %ptr, <2 x bfloat> %value) {
1039; CHECK-LABEL: define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_agent_align4(
1040; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
1041; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
1042; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]]
1043; CHECK:       atomicrmw.start:
1044; CHECK-NEXT:    [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
1045; CHECK-NEXT:    [[TMP2:%.*]] = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
1046; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
1047; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
1048; CHECK-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
1049; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
1050; CHECK-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
1051; CHECK-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
1052; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
1053; CHECK:       atomicrmw.end:
1054; CHECK-NEXT:    ret <2 x bfloat> [[TMP6]]
1055;
1056  %res = atomicrmw fmax ptr addrspace(1) %ptr, <2 x bfloat> %value syncscope("agent") seq_cst, align 4
1057  ret <2 x bfloat> %res
1058}
1059
1060define <4 x half> @test_atomicrmw_fmax_v4f16_global_agent_align2(ptr addrspace(1) %ptr, <4 x half> %value) {
1061; CHECK-LABEL: define <4 x half> @test_atomicrmw_fmax_v4f16_global_agent_align2(
1062; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]], <4 x half> [[VALUE:%.*]]) #[[ATTR0]] {
1063; CHECK-NEXT:    [[TMP1:%.*]] = alloca <4 x half>, align 8, addrspace(5)
1064; CHECK-NEXT:    [[TMP2:%.*]] = alloca <4 x half>, align 8, addrspace(5)
1065; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x half>, ptr addrspace(1) [[PTR]], align 2
1066; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]]
1067; CHECK:       atomicrmw.start:
1068; CHECK-NEXT:    [[LOADED:%.*]] = phi <4 x half> [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
1069; CHECK-NEXT:    [[TMP4:%.*]] = call <4 x half> @llvm.maxnum.v4f16(<4 x half> [[LOADED]], <4 x half> [[VALUE]])
1070; CHECK-NEXT:    [[TMP5:%.*]] = addrspacecast ptr addrspace(1) [[PTR]] to ptr
1071; CHECK-NEXT:    call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[TMP1]])
1072; CHECK-NEXT:    store <4 x half> [[LOADED]], ptr addrspace(5) [[TMP1]], align 8
1073; CHECK-NEXT:    call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[TMP2]])
1074; CHECK-NEXT:    store <4 x half> [[TMP4]], ptr addrspace(5) [[TMP2]], align 8
1075; CHECK-NEXT:    [[TMP6:%.*]] = call zeroext i1 @__atomic_compare_exchange(i64 8, ptr [[TMP5]], ptr addrspace(5) [[TMP1]], ptr addrspace(5) [[TMP2]], i32 5, i32 5)
1076; CHECK-NEXT:    call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[TMP2]])
1077; CHECK-NEXT:    [[TMP7:%.*]] = load <4 x half>, ptr addrspace(5) [[TMP1]], align 8
1078; CHECK-NEXT:    call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[TMP1]])
1079; CHECK-NEXT:    [[TMP8:%.*]] = insertvalue { <4 x half>, i1 } poison, <4 x half> [[TMP7]], 0
1080; CHECK-NEXT:    [[TMP9:%.*]] = insertvalue { <4 x half>, i1 } [[TMP8]], i1 [[TMP6]], 1
1081; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { <4 x half>, i1 } [[TMP9]], 1
1082; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { <4 x half>, i1 } [[TMP9]], 0
1083; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
1084; CHECK:       atomicrmw.end:
1085; CHECK-NEXT:    ret <4 x half> [[NEWLOADED]]
1086;
1087  %res = atomicrmw fmax ptr addrspace(1) %ptr, <4 x half> %value syncscope("agent") seq_cst, align 2
1088  ret <4 x half> %res
1089}
1090
1091define <4 x bfloat> @test_atomicrmw_fmax_v4bf16_global_agent_align2(ptr addrspace(1) %ptr, <4 x bfloat> %value) {
1092; CHECK-LABEL: define <4 x bfloat> @test_atomicrmw_fmax_v4bf16_global_agent_align2(
1093; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]], <4 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
1094; CHECK-NEXT:    [[TMP1:%.*]] = alloca <4 x bfloat>, align 8, addrspace(5)
1095; CHECK-NEXT:    [[TMP2:%.*]] = alloca <4 x bfloat>, align 8, addrspace(5)
1096; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x bfloat>, ptr addrspace(1) [[PTR]], align 2
1097; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]]
1098; CHECK:       atomicrmw.start:
1099; CHECK-NEXT:    [[LOADED:%.*]] = phi <4 x bfloat> [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
1100; CHECK-NEXT:    [[TMP4:%.*]] = call <4 x bfloat> @llvm.maxnum.v4bf16(<4 x bfloat> [[LOADED]], <4 x bfloat> [[VALUE]])
1101; CHECK-NEXT:    [[TMP5:%.*]] = addrspacecast ptr addrspace(1) [[PTR]] to ptr
1102; CHECK-NEXT:    call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[TMP1]])
1103; CHECK-NEXT:    store <4 x bfloat> [[LOADED]], ptr addrspace(5) [[TMP1]], align 8
1104; CHECK-NEXT:    call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[TMP2]])
1105; CHECK-NEXT:    store <4 x bfloat> [[TMP4]], ptr addrspace(5) [[TMP2]], align 8
1106; CHECK-NEXT:    [[TMP6:%.*]] = call zeroext i1 @__atomic_compare_exchange(i64 8, ptr [[TMP5]], ptr addrspace(5) [[TMP1]], ptr addrspace(5) [[TMP2]], i32 5, i32 5)
1107; CHECK-NEXT:    call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[TMP2]])
1108; CHECK-NEXT:    [[TMP7:%.*]] = load <4 x bfloat>, ptr addrspace(5) [[TMP1]], align 8
1109; CHECK-NEXT:    call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[TMP1]])
1110; CHECK-NEXT:    [[TMP8:%.*]] = insertvalue { <4 x bfloat>, i1 } poison, <4 x bfloat> [[TMP7]], 0
1111; CHECK-NEXT:    [[TMP9:%.*]] = insertvalue { <4 x bfloat>, i1 } [[TMP8]], i1 [[TMP6]], 1
1112; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { <4 x bfloat>, i1 } [[TMP9]], 1
1113; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { <4 x bfloat>, i1 } [[TMP9]], 0
1114; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
1115; CHECK:       atomicrmw.end:
1116; CHECK-NEXT:    ret <4 x bfloat> [[NEWLOADED]]
1117;
1118  %res = atomicrmw fmax ptr addrspace(1) %ptr, <4 x bfloat> %value syncscope("agent") seq_cst, align 2
1119  ret <4 x bfloat> %res
1120}
1121
1122define <4 x half> @test_atomicrmw_fmax_v4f16_global_agent_align4(ptr addrspace(1) %ptr, <4 x half> %value) {
1123; CHECK-LABEL: define <4 x half> @test_atomicrmw_fmax_v4f16_global_agent_align4(
1124; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]], <4 x half> [[VALUE:%.*]]) #[[ATTR0]] {
1125; CHECK-NEXT:    [[TMP1:%.*]] = alloca <4 x half>, align 8, addrspace(5)
1126; CHECK-NEXT:    [[TMP2:%.*]] = alloca <4 x half>, align 8, addrspace(5)
1127; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x half>, ptr addrspace(1) [[PTR]], align 4
1128; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]]
1129; CHECK:       atomicrmw.start:
1130; CHECK-NEXT:    [[LOADED:%.*]] = phi <4 x half> [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
1131; CHECK-NEXT:    [[TMP4:%.*]] = call <4 x half> @llvm.maxnum.v4f16(<4 x half> [[LOADED]], <4 x half> [[VALUE]])
1132; CHECK-NEXT:    [[TMP5:%.*]] = addrspacecast ptr addrspace(1) [[PTR]] to ptr
1133; CHECK-NEXT:    call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[TMP1]])
1134; CHECK-NEXT:    store <4 x half> [[LOADED]], ptr addrspace(5) [[TMP1]], align 8
1135; CHECK-NEXT:    call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[TMP2]])
1136; CHECK-NEXT:    store <4 x half> [[TMP4]], ptr addrspace(5) [[TMP2]], align 8
1137; CHECK-NEXT:    [[TMP6:%.*]] = call zeroext i1 @__atomic_compare_exchange(i64 8, ptr [[TMP5]], ptr addrspace(5) [[TMP1]], ptr addrspace(5) [[TMP2]], i32 5, i32 5)
1138; CHECK-NEXT:    call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[TMP2]])
1139; CHECK-NEXT:    [[TMP7:%.*]] = load <4 x half>, ptr addrspace(5) [[TMP1]], align 8
1140; CHECK-NEXT:    call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[TMP1]])
1141; CHECK-NEXT:    [[TMP8:%.*]] = insertvalue { <4 x half>, i1 } poison, <4 x half> [[TMP7]], 0
1142; CHECK-NEXT:    [[TMP9:%.*]] = insertvalue { <4 x half>, i1 } [[TMP8]], i1 [[TMP6]], 1
1143; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { <4 x half>, i1 } [[TMP9]], 1
1144; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { <4 x half>, i1 } [[TMP9]], 0
1145; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
1146; CHECK:       atomicrmw.end:
1147; CHECK-NEXT:    ret <4 x half> [[NEWLOADED]]
1148;
1149  %res = atomicrmw fmax ptr addrspace(1) %ptr, <4 x half> %value syncscope("agent") seq_cst, align 4
1150  ret <4 x half> %res
1151}
1152
1153define <4 x bfloat> @test_atomicrmw_fmax_v4bf16_global_agent_align4(ptr addrspace(1) %ptr, <4 x bfloat> %value) {
1154; CHECK-LABEL: define <4 x bfloat> @test_atomicrmw_fmax_v4bf16_global_agent_align4(
1155; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]], <4 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
1156; CHECK-NEXT:    [[TMP1:%.*]] = alloca <4 x bfloat>, align 8, addrspace(5)
1157; CHECK-NEXT:    [[TMP2:%.*]] = alloca <4 x bfloat>, align 8, addrspace(5)
1158; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x bfloat>, ptr addrspace(1) [[PTR]], align 4
1159; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]]
1160; CHECK:       atomicrmw.start:
1161; CHECK-NEXT:    [[LOADED:%.*]] = phi <4 x bfloat> [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
1162; CHECK-NEXT:    [[TMP4:%.*]] = call <4 x bfloat> @llvm.maxnum.v4bf16(<4 x bfloat> [[LOADED]], <4 x bfloat> [[VALUE]])
1163; CHECK-NEXT:    [[TMP5:%.*]] = addrspacecast ptr addrspace(1) [[PTR]] to ptr
1164; CHECK-NEXT:    call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[TMP1]])
1165; CHECK-NEXT:    store <4 x bfloat> [[LOADED]], ptr addrspace(5) [[TMP1]], align 8
1166; CHECK-NEXT:    call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[TMP2]])
1167; CHECK-NEXT:    store <4 x bfloat> [[TMP4]], ptr addrspace(5) [[TMP2]], align 8
1168; CHECK-NEXT:    [[TMP6:%.*]] = call zeroext i1 @__atomic_compare_exchange(i64 8, ptr [[TMP5]], ptr addrspace(5) [[TMP1]], ptr addrspace(5) [[TMP2]], i32 5, i32 5)
1169; CHECK-NEXT:    call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[TMP2]])
1170; CHECK-NEXT:    [[TMP7:%.*]] = load <4 x bfloat>, ptr addrspace(5) [[TMP1]], align 8
1171; CHECK-NEXT:    call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[TMP1]])
1172; CHECK-NEXT:    [[TMP8:%.*]] = insertvalue { <4 x bfloat>, i1 } poison, <4 x bfloat> [[TMP7]], 0
1173; CHECK-NEXT:    [[TMP9:%.*]] = insertvalue { <4 x bfloat>, i1 } [[TMP8]], i1 [[TMP6]], 1
1174; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { <4 x bfloat>, i1 } [[TMP9]], 1
1175; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { <4 x bfloat>, i1 } [[TMP9]], 0
1176; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
1177; CHECK:       atomicrmw.end:
1178; CHECK-NEXT:    ret <4 x bfloat> [[NEWLOADED]]
1179;
1180  %res = atomicrmw fmax ptr addrspace(1) %ptr, <4 x bfloat> %value syncscope("agent") seq_cst, align 4
1181  ret <4 x bfloat> %res
1182}
1183
1184define <4 x half> @test_atomicrmw_fmax_v4f16_global_agent_align8(ptr addrspace(1) %ptr, <4 x half> %value) {
1185; CHECK-LABEL: define <4 x half> @test_atomicrmw_fmax_v4f16_global_agent_align8(
1186; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]], <4 x half> [[VALUE:%.*]]) #[[ATTR0]] {
1187; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x half>, ptr addrspace(1) [[PTR]], align 8
1188; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]]
1189; CHECK:       atomicrmw.start:
1190; CHECK-NEXT:    [[LOADED:%.*]] = phi <4 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
1191; CHECK-NEXT:    [[TMP2:%.*]] = call <4 x half> @llvm.maxnum.v4f16(<4 x half> [[LOADED]], <4 x half> [[VALUE]])
1192; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x half> [[TMP2]] to i64
1193; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x half> [[LOADED]] to i64
1194; CHECK-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
1195; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
1196; CHECK-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
1197; CHECK-NEXT:    [[TMP6]] = bitcast i64 [[NEWLOADED]] to <4 x half>
1198; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
1199; CHECK:       atomicrmw.end:
1200; CHECK-NEXT:    ret <4 x half> [[TMP6]]
1201;
1202  %res = atomicrmw fmax ptr addrspace(1) %ptr, <4 x half> %value syncscope("agent") seq_cst, align 8
1203  ret <4 x half> %res
1204}
1205
1206define <4 x bfloat> @test_atomicrmw_fmax_v4bf16_global_agent_align8(ptr addrspace(1) %ptr, <4 x bfloat> %value) {
1207; CHECK-LABEL: define <4 x bfloat> @test_atomicrmw_fmax_v4bf16_global_agent_align8(
1208; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]], <4 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
1209; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x bfloat>, ptr addrspace(1) [[PTR]], align 8
1210; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]]
1211; CHECK:       atomicrmw.start:
1212; CHECK-NEXT:    [[LOADED:%.*]] = phi <4 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
1213; CHECK-NEXT:    [[TMP2:%.*]] = call <4 x bfloat> @llvm.maxnum.v4bf16(<4 x bfloat> [[LOADED]], <4 x bfloat> [[VALUE]])
1214; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x bfloat> [[TMP2]] to i64
1215; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x bfloat> [[LOADED]] to i64
1216; CHECK-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
1217; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
1218; CHECK-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
1219; CHECK-NEXT:    [[TMP6]] = bitcast i64 [[NEWLOADED]] to <4 x bfloat>
1220; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
1221; CHECK:       atomicrmw.end:
1222; CHECK-NEXT:    ret <4 x bfloat> [[TMP6]]
1223;
1224  %res = atomicrmw fmax ptr addrspace(1) %ptr, <4 x bfloat> %value syncscope("agent") seq_cst, align 8
1225  ret <4 x bfloat> %res
1226}
1227
1228define <2 x float> @test_atomicrmw_fmax_v2f32_global_agent_align8(ptr addrspace(1) %ptr, <2 x float> %value) {
1229; CHECK-LABEL: define <2 x float> @test_atomicrmw_fmax_v2f32_global_agent_align8(
1230; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x float> [[VALUE:%.*]]) #[[ATTR0]] {
1231; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x float>, ptr addrspace(1) [[PTR]], align 8
1232; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]]
1233; CHECK:       atomicrmw.start:
1234; CHECK-NEXT:    [[LOADED:%.*]] = phi <2 x float> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
1235; CHECK-NEXT:    [[TMP2:%.*]] = call <2 x float> @llvm.maxnum.v2f32(<2 x float> [[LOADED]], <2 x float> [[VALUE]])
1236; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x float> [[TMP2]] to i64
1237; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x float> [[LOADED]] to i64
1238; CHECK-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
1239; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
1240; CHECK-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
1241; CHECK-NEXT:    [[TMP6]] = bitcast i64 [[NEWLOADED]] to <2 x float>
1242; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
1243; CHECK:       atomicrmw.end:
1244; CHECK-NEXT:    ret <2 x float> [[TMP6]]
1245;
1246  %res = atomicrmw fmax ptr addrspace(1) %ptr, <2 x float> %value syncscope("agent") seq_cst, align 8
1247  ret <2 x float> %res
1248}
1249