1; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 2; RUN: opt -mtriple=amdgcn-amd-amdhsa -S -passes=atomic-expand -mcpu=gfx900 %s | FileCheck -check-prefixes=CHECK,GFX900 %s 3; RUN: opt -mtriple=amdgcn-amd-amdhsa -S -passes=atomic-expand -mcpu=gfx90a %s | FileCheck -check-prefixes=CHECK,GFX90A %s 4; RUN: opt -mtriple=amdgcn-amd-amdhsa -S -passes=atomic-expand -mcpu=gfx940 %s | FileCheck -check-prefixes=CHECK,GFX940 %s 5 6;--------------------------------------------------------------------- 7; atomicrmw fadd 8;--------------------------------------------------------------------- 9 10define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent_align2(ptr addrspace(1) %ptr, <2 x half> %value) { 11; CHECK-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent_align2( 12; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] { 13; CHECK-NEXT: [[TMP1:%.*]] = alloca <2 x half>, align 4, addrspace(5) 14; CHECK-NEXT: [[TMP2:%.*]] = alloca <2 x half>, align 4, addrspace(5) 15; CHECK-NEXT: [[TMP3:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 2 16; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]] 17; CHECK: atomicrmw.start: 18; CHECK-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] 19; CHECK-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] 20; CHECK-NEXT: [[TMP4:%.*]] = addrspacecast ptr addrspace(1) [[PTR]] to ptr 21; CHECK-NEXT: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[TMP1]]) 22; CHECK-NEXT: store <2 x half> [[LOADED]], ptr addrspace(5) [[TMP1]], align 4 23; CHECK-NEXT: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[TMP2]]) 24; CHECK-NEXT: store <2 x half> [[NEW]], ptr addrspace(5) [[TMP2]], align 4 25; CHECK-NEXT: [[TMP5:%.*]] = call zeroext i1 @__atomic_compare_exchange(i64 4, ptr [[TMP4]], ptr addrspace(5) [[TMP1]], ptr addrspace(5) [[TMP2]], i32 5, i32 5) 26; CHECK-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[TMP2]]) 27; CHECK-NEXT: [[TMP6:%.*]] = load <2 x half>, ptr addrspace(5) [[TMP1]], align 4 28; CHECK-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[TMP1]]) 29; CHECK-NEXT: [[TMP7:%.*]] = insertvalue { <2 x half>, i1 } poison, <2 x half> [[TMP6]], 0 30; CHECK-NEXT: [[TMP8:%.*]] = insertvalue { <2 x half>, i1 } [[TMP7]], i1 [[TMP5]], 1 31; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { <2 x half>, i1 } [[TMP8]], 1 32; CHECK-NEXT: [[NEWLOADED]] = extractvalue { <2 x half>, i1 } [[TMP8]], 0 33; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] 34; CHECK: atomicrmw.end: 35; CHECK-NEXT: ret <2 x half> [[NEWLOADED]] 36; 37 %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value syncscope("agent") seq_cst, align 2 38 ret <2 x half> %res 39} 40 41define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent_align2(ptr addrspace(1) %ptr, <2 x bfloat> %value) { 42; CHECK-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent_align2( 43; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { 44; CHECK-NEXT: [[TMP1:%.*]] = alloca <2 x bfloat>, align 4, addrspace(5) 45; CHECK-NEXT: [[TMP2:%.*]] = alloca <2 x bfloat>, align 4, addrspace(5) 46; CHECK-NEXT: [[TMP3:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 2 47; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]] 48; CHECK: atomicrmw.start: 49; CHECK-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] 50; CHECK-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] 51; CHECK-NEXT: [[TMP4:%.*]] = addrspacecast ptr addrspace(1) [[PTR]] to ptr 52; CHECK-NEXT: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[TMP1]]) 53; CHECK-NEXT: store <2 x bfloat> [[LOADED]], ptr addrspace(5) [[TMP1]], align 4 54; CHECK-NEXT: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[TMP2]]) 55; CHECK-NEXT: store <2 x bfloat> [[NEW]], ptr addrspace(5) [[TMP2]], align 4 56; CHECK-NEXT: [[TMP5:%.*]] = call zeroext i1 @__atomic_compare_exchange(i64 4, ptr [[TMP4]], ptr addrspace(5) [[TMP1]], ptr addrspace(5) [[TMP2]], i32 5, i32 5) 57; CHECK-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[TMP2]]) 58; CHECK-NEXT: [[TMP6:%.*]] = load <2 x bfloat>, ptr addrspace(5) [[TMP1]], align 4 59; CHECK-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[TMP1]]) 60; CHECK-NEXT: [[TMP7:%.*]] = insertvalue { <2 x bfloat>, i1 } poison, <2 x bfloat> [[TMP6]], 0 61; CHECK-NEXT: [[TMP8:%.*]] = insertvalue { <2 x bfloat>, i1 } [[TMP7]], i1 [[TMP5]], 1 62; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { <2 x bfloat>, i1 } [[TMP8]], 1 63; CHECK-NEXT: [[NEWLOADED]] = extractvalue { <2 x bfloat>, i1 } [[TMP8]], 0 64; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] 65; CHECK: atomicrmw.end: 66; CHECK-NEXT: ret <2 x bfloat> [[NEWLOADED]] 67; 68 %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %value syncscope("agent") seq_cst, align 2 69 ret <2 x bfloat> %res 70} 71 72define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent_align4(ptr addrspace(1) %ptr, <2 x half> %value) { 73; GFX900-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent_align4( 74; GFX900-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { 75; GFX900-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 76; GFX900-NEXT: br label [[ATOMICRMW_START:%.*]] 77; GFX900: atomicrmw.start: 78; GFX900-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] 79; GFX900-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] 80; GFX900-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 81; GFX900-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 82; GFX900-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 83; GFX900-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 84; GFX900-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 85; GFX900-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> 86; GFX900-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] 87; GFX900: atomicrmw.end: 88; GFX900-NEXT: ret <2 x half> [[TMP5]] 89; 90; GFX90A-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent_align4( 91; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { 92; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 93; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] 94; GFX90A: atomicrmw.start: 95; GFX90A-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ] 96; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] 97; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 98; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 99; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 100; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 101; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 102; GFX90A-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x half> 103; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] 104; GFX90A: atomicrmw.end: 105; GFX90A-NEXT: ret <2 x half> [[RES]] 106; 107; GFX940-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent_align4( 108; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { 109; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4 110; GFX940-NEXT: ret <2 x half> [[RES]] 111; 112 %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value syncscope("agent") seq_cst, align 4 113 ret <2 x half> %res 114} 115 116define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent_align4(ptr addrspace(1) %ptr, <2 x bfloat> %value) { 117; GFX900-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent_align4( 118; GFX900-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { 119; GFX900-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 120; GFX900-NEXT: br label [[ATOMICRMW_START:%.*]] 121; GFX900: atomicrmw.start: 122; GFX900-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] 123; GFX900-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] 124; GFX900-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 125; GFX900-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 126; GFX900-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 127; GFX900-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 128; GFX900-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 129; GFX900-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> 130; GFX900-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] 131; GFX900: atomicrmw.end: 132; GFX900-NEXT: ret <2 x bfloat> [[TMP5]] 133; 134; GFX90A-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent_align4( 135; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { 136; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 137; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] 138; GFX90A: atomicrmw.start: 139; GFX90A-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] 140; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] 141; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 142; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 143; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 144; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 145; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 146; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> 147; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] 148; GFX90A: atomicrmw.end: 149; GFX90A-NEXT: ret <2 x bfloat> [[TMP5]] 150; 151; GFX940-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent_align4( 152; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { 153; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] syncscope("agent") seq_cst, align 4 154; GFX940-NEXT: ret <2 x bfloat> [[RES]] 155; 156 %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %value syncscope("agent") seq_cst, align 4 157 ret <2 x bfloat> %res 158} 159 160define <4 x half> @test_atomicrmw_fadd_v4f16_global_agent_align2(ptr addrspace(1) %ptr, <4 x half> %value) { 161; CHECK-LABEL: define <4 x half> @test_atomicrmw_fadd_v4f16_global_agent_align2( 162; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]], <4 x half> [[VALUE:%.*]]) #[[ATTR0]] { 163; CHECK-NEXT: [[TMP1:%.*]] = alloca <4 x half>, align 8, addrspace(5) 164; CHECK-NEXT: [[TMP2:%.*]] = alloca <4 x half>, align 8, addrspace(5) 165; CHECK-NEXT: [[TMP3:%.*]] = load <4 x half>, ptr addrspace(1) [[PTR]], align 2 166; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]] 167; CHECK: atomicrmw.start: 168; CHECK-NEXT: [[LOADED:%.*]] = phi <4 x half> [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] 169; CHECK-NEXT: [[NEW:%.*]] = fadd <4 x half> [[LOADED]], [[VALUE]] 170; CHECK-NEXT: [[TMP4:%.*]] = addrspacecast ptr addrspace(1) [[PTR]] to ptr 171; CHECK-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[TMP1]]) 172; CHECK-NEXT: store <4 x half> [[LOADED]], ptr addrspace(5) [[TMP1]], align 8 173; CHECK-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[TMP2]]) 174; CHECK-NEXT: store <4 x half> [[NEW]], ptr addrspace(5) [[TMP2]], align 8 175; CHECK-NEXT: [[TMP5:%.*]] = call zeroext i1 @__atomic_compare_exchange(i64 8, ptr [[TMP4]], ptr addrspace(5) [[TMP1]], ptr addrspace(5) [[TMP2]], i32 5, i32 5) 176; CHECK-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[TMP2]]) 177; CHECK-NEXT: [[TMP6:%.*]] = load <4 x half>, ptr addrspace(5) [[TMP1]], align 8 178; CHECK-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[TMP1]]) 179; CHECK-NEXT: [[TMP7:%.*]] = insertvalue { <4 x half>, i1 } poison, <4 x half> [[TMP6]], 0 180; CHECK-NEXT: [[TMP8:%.*]] = insertvalue { <4 x half>, i1 } [[TMP7]], i1 [[TMP5]], 1 181; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { <4 x half>, i1 } [[TMP8]], 1 182; CHECK-NEXT: [[NEWLOADED]] = extractvalue { <4 x half>, i1 } [[TMP8]], 0 183; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] 184; CHECK: atomicrmw.end: 185; CHECK-NEXT: ret <4 x half> [[NEWLOADED]] 186; 187 %res = atomicrmw fadd ptr addrspace(1) %ptr, <4 x half> %value syncscope("agent") seq_cst, align 2 188 ret <4 x half> %res 189} 190 191define <4 x bfloat> @test_atomicrmw_fadd_v4bf16_global_agent_align2(ptr addrspace(1) %ptr, <4 x bfloat> %value) { 192; CHECK-LABEL: define <4 x bfloat> @test_atomicrmw_fadd_v4bf16_global_agent_align2( 193; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]], <4 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { 194; CHECK-NEXT: [[TMP1:%.*]] = alloca <4 x bfloat>, align 8, addrspace(5) 195; CHECK-NEXT: [[TMP2:%.*]] = alloca <4 x bfloat>, align 8, addrspace(5) 196; CHECK-NEXT: [[TMP3:%.*]] = load <4 x bfloat>, ptr addrspace(1) [[PTR]], align 2 197; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]] 198; CHECK: atomicrmw.start: 199; CHECK-NEXT: [[LOADED:%.*]] = phi <4 x bfloat> [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] 200; CHECK-NEXT: [[NEW:%.*]] = fadd <4 x bfloat> [[LOADED]], [[VALUE]] 201; CHECK-NEXT: [[TMP4:%.*]] = addrspacecast ptr addrspace(1) [[PTR]] to ptr 202; CHECK-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[TMP1]]) 203; CHECK-NEXT: store <4 x bfloat> [[LOADED]], ptr addrspace(5) [[TMP1]], align 8 204; CHECK-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[TMP2]]) 205; CHECK-NEXT: store <4 x bfloat> [[NEW]], ptr addrspace(5) [[TMP2]], align 8 206; CHECK-NEXT: [[TMP5:%.*]] = call zeroext i1 @__atomic_compare_exchange(i64 8, ptr [[TMP4]], ptr addrspace(5) [[TMP1]], ptr addrspace(5) [[TMP2]], i32 5, i32 5) 207; CHECK-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[TMP2]]) 208; CHECK-NEXT: [[TMP6:%.*]] = load <4 x bfloat>, ptr addrspace(5) [[TMP1]], align 8 209; CHECK-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[TMP1]]) 210; CHECK-NEXT: [[TMP7:%.*]] = insertvalue { <4 x bfloat>, i1 } poison, <4 x bfloat> [[TMP6]], 0 211; CHECK-NEXT: [[TMP8:%.*]] = insertvalue { <4 x bfloat>, i1 } [[TMP7]], i1 [[TMP5]], 1 212; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { <4 x bfloat>, i1 } [[TMP8]], 1 213; CHECK-NEXT: [[NEWLOADED]] = extractvalue { <4 x bfloat>, i1 } [[TMP8]], 0 214; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] 215; CHECK: atomicrmw.end: 216; CHECK-NEXT: ret <4 x bfloat> [[NEWLOADED]] 217; 218 %res = atomicrmw fadd ptr addrspace(1) %ptr, <4 x bfloat> %value syncscope("agent") seq_cst, align 2 219 ret <4 x bfloat> %res 220} 221 222define <4 x half> @test_atomicrmw_fadd_v4f16_global_agent_align4(ptr addrspace(1) %ptr, <4 x half> %value) { 223; CHECK-LABEL: define <4 x half> @test_atomicrmw_fadd_v4f16_global_agent_align4( 224; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]], <4 x half> [[VALUE:%.*]]) #[[ATTR0]] { 225; CHECK-NEXT: [[TMP1:%.*]] = alloca <4 x half>, align 8, addrspace(5) 226; CHECK-NEXT: [[TMP2:%.*]] = alloca <4 x half>, align 8, addrspace(5) 227; CHECK-NEXT: [[TMP3:%.*]] = load <4 x half>, ptr addrspace(1) [[PTR]], align 4 228; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]] 229; CHECK: atomicrmw.start: 230; CHECK-NEXT: [[LOADED:%.*]] = phi <4 x half> [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] 231; CHECK-NEXT: [[NEW:%.*]] = fadd <4 x half> [[LOADED]], [[VALUE]] 232; CHECK-NEXT: [[TMP4:%.*]] = addrspacecast ptr addrspace(1) [[PTR]] to ptr 233; CHECK-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[TMP1]]) 234; CHECK-NEXT: store <4 x half> [[LOADED]], ptr addrspace(5) [[TMP1]], align 8 235; CHECK-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[TMP2]]) 236; CHECK-NEXT: store <4 x half> [[NEW]], ptr addrspace(5) [[TMP2]], align 8 237; CHECK-NEXT: [[TMP5:%.*]] = call zeroext i1 @__atomic_compare_exchange(i64 8, ptr [[TMP4]], ptr addrspace(5) [[TMP1]], ptr addrspace(5) [[TMP2]], i32 5, i32 5) 238; CHECK-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[TMP2]]) 239; CHECK-NEXT: [[TMP6:%.*]] = load <4 x half>, ptr addrspace(5) [[TMP1]], align 8 240; CHECK-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[TMP1]]) 241; CHECK-NEXT: [[TMP7:%.*]] = insertvalue { <4 x half>, i1 } poison, <4 x half> [[TMP6]], 0 242; CHECK-NEXT: [[TMP8:%.*]] = insertvalue { <4 x half>, i1 } [[TMP7]], i1 [[TMP5]], 1 243; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { <4 x half>, i1 } [[TMP8]], 1 244; CHECK-NEXT: [[NEWLOADED]] = extractvalue { <4 x half>, i1 } [[TMP8]], 0 245; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] 246; CHECK: atomicrmw.end: 247; CHECK-NEXT: ret <4 x half> [[NEWLOADED]] 248; 249 %res = atomicrmw fadd ptr addrspace(1) %ptr, <4 x half> %value syncscope("agent") seq_cst, align 4 250 ret <4 x half> %res 251} 252 253define <4 x bfloat> @test_atomicrmw_fadd_v4bf16_global_agent_align4(ptr addrspace(1) %ptr, <4 x bfloat> %value) { 254; CHECK-LABEL: define <4 x bfloat> @test_atomicrmw_fadd_v4bf16_global_agent_align4( 255; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]], <4 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { 256; CHECK-NEXT: [[TMP1:%.*]] = alloca <4 x bfloat>, align 8, addrspace(5) 257; CHECK-NEXT: [[TMP2:%.*]] = alloca <4 x bfloat>, align 8, addrspace(5) 258; CHECK-NEXT: [[TMP3:%.*]] = load <4 x bfloat>, ptr addrspace(1) [[PTR]], align 4 259; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]] 260; CHECK: atomicrmw.start: 261; CHECK-NEXT: [[LOADED:%.*]] = phi <4 x bfloat> [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] 262; CHECK-NEXT: [[NEW:%.*]] = fadd <4 x bfloat> [[LOADED]], [[VALUE]] 263; CHECK-NEXT: [[TMP4:%.*]] = addrspacecast ptr addrspace(1) [[PTR]] to ptr 264; CHECK-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[TMP1]]) 265; CHECK-NEXT: store <4 x bfloat> [[LOADED]], ptr addrspace(5) [[TMP1]], align 8 266; CHECK-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[TMP2]]) 267; CHECK-NEXT: store <4 x bfloat> [[NEW]], ptr addrspace(5) [[TMP2]], align 8 268; CHECK-NEXT: [[TMP5:%.*]] = call zeroext i1 @__atomic_compare_exchange(i64 8, ptr [[TMP4]], ptr addrspace(5) [[TMP1]], ptr addrspace(5) [[TMP2]], i32 5, i32 5) 269; CHECK-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[TMP2]]) 270; CHECK-NEXT: [[TMP6:%.*]] = load <4 x bfloat>, ptr addrspace(5) [[TMP1]], align 8 271; CHECK-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[TMP1]]) 272; CHECK-NEXT: [[TMP7:%.*]] = insertvalue { <4 x bfloat>, i1 } poison, <4 x bfloat> [[TMP6]], 0 273; CHECK-NEXT: [[TMP8:%.*]] = insertvalue { <4 x bfloat>, i1 } [[TMP7]], i1 [[TMP5]], 1 274; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { <4 x bfloat>, i1 } [[TMP8]], 1 275; CHECK-NEXT: [[NEWLOADED]] = extractvalue { <4 x bfloat>, i1 } [[TMP8]], 0 276; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] 277; CHECK: atomicrmw.end: 278; CHECK-NEXT: ret <4 x bfloat> [[NEWLOADED]] 279; 280 %res = atomicrmw fadd ptr addrspace(1) %ptr, <4 x bfloat> %value syncscope("agent") seq_cst, align 4 281 ret <4 x bfloat> %res 282} 283 284define <4 x half> @test_atomicrmw_fadd_v4f16_global_agent_align8(ptr addrspace(1) %ptr, <4 x half> %value) { 285; CHECK-LABEL: define <4 x half> @test_atomicrmw_fadd_v4f16_global_agent_align8( 286; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]], <4 x half> [[VALUE:%.*]]) #[[ATTR0]] { 287; CHECK-NEXT: [[TMP1:%.*]] = load <4 x half>, ptr addrspace(1) [[PTR]], align 8 288; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]] 289; CHECK: atomicrmw.start: 290; CHECK-NEXT: [[LOADED:%.*]] = phi <4 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] 291; CHECK-NEXT: [[NEW:%.*]] = fadd <4 x half> [[LOADED]], [[VALUE]] 292; CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x half> [[NEW]] to i64 293; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x half> [[LOADED]] to i64 294; CHECK-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8 295; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 296; CHECK-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 297; CHECK-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to <4 x half> 298; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] 299; CHECK: atomicrmw.end: 300; CHECK-NEXT: ret <4 x half> [[TMP5]] 301; 302 %res = atomicrmw fadd ptr addrspace(1) %ptr, <4 x half> %value syncscope("agent") seq_cst, align 8 303 ret <4 x half> %res 304} 305 306define <4 x bfloat> @test_atomicrmw_fadd_v4bf16_global_agent_align8(ptr addrspace(1) %ptr, <4 x bfloat> %value) { 307; CHECK-LABEL: define <4 x bfloat> @test_atomicrmw_fadd_v4bf16_global_agent_align8( 308; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]], <4 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { 309; CHECK-NEXT: [[TMP1:%.*]] = load <4 x bfloat>, ptr addrspace(1) [[PTR]], align 8 310; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]] 311; CHECK: atomicrmw.start: 312; CHECK-NEXT: [[LOADED:%.*]] = phi <4 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] 313; CHECK-NEXT: [[NEW:%.*]] = fadd <4 x bfloat> [[LOADED]], [[VALUE]] 314; CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x bfloat> [[NEW]] to i64 315; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x bfloat> [[LOADED]] to i64 316; CHECK-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8 317; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 318; CHECK-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 319; CHECK-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to <4 x bfloat> 320; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] 321; CHECK: atomicrmw.end: 322; CHECK-NEXT: ret <4 x bfloat> [[TMP5]] 323; 324 %res = atomicrmw fadd ptr addrspace(1) %ptr, <4 x bfloat> %value syncscope("agent") seq_cst, align 8 325 ret <4 x bfloat> %res 326} 327 328define <2 x float> @test_atomicrmw_fadd_v2f32_global_agent_align8(ptr addrspace(1) %ptr, <2 x float> %value) { 329; CHECK-LABEL: define <2 x float> @test_atomicrmw_fadd_v2f32_global_agent_align8( 330; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x float> [[VALUE:%.*]]) #[[ATTR0]] { 331; CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr addrspace(1) [[PTR]], align 8 332; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]] 333; CHECK: atomicrmw.start: 334; CHECK-NEXT: [[LOADED:%.*]] = phi <2 x float> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] 335; CHECK-NEXT: [[NEW:%.*]] = fadd <2 x float> [[LOADED]], [[VALUE]] 336; CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x float> [[NEW]] to i64 337; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x float> [[LOADED]] to i64 338; CHECK-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8 339; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 340; CHECK-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 341; CHECK-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to <2 x float> 342; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] 343; CHECK: atomicrmw.end: 344; CHECK-NEXT: ret <2 x float> [[TMP5]] 345; 346 %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x float> %value syncscope("agent") seq_cst, align 8 347 ret <2 x float> %res 348} 349 350;--------------------------------------------------------------------- 351; atomicrmw fsub 352;--------------------------------------------------------------------- 353 354define <2 x half> @test_atomicrmw_fsub_v2f16_global_agent_align2(ptr addrspace(1) %ptr, <2 x half> %value) { 355; CHECK-LABEL: define <2 x half> @test_atomicrmw_fsub_v2f16_global_agent_align2( 356; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { 357; CHECK-NEXT: [[TMP1:%.*]] = alloca <2 x half>, align 4, addrspace(5) 358; CHECK-NEXT: [[TMP2:%.*]] = alloca <2 x half>, align 4, addrspace(5) 359; CHECK-NEXT: [[TMP3:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 2 360; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]] 361; CHECK: atomicrmw.start: 362; CHECK-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] 363; CHECK-NEXT: [[NEW:%.*]] = fsub <2 x half> [[LOADED]], [[VALUE]] 364; CHECK-NEXT: [[TMP4:%.*]] = addrspacecast ptr addrspace(1) [[PTR]] to ptr 365; CHECK-NEXT: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[TMP1]]) 366; CHECK-NEXT: store <2 x half> [[LOADED]], ptr addrspace(5) [[TMP1]], align 4 367; CHECK-NEXT: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[TMP2]]) 368; CHECK-NEXT: store <2 x half> [[NEW]], ptr addrspace(5) [[TMP2]], align 4 369; CHECK-NEXT: [[TMP5:%.*]] = call zeroext i1 @__atomic_compare_exchange(i64 4, ptr [[TMP4]], ptr addrspace(5) [[TMP1]], ptr addrspace(5) [[TMP2]], i32 5, i32 5) 370; CHECK-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[TMP2]]) 371; CHECK-NEXT: [[TMP6:%.*]] = load <2 x half>, ptr addrspace(5) [[TMP1]], align 4 372; CHECK-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[TMP1]]) 373; CHECK-NEXT: [[TMP7:%.*]] = insertvalue { <2 x half>, i1 } poison, <2 x half> [[TMP6]], 0 374; CHECK-NEXT: [[TMP8:%.*]] = insertvalue { <2 x half>, i1 } [[TMP7]], i1 [[TMP5]], 1 375; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { <2 x half>, i1 } [[TMP8]], 1 376; CHECK-NEXT: [[NEWLOADED]] = extractvalue { <2 x half>, i1 } [[TMP8]], 0 377; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] 378; CHECK: atomicrmw.end: 379; CHECK-NEXT: ret <2 x half> [[NEWLOADED]] 380; 381 %res = atomicrmw fsub ptr addrspace(1) %ptr, <2 x half> %value syncscope("agent") seq_cst, align 2 382 ret <2 x half> %res 383} 384 385define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_agent_align2(ptr addrspace(1) %ptr, <2 x bfloat> %value) { 386; CHECK-LABEL: define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_agent_align2( 387; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { 388; CHECK-NEXT: [[TMP1:%.*]] = alloca <2 x bfloat>, align 4, addrspace(5) 389; CHECK-NEXT: [[TMP2:%.*]] = alloca <2 x bfloat>, align 4, addrspace(5) 390; CHECK-NEXT: [[TMP3:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 2 391; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]] 392; CHECK: atomicrmw.start: 393; CHECK-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] 394; CHECK-NEXT: [[NEW:%.*]] = fsub <2 x bfloat> [[LOADED]], [[VALUE]] 395; CHECK-NEXT: [[TMP4:%.*]] = addrspacecast ptr addrspace(1) [[PTR]] to ptr 396; CHECK-NEXT: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[TMP1]]) 397; CHECK-NEXT: store <2 x bfloat> [[LOADED]], ptr addrspace(5) [[TMP1]], align 4 398; CHECK-NEXT: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[TMP2]]) 399; CHECK-NEXT: store <2 x bfloat> [[NEW]], ptr addrspace(5) [[TMP2]], align 4 400; CHECK-NEXT: [[TMP5:%.*]] = call zeroext i1 @__atomic_compare_exchange(i64 4, ptr [[TMP4]], ptr addrspace(5) [[TMP1]], ptr addrspace(5) [[TMP2]], i32 5, i32 5) 401; CHECK-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[TMP2]]) 402; CHECK-NEXT: [[TMP6:%.*]] = load <2 x bfloat>, ptr addrspace(5) [[TMP1]], align 4 403; CHECK-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[TMP1]]) 404; CHECK-NEXT: [[TMP7:%.*]] = insertvalue { <2 x bfloat>, i1 } poison, <2 x bfloat> [[TMP6]], 0 405; CHECK-NEXT: [[TMP8:%.*]] = insertvalue { <2 x bfloat>, i1 } [[TMP7]], i1 [[TMP5]], 1 406; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { <2 x bfloat>, i1 } [[TMP8]], 1 407; CHECK-NEXT: [[NEWLOADED]] = extractvalue { <2 x bfloat>, i1 } [[TMP8]], 0 408; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] 409; CHECK: atomicrmw.end: 410; CHECK-NEXT: ret <2 x bfloat> [[NEWLOADED]] 411; 412 %res = atomicrmw fsub ptr addrspace(1) %ptr, <2 x bfloat> %value syncscope("agent") seq_cst, align 2 413 ret <2 x bfloat> %res 414} 415 416define <2 x half> @test_atomicrmw_fsub_v2f16_global_agent_align4(ptr addrspace(1) %ptr, <2 x half> %value) { 417; CHECK-LABEL: define <2 x half> @test_atomicrmw_fsub_v2f16_global_agent_align4( 418; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { 419; CHECK-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 420; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]] 421; CHECK: atomicrmw.start: 422; CHECK-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] 423; CHECK-NEXT: [[NEW:%.*]] = fsub <2 x half> [[LOADED]], [[VALUE]] 424; CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 425; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 426; CHECK-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 427; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 428; CHECK-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 429; CHECK-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> 430; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] 431; CHECK: atomicrmw.end: 432; CHECK-NEXT: ret <2 x half> [[TMP5]] 433; 434 %res = atomicrmw fsub ptr addrspace(1) %ptr, <2 x half> %value syncscope("agent") seq_cst, align 4 435 ret <2 x half> %res 436} 437 438define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_agent_align4(ptr addrspace(1) %ptr, <2 x bfloat> %value) { 439; CHECK-LABEL: define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_agent_align4( 440; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { 441; CHECK-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 442; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]] 443; CHECK: atomicrmw.start: 444; CHECK-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] 445; CHECK-NEXT: [[NEW:%.*]] = fsub <2 x bfloat> [[LOADED]], [[VALUE]] 446; CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 447; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 448; CHECK-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 449; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 450; CHECK-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 451; CHECK-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> 452; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] 453; CHECK: atomicrmw.end: 454; CHECK-NEXT: ret <2 x bfloat> [[TMP5]] 455; 456 %res = atomicrmw fsub ptr addrspace(1) %ptr, <2 x bfloat> %value syncscope("agent") seq_cst, align 4 457 ret <2 x bfloat> %res 458} 459 460define <4 x half> @test_atomicrmw_fsub_v4f16_global_agent_align2(ptr addrspace(1) %ptr, <4 x half> %value) { 461; CHECK-LABEL: define <4 x half> @test_atomicrmw_fsub_v4f16_global_agent_align2( 462; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]], <4 x half> [[VALUE:%.*]]) #[[ATTR0]] { 463; CHECK-NEXT: [[TMP1:%.*]] = alloca <4 x half>, align 8, addrspace(5) 464; CHECK-NEXT: [[TMP2:%.*]] = alloca <4 x half>, align 8, addrspace(5) 465; CHECK-NEXT: [[TMP3:%.*]] = load <4 x half>, ptr addrspace(1) [[PTR]], align 2 466; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]] 467; CHECK: atomicrmw.start: 468; CHECK-NEXT: [[LOADED:%.*]] = phi <4 x half> [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] 469; CHECK-NEXT: [[NEW:%.*]] = fsub <4 x half> [[LOADED]], [[VALUE]] 470; CHECK-NEXT: [[TMP4:%.*]] = addrspacecast ptr addrspace(1) [[PTR]] to ptr 471; CHECK-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[TMP1]]) 472; CHECK-NEXT: store <4 x half> [[LOADED]], ptr addrspace(5) [[TMP1]], align 8 473; CHECK-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[TMP2]]) 474; CHECK-NEXT: store <4 x half> [[NEW]], ptr addrspace(5) [[TMP2]], align 8 475; CHECK-NEXT: [[TMP5:%.*]] = call zeroext i1 @__atomic_compare_exchange(i64 8, ptr [[TMP4]], ptr addrspace(5) [[TMP1]], ptr addrspace(5) [[TMP2]], i32 5, i32 5) 476; CHECK-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[TMP2]]) 477; CHECK-NEXT: [[TMP6:%.*]] = load <4 x half>, ptr addrspace(5) [[TMP1]], align 8 478; CHECK-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[TMP1]]) 479; CHECK-NEXT: [[TMP7:%.*]] = insertvalue { <4 x half>, i1 } poison, <4 x half> [[TMP6]], 0 480; CHECK-NEXT: [[TMP8:%.*]] = insertvalue { <4 x half>, i1 } [[TMP7]], i1 [[TMP5]], 1 481; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { <4 x half>, i1 } [[TMP8]], 1 482; CHECK-NEXT: [[NEWLOADED]] = extractvalue { <4 x half>, i1 } [[TMP8]], 0 483; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] 484; CHECK: atomicrmw.end: 485; CHECK-NEXT: ret <4 x half> [[NEWLOADED]] 486; 487 %res = atomicrmw fsub ptr addrspace(1) %ptr, <4 x half> %value syncscope("agent") seq_cst, align 2 488 ret <4 x half> %res 489} 490 491define <4 x bfloat> @test_atomicrmw_fsub_v4bf16_global_agent_align2(ptr addrspace(1) %ptr, <4 x bfloat> %value) { 492; CHECK-LABEL: define <4 x bfloat> @test_atomicrmw_fsub_v4bf16_global_agent_align2( 493; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]], <4 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { 494; CHECK-NEXT: [[TMP1:%.*]] = alloca <4 x bfloat>, align 8, addrspace(5) 495; CHECK-NEXT: [[TMP2:%.*]] = alloca <4 x bfloat>, align 8, addrspace(5) 496; CHECK-NEXT: [[TMP3:%.*]] = load <4 x bfloat>, ptr addrspace(1) [[PTR]], align 2 497; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]] 498; CHECK: atomicrmw.start: 499; CHECK-NEXT: [[LOADED:%.*]] = phi <4 x bfloat> [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] 500; CHECK-NEXT: [[NEW:%.*]] = fsub <4 x bfloat> [[LOADED]], [[VALUE]] 501; CHECK-NEXT: [[TMP4:%.*]] = addrspacecast ptr addrspace(1) [[PTR]] to ptr 502; CHECK-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[TMP1]]) 503; CHECK-NEXT: store <4 x bfloat> [[LOADED]], ptr addrspace(5) [[TMP1]], align 8 504; CHECK-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[TMP2]]) 505; CHECK-NEXT: store <4 x bfloat> [[NEW]], ptr addrspace(5) [[TMP2]], align 8 506; CHECK-NEXT: [[TMP5:%.*]] = call zeroext i1 @__atomic_compare_exchange(i64 8, ptr [[TMP4]], ptr addrspace(5) [[TMP1]], ptr addrspace(5) [[TMP2]], i32 5, i32 5) 507; CHECK-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[TMP2]]) 508; CHECK-NEXT: [[TMP6:%.*]] = load <4 x bfloat>, ptr addrspace(5) [[TMP1]], align 8 509; CHECK-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[TMP1]]) 510; CHECK-NEXT: [[TMP7:%.*]] = insertvalue { <4 x bfloat>, i1 } poison, <4 x bfloat> [[TMP6]], 0 511; CHECK-NEXT: [[TMP8:%.*]] = insertvalue { <4 x bfloat>, i1 } [[TMP7]], i1 [[TMP5]], 1 512; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { <4 x bfloat>, i1 } [[TMP8]], 1 513; CHECK-NEXT: [[NEWLOADED]] = extractvalue { <4 x bfloat>, i1 } [[TMP8]], 0 514; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] 515; CHECK: atomicrmw.end: 516; CHECK-NEXT: ret <4 x bfloat> [[NEWLOADED]] 517; 518 %res = atomicrmw fsub ptr addrspace(1) %ptr, <4 x bfloat> %value syncscope("agent") seq_cst, align 2 519 ret <4 x bfloat> %res 520} 521 522define <4 x half> @test_atomicrmw_fsub_v4f16_global_agent_align4(ptr addrspace(1) %ptr, <4 x half> %value) { 523; CHECK-LABEL: define <4 x half> @test_atomicrmw_fsub_v4f16_global_agent_align4( 524; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]], <4 x half> [[VALUE:%.*]]) #[[ATTR0]] { 525; CHECK-NEXT: [[TMP1:%.*]] = alloca <4 x half>, align 8, addrspace(5) 526; CHECK-NEXT: [[TMP2:%.*]] = alloca <4 x half>, align 8, addrspace(5) 527; CHECK-NEXT: [[TMP3:%.*]] = load <4 x half>, ptr addrspace(1) [[PTR]], align 4 528; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]] 529; CHECK: atomicrmw.start: 530; CHECK-NEXT: [[LOADED:%.*]] = phi <4 x half> [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] 531; CHECK-NEXT: [[NEW:%.*]] = fsub <4 x half> [[LOADED]], [[VALUE]] 532; CHECK-NEXT: [[TMP4:%.*]] = addrspacecast ptr addrspace(1) [[PTR]] to ptr 533; CHECK-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[TMP1]]) 534; CHECK-NEXT: store <4 x half> [[LOADED]], ptr addrspace(5) [[TMP1]], align 8 535; CHECK-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[TMP2]]) 536; CHECK-NEXT: store <4 x half> [[NEW]], ptr addrspace(5) [[TMP2]], align 8 537; CHECK-NEXT: [[TMP5:%.*]] = call zeroext i1 @__atomic_compare_exchange(i64 8, ptr [[TMP4]], ptr addrspace(5) [[TMP1]], ptr addrspace(5) [[TMP2]], i32 5, i32 5) 538; CHECK-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[TMP2]]) 539; CHECK-NEXT: [[TMP6:%.*]] = load <4 x half>, ptr addrspace(5) [[TMP1]], align 8 540; CHECK-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[TMP1]]) 541; CHECK-NEXT: [[TMP7:%.*]] = insertvalue { <4 x half>, i1 } poison, <4 x half> [[TMP6]], 0 542; CHECK-NEXT: [[TMP8:%.*]] = insertvalue { <4 x half>, i1 } [[TMP7]], i1 [[TMP5]], 1 543; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { <4 x half>, i1 } [[TMP8]], 1 544; CHECK-NEXT: [[NEWLOADED]] = extractvalue { <4 x half>, i1 } [[TMP8]], 0 545; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] 546; CHECK: atomicrmw.end: 547; CHECK-NEXT: ret <4 x half> [[NEWLOADED]] 548; 549 %res = atomicrmw fsub ptr addrspace(1) %ptr, <4 x half> %value syncscope("agent") seq_cst, align 4 550 ret <4 x half> %res 551} 552 553define <4 x bfloat> @test_atomicrmw_fsub_v4bf16_global_agent_align4(ptr addrspace(1) %ptr, <4 x bfloat> %value) { 554; CHECK-LABEL: define <4 x bfloat> @test_atomicrmw_fsub_v4bf16_global_agent_align4( 555; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]], <4 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { 556; CHECK-NEXT: [[TMP1:%.*]] = alloca <4 x bfloat>, align 8, addrspace(5) 557; CHECK-NEXT: [[TMP2:%.*]] = alloca <4 x bfloat>, align 8, addrspace(5) 558; CHECK-NEXT: [[TMP3:%.*]] = load <4 x bfloat>, ptr addrspace(1) [[PTR]], align 4 559; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]] 560; CHECK: atomicrmw.start: 561; CHECK-NEXT: [[LOADED:%.*]] = phi <4 x bfloat> [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] 562; CHECK-NEXT: [[NEW:%.*]] = fsub <4 x bfloat> [[LOADED]], [[VALUE]] 563; CHECK-NEXT: [[TMP4:%.*]] = addrspacecast ptr addrspace(1) [[PTR]] to ptr 564; CHECK-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[TMP1]]) 565; CHECK-NEXT: store <4 x bfloat> [[LOADED]], ptr addrspace(5) [[TMP1]], align 8 566; CHECK-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[TMP2]]) 567; CHECK-NEXT: store <4 x bfloat> [[NEW]], ptr addrspace(5) [[TMP2]], align 8 568; CHECK-NEXT: [[TMP5:%.*]] = call zeroext i1 @__atomic_compare_exchange(i64 8, ptr [[TMP4]], ptr addrspace(5) [[TMP1]], ptr addrspace(5) [[TMP2]], i32 5, i32 5) 569; CHECK-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[TMP2]]) 570; CHECK-NEXT: [[TMP6:%.*]] = load <4 x bfloat>, ptr addrspace(5) [[TMP1]], align 8 571; CHECK-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[TMP1]]) 572; CHECK-NEXT: [[TMP7:%.*]] = insertvalue { <4 x bfloat>, i1 } poison, <4 x bfloat> [[TMP6]], 0 573; CHECK-NEXT: [[TMP8:%.*]] = insertvalue { <4 x bfloat>, i1 } [[TMP7]], i1 [[TMP5]], 1 574; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { <4 x bfloat>, i1 } [[TMP8]], 1 575; CHECK-NEXT: [[NEWLOADED]] = extractvalue { <4 x bfloat>, i1 } [[TMP8]], 0 576; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] 577; CHECK: atomicrmw.end: 578; CHECK-NEXT: ret <4 x bfloat> [[NEWLOADED]] 579; 580 %res = atomicrmw fsub ptr addrspace(1) %ptr, <4 x bfloat> %value syncscope("agent") seq_cst, align 4 581 ret <4 x bfloat> %res 582} 583 584define <4 x half> @test_atomicrmw_fsub_v4f16_global_agent_align8(ptr addrspace(1) %ptr, <4 x half> %value) { 585; CHECK-LABEL: define <4 x half> @test_atomicrmw_fsub_v4f16_global_agent_align8( 586; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]], <4 x half> [[VALUE:%.*]]) #[[ATTR0]] { 587; CHECK-NEXT: [[TMP1:%.*]] = load <4 x half>, ptr addrspace(1) [[PTR]], align 8 588; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]] 589; CHECK: atomicrmw.start: 590; CHECK-NEXT: [[LOADED:%.*]] = phi <4 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] 591; CHECK-NEXT: [[NEW:%.*]] = fsub <4 x half> [[LOADED]], [[VALUE]] 592; CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x half> [[NEW]] to i64 593; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x half> [[LOADED]] to i64 594; CHECK-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8 595; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 596; CHECK-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 597; CHECK-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to <4 x half> 598; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] 599; CHECK: atomicrmw.end: 600; CHECK-NEXT: ret <4 x half> [[TMP5]] 601; 602 %res = atomicrmw fsub ptr addrspace(1) %ptr, <4 x half> %value syncscope("agent") seq_cst, align 8 603 ret <4 x half> %res 604} 605 606define <4 x bfloat> @test_atomicrmw_fsub_v4bf16_global_agent_align8(ptr addrspace(1) %ptr, <4 x bfloat> %value) { 607; CHECK-LABEL: define <4 x bfloat> @test_atomicrmw_fsub_v4bf16_global_agent_align8( 608; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]], <4 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { 609; CHECK-NEXT: [[TMP1:%.*]] = load <4 x bfloat>, ptr addrspace(1) [[PTR]], align 8 610; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]] 611; CHECK: atomicrmw.start: 612; CHECK-NEXT: [[LOADED:%.*]] = phi <4 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] 613; CHECK-NEXT: [[NEW:%.*]] = fsub <4 x bfloat> [[LOADED]], [[VALUE]] 614; CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x bfloat> [[NEW]] to i64 615; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x bfloat> [[LOADED]] to i64 616; CHECK-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8 617; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 618; CHECK-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 619; CHECK-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to <4 x bfloat> 620; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] 621; CHECK: atomicrmw.end: 622; CHECK-NEXT: ret <4 x bfloat> [[TMP5]] 623; 624 %res = atomicrmw fsub ptr addrspace(1) %ptr, <4 x bfloat> %value syncscope("agent") seq_cst, align 8 625 ret <4 x bfloat> %res 626} 627 628define <2 x float> @test_atomicrmw_fsub_v2f32_global_agent_align8(ptr addrspace(1) %ptr, <2 x float> %value) { 629; CHECK-LABEL: define <2 x float> @test_atomicrmw_fsub_v2f32_global_agent_align8( 630; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x float> [[VALUE:%.*]]) #[[ATTR0]] { 631; CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr addrspace(1) [[PTR]], align 8 632; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]] 633; CHECK: atomicrmw.start: 634; CHECK-NEXT: [[LOADED:%.*]] = phi <2 x float> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] 635; CHECK-NEXT: [[NEW:%.*]] = fsub <2 x float> [[LOADED]], [[VALUE]] 636; CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x float> [[NEW]] to i64 637; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x float> [[LOADED]] to i64 638; CHECK-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8 639; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 640; CHECK-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 641; CHECK-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to <2 x float> 642; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] 643; CHECK: atomicrmw.end: 644; CHECK-NEXT: ret <2 x float> [[TMP5]] 645; 646 %res = atomicrmw fsub ptr addrspace(1) %ptr, <2 x float> %value syncscope("agent") seq_cst, align 8 647 ret <2 x float> %res 648} 649 650;--------------------------------------------------------------------- 651; atomicrmw fmin 652;--------------------------------------------------------------------- 653 654define <2 x half> @test_atomicrmw_fmin_v2f16_global_agent_align2(ptr addrspace(1) %ptr, <2 x half> %value) { 655; CHECK-LABEL: define <2 x half> @test_atomicrmw_fmin_v2f16_global_agent_align2( 656; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { 657; CHECK-NEXT: [[TMP1:%.*]] = alloca <2 x half>, align 4, addrspace(5) 658; CHECK-NEXT: [[TMP2:%.*]] = alloca <2 x half>, align 4, addrspace(5) 659; CHECK-NEXT: [[TMP3:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 2 660; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]] 661; CHECK: atomicrmw.start: 662; CHECK-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] 663; CHECK-NEXT: [[TMP4:%.*]] = call <2 x half> @llvm.minnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]]) 664; CHECK-NEXT: [[TMP5:%.*]] = addrspacecast ptr addrspace(1) [[PTR]] to ptr 665; CHECK-NEXT: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[TMP1]]) 666; CHECK-NEXT: store <2 x half> [[LOADED]], ptr addrspace(5) [[TMP1]], align 4 667; CHECK-NEXT: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[TMP2]]) 668; CHECK-NEXT: store <2 x half> [[TMP4]], ptr addrspace(5) [[TMP2]], align 4 669; CHECK-NEXT: [[TMP6:%.*]] = call zeroext i1 @__atomic_compare_exchange(i64 4, ptr [[TMP5]], ptr addrspace(5) [[TMP1]], ptr addrspace(5) [[TMP2]], i32 5, i32 5) 670; CHECK-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[TMP2]]) 671; CHECK-NEXT: [[TMP7:%.*]] = load <2 x half>, ptr addrspace(5) [[TMP1]], align 4 672; CHECK-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[TMP1]]) 673; CHECK-NEXT: [[TMP8:%.*]] = insertvalue { <2 x half>, i1 } poison, <2 x half> [[TMP7]], 0 674; CHECK-NEXT: [[TMP9:%.*]] = insertvalue { <2 x half>, i1 } [[TMP8]], i1 [[TMP6]], 1 675; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { <2 x half>, i1 } [[TMP9]], 1 676; CHECK-NEXT: [[NEWLOADED]] = extractvalue { <2 x half>, i1 } [[TMP9]], 0 677; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] 678; CHECK: atomicrmw.end: 679; CHECK-NEXT: ret <2 x half> [[NEWLOADED]] 680; 681 %res = atomicrmw fmin ptr addrspace(1) %ptr, <2 x half> %value syncscope("agent") seq_cst, align 2 682 ret <2 x half> %res 683} 684 685define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_agent_align2(ptr addrspace(1) %ptr, <2 x bfloat> %value) { 686; CHECK-LABEL: define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_agent_align2( 687; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { 688; CHECK-NEXT: [[TMP1:%.*]] = alloca <2 x bfloat>, align 4, addrspace(5) 689; CHECK-NEXT: [[TMP2:%.*]] = alloca <2 x bfloat>, align 4, addrspace(5) 690; CHECK-NEXT: [[TMP3:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 2 691; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]] 692; CHECK: atomicrmw.start: 693; CHECK-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] 694; CHECK-NEXT: [[TMP4:%.*]] = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]]) 695; CHECK-NEXT: [[TMP5:%.*]] = addrspacecast ptr addrspace(1) [[PTR]] to ptr 696; CHECK-NEXT: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[TMP1]]) 697; CHECK-NEXT: store <2 x bfloat> [[LOADED]], ptr addrspace(5) [[TMP1]], align 4 698; CHECK-NEXT: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[TMP2]]) 699; CHECK-NEXT: store <2 x bfloat> [[TMP4]], ptr addrspace(5) [[TMP2]], align 4 700; CHECK-NEXT: [[TMP6:%.*]] = call zeroext i1 @__atomic_compare_exchange(i64 4, ptr [[TMP5]], ptr addrspace(5) [[TMP1]], ptr addrspace(5) [[TMP2]], i32 5, i32 5) 701; CHECK-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[TMP2]]) 702; CHECK-NEXT: [[TMP7:%.*]] = load <2 x bfloat>, ptr addrspace(5) [[TMP1]], align 4 703; CHECK-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[TMP1]]) 704; CHECK-NEXT: [[TMP8:%.*]] = insertvalue { <2 x bfloat>, i1 } poison, <2 x bfloat> [[TMP7]], 0 705; CHECK-NEXT: [[TMP9:%.*]] = insertvalue { <2 x bfloat>, i1 } [[TMP8]], i1 [[TMP6]], 1 706; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { <2 x bfloat>, i1 } [[TMP9]], 1 707; CHECK-NEXT: [[NEWLOADED]] = extractvalue { <2 x bfloat>, i1 } [[TMP9]], 0 708; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] 709; CHECK: atomicrmw.end: 710; CHECK-NEXT: ret <2 x bfloat> [[NEWLOADED]] 711; 712 %res = atomicrmw fmin ptr addrspace(1) %ptr, <2 x bfloat> %value syncscope("agent") seq_cst, align 2 713 ret <2 x bfloat> %res 714} 715 716define <2 x half> @test_atomicrmw_fmin_v2f16_global_agent_align4(ptr addrspace(1) %ptr, <2 x half> %value) { 717; CHECK-LABEL: define <2 x half> @test_atomicrmw_fmin_v2f16_global_agent_align4( 718; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { 719; CHECK-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 720; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]] 721; CHECK: atomicrmw.start: 722; CHECK-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] 723; CHECK-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.minnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]]) 724; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32 725; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32 726; CHECK-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 727; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 728; CHECK-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 729; CHECK-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half> 730; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] 731; CHECK: atomicrmw.end: 732; CHECK-NEXT: ret <2 x half> [[TMP6]] 733; 734 %res = atomicrmw fmin ptr addrspace(1) %ptr, <2 x half> %value syncscope("agent") seq_cst, align 4 735 ret <2 x half> %res 736} 737 738define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_agent_align4(ptr addrspace(1) %ptr, <2 x bfloat> %value) { 739; CHECK-LABEL: define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_agent_align4( 740; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { 741; CHECK-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 742; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]] 743; CHECK: atomicrmw.start: 744; CHECK-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] 745; CHECK-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]]) 746; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32 747; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 748; CHECK-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 749; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 750; CHECK-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 751; CHECK-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> 752; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] 753; CHECK: atomicrmw.end: 754; CHECK-NEXT: ret <2 x bfloat> [[TMP6]] 755; 756 %res = atomicrmw fmin ptr addrspace(1) %ptr, <2 x bfloat> %value syncscope("agent") seq_cst, align 4 757 ret <2 x bfloat> %res 758} 759 760define <4 x half> @test_atomicrmw_fmin_v4f16_global_agent_align2(ptr addrspace(1) %ptr, <4 x half> %value) { 761; CHECK-LABEL: define <4 x half> @test_atomicrmw_fmin_v4f16_global_agent_align2( 762; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]], <4 x half> [[VALUE:%.*]]) #[[ATTR0]] { 763; CHECK-NEXT: [[TMP1:%.*]] = alloca <4 x half>, align 8, addrspace(5) 764; CHECK-NEXT: [[TMP2:%.*]] = alloca <4 x half>, align 8, addrspace(5) 765; CHECK-NEXT: [[TMP3:%.*]] = load <4 x half>, ptr addrspace(1) [[PTR]], align 2 766; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]] 767; CHECK: atomicrmw.start: 768; CHECK-NEXT: [[LOADED:%.*]] = phi <4 x half> [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] 769; CHECK-NEXT: [[TMP4:%.*]] = call <4 x half> @llvm.minnum.v4f16(<4 x half> [[LOADED]], <4 x half> [[VALUE]]) 770; CHECK-NEXT: [[TMP5:%.*]] = addrspacecast ptr addrspace(1) [[PTR]] to ptr 771; CHECK-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[TMP1]]) 772; CHECK-NEXT: store <4 x half> [[LOADED]], ptr addrspace(5) [[TMP1]], align 8 773; CHECK-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[TMP2]]) 774; CHECK-NEXT: store <4 x half> [[TMP4]], ptr addrspace(5) [[TMP2]], align 8 775; CHECK-NEXT: [[TMP6:%.*]] = call zeroext i1 @__atomic_compare_exchange(i64 8, ptr [[TMP5]], ptr addrspace(5) [[TMP1]], ptr addrspace(5) [[TMP2]], i32 5, i32 5) 776; CHECK-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[TMP2]]) 777; CHECK-NEXT: [[TMP7:%.*]] = load <4 x half>, ptr addrspace(5) [[TMP1]], align 8 778; CHECK-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[TMP1]]) 779; CHECK-NEXT: [[TMP8:%.*]] = insertvalue { <4 x half>, i1 } poison, <4 x half> [[TMP7]], 0 780; CHECK-NEXT: [[TMP9:%.*]] = insertvalue { <4 x half>, i1 } [[TMP8]], i1 [[TMP6]], 1 781; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { <4 x half>, i1 } [[TMP9]], 1 782; CHECK-NEXT: [[NEWLOADED]] = extractvalue { <4 x half>, i1 } [[TMP9]], 0 783; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] 784; CHECK: atomicrmw.end: 785; CHECK-NEXT: ret <4 x half> [[NEWLOADED]] 786; 787 %res = atomicrmw fmin ptr addrspace(1) %ptr, <4 x half> %value syncscope("agent") seq_cst, align 2 788 ret <4 x half> %res 789} 790 791define <4 x bfloat> @test_atomicrmw_fmin_v4bf16_global_agent_align2(ptr addrspace(1) %ptr, <4 x bfloat> %value) { 792; CHECK-LABEL: define <4 x bfloat> @test_atomicrmw_fmin_v4bf16_global_agent_align2( 793; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]], <4 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { 794; CHECK-NEXT: [[TMP1:%.*]] = alloca <4 x bfloat>, align 8, addrspace(5) 795; CHECK-NEXT: [[TMP2:%.*]] = alloca <4 x bfloat>, align 8, addrspace(5) 796; CHECK-NEXT: [[TMP3:%.*]] = load <4 x bfloat>, ptr addrspace(1) [[PTR]], align 2 797; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]] 798; CHECK: atomicrmw.start: 799; CHECK-NEXT: [[LOADED:%.*]] = phi <4 x bfloat> [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] 800; CHECK-NEXT: [[TMP4:%.*]] = call <4 x bfloat> @llvm.minnum.v4bf16(<4 x bfloat> [[LOADED]], <4 x bfloat> [[VALUE]]) 801; CHECK-NEXT: [[TMP5:%.*]] = addrspacecast ptr addrspace(1) [[PTR]] to ptr 802; CHECK-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[TMP1]]) 803; CHECK-NEXT: store <4 x bfloat> [[LOADED]], ptr addrspace(5) [[TMP1]], align 8 804; CHECK-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[TMP2]]) 805; CHECK-NEXT: store <4 x bfloat> [[TMP4]], ptr addrspace(5) [[TMP2]], align 8 806; CHECK-NEXT: [[TMP6:%.*]] = call zeroext i1 @__atomic_compare_exchange(i64 8, ptr [[TMP5]], ptr addrspace(5) [[TMP1]], ptr addrspace(5) [[TMP2]], i32 5, i32 5) 807; CHECK-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[TMP2]]) 808; CHECK-NEXT: [[TMP7:%.*]] = load <4 x bfloat>, ptr addrspace(5) [[TMP1]], align 8 809; CHECK-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[TMP1]]) 810; CHECK-NEXT: [[TMP8:%.*]] = insertvalue { <4 x bfloat>, i1 } poison, <4 x bfloat> [[TMP7]], 0 811; CHECK-NEXT: [[TMP9:%.*]] = insertvalue { <4 x bfloat>, i1 } [[TMP8]], i1 [[TMP6]], 1 812; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { <4 x bfloat>, i1 } [[TMP9]], 1 813; CHECK-NEXT: [[NEWLOADED]] = extractvalue { <4 x bfloat>, i1 } [[TMP9]], 0 814; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] 815; CHECK: atomicrmw.end: 816; CHECK-NEXT: ret <4 x bfloat> [[NEWLOADED]] 817; 818 %res = atomicrmw fmin ptr addrspace(1) %ptr, <4 x bfloat> %value syncscope("agent") seq_cst, align 2 819 ret <4 x bfloat> %res 820} 821 822define <4 x half> @test_atomicrmw_fmin_v4f16_global_agent_align4(ptr addrspace(1) %ptr, <4 x half> %value) { 823; CHECK-LABEL: define <4 x half> @test_atomicrmw_fmin_v4f16_global_agent_align4( 824; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]], <4 x half> [[VALUE:%.*]]) #[[ATTR0]] { 825; CHECK-NEXT: [[TMP1:%.*]] = alloca <4 x half>, align 8, addrspace(5) 826; CHECK-NEXT: [[TMP2:%.*]] = alloca <4 x half>, align 8, addrspace(5) 827; CHECK-NEXT: [[TMP3:%.*]] = load <4 x half>, ptr addrspace(1) [[PTR]], align 4 828; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]] 829; CHECK: atomicrmw.start: 830; CHECK-NEXT: [[LOADED:%.*]] = phi <4 x half> [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] 831; CHECK-NEXT: [[TMP4:%.*]] = call <4 x half> @llvm.minnum.v4f16(<4 x half> [[LOADED]], <4 x half> [[VALUE]]) 832; CHECK-NEXT: [[TMP5:%.*]] = addrspacecast ptr addrspace(1) [[PTR]] to ptr 833; CHECK-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[TMP1]]) 834; CHECK-NEXT: store <4 x half> [[LOADED]], ptr addrspace(5) [[TMP1]], align 8 835; CHECK-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[TMP2]]) 836; CHECK-NEXT: store <4 x half> [[TMP4]], ptr addrspace(5) [[TMP2]], align 8 837; CHECK-NEXT: [[TMP6:%.*]] = call zeroext i1 @__atomic_compare_exchange(i64 8, ptr [[TMP5]], ptr addrspace(5) [[TMP1]], ptr addrspace(5) [[TMP2]], i32 5, i32 5) 838; CHECK-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[TMP2]]) 839; CHECK-NEXT: [[TMP7:%.*]] = load <4 x half>, ptr addrspace(5) [[TMP1]], align 8 840; CHECK-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[TMP1]]) 841; CHECK-NEXT: [[TMP8:%.*]] = insertvalue { <4 x half>, i1 } poison, <4 x half> [[TMP7]], 0 842; CHECK-NEXT: [[TMP9:%.*]] = insertvalue { <4 x half>, i1 } [[TMP8]], i1 [[TMP6]], 1 843; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { <4 x half>, i1 } [[TMP9]], 1 844; CHECK-NEXT: [[NEWLOADED]] = extractvalue { <4 x half>, i1 } [[TMP9]], 0 845; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] 846; CHECK: atomicrmw.end: 847; CHECK-NEXT: ret <4 x half> [[NEWLOADED]] 848; 849 %res = atomicrmw fmin ptr addrspace(1) %ptr, <4 x half> %value syncscope("agent") seq_cst, align 4 850 ret <4 x half> %res 851} 852 853define <4 x bfloat> @test_atomicrmw_fmin_v4bf16_global_agent_align4(ptr addrspace(1) %ptr, <4 x bfloat> %value) { 854; CHECK-LABEL: define <4 x bfloat> @test_atomicrmw_fmin_v4bf16_global_agent_align4( 855; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]], <4 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { 856; CHECK-NEXT: [[TMP1:%.*]] = alloca <4 x bfloat>, align 8, addrspace(5) 857; CHECK-NEXT: [[TMP2:%.*]] = alloca <4 x bfloat>, align 8, addrspace(5) 858; CHECK-NEXT: [[TMP3:%.*]] = load <4 x bfloat>, ptr addrspace(1) [[PTR]], align 4 859; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]] 860; CHECK: atomicrmw.start: 861; CHECK-NEXT: [[LOADED:%.*]] = phi <4 x bfloat> [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] 862; CHECK-NEXT: [[TMP4:%.*]] = call <4 x bfloat> @llvm.minnum.v4bf16(<4 x bfloat> [[LOADED]], <4 x bfloat> [[VALUE]]) 863; CHECK-NEXT: [[TMP5:%.*]] = addrspacecast ptr addrspace(1) [[PTR]] to ptr 864; CHECK-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[TMP1]]) 865; CHECK-NEXT: store <4 x bfloat> [[LOADED]], ptr addrspace(5) [[TMP1]], align 8 866; CHECK-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[TMP2]]) 867; CHECK-NEXT: store <4 x bfloat> [[TMP4]], ptr addrspace(5) [[TMP2]], align 8 868; CHECK-NEXT: [[TMP6:%.*]] = call zeroext i1 @__atomic_compare_exchange(i64 8, ptr [[TMP5]], ptr addrspace(5) [[TMP1]], ptr addrspace(5) [[TMP2]], i32 5, i32 5) 869; CHECK-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[TMP2]]) 870; CHECK-NEXT: [[TMP7:%.*]] = load <4 x bfloat>, ptr addrspace(5) [[TMP1]], align 8 871; CHECK-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[TMP1]]) 872; CHECK-NEXT: [[TMP8:%.*]] = insertvalue { <4 x bfloat>, i1 } poison, <4 x bfloat> [[TMP7]], 0 873; CHECK-NEXT: [[TMP9:%.*]] = insertvalue { <4 x bfloat>, i1 } [[TMP8]], i1 [[TMP6]], 1 874; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { <4 x bfloat>, i1 } [[TMP9]], 1 875; CHECK-NEXT: [[NEWLOADED]] = extractvalue { <4 x bfloat>, i1 } [[TMP9]], 0 876; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] 877; CHECK: atomicrmw.end: 878; CHECK-NEXT: ret <4 x bfloat> [[NEWLOADED]] 879; 880 %res = atomicrmw fmin ptr addrspace(1) %ptr, <4 x bfloat> %value syncscope("agent") seq_cst, align 4 881 ret <4 x bfloat> %res 882} 883 884define <4 x half> @test_atomicrmw_fmin_v4f16_global_agent_align8(ptr addrspace(1) %ptr, <4 x half> %value) { 885; CHECK-LABEL: define <4 x half> @test_atomicrmw_fmin_v4f16_global_agent_align8( 886; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]], <4 x half> [[VALUE:%.*]]) #[[ATTR0]] { 887; CHECK-NEXT: [[TMP1:%.*]] = load <4 x half>, ptr addrspace(1) [[PTR]], align 8 888; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]] 889; CHECK: atomicrmw.start: 890; CHECK-NEXT: [[LOADED:%.*]] = phi <4 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] 891; CHECK-NEXT: [[TMP2:%.*]] = call <4 x half> @llvm.minnum.v4f16(<4 x half> [[LOADED]], <4 x half> [[VALUE]]) 892; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x half> [[TMP2]] to i64 893; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x half> [[LOADED]] to i64 894; CHECK-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8 895; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 896; CHECK-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 897; CHECK-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to <4 x half> 898; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] 899; CHECK: atomicrmw.end: 900; CHECK-NEXT: ret <4 x half> [[TMP6]] 901; 902 %res = atomicrmw fmin ptr addrspace(1) %ptr, <4 x half> %value syncscope("agent") seq_cst, align 8 903 ret <4 x half> %res 904} 905 906define <4 x bfloat> @test_atomicrmw_fmin_v4bf16_global_agent_align8(ptr addrspace(1) %ptr, <4 x bfloat> %value) { 907; CHECK-LABEL: define <4 x bfloat> @test_atomicrmw_fmin_v4bf16_global_agent_align8( 908; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]], <4 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { 909; CHECK-NEXT: [[TMP1:%.*]] = load <4 x bfloat>, ptr addrspace(1) [[PTR]], align 8 910; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]] 911; CHECK: atomicrmw.start: 912; CHECK-NEXT: [[LOADED:%.*]] = phi <4 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] 913; CHECK-NEXT: [[TMP2:%.*]] = call <4 x bfloat> @llvm.minnum.v4bf16(<4 x bfloat> [[LOADED]], <4 x bfloat> [[VALUE]]) 914; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x bfloat> [[TMP2]] to i64 915; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x bfloat> [[LOADED]] to i64 916; CHECK-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8 917; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 918; CHECK-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 919; CHECK-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to <4 x bfloat> 920; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] 921; CHECK: atomicrmw.end: 922; CHECK-NEXT: ret <4 x bfloat> [[TMP6]] 923; 924 %res = atomicrmw fmin ptr addrspace(1) %ptr, <4 x bfloat> %value syncscope("agent") seq_cst, align 8 925 ret <4 x bfloat> %res 926} 927 928define <2 x float> @test_atomicrmw_fmin_v2f32_global_agent_align8(ptr addrspace(1) %ptr, <2 x float> %value) { 929; CHECK-LABEL: define <2 x float> @test_atomicrmw_fmin_v2f32_global_agent_align8( 930; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x float> [[VALUE:%.*]]) #[[ATTR0]] { 931; CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr addrspace(1) [[PTR]], align 8 932; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]] 933; CHECK: atomicrmw.start: 934; CHECK-NEXT: [[LOADED:%.*]] = phi <2 x float> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] 935; CHECK-NEXT: [[TMP2:%.*]] = call <2 x float> @llvm.minnum.v2f32(<2 x float> [[LOADED]], <2 x float> [[VALUE]]) 936; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x float> [[TMP2]] to i64 937; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x float> [[LOADED]] to i64 938; CHECK-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8 939; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 940; CHECK-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 941; CHECK-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to <2 x float> 942; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] 943; CHECK: atomicrmw.end: 944; CHECK-NEXT: ret <2 x float> [[TMP6]] 945; 946 %res = atomicrmw fmin ptr addrspace(1) %ptr, <2 x float> %value syncscope("agent") seq_cst, align 8 947 ret <2 x float> %res 948} 949 950;--------------------------------------------------------------------- 951; atomicrmw fmax 952;--------------------------------------------------------------------- 953 954define <2 x half> @test_atomicrmw_fmax_v2f16_global_agent_align2(ptr addrspace(1) %ptr, <2 x half> %value) { 955; CHECK-LABEL: define <2 x half> @test_atomicrmw_fmax_v2f16_global_agent_align2( 956; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { 957; CHECK-NEXT: [[TMP1:%.*]] = alloca <2 x half>, align 4, addrspace(5) 958; CHECK-NEXT: [[TMP2:%.*]] = alloca <2 x half>, align 4, addrspace(5) 959; CHECK-NEXT: [[TMP3:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 2 960; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]] 961; CHECK: atomicrmw.start: 962; CHECK-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] 963; CHECK-NEXT: [[TMP4:%.*]] = call <2 x half> @llvm.maxnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]]) 964; CHECK-NEXT: [[TMP5:%.*]] = addrspacecast ptr addrspace(1) [[PTR]] to ptr 965; CHECK-NEXT: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[TMP1]]) 966; CHECK-NEXT: store <2 x half> [[LOADED]], ptr addrspace(5) [[TMP1]], align 4 967; CHECK-NEXT: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[TMP2]]) 968; CHECK-NEXT: store <2 x half> [[TMP4]], ptr addrspace(5) [[TMP2]], align 4 969; CHECK-NEXT: [[TMP6:%.*]] = call zeroext i1 @__atomic_compare_exchange(i64 4, ptr [[TMP5]], ptr addrspace(5) [[TMP1]], ptr addrspace(5) [[TMP2]], i32 5, i32 5) 970; CHECK-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[TMP2]]) 971; CHECK-NEXT: [[TMP7:%.*]] = load <2 x half>, ptr addrspace(5) [[TMP1]], align 4 972; CHECK-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[TMP1]]) 973; CHECK-NEXT: [[TMP8:%.*]] = insertvalue { <2 x half>, i1 } poison, <2 x half> [[TMP7]], 0 974; CHECK-NEXT: [[TMP9:%.*]] = insertvalue { <2 x half>, i1 } [[TMP8]], i1 [[TMP6]], 1 975; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { <2 x half>, i1 } [[TMP9]], 1 976; CHECK-NEXT: [[NEWLOADED]] = extractvalue { <2 x half>, i1 } [[TMP9]], 0 977; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] 978; CHECK: atomicrmw.end: 979; CHECK-NEXT: ret <2 x half> [[NEWLOADED]] 980; 981 %res = atomicrmw fmax ptr addrspace(1) %ptr, <2 x half> %value syncscope("agent") seq_cst, align 2 982 ret <2 x half> %res 983} 984 985define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_agent_align2(ptr addrspace(1) %ptr, <2 x bfloat> %value) { 986; CHECK-LABEL: define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_agent_align2( 987; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { 988; CHECK-NEXT: [[TMP1:%.*]] = alloca <2 x bfloat>, align 4, addrspace(5) 989; CHECK-NEXT: [[TMP2:%.*]] = alloca <2 x bfloat>, align 4, addrspace(5) 990; CHECK-NEXT: [[TMP3:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 2 991; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]] 992; CHECK: atomicrmw.start: 993; CHECK-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] 994; CHECK-NEXT: [[TMP4:%.*]] = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]]) 995; CHECK-NEXT: [[TMP5:%.*]] = addrspacecast ptr addrspace(1) [[PTR]] to ptr 996; CHECK-NEXT: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[TMP1]]) 997; CHECK-NEXT: store <2 x bfloat> [[LOADED]], ptr addrspace(5) [[TMP1]], align 4 998; CHECK-NEXT: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[TMP2]]) 999; CHECK-NEXT: store <2 x bfloat> [[TMP4]], ptr addrspace(5) [[TMP2]], align 4 1000; CHECK-NEXT: [[TMP6:%.*]] = call zeroext i1 @__atomic_compare_exchange(i64 4, ptr [[TMP5]], ptr addrspace(5) [[TMP1]], ptr addrspace(5) [[TMP2]], i32 5, i32 5) 1001; CHECK-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[TMP2]]) 1002; CHECK-NEXT: [[TMP7:%.*]] = load <2 x bfloat>, ptr addrspace(5) [[TMP1]], align 4 1003; CHECK-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[TMP1]]) 1004; CHECK-NEXT: [[TMP8:%.*]] = insertvalue { <2 x bfloat>, i1 } poison, <2 x bfloat> [[TMP7]], 0 1005; CHECK-NEXT: [[TMP9:%.*]] = insertvalue { <2 x bfloat>, i1 } [[TMP8]], i1 [[TMP6]], 1 1006; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { <2 x bfloat>, i1 } [[TMP9]], 1 1007; CHECK-NEXT: [[NEWLOADED]] = extractvalue { <2 x bfloat>, i1 } [[TMP9]], 0 1008; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] 1009; CHECK: atomicrmw.end: 1010; CHECK-NEXT: ret <2 x bfloat> [[NEWLOADED]] 1011; 1012 %res = atomicrmw fmax ptr addrspace(1) %ptr, <2 x bfloat> %value syncscope("agent") seq_cst, align 2 1013 ret <2 x bfloat> %res 1014} 1015 1016define <2 x half> @test_atomicrmw_fmax_v2f16_global_agent_align4(ptr addrspace(1) %ptr, <2 x half> %value) { 1017; CHECK-LABEL: define <2 x half> @test_atomicrmw_fmax_v2f16_global_agent_align4( 1018; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { 1019; CHECK-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 1020; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]] 1021; CHECK: atomicrmw.start: 1022; CHECK-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] 1023; CHECK-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.maxnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]]) 1024; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32 1025; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32 1026; CHECK-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 1027; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 1028; CHECK-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 1029; CHECK-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half> 1030; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] 1031; CHECK: atomicrmw.end: 1032; CHECK-NEXT: ret <2 x half> [[TMP6]] 1033; 1034 %res = atomicrmw fmax ptr addrspace(1) %ptr, <2 x half> %value syncscope("agent") seq_cst, align 4 1035 ret <2 x half> %res 1036} 1037 1038define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_agent_align4(ptr addrspace(1) %ptr, <2 x bfloat> %value) { 1039; CHECK-LABEL: define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_agent_align4( 1040; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { 1041; CHECK-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 1042; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]] 1043; CHECK: atomicrmw.start: 1044; CHECK-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] 1045; CHECK-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]]) 1046; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32 1047; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 1048; CHECK-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 1049; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 1050; CHECK-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 1051; CHECK-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> 1052; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] 1053; CHECK: atomicrmw.end: 1054; CHECK-NEXT: ret <2 x bfloat> [[TMP6]] 1055; 1056 %res = atomicrmw fmax ptr addrspace(1) %ptr, <2 x bfloat> %value syncscope("agent") seq_cst, align 4 1057 ret <2 x bfloat> %res 1058} 1059 1060define <4 x half> @test_atomicrmw_fmax_v4f16_global_agent_align2(ptr addrspace(1) %ptr, <4 x half> %value) { 1061; CHECK-LABEL: define <4 x half> @test_atomicrmw_fmax_v4f16_global_agent_align2( 1062; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]], <4 x half> [[VALUE:%.*]]) #[[ATTR0]] { 1063; CHECK-NEXT: [[TMP1:%.*]] = alloca <4 x half>, align 8, addrspace(5) 1064; CHECK-NEXT: [[TMP2:%.*]] = alloca <4 x half>, align 8, addrspace(5) 1065; CHECK-NEXT: [[TMP3:%.*]] = load <4 x half>, ptr addrspace(1) [[PTR]], align 2 1066; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]] 1067; CHECK: atomicrmw.start: 1068; CHECK-NEXT: [[LOADED:%.*]] = phi <4 x half> [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] 1069; CHECK-NEXT: [[TMP4:%.*]] = call <4 x half> @llvm.maxnum.v4f16(<4 x half> [[LOADED]], <4 x half> [[VALUE]]) 1070; CHECK-NEXT: [[TMP5:%.*]] = addrspacecast ptr addrspace(1) [[PTR]] to ptr 1071; CHECK-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[TMP1]]) 1072; CHECK-NEXT: store <4 x half> [[LOADED]], ptr addrspace(5) [[TMP1]], align 8 1073; CHECK-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[TMP2]]) 1074; CHECK-NEXT: store <4 x half> [[TMP4]], ptr addrspace(5) [[TMP2]], align 8 1075; CHECK-NEXT: [[TMP6:%.*]] = call zeroext i1 @__atomic_compare_exchange(i64 8, ptr [[TMP5]], ptr addrspace(5) [[TMP1]], ptr addrspace(5) [[TMP2]], i32 5, i32 5) 1076; CHECK-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[TMP2]]) 1077; CHECK-NEXT: [[TMP7:%.*]] = load <4 x half>, ptr addrspace(5) [[TMP1]], align 8 1078; CHECK-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[TMP1]]) 1079; CHECK-NEXT: [[TMP8:%.*]] = insertvalue { <4 x half>, i1 } poison, <4 x half> [[TMP7]], 0 1080; CHECK-NEXT: [[TMP9:%.*]] = insertvalue { <4 x half>, i1 } [[TMP8]], i1 [[TMP6]], 1 1081; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { <4 x half>, i1 } [[TMP9]], 1 1082; CHECK-NEXT: [[NEWLOADED]] = extractvalue { <4 x half>, i1 } [[TMP9]], 0 1083; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] 1084; CHECK: atomicrmw.end: 1085; CHECK-NEXT: ret <4 x half> [[NEWLOADED]] 1086; 1087 %res = atomicrmw fmax ptr addrspace(1) %ptr, <4 x half> %value syncscope("agent") seq_cst, align 2 1088 ret <4 x half> %res 1089} 1090 1091define <4 x bfloat> @test_atomicrmw_fmax_v4bf16_global_agent_align2(ptr addrspace(1) %ptr, <4 x bfloat> %value) { 1092; CHECK-LABEL: define <4 x bfloat> @test_atomicrmw_fmax_v4bf16_global_agent_align2( 1093; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]], <4 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { 1094; CHECK-NEXT: [[TMP1:%.*]] = alloca <4 x bfloat>, align 8, addrspace(5) 1095; CHECK-NEXT: [[TMP2:%.*]] = alloca <4 x bfloat>, align 8, addrspace(5) 1096; CHECK-NEXT: [[TMP3:%.*]] = load <4 x bfloat>, ptr addrspace(1) [[PTR]], align 2 1097; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]] 1098; CHECK: atomicrmw.start: 1099; CHECK-NEXT: [[LOADED:%.*]] = phi <4 x bfloat> [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] 1100; CHECK-NEXT: [[TMP4:%.*]] = call <4 x bfloat> @llvm.maxnum.v4bf16(<4 x bfloat> [[LOADED]], <4 x bfloat> [[VALUE]]) 1101; CHECK-NEXT: [[TMP5:%.*]] = addrspacecast ptr addrspace(1) [[PTR]] to ptr 1102; CHECK-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[TMP1]]) 1103; CHECK-NEXT: store <4 x bfloat> [[LOADED]], ptr addrspace(5) [[TMP1]], align 8 1104; CHECK-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[TMP2]]) 1105; CHECK-NEXT: store <4 x bfloat> [[TMP4]], ptr addrspace(5) [[TMP2]], align 8 1106; CHECK-NEXT: [[TMP6:%.*]] = call zeroext i1 @__atomic_compare_exchange(i64 8, ptr [[TMP5]], ptr addrspace(5) [[TMP1]], ptr addrspace(5) [[TMP2]], i32 5, i32 5) 1107; CHECK-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[TMP2]]) 1108; CHECK-NEXT: [[TMP7:%.*]] = load <4 x bfloat>, ptr addrspace(5) [[TMP1]], align 8 1109; CHECK-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[TMP1]]) 1110; CHECK-NEXT: [[TMP8:%.*]] = insertvalue { <4 x bfloat>, i1 } poison, <4 x bfloat> [[TMP7]], 0 1111; CHECK-NEXT: [[TMP9:%.*]] = insertvalue { <4 x bfloat>, i1 } [[TMP8]], i1 [[TMP6]], 1 1112; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { <4 x bfloat>, i1 } [[TMP9]], 1 1113; CHECK-NEXT: [[NEWLOADED]] = extractvalue { <4 x bfloat>, i1 } [[TMP9]], 0 1114; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] 1115; CHECK: atomicrmw.end: 1116; CHECK-NEXT: ret <4 x bfloat> [[NEWLOADED]] 1117; 1118 %res = atomicrmw fmax ptr addrspace(1) %ptr, <4 x bfloat> %value syncscope("agent") seq_cst, align 2 1119 ret <4 x bfloat> %res 1120} 1121 1122define <4 x half> @test_atomicrmw_fmax_v4f16_global_agent_align4(ptr addrspace(1) %ptr, <4 x half> %value) { 1123; CHECK-LABEL: define <4 x half> @test_atomicrmw_fmax_v4f16_global_agent_align4( 1124; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]], <4 x half> [[VALUE:%.*]]) #[[ATTR0]] { 1125; CHECK-NEXT: [[TMP1:%.*]] = alloca <4 x half>, align 8, addrspace(5) 1126; CHECK-NEXT: [[TMP2:%.*]] = alloca <4 x half>, align 8, addrspace(5) 1127; CHECK-NEXT: [[TMP3:%.*]] = load <4 x half>, ptr addrspace(1) [[PTR]], align 4 1128; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]] 1129; CHECK: atomicrmw.start: 1130; CHECK-NEXT: [[LOADED:%.*]] = phi <4 x half> [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] 1131; CHECK-NEXT: [[TMP4:%.*]] = call <4 x half> @llvm.maxnum.v4f16(<4 x half> [[LOADED]], <4 x half> [[VALUE]]) 1132; CHECK-NEXT: [[TMP5:%.*]] = addrspacecast ptr addrspace(1) [[PTR]] to ptr 1133; CHECK-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[TMP1]]) 1134; CHECK-NEXT: store <4 x half> [[LOADED]], ptr addrspace(5) [[TMP1]], align 8 1135; CHECK-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[TMP2]]) 1136; CHECK-NEXT: store <4 x half> [[TMP4]], ptr addrspace(5) [[TMP2]], align 8 1137; CHECK-NEXT: [[TMP6:%.*]] = call zeroext i1 @__atomic_compare_exchange(i64 8, ptr [[TMP5]], ptr addrspace(5) [[TMP1]], ptr addrspace(5) [[TMP2]], i32 5, i32 5) 1138; CHECK-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[TMP2]]) 1139; CHECK-NEXT: [[TMP7:%.*]] = load <4 x half>, ptr addrspace(5) [[TMP1]], align 8 1140; CHECK-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[TMP1]]) 1141; CHECK-NEXT: [[TMP8:%.*]] = insertvalue { <4 x half>, i1 } poison, <4 x half> [[TMP7]], 0 1142; CHECK-NEXT: [[TMP9:%.*]] = insertvalue { <4 x half>, i1 } [[TMP8]], i1 [[TMP6]], 1 1143; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { <4 x half>, i1 } [[TMP9]], 1 1144; CHECK-NEXT: [[NEWLOADED]] = extractvalue { <4 x half>, i1 } [[TMP9]], 0 1145; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] 1146; CHECK: atomicrmw.end: 1147; CHECK-NEXT: ret <4 x half> [[NEWLOADED]] 1148; 1149 %res = atomicrmw fmax ptr addrspace(1) %ptr, <4 x half> %value syncscope("agent") seq_cst, align 4 1150 ret <4 x half> %res 1151} 1152 1153define <4 x bfloat> @test_atomicrmw_fmax_v4bf16_global_agent_align4(ptr addrspace(1) %ptr, <4 x bfloat> %value) { 1154; CHECK-LABEL: define <4 x bfloat> @test_atomicrmw_fmax_v4bf16_global_agent_align4( 1155; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]], <4 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { 1156; CHECK-NEXT: [[TMP1:%.*]] = alloca <4 x bfloat>, align 8, addrspace(5) 1157; CHECK-NEXT: [[TMP2:%.*]] = alloca <4 x bfloat>, align 8, addrspace(5) 1158; CHECK-NEXT: [[TMP3:%.*]] = load <4 x bfloat>, ptr addrspace(1) [[PTR]], align 4 1159; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]] 1160; CHECK: atomicrmw.start: 1161; CHECK-NEXT: [[LOADED:%.*]] = phi <4 x bfloat> [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] 1162; CHECK-NEXT: [[TMP4:%.*]] = call <4 x bfloat> @llvm.maxnum.v4bf16(<4 x bfloat> [[LOADED]], <4 x bfloat> [[VALUE]]) 1163; CHECK-NEXT: [[TMP5:%.*]] = addrspacecast ptr addrspace(1) [[PTR]] to ptr 1164; CHECK-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[TMP1]]) 1165; CHECK-NEXT: store <4 x bfloat> [[LOADED]], ptr addrspace(5) [[TMP1]], align 8 1166; CHECK-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[TMP2]]) 1167; CHECK-NEXT: store <4 x bfloat> [[TMP4]], ptr addrspace(5) [[TMP2]], align 8 1168; CHECK-NEXT: [[TMP6:%.*]] = call zeroext i1 @__atomic_compare_exchange(i64 8, ptr [[TMP5]], ptr addrspace(5) [[TMP1]], ptr addrspace(5) [[TMP2]], i32 5, i32 5) 1169; CHECK-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[TMP2]]) 1170; CHECK-NEXT: [[TMP7:%.*]] = load <4 x bfloat>, ptr addrspace(5) [[TMP1]], align 8 1171; CHECK-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[TMP1]]) 1172; CHECK-NEXT: [[TMP8:%.*]] = insertvalue { <4 x bfloat>, i1 } poison, <4 x bfloat> [[TMP7]], 0 1173; CHECK-NEXT: [[TMP9:%.*]] = insertvalue { <4 x bfloat>, i1 } [[TMP8]], i1 [[TMP6]], 1 1174; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { <4 x bfloat>, i1 } [[TMP9]], 1 1175; CHECK-NEXT: [[NEWLOADED]] = extractvalue { <4 x bfloat>, i1 } [[TMP9]], 0 1176; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] 1177; CHECK: atomicrmw.end: 1178; CHECK-NEXT: ret <4 x bfloat> [[NEWLOADED]] 1179; 1180 %res = atomicrmw fmax ptr addrspace(1) %ptr, <4 x bfloat> %value syncscope("agent") seq_cst, align 4 1181 ret <4 x bfloat> %res 1182} 1183 1184define <4 x half> @test_atomicrmw_fmax_v4f16_global_agent_align8(ptr addrspace(1) %ptr, <4 x half> %value) { 1185; CHECK-LABEL: define <4 x half> @test_atomicrmw_fmax_v4f16_global_agent_align8( 1186; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]], <4 x half> [[VALUE:%.*]]) #[[ATTR0]] { 1187; CHECK-NEXT: [[TMP1:%.*]] = load <4 x half>, ptr addrspace(1) [[PTR]], align 8 1188; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]] 1189; CHECK: atomicrmw.start: 1190; CHECK-NEXT: [[LOADED:%.*]] = phi <4 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] 1191; CHECK-NEXT: [[TMP2:%.*]] = call <4 x half> @llvm.maxnum.v4f16(<4 x half> [[LOADED]], <4 x half> [[VALUE]]) 1192; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x half> [[TMP2]] to i64 1193; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x half> [[LOADED]] to i64 1194; CHECK-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8 1195; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 1196; CHECK-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 1197; CHECK-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to <4 x half> 1198; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] 1199; CHECK: atomicrmw.end: 1200; CHECK-NEXT: ret <4 x half> [[TMP6]] 1201; 1202 %res = atomicrmw fmax ptr addrspace(1) %ptr, <4 x half> %value syncscope("agent") seq_cst, align 8 1203 ret <4 x half> %res 1204} 1205 1206define <4 x bfloat> @test_atomicrmw_fmax_v4bf16_global_agent_align8(ptr addrspace(1) %ptr, <4 x bfloat> %value) { 1207; CHECK-LABEL: define <4 x bfloat> @test_atomicrmw_fmax_v4bf16_global_agent_align8( 1208; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]], <4 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { 1209; CHECK-NEXT: [[TMP1:%.*]] = load <4 x bfloat>, ptr addrspace(1) [[PTR]], align 8 1210; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]] 1211; CHECK: atomicrmw.start: 1212; CHECK-NEXT: [[LOADED:%.*]] = phi <4 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] 1213; CHECK-NEXT: [[TMP2:%.*]] = call <4 x bfloat> @llvm.maxnum.v4bf16(<4 x bfloat> [[LOADED]], <4 x bfloat> [[VALUE]]) 1214; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x bfloat> [[TMP2]] to i64 1215; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x bfloat> [[LOADED]] to i64 1216; CHECK-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8 1217; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 1218; CHECK-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 1219; CHECK-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to <4 x bfloat> 1220; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] 1221; CHECK: atomicrmw.end: 1222; CHECK-NEXT: ret <4 x bfloat> [[TMP6]] 1223; 1224 %res = atomicrmw fmax ptr addrspace(1) %ptr, <4 x bfloat> %value syncscope("agent") seq_cst, align 8 1225 ret <4 x bfloat> %res 1226} 1227 1228define <2 x float> @test_atomicrmw_fmax_v2f32_global_agent_align8(ptr addrspace(1) %ptr, <2 x float> %value) { 1229; CHECK-LABEL: define <2 x float> @test_atomicrmw_fmax_v2f32_global_agent_align8( 1230; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x float> [[VALUE:%.*]]) #[[ATTR0]] { 1231; CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr addrspace(1) [[PTR]], align 8 1232; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]] 1233; CHECK: atomicrmw.start: 1234; CHECK-NEXT: [[LOADED:%.*]] = phi <2 x float> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] 1235; CHECK-NEXT: [[TMP2:%.*]] = call <2 x float> @llvm.maxnum.v2f32(<2 x float> [[LOADED]], <2 x float> [[VALUE]]) 1236; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x float> [[TMP2]] to i64 1237; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x float> [[LOADED]] to i64 1238; CHECK-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8 1239; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 1240; CHECK-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 1241; CHECK-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to <2 x float> 1242; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] 1243; CHECK: atomicrmw.end: 1244; CHECK-NEXT: ret <2 x float> [[TMP6]] 1245; 1246 %res = atomicrmw fmax ptr addrspace(1) %ptr, <2 x float> %value syncscope("agent") seq_cst, align 8 1247 ret <2 x float> %res 1248} 1249