; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt --codegen-opt-level=2 -mtriple=x86_64 -lower-amx-type %s -S | FileCheck %s

%struct.__tile_str = type { i16, i16, <256 x i32> }

@buf = dso_local global [1024 x i8] zeroinitializer, align 64
@buf2 = dso_local global [1024 x i8] zeroinitializer, align 64

; test bitcast x86_amx to <256 x i32>
define dso_local void @test_user_empty(i16 %m, i16 %n, ptr%buf, i64 %s) {
; CHECK-LABEL: @test_user_empty(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[T1:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[M:%.*]], i16 [[N:%.*]], ptr [[BUF:%.*]], i64 [[S:%.*]])
; CHECK-NEXT:    ret void
;
entry:
  %t1 = call x86_amx @llvm.x86.tileloadd64.internal(i16 %m, i16 %n, ptr %buf, i64 %s)
  %t2 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %t1)
  ret void
}

; test bitcast <256 x i32> to x86_amx
define dso_local void @test_user_empty2(<256 x i32> %in) {
; CHECK-LABEL: @test_user_empty2(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    ret void
;
entry:
  %t = call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %in)
  ret void
}

define dso_local <256 x i32> @test_amx_load_bitcast_v256i32(ptr %in, i16 %m, i16 %n, ptr%buf, i64 %s) {
; CHECK-LABEL: @test_amx_load_bitcast_v256i32(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[TMP0:%.*]] = alloca <256 x i32>, align 64
; CHECK-NEXT:    [[T1:%.*]] = load <256 x i32>, ptr [[IN:%.*]], align 64
; CHECK-NEXT:    store <256 x i32> [[T1]], ptr [[TMP0]], align 1024
; CHECK-NEXT:    [[TMP1:%.*]] = sext i16 [[N:%.*]] to i64
; CHECK-NEXT:    [[TMP2:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[M:%.*]], i16 [[N]], ptr [[TMP0]], i64 [[TMP1]])
; CHECK-NEXT:    call void @llvm.x86.tilestored64.internal(i16 [[M]], i16 [[N]], ptr [[BUF:%.*]], i64 [[S:%.*]], x86_amx [[TMP2]])
; CHECK-NEXT:    ret <256 x i32> [[T1]]
;
entry:
  %t1 = load <256 x i32>, ptr %in, align 64
  %t2 = call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %t1)
  call void @llvm.x86.tilestored64.internal(i16 %m, i16 %n, ptr %buf, i64 %s, x86_amx %t2)
  ret <256 x i32> %t1
}

define dso_local <225 x i32> @test_amx_load_bitcast_v225i32(ptr %in, i16 %m, i16 %n, ptr%buf, i64 %s) {
; CHECK-LABEL: @test_amx_load_bitcast_v225i32(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[TMP0:%.*]] = alloca <225 x i32>, align 64
; CHECK-NEXT:    [[T1:%.*]] = load <225 x i32>, ptr [[IN:%.*]], align 64
; CHECK-NEXT:    store <225 x i32> [[T1]], ptr [[TMP0]], align 1024
; CHECK-NEXT:    [[TMP1:%.*]] = sext i16 [[N:%.*]] to i64
; CHECK-NEXT:    [[TMP2:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[M:%.*]], i16 [[N]], ptr [[TMP0]], i64 [[TMP1]])
; CHECK-NEXT:    call void @llvm.x86.tilestored64.internal(i16 [[M]], i16 [[N]], ptr [[BUF:%.*]], i64 [[S:%.*]], x86_amx [[TMP2]])
; CHECK-NEXT:    ret <225 x i32> [[T1]]
;
entry:
  %t1 = load <225 x i32>, ptr %in, align 64
  %t2 = call x86_amx @llvm.x86.cast.vector.to.tile.v225i32(<225 x i32> %t1)
  call void @llvm.x86.tilestored64.internal(i16 %m, i16 %n, ptr %buf, i64 %s, x86_amx %t2)
  ret <225 x i32> %t1
}

define dso_local <256 x i32> @test_amx_bitcast_store(ptr %out, i16 %m, i16 %n, ptr%buf, i64 %s) {
; CHECK-LABEL: @test_amx_bitcast_store(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[TMP0:%.*]] = alloca <256 x i32>, align 64
; CHECK-NEXT:    [[T1:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[M:%.*]], i16 [[M]], ptr [[BUF:%.*]], i64 [[S:%.*]])
; CHECK-NEXT:    [[TMP1:%.*]] = sext i16 [[M]] to i64
; CHECK-NEXT:    call void @llvm.x86.tilestored64.internal(i16 [[M]], i16 [[M]], ptr [[TMP0]], i64 [[TMP1]], x86_amx [[T1]])
; CHECK-NEXT:    [[TMP2:%.*]] = load <256 x i32>, ptr [[TMP0]], align 1024
; CHECK-NEXT:    [[TMP3:%.*]] = sext i16 [[M]] to i64
; CHECK-NEXT:    call void @llvm.x86.tilestored64.internal(i16 [[M]], i16 [[M]], ptr [[OUT:%.*]], i64 [[TMP3]], x86_amx [[T1]])
; CHECK-NEXT:    ret <256 x i32> [[TMP2]]
;
entry:
  %t1 = call x86_amx @llvm.x86.tileloadd64.internal(i16 %m, i16 %m, ptr %buf, i64 %s)
  %t2 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %t1)
  store <256 x i32> %t2, ptr %out
  ret <256 x i32> %t2
}

define dso_local void @test_src_add(<256 x i32> %x, <256 x i32> %y, i16 %r, i16 %c, ptr %buf, i64 %s) {
; CHECK-LABEL: @test_src_add(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[TMP0:%.*]] = alloca <256 x i32>, align 64
; CHECK-NEXT:    [[ADD:%.*]] = add <256 x i32> [[Y:%.*]], [[X:%.*]]
; CHECK-NEXT:    store <256 x i32> [[ADD]], ptr [[TMP0]], align 1024
; CHECK-NEXT:    [[TMP1:%.*]] = sext i16 [[C:%.*]] to i64
; CHECK-NEXT:    [[TMP2:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[R:%.*]], i16 [[C]], ptr [[TMP0]], i64 [[TMP1]])
; CHECK-NEXT:    call void @llvm.x86.tilestored64.internal(i16 [[R]], i16 [[C]], ptr [[BUF:%.*]], i64 [[S:%.*]], x86_amx [[TMP2]])
; CHECK-NEXT:    ret void
;
entry:
  %add = add <256 x i32> %y, %x
  %t = call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %add)
  call void @llvm.x86.tilestored64.internal(i16 %r, i16 %c, ptr %buf, i64 %s, x86_amx %t)
  ret void
}

define dso_local void @test_src_add2(<256 x i32> %x, i16 %r, i16 %c, ptr %buf, i64 %s) {
; CHECK-LABEL: @test_src_add2(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[TMP0:%.*]] = alloca <256 x i32>, align 64
; CHECK-NEXT:    [[T1:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[R:%.*]], i16 [[C:%.*]], ptr [[BUF:%.*]], i64 [[S:%.*]])
; CHECK-NEXT:    [[TMP1:%.*]] = sext i16 [[C]] to i64
; CHECK-NEXT:    call void @llvm.x86.tilestored64.internal(i16 [[R]], i16 [[C]], ptr [[TMP0]], i64 [[TMP1]], x86_amx [[T1]])
; CHECK-NEXT:    [[TMP2:%.*]] = load <256 x i32>, ptr [[TMP0]], align 1024
; CHECK-NEXT:    [[ADD:%.*]] = add <256 x i32> [[TMP2]], [[X:%.*]]
; CHECK-NEXT:    ret void
;
entry:
  %t1 = call x86_amx @llvm.x86.tileloadd64.internal(i16 %r, i16 %c, ptr %buf, i64 %s)
  %t2 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %t1)
  %add = add <256 x i32> %t2, %x
  ret void
}

define dso_local void @__tile_loadd(ptr nocapture %0, ptr %1, i64 %2) local_unnamed_addr {
; CHECK-LABEL: @__tile_loadd(
; CHECK-NEXT:    [[TMP4:%.*]] = load i16, ptr [[TMP0:%.*]], align 64
; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TILE_STR:%.*]], ptr [[TMP0]], i64 0, i32 1
; CHECK-NEXT:    [[TMP6:%.*]] = load i16, ptr [[TMP5]], align 2
; CHECK-NEXT:    [[TMP7:%.*]] = shl i64 [[TMP2:%.*]], 32
; CHECK-NEXT:    [[TMP8:%.*]] = ashr exact i64 [[TMP7]], 32
; CHECK-NEXT:    [[TMP9:%.*]] = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 [[TMP4]], i16 [[TMP6]], ptr [[TMP1:%.*]], i64 [[TMP8]])
; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [[STRUCT___TILE_STR]], ptr [[TMP0]], i64 0, i32 2
; CHECK-NEXT:    [[TMP11:%.*]] = sext i16 [[TMP6]] to i64
; CHECK-NEXT:    call void @llvm.x86.tilestored64.internal(i16 [[TMP4]], i16 [[TMP6]], ptr [[TMP10]], i64 [[TMP11]], x86_amx [[TMP9]])
; CHECK-NEXT:    ret void
;
  %4 = load i16, ptr %0, align 64
  %5 = getelementptr inbounds %struct.__tile_str, ptr %0, i64 0, i32 1
  %6 = load i16, ptr %5, align 2
  %7 = shl i64 %2, 32
  %8 = ashr exact i64 %7, 32
  %9 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %4, i16 %6, ptr %1, i64 %8)
  %10 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %9)
  %11 = getelementptr inbounds %struct.__tile_str, ptr %0, i64 0, i32 2
  store <256 x i32> %10, ptr %11, align 64
  ret void
}

define dso_local void @__tile_dpbssd(ptr nocapture %0, ptr nocapture readonly byval(%struct.__tile_str) align 64 %1, ptr nocapture readonly byval(%struct.__tile_str) align 64 %2) local_unnamed_addr {
; CHECK-LABEL: @__tile_dpbssd(
; CHECK-NEXT:    [[TMP4:%.*]] = load i16, ptr [[TMP1:%.*]], align 64
; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TILE_STR:%.*]], ptr [[TMP2:%.*]], i64 0, i32 1
; CHECK-NEXT:    [[TMP6:%.*]] = load i16, ptr [[TMP5]], align 2
; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TILE_STR]], ptr [[TMP1]], i64 0, i32 1
; CHECK-NEXT:    [[TMP8:%.*]] = load i16, ptr [[TMP7]], align 2
; CHECK-NEXT:    [[TMP9:%.*]] = udiv i16 [[TMP8]], 4
; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [[STRUCT___TILE_STR]], ptr [[TMP0:%.*]], i64 0, i32 2
; CHECK-NEXT:    [[TMP11:%.*]] = sext i16 [[TMP6]] to i64
; CHECK-NEXT:    [[TMP12:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[TMP4]], i16 [[TMP6]], ptr [[TMP10]], i64 [[TMP11]])
; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds [[STRUCT___TILE_STR]], ptr [[TMP1]], i64 0, i32 2
; CHECK-NEXT:    [[TMP14:%.*]] = sext i16 [[TMP8]] to i64
; CHECK-NEXT:    [[TMP15:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[TMP4]], i16 [[TMP8]], ptr [[TMP13]], i64 [[TMP14]])
; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TILE_STR]], ptr [[TMP2]], i64 0, i32 2
; CHECK-NEXT:    [[TMP17:%.*]] = sext i16 [[TMP6]] to i64
; CHECK-NEXT:    [[TMP18:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[TMP9]], i16 [[TMP6]], ptr [[TMP16]], i64 [[TMP17]])
; CHECK-NEXT:    [[TMP19:%.*]] = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 [[TMP4]], i16 [[TMP6]], i16 [[TMP8]], x86_amx [[TMP12]], x86_amx [[TMP15]], x86_amx [[TMP18]])
; CHECK-NEXT:    [[TMP20:%.*]] = sext i16 [[TMP6]] to i64
; CHECK-NEXT:    call void @llvm.x86.tilestored64.internal(i16 [[TMP4]], i16 [[TMP6]], ptr [[TMP10]], i64 [[TMP20]], x86_amx [[TMP19]])
; CHECK-NEXT:    ret void
;
  %4 = load i16, ptr %1, align 64
  %5 = getelementptr inbounds %struct.__tile_str, ptr %2, i64 0, i32 1
  %6 = load i16, ptr %5, align 2
  %7 = getelementptr inbounds %struct.__tile_str, ptr %1, i64 0, i32 1
  %8 = load i16, ptr %7, align 2
  %9 = getelementptr inbounds %struct.__tile_str, ptr %0, i64 0, i32 2
  %10 = load <256 x i32>, ptr %9, align 64
  %11 = call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %10)
  %12 = getelementptr inbounds %struct.__tile_str, ptr %1, i64 0, i32 2
  %13 = load <256 x i32>, ptr %12, align 64
  %14 = call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %13)
  %15 = getelementptr inbounds %struct.__tile_str, ptr %2, i64 0, i32 2
  %16 = load <256 x i32>, ptr %15, align 64
  %17 = call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %16)
  %18 = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 %4, i16 %6, i16 %8, x86_amx %11, x86_amx %14, x86_amx %17)
  %19 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %18)
  store <256 x i32> %19, ptr %9, align 64
  ret void
}

define dso_local void @__tile_dpbsud(i16 %m, i16 %n, i16 %k, ptr %pc, ptr %pa, ptr %pb) {
; CHECK-LABEL: @__tile_dpbsud(
; CHECK-NEXT:    [[TMP1:%.*]] = udiv i16 [[K:%.*]], 4
; CHECK-NEXT:    [[TMP2:%.*]] = sext i16 [[K]] to i64
; CHECK-NEXT:    [[TMP3:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[M:%.*]], i16 [[K]], ptr [[PA:%.*]], i64 [[TMP2]])
; CHECK-NEXT:    [[TMP4:%.*]] = sext i16 [[N:%.*]] to i64
; CHECK-NEXT:    [[TMP5:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[TMP1]], i16 [[N]], ptr [[PB:%.*]], i64 [[TMP4]])
; CHECK-NEXT:    [[TMP6:%.*]] = sext i16 [[N]] to i64
; CHECK-NEXT:    [[TMP7:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[M]], i16 [[N]], ptr [[PC:%.*]], i64 [[TMP6]])
; CHECK-NEXT:    [[T6:%.*]] = tail call x86_amx @llvm.x86.tdpbsud.internal(i16 [[M]], i16 [[N]], i16 [[K]], x86_amx [[TMP7]], x86_amx [[TMP3]], x86_amx [[TMP5]])
; CHECK-NEXT:    [[TMP8:%.*]] = sext i16 [[N]] to i64
; CHECK-NEXT:    call void @llvm.x86.tilestored64.internal(i16 [[M]], i16 [[N]], ptr [[PC]], i64 [[TMP8]], x86_amx [[T6]])
; CHECK-NEXT:    ret void
;
  %t0 = load <256 x i32>, ptr %pa, align 64
  %t1 = call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %t0)
  %t2 = load <256 x i32>, ptr %pb, align 64
  %t3 = call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %t2)
  %t4 = load <256 x i32>, ptr %pc, align 64
  %t5 = call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %t4)
  %t6 = tail call x86_amx @llvm.x86.tdpbsud.internal(i16 %m, i16 %n, i16 %k, x86_amx %t5, x86_amx %t1, x86_amx %t3)
  %t7 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %t6)
  store <256 x i32> %t7, ptr %pc, align 64
  ret void
}

define dso_local void @__tile_dpbusd(i16 %m, i16 %n, i16 %k, ptr %pc, ptr %pa, ptr %pb) {
; CHECK-LABEL: @__tile_dpbusd(
; CHECK-NEXT:    [[TMP1:%.*]] = udiv i16 [[K:%.*]], 4
; CHECK-NEXT:    [[TMP2:%.*]] = sext i16 [[K]] to i64
; CHECK-NEXT:    [[TMP3:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[M:%.*]], i16 [[K]], ptr [[PA:%.*]], i64 [[TMP2]])
; CHECK-NEXT:    [[TMP4:%.*]] = sext i16 [[N:%.*]] to i64
; CHECK-NEXT:    [[TMP5:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[TMP1]], i16 [[N]], ptr [[PB:%.*]], i64 [[TMP4]])
; CHECK-NEXT:    [[TMP6:%.*]] = sext i16 [[N]] to i64
; CHECK-NEXT:    [[TMP7:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[M]], i16 [[N]], ptr [[PC:%.*]], i64 [[TMP6]])
; CHECK-NEXT:    [[T6:%.*]] = tail call x86_amx @llvm.x86.tdpbusd.internal(i16 [[M]], i16 [[N]], i16 [[K]], x86_amx [[TMP7]], x86_amx [[TMP3]], x86_amx [[TMP5]])
; CHECK-NEXT:    [[TMP8:%.*]] = sext i16 [[N]] to i64
; CHECK-NEXT:    call void @llvm.x86.tilestored64.internal(i16 [[M]], i16 [[N]], ptr [[PC]], i64 [[TMP8]], x86_amx [[T6]])
; CHECK-NEXT:    ret void
;
  %t0 = load <256 x i32>, ptr %pa, align 64
  %t1 = call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %t0)
  %t2 = load <256 x i32>, ptr %pb, align 64
  %t3 = call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %t2)
  %t4 = load <256 x i32>, ptr %pc, align 64
  %t5 = call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %t4)
  %t6 = tail call x86_amx @llvm.x86.tdpbusd.internal(i16 %m, i16 %n, i16 %k, x86_amx %t5, x86_amx %t1, x86_amx %t3)
  %t7 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %t6)
  store <256 x i32> %t7, ptr %pc, align 64
  ret void
}

define dso_local void @__tile_dpbuud(i16 %m, i16 %n, i16 %k, ptr %pc, ptr %pa, ptr %pb) {
; CHECK-LABEL: @__tile_dpbuud(
; CHECK-NEXT:    [[TMP1:%.*]] = udiv i16 [[K:%.*]], 4
; CHECK-NEXT:    [[TMP2:%.*]] = sext i16 [[K]] to i64
; CHECK-NEXT:    [[TMP3:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[M:%.*]], i16 [[K]], ptr [[PA:%.*]], i64 [[TMP2]])
; CHECK-NEXT:    [[TMP4:%.*]] = sext i16 [[N:%.*]] to i64
; CHECK-NEXT:    [[TMP5:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[TMP1]], i16 [[N]], ptr [[PB:%.*]], i64 [[TMP4]])
; CHECK-NEXT:    [[TMP6:%.*]] = sext i16 [[N]] to i64
; CHECK-NEXT:    [[TMP7:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[M]], i16 [[N]], ptr [[PC:%.*]], i64 [[TMP6]])
; CHECK-NEXT:    [[T6:%.*]] = tail call x86_amx @llvm.x86.tdpbuud.internal(i16 [[M]], i16 [[N]], i16 [[K]], x86_amx [[TMP7]], x86_amx [[TMP3]], x86_amx [[TMP5]])
; CHECK-NEXT:    [[TMP8:%.*]] = sext i16 [[N]] to i64
; CHECK-NEXT:    call void @llvm.x86.tilestored64.internal(i16 [[M]], i16 [[N]], ptr [[PC]], i64 [[TMP8]], x86_amx [[T6]])
; CHECK-NEXT:    ret void
;
  %t0 = load <256 x i32>, ptr %pa, align 64
  %t1 = call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %t0)
  %t2 = load <256 x i32>, ptr %pb, align 64
  %t3 = call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %t2)
  %t4 = load <256 x i32>, ptr %pc, align 64
  %t5 = call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %t4)
  %t6 = tail call x86_amx @llvm.x86.tdpbuud.internal(i16 %m, i16 %n, i16 %k, x86_amx %t5, x86_amx %t1, x86_amx %t3)
  %t7 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %t6)
  store <256 x i32> %t7, ptr %pc, align 64
  ret void
}

define dso_local void @__tile_dpbf16ps(i16 %m, i16 %n, i16 %k, ptr %pc, ptr %pa, ptr %pb) {
; CHECK-LABEL: @__tile_dpbf16ps(
; CHECK-NEXT:    [[TMP1:%.*]] = udiv i16 [[K:%.*]], 4
; CHECK-NEXT:    [[TMP2:%.*]] = sext i16 [[K]] to i64
; CHECK-NEXT:    [[TMP3:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[M:%.*]], i16 [[K]], ptr [[PA:%.*]], i64 [[TMP2]])
; CHECK-NEXT:    [[TMP4:%.*]] = sext i16 [[N:%.*]] to i64
; CHECK-NEXT:    [[TMP5:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[TMP1]], i16 [[N]], ptr [[PB:%.*]], i64 [[TMP4]])
; CHECK-NEXT:    [[TMP6:%.*]] = sext i16 [[N]] to i64
; CHECK-NEXT:    [[TMP7:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[M]], i16 [[N]], ptr [[PC:%.*]], i64 [[TMP6]])
; CHECK-NEXT:    [[T6:%.*]] = tail call x86_amx @llvm.x86.tdpbf16ps.internal(i16 [[M]], i16 [[N]], i16 [[K]], x86_amx [[TMP7]], x86_amx [[TMP3]], x86_amx [[TMP5]])
; CHECK-NEXT:    [[TMP8:%.*]] = sext i16 [[N]] to i64
; CHECK-NEXT:    call void @llvm.x86.tilestored64.internal(i16 [[M]], i16 [[N]], ptr [[PC]], i64 [[TMP8]], x86_amx [[T6]])
; CHECK-NEXT:    ret void
;
  %t0 = load <256 x i32>, ptr %pa, align 64
  %t1 = call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %t0)
  %t2 = load <256 x i32>, ptr %pb, align 64
  %t3 = call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %t2)
  %t4 = load <256 x i32>, ptr %pc, align 64
  %t5 = call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %t4)
  %t6 = tail call x86_amx @llvm.x86.tdpbf16ps.internal(i16 %m, i16 %n, i16 %k, x86_amx %t5, x86_amx %t1, x86_amx %t3)
  %t7 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %t6)
  store <256 x i32> %t7, ptr %pc, align 64
  ret void
}

define dso_local void @__tile_stored(ptr %0, i64 %1, ptr nocapture readonly byval(%struct.__tile_str) align 64 %2) local_unnamed_addr {
; CHECK-LABEL: @__tile_stored(
; CHECK-NEXT:    [[TMP4:%.*]] = load i16, ptr [[TMP2:%.*]], align 64
; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TILE_STR:%.*]], ptr [[TMP2]], i64 0, i32 1
; CHECK-NEXT:    [[TMP6:%.*]] = load i16, ptr [[TMP5]], align 2
; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TILE_STR]], ptr [[TMP2]], i64 0, i32 2
; CHECK-NEXT:    [[TMP8:%.*]] = sext i16 [[TMP6]] to i64
; CHECK-NEXT:    [[TMP9:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[TMP4]], i16 [[TMP6]], ptr [[TMP7]], i64 [[TMP8]])
; CHECK-NEXT:    [[TMP10:%.*]] = shl i64 [[TMP1:%.*]], 32
; CHECK-NEXT:    [[TMP11:%.*]] = ashr exact i64 [[TMP10]], 32
; CHECK-NEXT:    tail call void @llvm.x86.tilestored64.internal(i16 [[TMP4]], i16 [[TMP6]], ptr [[TMP0:%.*]], i64 [[TMP11]], x86_amx [[TMP9]])
; CHECK-NEXT:    ret void
;
  %4 = load i16, ptr %2, align 64
  %5 = getelementptr inbounds %struct.__tile_str, ptr %2, i64 0, i32 1
  %6 = load i16, ptr %5, align 2
  %7 = getelementptr inbounds %struct.__tile_str, ptr %2, i64 0, i32 2
  %8 = load <256 x i32>, ptr %7, align 64
  %9 = call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %8)
  %10 = shl i64 %1, 32
  %11 = ashr exact i64 %10, 32
  tail call void @llvm.x86.tilestored64.internal(i16 %4, i16 %6, ptr %0, i64 %11, x86_amx %9)
  ret void
}

define void @dead_code(ptr%buf, i1 %arg) {
; CHECK-LABEL: @dead_code(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[TMP0:%.*]] = alloca <256 x i32>, align 64
; CHECK-NEXT:    br i1 [[ARG:%.*]], label [[L1:%.*]], label [[L2:%.*]]
; CHECK:       l1:
; CHECK-NEXT:    [[T1:%.*]] = call x86_amx @llvm.x86.tilezero.internal(i16 8, i16 32)
; CHECK-NEXT:    call void @llvm.x86.tilestored64.internal(i16 8, i16 32, ptr [[TMP0]], i64 32, x86_amx [[T1]])
; CHECK-NEXT:    [[TMP1:%.*]] = load <256 x i32>, ptr [[TMP0]], align 1024
; CHECK-NEXT:    br i1 [[ARG]], label [[L2]], label [[EXIT:%.*]]
; CHECK:       l2:
; CHECK-NEXT:    [[T3:%.*]] = phi <256 x i32> [ undef, [[ENTRY:%.*]] ], [ [[TMP1]], [[L1]] ]
; CHECK-NEXT:    store <256 x i32> [[T3]], ptr [[BUF:%.*]], align 1024
; CHECK-NEXT:    br label [[EXIT]]
; CHECK:       exit:
; CHECK-NEXT:    ret void
;
entry:
  br i1 %arg, label %l1, label %l2

l1:
  %t1 = call x86_amx @llvm.x86.tilezero.internal(i16 8, i16 32)
  %t2 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %t1)
  br i1 %arg, label %l2, label %exit

l2:
  %t3 = phi <256 x i32> [ undef, %entry ], [ %t2, %l1 ]
  %t4 = call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %t3)
  %t5 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %t4)
  store <256 x i32> %t5, ptr %buf
  br label %exit

exit:
  ret void
}

declare x86_amx @llvm.x86.tilezero.internal(i16, i16)
declare x86_amx @llvm.x86.tileloadd64.internal(i16, i16, ptr, i64)
declare x86_amx @llvm.x86.tdpbssd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)
declare x86_amx @llvm.x86.tdpbsud.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)
declare x86_amx @llvm.x86.tdpbusd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)
declare x86_amx @llvm.x86.tdpbuud.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)
declare x86_amx @llvm.x86.tdpbf16ps.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)
declare void @llvm.x86.tilestored64.internal(i16, i16, ptr, i64, x86_amx)

declare x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32>)
declare x86_amx @llvm.x86.cast.vector.to.tile.v225i32(<225 x i32>)
declare <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx)
declare <225 x i32> @llvm.x86.cast.tile.to.vector.v225i32(x86_amx)