; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt --codegen-opt-level=2 -mtriple=x86_64 -lower-amx-type %s -S | FileCheck %s

%struct.__tile_str = type { i16, i16, <256 x i32> }

@buf = dso_local global [1024 x i8] zeroinitializer, align 64
@buf2 = dso_local global [1024 x i8] zeroinitializer, align 64

; test bitcast x86_amx to <256 x i32>
define dso_local void @test_user_empty(i16 %m, i16 %n, ptr %buf, i64 %s) {
; CHECK-LABEL: @test_user_empty(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[T1:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[M:%.*]], i16 [[N:%.*]], ptr [[BUF:%.*]], i64 [[S:%.*]])
; CHECK-NEXT:    ret void
;
entry:
  %t1 = call x86_amx @llvm.x86.tileloadd64.internal(i16 %m, i16 %n, ptr %buf, i64 %s)
  %t2 = bitcast x86_amx %t1 to <256 x i32>
  ret void
}

; test bitcast <256 x i32> to x86_amx
define dso_local void @test_user_empty2(<256 x i32> %in) {
; CHECK-LABEL: @test_user_empty2(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    ret void
;
entry:
  %t = bitcast <256 x i32> %in to x86_amx
  ret void
}

; test load <256 x i32> followed by bitcast to x86_amx
define dso_local <256 x i32> @test_amx_load_bitcast(ptr %in, i16 %m, i16 %n, ptr %buf, i64 %s) {
; CHECK-LABEL: @test_amx_load_bitcast(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[T1:%.*]] = load <256 x i32>, ptr [[IN:%.*]], align 64
; CHECK-NEXT:    [[TMP0:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[M:%.*]], i16 [[N:%.*]], ptr [[IN]], i64 64)
; CHECK-NEXT:    call void @llvm.x86.tilestored64.internal(i16 [[M]], i16 [[N]], ptr [[BUF:%.*]], i64 [[S:%.*]], x86_amx [[TMP0]])
; CHECK-NEXT:    ret <256 x i32> [[T1]]
;
entry:
  %t1 = load <256 x i32>, ptr %in, align 64
  %t2 = bitcast <256 x i32> %t1 to x86_amx
  call void @llvm.x86.tilestored64.internal(i16 %m, i16 %n, ptr %buf, i64 %s, x86_amx %t2)
  ret <256 x i32> %t1
}

; test bitcast x86_amx to <256 x i32> followed by store
define dso_local <256 x i32> @test_amx_bitcast_store(ptr %out, i16 %m, i16 %n, ptr %buf, i64 %s) {
; CHECK-LABEL: @test_amx_bitcast_store(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[T1:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[M:%.*]], i16 [[M]], ptr [[BUF:%.*]], i64 [[S:%.*]])
; CHECK-NEXT:    call void @llvm.x86.tilestored64.internal(i16 [[M]], i16 [[M]], ptr [[OUT:%.*]], i64 64, x86_amx [[T1]])
; CHECK-NEXT:    [[TMP0:%.*]] = load <256 x i32>, ptr [[OUT]], align 1024
; CHECK-NEXT:    ret <256 x i32> [[TMP0]]
;
entry:
  %t1 = call x86_amx @llvm.x86.tileloadd64.internal(i16 %m, i16 %m, ptr %buf, i64 %s)
  %t2 = bitcast x86_amx %t1 to <256 x i32>
  store <256 x i32> %t2, ptr %out
  ret <256 x i32> %t2
}

; test add feeding a bitcast to x86_amx; the sum is stored to an alloca and reloaded as a tile
define dso_local void @test_src_add(<256 x i32> %x, <256 x i32> %y, i16 %r, i16 %c, ptr %buf, i64 %s) {
; CHECK-LABEL: @test_src_add(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[TMP0:%.*]] = alloca <256 x i32>, align 64
; CHECK-NEXT:    [[ADD:%.*]] = add <256 x i32> [[Y:%.*]], [[X:%.*]]
; CHECK-NEXT:    store <256 x i32> [[ADD]], ptr [[TMP0]], align 1024
; CHECK-NEXT:    [[TMP1:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[R:%.*]], i16 [[C:%.*]], ptr [[TMP0]], i64 64)
; CHECK-NEXT:    call void @llvm.x86.tilestored64.internal(i16 [[R]], i16 [[C]], ptr [[BUF:%.*]], i64 [[S:%.*]], x86_amx [[TMP1]])
; CHECK-NEXT:    ret void
;
entry:
  %add = add <256 x i32> %y, %x
  %t = bitcast <256 x i32> %add to x86_amx
  call void @llvm.x86.tilestored64.internal(i16 %r, i16 %c, ptr %buf, i64 %s, x86_amx %t)
  ret void
}

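; test add user of a bitcast from x86_amx; the tile is stored to an alloca and reloaded as a vector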
define dso_local void @test_src_add2(<256 x i32> %x, i16 %r, i16 %c, ptr %buf, i64 %s) {
; CHECK-LABEL: @test_src_add2(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[TMP0:%.*]] = alloca <256 x i32>, align 64
; CHECK-NEXT:    [[T1:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[R:%.*]], i16 [[C:%.*]], ptr [[BUF:%.*]], i64 [[S:%.*]])
; CHECK-NEXT:    call void @llvm.x86.tilestored64.internal(i16 [[R]], i16 [[C]], ptr [[TMP0]], i64 64, x86_amx [[T1]])
; CHECK-NEXT:    [[TMP1:%.*]] = load <256 x i32>, ptr [[TMP0]], align 1024
; CHECK-NEXT:    [[ADD:%.*]] = add <256 x i32> [[TMP1]], [[X:%.*]]
; CHECK-NEXT:    ret void
;
entry:
  %t1 = call x86_amx @llvm.x86.tileloadd64.internal(i16 %r, i16 %c, ptr %buf, i64 %s)
  %t2 = bitcast x86_amx %t1 to <256 x i32>
  %add = add <256 x i32> %t2, %x
  ret void
}

; test plain vector load/store with no x86_amx use; left unchanged
define dso_local void @test_load(ptr %in, ptr %out) local_unnamed_addr {
; CHECK-LABEL: @test_load(
; CHECK-NEXT:    [[TMP1:%.*]] = load <256 x i32>, ptr [[IN:%.*]], align 64
; CHECK-NEXT:    store <256 x i32> [[TMP1]], ptr [[OUT:%.*]], align 64
; CHECK-NEXT:    ret void
;
  %1 = load <256 x i32>, ptr %in, align 64
  store <256 x i32> %1, ptr %out, align 64
  ret void
}

; test byval <256 x i32> arguments with no x86_amx use; left unchanged
define dso_local <256 x i32> @foo(ptr nocapture readonly byval(<256 x i32>) align 1024 %0, ptr nocapture readonly byval(<256 x i32>) align 1024 %1) local_unnamed_addr {
; CHECK-LABEL: @foo(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[X:%.*]] = load <256 x i32>, ptr [[TMP0:%.*]], align 1024
; CHECK-NEXT:    [[Y:%.*]] = load <256 x i32>, ptr [[TMP1:%.*]], align 1024
; CHECK-NEXT:    [[ADD:%.*]] = add <256 x i32> [[Y]], [[X]]
; CHECK-NEXT:    ret <256 x i32> [[ADD]]
;
entry:
  %x = load <256 x i32>, ptr %0, align 1024
  %y = load <256 x i32>, ptr %1, align 1024
  %add = add <256 x i32> %y, %x
  ret <256 x i32> %add
}

; test tileloadd with the result stored into a %struct.__tile_str field
define dso_local void @__tile_loadd(ptr nocapture %0, ptr %1, i64 %2) local_unnamed_addr {
; CHECK-LABEL: @__tile_loadd(
; CHECK-NEXT:    [[TMP4:%.*]] = load i16, ptr [[TMP0:%.*]], align 64
; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TILE_STR:%.*]], ptr [[TMP0]], i64 0, i32 1
; CHECK-NEXT:    [[TMP6:%.*]] = load i16, ptr [[TMP5]], align 2
; CHECK-NEXT:    [[TMP7:%.*]] = shl i64 [[TMP2:%.*]], 32
; CHECK-NEXT:    [[TMP8:%.*]] = ashr exact i64 [[TMP7]], 32
; CHECK-NEXT:    [[TMP9:%.*]] = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 [[TMP4]], i16 [[TMP6]], ptr [[TMP1:%.*]], i64 [[TMP8]])
; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [[STRUCT___TILE_STR]], ptr [[TMP0]], i64 0, i32 2
; CHECK-NEXT:    call void @llvm.x86.tilestored64.internal(i16 [[TMP4]], i16 [[TMP6]], ptr [[TMP10]], i64 64, x86_amx [[TMP9]])
; CHECK-NEXT:    ret void
;
  %4 = load i16, ptr %0, align 64
  %5 = getelementptr inbounds %struct.__tile_str, ptr %0, i64 0, i32 1
  %6 = load i16, ptr %5, align 2
  %7 = shl i64 %2, 32
  %8 = ashr exact i64 %7, 32
  %9 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %4, i16 %6, ptr %1, i64 %8)
  %10 = bitcast x86_amx %9 to <256 x i32>
  %11 = getelementptr inbounds %struct.__tile_str, ptr %0, i64 0, i32 2
  store <256 x i32> %10, ptr %11, align 64
  ret void
}

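; test tdpbssd with tile operands loaded from and stored back to %struct.__tile_str fields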
define dso_local void @__tile_dpbssd(ptr nocapture %0, ptr nocapture readonly byval(%struct.__tile_str) align 64 %1, ptr nocapture readonly byval(%struct.__tile_str) align 64 %2) local_unnamed_addr {
; CHECK-LABEL: @__tile_dpbssd(
; CHECK-NEXT:    [[TMP4:%.*]] = load i16, ptr [[TMP1:%.*]], align 64
; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TILE_STR:%.*]], ptr [[TMP2:%.*]], i64 0, i32 1
; CHECK-NEXT:    [[TMP6:%.*]] = load i16, ptr [[TMP5]], align 2
; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TILE_STR]], ptr [[TMP1]], i64 0, i32 1
; CHECK-NEXT:    [[TMP8:%.*]] = load i16, ptr [[TMP7]], align 2
; CHECK-NEXT:    [[TMP9:%.*]] = udiv i16 [[TMP8]], 4
; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [[STRUCT___TILE_STR]], ptr [[TMP0:%.*]], i64 0, i32 2
; CHECK-NEXT:    [[TMP11:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[TMP4]], i16 [[TMP6]], ptr [[TMP10]], i64 64)
; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [[STRUCT___TILE_STR]], ptr [[TMP1]], i64 0, i32 2
; CHECK-NEXT:    [[TMP13:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[TMP4]], i16 [[TMP8]], ptr [[TMP12]], i64 64)
; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [[STRUCT___TILE_STR]], ptr [[TMP2]], i64 0, i32 2
; CHECK-NEXT:    [[TMP15:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[TMP9]], i16 [[TMP6]], ptr [[TMP14]], i64 64)
; CHECK-NEXT:    [[TMP16:%.*]] = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 [[TMP4]], i16 [[TMP6]], i16 [[TMP8]], x86_amx [[TMP11]], x86_amx [[TMP13]], x86_amx [[TMP15]])
; CHECK-NEXT:    call void @llvm.x86.tilestored64.internal(i16 [[TMP4]], i16 [[TMP6]], ptr [[TMP10]], i64 64, x86_amx [[TMP16]])
; CHECK-NEXT:    ret void
;
  %4 = load i16, ptr %1, align 64
  %5 = getelementptr inbounds %struct.__tile_str, ptr %2, i64 0, i32 1
  %6 = load i16, ptr %5, align 2
  %7 = getelementptr inbounds %struct.__tile_str, ptr %1, i64 0, i32 1
  %8 = load i16, ptr %7, align 2
  %9 = getelementptr inbounds %struct.__tile_str, ptr %0, i64 0, i32 2
  %10 = load <256 x i32>, ptr %9, align 64
  %11 = bitcast <256 x i32> %10 to x86_amx
  %12 = getelementptr inbounds %struct.__tile_str, ptr %1, i64 0, i32 2
  %13 = load <256 x i32>, ptr %12, align 64
  %14 = bitcast <256 x i32> %13 to x86_amx
  %15 = getelementptr inbounds %struct.__tile_str, ptr %2, i64 0, i32 2
  %16 = load <256 x i32>, ptr %15, align 64
  %17 = bitcast <256 x i32> %16 to x86_amx
  %18 = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 %4, i16 %6, i16 %8, x86_amx %11, x86_amx %14, x86_amx %17)
  %19 = bitcast x86_amx %18 to <256 x i32>
  store <256 x i32> %19, ptr %9, align 64
  ret void
}

; test tdpbsud with tiles loaded from plain pointers
define dso_local void @__tile_dpbsud(i16 %m, i16 %n, i16 %k, ptr %pc, ptr %pa, ptr %pb) {
; CHECK-LABEL: @__tile_dpbsud(
; CHECK-NEXT:    [[TMP1:%.*]] = udiv i16 [[K:%.*]], 4
; CHECK-NEXT:    [[TMP2:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[M:%.*]], i16 [[K]], ptr [[PA:%.*]], i64 64)
; CHECK-NEXT:    [[TMP3:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[TMP1]], i16 [[N:%.*]], ptr [[PB:%.*]], i64 64)
; CHECK-NEXT:    [[TMP4:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[M]], i16 [[N]], ptr [[PC:%.*]], i64 64)
; CHECK-NEXT:    [[T6:%.*]] = tail call x86_amx @llvm.x86.tdpbsud.internal(i16 [[M]], i16 [[N]], i16 [[K]], x86_amx [[TMP4]], x86_amx [[TMP2]], x86_amx [[TMP3]])
; CHECK-NEXT:    call void @llvm.x86.tilestored64.internal(i16 [[M]], i16 [[N]], ptr [[PC]], i64 64, x86_amx [[T6]])
; CHECK-NEXT:    ret void
;
  %t0 = load <256 x i32>, ptr %pa, align 64
  %t1 = bitcast <256 x i32> %t0 to x86_amx
  %t2 = load <256 x i32>, ptr %pb, align 64
  %t3 = bitcast <256 x i32> %t2 to x86_amx
  %t4 = load <256 x i32>, ptr %pc, align 64
  %t5 = bitcast <256 x i32> %t4 to x86_amx
  %t6 = tail call x86_amx @llvm.x86.tdpbsud.internal(i16 %m, i16 %n, i16 %k, x86_amx %t5, x86_amx %t1, x86_amx %t3)
  %t7 = bitcast x86_amx %t6 to <256 x i32>
  store <256 x i32> %t7, ptr %pc, align 64
  ret void
}

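; test tdpbusd: same lowering pattern as tdpbsud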
define dso_local void @__tile_dpbusd(i16 %m, i16 %n, i16 %k, ptr %pc, ptr %pa, ptr %pb) {
; CHECK-LABEL: @__tile_dpbusd(
; CHECK-NEXT:    [[TMP1:%.*]] = udiv i16 [[K:%.*]], 4
; CHECK-NEXT:    [[TMP2:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[M:%.*]], i16 [[K]], ptr [[PA:%.*]], i64 64)
; CHECK-NEXT:    [[TMP3:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[TMP1]], i16 [[N:%.*]], ptr [[PB:%.*]], i64 64)
; CHECK-NEXT:    [[TMP4:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[M]], i16 [[N]], ptr [[PC:%.*]], i64 64)
; CHECK-NEXT:    [[T6:%.*]] = tail call x86_amx @llvm.x86.tdpbusd.internal(i16 [[M]], i16 [[N]], i16 [[K]], x86_amx [[TMP4]], x86_amx [[TMP2]], x86_amx [[TMP3]])
; CHECK-NEXT:    call void @llvm.x86.tilestored64.internal(i16 [[M]], i16 [[N]], ptr [[PC]], i64 64, x86_amx [[T6]])
; CHECK-NEXT:    ret void
;
  %t0 = load <256 x i32>, ptr %pa, align 64
  %t1 = bitcast <256 x i32> %t0 to x86_amx
  %t2 = load <256 x i32>, ptr %pb, align 64
  %t3 = bitcast <256 x i32> %t2 to x86_amx
  %t4 = load <256 x i32>, ptr %pc, align 64
  %t5 = bitcast <256 x i32> %t4 to x86_amx
  %t6 = tail call x86_amx @llvm.x86.tdpbusd.internal(i16 %m, i16 %n, i16 %k, x86_amx %t5, x86_amx %t1, x86_amx %t3)
  %t7 = bitcast x86_amx %t6 to <256 x i32>
  store <256 x i32> %t7, ptr %pc, align 64
  ret void
}

; test tdpbuud: same lowering pattern as tdpbsud
define dso_local void @__tile_dpbuud(i16 %m, i16 %n, i16 %k, ptr %pc, ptr %pa, ptr %pb) {
; CHECK-LABEL: @__tile_dpbuud(
; CHECK-NEXT:    [[TMP1:%.*]] = udiv i16 [[K:%.*]], 4
; CHECK-NEXT:    [[TMP2:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[M:%.*]], i16 [[K]], ptr [[PA:%.*]], i64 64)
; CHECK-NEXT:    [[TMP3:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[TMP1]], i16 [[N:%.*]], ptr [[PB:%.*]], i64 64)
; CHECK-NEXT:    [[TMP4:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[M]], i16 [[N]], ptr [[PC:%.*]], i64 64)
; CHECK-NEXT:    [[T6:%.*]] = tail call x86_amx @llvm.x86.tdpbuud.internal(i16 [[M]], i16 [[N]], i16 [[K]], x86_amx [[TMP4]], x86_amx [[TMP2]], x86_amx [[TMP3]])
; CHECK-NEXT:    call void @llvm.x86.tilestored64.internal(i16 [[M]], i16 [[N]], ptr [[PC]], i64 64, x86_amx [[T6]])
; CHECK-NEXT:    ret void
;
  %t0 = load <256 x i32>, ptr %pa, align 64
  %t1 = bitcast <256 x i32> %t0 to x86_amx
  %t2 = load <256 x i32>, ptr %pb, align 64
  %t3 = bitcast <256 x i32> %t2 to x86_amx
  %t4 = load <256 x i32>, ptr %pc, align 64
  %t5 = bitcast <256 x i32> %t4 to x86_amx
  %t6 = tail call x86_amx @llvm.x86.tdpbuud.internal(i16 %m, i16 %n, i16 %k, x86_amx %t5, x86_amx %t1, x86_amx %t3)
  %t7 = bitcast x86_amx %t6 to <256 x i32>
  store <256 x i32> %t7, ptr %pc, align 64
  ret void
}

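; test tdpbf16ps: bf16 variant of the same dot-product lowering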
define dso_local void @__tile_dpbf16ps(i16 %m, i16 %n, i16 %k, ptr %pc, ptr %pa, ptr %pb) {
; CHECK-LABEL: @__tile_dpbf16ps(
; CHECK-NEXT:    [[TMP1:%.*]] = udiv i16 [[K:%.*]], 4
; CHECK-NEXT:    [[TMP2:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[M:%.*]], i16 [[K]], ptr [[PA:%.*]], i64 64)
; CHECK-NEXT:    [[TMP3:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[TMP1]], i16 [[N:%.*]], ptr [[PB:%.*]], i64 64)
; CHECK-NEXT:    [[TMP4:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[M]], i16 [[N]], ptr [[PC:%.*]], i64 64)
; CHECK-NEXT:    [[T6:%.*]] = tail call x86_amx @llvm.x86.tdpbf16ps.internal(i16 [[M]], i16 [[N]], i16 [[K]], x86_amx [[TMP4]], x86_amx [[TMP2]], x86_amx [[TMP3]])
; CHECK-NEXT:    call void @llvm.x86.tilestored64.internal(i16 [[M]], i16 [[N]], ptr [[PC]], i64 64, x86_amx [[T6]])
; CHECK-NEXT:    ret void
;
  %t0 = load <256 x i32>, ptr %pa, align 64
  %t1 = bitcast <256 x i32> %t0 to x86_amx
  %t2 = load <256 x i32>, ptr %pb, align 64
  %t3 = bitcast <256 x i32> %t2 to x86_amx
  %t4 = load <256 x i32>, ptr %pc, align 64
  %t5 = bitcast <256 x i32> %t4 to x86_amx
  %t6 = tail call x86_amx @llvm.x86.tdpbf16ps.internal(i16 %m, i16 %n, i16 %k, x86_amx %t5, x86_amx %t1, x86_amx %t3)
  %t7 = bitcast x86_amx %t6 to <256 x i32>
  store <256 x i32> %t7, ptr %pc, align 64
  ret void
}

; test tilestored with the tile loaded from a %struct.__tile_str field
define dso_local void @__tile_stored(ptr %0, i64 %1, ptr nocapture readonly byval(%struct.__tile_str) align 64 %2) local_unnamed_addr {
; CHECK-LABEL: @__tile_stored(
; CHECK-NEXT:    [[TMP4:%.*]] = load i16, ptr [[TMP2:%.*]], align 64
; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TILE_STR:%.*]], ptr [[TMP2]], i64 0, i32 1
; CHECK-NEXT:    [[TMP6:%.*]] = load i16, ptr [[TMP5]], align 2
; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TILE_STR]], ptr [[TMP2]], i64 0, i32 2
; CHECK-NEXT:    [[TMP8:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[TMP4]], i16 [[TMP6]], ptr [[TMP7]], i64 64)
; CHECK-NEXT:    [[TMP9:%.*]] = shl i64 [[TMP1:%.*]], 32
; CHECK-NEXT:    [[TMP10:%.*]] = ashr exact i64 [[TMP9]], 32
; CHECK-NEXT:    tail call void @llvm.x86.tilestored64.internal(i16 [[TMP4]], i16 [[TMP6]], ptr [[TMP0:%.*]], i64 [[TMP10]], x86_amx [[TMP8]])
; CHECK-NEXT:    ret void
;
  %4 = load i16, ptr %2, align 64
  %5 = getelementptr inbounds %struct.__tile_str, ptr %2, i64 0, i32 1
  %6 = load i16, ptr %5, align 2
  %7 = getelementptr inbounds %struct.__tile_str, ptr %2, i64 0, i32 2
  %8 = load <256 x i32>, ptr %7, align 64
  %9 = bitcast <256 x i32> %8 to x86_amx
  %10 = shl i64 %1, 32
  %11 = ashr exact i64 %10, 32
  tail call void @llvm.x86.tilestored64.internal(i16 %4, i16 %6, ptr %0, i64 %11, x86_amx %9)
  ret void
}

declare x86_amx @llvm.x86.tileloadd64.internal(i16, i16, ptr, i64)
declare x86_amx @llvm.x86.tdpbssd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)
declare x86_amx @llvm.x86.tdpbsud.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)
declare x86_amx @llvm.x86.tdpbusd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)
declare x86_amx @llvm.x86.tdpbuud.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)
declare x86_amx @llvm.x86.tdpbf16ps.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)
declare void @llvm.x86.tilestored64.internal(i16, i16, ptr, i64, x86_amx)