; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt --codegen-opt-level=2 -mtriple=x86_64 -lower-amx-type %s -S | FileCheck %s

%struct.__tile_str = type { i16, i16, <256 x i32> }

@buf = dso_local global [1024 x i8] zeroinitializer, align 64
@buf2 = dso_local global [1024 x i8] zeroinitializer, align 64

; test bitcast x86_amx to <256 x i32>
define dso_local void @test_user_empty(i16 %m, i16 %n, ptr %buf, i64 %s) {
; CHECK-LABEL: @test_user_empty(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[T1:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[M:%.*]], i16 [[N:%.*]], ptr [[BUF:%.*]], i64 [[S:%.*]])
; CHECK-NEXT:    ret void
;
entry:
  %t1 = call x86_amx @llvm.x86.tileloadd64.internal(i16 %m, i16 %n, ptr %buf, i64 %s)
  %t2 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %t1)
  ret void
}

; test bitcast <256 x i32> to x86_amx
define dso_local void @test_user_empty2(<256 x i32> %in) {
; CHECK-LABEL: @test_user_empty2(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    ret void
;
entry:
  %t = call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %in)
  ret void
}

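; test combining load <256 x i32> and bitcast to x86_amx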
define dso_local <256 x i32> @test_amx_load_bitcast_v256i32(ptr %in, i16 %m, i16 %n, ptr %buf, i64 %s) {
; CHECK-LABEL: @test_amx_load_bitcast_v256i32(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[TMP0:%.*]] = alloca <256 x i32>, align 64
; CHECK-NEXT:    [[T1:%.*]] = load <256 x i32>, ptr [[IN:%.*]], align 64
; CHECK-NEXT:    store <256 x i32> [[T1]], ptr [[TMP0]], align 1024
; CHECK-NEXT:    [[TMP1:%.*]] = sext i16 [[N:%.*]] to i64
; CHECK-NEXT:    [[TMP2:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[M:%.*]], i16 [[N]], ptr [[TMP0]], i64 [[TMP1]])
; CHECK-NEXT:    call void @llvm.x86.tilestored64.internal(i16 [[M]], i16 [[N]], ptr [[BUF:%.*]], i64 [[S:%.*]], x86_amx [[TMP2]])
; CHECK-NEXT:    ret <256 x i32> [[T1]]
;
entry:
  %t1 = load <256 x i32>, ptr %in, align 64
  %t2 = call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %t1)
  call void @llvm.x86.tilestored64.internal(i16 %m, i16 %n, ptr %buf, i64 %s, x86_amx %t2)
  ret <256 x i32> %t1
}

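; test the same combine with a non-full-size <225 x i32> vector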
define dso_local <225 x i32> @test_amx_load_bitcast_v225i32(ptr %in, i16 %m, i16 %n, ptr %buf, i64 %s) {
; CHECK-LABEL: @test_amx_load_bitcast_v225i32(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[TMP0:%.*]] = alloca <225 x i32>, align 64
; CHECK-NEXT:    [[T1:%.*]] = load <225 x i32>, ptr [[IN:%.*]], align 64
; CHECK-NEXT:    store <225 x i32> [[T1]], ptr [[TMP0]], align 1024
; CHECK-NEXT:    [[TMP1:%.*]] = sext i16 [[N:%.*]] to i64
; CHECK-NEXT:    [[TMP2:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[M:%.*]], i16 [[N]], ptr [[TMP0]], i64 [[TMP1]])
; CHECK-NEXT:    call void @llvm.x86.tilestored64.internal(i16 [[M]], i16 [[N]], ptr [[BUF:%.*]], i64 [[S:%.*]], x86_amx [[TMP2]])
; CHECK-NEXT:    ret <225 x i32> [[T1]]
;
entry:
  %t1 = load <225 x i32>, ptr %in, align 64
  %t2 = call x86_amx @llvm.x86.cast.vector.to.tile.v225i32(<225 x i32> %t1)
  call void @llvm.x86.tilestored64.internal(i16 %m, i16 %n, ptr %buf, i64 %s, x86_amx %t2)
  ret <225 x i32> %t1
}

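; test bitcast x86_amx to <256 x i32> followed by a vector store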
define dso_local <256 x i32> @test_amx_bitcast_store(ptr %out, i16 %m, i16 %n, ptr %buf, i64 %s) {
; CHECK-LABEL: @test_amx_bitcast_store(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[TMP0:%.*]] = alloca <256 x i32>, align 64
; CHECK-NEXT:    [[T1:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[M:%.*]], i16 [[M]], ptr [[BUF:%.*]], i64 [[S:%.*]])
; CHECK-NEXT:    [[TMP1:%.*]] = sext i16 [[M]] to i64
; CHECK-NEXT:    call void @llvm.x86.tilestored64.internal(i16 [[M]], i16 [[M]], ptr [[TMP0]], i64 [[TMP1]], x86_amx [[T1]])
; CHECK-NEXT:    [[TMP2:%.*]] = load <256 x i32>, ptr [[TMP0]], align 1024
; CHECK-NEXT:    [[TMP3:%.*]] = sext i16 [[M]] to i64
; CHECK-NEXT:    call void @llvm.x86.tilestored64.internal(i16 [[M]], i16 [[M]], ptr [[OUT:%.*]], i64 [[TMP3]], x86_amx [[T1]])
; CHECK-NEXT:    ret <256 x i32> [[TMP2]]
;
entry:
  %t1 = call x86_amx @llvm.x86.tileloadd64.internal(i16 %m, i16 %m, ptr %buf, i64 %s)
  %t2 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %t1)
  store <256 x i32> %t2, ptr %out
  ret <256 x i32> %t2
}

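; test bitcast to x86_amx whose source is a vector add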
define dso_local void @test_src_add(<256 x i32> %x, <256 x i32> %y, i16 %r, i16 %c, ptr %buf, i64 %s) {
; CHECK-LABEL: @test_src_add(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[TMP0:%.*]] = alloca <256 x i32>, align 64
; CHECK-NEXT:    [[ADD:%.*]] = add <256 x i32> [[Y:%.*]], [[X:%.*]]
; CHECK-NEXT:    store <256 x i32> [[ADD]], ptr [[TMP0]], align 1024
; CHECK-NEXT:    [[TMP1:%.*]] = sext i16 [[C:%.*]] to i64
; CHECK-NEXT:    [[TMP2:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[R:%.*]], i16 [[C]], ptr [[TMP0]], i64 [[TMP1]])
; CHECK-NEXT:    call void @llvm.x86.tilestored64.internal(i16 [[R]], i16 [[C]], ptr [[BUF:%.*]], i64 [[S:%.*]], x86_amx [[TMP2]])
; CHECK-NEXT:    ret void
;
entry:
  %add = add <256 x i32> %y, %x
  %t = call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %add)
  call void @llvm.x86.tilestored64.internal(i16 %r, i16 %c, ptr %buf, i64 %s, x86_amx %t)
  ret void
}

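; test a vector add fed by a bitcast from x86_amx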
define dso_local void @test_src_add2(<256 x i32> %x, i16 %r, i16 %c, ptr %buf, i64 %s) {
; CHECK-LABEL: @test_src_add2(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[TMP0:%.*]] = alloca <256 x i32>, align 64
; CHECK-NEXT:    [[T1:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[R:%.*]], i16 [[C:%.*]], ptr [[BUF:%.*]], i64 [[S:%.*]])
; CHECK-NEXT:    [[TMP1:%.*]] = sext i16 [[C]] to i64
; CHECK-NEXT:    call void @llvm.x86.tilestored64.internal(i16 [[R]], i16 [[C]], ptr [[TMP0]], i64 [[TMP1]], x86_amx [[T1]])
; CHECK-NEXT:    [[TMP2:%.*]] = load <256 x i32>, ptr [[TMP0]], align 1024
; CHECK-NEXT:    [[ADD:%.*]] = add <256 x i32> [[TMP2]], [[X:%.*]]
; CHECK-NEXT:    ret void
;
entry:
  %t1 = call x86_amx @llvm.x86.tileloadd64.internal(i16 %r, i16 %c, ptr %buf, i64 %s)
  %t2 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %t1)
  %add = add <256 x i32> %t2, %x
  ret void
}

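; test tile load lowering through the __tile_str struct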
define dso_local void @__tile_loadd(ptr nocapture %0, ptr %1, i64 %2) local_unnamed_addr {
; CHECK-LABEL: @__tile_loadd(
; CHECK-NEXT:    [[TMP4:%.*]] = load i16, ptr [[TMP0:%.*]], align 64
; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TILE_STR:%.*]], ptr [[TMP0]], i64 0, i32 1
; CHECK-NEXT:    [[TMP6:%.*]] = load i16, ptr [[TMP5]], align 2
; CHECK-NEXT:    [[TMP7:%.*]] = shl i64 [[TMP2:%.*]], 32
; CHECK-NEXT:    [[TMP8:%.*]] = ashr exact i64 [[TMP7]], 32
; CHECK-NEXT:    [[TMP9:%.*]] = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 [[TMP4]], i16 [[TMP6]], ptr [[TMP1:%.*]], i64 [[TMP8]])
; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [[STRUCT___TILE_STR]], ptr [[TMP0]], i64 0, i32 2
; CHECK-NEXT:    [[TMP11:%.*]] = sext i16 [[TMP6]] to i64
; CHECK-NEXT:    call void @llvm.x86.tilestored64.internal(i16 [[TMP4]], i16 [[TMP6]], ptr [[TMP10]], i64 [[TMP11]], x86_amx [[TMP9]])
; CHECK-NEXT:    ret void
;
  %4 = load i16, ptr %0, align 64
  %5 = getelementptr inbounds %struct.__tile_str, ptr %0, i64 0, i32 1
  %6 = load i16, ptr %5, align 2
  %7 = shl i64 %2, 32
  %8 = ashr exact i64 %7, 32
  %9 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %4, i16 %6, ptr %1, i64 %8)
  %10 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %9)
  %11 = getelementptr inbounds %struct.__tile_str, ptr %0, i64 0, i32 2
  store <256 x i32> %10, ptr %11, align 64
  ret void
}

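; test tdpbssd with tile operands passed through __tile_str structs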
define dso_local void @__tile_dpbssd(ptr nocapture %0, ptr nocapture readonly byval(%struct.__tile_str) align 64 %1, ptr nocapture readonly byval(%struct.__tile_str) align 64 %2) local_unnamed_addr {
; CHECK-LABEL: @__tile_dpbssd(
; CHECK-NEXT:    [[TMP4:%.*]] = load i16, ptr [[TMP1:%.*]], align 64
; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TILE_STR:%.*]], ptr [[TMP2:%.*]], i64 0, i32 1
; CHECK-NEXT:    [[TMP6:%.*]] = load i16, ptr [[TMP5]], align 2
; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TILE_STR]], ptr [[TMP1]], i64 0, i32 1
; CHECK-NEXT:    [[TMP8:%.*]] = load i16, ptr [[TMP7]], align 2
; CHECK-NEXT:    [[TMP9:%.*]] = udiv i16 [[TMP8]], 4
; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [[STRUCT___TILE_STR]], ptr [[TMP0:%.*]], i64 0, i32 2
; CHECK-NEXT:    [[TMP11:%.*]] = sext i16 [[TMP6]] to i64
; CHECK-NEXT:    [[TMP12:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[TMP4]], i16 [[TMP6]], ptr [[TMP10]], i64 [[TMP11]])
; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds [[STRUCT___TILE_STR]], ptr [[TMP1]], i64 0, i32 2
; CHECK-NEXT:    [[TMP14:%.*]] = sext i16 [[TMP8]] to i64
; CHECK-NEXT:    [[TMP15:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[TMP4]], i16 [[TMP8]], ptr [[TMP13]], i64 [[TMP14]])
; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TILE_STR]], ptr [[TMP2]], i64 0, i32 2
; CHECK-NEXT:    [[TMP17:%.*]] = sext i16 [[TMP6]] to i64
; CHECK-NEXT:    [[TMP18:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[TMP9]], i16 [[TMP6]], ptr [[TMP16]], i64 [[TMP17]])
; CHECK-NEXT:    [[TMP19:%.*]] = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 [[TMP4]], i16 [[TMP6]], i16 [[TMP8]], x86_amx [[TMP12]], x86_amx [[TMP15]], x86_amx [[TMP18]])
; CHECK-NEXT:    [[TMP20:%.*]] = sext i16 [[TMP6]] to i64
; CHECK-NEXT:    call void @llvm.x86.tilestored64.internal(i16 [[TMP4]], i16 [[TMP6]], ptr [[TMP10]], i64 [[TMP20]], x86_amx [[TMP19]])
; CHECK-NEXT:    ret void
;
  %4 = load i16, ptr %1, align 64
  %5 = getelementptr inbounds %struct.__tile_str, ptr %2, i64 0, i32 1
  %6 = load i16, ptr %5, align 2
  %7 = getelementptr inbounds %struct.__tile_str, ptr %1, i64 0, i32 1
  %8 = load i16, ptr %7, align 2
  %9 = getelementptr inbounds %struct.__tile_str, ptr %0, i64 0, i32 2
  %10 = load <256 x i32>, ptr %9, align 64
  %11 = call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %10)
  %12 = getelementptr inbounds %struct.__tile_str, ptr %1, i64 0, i32 2
  %13 = load <256 x i32>, ptr %12, align 64
  %14 = call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %13)
  %15 = getelementptr inbounds %struct.__tile_str, ptr %2, i64 0, i32 2
  %16 = load <256 x i32>, ptr %15, align 64
  %17 = call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %16)
  %18 = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 %4, i16 %6, i16 %8, x86_amx %11, x86_amx %14, x86_amx %17)
  %19 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %18)
  store <256 x i32> %19, ptr %9, align 64
  ret void
}

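; test tdpbsud with tile operands bitcast from vector loads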
define dso_local void @__tile_dpbsud(i16 %m, i16 %n, i16 %k, ptr %pc, ptr %pa, ptr %pb) {
; CHECK-LABEL: @__tile_dpbsud(
; CHECK-NEXT:    [[TMP1:%.*]] = udiv i16 [[K:%.*]], 4
; CHECK-NEXT:    [[TMP2:%.*]] = sext i16 [[K]] to i64
; CHECK-NEXT:    [[TMP3:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[M:%.*]], i16 [[K]], ptr [[PA:%.*]], i64 [[TMP2]])
; CHECK-NEXT:    [[TMP4:%.*]] = sext i16 [[N:%.*]] to i64
; CHECK-NEXT:    [[TMP5:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[TMP1]], i16 [[N]], ptr [[PB:%.*]], i64 [[TMP4]])
; CHECK-NEXT:    [[TMP6:%.*]] = sext i16 [[N]] to i64
; CHECK-NEXT:    [[TMP7:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[M]], i16 [[N]], ptr [[PC:%.*]], i64 [[TMP6]])
; CHECK-NEXT:    [[T6:%.*]] = tail call x86_amx @llvm.x86.tdpbsud.internal(i16 [[M]], i16 [[N]], i16 [[K]], x86_amx [[TMP7]], x86_amx [[TMP3]], x86_amx [[TMP5]])
; CHECK-NEXT:    [[TMP8:%.*]] = sext i16 [[N]] to i64
; CHECK-NEXT:    call void @llvm.x86.tilestored64.internal(i16 [[M]], i16 [[N]], ptr [[PC]], i64 [[TMP8]], x86_amx [[T6]])
; CHECK-NEXT:    ret void
;
  %t0 = load <256 x i32>, ptr %pa, align 64
  %t1 = call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %t0)
  %t2 = load <256 x i32>, ptr %pb, align 64
  %t3 = call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %t2)
  %t4 = load <256 x i32>, ptr %pc, align 64
  %t5 = call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %t4)
  %t6 = tail call x86_amx @llvm.x86.tdpbsud.internal(i16 %m, i16 %n, i16 %k, x86_amx %t5, x86_amx %t1, x86_amx %t3)
  %t7 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %t6)
  store <256 x i32> %t7, ptr %pc, align 64
  ret void
}

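; test tdpbusd with tile operands bitcast from vector loads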
define dso_local void @__tile_dpbusd(i16 %m, i16 %n, i16 %k, ptr %pc, ptr %pa, ptr %pb) {
; CHECK-LABEL: @__tile_dpbusd(
; CHECK-NEXT:    [[TMP1:%.*]] = udiv i16 [[K:%.*]], 4
; CHECK-NEXT:    [[TMP2:%.*]] = sext i16 [[K]] to i64
; CHECK-NEXT:    [[TMP3:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[M:%.*]], i16 [[K]], ptr [[PA:%.*]], i64 [[TMP2]])
; CHECK-NEXT:    [[TMP4:%.*]] = sext i16 [[N:%.*]] to i64
; CHECK-NEXT:    [[TMP5:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[TMP1]], i16 [[N]], ptr [[PB:%.*]], i64 [[TMP4]])
; CHECK-NEXT:    [[TMP6:%.*]] = sext i16 [[N]] to i64
; CHECK-NEXT:    [[TMP7:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[M]], i16 [[N]], ptr [[PC:%.*]], i64 [[TMP6]])
; CHECK-NEXT:    [[T6:%.*]] = tail call x86_amx @llvm.x86.tdpbusd.internal(i16 [[M]], i16 [[N]], i16 [[K]], x86_amx [[TMP7]], x86_amx [[TMP3]], x86_amx [[TMP5]])
; CHECK-NEXT:    [[TMP8:%.*]] = sext i16 [[N]] to i64
; CHECK-NEXT:    call void @llvm.x86.tilestored64.internal(i16 [[M]], i16 [[N]], ptr [[PC]], i64 [[TMP8]], x86_amx [[T6]])
; CHECK-NEXT:    ret void
;
  %t0 = load <256 x i32>, ptr %pa, align 64
  %t1 = call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %t0)
  %t2 = load <256 x i32>, ptr %pb, align 64
  %t3 = call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %t2)
  %t4 = load <256 x i32>, ptr %pc, align 64
  %t5 = call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %t4)
  %t6 = tail call x86_amx @llvm.x86.tdpbusd.internal(i16 %m, i16 %n, i16 %k, x86_amx %t5, x86_amx %t1, x86_amx %t3)
  %t7 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %t6)
  store <256 x i32> %t7, ptr %pc, align 64
  ret void
}

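; test tdpbuud with tile operands bitcast from vector loads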
define dso_local void @__tile_dpbuud(i16 %m, i16 %n, i16 %k, ptr %pc, ptr %pa, ptr %pb) {
; CHECK-LABEL: @__tile_dpbuud(
; CHECK-NEXT:    [[TMP1:%.*]] = udiv i16 [[K:%.*]], 4
; CHECK-NEXT:    [[TMP2:%.*]] = sext i16 [[K]] to i64
; CHECK-NEXT:    [[TMP3:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[M:%.*]], i16 [[K]], ptr [[PA:%.*]], i64 [[TMP2]])
; CHECK-NEXT:    [[TMP4:%.*]] = sext i16 [[N:%.*]] to i64
; CHECK-NEXT:    [[TMP5:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[TMP1]], i16 [[N]], ptr [[PB:%.*]], i64 [[TMP4]])
; CHECK-NEXT:    [[TMP6:%.*]] = sext i16 [[N]] to i64
; CHECK-NEXT:    [[TMP7:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[M]], i16 [[N]], ptr [[PC:%.*]], i64 [[TMP6]])
; CHECK-NEXT:    [[T6:%.*]] = tail call x86_amx @llvm.x86.tdpbuud.internal(i16 [[M]], i16 [[N]], i16 [[K]], x86_amx [[TMP7]], x86_amx [[TMP3]], x86_amx [[TMP5]])
; CHECK-NEXT:    [[TMP8:%.*]] = sext i16 [[N]] to i64
; CHECK-NEXT:    call void @llvm.x86.tilestored64.internal(i16 [[M]], i16 [[N]], ptr [[PC]], i64 [[TMP8]], x86_amx [[T6]])
; CHECK-NEXT:    ret void
;
  %t0 = load <256 x i32>, ptr %pa, align 64
  %t1 = call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %t0)
  %t2 = load <256 x i32>, ptr %pb, align 64
  %t3 = call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %t2)
  %t4 = load <256 x i32>, ptr %pc, align 64
  %t5 = call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %t4)
  %t6 = tail call x86_amx @llvm.x86.tdpbuud.internal(i16 %m, i16 %n, i16 %k, x86_amx %t5, x86_amx %t1, x86_amx %t3)
  %t7 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %t6)
  store <256 x i32> %t7, ptr %pc, align 64
  ret void
}

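; test tdpbf16ps with tile operands bitcast from vector loads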
define dso_local void @__tile_dpbf16ps(i16 %m, i16 %n, i16 %k, ptr %pc, ptr %pa, ptr %pb) {
; CHECK-LABEL: @__tile_dpbf16ps(
; CHECK-NEXT:    [[TMP1:%.*]] = udiv i16 [[K:%.*]], 4
; CHECK-NEXT:    [[TMP2:%.*]] = sext i16 [[K]] to i64
; CHECK-NEXT:    [[TMP3:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[M:%.*]], i16 [[K]], ptr [[PA:%.*]], i64 [[TMP2]])
; CHECK-NEXT:    [[TMP4:%.*]] = sext i16 [[N:%.*]] to i64
; CHECK-NEXT:    [[TMP5:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[TMP1]], i16 [[N]], ptr [[PB:%.*]], i64 [[TMP4]])
; CHECK-NEXT:    [[TMP6:%.*]] = sext i16 [[N]] to i64
; CHECK-NEXT:    [[TMP7:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[M]], i16 [[N]], ptr [[PC:%.*]], i64 [[TMP6]])
; CHECK-NEXT:    [[T6:%.*]] = tail call x86_amx @llvm.x86.tdpbf16ps.internal(i16 [[M]], i16 [[N]], i16 [[K]], x86_amx [[TMP7]], x86_amx [[TMP3]], x86_amx [[TMP5]])
; CHECK-NEXT:    [[TMP8:%.*]] = sext i16 [[N]] to i64
; CHECK-NEXT:    call void @llvm.x86.tilestored64.internal(i16 [[M]], i16 [[N]], ptr [[PC]], i64 [[TMP8]], x86_amx [[T6]])
; CHECK-NEXT:    ret void
;
  %t0 = load <256 x i32>, ptr %pa, align 64
  %t1 = call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %t0)
  %t2 = load <256 x i32>, ptr %pb, align 64
  %t3 = call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %t2)
  %t4 = load <256 x i32>, ptr %pc, align 64
  %t5 = call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %t4)
  %t6 = tail call x86_amx @llvm.x86.tdpbf16ps.internal(i16 %m, i16 %n, i16 %k, x86_amx %t5, x86_amx %t1, x86_amx %t3)
  %t7 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %t6)
  store <256 x i32> %t7, ptr %pc, align 64
  ret void
}

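; test tile store lowering through the __tile_str struct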
define dso_local void @__tile_stored(ptr %0, i64 %1, ptr nocapture readonly byval(%struct.__tile_str) align 64 %2) local_unnamed_addr {
; CHECK-LABEL: @__tile_stored(
; CHECK-NEXT:    [[TMP4:%.*]] = load i16, ptr [[TMP2:%.*]], align 64
; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TILE_STR:%.*]], ptr [[TMP2]], i64 0, i32 1
; CHECK-NEXT:    [[TMP6:%.*]] = load i16, ptr [[TMP5]], align 2
; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TILE_STR]], ptr [[TMP2]], i64 0, i32 2
; CHECK-NEXT:    [[TMP8:%.*]] = sext i16 [[TMP6]] to i64
; CHECK-NEXT:    [[TMP9:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[TMP4]], i16 [[TMP6]], ptr [[TMP7]], i64 [[TMP8]])
; CHECK-NEXT:    [[TMP10:%.*]] = shl i64 [[TMP1:%.*]], 32
; CHECK-NEXT:    [[TMP11:%.*]] = ashr exact i64 [[TMP10]], 32
; CHECK-NEXT:    tail call void @llvm.x86.tilestored64.internal(i16 [[TMP4]], i16 [[TMP6]], ptr [[TMP0:%.*]], i64 [[TMP11]], x86_amx [[TMP9]])
; CHECK-NEXT:    ret void
;
  %4 = load i16, ptr %2, align 64
  %5 = getelementptr inbounds %struct.__tile_str, ptr %2, i64 0, i32 1
  %6 = load i16, ptr %5, align 2
  %7 = getelementptr inbounds %struct.__tile_str, ptr %2, i64 0, i32 2
  %8 = load <256 x i32>, ptr %7, align 64
  %9 = call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %8)
  %10 = shl i64 %1, 32
  %11 = ashr exact i64 %10, 32
  tail call void @llvm.x86.tilestored64.internal(i16 %4, i16 %6, ptr %0, i64 %11, x86_amx %9)
  ret void
}

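; test that tile/vector casts with dead or phi users across control flow are lowered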
define void @dead_code(ptr %buf, i1 %arg) {
; CHECK-LABEL: @dead_code(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[TMP0:%.*]] = alloca <256 x i32>, align 64
; CHECK-NEXT:    br i1 [[ARG:%.*]], label [[L1:%.*]], label [[L2:%.*]]
; CHECK:       l1:
; CHECK-NEXT:    [[T1:%.*]] = call x86_amx @llvm.x86.tilezero.internal(i16 8, i16 32)
; CHECK-NEXT:    call void @llvm.x86.tilestored64.internal(i16 8, i16 32, ptr [[TMP0]], i64 32, x86_amx [[T1]])
; CHECK-NEXT:    [[TMP1:%.*]] = load <256 x i32>, ptr [[TMP0]], align 1024
; CHECK-NEXT:    br i1 [[ARG]], label [[L2]], label [[EXIT:%.*]]
; CHECK:       l2:
; CHECK-NEXT:    [[T3:%.*]] = phi <256 x i32> [ undef, [[ENTRY:%.*]] ], [ [[TMP1]], [[L1]] ]
; CHECK-NEXT:    store <256 x i32> [[T3]], ptr [[BUF:%.*]], align 1024
; CHECK-NEXT:    br label [[EXIT]]
; CHECK:       exit:
; CHECK-NEXT:    ret void
;
entry:
  br i1 %arg, label %l1, label %l2

l1:
  %t1 = call x86_amx @llvm.x86.tilezero.internal(i16 8, i16 32)
  %t2 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %t1)
  br i1 %arg, label %l2, label %exit

l2:
  %t3 = phi <256 x i32> [ undef, %entry ], [ %t2, %l1 ]
  %t4 = call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %t3)
  %t5 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %t4)
  store <256 x i32> %t5, ptr %buf
  br label %exit

exit:
  ret void
}

declare x86_amx @llvm.x86.tilezero.internal(i16, i16)
declare x86_amx @llvm.x86.tileloadd64.internal(i16, i16, ptr, i64)
declare x86_amx @llvm.x86.tdpbssd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)
declare x86_amx @llvm.x86.tdpbsud.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)
declare x86_amx @llvm.x86.tdpbusd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)
declare x86_amx @llvm.x86.tdpbuud.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)
declare x86_amx @llvm.x86.tdpbf16ps.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)
declare void @llvm.x86.tilestored64.internal(i16, i16, ptr, i64, x86_amx)

declare x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32>)
declare x86_amx @llvm.x86.cast.vector.to.tile.v225i32(<225 x i32>)
declare <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx)
declare <225 x i32> @llvm.x86.cast.tile.to.vector.v225i32(x86_amx)