; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt --codegen-opt-level=2 -mtriple=x86_64 -lower-amx-type %s -S | FileCheck %s

%struct.__tile_str = type { i16, i16, <256 x i32> }

@buf = dso_local global [1024 x i8] zeroinitializer, align 64
@buf2 = dso_local global [1024 x i8] zeroinitializer, align 64

; test that a dead bitcast from x86_amx to <256 x i32> is removed
define dso_local void @test_user_empty(i16 %m, i16 %n, ptr %buf, i64 %s) {
; CHECK-LABEL: @test_user_empty(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[T1:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[M:%.*]], i16 [[N:%.*]], ptr [[BUF:%.*]], i64 [[S:%.*]])
; CHECK-NEXT:    ret void
;
entry:
  %t1 = call x86_amx @llvm.x86.tileloadd64.internal(i16 %m, i16 %n, ptr %buf, i64 %s)
  %t2 = bitcast x86_amx %t1 to <256 x i32>
  ret void
}

; test that a dead bitcast from <256 x i32> to x86_amx is removed
define dso_local void @test_user_empty2(<256 x i32> %in) {
; CHECK-LABEL: @test_user_empty2(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    ret void
;
entry:
  %t = bitcast <256 x i32> %in to x86_amx
  ret void
}

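; test that a <256 x i32> load + bitcast to x86_amx is combined into a direct
; tile load from the same pointer; the vector load survives for the vector return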
define dso_local <256 x i32> @test_amx_load_bitcast(ptr %in, i16 %m, i16 %n, ptr %buf, i64 %s) {
; CHECK-LABEL: @test_amx_load_bitcast(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[T1:%.*]] = load <256 x i32>, ptr [[IN:%.*]], align 64
; CHECK-NEXT:    [[TMP0:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[M:%.*]], i16 [[N:%.*]], ptr [[IN]], i64 64)
; CHECK-NEXT:    call void @llvm.x86.tilestored64.internal(i16 [[M]], i16 [[N]], ptr [[BUF:%.*]], i64 [[S:%.*]], x86_amx [[TMP0]])
; CHECK-NEXT:    ret <256 x i32> [[T1]]
;
entry:
  %t1 = load <256 x i32>, ptr %in, align 64
  %t2 = bitcast <256 x i32> %t1 to x86_amx
  call void @llvm.x86.tilestored64.internal(i16 %m, i16 %n, ptr %buf, i64 %s, x86_amx %t2)
  ret <256 x i32> %t1
}

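; test that a bitcast from x86_amx + store is combined into a direct tile store;
; the returned vector is reloaded from the stored memory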
define dso_local <256 x i32> @test_amx_bitcast_store(ptr %out, i16 %m, i16 %n, ptr %buf, i64 %s) {
; CHECK-LABEL: @test_amx_bitcast_store(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[T1:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[M:%.*]], i16 [[M]], ptr [[BUF:%.*]], i64 [[S:%.*]])
; CHECK-NEXT:    call void @llvm.x86.tilestored64.internal(i16 [[M]], i16 [[M]], ptr [[OUT:%.*]], i64 64, x86_amx [[T1]])
; CHECK-NEXT:    [[TMP0:%.*]] = load <256 x i32>, ptr [[OUT]], align 1024
; CHECK-NEXT:    ret <256 x i32> [[TMP0]]
;
entry:
  %t1 = call x86_amx @llvm.x86.tileloadd64.internal(i16 %m, i16 %m, ptr %buf, i64 %s)
  %t2 = bitcast x86_amx %t1 to <256 x i32>
  store <256 x i32> %t2, ptr %out
  ret <256 x i32> %t2
}

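; test that a bitcast whose source is vector arithmetic goes through a stack
; slot: the add result is stored to an alloca and reloaded as a tile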
define dso_local void @test_src_add(<256 x i32> %x, <256 x i32> %y, i16 %r, i16 %c, ptr %buf, i64 %s) {
; CHECK-LABEL: @test_src_add(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[TMP0:%.*]] = alloca <256 x i32>, align 64
; CHECK-NEXT:    [[ADD:%.*]] = add <256 x i32> [[Y:%.*]], [[X:%.*]]
; CHECK-NEXT:    store <256 x i32> [[ADD]], ptr [[TMP0]], align 1024
; CHECK-NEXT:    [[TMP1:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[R:%.*]], i16 [[C:%.*]], ptr [[TMP0]], i64 64)
; CHECK-NEXT:    call void @llvm.x86.tilestored64.internal(i16 [[R]], i16 [[C]], ptr [[BUF:%.*]], i64 [[S:%.*]], x86_amx [[TMP1]])
; CHECK-NEXT:    ret void
;
entry:
  %add = add <256 x i32> %y, %x
  %t = bitcast <256 x i32> %add to x86_amx
  call void @llvm.x86.tilestored64.internal(i16 %r, i16 %c, ptr %buf, i64 %s, x86_amx %t)
  ret void
}

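; test the reverse: a tile result consumed by vector arithmetic is tile-stored
; to a stack slot and reloaded as <256 x i32>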
define dso_local void @test_src_add2(<256 x i32> %x, i16 %r, i16 %c, ptr %buf, i64 %s) {
; CHECK-LABEL: @test_src_add2(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[TMP0:%.*]] = alloca <256 x i32>, align 64
; CHECK-NEXT:    [[T1:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[R:%.*]], i16 [[C:%.*]], ptr [[BUF:%.*]], i64 [[S:%.*]])
; CHECK-NEXT:    call void @llvm.x86.tilestored64.internal(i16 [[R]], i16 [[C]], ptr [[TMP0]], i64 64, x86_amx [[T1]])
; CHECK-NEXT:    [[TMP1:%.*]] = load <256 x i32>, ptr [[TMP0]], align 1024
; CHECK-NEXT:    [[ADD:%.*]] = add <256 x i32> [[TMP1]], [[X:%.*]]
; CHECK-NEXT:    ret void
;
entry:
  %t1 = call x86_amx @llvm.x86.tileloadd64.internal(i16 %r, i16 %c, ptr %buf, i64 %s)
  %t2 = bitcast x86_amx %t1 to <256 x i32>
  %add = add <256 x i32> %t2, %x
  ret void
}

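; test that plain <256 x i32> load/store with no x86_amx users is left unchanged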
define dso_local void @test_load(ptr %in, ptr %out) local_unnamed_addr {
; CHECK-LABEL: @test_load(
; CHECK-NEXT:    [[TMP1:%.*]] = load <256 x i32>, ptr [[IN:%.*]], align 64
; CHECK-NEXT:    store <256 x i32> [[TMP1]], ptr [[OUT:%.*]], align 64
; CHECK-NEXT:    ret void
;
  %1 = load <256 x i32>, ptr %in, align 64
  store <256 x i32> %1, ptr %out, align 64
  ret void
}

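; test that byval vector arguments and vector arithmetic without x86_amx uses are not touched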
define dso_local <256 x i32> @foo(ptr nocapture readonly byval(<256 x i32>) align 1024 %0, ptr nocapture readonly byval(<256 x i32>) align 1024 %1) local_unnamed_addr {
; CHECK-LABEL: @foo(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[X:%.*]] = load <256 x i32>, ptr [[TMP0:%.*]], align 1024
; CHECK-NEXT:    [[Y:%.*]] = load <256 x i32>, ptr [[TMP1:%.*]], align 1024
; CHECK-NEXT:    [[ADD:%.*]] = add <256 x i32> [[Y]], [[X]]
; CHECK-NEXT:    ret <256 x i32> [[ADD]]
;
entry:
  %x = load <256 x i32>, ptr %0, align 1024
  %y = load <256 x i32>, ptr %1, align 1024
  %add = add <256 x i32> %y, %x
  ret <256 x i32> %add
}

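; test __tile_loadd: the bitcast of the loaded tile and the store into the
; struct's tile field are combined into a tile store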
define dso_local void @__tile_loadd(ptr nocapture %0, ptr %1, i64 %2) local_unnamed_addr {
; CHECK-LABEL: @__tile_loadd(
; CHECK-NEXT:    [[TMP4:%.*]] = load i16, ptr [[TMP0:%.*]], align 64
; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TILE_STR:%.*]], ptr [[TMP0]], i64 0, i32 1
; CHECK-NEXT:    [[TMP6:%.*]] = load i16, ptr [[TMP5]], align 2
; CHECK-NEXT:    [[TMP7:%.*]] = shl i64 [[TMP2:%.*]], 32
; CHECK-NEXT:    [[TMP8:%.*]] = ashr exact i64 [[TMP7]], 32
; CHECK-NEXT:    [[TMP9:%.*]] = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 [[TMP4]], i16 [[TMP6]], ptr [[TMP1:%.*]], i64 [[TMP8]])
; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [[STRUCT___TILE_STR]], ptr [[TMP0]], i64 0, i32 2
; CHECK-NEXT:    call void @llvm.x86.tilestored64.internal(i16 [[TMP4]], i16 [[TMP6]], ptr [[TMP10]], i64 64, x86_amx [[TMP9]])
; CHECK-NEXT:    ret void
;
  %4 = load i16, ptr %0, align 64
  %5 = getelementptr inbounds %struct.__tile_str, ptr %0, i64 0, i32 1
  %6 = load i16, ptr %5, align 2
  %7 = shl i64 %2, 32
  %8 = ashr exact i64 %7, 32
  %9 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %4, i16 %6, ptr %1, i64 %8)
  %10 = bitcast x86_amx %9 to <256 x i32>
  %11 = getelementptr inbounds %struct.__tile_str, ptr %0, i64 0, i32 2
  store <256 x i32> %10, ptr %11, align 64
  ret void
}

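; test __tile_dpbssd: each vector load + bitcast operand becomes a tile load
; (the b operand's tile load uses k/4 rows) and the product is tile-stored back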
define dso_local void @__tile_dpbssd(ptr nocapture %0, ptr nocapture readonly byval(%struct.__tile_str) align 64 %1, ptr nocapture readonly byval(%struct.__tile_str) align 64 %2) local_unnamed_addr {
; CHECK-LABEL: @__tile_dpbssd(
; CHECK-NEXT:    [[TMP4:%.*]] = load i16, ptr [[TMP1:%.*]], align 64
; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TILE_STR:%.*]], ptr [[TMP2:%.*]], i64 0, i32 1
; CHECK-NEXT:    [[TMP6:%.*]] = load i16, ptr [[TMP5]], align 2
; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TILE_STR]], ptr [[TMP1]], i64 0, i32 1
; CHECK-NEXT:    [[TMP8:%.*]] = load i16, ptr [[TMP7]], align 2
; CHECK-NEXT:    [[TMP9:%.*]] = udiv i16 [[TMP8]], 4
; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [[STRUCT___TILE_STR]], ptr [[TMP0:%.*]], i64 0, i32 2
; CHECK-NEXT:    [[TMP11:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[TMP4]], i16 [[TMP6]], ptr [[TMP10]], i64 64)
; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [[STRUCT___TILE_STR]], ptr [[TMP1]], i64 0, i32 2
; CHECK-NEXT:    [[TMP13:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[TMP4]], i16 [[TMP8]], ptr [[TMP12]], i64 64)
; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [[STRUCT___TILE_STR]], ptr [[TMP2]], i64 0, i32 2
; CHECK-NEXT:    [[TMP15:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[TMP9]], i16 [[TMP6]], ptr [[TMP14]], i64 64)
; CHECK-NEXT:    [[TMP16:%.*]] = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 [[TMP4]], i16 [[TMP6]], i16 [[TMP8]], x86_amx [[TMP11]], x86_amx [[TMP13]], x86_amx [[TMP15]])
; CHECK-NEXT:    call void @llvm.x86.tilestored64.internal(i16 [[TMP4]], i16 [[TMP6]], ptr [[TMP10]], i64 64, x86_amx [[TMP16]])
; CHECK-NEXT:    ret void
;
  %4 = load i16, ptr %1, align 64
  %5 = getelementptr inbounds %struct.__tile_str, ptr %2, i64 0, i32 1
  %6 = load i16, ptr %5, align 2
  %7 = getelementptr inbounds %struct.__tile_str, ptr %1, i64 0, i32 1
  %8 = load i16, ptr %7, align 2
  %9 = getelementptr inbounds %struct.__tile_str, ptr %0, i64 0, i32 2
  %10 = load <256 x i32>, ptr %9, align 64
  %11 = bitcast <256 x i32> %10 to x86_amx
  %12 = getelementptr inbounds %struct.__tile_str, ptr %1, i64 0, i32 2
  %13 = load <256 x i32>, ptr %12, align 64
  %14 = bitcast <256 x i32> %13 to x86_amx
  %15 = getelementptr inbounds %struct.__tile_str, ptr %2, i64 0, i32 2
  %16 = load <256 x i32>, ptr %15, align 64
  %17 = bitcast <256 x i32> %16 to x86_amx
  %18 = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 %4, i16 %6, i16 %8, x86_amx %11, x86_amx %14, x86_amx %17)
  %19 = bitcast x86_amx %18 to <256 x i32>
  store <256 x i32> %19, ptr %9, align 64
  ret void
}

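; test the same load/compute/store lowering for the tdpbsud variant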
define dso_local void @__tile_dpbsud(i16 %m, i16 %n, i16 %k, ptr %pc, ptr %pa, ptr %pb) {
; CHECK-LABEL: @__tile_dpbsud(
; CHECK-NEXT:    [[TMP1:%.*]] = udiv i16 [[K:%.*]], 4
; CHECK-NEXT:    [[TMP2:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[M:%.*]], i16 [[K]], ptr [[PA:%.*]], i64 64)
; CHECK-NEXT:    [[TMP3:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[TMP1]], i16 [[N:%.*]], ptr [[PB:%.*]], i64 64)
; CHECK-NEXT:    [[TMP4:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[M]], i16 [[N]], ptr [[PC:%.*]], i64 64)
; CHECK-NEXT:    [[T6:%.*]] = tail call x86_amx @llvm.x86.tdpbsud.internal(i16 [[M]], i16 [[N]], i16 [[K]], x86_amx [[TMP4]], x86_amx [[TMP2]], x86_amx [[TMP3]])
; CHECK-NEXT:    call void @llvm.x86.tilestored64.internal(i16 [[M]], i16 [[N]], ptr [[PC]], i64 64, x86_amx [[T6]])
; CHECK-NEXT:    ret void
;
  %t0 = load <256 x i32>, ptr %pa, align 64
  %t1 = bitcast <256 x i32> %t0 to x86_amx
  %t2 = load <256 x i32>, ptr %pb, align 64
  %t3 = bitcast <256 x i32> %t2 to x86_amx
  %t4 = load <256 x i32>, ptr %pc, align 64
  %t5 = bitcast <256 x i32> %t4 to x86_amx
  %t6 = tail call x86_amx @llvm.x86.tdpbsud.internal(i16 %m, i16 %n, i16 %k, x86_amx %t5, x86_amx %t1, x86_amx %t3)
  %t7 = bitcast x86_amx %t6 to <256 x i32>
  store <256 x i32> %t7, ptr %pc, align 64
  ret void
}

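; test the same load/compute/store lowering for the tdpbusd variant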
define dso_local void @__tile_dpbusd(i16 %m, i16 %n, i16 %k, ptr %pc, ptr %pa, ptr %pb) {
; CHECK-LABEL: @__tile_dpbusd(
; CHECK-NEXT:    [[TMP1:%.*]] = udiv i16 [[K:%.*]], 4
; CHECK-NEXT:    [[TMP2:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[M:%.*]], i16 [[K]], ptr [[PA:%.*]], i64 64)
; CHECK-NEXT:    [[TMP3:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[TMP1]], i16 [[N:%.*]], ptr [[PB:%.*]], i64 64)
; CHECK-NEXT:    [[TMP4:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[M]], i16 [[N]], ptr [[PC:%.*]], i64 64)
; CHECK-NEXT:    [[T6:%.*]] = tail call x86_amx @llvm.x86.tdpbusd.internal(i16 [[M]], i16 [[N]], i16 [[K]], x86_amx [[TMP4]], x86_amx [[TMP2]], x86_amx [[TMP3]])
; CHECK-NEXT:    call void @llvm.x86.tilestored64.internal(i16 [[M]], i16 [[N]], ptr [[PC]], i64 64, x86_amx [[T6]])
; CHECK-NEXT:    ret void
;
  %t0 = load <256 x i32>, ptr %pa, align 64
  %t1 = bitcast <256 x i32> %t0 to x86_amx
  %t2 = load <256 x i32>, ptr %pb, align 64
  %t3 = bitcast <256 x i32> %t2 to x86_amx
  %t4 = load <256 x i32>, ptr %pc, align 64
  %t5 = bitcast <256 x i32> %t4 to x86_amx
  %t6 = tail call x86_amx @llvm.x86.tdpbusd.internal(i16 %m, i16 %n, i16 %k, x86_amx %t5, x86_amx %t1, x86_amx %t3)
  %t7 = bitcast x86_amx %t6 to <256 x i32>
  store <256 x i32> %t7, ptr %pc, align 64
  ret void
}

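; test the same load/compute/store lowering for the tdpbuud variant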
define dso_local void @__tile_dpbuud(i16 %m, i16 %n, i16 %k, ptr %pc, ptr %pa, ptr %pb) {
; CHECK-LABEL: @__tile_dpbuud(
; CHECK-NEXT:    [[TMP1:%.*]] = udiv i16 [[K:%.*]], 4
; CHECK-NEXT:    [[TMP2:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[M:%.*]], i16 [[K]], ptr [[PA:%.*]], i64 64)
; CHECK-NEXT:    [[TMP3:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[TMP1]], i16 [[N:%.*]], ptr [[PB:%.*]], i64 64)
; CHECK-NEXT:    [[TMP4:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[M]], i16 [[N]], ptr [[PC:%.*]], i64 64)
; CHECK-NEXT:    [[T6:%.*]] = tail call x86_amx @llvm.x86.tdpbuud.internal(i16 [[M]], i16 [[N]], i16 [[K]], x86_amx [[TMP4]], x86_amx [[TMP2]], x86_amx [[TMP3]])
; CHECK-NEXT:    call void @llvm.x86.tilestored64.internal(i16 [[M]], i16 [[N]], ptr [[PC]], i64 64, x86_amx [[T6]])
; CHECK-NEXT:    ret void
;
  %t0 = load <256 x i32>, ptr %pa, align 64
  %t1 = bitcast <256 x i32> %t0 to x86_amx
  %t2 = load <256 x i32>, ptr %pb, align 64
  %t3 = bitcast <256 x i32> %t2 to x86_amx
  %t4 = load <256 x i32>, ptr %pc, align 64
  %t5 = bitcast <256 x i32> %t4 to x86_amx
  %t6 = tail call x86_amx @llvm.x86.tdpbuud.internal(i16 %m, i16 %n, i16 %k, x86_amx %t5, x86_amx %t1, x86_amx %t3)
  %t7 = bitcast x86_amx %t6 to <256 x i32>
  store <256 x i32> %t7, ptr %pc, align 64
  ret void
}

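; test the same load/compute/store lowering for the tdpbf16ps variant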
define dso_local void @__tile_dpbf16ps(i16 %m, i16 %n, i16 %k, ptr %pc, ptr %pa, ptr %pb) {
; CHECK-LABEL: @__tile_dpbf16ps(
; CHECK-NEXT:    [[TMP1:%.*]] = udiv i16 [[K:%.*]], 4
; CHECK-NEXT:    [[TMP2:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[M:%.*]], i16 [[K]], ptr [[PA:%.*]], i64 64)
; CHECK-NEXT:    [[TMP3:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[TMP1]], i16 [[N:%.*]], ptr [[PB:%.*]], i64 64)
; CHECK-NEXT:    [[TMP4:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[M]], i16 [[N]], ptr [[PC:%.*]], i64 64)
; CHECK-NEXT:    [[T6:%.*]] = tail call x86_amx @llvm.x86.tdpbf16ps.internal(i16 [[M]], i16 [[N]], i16 [[K]], x86_amx [[TMP4]], x86_amx [[TMP2]], x86_amx [[TMP3]])
; CHECK-NEXT:    call void @llvm.x86.tilestored64.internal(i16 [[M]], i16 [[N]], ptr [[PC]], i64 64, x86_amx [[T6]])
; CHECK-NEXT:    ret void
;
  %t0 = load <256 x i32>, ptr %pa, align 64
  %t1 = bitcast <256 x i32> %t0 to x86_amx
  %t2 = load <256 x i32>, ptr %pb, align 64
  %t3 = bitcast <256 x i32> %t2 to x86_amx
  %t4 = load <256 x i32>, ptr %pc, align 64
  %t5 = bitcast <256 x i32> %t4 to x86_amx
  %t6 = tail call x86_amx @llvm.x86.tdpbf16ps.internal(i16 %m, i16 %n, i16 %k, x86_amx %t5, x86_amx %t1, x86_amx %t3)
  %t7 = bitcast x86_amx %t6 to <256 x i32>
  store <256 x i32> %t7, ptr %pc, align 64
  ret void
}

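; test __tile_stored: the vector load + bitcast of the tile field becomes a
; tile load feeding the user's tile store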
define dso_local void @__tile_stored(ptr %0, i64 %1, ptr nocapture readonly byval(%struct.__tile_str) align 64 %2) local_unnamed_addr {
; CHECK-LABEL: @__tile_stored(
; CHECK-NEXT:    [[TMP4:%.*]] = load i16, ptr [[TMP2:%.*]], align 64
; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TILE_STR:%.*]], ptr [[TMP2]], i64 0, i32 1
; CHECK-NEXT:    [[TMP6:%.*]] = load i16, ptr [[TMP5]], align 2
; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TILE_STR]], ptr [[TMP2]], i64 0, i32 2
; CHECK-NEXT:    [[TMP8:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[TMP4]], i16 [[TMP6]], ptr [[TMP7]], i64 64)
; CHECK-NEXT:    [[TMP9:%.*]] = shl i64 [[TMP1:%.*]], 32
; CHECK-NEXT:    [[TMP10:%.*]] = ashr exact i64 [[TMP9]], 32
; CHECK-NEXT:    tail call void @llvm.x86.tilestored64.internal(i16 [[TMP4]], i16 [[TMP6]], ptr [[TMP0:%.*]], i64 [[TMP10]], x86_amx [[TMP8]])
; CHECK-NEXT:    ret void
;
  %4 = load i16, ptr %2, align 64
  %5 = getelementptr inbounds %struct.__tile_str, ptr %2, i64 0, i32 1
  %6 = load i16, ptr %5, align 2
  %7 = getelementptr inbounds %struct.__tile_str, ptr %2, i64 0, i32 2
  %8 = load <256 x i32>, ptr %7, align 64
  %9 = bitcast <256 x i32> %8 to x86_amx
  %10 = shl i64 %1, 32
  %11 = ashr exact i64 %10, 32
  tail call void @llvm.x86.tilestored64.internal(i16 %4, i16 %6, ptr %0, i64 %11, x86_amx %9)
  ret void
}

declare x86_amx @llvm.x86.tileloadd64.internal(i16, i16, ptr, i64)
declare x86_amx @llvm.x86.tdpbssd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)
declare x86_amx @llvm.x86.tdpbsud.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)
declare x86_amx @llvm.x86.tdpbusd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)
declare x86_amx @llvm.x86.tdpbuud.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)
declare x86_amx @llvm.x86.tdpbf16ps.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)
declare void @llvm.x86.tilestored64.internal(i16, i16, ptr, i64, x86_amx)