xref: /llvm-project/llvm/test/CodeGen/X86/AMX/amx-low-intrinsics.ll (revision 2f448bf509432c1a19ec46ab8cbc7353c03c6280)
1; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
2; RUN: opt -mtriple=x86_64 -lower-amx-intrinsics -enable-x86-scalar-amx=true %s -S | FileCheck %s
3
4define dso_local void @test_amx_load_non_O0(i16 signext %row, i16 signext %col, ptr%ptr, i64 %stride, ptr %vptr) {
5; CHECK-LABEL: @test_amx_load_non_O0(
6; CHECK-NEXT:  entry:
7; CHECK-NEXT:    [[TMP0:%.*]] = lshr i16 [[COL:%.*]], 2
8; CHECK-NEXT:    [[TMP1:%.*]] = lshr i64 [[STRIDE:%.*]], 2
9; CHECK-NEXT:    br label [[TILELOAD_SCALARIZE_ROWS_HEADER:%.*]]
10; CHECK:       tileload.scalarize.rows.header:
11; CHECK-NEXT:    [[TILELOAD_SCALARIZE_ROWS_IV:%.*]] = phi i16 [ 0, [[ENTRY:%.*]] ], [ [[TILELOAD_SCALARIZE_ROWS_STEP:%.*]], [[TILELOAD_SCALARIZE_ROWS_LATCH:%.*]] ]
12; CHECK-NEXT:    [[VEC_PHI_ROW:%.*]] = phi <256 x i32> [ zeroinitializer, [[ENTRY]] ], [ [[TMP11:%.*]], [[TILELOAD_SCALARIZE_ROWS_LATCH]] ]
13; CHECK-NEXT:    br label [[TILELOAD_SCALARIZE_ROWS_BODY:%.*]]
14; CHECK:       tileload.scalarize.rows.body:
15; CHECK-NEXT:    br label [[TILELOAD_SCALARIZE_COLS_HEADER:%.*]]
16; CHECK:       tileload.scalarize.cols.header:
17; CHECK-NEXT:    [[TILELOAD_SCALARIZE_COLS_IV:%.*]] = phi i16 [ 0, [[TILELOAD_SCALARIZE_ROWS_BODY]] ], [ [[TILELOAD_SCALARIZE_COLS_STEP:%.*]], [[TILELOAD_SCALARIZE_COLS_LATCH:%.*]] ]
18; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <256 x i32> [ [[VEC_PHI_ROW]], [[TILELOAD_SCALARIZE_ROWS_BODY]] ], [ [[TMP11]], [[TILELOAD_SCALARIZE_COLS_LATCH]] ]
19; CHECK-NEXT:    br label [[TILELOAD_SCALARIZE_COLS_BODY:%.*]]
20; CHECK:       tileload.scalarize.cols.body:
21; CHECK-NEXT:    [[TMP2:%.*]] = zext i16 [[TILELOAD_SCALARIZE_ROWS_IV]] to i64
22; CHECK-NEXT:    [[TMP3:%.*]] = zext i16 [[TILELOAD_SCALARIZE_COLS_IV]] to i64
23; CHECK-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP2]], [[TMP1]]
24; CHECK-NEXT:    [[TMP5:%.*]] = add i64 [[TMP4]], [[TMP3]]
25; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr i32, ptr [[PTR:%.*]], i64 [[TMP5]]
26; CHECK-NEXT:    [[TMP8:%.*]] = mul i16 [[TILELOAD_SCALARIZE_ROWS_IV]], 16
27; CHECK-NEXT:    [[TMP9:%.*]] = add i16 [[TMP8]], [[TILELOAD_SCALARIZE_COLS_IV]]
28; CHECK-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP7]], align 4
29; CHECK-NEXT:    [[TMP11]] = insertelement <256 x i32> [[VEC_PHI]], i32 [[TMP10]], i16 [[TMP9]]
30; CHECK-NEXT:    br label [[TILELOAD_SCALARIZE_COLS_LATCH]]
31; CHECK:       tileload.scalarize.cols.latch:
32; CHECK-NEXT:    [[TILELOAD_SCALARIZE_COLS_STEP]] = add i16 [[TILELOAD_SCALARIZE_COLS_IV]], 1
33; CHECK-NEXT:    [[TILELOAD_SCALARIZE_COLS_COND:%.*]] = icmp ne i16 [[TILELOAD_SCALARIZE_COLS_STEP]], [[TMP0]]
34; CHECK-NEXT:    br i1 [[TILELOAD_SCALARIZE_COLS_COND]], label [[TILELOAD_SCALARIZE_COLS_HEADER]], label [[TILELOAD_SCALARIZE_ROWS_LATCH]]
35; CHECK:       tileload.scalarize.rows.latch:
36; CHECK-NEXT:    [[TILELOAD_SCALARIZE_ROWS_STEP]] = add i16 [[TILELOAD_SCALARIZE_ROWS_IV]], 1
37; CHECK-NEXT:    [[TILELOAD_SCALARIZE_ROWS_COND:%.*]] = icmp ne i16 [[TILELOAD_SCALARIZE_ROWS_STEP]], [[ROW:%.*]]
38; CHECK-NEXT:    br i1 [[TILELOAD_SCALARIZE_ROWS_COND]], label [[TILELOAD_SCALARIZE_ROWS_HEADER]], label [[CONTINUE:%.*]]
39; CHECK:       continue:
40; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <256 x i32> [[TMP11]] to x86_amx
41; CHECK-NEXT:    store <256 x i32> [[TMP11]], ptr [[VPTR:%.*]], align 64
42; CHECK-NEXT:    ret void
43;
44entry:
45  %amx = call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 %col, ptr %ptr, i64 %stride)
46  %vec = bitcast x86_amx %amx to <256 x i32>
47  store <256 x i32> %vec, ptr %vptr, align 64
48  ret void
49}
50
51define dso_local void @test_amx_load(i16 signext %row, i16 signext %col, ptr%ptr, i64 %stride, ptr %vptr) #0 {
52; CHECK-LABEL: @test_amx_load(
53; CHECK-NEXT:  entry:
54; CHECK-NEXT:    [[TMP0:%.*]] = lshr i16 [[COL:%.*]], 2
55; CHECK-NEXT:    [[TMP1:%.*]] = lshr i64 [[STRIDE:%.*]], 2
56; CHECK-NEXT:    br label [[TILELOAD_SCALARIZE_ROWS_HEADER:%.*]]
57; CHECK:       tileload.scalarize.rows.header:
58; CHECK-NEXT:    [[TILELOAD_SCALARIZE_ROWS_IV:%.*]] = phi i16 [ 0, [[ENTRY:%.*]] ], [ [[TILELOAD_SCALARIZE_ROWS_STEP:%.*]], [[TILELOAD_SCALARIZE_ROWS_LATCH:%.*]] ]
59; CHECK-NEXT:    [[VEC_PHI_ROW:%.*]] = phi <256 x i32> [ zeroinitializer, [[ENTRY]] ], [ [[TMP11:%.*]], [[TILELOAD_SCALARIZE_ROWS_LATCH]] ]
60; CHECK-NEXT:    br label [[TILELOAD_SCALARIZE_ROWS_BODY:%.*]]
61; CHECK:       tileload.scalarize.rows.body:
62; CHECK-NEXT:    br label [[TILELOAD_SCALARIZE_COLS_HEADER:%.*]]
63; CHECK:       tileload.scalarize.cols.header:
64; CHECK-NEXT:    [[TILELOAD_SCALARIZE_COLS_IV:%.*]] = phi i16 [ 0, [[TILELOAD_SCALARIZE_ROWS_BODY]] ], [ [[TILELOAD_SCALARIZE_COLS_STEP:%.*]], [[TILELOAD_SCALARIZE_COLS_LATCH:%.*]] ]
65; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <256 x i32> [ [[VEC_PHI_ROW]], [[TILELOAD_SCALARIZE_ROWS_BODY]] ], [ [[TMP11]], [[TILELOAD_SCALARIZE_COLS_LATCH]] ]
66; CHECK-NEXT:    br label [[TILELOAD_SCALARIZE_COLS_BODY:%.*]]
67; CHECK:       tileload.scalarize.cols.body:
68; CHECK-NEXT:    [[TMP2:%.*]] = zext i16 [[TILELOAD_SCALARIZE_ROWS_IV]] to i64
69; CHECK-NEXT:    [[TMP3:%.*]] = zext i16 [[TILELOAD_SCALARIZE_COLS_IV]] to i64
70; CHECK-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP2]], [[TMP1]]
71; CHECK-NEXT:    [[TMP5:%.*]] = add i64 [[TMP4]], [[TMP3]]
72; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr i32, ptr [[PTR:%.*]], i64 [[TMP5]]
73; CHECK-NEXT:    [[TMP8:%.*]] = mul i16 [[TILELOAD_SCALARIZE_ROWS_IV]], 16
74; CHECK-NEXT:    [[TMP9:%.*]] = add i16 [[TMP8]], [[TILELOAD_SCALARIZE_COLS_IV]]
75; CHECK-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP7]], align 4
76; CHECK-NEXT:    [[TMP11]] = insertelement <256 x i32> [[VEC_PHI]], i32 [[TMP10]], i16 [[TMP9]]
77; CHECK-NEXT:    br label [[TILELOAD_SCALARIZE_COLS_LATCH]]
78; CHECK:       tileload.scalarize.cols.latch:
79; CHECK-NEXT:    [[TILELOAD_SCALARIZE_COLS_STEP]] = add i16 [[TILELOAD_SCALARIZE_COLS_IV]], 1
80; CHECK-NEXT:    [[TILELOAD_SCALARIZE_COLS_COND:%.*]] = icmp ne i16 [[TILELOAD_SCALARIZE_COLS_STEP]], [[TMP0]]
81; CHECK-NEXT:    br i1 [[TILELOAD_SCALARIZE_COLS_COND]], label [[TILELOAD_SCALARIZE_COLS_HEADER]], label [[TILELOAD_SCALARIZE_ROWS_LATCH]]
82; CHECK:       tileload.scalarize.rows.latch:
83; CHECK-NEXT:    [[TILELOAD_SCALARIZE_ROWS_STEP]] = add i16 [[TILELOAD_SCALARIZE_ROWS_IV]], 1
84; CHECK-NEXT:    [[TILELOAD_SCALARIZE_ROWS_COND:%.*]] = icmp ne i16 [[TILELOAD_SCALARIZE_ROWS_STEP]], [[ROW:%.*]]
85; CHECK-NEXT:    br i1 [[TILELOAD_SCALARIZE_ROWS_COND]], label [[TILELOAD_SCALARIZE_ROWS_HEADER]], label [[CONTINUE:%.*]]
86; CHECK:       continue:
87; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <256 x i32> [[TMP11]] to x86_amx
88; CHECK-NEXT:    store <256 x i32> [[TMP11]], ptr [[VPTR:%.*]], align 64
89; CHECK-NEXT:    ret void
90;
91entry:
92  %amx = call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 %col, ptr %ptr, i64 %stride)
93  %vec = bitcast x86_amx %amx to <256 x i32>
94  store <256 x i32> %vec, ptr %vptr, align 64
95  ret void
96}
97
98define dso_local void @test_amx_dpbssd(i16 signext %row, i16 signext %col, i16 signext %k, <256 x i32> %c, <256 x i32> %a, <256 x i32> %b, ptr %vptr) #0 {
99; CHECK-LABEL: @test_amx_dpbssd(
100; CHECK-NEXT:  entry:
101; CHECK-NEXT:    [[A_AMX:%.*]] = bitcast <256 x i32> [[A:%.*]] to x86_amx
102; CHECK-NEXT:    [[B_AMX:%.*]] = bitcast <256 x i32> [[B:%.*]] to x86_amx
103; CHECK-NEXT:    [[C_AMX:%.*]] = bitcast <256 x i32> [[C:%.*]] to x86_amx
104; CHECK-NEXT:    [[TMP0:%.*]] = lshr i16 [[COL:%.*]], 2
105; CHECK-NEXT:    [[TMP1:%.*]] = lshr i16 [[K:%.*]], 2
106; CHECK-NEXT:    br label [[TILEDPBSSD_SCALARIZE_ROWS_HEADER:%.*]]
107; CHECK:       tiledpbssd.scalarize.rows.header:
108; CHECK-NEXT:    [[TILEDPBSSD_SCALARIZE_ROWS_IV:%.*]] = phi i16 [ 0, [[ENTRY:%.*]] ], [ [[TILEDPBSSD_SCALARIZE_ROWS_STEP:%.*]], [[TILEDPBSSD_SCALARIZE_ROWS_LATCH:%.*]] ]
109; CHECK-NEXT:    [[VEC_C_PHI_ROW:%.*]] = phi <256 x i32> [ [[C]], [[ENTRY]] ], [ [[TMP18:%.*]], [[TILEDPBSSD_SCALARIZE_ROWS_LATCH]] ]
110; CHECK-NEXT:    [[VEC_D_PHI_ROW:%.*]] = phi <256 x i32> [ zeroinitializer, [[ENTRY]] ], [ [[TMP20:%.*]], [[TILEDPBSSD_SCALARIZE_ROWS_LATCH]] ]
111; CHECK-NEXT:    br label [[TILEDPBSSD_SCALARIZE_ROWS_BODY:%.*]]
112; CHECK:       tiledpbssd.scalarize.rows.body:
113; CHECK-NEXT:    br label [[TILEDPBSSD_SCALARIZE_COLS_HEADER:%.*]]
114; CHECK:       tiledpbssd.scalarize.cols.header:
115; CHECK-NEXT:    [[TILEDPBSSD_SCALARIZE_COLS_IV:%.*]] = phi i16 [ 0, [[TILEDPBSSD_SCALARIZE_ROWS_BODY]] ], [ [[TILEDPBSSD_SCALARIZE_COLS_STEP:%.*]], [[TILEDPBSSD_SCALARIZE_COLS_LATCH:%.*]] ]
116; CHECK-NEXT:    [[VEC_C_PHI_COL:%.*]] = phi <256 x i32> [ [[VEC_C_PHI_ROW]], [[TILEDPBSSD_SCALARIZE_ROWS_BODY]] ], [ [[TMP18]], [[TILEDPBSSD_SCALARIZE_COLS_LATCH]] ]
117; CHECK-NEXT:    [[VEC_D_PHI_COL:%.*]] = phi <256 x i32> [ [[VEC_D_PHI_ROW]], [[TILEDPBSSD_SCALARIZE_ROWS_BODY]] ], [ [[TMP20]], [[TILEDPBSSD_SCALARIZE_COLS_LATCH]] ]
118; CHECK-NEXT:    [[TMP2:%.*]] = mul i16 [[TILEDPBSSD_SCALARIZE_ROWS_IV]], 16
119; CHECK-NEXT:    [[TMP3:%.*]] = add i16 [[TMP2]], [[TILEDPBSSD_SCALARIZE_COLS_IV]]
120; CHECK-NEXT:    br label [[TILEDPBSSD_SCALARIZE_COLS_BODY:%.*]]
121; CHECK:       tiledpbssd.scalarize.cols.body:
122; CHECK-NEXT:    br label [[TILEDPBSSD_SCALARIZE_INNER_HEADER:%.*]]
123; CHECK:       tiledpbssd.scalarize.inner.header:
124; CHECK-NEXT:    [[TILEDPBSSD_SCALARIZE_INNER_IV:%.*]] = phi i16 [ 0, [[TILEDPBSSD_SCALARIZE_COLS_BODY]] ], [ [[TILEDPBSSD_SCALARIZE_INNER_STEP:%.*]], [[TILEDPBSSD_SCALARIZE_INNER_LATCH:%.*]] ]
125; CHECK-NEXT:    [[VEC_C_INNER_PHI:%.*]] = phi <256 x i32> [ [[VEC_C_PHI_COL]], [[TILEDPBSSD_SCALARIZE_COLS_BODY]] ], [ [[TMP18]], [[TILEDPBSSD_SCALARIZE_INNER_LATCH]] ]
126; CHECK-NEXT:    br label [[TILEDPBSSD_SCALARIZE_INNER_BODY:%.*]]
127; CHECK:       tiledpbssd.scalarize.inner.body:
128; CHECK-NEXT:    [[TMP4:%.*]] = mul i16 [[TILEDPBSSD_SCALARIZE_ROWS_IV]], 16
129; CHECK-NEXT:    [[TMP5:%.*]] = add i16 [[TMP4]], [[TILEDPBSSD_SCALARIZE_INNER_IV]]
130; CHECK-NEXT:    [[TMP6:%.*]] = mul i16 [[TILEDPBSSD_SCALARIZE_INNER_IV]], 16
131; CHECK-NEXT:    [[TMP7:%.*]] = add i16 [[TMP6]], [[TILEDPBSSD_SCALARIZE_COLS_IV]]
132; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <256 x i32> [[VEC_C_INNER_PHI]], i16 [[TMP3]]
133; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <256 x i32> [[A]], i16 [[TMP5]]
134; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i32 [[TMP9]] to <4 x i8>
135; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <256 x i32> [[B]], i16 [[TMP7]]
136; CHECK-NEXT:    [[TMP12:%.*]] = bitcast i32 [[TMP11]] to <4 x i8>
137; CHECK-NEXT:    [[TMP13:%.*]] = sext <4 x i8> [[TMP12]] to <4 x i32>
138; CHECK-NEXT:    [[TMP14:%.*]] = sext <4 x i8> [[TMP10]] to <4 x i32>
139; CHECK-NEXT:    [[TMP15:%.*]] = mul <4 x i32> [[TMP14]], [[TMP13]]
140; CHECK-NEXT:    [[TMP16:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP15]])
141; CHECK-NEXT:    [[TMP17:%.*]] = add i32 [[TMP8]], [[TMP16]]
142; CHECK-NEXT:    [[TMP18]] = insertelement <256 x i32> [[VEC_C_INNER_PHI]], i32 [[TMP17]], i16 [[TMP3]]
143; CHECK-NEXT:    br label [[TILEDPBSSD_SCALARIZE_INNER_LATCH]]
144; CHECK:       tiledpbssd.scalarize.inner.latch:
145; CHECK-NEXT:    [[TILEDPBSSD_SCALARIZE_INNER_STEP]] = add i16 [[TILEDPBSSD_SCALARIZE_INNER_IV]], 1
146; CHECK-NEXT:    [[TILEDPBSSD_SCALARIZE_INNER_COND:%.*]] = icmp ne i16 [[TILEDPBSSD_SCALARIZE_INNER_STEP]], [[TMP1]]
147; CHECK-NEXT:    br i1 [[TILEDPBSSD_SCALARIZE_INNER_COND]], label [[TILEDPBSSD_SCALARIZE_INNER_HEADER]], label [[TILEDPBSSD_SCALARIZE_COLS_LATCH]]
148; CHECK:       tiledpbssd.scalarize.cols.latch:
149; CHECK-NEXT:    [[TILEDPBSSD_SCALARIZE_COLS_STEP]] = add i16 [[TILEDPBSSD_SCALARIZE_COLS_IV]], 1
150; CHECK-NEXT:    [[TILEDPBSSD_SCALARIZE_COLS_COND:%.*]] = icmp ne i16 [[TILEDPBSSD_SCALARIZE_COLS_STEP]], [[TMP0]]
151; CHECK-NEXT:    [[TMP19:%.*]] = extractelement <256 x i32> [[TMP18]], i16 [[TMP3]]
152; CHECK-NEXT:    [[TMP20]] = insertelement <256 x i32> [[VEC_D_PHI_COL]], i32 [[TMP19]], i16 [[TMP3]]
153; CHECK-NEXT:    br i1 [[TILEDPBSSD_SCALARIZE_COLS_COND]], label [[TILEDPBSSD_SCALARIZE_COLS_HEADER]], label [[TILEDPBSSD_SCALARIZE_ROWS_LATCH]]
154; CHECK:       tiledpbssd.scalarize.rows.latch:
155; CHECK-NEXT:    [[TILEDPBSSD_SCALARIZE_ROWS_STEP]] = add i16 [[TILEDPBSSD_SCALARIZE_ROWS_IV]], 1
156; CHECK-NEXT:    [[TILEDPBSSD_SCALARIZE_ROWS_COND:%.*]] = icmp ne i16 [[TILEDPBSSD_SCALARIZE_ROWS_STEP]], [[ROW:%.*]]
157; CHECK-NEXT:    br i1 [[TILEDPBSSD_SCALARIZE_ROWS_COND]], label [[TILEDPBSSD_SCALARIZE_ROWS_HEADER]], label [[CONTINUE:%.*]]
158; CHECK:       continue:
159; CHECK-NEXT:    [[TMP21:%.*]] = bitcast <256 x i32> [[TMP20]] to x86_amx
160; CHECK-NEXT:    store <256 x i32> [[TMP20]], ptr [[VPTR:%.*]], align 64
161; CHECK-NEXT:    ret void
162;
163entry:
164  %a.amx = bitcast <256 x i32> %a to x86_amx
165  %b.amx = bitcast <256 x i32> %b to x86_amx
166  %c.amx = bitcast <256 x i32> %c to x86_amx
167  %acc = call x86_amx @llvm.x86.tdpbssd.internal(i16 %row, i16 %col, i16 %k, x86_amx %c.amx, x86_amx %a.amx, x86_amx %b.amx)
168  %vec = bitcast x86_amx %acc to <256 x i32>
169  store <256 x i32> %vec, ptr %vptr, align 64
170  ret void
171}
172
173define dso_local void @test_amx_dpbsud(i16 signext %row, i16 signext %col, i16 signext %k, <256 x i32> %c, <256 x i32> %a, <256 x i32> %b, ptr %vptr) #0 {
174; CHECK-LABEL: @test_amx_dpbsud(
175; CHECK-NEXT:  entry:
176; CHECK-NEXT:    [[A_AMX:%.*]] = bitcast <256 x i32> [[A:%.*]] to x86_amx
177; CHECK-NEXT:    [[B_AMX:%.*]] = bitcast <256 x i32> [[B:%.*]] to x86_amx
178; CHECK-NEXT:    [[C_AMX:%.*]] = bitcast <256 x i32> [[C:%.*]] to x86_amx
179; CHECK-NEXT:    [[TMP0:%.*]] = lshr i16 [[COL:%.*]], 2
180; CHECK-NEXT:    [[TMP1:%.*]] = lshr i16 [[K:%.*]], 2
181; CHECK-NEXT:    br label [[TILEDPBSUD_SCALARIZE_ROWS_HEADER:%.*]]
182; CHECK:       tiledpbsud.scalarize.rows.header:
183; CHECK-NEXT:    [[TILEDPBSUD_SCALARIZE_ROWS_IV:%.*]] = phi i16 [ 0, [[ENTRY:%.*]] ], [ [[TILEDPBSUD_SCALARIZE_ROWS_STEP:%.*]], [[TILEDPBSUD_SCALARIZE_ROWS_LATCH:%.*]] ]
184; CHECK-NEXT:    [[VEC_C_PHI_ROW:%.*]] = phi <256 x i32> [ [[C]], [[ENTRY]] ], [ [[TMP18:%.*]], [[TILEDPBSUD_SCALARIZE_ROWS_LATCH]] ]
185; CHECK-NEXT:    [[VEC_D_PHI_ROW:%.*]] = phi <256 x i32> [ zeroinitializer, [[ENTRY]] ], [ [[TMP20:%.*]], [[TILEDPBSUD_SCALARIZE_ROWS_LATCH]] ]
186; CHECK-NEXT:    br label [[TILEDPBSUD_SCALARIZE_ROWS_BODY:%.*]]
187; CHECK:       tiledpbsud.scalarize.rows.body:
188; CHECK-NEXT:    br label [[TILEDPBSUD_SCALARIZE_COLS_HEADER:%.*]]
189; CHECK:       tiledpbsud.scalarize.cols.header:
190; CHECK-NEXT:    [[TILEDPBSUD_SCALARIZE_COLS_IV:%.*]] = phi i16 [ 0, [[TILEDPBSUD_SCALARIZE_ROWS_BODY]] ], [ [[TILEDPBSUD_SCALARIZE_COLS_STEP:%.*]], [[TILEDPBSUD_SCALARIZE_COLS_LATCH:%.*]] ]
191; CHECK-NEXT:    [[VEC_C_PHI_COL:%.*]] = phi <256 x i32> [ [[VEC_C_PHI_ROW]], [[TILEDPBSUD_SCALARIZE_ROWS_BODY]] ], [ [[TMP18]], [[TILEDPBSUD_SCALARIZE_COLS_LATCH]] ]
192; CHECK-NEXT:    [[VEC_D_PHI_COL:%.*]] = phi <256 x i32> [ [[VEC_D_PHI_ROW]], [[TILEDPBSUD_SCALARIZE_ROWS_BODY]] ], [ [[TMP20]], [[TILEDPBSUD_SCALARIZE_COLS_LATCH]] ]
193; CHECK-NEXT:    [[TMP2:%.*]] = mul i16 [[TILEDPBSUD_SCALARIZE_ROWS_IV]], 16
194; CHECK-NEXT:    [[TMP3:%.*]] = add i16 [[TMP2]], [[TILEDPBSUD_SCALARIZE_COLS_IV]]
195; CHECK-NEXT:    br label [[TILEDPBSUD_SCALARIZE_COLS_BODY:%.*]]
196; CHECK:       tiledpbsud.scalarize.cols.body:
197; CHECK-NEXT:    br label [[TILEDPBSUD_SCALARIZE_INNER_HEADER:%.*]]
198; CHECK:       tiledpbsud.scalarize.inner.header:
199; CHECK-NEXT:    [[TILEDPBSUD_SCALARIZE_INNER_IV:%.*]] = phi i16 [ 0, [[TILEDPBSUD_SCALARIZE_COLS_BODY]] ], [ [[TILEDPBSUD_SCALARIZE_INNER_STEP:%.*]], [[TILEDPBSUD_SCALARIZE_INNER_LATCH:%.*]] ]
200; CHECK-NEXT:    [[VEC_C_INNER_PHI:%.*]] = phi <256 x i32> [ [[VEC_C_PHI_COL]], [[TILEDPBSUD_SCALARIZE_COLS_BODY]] ], [ [[TMP18]], [[TILEDPBSUD_SCALARIZE_INNER_LATCH]] ]
201; CHECK-NEXT:    br label [[TILEDPBSUD_SCALARIZE_INNER_BODY:%.*]]
202; CHECK:       tiledpbsud.scalarize.inner.body:
203; CHECK-NEXT:    [[TMP4:%.*]] = mul i16 [[TILEDPBSUD_SCALARIZE_ROWS_IV]], 16
204; CHECK-NEXT:    [[TMP5:%.*]] = add i16 [[TMP4]], [[TILEDPBSUD_SCALARIZE_INNER_IV]]
205; CHECK-NEXT:    [[TMP6:%.*]] = mul i16 [[TILEDPBSUD_SCALARIZE_INNER_IV]], 16
206; CHECK-NEXT:    [[TMP7:%.*]] = add i16 [[TMP6]], [[TILEDPBSUD_SCALARIZE_COLS_IV]]
207; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <256 x i32> [[VEC_C_INNER_PHI]], i16 [[TMP3]]
208; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <256 x i32> [[A]], i16 [[TMP5]]
209; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i32 [[TMP9]] to <4 x i8>
210; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <256 x i32> [[B]], i16 [[TMP7]]
211; CHECK-NEXT:    [[TMP12:%.*]] = bitcast i32 [[TMP11]] to <4 x i8>
212; CHECK-NEXT:    [[TMP13:%.*]] = zext <4 x i8> [[TMP12]] to <4 x i32>
213; CHECK-NEXT:    [[TMP14:%.*]] = sext <4 x i8> [[TMP10]] to <4 x i32>
214; CHECK-NEXT:    [[TMP15:%.*]] = mul <4 x i32> [[TMP14]], [[TMP13]]
215; CHECK-NEXT:    [[TMP16:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP15]])
216; CHECK-NEXT:    [[TMP17:%.*]] = add i32 [[TMP8]], [[TMP16]]
217; CHECK-NEXT:    [[TMP18]] = insertelement <256 x i32> [[VEC_C_INNER_PHI]], i32 [[TMP17]], i16 [[TMP3]]
218; CHECK-NEXT:    br label [[TILEDPBSUD_SCALARIZE_INNER_LATCH]]
219; CHECK:       tiledpbsud.scalarize.inner.latch:
220; CHECK-NEXT:    [[TILEDPBSUD_SCALARIZE_INNER_STEP]] = add i16 [[TILEDPBSUD_SCALARIZE_INNER_IV]], 1
221; CHECK-NEXT:    [[TILEDPBSUD_SCALARIZE_INNER_COND:%.*]] = icmp ne i16 [[TILEDPBSUD_SCALARIZE_INNER_STEP]], [[TMP1]]
222; CHECK-NEXT:    br i1 [[TILEDPBSUD_SCALARIZE_INNER_COND]], label [[TILEDPBSUD_SCALARIZE_INNER_HEADER]], label [[TILEDPBSUD_SCALARIZE_COLS_LATCH]]
223; CHECK:       tiledpbsud.scalarize.cols.latch:
224; CHECK-NEXT:    [[TILEDPBSUD_SCALARIZE_COLS_STEP]] = add i16 [[TILEDPBSUD_SCALARIZE_COLS_IV]], 1
225; CHECK-NEXT:    [[TILEDPBSUD_SCALARIZE_COLS_COND:%.*]] = icmp ne i16 [[TILEDPBSUD_SCALARIZE_COLS_STEP]], [[TMP0]]
226; CHECK-NEXT:    [[TMP19:%.*]] = extractelement <256 x i32> [[TMP18]], i16 [[TMP3]]
227; CHECK-NEXT:    [[TMP20]] = insertelement <256 x i32> [[VEC_D_PHI_COL]], i32 [[TMP19]], i16 [[TMP3]]
228; CHECK-NEXT:    br i1 [[TILEDPBSUD_SCALARIZE_COLS_COND]], label [[TILEDPBSUD_SCALARIZE_COLS_HEADER]], label [[TILEDPBSUD_SCALARIZE_ROWS_LATCH]]
229; CHECK:       tiledpbsud.scalarize.rows.latch:
230; CHECK-NEXT:    [[TILEDPBSUD_SCALARIZE_ROWS_STEP]] = add i16 [[TILEDPBSUD_SCALARIZE_ROWS_IV]], 1
231; CHECK-NEXT:    [[TILEDPBSUD_SCALARIZE_ROWS_COND:%.*]] = icmp ne i16 [[TILEDPBSUD_SCALARIZE_ROWS_STEP]], [[ROW:%.*]]
232; CHECK-NEXT:    br i1 [[TILEDPBSUD_SCALARIZE_ROWS_COND]], label [[TILEDPBSUD_SCALARIZE_ROWS_HEADER]], label [[CONTINUE:%.*]]
233; CHECK:       continue:
234; CHECK-NEXT:    [[TMP21:%.*]] = bitcast <256 x i32> [[TMP20]] to x86_amx
235; CHECK-NEXT:    store <256 x i32> [[TMP20]], ptr [[VPTR:%.*]], align 64
236; CHECK-NEXT:    ret void
237;
238entry:
239  %a.amx = bitcast <256 x i32> %a to x86_amx
240  %b.amx = bitcast <256 x i32> %b to x86_amx
241  %c.amx = bitcast <256 x i32> %c to x86_amx
242  %acc = call x86_amx @llvm.x86.tdpbsud.internal(i16 %row, i16 %col, i16 %k, x86_amx %c.amx, x86_amx %a.amx, x86_amx %b.amx)
243  %vec = bitcast x86_amx %acc to <256 x i32>
244  store <256 x i32> %vec, ptr %vptr, align 64
245  ret void
246}
247
248define dso_local void @test_amx_dpbusd(i16 signext %row, i16 signext %col, i16 signext %k, <256 x i32> %c, <256 x i32> %a, <256 x i32> %b, ptr %vptr) #0 {
249; CHECK-LABEL: @test_amx_dpbusd(
250; CHECK-NEXT:  entry:
251; CHECK-NEXT:    [[A_AMX:%.*]] = bitcast <256 x i32> [[A:%.*]] to x86_amx
252; CHECK-NEXT:    [[B_AMX:%.*]] = bitcast <256 x i32> [[B:%.*]] to x86_amx
253; CHECK-NEXT:    [[C_AMX:%.*]] = bitcast <256 x i32> [[C:%.*]] to x86_amx
254; CHECK-NEXT:    [[TMP0:%.*]] = lshr i16 [[COL:%.*]], 2
255; CHECK-NEXT:    [[TMP1:%.*]] = lshr i16 [[K:%.*]], 2
256; CHECK-NEXT:    br label [[TILEDPBUSD_SCALARIZE_ROWS_HEADER:%.*]]
257; CHECK:       tiledpbusd.scalarize.rows.header:
258; CHECK-NEXT:    [[TILEDPBUSD_SCALARIZE_ROWS_IV:%.*]] = phi i16 [ 0, [[ENTRY:%.*]] ], [ [[TILEDPBUSD_SCALARIZE_ROWS_STEP:%.*]], [[TILEDPBUSD_SCALARIZE_ROWS_LATCH:%.*]] ]
259; CHECK-NEXT:    [[VEC_C_PHI_ROW:%.*]] = phi <256 x i32> [ [[C]], [[ENTRY]] ], [ [[TMP18:%.*]], [[TILEDPBUSD_SCALARIZE_ROWS_LATCH]] ]
260; CHECK-NEXT:    [[VEC_D_PHI_ROW:%.*]] = phi <256 x i32> [ zeroinitializer, [[ENTRY]] ], [ [[TMP20:%.*]], [[TILEDPBUSD_SCALARIZE_ROWS_LATCH]] ]
261; CHECK-NEXT:    br label [[TILEDPBUSD_SCALARIZE_ROWS_BODY:%.*]]
262; CHECK:       tiledpbusd.scalarize.rows.body:
263; CHECK-NEXT:    br label [[TILEDPBUSD_SCALARIZE_COLS_HEADER:%.*]]
264; CHECK:       tiledpbusd.scalarize.cols.header:
265; CHECK-NEXT:    [[TILEDPBUSD_SCALARIZE_COLS_IV:%.*]] = phi i16 [ 0, [[TILEDPBUSD_SCALARIZE_ROWS_BODY]] ], [ [[TILEDPBUSD_SCALARIZE_COLS_STEP:%.*]], [[TILEDPBUSD_SCALARIZE_COLS_LATCH:%.*]] ]
266; CHECK-NEXT:    [[VEC_C_PHI_COL:%.*]] = phi <256 x i32> [ [[VEC_C_PHI_ROW]], [[TILEDPBUSD_SCALARIZE_ROWS_BODY]] ], [ [[TMP18]], [[TILEDPBUSD_SCALARIZE_COLS_LATCH]] ]
267; CHECK-NEXT:    [[VEC_D_PHI_COL:%.*]] = phi <256 x i32> [ [[VEC_D_PHI_ROW]], [[TILEDPBUSD_SCALARIZE_ROWS_BODY]] ], [ [[TMP20]], [[TILEDPBUSD_SCALARIZE_COLS_LATCH]] ]
268; CHECK-NEXT:    [[TMP2:%.*]] = mul i16 [[TILEDPBUSD_SCALARIZE_ROWS_IV]], 16
269; CHECK-NEXT:    [[TMP3:%.*]] = add i16 [[TMP2]], [[TILEDPBUSD_SCALARIZE_COLS_IV]]
270; CHECK-NEXT:    br label [[TILEDPBUSD_SCALARIZE_COLS_BODY:%.*]]
271; CHECK:       tiledpbusd.scalarize.cols.body:
272; CHECK-NEXT:    br label [[TILEDPBUSD_SCALARIZE_INNER_HEADER:%.*]]
273; CHECK:       tiledpbusd.scalarize.inner.header:
274; CHECK-NEXT:    [[TILEDPBUSD_SCALARIZE_INNER_IV:%.*]] = phi i16 [ 0, [[TILEDPBUSD_SCALARIZE_COLS_BODY]] ], [ [[TILEDPBUSD_SCALARIZE_INNER_STEP:%.*]], [[TILEDPBUSD_SCALARIZE_INNER_LATCH:%.*]] ]
275; CHECK-NEXT:    [[VEC_C_INNER_PHI:%.*]] = phi <256 x i32> [ [[VEC_C_PHI_COL]], [[TILEDPBUSD_SCALARIZE_COLS_BODY]] ], [ [[TMP18]], [[TILEDPBUSD_SCALARIZE_INNER_LATCH]] ]
276; CHECK-NEXT:    br label [[TILEDPBUSD_SCALARIZE_INNER_BODY:%.*]]
277; CHECK:       tiledpbusd.scalarize.inner.body:
278; CHECK-NEXT:    [[TMP4:%.*]] = mul i16 [[TILEDPBUSD_SCALARIZE_ROWS_IV]], 16
279; CHECK-NEXT:    [[TMP5:%.*]] = add i16 [[TMP4]], [[TILEDPBUSD_SCALARIZE_INNER_IV]]
280; CHECK-NEXT:    [[TMP6:%.*]] = mul i16 [[TILEDPBUSD_SCALARIZE_INNER_IV]], 16
281; CHECK-NEXT:    [[TMP7:%.*]] = add i16 [[TMP6]], [[TILEDPBUSD_SCALARIZE_COLS_IV]]
282; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <256 x i32> [[VEC_C_INNER_PHI]], i16 [[TMP3]]
283; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <256 x i32> [[A]], i16 [[TMP5]]
284; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i32 [[TMP9]] to <4 x i8>
285; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <256 x i32> [[B]], i16 [[TMP7]]
286; CHECK-NEXT:    [[TMP12:%.*]] = bitcast i32 [[TMP11]] to <4 x i8>
287; CHECK-NEXT:    [[TMP13:%.*]] = sext <4 x i8> [[TMP12]] to <4 x i32>
288; CHECK-NEXT:    [[TMP14:%.*]] = zext <4 x i8> [[TMP10]] to <4 x i32>
289; CHECK-NEXT:    [[TMP15:%.*]] = mul <4 x i32> [[TMP14]], [[TMP13]]
290; CHECK-NEXT:    [[TMP16:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP15]])
291; CHECK-NEXT:    [[TMP17:%.*]] = add i32 [[TMP8]], [[TMP16]]
292; CHECK-NEXT:    [[TMP18]] = insertelement <256 x i32> [[VEC_C_INNER_PHI]], i32 [[TMP17]], i16 [[TMP3]]
293; CHECK-NEXT:    br label [[TILEDPBUSD_SCALARIZE_INNER_LATCH]]
294; CHECK:       tiledpbusd.scalarize.inner.latch:
295; CHECK-NEXT:    [[TILEDPBUSD_SCALARIZE_INNER_STEP]] = add i16 [[TILEDPBUSD_SCALARIZE_INNER_IV]], 1
296; CHECK-NEXT:    [[TILEDPBUSD_SCALARIZE_INNER_COND:%.*]] = icmp ne i16 [[TILEDPBUSD_SCALARIZE_INNER_STEP]], [[TMP1]]
297; CHECK-NEXT:    br i1 [[TILEDPBUSD_SCALARIZE_INNER_COND]], label [[TILEDPBUSD_SCALARIZE_INNER_HEADER]], label [[TILEDPBUSD_SCALARIZE_COLS_LATCH]]
298; CHECK:       tiledpbusd.scalarize.cols.latch:
299; CHECK-NEXT:    [[TILEDPBUSD_SCALARIZE_COLS_STEP]] = add i16 [[TILEDPBUSD_SCALARIZE_COLS_IV]], 1
300; CHECK-NEXT:    [[TILEDPBUSD_SCALARIZE_COLS_COND:%.*]] = icmp ne i16 [[TILEDPBUSD_SCALARIZE_COLS_STEP]], [[TMP0]]
301; CHECK-NEXT:    [[TMP19:%.*]] = extractelement <256 x i32> [[TMP18]], i16 [[TMP3]]
302; CHECK-NEXT:    [[TMP20]] = insertelement <256 x i32> [[VEC_D_PHI_COL]], i32 [[TMP19]], i16 [[TMP3]]
303; CHECK-NEXT:    br i1 [[TILEDPBUSD_SCALARIZE_COLS_COND]], label [[TILEDPBUSD_SCALARIZE_COLS_HEADER]], label [[TILEDPBUSD_SCALARIZE_ROWS_LATCH]]
304; CHECK:       tiledpbusd.scalarize.rows.latch:
305; CHECK-NEXT:    [[TILEDPBUSD_SCALARIZE_ROWS_STEP]] = add i16 [[TILEDPBUSD_SCALARIZE_ROWS_IV]], 1
306; CHECK-NEXT:    [[TILEDPBUSD_SCALARIZE_ROWS_COND:%.*]] = icmp ne i16 [[TILEDPBUSD_SCALARIZE_ROWS_STEP]], [[ROW:%.*]]
307; CHECK-NEXT:    br i1 [[TILEDPBUSD_SCALARIZE_ROWS_COND]], label [[TILEDPBUSD_SCALARIZE_ROWS_HEADER]], label [[CONTINUE:%.*]]
308; CHECK:       continue:
309; CHECK-NEXT:    [[TMP21:%.*]] = bitcast <256 x i32> [[TMP20]] to x86_amx
310; CHECK-NEXT:    store <256 x i32> [[TMP20]], ptr [[VPTR:%.*]], align 64
311; CHECK-NEXT:    ret void
312;
313entry:
314  %a.amx = bitcast <256 x i32> %a to x86_amx
315  %b.amx = bitcast <256 x i32> %b to x86_amx
316  %c.amx = bitcast <256 x i32> %c to x86_amx
317  %acc = call x86_amx @llvm.x86.tdpbusd.internal(i16 %row, i16 %col, i16 %k, x86_amx %c.amx, x86_amx %a.amx, x86_amx %b.amx)
318  %vec = bitcast x86_amx %acc to <256 x i32>
319  store <256 x i32> %vec, ptr %vptr, align 64
320  ret void
321}
322
323define dso_local void @test_amx_dpbuud(i16 signext %row, i16 signext %col, i16 signext %k, <256 x i32> %c, <256 x i32> %a, <256 x i32> %b, ptr %vptr) #0 {
324; CHECK-LABEL: @test_amx_dpbuud(
325; CHECK-NEXT:  entry:
326; CHECK-NEXT:    [[A_AMX:%.*]] = bitcast <256 x i32> [[A:%.*]] to x86_amx
327; CHECK-NEXT:    [[B_AMX:%.*]] = bitcast <256 x i32> [[B:%.*]] to x86_amx
328; CHECK-NEXT:    [[C_AMX:%.*]] = bitcast <256 x i32> [[C:%.*]] to x86_amx
329; CHECK-NEXT:    [[TMP0:%.*]] = lshr i16 [[COL:%.*]], 2
330; CHECK-NEXT:    [[TMP1:%.*]] = lshr i16 [[K:%.*]], 2
331; CHECK-NEXT:    br label [[TILEDPBUUD_SCALARIZE_ROWS_HEADER:%.*]]
332; CHECK:       tiledpbuud.scalarize.rows.header:
333; CHECK-NEXT:    [[TILEDPBUUD_SCALARIZE_ROWS_IV:%.*]] = phi i16 [ 0, [[ENTRY:%.*]] ], [ [[TILEDPBUUD_SCALARIZE_ROWS_STEP:%.*]], [[TILEDPBUUD_SCALARIZE_ROWS_LATCH:%.*]] ]
334; CHECK-NEXT:    [[VEC_C_PHI_ROW:%.*]] = phi <256 x i32> [ [[C]], [[ENTRY]] ], [ [[TMP18:%.*]], [[TILEDPBUUD_SCALARIZE_ROWS_LATCH]] ]
335; CHECK-NEXT:    [[VEC_D_PHI_ROW:%.*]] = phi <256 x i32> [ zeroinitializer, [[ENTRY]] ], [ [[TMP20:%.*]], [[TILEDPBUUD_SCALARIZE_ROWS_LATCH]] ]
336; CHECK-NEXT:    br label [[TILEDPBUUD_SCALARIZE_ROWS_BODY:%.*]]
337; CHECK:       tiledpbuud.scalarize.rows.body:
338; CHECK-NEXT:    br label [[TILEDPBUUD_SCALARIZE_COLS_HEADER:%.*]]
339; CHECK:       tiledpbuud.scalarize.cols.header:
340; CHECK-NEXT:    [[TILEDPBUUD_SCALARIZE_COLS_IV:%.*]] = phi i16 [ 0, [[TILEDPBUUD_SCALARIZE_ROWS_BODY]] ], [ [[TILEDPBUUD_SCALARIZE_COLS_STEP:%.*]], [[TILEDPBUUD_SCALARIZE_COLS_LATCH:%.*]] ]
341; CHECK-NEXT:    [[VEC_C_PHI_COL:%.*]] = phi <256 x i32> [ [[VEC_C_PHI_ROW]], [[TILEDPBUUD_SCALARIZE_ROWS_BODY]] ], [ [[TMP18]], [[TILEDPBUUD_SCALARIZE_COLS_LATCH]] ]
342; CHECK-NEXT:    [[VEC_D_PHI_COL:%.*]] = phi <256 x i32> [ [[VEC_D_PHI_ROW]], [[TILEDPBUUD_SCALARIZE_ROWS_BODY]] ], [ [[TMP20]], [[TILEDPBUUD_SCALARIZE_COLS_LATCH]] ]
343; CHECK-NEXT:    [[TMP2:%.*]] = mul i16 [[TILEDPBUUD_SCALARIZE_ROWS_IV]], 16
344; CHECK-NEXT:    [[TMP3:%.*]] = add i16 [[TMP2]], [[TILEDPBUUD_SCALARIZE_COLS_IV]]
345; CHECK-NEXT:    br label [[TILEDPBUUD_SCALARIZE_COLS_BODY:%.*]]
346; CHECK:       tiledpbuud.scalarize.cols.body:
347; CHECK-NEXT:    br label [[TILEDPBUUD_SCALARIZE_INNER_HEADER:%.*]]
348; CHECK:       tiledpbuud.scalarize.inner.header:
349; CHECK-NEXT:    [[TILEDPBUUD_SCALARIZE_INNER_IV:%.*]] = phi i16 [ 0, [[TILEDPBUUD_SCALARIZE_COLS_BODY]] ], [ [[TILEDPBUUD_SCALARIZE_INNER_STEP:%.*]], [[TILEDPBUUD_SCALARIZE_INNER_LATCH:%.*]] ]
350; CHECK-NEXT:    [[VEC_C_INNER_PHI:%.*]] = phi <256 x i32> [ [[VEC_C_PHI_COL]], [[TILEDPBUUD_SCALARIZE_COLS_BODY]] ], [ [[TMP18]], [[TILEDPBUUD_SCALARIZE_INNER_LATCH]] ]
351; CHECK-NEXT:    br label [[TILEDPBUUD_SCALARIZE_INNER_BODY:%.*]]
352; CHECK:       tiledpbuud.scalarize.inner.body:
353; CHECK-NEXT:    [[TMP4:%.*]] = mul i16 [[TILEDPBUUD_SCALARIZE_ROWS_IV]], 16
354; CHECK-NEXT:    [[TMP5:%.*]] = add i16 [[TMP4]], [[TILEDPBUUD_SCALARIZE_INNER_IV]]
355; CHECK-NEXT:    [[TMP6:%.*]] = mul i16 [[TILEDPBUUD_SCALARIZE_INNER_IV]], 16
356; CHECK-NEXT:    [[TMP7:%.*]] = add i16 [[TMP6]], [[TILEDPBUUD_SCALARIZE_COLS_IV]]
357; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <256 x i32> [[VEC_C_INNER_PHI]], i16 [[TMP3]]
358; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <256 x i32> [[A]], i16 [[TMP5]]
359; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i32 [[TMP9]] to <4 x i8>
360; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <256 x i32> [[B]], i16 [[TMP7]]
361; CHECK-NEXT:    [[TMP12:%.*]] = bitcast i32 [[TMP11]] to <4 x i8>
362; CHECK-NEXT:    [[TMP13:%.*]] = zext <4 x i8> [[TMP12]] to <4 x i32>
363; CHECK-NEXT:    [[TMP14:%.*]] = zext <4 x i8> [[TMP10]] to <4 x i32>
364; CHECK-NEXT:    [[TMP15:%.*]] = mul <4 x i32> [[TMP14]], [[TMP13]]
365; CHECK-NEXT:    [[TMP16:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP15]])
366; CHECK-NEXT:    [[TMP17:%.*]] = add i32 [[TMP8]], [[TMP16]]
367; CHECK-NEXT:    [[TMP18]] = insertelement <256 x i32> [[VEC_C_INNER_PHI]], i32 [[TMP17]], i16 [[TMP3]]
368; CHECK-NEXT:    br label [[TILEDPBUUD_SCALARIZE_INNER_LATCH]]
369; CHECK:       tiledpbuud.scalarize.inner.latch:
370; CHECK-NEXT:    [[TILEDPBUUD_SCALARIZE_INNER_STEP]] = add i16 [[TILEDPBUUD_SCALARIZE_INNER_IV]], 1
371; CHECK-NEXT:    [[TILEDPBUUD_SCALARIZE_INNER_COND:%.*]] = icmp ne i16 [[TILEDPBUUD_SCALARIZE_INNER_STEP]], [[TMP1]]
372; CHECK-NEXT:    br i1 [[TILEDPBUUD_SCALARIZE_INNER_COND]], label [[TILEDPBUUD_SCALARIZE_INNER_HEADER]], label [[TILEDPBUUD_SCALARIZE_COLS_LATCH]]
373; CHECK:       tiledpbuud.scalarize.cols.latch:
374; CHECK-NEXT:    [[TILEDPBUUD_SCALARIZE_COLS_STEP]] = add i16 [[TILEDPBUUD_SCALARIZE_COLS_IV]], 1
375; CHECK-NEXT:    [[TILEDPBUUD_SCALARIZE_COLS_COND:%.*]] = icmp ne i16 [[TILEDPBUUD_SCALARIZE_COLS_STEP]], [[TMP0]]
376; CHECK-NEXT:    [[TMP19:%.*]] = extractelement <256 x i32> [[TMP18]], i16 [[TMP3]]
377; CHECK-NEXT:    [[TMP20]] = insertelement <256 x i32> [[VEC_D_PHI_COL]], i32 [[TMP19]], i16 [[TMP3]]
378; CHECK-NEXT:    br i1 [[TILEDPBUUD_SCALARIZE_COLS_COND]], label [[TILEDPBUUD_SCALARIZE_COLS_HEADER]], label [[TILEDPBUUD_SCALARIZE_ROWS_LATCH]]
379; CHECK:       tiledpbuud.scalarize.rows.latch:
380; CHECK-NEXT:    [[TILEDPBUUD_SCALARIZE_ROWS_STEP]] = add i16 [[TILEDPBUUD_SCALARIZE_ROWS_IV]], 1
381; CHECK-NEXT:    [[TILEDPBUUD_SCALARIZE_ROWS_COND:%.*]] = icmp ne i16 [[TILEDPBUUD_SCALARIZE_ROWS_STEP]], [[ROW:%.*]]
382; CHECK-NEXT:    br i1 [[TILEDPBUUD_SCALARIZE_ROWS_COND]], label [[TILEDPBUUD_SCALARIZE_ROWS_HEADER]], label [[CONTINUE:%.*]]
383; CHECK:       continue:
384; CHECK-NEXT:    [[TMP21:%.*]] = bitcast <256 x i32> [[TMP20]] to x86_amx
385; CHECK-NEXT:    store <256 x i32> [[TMP20]], ptr [[VPTR:%.*]], align 64
386; CHECK-NEXT:    ret void
387;
388entry:
389  %a.amx = bitcast <256 x i32> %a to x86_amx
390  %b.amx = bitcast <256 x i32> %b to x86_amx
391  %c.amx = bitcast <256 x i32> %c to x86_amx
392  %acc = call x86_amx @llvm.x86.tdpbuud.internal(i16 %row, i16 %col, i16 %k, x86_amx %c.amx, x86_amx %a.amx, x86_amx %b.amx)
393  %vec = bitcast x86_amx %acc to <256 x i32>
394  store <256 x i32> %vec, ptr %vptr, align 64
395  ret void
396}
397
398define dso_local void @test_amx_dpbf16ps(i16 signext %row, i16 signext %col, i16 signext %k, <256 x i32> %c, <256 x i32> %a, <256 x i32> %b, ptr %vptr) #0 {
399; CHECK-LABEL: @test_amx_dpbf16ps(
400; CHECK-NEXT:  entry:
401; CHECK-NEXT:    [[A_AMX:%.*]] = bitcast <256 x i32> [[A:%.*]] to x86_amx
402; CHECK-NEXT:    [[B_AMX:%.*]] = bitcast <256 x i32> [[B:%.*]] to x86_amx
403; CHECK-NEXT:    [[C_AMX:%.*]] = bitcast <256 x i32> [[C:%.*]] to x86_amx
404; CHECK-NEXT:    [[TMP0:%.*]] = lshr i16 [[COL:%.*]], 2
405; CHECK-NEXT:    [[TMP1:%.*]] = lshr i16 [[K:%.*]], 2
406; CHECK-NEXT:    br label [[TILEDPBF16PS_SCALARIZE_ROWS_HEADER:%.*]]
407; CHECK:       tiledpbf16ps.scalarize.rows.header:
408; CHECK-NEXT:    [[TILEDPBF16PS_SCALARIZE_ROWS_IV:%.*]] = phi i16 [ 0, [[ENTRY:%.*]] ], [ [[TILEDPBF16PS_SCALARIZE_ROWS_STEP:%.*]], [[TILEDPBF16PS_SCALARIZE_ROWS_LATCH:%.*]] ]
409; CHECK-NEXT:    [[VEC_C_PHI_ROW:%.*]] = phi <256 x i32> [ [[C]], [[ENTRY]] ], [ [[TMP21:%.*]], [[TILEDPBF16PS_SCALARIZE_ROWS_LATCH]] ]
410; CHECK-NEXT:    [[VEC_D_PHI_ROW:%.*]] = phi <256 x i32> [ zeroinitializer, [[ENTRY]] ], [ [[TMP23:%.*]], [[TILEDPBF16PS_SCALARIZE_ROWS_LATCH]] ]
411; CHECK-NEXT:    br label [[TILEDPBF16PS_SCALARIZE_ROWS_BODY:%.*]]
412; CHECK:       tiledpbf16ps.scalarize.rows.body:
413; CHECK-NEXT:    br label [[TILEDPBF16PS_SCALARIZE_COLS_HEADER:%.*]]
414; CHECK:       tiledpbf16ps.scalarize.cols.header:
415; CHECK-NEXT:    [[TILEDPBF16PS_SCALARIZE_COLS_IV:%.*]] = phi i16 [ 0, [[TILEDPBF16PS_SCALARIZE_ROWS_BODY]] ], [ [[TILEDPBF16PS_SCALARIZE_COLS_STEP:%.*]], [[TILEDPBF16PS_SCALARIZE_COLS_LATCH:%.*]] ]
416; CHECK-NEXT:    [[VEC_C_PHI_COL:%.*]] = phi <256 x i32> [ [[VEC_C_PHI_ROW]], [[TILEDPBF16PS_SCALARIZE_ROWS_BODY]] ], [ [[TMP21]], [[TILEDPBF16PS_SCALARIZE_COLS_LATCH]] ]
417; CHECK-NEXT:    [[VEC_D_PHI_COL:%.*]] = phi <256 x i32> [ [[VEC_D_PHI_ROW]], [[TILEDPBF16PS_SCALARIZE_ROWS_BODY]] ], [ [[TMP23]], [[TILEDPBF16PS_SCALARIZE_COLS_LATCH]] ]
418; CHECK-NEXT:    [[TMP2:%.*]] = mul i16 [[TILEDPBF16PS_SCALARIZE_ROWS_IV]], 16
419; CHECK-NEXT:    [[TMP3:%.*]] = add i16 [[TMP2]], [[TILEDPBF16PS_SCALARIZE_COLS_IV]]
420; CHECK-NEXT:    br label [[TILEDPBF16PS_SCALARIZE_COLS_BODY:%.*]]
421; CHECK:       tiledpbf16ps.scalarize.cols.body:
422; CHECK-NEXT:    br label [[TILEDPBF16PS_SCALARIZE_INNER_HEADER:%.*]]
423; CHECK:       tiledpbf16ps.scalarize.inner.header:
424; CHECK-NEXT:    [[TILEDPBF16PS_SCALARIZE_INNER_IV:%.*]] = phi i16 [ 0, [[TILEDPBF16PS_SCALARIZE_COLS_BODY]] ], [ [[TILEDPBF16PS_SCALARIZE_INNER_STEP:%.*]], [[TILEDPBF16PS_SCALARIZE_INNER_LATCH:%.*]] ]
425; CHECK-NEXT:    [[VEC_C_INNER_PHI:%.*]] = phi <256 x i32> [ [[VEC_C_PHI_COL]], [[TILEDPBF16PS_SCALARIZE_COLS_BODY]] ], [ [[TMP21]], [[TILEDPBF16PS_SCALARIZE_INNER_LATCH]] ]
426; CHECK-NEXT:    br label [[TILEDPBF16PS_SCALARIZE_INNER_BODY:%.*]]
427; CHECK:       tiledpbf16ps.scalarize.inner.body:
428; CHECK-NEXT:    [[TMP4:%.*]] = mul i16 [[TILEDPBF16PS_SCALARIZE_ROWS_IV]], 16
429; CHECK-NEXT:    [[TMP5:%.*]] = add i16 [[TMP4]], [[TILEDPBF16PS_SCALARIZE_INNER_IV]]
430; CHECK-NEXT:    [[TMP6:%.*]] = mul i16 [[TILEDPBF16PS_SCALARIZE_INNER_IV]], 16
431; CHECK-NEXT:    [[TMP7:%.*]] = add i16 [[TMP6]], [[TILEDPBF16PS_SCALARIZE_COLS_IV]]
432; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <256 x i32> [[VEC_C_INNER_PHI]], i16 [[TMP3]]
433; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i32 [[TMP8]] to float
434; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <256 x i32> [[A]], i16 [[TMP5]]
435; CHECK-NEXT:    [[TMP11:%.*]] = bitcast i32 [[TMP10]] to <2 x i16>
436; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <256 x i32> [[B]], i16 [[TMP7]]
437; CHECK-NEXT:    [[TMP13:%.*]] = bitcast i32 [[TMP12]] to <2 x i16>
438; CHECK-NEXT:    [[TMP14:%.*]] = shufflevector <2 x i16> [[TMP11]], <2 x i16> zeroinitializer, <4 x i32> <i32 2, i32 0, i32 3, i32 1>
439; CHECK-NEXT:    [[TMP15:%.*]] = bitcast <4 x i16> [[TMP14]] to <2 x float>
440; CHECK-NEXT:    [[TMP16:%.*]] = shufflevector <2 x i16> [[TMP13]], <2 x i16> zeroinitializer, <4 x i32> <i32 2, i32 0, i32 3, i32 1>
441; CHECK-NEXT:    [[TMP17:%.*]] = bitcast <4 x i16> [[TMP16]] to <2 x float>
442; CHECK-NEXT:    [[TMP18:%.*]] = fmul <2 x float> [[TMP15]], [[TMP17]]
443; CHECK-NEXT:    [[TMP19:%.*]] = call float @llvm.vector.reduce.fadd.v2f32(float [[TMP9]], <2 x float> [[TMP18]])
444; CHECK-NEXT:    [[TMP20:%.*]] = bitcast float [[TMP19]] to i32
445; CHECK-NEXT:    [[TMP21]] = insertelement <256 x i32> [[VEC_C_INNER_PHI]], i32 [[TMP20]], i16 [[TMP3]]
446; CHECK-NEXT:    br label [[TILEDPBF16PS_SCALARIZE_INNER_LATCH]]
447; CHECK:       tiledpbf16ps.scalarize.inner.latch:
448; CHECK-NEXT:    [[TILEDPBF16PS_SCALARIZE_INNER_STEP]] = add i16 [[TILEDPBF16PS_SCALARIZE_INNER_IV]], 1
449; CHECK-NEXT:    [[TILEDPBF16PS_SCALARIZE_INNER_COND:%.*]] = icmp ne i16 [[TILEDPBF16PS_SCALARIZE_INNER_STEP]], [[TMP1]]
450; CHECK-NEXT:    br i1 [[TILEDPBF16PS_SCALARIZE_INNER_COND]], label [[TILEDPBF16PS_SCALARIZE_INNER_HEADER]], label [[TILEDPBF16PS_SCALARIZE_COLS_LATCH]]
451; CHECK:       tiledpbf16ps.scalarize.cols.latch:
452; CHECK-NEXT:    [[TILEDPBF16PS_SCALARIZE_COLS_STEP]] = add i16 [[TILEDPBF16PS_SCALARIZE_COLS_IV]], 1
453; CHECK-NEXT:    [[TILEDPBF16PS_SCALARIZE_COLS_COND:%.*]] = icmp ne i16 [[TILEDPBF16PS_SCALARIZE_COLS_STEP]], [[TMP0]]
454; CHECK-NEXT:    [[TMP22:%.*]] = extractelement <256 x i32> [[TMP21]], i16 [[TMP3]]
455; CHECK-NEXT:    [[TMP23]] = insertelement <256 x i32> [[VEC_D_PHI_COL]], i32 [[TMP22]], i16 [[TMP3]]
456; CHECK-NEXT:    br i1 [[TILEDPBF16PS_SCALARIZE_COLS_COND]], label [[TILEDPBF16PS_SCALARIZE_COLS_HEADER]], label [[TILEDPBF16PS_SCALARIZE_ROWS_LATCH]]
457; CHECK:       tiledpbf16ps.scalarize.rows.latch:
458; CHECK-NEXT:    [[TILEDPBF16PS_SCALARIZE_ROWS_STEP]] = add i16 [[TILEDPBF16PS_SCALARIZE_ROWS_IV]], 1
459; CHECK-NEXT:    [[TILEDPBF16PS_SCALARIZE_ROWS_COND:%.*]] = icmp ne i16 [[TILEDPBF16PS_SCALARIZE_ROWS_STEP]], [[ROW:%.*]]
460; CHECK-NEXT:    br i1 [[TILEDPBF16PS_SCALARIZE_ROWS_COND]], label [[TILEDPBF16PS_SCALARIZE_ROWS_HEADER]], label [[CONTINUE:%.*]]
461; CHECK:       continue:
462; CHECK-NEXT:    [[TMP24:%.*]] = bitcast <256 x i32> [[TMP23]] to x86_amx
463; CHECK-NEXT:    store <256 x i32> [[TMP23]], ptr [[VPTR:%.*]], align 64
464; CHECK-NEXT:    ret void
465;
466entry:
467  %a.amx = bitcast <256 x i32> %a to x86_amx
468  %b.amx = bitcast <256 x i32> %b to x86_amx
469  %c.amx = bitcast <256 x i32> %c to x86_amx
470  %acc = call x86_amx @llvm.x86.tdpbf16ps.internal(i16 %row, i16 %col, i16 %k, x86_amx %c.amx, x86_amx %a.amx, x86_amx %b.amx)
471  %vec = bitcast x86_amx %acc to <256 x i32>
472  store <256 x i32> %vec, ptr %vptr, align 64
473  ret void
474}
475
476define dso_local void @test_amx_store(i16 signext %row, i16 signext %col, ptr%ptr, i64 %stride, ptr %vptr, <256 x i32> %vec) #0 {
477; CHECK-LABEL: @test_amx_store(
478; CHECK-NEXT:  entry:
479; CHECK-NEXT:    [[AMX:%.*]] = bitcast <256 x i32> [[VEC:%.*]] to x86_amx
480; CHECK-NEXT:    [[TMP0:%.*]] = lshr i16 [[COL:%.*]], 2
481; CHECK-NEXT:    [[TMP1:%.*]] = lshr i64 [[STRIDE:%.*]], 2
482; CHECK-NEXT:    br label [[TILESTORE_SCALARIZE_ROWS_HEADER:%.*]]
483; CHECK:       tilestore.scalarize.rows.header:
484; CHECK-NEXT:    [[TILESTORE_SCALARIZE_ROWS_IV:%.*]] = phi i16 [ 0, [[ENTRY:%.*]] ], [ [[TILESTORE_SCALARIZE_ROWS_STEP:%.*]], [[TILESTORE_SCALARIZE_ROWS_LATCH:%.*]] ]
485; CHECK-NEXT:    br label [[TILESTORE_SCALARIZE_ROWS_BODY:%.*]]
486; CHECK:       tilestore.scalarize.rows.body:
487; CHECK-NEXT:    br label [[TILESTORE_SCALARIZE_COLS_HEADER:%.*]]
488; CHECK:       tilestore.scalarize.cols.header:
489; CHECK-NEXT:    [[TILESTORE_SCALARIZE_COLS_IV:%.*]] = phi i16 [ 0, [[TILESTORE_SCALARIZE_ROWS_BODY]] ], [ [[TILESTORE_SCALARIZE_COLS_STEP:%.*]], [[TILESTORE_SCALARIZE_COLS_LATCH:%.*]] ]
490; CHECK-NEXT:    br label [[TILESTORE_SCALARIZE_COLS_BODY:%.*]]
491; CHECK:       tilestore.scalarize.cols.body:
492; CHECK-NEXT:    [[TMP2:%.*]] = zext i16 [[TILESTORE_SCALARIZE_ROWS_IV]] to i64
493; CHECK-NEXT:    [[TMP3:%.*]] = zext i16 [[TILESTORE_SCALARIZE_COLS_IV]] to i64
494; CHECK-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP2]], [[TMP1]]
495; CHECK-NEXT:    [[TMP5:%.*]] = add i64 [[TMP4]], [[TMP3]]
496; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr i32, ptr [[PTR:%.*]], i64 [[TMP5]]
497; CHECK-NEXT:    [[TMP8:%.*]] = mul i16 [[TILESTORE_SCALARIZE_ROWS_IV]], 16
498; CHECK-NEXT:    [[TMP9:%.*]] = add i16 [[TMP8]], [[TILESTORE_SCALARIZE_COLS_IV]]
499; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <256 x i32> [[VEC]], i16 [[TMP9]]
500; CHECK-NEXT:    store i32 [[TMP10]], ptr [[TMP7]], align 4
501; CHECK-NEXT:    br label [[TILESTORE_SCALARIZE_COLS_LATCH]]
502; CHECK:       tilestore.scalarize.cols.latch:
503; CHECK-NEXT:    [[TILESTORE_SCALARIZE_COLS_STEP]] = add i16 [[TILESTORE_SCALARIZE_COLS_IV]], 1
504; CHECK-NEXT:    [[TILESTORE_SCALARIZE_COLS_COND:%.*]] = icmp ne i16 [[TILESTORE_SCALARIZE_COLS_STEP]], [[TMP0]]
505; CHECK-NEXT:    br i1 [[TILESTORE_SCALARIZE_COLS_COND]], label [[TILESTORE_SCALARIZE_COLS_HEADER]], label [[TILESTORE_SCALARIZE_ROWS_LATCH]]
506; CHECK:       tilestore.scalarize.rows.latch:
507; CHECK-NEXT:    [[TILESTORE_SCALARIZE_ROWS_STEP]] = add i16 [[TILESTORE_SCALARIZE_ROWS_IV]], 1
508; CHECK-NEXT:    [[TILESTORE_SCALARIZE_ROWS_COND:%.*]] = icmp ne i16 [[TILESTORE_SCALARIZE_ROWS_STEP]], [[ROW:%.*]]
509; CHECK-NEXT:    br i1 [[TILESTORE_SCALARIZE_ROWS_COND]], label [[TILESTORE_SCALARIZE_ROWS_HEADER]], label [[CONTINUE:%.*]]
510; CHECK:       continue:
511; CHECK-NEXT:    ret void
512;
513entry:
514  %amx = bitcast <256 x i32> %vec to x86_amx
515  call void @llvm.x86.tilestored64.internal(i16 %row, i16 %col, ptr %ptr, i64 %stride, x86_amx %amx)
516  ret void
517}
518
519define dso_local void @test_amx_zero(i16 signext %row, i16 signext %col, ptr %vptr) #0 {
520; CHECK-LABEL: @test_amx_zero(
521; CHECK-NEXT:  entry:
522; CHECK-NEXT:    store <256 x i32> zeroinitializer, ptr [[VPTR:%.*]], align 64
523; CHECK-NEXT:    ret void
524;
525entry:
526  %amx = call x86_amx @llvm.x86.tilezero.internal(i16 %row, i16 %col)
527  %vec = bitcast x86_amx %amx to <256 x i32>
528  store <256 x i32> %vec, ptr %vptr, align 64
529  ret void
530}
531
532declare x86_amx @llvm.x86.tilezero.internal(i16, i16)
533declare x86_amx @llvm.x86.tileloadd64.internal(i16, i16, ptr, i64)
534declare x86_amx @llvm.x86.tdpbssd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)
535declare x86_amx @llvm.x86.tdpbsud.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)
536declare x86_amx @llvm.x86.tdpbusd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)
537declare x86_amx @llvm.x86.tdpbuud.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)
538declare x86_amx @llvm.x86.tdpbf16ps.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)
539declare void @llvm.x86.tilestored64.internal(i16, i16, ptr, i64, x86_amx)
540
541attributes #0 = { noinline nounwind optnone }
542