xref: /llvm-project/clang/test/CodeGenOpenCL/builtins-amdgcn-mfma.cl (revision 7d544c64e3b6ea014c59e230dcf65ac4f9d60f2b)
1// REQUIRES: amdgpu-registered-target
2// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx908 -DMFMA_GFX908_TESTS -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX908
3// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx90a -DMFMA_GFX90A_TESTS -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX90A
4// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx940 -DMFMA_GFX940_TESTS -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX940
5// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx950 -DMFMA_GFX950_TESTS -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX950
6
7#pragma OPENCL EXTENSION cl_khr_fp64:enable
8
9typedef float  v2f   __attribute__((ext_vector_type(2)));
10typedef float  v4f   __attribute__((ext_vector_type(4)));
11typedef float  v16f  __attribute__((ext_vector_type(16)));
12typedef float  v32f  __attribute__((ext_vector_type(32)));
13typedef half   v4h   __attribute__((ext_vector_type(4)));
14typedef half   v8h   __attribute__((ext_vector_type(8)));
15typedef half   v16h  __attribute__((ext_vector_type(16)));
16typedef half   v32h  __attribute__((ext_vector_type(32)));
17typedef int    v2i   __attribute__((ext_vector_type(2)));
18typedef int    v4i   __attribute__((ext_vector_type(4)));
19typedef int    v8i   __attribute__((ext_vector_type(8)));
20typedef int    v16i  __attribute__((ext_vector_type(16)));
21typedef int    v32i  __attribute__((ext_vector_type(32)));
22typedef short  v2s   __attribute__((ext_vector_type(2)));
23typedef short  v4s   __attribute__((ext_vector_type(4)));
24typedef short  v8s   __attribute__((ext_vector_type(8)));
25typedef short  v16s  __attribute__((ext_vector_type(16)));
26typedef short  v32s  __attribute__((ext_vector_type(32)));
27typedef double v4d   __attribute__((ext_vector_type(4)));
28typedef __bf16 v8bf16   __attribute__((ext_vector_type(8)));
29typedef __bf16 v16bf16   __attribute__((ext_vector_type(16)));
30
31
32#ifdef MFMA_GFX908_TESTS
33
34// CHECK-GFX908-LABEL: @test_mfma_f32_32x32x1f32
35// CHECK-GFX908: call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float %a, float %b, <32 x float> %c, i32 0, i32 0, i32 0)
36void test_mfma_f32_32x32x1f32(global v32f* out, float a, float b, v32f c)
37{
38  *out = __builtin_amdgcn_mfma_f32_32x32x1f32(a, b, c, 0, 0, 0);
39}
40
41// CHECK-GFX908-LABEL: @test_mfma_f32_16x16x1f32
42// CHECK-GFX908: call <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float %a, float %b, <16 x float> %c, i32 0, i32 0, i32 0)
43void test_mfma_f32_16x16x1f32(global v16f* out, float a, float b, v16f c)
44{
45  *out = __builtin_amdgcn_mfma_f32_16x16x1f32(a, b, c, 0, 0, 0);
46}
47
48// CHECK-GFX908-LABEL: @test_mfma_f32_4x4x1f32
49// CHECK-GFX908: call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float %a, float %b, <4 x float> %c, i32 0, i32 0, i32 0)
50void test_mfma_f32_4x4x1f32(global v4f* out, float a, float b, v4f c)
51{
52  *out = __builtin_amdgcn_mfma_f32_4x4x1f32(a, b, c, 0, 0, 0);
53}
54
55// CHECK-GFX908-LABEL: @test_mfma_f32_32x32x2f32
56// CHECK-GFX908: call <16 x float> @llvm.amdgcn.mfma.f32.32x32x2f32(float %a, float %b, <16 x float> %c, i32 0, i32 0, i32 0)
57void test_mfma_f32_32x32x2f32(global v16f* out, float a, float b, v16f c)
58{
59  *out = __builtin_amdgcn_mfma_f32_32x32x2f32(a, b, c, 0, 0, 0);
60}
61
62// CHECK-GFX908-LABEL: @test_mfma_f32_16x16x4f32
63// CHECK-GFX908: call <4 x float> @llvm.amdgcn.mfma.f32.16x16x4f32(float %a, float %b, <4 x float> %c, i32 0, i32 0, i32 0)
64void test_mfma_f32_16x16x4f32(global v4f* out, float a, float b, v4f c)
65{
66  *out = __builtin_amdgcn_mfma_f32_16x16x4f32(a, b, c, 0, 0, 0);
67}
68
69// CHECK-GFX908-LABEL: @test_mfma_f32_32x32x4f16
70// CHECK-GFX908: call <32 x float> @llvm.amdgcn.mfma.f32.32x32x4f16(<4 x half> %a, <4 x half> %b, <32 x float> %c, i32 0, i32 0, i32 0)
71void test_mfma_f32_32x32x4f16(global v32f* out, v4h a, v4h b, v32f c)
72{
73  *out = __builtin_amdgcn_mfma_f32_32x32x4f16(a, b, c, 0, 0, 0);
74}
75
76// CHECK-GFX908-LABEL: @test_mfma_f32_16x16x4f16
77// CHECK-GFX908: call <16 x float> @llvm.amdgcn.mfma.f32.16x16x4f16(<4 x half> %a, <4 x half> %b, <16 x float> %c, i32 0, i32 0, i32 0)
78void test_mfma_f32_16x16x4f16(global v16f* out, v4h a, v4h b, v16f c)
79{
80  *out = __builtin_amdgcn_mfma_f32_16x16x4f16(a, b, c, 0, 0, 0);
81}
82
83// CHECK-GFX908-LABEL: @test_mfma_f32_4x4x4f16
84// CHECK-GFX908: call <4 x float> @llvm.amdgcn.mfma.f32.4x4x4f16(<4 x half> %a, <4 x half> %b, <4 x float> %c, i32 0, i32 0, i32 0)
85void test_mfma_f32_4x4x4f16(global v4f* out, v4h a, v4h b, v4f c)
86{
87  *out = __builtin_amdgcn_mfma_f32_4x4x4f16(a, b, c, 0, 0, 0);
88}
89
90// CHECK-GFX908-LABEL: @test_mfma_f32_32x32x8f16
91// CHECK-GFX908: call <16 x float> @llvm.amdgcn.mfma.f32.32x32x8f16(<4 x half> %a, <4 x half> %b, <16 x float> %c, i32 0, i32 0, i32 0)
92void test_mfma_f32_32x32x8f16(global v16f* out, v4h a, v4h b, v16f c)
93{
94  *out = __builtin_amdgcn_mfma_f32_32x32x8f16(a, b, c, 0, 0, 0);
95}
96
97// CHECK-GFX908-LABEL: @test_mfma_f32_16x16x16f16
98// CHECK-GFX908: call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %a, <4 x half> %b, <4 x float> %c, i32 0, i32 0, i32 0)
99void test_mfma_f32_16x16x16f16(global v4f* out, v4h a, v4h b, v4f c)
100{
101  *out = __builtin_amdgcn_mfma_f32_16x16x16f16(a, b, c, 0, 0, 0);
102}
103
104// CHECK-GFX908-LABEL: @test_mfma_i32_32x32x4i8
105// CHECK-GFX908: call <32 x i32> @llvm.amdgcn.mfma.i32.32x32x4i8(i32 %a, i32 %b, <32 x i32> %c, i32 0, i32 0, i32 0)
106void test_mfma_i32_32x32x4i8(global v32i* out, int a, int b, v32i c)
107{
108  *out = __builtin_amdgcn_mfma_i32_32x32x4i8(a, b, c, 0, 0, 0);
109}
110
111// CHECK-GFX908-LABEL: @test_mfma_i32_16x16x4i8
112// CHECK-GFX908: call <16 x i32> @llvm.amdgcn.mfma.i32.16x16x4i8(i32 %a, i32 %b, <16 x i32> %c, i32 0, i32 0, i32 0)
113void test_mfma_i32_16x16x4i8(global v16i* out, int a, int b, v16i c)
114{
115  *out = __builtin_amdgcn_mfma_i32_16x16x4i8(a, b, c, 0, 0, 0);
116}
117
118// CHECK-GFX908-LABEL: @test_mfma_i32_4x4x4i8
119// CHECK-GFX908: call <4 x i32> @llvm.amdgcn.mfma.i32.4x4x4i8(i32 %a, i32 %b, <4 x i32> %c, i32 0, i32 0, i32 0)
120void test_mfma_i32_4x4x4i8(global v4i* out, int a, int b, v4i c)
121{
122  *out = __builtin_amdgcn_mfma_i32_4x4x4i8(a, b, c, 0, 0, 0);
123}
124
125// CHECK-GFX908-LABEL: @test_mfma_i32_32x32x8i8
126// CHECK-GFX908: call <16 x i32> @llvm.amdgcn.mfma.i32.32x32x8i8(i32 %a, i32 %b, <16 x i32> %c, i32 0, i32 0, i32 0)
127void test_mfma_i32_32x32x8i8(global v16i* out, int a, int b, v16i c)
128{
129  *out = __builtin_amdgcn_mfma_i32_32x32x8i8(a, b, c, 0, 0, 0);
130}
131
132// CHECK-GFX908-LABEL: @test_mfma_i32_16x16x16i8
133// CHECK-GFX908: call <4 x i32> @llvm.amdgcn.mfma.i32.16x16x16i8(i32 %a, i32 %b, <4 x i32> %c, i32 0, i32 0, i32 0)
134void test_mfma_i32_16x16x16i8(global v4i* out, int a, int b, v4i c)
135{
136  *out = __builtin_amdgcn_mfma_i32_16x16x16i8(a, b, c, 0, 0, 0);
137}
138
139// CHECK-GFX908-LABEL: @test_mfma_f32_32x32x2bf16
140// CHECK-GFX908: call <32 x float> @llvm.amdgcn.mfma.f32.32x32x2bf16(<2 x i16> %a, <2 x i16> %b, <32 x float> %c, i32 0, i32 0, i32 0)
141void test_mfma_f32_32x32x2bf16(global v32f* out, v2s a, v2s b, v32f c)
142{
143  *out = __builtin_amdgcn_mfma_f32_32x32x2bf16(a, b, c, 0, 0, 0);
144}
145
146// CHECK-GFX908-LABEL: @test_mfma_f32_16x16x2bf16
147// CHECK-GFX908: call <16 x float> @llvm.amdgcn.mfma.f32.16x16x2bf16(<2 x i16> %a, <2 x i16> %b, <16 x float> %c, i32 0, i32 0, i32 0)
148void test_mfma_f32_16x16x2bf16(global v16f* out, v2s a, v2s b, v16f c)
149{
150  *out = __builtin_amdgcn_mfma_f32_16x16x2bf16(a, b, c, 0, 0, 0);
151}
152
153// CHECK-GFX908-LABEL: @test_mfma_f32_4x4x2bf16
154// CHECK-GFX908: call <4 x float> @llvm.amdgcn.mfma.f32.4x4x2bf16(<2 x i16> %a, <2 x i16> %b, <4 x float> %c, i32 0, i32 0, i32 0)
155void test_mfma_f32_4x4x2bf16(global v4f* out, v2s a, v2s b, v4f c)
156{
157  *out = __builtin_amdgcn_mfma_f32_4x4x2bf16(a, b, c, 0, 0, 0);
158}
159
160// CHECK-GFX908-LABEL: @test_mfma_f32_32x32x4bf16
161// CHECK-GFX908: call <16 x float> @llvm.amdgcn.mfma.f32.32x32x4bf16(<2 x i16> %a, <2 x i16> %b, <16 x float> %c, i32 0, i32 0, i32 0)
162void test_mfma_f32_32x32x4bf16(global v16f* out, v2s a, v2s b, v16f c)
163{
164  *out = __builtin_amdgcn_mfma_f32_32x32x4bf16(a, b, c, 0, 0, 0);
165}
166
167// CHECK-GFX908-LABEL: @test_mfma_f32_16x16x8bf16
168// CHECK-GFX908: call <4 x float> @llvm.amdgcn.mfma.f32.16x16x8bf16(<2 x i16> %a, <2 x i16> %b, <4 x float> %c, i32 0, i32 0, i32 0)
169void test_mfma_f32_16x16x8bf16(global v4f* out, v2s a, v2s b, v4f c)
170{
171  *out = __builtin_amdgcn_mfma_f32_16x16x8bf16(a, b, c, 0, 0, 0);
172}
173
174#endif // MFMA_GFX908_TESTS
175
176#ifdef MFMA_GFX90A_TESTS
177
178// CHECK-GFX90A-LABEL: @test_mfma_f32_32x32x4bf16_1k
179// CHECK-GFX90A: call <32 x float> @llvm.amdgcn.mfma.f32.32x32x4bf16.1k(<4 x i16> %a, <4 x i16> %b, <32 x float> %c, i32 0, i32 0, i32 0)
180void test_mfma_f32_32x32x4bf16_1k(global v32f* out, v4s a, v4s b, v32f c)
181{
182  *out = __builtin_amdgcn_mfma_f32_32x32x4bf16_1k(a, b, c, 0, 0, 0);
183}
184
185// CHECK-GFX90A-LABEL: @test_mfma_f32_16x16x4bf16_1k
186// CHECK-GFX90A: call <16 x float> @llvm.amdgcn.mfma.f32.16x16x4bf16.1k(<4 x i16> %a, <4 x i16> %b, <16 x float> %c, i32 0, i32 0, i32 0)
187void test_mfma_f32_16x16x4bf16_1k(global v16f* out, v4s a, v4s b, v16f c)
188{
189  *out = __builtin_amdgcn_mfma_f32_16x16x4bf16_1k(a, b, c, 0, 0, 0);
190}
191
192// CHECK-GFX90A-LABEL: @test_mfma_f32_4x4x4bf16_1k
193// CHECK-GFX90A: call <4 x float> @llvm.amdgcn.mfma.f32.4x4x4bf16.1k(<4 x i16> %a, <4 x i16> %b, <4 x float> %c, i32 0, i32 0, i32 0)
194void test_mfma_f32_4x4x4bf16_1k(global v4f* out, v4s a, v4s b, v4f c)
195{
196  *out = __builtin_amdgcn_mfma_f32_4x4x4bf16_1k(a, b, c, 0, 0, 0);
197}
198
199// CHECK-GFX90A-LABEL: @test_mfma_f32_32x32x8bf16_1k
200// CHECK-GFX90A: call <16 x float> @llvm.amdgcn.mfma.f32.32x32x8bf16.1k(<4 x i16> %a, <4 x i16> %b, <16 x float> %c, i32 0, i32 0, i32 0)
201void test_mfma_f32_32x32x8bf16_1k(global v16f* out, v4s a, v4s b, v16f c)
202{
203  *out = __builtin_amdgcn_mfma_f32_32x32x8bf16_1k(a, b, c, 0, 0, 0);
204}
205
206// CHECK-GFX90A-LABEL: @test_mfma_f32_16x16x16bf16_1k
207// CHECK-GFX90A: call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16bf16.1k(<4 x i16> %a, <4 x i16> %b, <4 x float> %c, i32 0, i32 0, i32 0)
208void test_mfma_f32_16x16x16bf16_1k(global v4f* out, v4s a, v4s b, v4f c)
209{
210  *out = __builtin_amdgcn_mfma_f32_16x16x16bf16_1k(a, b, c, 0, 0, 0);
211}
212
213// CHECK-GFX90A-LABEL: @test_mfma_f64_16x16x4f64
214// CHECK-GFX90A: call <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double %a, double %b, <4 x double> %c, i32 0, i32 0, i32 0)
215void test_mfma_f64_16x16x4f64(global v4d* out, double a, double b, v4d c)
216{
217  *out = __builtin_amdgcn_mfma_f64_16x16x4f64(a, b, c, 0, 0, 0);
218}
219
220// CHECK-GFX90A-LABEL: @test_mfma_f64_4x4x4f64
221// CHECK-GFX90A: call double @llvm.amdgcn.mfma.f64.4x4x4f64(double %a, double %b, double %c, i32 0, i32 0, i32 0)
222void test_mfma_f64_4x4x4f64(global double* out, double a, double b, double c)
223{
224  *out = __builtin_amdgcn_mfma_f64_4x4x4f64(a, b, c, 0, 0, 0);
225}
226
227#endif // MFMA_GFX90A_TESTS
228
229#if defined(MFMA_GFX940_TESTS) || defined(MFMA_GFX950_TESTS)
230// CHECK-GFX940-LABEL: @test_mfma_i32_16x16x32_i8
231// CHECK-GFX940: call <4 x i32> @llvm.amdgcn.mfma.i32.16x16x32.i8(i64 %a, i64 %b, <4 x i32> %c, i32 0, i32 0, i32 0)
232void test_mfma_i32_16x16x32_i8(global v4i* out, long a, long b, v4i c)
233{
234  *out = __builtin_amdgcn_mfma_i32_16x16x32_i8(a, b, c, 0, 0, 0);
235}
236
237// CHECK-GFX940-LABEL: @test_mfma_i32_32x32x16_i8
238// CHECK-GFX940: call <16 x i32> @llvm.amdgcn.mfma.i32.32x32x16.i8(i64 %a, i64 %b, <16 x i32> %c, i32 0, i32 0, i32 0)
239void test_mfma_i32_32x32x16_i8(global v16i* out, long a, long b, v16i c)
240{
241  *out = __builtin_amdgcn_mfma_i32_32x32x16_i8(a, b, c, 0, 0, 0);
242}
243
244// CHECK-GFX940-LABEL: @test_mfma_f32_16x16x8_xf32
245// CHECK-GFX940: call <4 x float> @llvm.amdgcn.mfma.f32.16x16x8.xf32(<2 x float> %a, <2 x float> %b, <4 x float> %c, i32 0, i32 0, i32 0)
246void test_mfma_f32_16x16x8_xf32(global v4f* out, v2f a, v2f b, v4f c)
247{
248  *out = __builtin_amdgcn_mfma_f32_16x16x8_xf32(a, b, c, 0, 0, 0);
249}
250
251// CHECK-GFX940-LABEL: @test_mfma_f32_32x32x4_xf32
252// CHECK-GFX940: call <16 x float> @llvm.amdgcn.mfma.f32.32x32x4.xf32(<2 x float> %a, <2 x float> %b, <16 x float> %c, i32 0, i32 0, i32 0)
253void test_mfma_f32_32x32x4_xf32(global v16f* out, v2f a, v2f b, v16f c)
254{
255  *out = __builtin_amdgcn_mfma_f32_32x32x4_xf32(a, b, c, 0, 0, 0);
256}
257
258// CHECK-GFX940-LABEL: @test_mfma_f32_16x16x32_bf8_bf8
259// CHECK-GFX940: call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.bf8.bf8(i64 %a, i64 %b, <4 x float> %c, i32 0, i32 0, i32 0)
260void test_mfma_f32_16x16x32_bf8_bf8(global v4f* out, long a, long b, v4f c)
261{
262  *out = __builtin_amdgcn_mfma_f32_16x16x32_bf8_bf8(a, b, c, 0, 0, 0);
263}
264
265// CHECK-GFX940-LABEL: @test_mfma_f32_16x16x32_bf8_fp8
266// CHECK-GFX940: call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.bf8.fp8(i64 %a, i64 %b, <4 x float> %c, i32 0, i32 0, i32 0)
267void test_mfma_f32_16x16x32_bf8_fp8(global v4f* out, long a, long b, v4f c)
268{
269  *out = __builtin_amdgcn_mfma_f32_16x16x32_bf8_fp8(a, b, c, 0, 0, 0);
270}
271
272// CHECK-GFX940-LABEL: @test_mfma_f32_16x16x32_fp8_bf8
273// CHECK-GFX940: call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.fp8.bf8(i64 %a, i64 %b, <4 x float> %c, i32 0, i32 0, i32 0)
274void test_mfma_f32_16x16x32_fp8_bf8(global v4f* out, long a, long b, v4f c)
275{
276  *out = __builtin_amdgcn_mfma_f32_16x16x32_fp8_bf8(a, b, c, 0, 0, 0);
277}
278
279// CHECK-GFX940-LABEL: @test_mfma_f32_16x16x32_fp8_fp8
280// CHECK-GFX940: call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.fp8.fp8(i64 %a, i64 %b, <4 x float> %c, i32 0, i32 0, i32 0)
281void test_mfma_f32_16x16x32_fp8_fp8(global v4f* out, long a, long b, v4f c)
282{
283  *out = __builtin_amdgcn_mfma_f32_16x16x32_fp8_fp8(a, b, c, 0, 0, 0);
284}
285
286// CHECK-GFX940-LABEL: @test_mfma_f32_32x32x16_bf8_bf8
287// CHECK-GFX940: call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.bf8.bf8(i64 %a, i64 %b, <16 x float> %c, i32 0, i32 0, i32 0)
288void test_mfma_f32_32x32x16_bf8_bf8(global v16f* out, long a, long b, v16f c)
289{
290  *out = __builtin_amdgcn_mfma_f32_32x32x16_bf8_bf8(a, b, c, 0, 0, 0);
291}
292
293// CHECK-GFX940-LABEL: @test_mfma_f32_32x32x16_bf8_fp8
294// CHECK-GFX940: call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.bf8.fp8(i64 %a, i64 %b, <16 x float> %c, i32 0, i32 0, i32 0)
295void test_mfma_f32_32x32x16_bf8_fp8(global v16f* out, long a, long b, v16f c)
296{
297  *out = __builtin_amdgcn_mfma_f32_32x32x16_bf8_fp8(a, b, c, 0, 0, 0);
298}
299
300// CHECK-GFX940-LABEL: @test_mfma_f32_32x32x16_fp8_bf8
301// CHECK-GFX940: call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.fp8.bf8(i64 %a, i64 %b, <16 x float> %c, i32 0, i32 0, i32 0)
302void test_mfma_f32_32x32x16_fp8_bf8(global v16f* out, long a, long b, v16f c)
303{
304  *out = __builtin_amdgcn_mfma_f32_32x32x16_fp8_bf8(a, b, c, 0, 0, 0);
305}
306
307// CHECK-GFX940-LABEL: @test_mfma_f32_32x32x16_fp8_fp8
308// CHECK-GFX940: call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.fp8.fp8(i64 %a, i64 %b, <16 x float> %c, i32 0, i32 0, i32 0)
309void test_mfma_f32_32x32x16_fp8_fp8(global v16f* out, long a, long b, v16f c)
310{
311  *out = __builtin_amdgcn_mfma_f32_32x32x16_fp8_fp8(a, b, c, 0, 0, 0);
312}
313
314// CHECK-GFX940-LABEL: @test_smfmac_f32_16x16x32_f16
315// CHECK-GFX940: call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x32.f16(<4 x half> %a, <8 x half> %b, <4 x float> %c, i32 %idx, i32 0, i32 0)
316void test_smfmac_f32_16x16x32_f16(global v4f* out, v4h a, v8h b, v4f c, int idx)
317{
318  *out = __builtin_amdgcn_smfmac_f32_16x16x32_f16(a, b, c, idx, 0, 0);
319}
320
321// CHECK-GFX940-LABEL: @test_smfmac_f32_32x32x16_f16
322// CHECK-GFX940: call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x16.f16(<4 x half> %a, <8 x half> %b, <16 x float> %c, i32 %idx, i32 0, i32 0)
323void test_smfmac_f32_32x32x16_f16(global v16f* out, v4h a, v8h b, v16f c, int idx)
324{
325  *out = __builtin_amdgcn_smfmac_f32_32x32x16_f16(a, b, c, idx, 0, 0);
326}
327
328// CHECK-GFX940-LABEL: @test_smfmac_f32_16x16x32_bf16
329// CHECK-GFX940: call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x32.bf16(<4 x i16> %a, <8 x i16> %b, <4 x float> %c, i32 %idx, i32 0, i32 0)
330void test_smfmac_f32_16x16x32_bf16(global v4f* out, v4s a, v8s b, v4f c, int idx)
331{
332  *out = __builtin_amdgcn_smfmac_f32_16x16x32_bf16(a, b, c, idx, 0, 0);
333}
334
335// CHECK-GFX940-LABEL: @test_smfmac_f32_32x32x16_bf16
336// CHECK-GFX940: call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x16.bf16(<4 x i16> %a, <8 x i16> %b, <16 x float> %c, i32 %idx, i32 0, i32 0)
337void test_smfmac_f32_32x32x16_bf16(global v16f* out, v4s a, v8s b, v16f c, int idx)
338{
339  *out = __builtin_amdgcn_smfmac_f32_32x32x16_bf16(a, b, c, idx, 0, 0);
340}
341
342// CHECK-GFX940-LABEL: @test_smfmac_i32_16x16x64_i8
343// CHECK-GFX940: call <4 x i32> @llvm.amdgcn.smfmac.i32.16x16x64.i8(<2 x i32> %a, <4 x i32> %b, <4 x i32> %c, i32 %idx, i32 0, i32 0)
344void test_smfmac_i32_16x16x64_i8(global v4i* out, v2i a, v4i b, v4i c, int idx)
345{
346  *out = __builtin_amdgcn_smfmac_i32_16x16x64_i8(a, b, c, idx, 0, 0);
347}
348
349// CHECK-GFX940-LABEL: @test_smfmac_i32_32x32x32_i8
350// CHECK-GFX940: call <16 x i32> @llvm.amdgcn.smfmac.i32.32x32x32.i8(<2 x i32> %a, <4 x i32> %b, <16 x i32> %c, i32 %idx, i32 0, i32 0)
351void test_smfmac_i32_32x32x32_i8(global v16i* out, v2i a, v4i b, v16i c, int idx)
352{
353  *out = __builtin_amdgcn_smfmac_i32_32x32x32_i8(a, b, c, idx, 0, 0);
354}
355
356// CHECK-GFX940-LABEL: @test_smfmac_f32_16x16x64_bf8_bf8
357// CHECK-GFX940: call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.bf8.bf8(<2 x i32> %a, <4 x i32> %b, <4 x float> %c, i32 %idx, i32 0, i32 0)
358void test_smfmac_f32_16x16x64_bf8_bf8(global v4f* out, v2i a, v4i b, v4f c, int idx)
359{
360  *out = __builtin_amdgcn_smfmac_f32_16x16x64_bf8_bf8(a, b, c, idx, 0, 0);
361}
362
363// CHECK-GFX940-LABEL: @test_smfmac_f32_16x16x64_bf8_fp8
364// CHECK-GFX940: call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.bf8.fp8(<2 x i32> %a, <4 x i32> %b, <4 x float> %c, i32 %idx, i32 0, i32 0)
365void test_smfmac_f32_16x16x64_bf8_fp8(global v4f* out, v2i a, v4i b, v4f c, int idx)
366{
367  *out = __builtin_amdgcn_smfmac_f32_16x16x64_bf8_fp8(a, b, c, idx, 0, 0);
368}
369
370// CHECK-GFX940-LABEL: @test_smfmac_f32_16x16x64_fp8_bf8
371// CHECK-GFX940: call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.fp8.bf8(<2 x i32> %a, <4 x i32> %b, <4 x float> %c, i32 %idx, i32 0, i32 0)
372void test_smfmac_f32_16x16x64_fp8_bf8(global v4f* out, v2i a, v4i b, v4f c, int idx)
373{
374  *out = __builtin_amdgcn_smfmac_f32_16x16x64_fp8_bf8(a, b, c, idx, 0, 0);
375}
376
377// CHECK-GFX940-LABEL: @test_smfmac_f32_16x16x64_fp8_fp8
378// CHECK-GFX940: call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.fp8.fp8(<2 x i32> %a, <4 x i32> %b, <4 x float> %c, i32 %idx, i32 0, i32 0)
379void test_smfmac_f32_16x16x64_fp8_fp8(global v4f* out, v2i a, v4i b, v4f c, int idx)
380{
381  *out = __builtin_amdgcn_smfmac_f32_16x16x64_fp8_fp8(a, b, c, idx, 0, 0);
382}
383
384// CHECK-GFX940-LABEL: @test_smfmac_f32_32x32x32_bf8_bf8
385// CHECK-GFX940: call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.bf8.bf8(<2 x i32> %a, <4 x i32> %b, <16 x float> %c, i32 %idx, i32 0, i32 0)
386void test_smfmac_f32_32x32x32_bf8_bf8(global v16f* out, v2i a, v4i b, v16f c, int idx)
387{
388  *out = __builtin_amdgcn_smfmac_f32_32x32x32_bf8_bf8(a, b, c, idx, 0, 0);
389}
390
391// CHECK-GFX940-LABEL: @test_smfmac_f32_32x32x32_bf8_fp8
392// CHECK-GFX940: call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.bf8.fp8(<2 x i32> %a, <4 x i32> %b, <16 x float> %c, i32 %idx, i32 0, i32 0)
393void test_smfmac_f32_32x32x32_bf8_fp8(global v16f* out, v2i a, v4i b, v16f c, int idx)
394{
395  *out = __builtin_amdgcn_smfmac_f32_32x32x32_bf8_fp8(a, b, c, idx, 0, 0);
396}
397
398// CHECK-GFX940-LABEL: @test_smfmac_f32_32x32x32_fp8_bf8
399// CHECK-GFX940: call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.fp8.bf8(<2 x i32> %a, <4 x i32> %b, <16 x float> %c, i32 %idx, i32 0, i32 0)
400void test_smfmac_f32_32x32x32_fp8_bf8(global v16f* out, v2i a, v4i b, v16f c, int idx)
401{
402  *out = __builtin_amdgcn_smfmac_f32_32x32x32_fp8_bf8(a, b, c, idx, 0, 0);
403}
404
405// CHECK-GFX940-LABEL: @test_smfmac_f32_32x32x32_fp8_fp8
406// CHECK-GFX940: call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.fp8.fp8(<2 x i32> %a, <4 x i32> %b, <16 x float> %c, i32 %idx, i32 0, i32 0)
407void test_smfmac_f32_32x32x32_fp8_fp8(global v16f* out, v2i a, v4i b, v16f c, int idx)
408{
409  *out = __builtin_amdgcn_smfmac_f32_32x32x32_fp8_fp8(a, b, c, idx, 0, 0);
410}
411#endif // defined(MFMA_GFX940_TESTS) || defined(MFMA_GFX950_TESTS)
412
413#ifdef MFMA_GFX950_TESTS
414
415// CHECK-GFX950-LABEL: @test_mfma_f32_16x16x32_f16(
416// CHECK-GFX950: tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.f16(<8 x half> %a, <8 x half> %b, <4 x float> %c, i32 1, i32 2, i32 3)
417
418v4f test_mfma_f32_16x16x32_f16(v8h a, v8h b, v4f c)
419{
420  return __builtin_amdgcn_mfma_f32_16x16x32_f16(a, b, c, 1, 2, 3);
421}
422
423// CHECK-GFX950-LABEL: @test_mfma_f32_32x32x16_f16
424// CHECK-GFX950: tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %a, <8 x half> %b, <16 x float> %c, i32 1, i32 2, i32 3)
425v16f test_mfma_f32_32x32x16_f16(v8h a, v8h b, v16f c)
426{
427  return __builtin_amdgcn_mfma_f32_32x32x16_f16(a, b, c, 1, 2, 3);
428}
429
430// CHECK-GFX950-LABEL: @test_mfma_f32_32x32x16_bf16(
431// CHECK-GFX950: tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.bf16(<8 x bfloat> %a, <8 x bfloat> %b, <16 x float> %c, i32 1, i32 2, i32 3)
432v16f test_mfma_f32_32x32x16_bf16(v8bf16 a, v8bf16 b, v16f c) {
433  return __builtin_amdgcn_mfma_f32_32x32x16_bf16(a, b, c, 1, 2, 3);
434}
435
436// CHECK-GFX950-LABEL: @test_mfma_scale_f32_16x16x128_f8f6f4
437// CHECK-GFX950: [[EXTRACT_A:%.+]] = shufflevector <8 x i32> %a, <8 x i32> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5>
438// CHECK-GFX950: call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v8i32(<6 x i32> [[EXTRACT_A]], <8 x i32> %b, <4 x float> %c, i32 3, i32 1, i32 2, i32 %scale_a, i32 3, i32 %scale_b)
439void test_mfma_scale_f32_16x16x128_f8f6f4(global v4f* out, v8i a, v8i b, v4f c, int scale_a, int scale_b)
440{
441  *out = __builtin_amdgcn_mfma_scale_f32_16x16x128_f8f6f4(a, b, c, 3, 1, 2, scale_a, 3, scale_b);
442}
443
444// CHECK-GFX950-LABEL: @test_mfma_scale_f32_32x32x64_f8f6f4
445// CHECK-GFX950: [[EXTRACT_A:%.+]] = shufflevector <8 x i32> %a, <8 x i32> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5>
446// CHECK-GFX950: call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v8i32(<6 x i32> [[EXTRACT_A]], <8 x i32> %b, <16 x float> %c, i32 3, i32 1, i32 2, i32 %scale_a, i32 3, i32 %scale_b)
447void test_mfma_scale_f32_32x32x64_f8f6f4(global v16f* out, v8i a, v8i b, v16f c, int scale_a, int scale_b)
448{
449  *out = __builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4(a, b, c, 3, 1, 2, scale_a, 3, scale_b);
450}
451
452// CHECK-GFX950-LABEL: @test_mfma_i32_16x16x64_i8(
453// CHECK-GFX950: tail call <4 x i32> @llvm.amdgcn.mfma.i32.16x16x64.i8(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, i32 1, i32 2, i32 3)
454v4i test_mfma_i32_16x16x64_i8(v4i a, v4i b, v4i c) {
455  return __builtin_amdgcn_mfma_i32_16x16x64_i8(a, b, c, 1, 2, 3);
456}
457
458// CHECK-GFX950-LABEL: @test_mfma_i32_32x32x32_i8(
459// CHECK-GFX950: tail call <16 x i32> @llvm.amdgcn.mfma.i32.32x32x32.i8(<4 x i32> %a, <4 x i32> %b, <16 x i32> %c, i32 1, i32 2, i32 3)
460v16i test_mfma_i32_32x32x32_i8(v4i a, v4i b, v16i c) {
461  return __builtin_amdgcn_mfma_i32_32x32x32_i8(a, b, c, 1, 2, 3);
462}
463
464// CHECK-GFX950-LABEL: @test_mfma_f32_16x16x32_bf16(
465// CHECK-GFX950: tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.bf16(<8 x bfloat> %a, <8 x bfloat> %b, <4 x float> %c, i32 1, i32 2, i32 3)
466v4f test_mfma_f32_16x16x32_bf16(v8bf16 a, v8bf16 b, v4f c)
467{
468  return __builtin_amdgcn_mfma_f32_16x16x32_bf16(a, b, c, 1, 2, 3);
469}
470
471// CHECK-GFX950-LABEL: @test_smfmac_f32_16x16x64_f16
472// CHECK-GFX950: call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.f16(<8 x half> %a, <16 x half> %b, <4 x float> %c, i32 %idx, i32 0, i32 0)
473void test_smfmac_f32_16x16x64_f16(global v4f* out, v8h a, v16h b, v4f c, int idx)
474{
475  *out = __builtin_amdgcn_smfmac_f32_16x16x64_f16(a, b, c, idx, 0, 0);
476}
477
478// CHECK-GFX950-LABEL: @test_smfmac_f32_32x32x32_f16
479// CHECK-GFX950: call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.f16(<8 x half> %a, <16 x half> %b, <16 x float> %c, i32 %idx, i32 0, i32 0)
480void test_smfmac_f32_32x32x32_f16(global v16f* out, v8h a, v16h b, v16f c, int idx)
481{
482  *out = __builtin_amdgcn_smfmac_f32_32x32x32_f16(a, b, c, idx, 0, 0);
483}
484
485// CHECK-GFX950-LABEL: @test_smfmac_f32_16x16x64_bf16
486// CHECK-GFX950: call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.bf16(<8 x bfloat> %a, <16 x bfloat> %b, <4 x float> %c, i32 %idx, i32 0, i32 0)
487void test_smfmac_f32_16x16x64_bf16(global v4f* out, v8bf16 a, v16bf16 b, v4f c, int idx)
488{
489  *out = __builtin_amdgcn_smfmac_f32_16x16x64_bf16(a, b, c, idx, 0, 0);
490}
491
492// CHECK-GFX950-LABEL: @test_smfmac_f32_32x32x32_bf16
493// CHECK-GFX950: call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.bf16(<8 x bfloat> %a, <16 x bfloat> %b, <16 x float> %c, i32 %idx, i32 0, i32 0)
494void test_smfmac_f32_32x32x32_bf16(global v16f* out, v8bf16 a, v16bf16 b, v16f c, int idx)
495{
496  *out = __builtin_amdgcn_smfmac_f32_32x32x32_bf16(a, b, c, idx, 0, 0);
497}
498
499// CHECK-GFX950-LABEL: @test_smfmac_i32_16x16x128_i8
500// CHECK-GFX950: call <4 x i32> @llvm.amdgcn.smfmac.i32.16x16x128.i8(<4 x i32> %a, <8 x i32> %b, <4 x i32> %c, i32 %idx, i32 0, i32 0)
501void test_smfmac_i32_16x16x128_i8(global v4i* out, v4i a, v8i b, v4i c, int idx)
502{
503  *out = __builtin_amdgcn_smfmac_i32_16x16x128_i8(a, b, c, idx, 0, 0);
504}
505
506// CHECK-GFX950-LABEL: @test_smfmac_i32_32x32x64_i8
507// CHECK-GFX950: call <16 x i32> @llvm.amdgcn.smfmac.i32.32x32x64.i8(<4 x i32> %a, <8 x i32> %b, <16 x i32> %c, i32 %idx, i32 0, i32 0)
508void test_smfmac_i32_32x32x64_i8(global v16i* out, v4i a, v8i b, v16i c, int idx)
509{
510  *out = __builtin_amdgcn_smfmac_i32_32x32x64_i8(a, b, c, idx, 0, 0);
511}
512
513// CHECK-GFX950-LABEL: @test_smfmac_f32_16x16x128_bf8_bf8
514// CHECK-GFX950: call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.bf8.bf8(<4 x i32> %a, <8 x i32> %b, <4 x float> %c, i32 %idx, i32 0, i32 0)
515void test_smfmac_f32_16x16x128_bf8_bf8(global v4f* out, v4i a, v8i b, v4f c, int idx)
516{
517  *out = __builtin_amdgcn_smfmac_f32_16x16x128_bf8_bf8(a, b, c, idx, 0, 0);
518}
519
520// CHECK-GFX950-LABEL: @test_smfmac_f32_16x16x128_bf8_fp8
521// CHECK-GFX950: call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.bf8.fp8(<4 x i32> %a, <8 x i32> %b, <4 x float> %c, i32 %idx, i32 0, i32 0)
522void test_smfmac_f32_16x16x128_bf8_fp8(global v4f* out, v4i a, v8i b, v4f c, int idx)
523{
524  *out = __builtin_amdgcn_smfmac_f32_16x16x128_bf8_fp8(a, b, c, idx, 0, 0);
525}
526
527// CHECK-GFX950-LABEL: @test_smfmac_f32_16x16x128_fp8_bf8
528// CHECK-GFX950: call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.fp8.bf8(<4 x i32> %a, <8 x i32> %b, <4 x float> %c, i32 %idx, i32 0, i32 0)
529void test_smfmac_f32_16x16x128_fp8_bf8(global v4f* out, v4i a, v8i b, v4f c, int idx)
530{
531  *out = __builtin_amdgcn_smfmac_f32_16x16x128_fp8_bf8(a, b, c, idx, 0, 0);
532}
533
534// CHECK-GFX950-LABEL: @test_smfmac_f32_16x16x128_fp8_fp8
535// CHECK-GFX950: call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.fp8.fp8(<4 x i32> %a, <8 x i32> %b, <4 x float> %c, i32 %idx, i32 0, i32 0)
536void test_smfmac_f32_16x16x128_fp8_fp8(global v4f* out, v4i a, v8i b, v4f c, int idx)
537{
538  *out = __builtin_amdgcn_smfmac_f32_16x16x128_fp8_fp8(a, b, c, idx, 0, 0);
539}
540
541// CHECK-GFX950-LABEL: @test_smfmac_f32_32x32x64_bf8_bf8
542// CHECK-GFX950: call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.bf8.bf8(<4 x i32> %a, <8 x i32> %b, <16 x float> %c, i32 %idx, i32 0, i32 0)
543void test_smfmac_f32_32x32x64_bf8_bf8(global v16f* out, v4i a, v8i b, v16f c, int idx)
544{
545  *out = __builtin_amdgcn_smfmac_f32_32x32x64_bf8_bf8(a, b, c, idx, 0, 0);
546}
547
548// CHECK-GFX950-LABEL: @test_smfmac_f32_32x32x64_bf8_fp8
549// CHECK-GFX950: call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.bf8.fp8(<4 x i32> %a, <8 x i32> %b, <16 x float> %c, i32 %idx, i32 0, i32 0)
550void test_smfmac_f32_32x32x64_bf8_fp8(global v16f* out, v4i a, v8i b, v16f c, int idx)
551{
552  *out = __builtin_amdgcn_smfmac_f32_32x32x64_bf8_fp8(a, b, c, idx, 0, 0);
553}
554
555// CHECK-GFX950-LABEL: @test_smfmac_f32_32x32x64_fp8_bf8
556// CHECK-GFX950: call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.fp8.bf8(<4 x i32> %a, <8 x i32> %b, <16 x float> %c, i32 %idx, i32 0, i32 0)
557void test_smfmac_f32_32x32x64_fp8_bf8(global v16f* out, v4i a, v8i b, v16f c, int idx)
558{
559  *out = __builtin_amdgcn_smfmac_f32_32x32x64_fp8_bf8(a, b, c, idx, 0, 0);
560}
561
562// CHECK-GFX950-LABEL: @test_smfmac_f32_32x32x64_fp8_fp8
563// CHECK-GFX950: call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.fp8.fp8(<4 x i32> %a, <8 x i32> %b, <16 x float> %c, i32 %idx, i32 0, i32 0)
564void test_smfmac_f32_32x32x64_fp8_fp8(global v16f* out, v4i a, v8i b, v16f c, int idx)
565{
566  *out = __builtin_amdgcn_smfmac_f32_32x32x64_fp8_fp8(a, b, c, idx, 0, 0);
567}
568
569#endif
570