// REQUIRES: amdgpu-registered-target
// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx908 -DMFMA_GFX908_TESTS -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX908
// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx90a -DMFMA_GFX90A_TESTS -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX90A
// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx940 -DMFMA_GFX940_TESTS -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX940
// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx950 -DMFMA_GFX950_TESTS -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX950

// Code generation test for the __builtin_amdgcn_mfma_* and
// __builtin_amdgcn_smfmac_* builtins.
//
// Each test function calls exactly one builtin; the FileCheck patterns placed
// directly above it pin the emitted call to the matching llvm.amdgcn.*
// intrinsic, including operand types and the immediate arguments.
//
// The file is compiled once per target CPU. Every invocation defines a single
// MFMA_<target>_TESTS macro, which selects the sections below that are
// compiled for that target.
//
// NOTE(review): the gfx950 invocation enables only the CHECK-GFX950 prefix,
// but it also compiles the shared gfx940/gfx950 section, whose patterns use
// only the CHECK-GFX940 prefix. The IR produced for that section on gfx950 is
// therefore never checked - confirm whether the gfx950 invocation should
// enable both prefixes.

#pragma OPENCL EXTENSION cl_khr_fp64:enable

// Vector types used as builtin operands and results.
typedef float  v2f   __attribute__((ext_vector_type(2)));
typedef float  v4f   __attribute__((ext_vector_type(4)));
typedef float  v16f  __attribute__((ext_vector_type(16)));
typedef float  v32f  __attribute__((ext_vector_type(32)));
typedef half   v4h   __attribute__((ext_vector_type(4)));
typedef half   v8h   __attribute__((ext_vector_type(8)));
typedef half   v16h  __attribute__((ext_vector_type(16)));
typedef half   v32h  __attribute__((ext_vector_type(32)));
typedef int    v2i   __attribute__((ext_vector_type(2)));
typedef int    v4i   __attribute__((ext_vector_type(4)));
typedef int    v8i   __attribute__((ext_vector_type(8)));
typedef int    v16i  __attribute__((ext_vector_type(16)));
typedef int    v32i  __attribute__((ext_vector_type(32)));
typedef short  v2s   __attribute__((ext_vector_type(2)));
typedef short  v4s   __attribute__((ext_vector_type(4)));
typedef short  v8s   __attribute__((ext_vector_type(8)));
typedef short  v16s  __attribute__((ext_vector_type(16)));
typedef short  v32s  __attribute__((ext_vector_type(32)));
typedef double v4d   __attribute__((ext_vector_type(4)));
typedef __bf16 v8bf16 __attribute__((ext_vector_type(8)));
typedef __bf16 v16bf16 __attribute__((ext_vector_type(16)));


// Section compiled only by the gfx908 invocation.
#ifdef MFMA_GFX908_TESTS

// CHECK-GFX908-LABEL: @test_mfma_f32_32x32x1f32
// CHECK-GFX908: call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float %a, float %b, <32 x float> %c, i32 0, i32 0, i32 0)
void test_mfma_f32_32x32x1f32(global v32f* out, float a, float b, v32f c)
{
  *out = __builtin_amdgcn_mfma_f32_32x32x1f32(a, b, c, 0, 0, 0);
}

// CHECK-GFX908-LABEL: @test_mfma_f32_16x16x1f32
// CHECK-GFX908: call <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float %a, float %b, <16 x float> %c, i32 0, i32 0, i32 0)
void test_mfma_f32_16x16x1f32(global v16f* out, float a, float b, v16f c)
{
  *out = __builtin_amdgcn_mfma_f32_16x16x1f32(a, b, c, 0, 0, 0);
}

// CHECK-GFX908-LABEL: @test_mfma_f32_4x4x1f32
// CHECK-GFX908: call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float %a, float %b, <4 x float> %c, i32 0, i32 0, i32 0)
void test_mfma_f32_4x4x1f32(global v4f* out, float a, float b, v4f c)
{
  *out = __builtin_amdgcn_mfma_f32_4x4x1f32(a, b, c, 0, 0, 0);
}

// CHECK-GFX908-LABEL: @test_mfma_f32_32x32x2f32
// CHECK-GFX908: call <16 x float> @llvm.amdgcn.mfma.f32.32x32x2f32(float %a, float %b, <16 x float> %c, i32 0, i32 0, i32 0)
void test_mfma_f32_32x32x2f32(global v16f* out, float a, float b, v16f c)
{
  *out = __builtin_amdgcn_mfma_f32_32x32x2f32(a, b, c, 0, 0, 0);
}

// CHECK-GFX908-LABEL: @test_mfma_f32_16x16x4f32
// CHECK-GFX908: call <4 x float> @llvm.amdgcn.mfma.f32.16x16x4f32(float %a, float %b, <4 x float> %c, i32 0, i32 0, i32 0)
void test_mfma_f32_16x16x4f32(global v4f* out, float a, float b, v4f c)
{
  *out = __builtin_amdgcn_mfma_f32_16x16x4f32(a, b, c, 0, 0, 0);
}

// CHECK-GFX908-LABEL: @test_mfma_f32_32x32x4f16
// CHECK-GFX908: call <32 x float> @llvm.amdgcn.mfma.f32.32x32x4f16(<4 x half> %a, <4 x half> %b, <32 x float> %c, i32 0, i32 0, i32 0)
void test_mfma_f32_32x32x4f16(global v32f* out, v4h a, v4h b, v32f c)
{
  *out = __builtin_amdgcn_mfma_f32_32x32x4f16(a, b, c, 0, 0, 0);
}

// CHECK-GFX908-LABEL: @test_mfma_f32_16x16x4f16
// CHECK-GFX908: call <16 x float> @llvm.amdgcn.mfma.f32.16x16x4f16(<4 x half> %a, <4 x half> %b, <16 x float> %c, i32 0, i32 0, i32 0)
void test_mfma_f32_16x16x4f16(global v16f* out, v4h a, v4h b, v16f c)
{
  *out = __builtin_amdgcn_mfma_f32_16x16x4f16(a, b, c, 0, 0, 0);
}

// CHECK-GFX908-LABEL: @test_mfma_f32_4x4x4f16
// CHECK-GFX908: call <4 x float> @llvm.amdgcn.mfma.f32.4x4x4f16(<4 x half> %a, <4 x half> %b, <4 x float> %c, i32 0, i32 0, i32 0)
void test_mfma_f32_4x4x4f16(global v4f* out, v4h a, v4h b, v4f c)
{
  *out = __builtin_amdgcn_mfma_f32_4x4x4f16(a, b, c, 0, 0, 0);
}

// CHECK-GFX908-LABEL: @test_mfma_f32_32x32x8f16
// CHECK-GFX908: call <16 x float> @llvm.amdgcn.mfma.f32.32x32x8f16(<4 x half> %a, <4 x half> %b, <16 x float> %c, i32 0, i32 0, i32 0)
void test_mfma_f32_32x32x8f16(global v16f* out, v4h a, v4h b, v16f c)
{
  *out = __builtin_amdgcn_mfma_f32_32x32x8f16(a, b, c, 0, 0, 0);
}

// CHECK-GFX908-LABEL: @test_mfma_f32_16x16x16f16
// CHECK-GFX908: call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %a, <4 x half> %b, <4 x float> %c, i32 0, i32 0, i32 0)
void test_mfma_f32_16x16x16f16(global v4f* out, v4h a, v4h b, v4f c)
{
  *out = __builtin_amdgcn_mfma_f32_16x16x16f16(a, b, c, 0, 0, 0);
}

// CHECK-GFX908-LABEL: @test_mfma_i32_32x32x4i8
// CHECK-GFX908: call <32 x i32> @llvm.amdgcn.mfma.i32.32x32x4i8(i32 %a, i32 %b, <32 x i32> %c, i32 0, i32 0, i32 0)
void test_mfma_i32_32x32x4i8(global v32i* out, int a, int b, v32i c)
{
  *out = __builtin_amdgcn_mfma_i32_32x32x4i8(a, b, c, 0, 0, 0);
}

// CHECK-GFX908-LABEL: @test_mfma_i32_16x16x4i8
// CHECK-GFX908: call <16 x i32> @llvm.amdgcn.mfma.i32.16x16x4i8(i32 %a, i32 %b, <16 x i32> %c, i32 0, i32 0, i32 0)
void test_mfma_i32_16x16x4i8(global v16i* out, int a, int b, v16i c)
{
  *out = __builtin_amdgcn_mfma_i32_16x16x4i8(a, b, c, 0, 0, 0);
}

// CHECK-GFX908-LABEL: @test_mfma_i32_4x4x4i8
// CHECK-GFX908: call <4 x i32> @llvm.amdgcn.mfma.i32.4x4x4i8(i32 %a, i32 %b, <4 x i32> %c, i32 0, i32 0, i32 0)
void test_mfma_i32_4x4x4i8(global v4i* out, int a, int b, v4i c)
{
  *out = __builtin_amdgcn_mfma_i32_4x4x4i8(a, b, c, 0, 0, 0);
}

// CHECK-GFX908-LABEL: @test_mfma_i32_32x32x8i8
// CHECK-GFX908: call <16 x i32> @llvm.amdgcn.mfma.i32.32x32x8i8(i32 %a, i32 %b, <16 x i32> %c, i32 0, i32 0, i32 0)
void test_mfma_i32_32x32x8i8(global v16i* out, int a, int b, v16i c)
{
  *out = __builtin_amdgcn_mfma_i32_32x32x8i8(a, b, c, 0, 0, 0);
}

// CHECK-GFX908-LABEL: @test_mfma_i32_16x16x16i8
// CHECK-GFX908: call <4 x i32> @llvm.amdgcn.mfma.i32.16x16x16i8(i32 %a, i32 %b, <4 x i32> %c, i32 0, i32 0, i32 0)
void test_mfma_i32_16x16x16i8(global v4i* out, int a, int b, v4i c)
{
  *out = __builtin_amdgcn_mfma_i32_16x16x16i8(a, b, c, 0, 0, 0);
}

// CHECK-GFX908-LABEL: @test_mfma_f32_32x32x2bf16
// CHECK-GFX908: call <32 x float> @llvm.amdgcn.mfma.f32.32x32x2bf16(<2 x i16> %a, <2 x i16> %b, <32 x float> %c, i32 0, i32 0, i32 0)
void test_mfma_f32_32x32x2bf16(global v32f* out, v2s a, v2s b, v32f c)
{
  *out = __builtin_amdgcn_mfma_f32_32x32x2bf16(a, b, c, 0, 0, 0);
}

// CHECK-GFX908-LABEL: @test_mfma_f32_16x16x2bf16
// CHECK-GFX908: call <16 x float> @llvm.amdgcn.mfma.f32.16x16x2bf16(<2 x i16> %a, <2 x i16> %b, <16 x float> %c, i32 0, i32 0, i32 0)
void test_mfma_f32_16x16x2bf16(global v16f* out, v2s a, v2s b, v16f c)
{
  *out = __builtin_amdgcn_mfma_f32_16x16x2bf16(a, b, c, 0, 0, 0);
}

// CHECK-GFX908-LABEL: @test_mfma_f32_4x4x2bf16
// CHECK-GFX908: call <4 x float> @llvm.amdgcn.mfma.f32.4x4x2bf16(<2 x i16> %a, <2 x i16> %b, <4 x float> %c, i32 0, i32 0, i32 0)
void test_mfma_f32_4x4x2bf16(global v4f* out, v2s a, v2s b, v4f c)
{
  *out = __builtin_amdgcn_mfma_f32_4x4x2bf16(a, b, c, 0, 0, 0);
}

// CHECK-GFX908-LABEL: @test_mfma_f32_32x32x4bf16
// CHECK-GFX908: call <16 x float> @llvm.amdgcn.mfma.f32.32x32x4bf16(<2 x i16> %a, <2 x i16> %b, <16 x float> %c, i32 0, i32 0, i32 0)
void test_mfma_f32_32x32x4bf16(global v16f* out, v2s a, v2s b, v16f c)
{
  *out = __builtin_amdgcn_mfma_f32_32x32x4bf16(a, b, c, 0, 0, 0);
}

// CHECK-GFX908-LABEL: @test_mfma_f32_16x16x8bf16
// CHECK-GFX908: call <4 x float> @llvm.amdgcn.mfma.f32.16x16x8bf16(<2 x i16> %a, <2 x i16> %b, <4 x float> %c, i32 0, i32 0, i32 0)
void test_mfma_f32_16x16x8bf16(global v4f* out, v2s a, v2s b, v4f c)
{
  *out = __builtin_amdgcn_mfma_f32_16x16x8bf16(a, b, c, 0, 0, 0);
}

#endif // MFMA_GFX908_TESTS

// Section compiled only by the gfx90a invocation.
#ifdef MFMA_GFX90A_TESTS

// CHECK-GFX90A-LABEL: @test_mfma_f32_32x32x4bf16_1k
// CHECK-GFX90A: call <32 x float> @llvm.amdgcn.mfma.f32.32x32x4bf16.1k(<4 x i16> %a, <4 x i16> %b, <32 x float> %c, i32 0, i32 0, i32 0)
void test_mfma_f32_32x32x4bf16_1k(global v32f* out, v4s a, v4s b, v32f c)
{
  *out = __builtin_amdgcn_mfma_f32_32x32x4bf16_1k(a, b, c, 0, 0, 0);
}

// CHECK-GFX90A-LABEL: @test_mfma_f32_16x16x4bf16_1k
// CHECK-GFX90A: call <16 x float> @llvm.amdgcn.mfma.f32.16x16x4bf16.1k(<4 x i16> %a, <4 x i16> %b, <16 x float> %c, i32 0, i32 0, i32 0)
void test_mfma_f32_16x16x4bf16_1k(global v16f* out, v4s a, v4s b, v16f c)
{
  *out = __builtin_amdgcn_mfma_f32_16x16x4bf16_1k(a, b, c, 0, 0, 0);
}

// CHECK-GFX90A-LABEL: @test_mfma_f32_4x4x4bf16_1k
// CHECK-GFX90A: call <4 x float> @llvm.amdgcn.mfma.f32.4x4x4bf16.1k(<4 x i16> %a, <4 x i16> %b, <4 x float> %c, i32 0, i32 0, i32 0)
void test_mfma_f32_4x4x4bf16_1k(global v4f* out, v4s a, v4s b, v4f c)
{
  *out = __builtin_amdgcn_mfma_f32_4x4x4bf16_1k(a, b, c, 0, 0, 0);
}

// CHECK-GFX90A-LABEL: @test_mfma_f32_32x32x8bf16_1k
// CHECK-GFX90A: call <16 x float> @llvm.amdgcn.mfma.f32.32x32x8bf16.1k(<4 x i16> %a, <4 x i16> %b, <16 x float> %c, i32 0, i32 0, i32 0)
void test_mfma_f32_32x32x8bf16_1k(global v16f* out, v4s a, v4s b, v16f c)
{
  *out = __builtin_amdgcn_mfma_f32_32x32x8bf16_1k(a, b, c, 0, 0, 0);
}

// CHECK-GFX90A-LABEL: @test_mfma_f32_16x16x16bf16_1k
// CHECK-GFX90A: call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16bf16.1k(<4 x i16> %a, <4 x i16> %b, <4 x float> %c, i32 0, i32 0, i32 0)
void test_mfma_f32_16x16x16bf16_1k(global v4f* out, v4s a, v4s b, v4f c)
{
  *out = __builtin_amdgcn_mfma_f32_16x16x16bf16_1k(a, b, c, 0, 0, 0);
}

// CHECK-GFX90A-LABEL: @test_mfma_f64_16x16x4f64
// CHECK-GFX90A: call <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double %a, double %b, <4 x double> %c, i32 0, i32 0, i32 0)
void test_mfma_f64_16x16x4f64(global v4d* out, double a, double b, v4d c)
{
  *out = __builtin_amdgcn_mfma_f64_16x16x4f64(a, b, c, 0, 0, 0);
}

// CHECK-GFX90A-LABEL: @test_mfma_f64_4x4x4f64
// CHECK-GFX90A: call double @llvm.amdgcn.mfma.f64.4x4x4f64(double %a, double %b, double %c, i32 0, i32 0, i32 0)
void test_mfma_f64_4x4x4f64(global double* out, double a, double b, double c)
{
  *out = __builtin_amdgcn_mfma_f64_4x4x4f64(a, b, c, 0, 0, 0);
}

#endif // MFMA_GFX90A_TESTS

// Section compiled by both the gfx940 and the gfx950 invocations; its
// patterns use the gfx940 prefix only.
#if defined(MFMA_GFX940_TESTS) || defined(MFMA_GFX950_TESTS)
// CHECK-GFX940-LABEL: @test_mfma_i32_16x16x32_i8
// CHECK-GFX940: call <4 x i32> @llvm.amdgcn.mfma.i32.16x16x32.i8(i64 %a, i64 %b, <4 x i32> %c, i32 0, i32 0, i32 0)
void test_mfma_i32_16x16x32_i8(global v4i* out, long a, long b, v4i c)
{
  *out = __builtin_amdgcn_mfma_i32_16x16x32_i8(a, b, c, 0, 0, 0);
}

// CHECK-GFX940-LABEL: @test_mfma_i32_32x32x16_i8
// CHECK-GFX940: call <16 x i32> @llvm.amdgcn.mfma.i32.32x32x16.i8(i64 %a, i64 %b, <16 x i32> %c, i32 0, i32 0, i32 0)
void test_mfma_i32_32x32x16_i8(global v16i* out, long a, long b, v16i c)
{
  *out = __builtin_amdgcn_mfma_i32_32x32x16_i8(a, b, c, 0, 0, 0);
}

// CHECK-GFX940-LABEL: @test_mfma_f32_16x16x8_xf32
// CHECK-GFX940: call <4 x float> @llvm.amdgcn.mfma.f32.16x16x8.xf32(<2 x float> %a, <2 x float> %b, <4 x float> %c, i32 0, i32 0, i32 0)
void test_mfma_f32_16x16x8_xf32(global v4f* out, v2f a, v2f b, v4f c)
{
  *out = __builtin_amdgcn_mfma_f32_16x16x8_xf32(a, b, c, 0, 0, 0);
}

// CHECK-GFX940-LABEL: @test_mfma_f32_32x32x4_xf32
// CHECK-GFX940: call <16 x float> @llvm.amdgcn.mfma.f32.32x32x4.xf32(<2 x float> %a, <2 x float> %b, <16 x float> %c, i32 0, i32 0, i32 0)
void test_mfma_f32_32x32x4_xf32(global v16f* out, v2f a, v2f b, v16f c)
{
  *out = __builtin_amdgcn_mfma_f32_32x32x4_xf32(a, b, c, 0, 0, 0);
}

// CHECK-GFX940-LABEL: @test_mfma_f32_16x16x32_bf8_bf8
// CHECK-GFX940: call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.bf8.bf8(i64 %a, i64 %b, <4 x float> %c, i32 0, i32 0, i32 0)
void test_mfma_f32_16x16x32_bf8_bf8(global v4f* out, long a, long b, v4f c)
{
  *out = __builtin_amdgcn_mfma_f32_16x16x32_bf8_bf8(a, b, c, 0, 0, 0);
}

// CHECK-GFX940-LABEL: @test_mfma_f32_16x16x32_bf8_fp8
// CHECK-GFX940: call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.bf8.fp8(i64 %a, i64 %b, <4 x float> %c, i32 0, i32 0, i32 0)
void test_mfma_f32_16x16x32_bf8_fp8(global v4f* out, long a, long b, v4f c)
{
  *out = __builtin_amdgcn_mfma_f32_16x16x32_bf8_fp8(a, b, c, 0, 0, 0);
}

// CHECK-GFX940-LABEL: @test_mfma_f32_16x16x32_fp8_bf8
// CHECK-GFX940: call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.fp8.bf8(i64 %a, i64 %b, <4 x float> %c, i32 0, i32 0, i32 0)
void test_mfma_f32_16x16x32_fp8_bf8(global v4f* out, long a, long b, v4f c)
{
  *out = __builtin_amdgcn_mfma_f32_16x16x32_fp8_bf8(a, b, c, 0, 0, 0);
}

// CHECK-GFX940-LABEL: @test_mfma_f32_16x16x32_fp8_fp8
// CHECK-GFX940: call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.fp8.fp8(i64 %a, i64 %b, <4 x float> %c, i32 0, i32 0, i32 0)
void test_mfma_f32_16x16x32_fp8_fp8(global v4f* out, long a, long b, v4f c)
{
  *out = __builtin_amdgcn_mfma_f32_16x16x32_fp8_fp8(a, b, c, 0, 0, 0);
}

// CHECK-GFX940-LABEL: @test_mfma_f32_32x32x16_bf8_bf8
// CHECK-GFX940: call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.bf8.bf8(i64 %a, i64 %b, <16 x float> %c, i32 0, i32 0, i32 0)
void test_mfma_f32_32x32x16_bf8_bf8(global v16f* out, long a, long b, v16f c)
{
  *out = __builtin_amdgcn_mfma_f32_32x32x16_bf8_bf8(a, b, c, 0, 0, 0);
}

// CHECK-GFX940-LABEL: @test_mfma_f32_32x32x16_bf8_fp8
// CHECK-GFX940: call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.bf8.fp8(i64 %a, i64 %b, <16 x float> %c, i32 0, i32 0, i32 0)
void test_mfma_f32_32x32x16_bf8_fp8(global v16f* out, long a, long b, v16f c)
{
  *out = __builtin_amdgcn_mfma_f32_32x32x16_bf8_fp8(a, b, c, 0, 0, 0);
}

// CHECK-GFX940-LABEL: @test_mfma_f32_32x32x16_fp8_bf8
// CHECK-GFX940: call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.fp8.bf8(i64 %a, i64 %b, <16 x float> %c, i32 0, i32 0, i32 0)
void test_mfma_f32_32x32x16_fp8_bf8(global v16f* out, long a, long b, v16f c)
{
  *out = __builtin_amdgcn_mfma_f32_32x32x16_fp8_bf8(a, b, c, 0, 0, 0);
}

// CHECK-GFX940-LABEL: @test_mfma_f32_32x32x16_fp8_fp8
// CHECK-GFX940: call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.fp8.fp8(i64 %a, i64 %b, <16 x float> %c, i32 0, i32 0, i32 0)
void test_mfma_f32_32x32x16_fp8_fp8(global v16f* out, long a, long b, v16f c)
{
  *out = __builtin_amdgcn_mfma_f32_32x32x16_fp8_fp8(a, b, c, 0, 0, 0);
}

// CHECK-GFX940-LABEL: @test_smfmac_f32_16x16x32_f16
// CHECK-GFX940: call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x32.f16(<4 x half> %a, <8 x half> %b, <4 x float> %c, i32 %idx, i32 0, i32 0)
void test_smfmac_f32_16x16x32_f16(global v4f* out, v4h a, v8h b, v4f c, int idx)
{
  *out = __builtin_amdgcn_smfmac_f32_16x16x32_f16(a, b, c, idx, 0, 0);
}

// CHECK-GFX940-LABEL: @test_smfmac_f32_32x32x16_f16
// CHECK-GFX940: call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x16.f16(<4 x half> %a, <8 x half> %b, <16 x float> %c, i32 %idx, i32 0, i32 0)
void test_smfmac_f32_32x32x16_f16(global v16f* out, v4h a, v8h b, v16f c, int idx)
{
  *out = __builtin_amdgcn_smfmac_f32_32x32x16_f16(a, b, c, idx, 0, 0);
}

// CHECK-GFX940-LABEL: @test_smfmac_f32_16x16x32_bf16
// CHECK-GFX940: call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x32.bf16(<4 x i16> %a, <8 x i16> %b, <4 x float> %c, i32 %idx, i32 0, i32 0)
void test_smfmac_f32_16x16x32_bf16(global v4f* out, v4s a, v8s b, v4f c, int idx)
{
  *out = __builtin_amdgcn_smfmac_f32_16x16x32_bf16(a, b, c, idx, 0, 0);
}

// CHECK-GFX940-LABEL: @test_smfmac_f32_32x32x16_bf16
// CHECK-GFX940: call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x16.bf16(<4 x i16> %a, <8 x i16> %b, <16 x float> %c, i32 %idx, i32 0, i32 0)
void test_smfmac_f32_32x32x16_bf16(global v16f* out, v4s a, v8s b, v16f c, int idx)
{
  *out = __builtin_amdgcn_smfmac_f32_32x32x16_bf16(a, b, c, idx, 0, 0);
}

// CHECK-GFX940-LABEL: @test_smfmac_i32_16x16x64_i8
// CHECK-GFX940: call <4 x i32> @llvm.amdgcn.smfmac.i32.16x16x64.i8(<2 x i32> %a, <4 x i32> %b, <4 x i32> %c, i32 %idx, i32 0, i32 0)
void test_smfmac_i32_16x16x64_i8(global v4i* out, v2i a, v4i b, v4i c, int idx)
{
  *out = __builtin_amdgcn_smfmac_i32_16x16x64_i8(a, b, c, idx, 0, 0);
}

// CHECK-GFX940-LABEL: @test_smfmac_i32_32x32x32_i8
// CHECK-GFX940: call <16 x i32> @llvm.amdgcn.smfmac.i32.32x32x32.i8(<2 x i32> %a, <4 x i32> %b, <16 x i32> %c, i32 %idx, i32 0, i32 0)
void test_smfmac_i32_32x32x32_i8(global v16i* out, v2i a, v4i b, v16i c, int idx)
{
  *out = __builtin_amdgcn_smfmac_i32_32x32x32_i8(a, b, c, idx, 0, 0);
}

// CHECK-GFX940-LABEL: @test_smfmac_f32_16x16x64_bf8_bf8
// CHECK-GFX940: call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.bf8.bf8(<2 x i32> %a, <4 x i32> %b, <4 x float> %c, i32 %idx, i32 0, i32 0)
void test_smfmac_f32_16x16x64_bf8_bf8(global v4f* out, v2i a, v4i b, v4f c, int idx)
{
  *out = __builtin_amdgcn_smfmac_f32_16x16x64_bf8_bf8(a, b, c, idx, 0, 0);
}

// CHECK-GFX940-LABEL: @test_smfmac_f32_16x16x64_bf8_fp8
// CHECK-GFX940: call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.bf8.fp8(<2 x i32> %a, <4 x i32> %b, <4 x float> %c, i32 %idx, i32 0, i32 0)
void test_smfmac_f32_16x16x64_bf8_fp8(global v4f* out, v2i a, v4i b, v4f c, int idx)
{
  *out = __builtin_amdgcn_smfmac_f32_16x16x64_bf8_fp8(a, b, c, idx, 0, 0);
}

// CHECK-GFX940-LABEL: @test_smfmac_f32_16x16x64_fp8_bf8
// CHECK-GFX940: call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.fp8.bf8(<2 x i32> %a, <4 x i32> %b, <4 x float> %c, i32 %idx, i32 0, i32 0)
void test_smfmac_f32_16x16x64_fp8_bf8(global v4f* out, v2i a, v4i b, v4f c, int idx)
{
  *out = __builtin_amdgcn_smfmac_f32_16x16x64_fp8_bf8(a, b, c, idx, 0, 0);
}

// CHECK-GFX940-LABEL: @test_smfmac_f32_16x16x64_fp8_fp8
// CHECK-GFX940: call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.fp8.fp8(<2 x i32> %a, <4 x i32> %b, <4 x float> %c, i32 %idx, i32 0, i32 0)
void test_smfmac_f32_16x16x64_fp8_fp8(global v4f* out, v2i a, v4i b, v4f c, int idx)
{
  *out = __builtin_amdgcn_smfmac_f32_16x16x64_fp8_fp8(a, b, c, idx, 0, 0);
}

// CHECK-GFX940-LABEL: @test_smfmac_f32_32x32x32_bf8_bf8
// CHECK-GFX940: call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.bf8.bf8(<2 x i32> %a, <4 x i32> %b, <16 x float> %c, i32 %idx, i32 0, i32 0)
void test_smfmac_f32_32x32x32_bf8_bf8(global v16f* out, v2i a, v4i b, v16f c, int idx)
{
  *out = __builtin_amdgcn_smfmac_f32_32x32x32_bf8_bf8(a, b, c, idx, 0, 0);
}

// CHECK-GFX940-LABEL: @test_smfmac_f32_32x32x32_bf8_fp8
// CHECK-GFX940: call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.bf8.fp8(<2 x i32> %a, <4 x i32> %b, <16 x float> %c, i32 %idx, i32 0, i32 0)
void test_smfmac_f32_32x32x32_bf8_fp8(global v16f* out, v2i a, v4i b, v16f c, int idx)
{
  *out = __builtin_amdgcn_smfmac_f32_32x32x32_bf8_fp8(a, b, c, idx, 0, 0);
}

// CHECK-GFX940-LABEL: @test_smfmac_f32_32x32x32_fp8_bf8
// CHECK-GFX940: call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.fp8.bf8(<2 x i32> %a, <4 x i32> %b, <16 x float> %c, i32 %idx, i32 0, i32 0)
void test_smfmac_f32_32x32x32_fp8_bf8(global v16f* out, v2i a, v4i b, v16f c, int idx)
{
  *out = __builtin_amdgcn_smfmac_f32_32x32x32_fp8_bf8(a, b, c, idx, 0, 0);
}

// CHECK-GFX940-LABEL: @test_smfmac_f32_32x32x32_fp8_fp8
// CHECK-GFX940: call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.fp8.fp8(<2 x i32> %a, <4 x i32> %b, <16 x float> %c, i32 %idx, i32 0, i32 0)
void test_smfmac_f32_32x32x32_fp8_fp8(global v16f* out, v2i a, v4i b, v16f c, int idx)
{
  *out = __builtin_amdgcn_smfmac_f32_32x32x32_fp8_fp8(a, b, c, idx, 0, 0);
}
#endif // defined(MFMA_GFX940_TESTS) || defined(MFMA_GFX950_TESTS)

// Section compiled only by the gfx950 invocation.
#ifdef MFMA_GFX950_TESTS

// CHECK-GFX950-LABEL: @test_mfma_f32_16x16x32_f16(
// CHECK-GFX950: tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.f16(<8 x half> %a, <8 x half> %b, <4 x float> %c, i32 1, i32 2, i32 3)

v4f test_mfma_f32_16x16x32_f16(v8h a, v8h b, v4f c)
{
  return __builtin_amdgcn_mfma_f32_16x16x32_f16(a, b, c, 1, 2, 3);
}

// CHECK-GFX950-LABEL: @test_mfma_f32_32x32x16_f16
// CHECK-GFX950: tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %a, <8 x half> %b, <16 x float> %c, i32 1, i32 2, i32 3)
v16f test_mfma_f32_32x32x16_f16(v8h a, v8h b, v16f c)
{
  return __builtin_amdgcn_mfma_f32_32x32x16_f16(a, b, c, 1, 2, 3);
}

// CHECK-GFX950-LABEL: @test_mfma_f32_32x32x16_bf16(
// CHECK-GFX950: tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.bf16(<8 x bfloat> %a, <8 x bfloat> %b, <16 x float> %c, i32 1, i32 2, i32 3)
v16f test_mfma_f32_32x32x16_bf16(v8bf16 a, v8bf16 b, v16f c) {
  return __builtin_amdgcn_mfma_f32_32x32x16_bf16(a, b, c, 1, 2, 3);
}

// CHECK-GFX950-LABEL: @test_mfma_scale_f32_16x16x128_f8f6f4
// CHECK-GFX950: [[EXTRACT_A:%.+]] = shufflevector <8 x i32> %a, <8 x i32> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5>
// CHECK-GFX950: call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v8i32(<6 x i32> [[EXTRACT_A]], <8 x i32> %b, <4 x float> %c, i32 3, i32 1, i32 2, i32 %scale_a, i32 3, i32 %scale_b)
void test_mfma_scale_f32_16x16x128_f8f6f4(global v4f* out, v8i a, v8i b, v4f c, int scale_a, int scale_b)
{
  *out = __builtin_amdgcn_mfma_scale_f32_16x16x128_f8f6f4(a, b, c, 3, 1, 2, scale_a, 3, scale_b);
}

// CHECK-GFX950-LABEL: @test_mfma_scale_f32_32x32x64_f8f6f4
// CHECK-GFX950: [[EXTRACT_A:%.+]] = shufflevector <8 x i32> %a, <8 x i32> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5>
// CHECK-GFX950: call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v8i32(<6 x i32> [[EXTRACT_A]], <8 x i32> %b, <16 x float> %c, i32 3, i32 1, i32 2, i32 %scale_a, i32 3, i32 %scale_b)
void test_mfma_scale_f32_32x32x64_f8f6f4(global v16f* out, v8i a, v8i b, v16f c, int scale_a, int scale_b)
{
  *out = __builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4(a, b, c, 3, 1, 2, scale_a, 3, scale_b);
}

// CHECK-GFX950-LABEL: @test_mfma_i32_16x16x64_i8(
// CHECK-GFX950: tail call <4 x i32> @llvm.amdgcn.mfma.i32.16x16x64.i8(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, i32 1, i32 2, i32 3)
v4i test_mfma_i32_16x16x64_i8(v4i a, v4i b, v4i c) {
  return __builtin_amdgcn_mfma_i32_16x16x64_i8(a, b, c, 1, 2, 3);
}

// CHECK-GFX950-LABEL: @test_mfma_i32_32x32x32_i8(
// CHECK-GFX950: tail call <16 x i32> @llvm.amdgcn.mfma.i32.32x32x32.i8(<4 x i32> %a, <4 x i32> %b, <16 x i32> %c, i32 1, i32 2, i32 3)
v16i test_mfma_i32_32x32x32_i8(v4i a, v4i b, v16i c) {
  return __builtin_amdgcn_mfma_i32_32x32x32_i8(a, b, c, 1, 2, 3);
}

// CHECK-GFX950-LABEL: @test_mfma_f32_16x16x32_bf16(
// CHECK-GFX950: tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.bf16(<8 x bfloat> %a, <8 x bfloat> %b, <4 x float> %c, i32 1, i32 2, i32 3)
v4f test_mfma_f32_16x16x32_bf16(v8bf16 a, v8bf16 b, v4f c)
{
  return __builtin_amdgcn_mfma_f32_16x16x32_bf16(a, b, c, 1, 2, 3);
}

// CHECK-GFX950-LABEL: @test_smfmac_f32_16x16x64_f16
// CHECK-GFX950: call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.f16(<8 x half> %a, <16 x half> %b, <4 x float> %c, i32 %idx, i32 0, i32 0)
void test_smfmac_f32_16x16x64_f16(global v4f* out, v8h a, v16h b, v4f c, int idx)
{
  *out = __builtin_amdgcn_smfmac_f32_16x16x64_f16(a, b, c, idx, 0, 0);
}

// CHECK-GFX950-LABEL: @test_smfmac_f32_32x32x32_f16
// CHECK-GFX950: call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.f16(<8 x half> %a, <16 x half> %b, <16 x float> %c, i32 %idx, i32 0, i32 0)
void test_smfmac_f32_32x32x32_f16(global v16f* out, v8h a, v16h b, v16f c, int idx)
{
  *out = __builtin_amdgcn_smfmac_f32_32x32x32_f16(a, b, c, idx, 0, 0);
}

// CHECK-GFX950-LABEL: @test_smfmac_f32_16x16x64_bf16
// CHECK-GFX950: call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.bf16(<8 x bfloat> %a, <16 x bfloat> %b, <4 x float> %c, i32 %idx, i32 0, i32 0)
void test_smfmac_f32_16x16x64_bf16(global v4f* out, v8bf16 a, v16bf16 b, v4f c, int idx)
{
  *out = __builtin_amdgcn_smfmac_f32_16x16x64_bf16(a, b, c, idx, 0, 0);
}

// CHECK-GFX950-LABEL: @test_smfmac_f32_32x32x32_bf16
// CHECK-GFX950: call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.bf16(<8 x bfloat> %a, <16 x bfloat> %b, <16 x float> %c, i32 %idx, i32 0, i32 0)
void test_smfmac_f32_32x32x32_bf16(global v16f* out, v8bf16 a, v16bf16 b, v16f c, int idx)
{
  *out = __builtin_amdgcn_smfmac_f32_32x32x32_bf16(a, b, c, idx, 0, 0);
}

// CHECK-GFX950-LABEL: @test_smfmac_i32_16x16x128_i8
// CHECK-GFX950: call <4 x i32> @llvm.amdgcn.smfmac.i32.16x16x128.i8(<4 x i32> %a, <8 x i32> %b, <4 x i32> %c, i32 %idx, i32 0, i32 0)
void test_smfmac_i32_16x16x128_i8(global v4i* out, v4i a, v8i b, v4i c, int idx)
{
  *out = __builtin_amdgcn_smfmac_i32_16x16x128_i8(a, b, c, idx, 0, 0);
}

// CHECK-GFX950-LABEL: @test_smfmac_i32_32x32x64_i8
// CHECK-GFX950: call <16 x i32> @llvm.amdgcn.smfmac.i32.32x32x64.i8(<4 x i32> %a, <8 x i32> %b, <16 x i32> %c, i32 %idx, i32 0, i32 0)
void test_smfmac_i32_32x32x64_i8(global v16i* out, v4i a, v8i b, v16i c, int idx)
{
  *out = __builtin_amdgcn_smfmac_i32_32x32x64_i8(a, b, c, idx, 0, 0);
}

// CHECK-GFX950-LABEL: @test_smfmac_f32_16x16x128_bf8_bf8
// CHECK-GFX950: call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.bf8.bf8(<4 x i32> %a, <8 x i32> %b, <4 x float> %c, i32 %idx, i32 0, i32 0)
void test_smfmac_f32_16x16x128_bf8_bf8(global v4f* out, v4i a, v8i b, v4f c, int idx)
{
  *out = __builtin_amdgcn_smfmac_f32_16x16x128_bf8_bf8(a, b, c, idx, 0, 0);
}

// CHECK-GFX950-LABEL: @test_smfmac_f32_16x16x128_bf8_fp8
// CHECK-GFX950: call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.bf8.fp8(<4 x i32> %a, <8 x i32> %b, <4 x float> %c, i32 %idx, i32 0, i32 0)
void test_smfmac_f32_16x16x128_bf8_fp8(global v4f* out, v4i a, v8i b, v4f c, int idx)
{
  *out = __builtin_amdgcn_smfmac_f32_16x16x128_bf8_fp8(a, b, c, idx, 0, 0);
}

// CHECK-GFX950-LABEL: @test_smfmac_f32_16x16x128_fp8_bf8
// CHECK-GFX950: call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.fp8.bf8(<4 x i32> %a, <8 x i32> %b, <4 x float> %c, i32 %idx, i32 0, i32 0)
void test_smfmac_f32_16x16x128_fp8_bf8(global v4f* out, v4i a, v8i b, v4f c, int idx)
{
  *out = __builtin_amdgcn_smfmac_f32_16x16x128_fp8_bf8(a, b, c, idx, 0, 0);
}

// CHECK-GFX950-LABEL: @test_smfmac_f32_16x16x128_fp8_fp8
// CHECK-GFX950: call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.fp8.fp8(<4 x i32> %a, <8 x i32> %b, <4 x float> %c, i32 %idx, i32 0, i32 0)
void test_smfmac_f32_16x16x128_fp8_fp8(global v4f* out, v4i a, v8i b, v4f c, int idx)
{
  *out = __builtin_amdgcn_smfmac_f32_16x16x128_fp8_fp8(a, b, c, idx, 0, 0);
}

// CHECK-GFX950-LABEL: @test_smfmac_f32_32x32x64_bf8_bf8
// CHECK-GFX950: call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.bf8.bf8(<4 x i32> %a, <8 x i32> %b, <16 x float> %c, i32 %idx, i32 0, i32 0)
void test_smfmac_f32_32x32x64_bf8_bf8(global v16f* out, v4i a, v8i b, v16f c, int idx)
{
  *out = __builtin_amdgcn_smfmac_f32_32x32x64_bf8_bf8(a, b, c, idx, 0, 0);
}

// CHECK-GFX950-LABEL: @test_smfmac_f32_32x32x64_bf8_fp8
// CHECK-GFX950: call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.bf8.fp8(<4 x i32> %a, <8 x i32> %b, <16 x float> %c, i32 %idx, i32 0, i32 0)
void test_smfmac_f32_32x32x64_bf8_fp8(global v16f* out, v4i a, v8i b, v16f c, int idx)
{
  *out = __builtin_amdgcn_smfmac_f32_32x32x64_bf8_fp8(a, b, c, idx, 0, 0);
}

// CHECK-GFX950-LABEL: @test_smfmac_f32_32x32x64_fp8_bf8
// CHECK-GFX950: call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.fp8.bf8(<4 x i32> %a, <8 x i32> %b, <16 x float> %c, i32 %idx, i32 0, i32 0)
void test_smfmac_f32_32x32x64_fp8_bf8(global v16f* out, v4i a, v8i b, v16f c, int idx)
{
  *out = __builtin_amdgcn_smfmac_f32_32x32x64_fp8_bf8(a, b, c, idx, 0, 0);
}

// CHECK-GFX950-LABEL: @test_smfmac_f32_32x32x64_fp8_fp8
// CHECK-GFX950: call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.fp8.fp8(<4 x i32> %a, <8 x i32> %b, <16 x float> %c, i32 %idx, i32 0, i32 0)
void test_smfmac_f32_32x32x64_fp8_fp8(global v16f* out, v4i a, v8i b, v16f c, int idx)
{
  *out = __builtin_amdgcn_smfmac_f32_32x32x64_fp8_fp8(a, b, c, idx, 0, 0);
}

#endif // MFMA_GFX950_TESTS