// RUN: mlir-opt -split-input-file -verify-diagnostics %s

func.func @ldmatrix_address_space_f16_x4(%arg0: memref<128x128xf16, 2>) -> vector<4x1xf16> {
  %c0 = arith.constant 0 : index
  // expected-error @below {{expected nvgpu.ldmatrix srcMemref must have a memory space attribute of IntegerAttr(3) or gpu::AddressSpaceAttr(Workgroup)}}
  %a = nvgpu.ldmatrix %arg0[%c0, %c0] {transpose = false, numTiles = 4 : i32} : memref<128x128xf16, 2> -> vector<4x1xf16>
  return %a : vector<4x1xf16>
}
// -----

func.func @ldmatrix_num_elements_f16_x4(%arg0: memref<128x128xf16, 3>) -> vector<4x1xf16> {
  %c0 = arith.constant 0 : index
  // expected-error @+1 {{expected vector register shape[1] = 2}}
  %a = nvgpu.ldmatrix %arg0[%c0, %c0] {transpose = false, numTiles = 4 : i32} : memref<128x128xf16, 3> -> vector<4x1xf16>
  return %a : vector<4x1xf16>
}
// -----

func.func @ldmatrix_num_tiles_f16_x4(%arg0: memref<128x128xf16, 3>) -> vector<2x2xf16> {
  %c0 = arith.constant 0 : index
  // expected-error @+1 {{expected vector register shape[0] and numTiles to match}}
  %a = nvgpu.ldmatrix %arg0[%c0, %c0] {transpose = false, numTiles = 4 : i32} : memref<128x128xf16, 3> -> vector<2x2xf16>
  return %a : vector<2x2xf16>
}
// -----

func.func @ldmatrix_num_tiles_f32_x4(%arg0: memref<128x128xf32, 3>) -> vector<4x2xf32> {
  %c0 = arith.constant 0 : index
  // expected-error @+1 {{expected vector register shape[1] = 1}}
  %a = nvgpu.ldmatrix %arg0[%c0, %c0] {transpose = false, numTiles = 4 : i32} : memref<128x128xf32, 3> -> vector<4x2xf32>
  return %a : vector<4x2xf32>
}
// -----

func.func @ldmatrix_trans_f32_x4(%arg0: memref<128x128xf32, 3>) -> vector<4x1xf32> {
  %c0 = arith.constant 0 : index
  // expected-error @+1 {{nvgpu.ldmatrix transpose works only at 16b granularity}}
  %a = nvgpu.ldmatrix %arg0[%c0, %c0] {transpose = true, numTiles = 4 : i32} : memref<128x128xf32, 3> -> vector<4x1xf32>
  return %a : vector<4x1xf32>
}
// -----

func.func @ldmatrix_result_rank_f32_x4(%arg0: memref<128x128xf32, 3>) -> vector<4xf32> {
  %c0 = arith.constant 0 : index
  // expected-error @+1 {{results must be 2 dimensional vector}}
  %a = nvgpu.ldmatrix %arg0[%c0, %c0] {transpose = false, numTiles = 4 : i32} : memref<128x128xf32, 3> -> vector<4xf32>
  return %a : vector<4xf32>
}
// -----

func.func @ldmatrix_type_x4(%arg0: memref<128x128xf32, 3>) -> vector<4x2xf16> {
  %c0 = arith.constant 0 : index
  // expected-error @+1 {{'nvgpu.ldmatrix' op failed to verify that srcMemref and res have same element type}}
  %a = nvgpu.ldmatrix %arg0[%c0, %c0] {transpose = false, numTiles = 4 : i32} : memref<128x128xf32, 3> -> vector<4x2xf16>
  return %a : vector<4x2xf16>
}
// -----

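// Illustrative, not from the original suite: a hedged sketch of a
// well-formed ldmatrix, assuming the rules exercised above. The source is
// in workgroup memory (space 3), the result is 2-D, shape[0] equals
// numTiles, and shape[1] is 2 for 16-bit elements.
func.func @ldmatrix_valid_f16_x4(%arg0: memref<128x128xf16, 3>) -> vector<4x2xf16> {
  %c0 = arith.constant 0 : index
  %a = nvgpu.ldmatrix %arg0[%c0, %c0] {transpose = false, numTiles = 4 : i32} : memref<128x128xf16, 3> -> vector<4x2xf16>
  return %a : vector<4x2xf16>
}
// -----
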
func.func @m16n8k16_fp16_vector_shape_a(%arg0: vector<4x4xf16>, %arg1: vector<2x2xf16>, %arg2: vector<2x2xf16>) -> vector<2x2xf16> {
  // expected-error @+1 {{expected 256 warp-wide matrix A elements}}
  %d = nvgpu.mma.sync (%arg0, %arg1, %arg2) {mmaShape = [16, 8, 16]} : (vector<4x4xf16>, vector<2x2xf16>, vector<2x2xf16>) -> vector<2x2xf16>
  return %d : vector<2x2xf16>
}
// -----

func.func @m16n8k16_fp16_vector_shape_b(%arg0: vector<4x2xf16>, %arg1: vector<2x4xf16>, %arg2: vector<2x2xf16>) -> vector<2x2xf16> {
  // expected-error @+1 {{expected 128 warp-wide matrix B elements}}
  %d = nvgpu.mma.sync (%arg0, %arg1, %arg2) {mmaShape = [16, 8, 16]} : (vector<4x2xf16>, vector<2x4xf16>, vector<2x2xf16>) -> vector<2x2xf16>
  return %d : vector<2x2xf16>
}
// -----

func.func @m16n8k16_fp16_vector_shape_c(%arg0: vector<4x2xf16>, %arg1: vector<2x2xf16>, %arg2: vector<2x4xf16>) -> vector<2x4xf16> {
  // expected-error @+1 {{expected 128 warp-wide matrix C elements}}
  %d = nvgpu.mma.sync (%arg0, %arg1, %arg2) {mmaShape = [16, 8, 16]} : (vector<4x2xf16>, vector<2x2xf16>, vector<2x4xf16>) -> vector<2x4xf16>
  return %d : vector<2x4xf16>
}
// -----

func.func @m16n8k16_fp16_vector_shape_a_extended(%arg0: vector<2x4xf16>, %arg1: vector<2x2xf16>, %arg2: vector<2x2xf16>) -> vector<2x2xf16> {
  // expected-error @+1 {{expected matrix A to be shaped (4 x 2)}}
  %d = nvgpu.mma.sync (%arg0, %arg1, %arg2) {mmaShape = [16, 8, 16]} : (vector<2x4xf16>, vector<2x2xf16>, vector<2x2xf16>) -> vector<2x2xf16>
  return %d : vector<2x2xf16>
}
// -----

func.func @m16n8k16_fp16_tf32Enabled(%arg0: vector<4x2xf16>, %arg1: vector<2x2xf16>, %arg2: vector<2x2xf16>) -> vector<2x2xf16> {
  // expected-error @+1 {{expected tf32 tensor cores only for F32 operands}}
  %d = nvgpu.mma.sync (%arg0, %arg1, %arg2) {mmaShape = [16, 8, 16], tf32Enabled} : (vector<4x2xf16>, vector<2x2xf16>, vector<2x2xf16>) -> vector<2x2xf16>
  return %d : vector<2x2xf16>
}
// -----

func.func @m16n8k8_fp32_vector_shape_a(%arg0: vector<4x2xf32>, %arg1: vector<2x1xf32>, %arg2: vector<2x2xf32>) -> vector<2x2xf32> {
  // expected-error @+1 {{expected 128 warp-wide matrix A elements}}
  %d = nvgpu.mma.sync (%arg0, %arg1, %arg2) {mmaShape = [16, 8, 8]} : (vector<4x2xf32>, vector<2x1xf32>, vector<2x2xf32>) -> vector<2x2xf32>
  return %d : vector<2x2xf32>
}
// -----

func.func @m16n8k8_fp32_vector_shape_a_extended(%arg0: vector<1x4xf32>, %arg1: vector<2x1xf32>, %arg2: vector<2x2xf32>) -> vector<2x2xf32> {
  // expected-error @+1 {{expected matrix A to be shaped (4 x 1)}}
  %d = nvgpu.mma.sync (%arg0, %arg1, %arg2) {mmaShape = [16, 8, 8]} : (vector<1x4xf32>, vector<2x1xf32>, vector<2x2xf32>) -> vector<2x2xf32>
  return %d : vector<2x2xf32>
}
// -----

func.func @m8n8k4_fp64_vector_shape_a(%arg0: vector<1x2xf64>, %arg1: vector<1x1xf64>, %arg2: vector<1x2xf64>) -> vector<1x2xf64> {
  // expected-error @+1 {{expected 32 warp-wide matrix A elements}}
  %d = nvgpu.mma.sync (%arg0, %arg1, %arg2) {mmaShape = [8, 8, 4]} : (vector<1x2xf64>, vector<1x1xf64>, vector<1x2xf64>) -> vector<1x2xf64>
  return %d : vector<1x2xf64>
}
// -----

func.func @m8n8k4_fp64_vector_shape_c_extended(%arg0: vector<1x1xf64>, %arg1: vector<1x1xf64>, %arg2: vector<2x1xf64>) -> vector<2x1xf64> {
  // expected-error @+1 {{expected matrix C to be shaped (1 x 2)}}
  %d = nvgpu.mma.sync (%arg0, %arg1, %arg2) {mmaShape = [8, 8, 4]} : (vector<1x1xf64>, vector<1x1xf64>, vector<2x1xf64>) -> vector<2x1xf64>
  return %d : vector<2x1xf64>
}
// -----

func.func @m16n8k32_int8_vector_shape_b(%arg0: vector<4x4xi8>, %arg1: vector<4x4xi8>, %arg2: vector<2x2xi32>) -> vector<2x2xi32> {
  // expected-error @+1 {{expected 256 warp-wide matrix B elements}}
  %d = nvgpu.mma.sync (%arg0, %arg1, %arg2) {mmaShape = [16, 8, 32]} : (vector<4x4xi8>, vector<4x4xi8>, vector<2x2xi32>) -> vector<2x2xi32>
  return %d : vector<2x2xi32>
}
// -----

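// Illustrative, not from the original suite: a hedged valid m16n8k16 f16
// configuration consistent with the warp-wide element counts checked above
// (per-thread fragments A = 4x2, B = 2x2, C = 2x2; 32 threads per warp).
func.func @m16n8k16_fp16_valid(%arg0: vector<4x2xf16>, %arg1: vector<2x2xf16>, %arg2: vector<2x2xf16>) -> vector<2x2xf16> {
  %d = nvgpu.mma.sync (%arg0, %arg1, %arg2) {mmaShape = [16, 8, 16]} : (vector<4x2xf16>, vector<2x2xf16>, vector<2x2xf16>) -> vector<2x2xf16>
  return %d : vector<2x2xf16>
}
// -----
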
func.func @m16n8k32_int32_datatype(%arg0: vector<4x4xi32>, %arg1: vector<2x4xi8>, %arg2: vector<2x2xi32>) -> vector<2x2xi32> {
  // expected-error @+1 {{op failed to verify that matrixA and matrixB have same element type}}
  %d = nvgpu.mma.sync (%arg0, %arg1, %arg2) {mmaShape = [16, 8, 32]} : (vector<4x4xi32>, vector<2x4xi8>, vector<2x2xi32>) -> vector<2x2xi32>
  return %d : vector<2x2xi32>
}
// -----

func.func @async_cp_memory_space(%dst : memref<16xf32>, %src : memref<16xf32>, %i : index) -> () {
  // expected-error @below {{destination memref must have a memory space attribute of IntegerAttr(3) or gpu::AddressSpaceAttr(Workgroup)}}
  nvgpu.device_async_copy %src[%i], %dst[%i], 16 : memref<16xf32> to memref<16xf32>
  return
}
// -----

func.func @async_cp_memref_type(%dst : memref<16xi32, 3>, %src : memref<16xf32>, %i : index) -> () {
  // expected-error @+1 {{source and destination must have the same element type}}
  nvgpu.device_async_copy %src[%i], %dst[%i], 16 : memref<16xf32> to memref<16xi32, 3>
  return
}
// -----

func.func @async_cp_num_src_indices(%dst : memref<16xf32, 3>, %src : memref<16x16xf32>, %i : index) -> () {
  // expected-error @+1 {{expected 2 source indices, got 1}}
  nvgpu.device_async_copy %src[%i], %dst[%i], 16 : memref<16x16xf32> to memref<16xf32, 3>
  return
}
// -----

func.func @async_cp_num_dst_indices(%dst : memref<16x16xf32, 3>, %src : memref<16xf32>, %i : index) -> () {
  // expected-error @+1 {{expected 2 destination indices, got 1}}
  nvgpu.device_async_copy %src[%i], %dst[%i], 16 : memref<16xf32> to memref<16x16xf32, 3>
  return
}
// -----

func.func @async_cp_num_src_stride(
  %dst : memref<200x100xf32, 3>,
  %src : memref<200x100xf32, affine_map<(d0, d1) -> (200*d0 + 2*d1)>>,
  %i : index) -> () {
  // expected-error @+1 {{source memref most minor dim must have unit stride}}
  nvgpu.device_async_copy %src[%i, %i], %dst[%i, %i], 16 :
    memref<200x100xf32, affine_map<(d0, d1) -> (200*d0 + 2*d1)>> to memref<200x100xf32, 3>
  return
}
// -----

func.func @async_cp_num_dst_stride(
  %dst : memref<200x100xf32, affine_map<(d0, d1) -> (200*d0 + 2*d1)>, 3>,
  %src : memref<200x100xf32>,
  %i : index) -> () {
  // expected-error @+1 {{destination memref most minor dim must have unit stride}}
  nvgpu.device_async_copy %src[%i, %i], %dst[%i, %i], 16 :
    memref<200x100xf32> to memref<200x100xf32, affine_map<(d0, d1) -> (200*d0 + 2*d1)>, 3>
  return
}
// -----

// 42 is never the answer!
func.func @mma_sp_sync_f16_16816(%arg0: vector<2x2xf16>,
                                 %arg1: vector<2x2xf16>,
                                 %arg2: vector<2x2xf16>,
                                 %arg3: vector<2xi16>) -> vector<2x2xf16> {
  // expected-error @+1 {{'nvgpu.mma.sp.sync' op sparsity selector should be 0 or 1}}
  %d = nvgpu.mma.sp.sync(%arg0, %arg1, %arg2) metadata(%arg3) {mmaShape = [16, 8, 16], sparsitySelector = 42 : i32} :
    (vector<2x2xf16>, vector<2x2xf16>, vector<2x2xf16>) -> vector<2x2xf16>
  return %d : vector<2x2xf16>
}

// -----

func.func @async_cp_zfill_f32_align1(
  %src: memref<128x128xf32>, %dst: memref<3x16x128xf32, 3>, %i : index, %srcElements : index) {
  // expected-error @+1 {{'nvgpu.device_async_copy' op bypassL1 does not satify alignment for 'memref<3x16x128xf32, 3>' with destination element 1. Unset bypassL1, or set destination element to 4}}
  %0 = nvgpu.device_async_copy %src[%i, %i], %dst[%i, %i, %i], 1, %srcElements {bypassL1} : memref<128x128xf32> to memref<3x16x128xf32, 3>
  return
}

// -----

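// Illustrative, not from the original suite: a hedged valid copy, assuming
// the rules above. With 32-bit elements, a copy size of 4 (16 bytes)
// satisfies the bypassL1 alignment requirement.
func.func @async_cp_valid(%src: memref<128x128xf32>, %dst: memref<3x16x128xf32, 3>, %i : index) {
  %0 = nvgpu.device_async_copy %src[%i, %i], %dst[%i, %i, %i], 4 {bypassL1} : memref<128x128xf32> to memref<3x16x128xf32, 3>
  return
}

// -----
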
func.func @async_cp_size_invalid_f32(
  %src: memref<128x128xf32>, %dst: memref<3x16x128xf32, 3>, %i : index) {
  // expected-error @+1 {{Requested copy elements is 3 with width 32. But copy elements could be one of 1, 2, 4.}}
  %0 = nvgpu.device_async_copy %src[%i, %i], %dst[%i, %i, %i], 3: memref<128x128xf32> to memref<3x16x128xf32, 3>
  return
}

// -----

func.func @async_cp_size_invalid_f16(
  %src: memref<128x128xf16>, %dst: memref<3x16x128xf16, 3>, %i : index) {
  // expected-error @+1 {{Requested copy elements is 3 with width 16. But copy elements could be one of 2, 4, 8.}}
  %0 = nvgpu.device_async_copy %src[%i, %i], %dst[%i, %i, %i], 3: memref<128x128xf16> to memref<3x16x128xf16, 3>
  return
}

// -----

func.func @async_cp_size_invalid_f64(
  %src: memref<128x128xf64>, %dst: memref<3x16x128xf64, 3>, %i : index) {
  // expected-error @+1 {{Requested copy elements is 3 with width 64. But copy elements could be one of 1, 2.}}
  %0 = nvgpu.device_async_copy %src[%i, %i], %dst[%i, %i, %i], 3: memref<128x128xf64> to memref<3x16x128xf64, 3>
  return
}

// -----

!tResult = !nvgpu.warpgroup.accumulator<fragmented = vector<128x128xf32>>
!tDescA = !nvgpu.warpgroup.descriptor<tensor = memref<128x64xf16, 3>>
!tDescB = !nvgpu.warpgroup.descriptor<tensor = memref<64x121xf16, 3>>

func.func @warpgroup_mma_wrong_input(%descA: !tDescA, %descB: !tDescB, %acc: !tResult) {
  // expected-error @+1 {{'nvgpu.warpgroup.mma' op 2nd dim matrix-B ( 121 ) != 2nd dim matrix-C ( 128 )}}
  %0 = nvgpu.warpgroup.mma %descA, %descB, %acc: !tDescA, !tDescB, !tResult -> !tResult
  return
}

// -----

!tResult = !nvgpu.warpgroup.accumulator<fragmented = vector<128xf32>>
!tDescA = !nvgpu.warpgroup.descriptor<tensor = memref<128x64xf16, 3>>
!tDescB = !nvgpu.warpgroup.descriptor<tensor = memref<64x128xf16, 3>>
func.func @warpgroup_mma_wrong_input(%descA: !tDescA, %descB: !tDescB, %acc: !tResult) {
  // expected-error @+1 {{'nvgpu.warpgroup.mma' op has matrices A, B, C and D, they must be 2 dimensional}}
  %0 = nvgpu.warpgroup.mma %descA, %descB, %acc: !tDescA, !tDescB, !tResult -> !tResult
  return
}

// -----

!tResult = !nvgpu.warpgroup.accumulator<fragmented = vector<128x128xf32>>
!tDescA = !nvgpu.warpgroup.descriptor<tensor = memref<128x64xf16, 3>>
!tDescB = !nvgpu.warpgroup.descriptor<tensor = memref<64x128xf32, 3>>
func.func @warpgroup_mma_wrong_input(%descA: !tDescA, %descB: !tDescB, %acc: !tResult) {
  // expected-error @+1 {{'nvgpu.warpgroup.mma' op 'f32' += 'f16' * 'f32', it is not supported.}}
  %0 = nvgpu.warpgroup.mma %descA, %descB, %acc: !tDescA, !tDescB, !tResult -> !tResult
  return
}

// -----

!tResult = !nvgpu.warpgroup.accumulator<fragmented = vector<128x128xf32>>
!tDescA = !nvgpu.warpgroup.descriptor<tensor = memref<128x64xf16, 3>>
!tDescB = !nvgpu.warpgroup.descriptor<tensor = memref<64x512xf16, 3>>
func.func @warpgroup_mma_wrong_input(%descA: !tDescA, %descB: !tDescB, %acc: !tResult) {
  // expected-error @+1 {{'nvgpu.warpgroup.mma' op 2nd dim matrix-B ( 512 ) != 2nd dim matrix-C ( 128 )}}
  %0 = nvgpu.warpgroup.mma %descA, %descB, %acc: !tDescA, !tDescB, !tResult -> !tResult
  return
}

// -----

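// Illustrative, not from the original suite: a hedged well-formed
// configuration. All operands are 2-D, f32 += f16 * f16 is a supported
// combination, and the 2nd dim of B (128) matches the 2nd dim of C (128).
!tResultOk = !nvgpu.warpgroup.accumulator<fragmented = vector<128x128xf32>>
!tDescAOk = !nvgpu.warpgroup.descriptor<tensor = memref<128x64xf16, 3>>
!tDescBOk = !nvgpu.warpgroup.descriptor<tensor = memref<64x128xf16, 3>>
func.func @warpgroup_mma_valid(%descA: !tDescAOk, %descB: !tDescBOk, %acc: !tResultOk) {
  %0 = nvgpu.warpgroup.mma %descA, %descB, %acc: !tDescAOk, !tDescBOk, !tResultOk -> !tResultOk
  return
}

// -----
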
!desc = !nvgpu.tensormap.descriptor<tensor = memref<32x32xf32,3>, swizzle=swizzle_32b, l2promo = none, oob = zero, interleave = none>
!mbarrier = !nvgpu.mbarrier.group<memorySpace = #gpu.address_space<workgroup>>
func.func @tma_load_1(%desc: !desc, %buffer1: memref<128xf32,3>, %buffer2: memref<32x32xf32,3>, %buffer3: memref<32x32xf32>, %mbarrier: !mbarrier) {
  %c0 = arith.constant 0 : index
  // This load is well-formed and passes verification.
  nvgpu.tma.async.load %desc[%c0, %c0], %mbarrier[%c0] to %buffer2 : !desc, !mbarrier -> memref<32x32xf32,3>
  // expected-error @+1 {{Maximum 5 coordinates are supported.}}
  nvgpu.tma.async.load %desc[%c0, %c0, %c0, %c0, %c0, %c0], %mbarrier[%c0] to %buffer2 : !desc, !mbarrier -> memref<32x32xf32,3>
  return
}
// -----

!desc = !nvgpu.tensormap.descriptor<tensor = memref<32x32xf32>, swizzle=swizzle_32b, l2promo = none, oob = zero, interleave = none>
!mbarrier = !nvgpu.mbarrier.group<memorySpace = #gpu.address_space<workgroup>>
func.func @tma_load_2(%desc: !desc, %buffer1: memref<128xf32,3>, %buffer2: memref<32x32xf32,3>, %buffer3: memref<32x32xf32>, %mbarrier: !mbarrier) {
  %c0 = arith.constant 0 : index
  // expected-error @+1 {{the tensor map descriptor has incorrect address space, it must be shared memory address space.}}
  nvgpu.tma.async.load %desc[%c0, %c0], %mbarrier[%c0] to %buffer2 : !desc, !mbarrier -> memref<32x32xf32,3>
  return
}
// -----

!desc = !nvgpu.tensormap.descriptor<tensor = memref<32x32xf32,3>, swizzle=swizzle_32b, l2promo = none, oob = zero, interleave = none>
!mbarrier = !nvgpu.mbarrier.group<memorySpace = #gpu.address_space<workgroup>>
func.func @tma_load_3(%desc: !desc, %buffer1: memref<128xf32,3>, %buffer2: memref<32x32xf32,3>, %buffer3: memref<32x32xf32>, %mbarrier: !mbarrier) {
  %c0 = arith.constant 0 : index
  // expected-error @+1 {{the destination memref has incorrect address space, it must be shared memory address space}}
  nvgpu.tma.async.load %desc[%c0, %c0], %mbarrier[%c0] to %buffer3 : !desc, !mbarrier -> memref<32x32xf32>
  return
}
// -----

!desc = !nvgpu.tensormap.descriptor<tensor = memref<32x32xf32,3>, swizzle=swizzle_32b, l2promo = none, oob = zero, interleave = none>
!mbarrier = !nvgpu.mbarrier.group<memorySpace = #gpu.address_space<workgroup>>
func.func @tma_load_4(%desc: !desc, %buffer1: memref<128xf32,3>, %buffer2: memref<32x32xf32,3>, %buffer3: memref<32x32xf32>, %mbarrier: !mbarrier) {
  %c0 = arith.constant 0 : index
  // expected-error @+1 {{the shape of tensor map descriptor and memref must have same rank}}
  nvgpu.tma.async.load %desc[%c0, %c0], %mbarrier[%c0] to %buffer1 : !desc, !mbarrier -> memref<128xf32,3>
  return
}

// -----

!desc = !nvgpu.tensormap.descriptor<tensor = memref<64x128xf16,3>, swizzle=swizzle_32b, l2promo = none, oob = zero, interleave = none>
func.func @tma_generate_descriptor_incorrect_last_dim(%b0 : index, %b1 : index, %mem : memref<*xf16>) {
  // expected-error @+1 {{the tensormap descriptor must have last dimension of 128 bytes but it is 256 bytes}}
  %descA = nvgpu.tma.create.descriptor %mem box[%b0, %b1] : memref<*xf16> -> !desc
  return
}
// -----

!desc = !nvgpu.tensormap.descriptor<tensor = memref<64x128xf32,3>, swizzle=swizzle_32b, l2promo = none, oob = zero, interleave = none>
!mbarrier = !nvgpu.mbarrier.group<memorySpace = #gpu.address_space<workgroup>>
func.func @tma_load_incorrect_last_dim(%desc: !desc, %buffer2: memref<64x128xf32,3>, %mbarrier: !mbarrier) {
  %c0 = arith.constant 0 : index
  // expected-error @+1 {{the tensormap descriptor must have last dimension of 128 bytes but it is 512 bytes}}
  nvgpu.tma.async.load %desc[%c0, %c0], %mbarrier[%c0] to %buffer2 : !desc, !mbarrier -> memref<64x128xf32,3>
  return
}
// -----

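// Illustrative, not from the original suite: a hedged valid descriptor,
// assuming the 128-byte last-dimension rule checked above (64 f16 elements
// = 128 bytes).
!desc_ok = !nvgpu.tensormap.descriptor<tensor = memref<64x64xf16,3>, swizzle=swizzle_32b, l2promo = none, oob = zero, interleave = none>
func.func @tma_create_descriptor_valid(%b0 : index, %b1 : index, %mem : memref<*xf16>) {
  %descA = nvgpu.tma.create.descriptor %mem box[%b0, %b1] : memref<*xf16> -> !desc_ok
  return
}
// -----
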
func.func @rcp_unsupported_rounding_0(%in : vector<16xf32>) {
  // expected-error @+1 {{'nvgpu.rcp' op has a limitation. #nvgpu<rcp_rounding_mode rn> or non-ftz is not supported yet.}}
  %out = nvgpu.rcp %in {rounding = rn, ftz} : vector<16xf32>
  return
}
// -----

func.func @rcp_unsupported_rounding_1(%in : vector<16xf32>) {
  // expected-error @+1 {{'nvgpu.rcp' op has a limitation. #nvgpu<rcp_rounding_mode rz> or non-ftz is not supported yet.}}
  %out = nvgpu.rcp %in {rounding = rz} : vector<16xf32>
  return
}
// -----

func.func @rcp_unsupported_ftz(%in : vector<16xf32>) {
  // expected-error @+1 {{'nvgpu.rcp' op has a limitation. #nvgpu<rcp_rounding_mode approx> or non-ftz is not supported yet.}}
  %out = nvgpu.rcp %in {rounding = approx} : vector<16xf32>
  return
}

// -----

func.func @check_matrixA_dim(%arg0: vector<16xf16>, %arg1: vector<2x2xf16>, %arg2: vector<2x2xf16>) -> vector<2x2xf16> {
  // expected-error @+1 {{matrixA must be 2 dimensional vector}}
  %d = nvgpu.mma.sync (%arg0, %arg1, %arg2) {mmaShape = [16, 8, 16]} : (vector<16xf16>, vector<2x2xf16>, vector<2x2xf16>) -> vector<2x2xf16>
  return %d : vector<2x2xf16>
}

// -----

func.func @check_matrixB_dim(%arg0: vector<4x4xf16>, %arg1: vector<4xf16>, %arg2: vector<2x2xf16>) -> vector<2x2xf16> {
  // expected-error @+1 {{matrixB must be 2 dimensional vector}}
  %d = nvgpu.mma.sync (%arg0, %arg1, %arg2) {mmaShape = [16, 8, 16]} : (vector<4x4xf16>, vector<4xf16>, vector<2x2xf16>) -> vector<2x2xf16>
  return %d : vector<2x2xf16>
}

// -----

func.func @check_matrixC_dim(%arg0: vector<4x4xf16>, %arg1: vector<2x2xf16>, %arg2: vector<4xf16>) -> vector<2x2xf16> {
  // expected-error @+1 {{matrixC must be 2 dimensional vector}}
  %d = nvgpu.mma.sync (%arg0, %arg1, %arg2) {mmaShape = [16, 8, 16]} : (vector<4x4xf16>, vector<2x2xf16>, vector<4xf16>) -> vector<2x2xf16>
  return %d : vector<2x2xf16>
}
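// -----

// Illustrative, not from the original suite: a hedged sketch of the one
// combination the rcp tests earlier in this file imply is supported,
// approx rounding with ftz.
func.func @rcp_approx_ftz_valid(%in : vector<16xf32>) {
  %out = nvgpu.rcp %in {rounding = approx, ftz} : vector<16xf32>
  return
}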