// RUN: mlir-opt -split-input-file -verify-diagnostics %s

func.func @ldmatrix_address_space_f16_x4(%arg0: memref<128x128xf16, 2>) -> vector<4x1xf16> {
  %c0 = arith.constant 0 : index
  // expected-error @below {{expected nvgpu.ldmatrix srcMemref must have a memory space attribute of IntegerAttr(3) or gpu::AddressSpaceAttr(Workgroup)}}
  %a = nvgpu.ldmatrix %arg0[%c0, %c0] {transpose = false, numTiles = 4 : i32} : memref<128x128xf16, 2> -> vector<4x1xf16>
  return %a : vector<4x1xf16>
}
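// For contrast, a minimal sketch of the valid form, inferred from the
// verifier messages in this file: workgroup memory (space 3), and for f16
// with numTiles = 4 the result registers must be vector<4x2xf16>. This
// function is illustrative, not an upstream test case.
func.func @ldmatrix_valid_f16_x4(%arg0: memref<128x128xf16, 3>) -> vector<4x2xf16> {
  %c0 = arith.constant 0 : index
  %a = nvgpu.ldmatrix %arg0[%c0, %c0] {transpose = false, numTiles = 4 : i32} : memref<128x128xf16, 3> -> vector<4x2xf16>
  return %a : vector<4x2xf16>
}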
// -----

func.func @ldmatrix_num_elements_f16_x4(%arg0: memref<128x128xf16, 3>) -> vector<4x1xf16> {
  %c0 = arith.constant 0 : index
  // expected-error @+1 {{expected vector register shape[1] = 2}}
  %a = nvgpu.ldmatrix %arg0[%c0, %c0] {transpose = false, numTiles = 4 : i32} : memref<128x128xf16, 3> -> vector<4x1xf16>
  return %a : vector<4x1xf16>
}
// -----

func.func @ldmatrix_num_tiles_f16_x4(%arg0: memref<128x128xf16, 3>) -> vector<2x2xf16> {
  %c0 = arith.constant 0 : index
  // expected-error @+1 {{expected vector register shape[0] and numTiles to match}}
  %a = nvgpu.ldmatrix %arg0[%c0, %c0] {transpose = false, numTiles = 4 : i32} : memref<128x128xf16, 3> -> vector<2x2xf16>
  return %a : vector<2x2xf16>
}
// -----

func.func @ldmatrix_num_tiles_f32_x4(%arg0: memref<128x128xf32, 3>) -> vector<4x2xf32> {
  %c0 = arith.constant 0 : index
  // expected-error @+1 {{expected vector register shape[1] = 1}}
  %a = nvgpu.ldmatrix %arg0[%c0, %c0] {transpose = false, numTiles = 4 : i32} : memref<128x128xf32, 3> -> vector<4x2xf32>
  return %a : vector<4x2xf32>
}
// -----

func.func @ldmatrix_trans_f32_x4(%arg0: memref<128x128xf32, 3>) -> vector<4x1xf32> {
  %c0 = arith.constant 0 : index
  // expected-error @+1 {{nvgpu.ldmatrix transpose works only at 16b granularity}}
  %a = nvgpu.ldmatrix %arg0[%c0, %c0] {transpose = true, numTiles = 4 : i32} : memref<128x128xf32, 3> -> vector<4x1xf32>
  return %a : vector<4x1xf32>
}
// -----

func.func @ldmatrix_result_rank_f32_x4(%arg0: memref<128x128xf32, 3>) -> vector<4x1xf32> {
  %c0 = arith.constant 0 : index
  // expected-error @+1 {{results must be 2 dimensional vector}}
  %a = nvgpu.ldmatrix %arg0[%c0, %c0] {transpose = false, numTiles = 4 : i32} : memref<128x128xf32, 3> -> vector<4xf32>
  return %a : vector<4xf32>
}
// -----

func.func @ldmatrix_type_x4(%arg0: memref<128x128xf32, 3>) -> vector<4x2xf16> {
  %c0 = arith.constant 0 : index
  // expected-error @+1 {{'nvgpu.ldmatrix' op failed to verify that srcMemref and res have same element type}}
  %a = nvgpu.ldmatrix %arg0[%c0, %c0] {transpose = false, numTiles = 4 : i32} : memref<128x128xf32, 3> -> vector<4x2xf16>
  return %a : vector<4x2xf16>
}
// -----

func.func @m16n8k16_fp16_vector_shape_a(%arg0: vector<4x4xf16>, %arg1: vector<2x2xf16>, %arg2: vector<2x2xf16>) -> vector<2x2xf16> {
  // expected-error @+1 {{expected 256 warp-wide matrix A elements}}
  %d = nvgpu.mma.sync (%arg0, %arg1, %arg2) {mmaShape = [16, 8, 16]} : (vector<4x4xf16>, vector<2x2xf16>, vector<2x2xf16>) -> vector<2x2xf16>
  return %d : vector<2x2xf16>
}
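// A hedged sketch of the shapes that do verify for m16n8k16 f16, taken from
// the operand types these tests perturb one at a time: A is vector<4x2xf16>,
// B and C are vector<2x2xf16>. Illustrative only, not an upstream test case.
func.func @m16n8k16_fp16_valid(%arg0: vector<4x2xf16>, %arg1: vector<2x2xf16>, %arg2: vector<2x2xf16>) -> vector<2x2xf16> {
  %d = nvgpu.mma.sync (%arg0, %arg1, %arg2) {mmaShape = [16, 8, 16]} : (vector<4x2xf16>, vector<2x2xf16>, vector<2x2xf16>) -> vector<2x2xf16>
  return %d : vector<2x2xf16>
}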
// -----

func.func @m16n8k16_fp16_vector_shape_b(%arg0: vector<4x2xf16>, %arg1: vector<2x4xf16>, %arg2: vector<2x2xf16>) -> vector<2x2xf16> {
  // expected-error @+1 {{expected 128 warp-wide matrix B elements}}
  %d = nvgpu.mma.sync (%arg0, %arg1, %arg2) {mmaShape = [16, 8, 16]} : (vector<4x2xf16>, vector<2x4xf16>, vector<2x2xf16>) -> vector<2x2xf16>
  return %d : vector<2x2xf16>
}
// -----

func.func @m16n8k16_fp16_vector_shape_c(%arg0: vector<4x2xf16>, %arg1: vector<2x2xf16>, %arg2: vector<2x4xf16>) -> vector<2x4xf16> {
  // expected-error @+1 {{expected 128 warp-wide matrix C elements}}
  %d = nvgpu.mma.sync (%arg0, %arg1, %arg2) {mmaShape = [16, 8, 16]} : (vector<4x2xf16>, vector<2x2xf16>, vector<2x4xf16>) -> vector<2x4xf16>
  return %d : vector<2x4xf16>
}
// -----

func.func @m16n8k16_fp16_vector_shape_a_extended(%arg0: vector<2x4xf16>, %arg1: vector<2x2xf16>, %arg2: vector<2x2xf16>) -> vector<2x2xf16> {
  // expected-error @+1 {{expected matrix A to be shaped (4 x 2)}}
  %d = nvgpu.mma.sync (%arg0, %arg1, %arg2) {mmaShape = [16, 8, 16]} : (vector<2x4xf16>, vector<2x2xf16>, vector<2x2xf16>) -> vector<2x2xf16>
  return %d : vector<2x2xf16>
}
// -----

func.func @m16n8k16_fp16_tf32Enabled(%arg0: vector<4x2xf16>, %arg1: vector<2x2xf16>, %arg2: vector<2x2xf16>) -> vector<2x2xf16> {
  // expected-error @+1 {{expected tf32 tensor cores only for F32 operands}}
  %d = nvgpu.mma.sync (%arg0, %arg1, %arg2) {mmaShape = [16, 8, 16], tf32Enabled} : (vector<4x2xf16>, vector<2x2xf16>, vector<2x2xf16>) -> vector<2x2xf16>
  return %d : vector<2x2xf16>
}
// -----

func.func @m16n8k8_fp32_vector_shape_a(%arg0: vector<4x2xf32>, %arg1: vector<2x1xf32>, %arg2: vector<2x2xf32>) -> vector<2x2xf32> {
  // expected-error @+1 {{expected 128 warp-wide matrix A elements}}
  %d = nvgpu.mma.sync (%arg0, %arg1, %arg2) {mmaShape = [16, 8, 8]} : (vector<4x2xf32>, vector<2x1xf32>, vector<2x2xf32>) -> vector<2x2xf32>
  return %d : vector<2x2xf32>
}
// -----

func.func @m16n8k8_fp32_vector_shape_a_extended(%arg0: vector<1x4xf32>, %arg1: vector<2x1xf32>, %arg2: vector<2x2xf32>) -> vector<2x2xf32> {
  // expected-error @+1 {{expected matrix A to be shaped (4 x 1)}}
  %d = nvgpu.mma.sync (%arg0, %arg1, %arg2) {mmaShape = [16, 8, 8]} : (vector<1x4xf32>, vector<2x1xf32>, vector<2x2xf32>) -> vector<2x2xf32>
  return %d : vector<2x2xf32>
}
// -----

func.func @m8n8k4_fp64_vector_shape_a(%arg0: vector<1x2xf64>, %arg1: vector<1x1xf64>, %arg2: vector<1x2xf64>) -> vector<1x2xf64> {
  // expected-error @+1 {{expected 32 warp-wide matrix A elements}}
  %d = nvgpu.mma.sync (%arg0, %arg1, %arg2) {mmaShape = [8, 8, 4]} : (vector<1x2xf64>, vector<1x1xf64>, vector<1x2xf64>) -> vector<1x2xf64>
  return %d : vector<1x2xf64>
}
// -----

func.func @m8n8k4_fp64_vector_shape_c_extended(%arg0: vector<1x1xf64>, %arg1: vector<1x1xf64>, %arg2: vector<2x1xf64>) -> vector<2x1xf64> {
  // expected-error @+1 {{expected matrix C to be shaped (1 x 2)}}
  %d = nvgpu.mma.sync (%arg0, %arg1, %arg2) {mmaShape = [8, 8, 4]} : (vector<1x1xf64>, vector<1x1xf64>, vector<2x1xf64>) -> vector<2x1xf64>
  return %d : vector<2x1xf64>
}
// -----

func.func @m16n8k32_int8_vector_shape_b(%arg0: vector<4x4xi8>, %arg1: vector<4x4xi8>, %arg2: vector<2x2xi32>) -> vector<2x2xi32> {
  // expected-error @+1 {{expected 256 warp-wide matrix B elements}}
  %d = nvgpu.mma.sync (%arg0, %arg1, %arg2) {mmaShape = [16, 8, 32]} : (vector<4x4xi8>, vector<4x4xi8>, vector<2x2xi32>) -> vector<2x2xi32>
  return %d : vector<2x2xi32>
}
// -----

func.func @m16n8k32_int32_datatype(%arg0: vector<4x4xi32>, %arg1: vector<2x4xi8>, %arg2: vector<2x2xi32>) -> vector<2x2xi32> {
  // expected-error @+1 {{op failed to verify that matrixA and matrixB have same element type}}
  %d = nvgpu.mma.sync (%arg0, %arg1, %arg2) {mmaShape = [16, 8, 32]} : (vector<4x4xi32>, vector<2x4xi8>, vector<2x2xi32>) -> vector<2x2xi32>
  return %d : vector<2x2xi32>
}
// -----

func.func @async_cp_memory_space(%dst : memref<16xf32>, %src : memref<16xf32>, %i : index) -> () {
  // expected-error @below {{destination memref must have a memory space attribute of IntegerAttr(3) or gpu::AddressSpaceAttr(Workgroup)}}
  nvgpu.device_async_copy %src[%i], %dst[%i], 16 : memref<16xf32> to memref<16xf32>
  return
}
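// A hedged sketch of a copy that satisfies the constraints exercised in this
// group: a shared-memory (space 3) destination, matching element types and
// index counts, and a legal element count for f32 (1, 2, or 4 per the
// diagnostics below). Illustrative only, not an upstream test case.
func.func @async_cp_valid(%dst : memref<16xf32, 3>, %src : memref<16xf32>, %i : index) -> () {
  %0 = nvgpu.device_async_copy %src[%i], %dst[%i], 4 : memref<16xf32> to memref<16xf32, 3>
  return
}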
// -----

func.func @async_cp_memref_type(%dst : memref<16xi32, 3>, %src : memref<16xf32>, %i : index) -> () {
  // expected-error @+1 {{source and destination must have the same element type}}
  nvgpu.device_async_copy %src[%i], %dst[%i], 16 : memref<16xf32> to memref<16xi32, 3>
  return
}
// -----

func.func @async_cp_num_src_indices(%dst : memref<16xf32, 3>, %src : memref<16x16xf32>, %i : index) -> () {
  // expected-error @+1 {{expected 2 source indices, got 1}}
  nvgpu.device_async_copy %src[%i], %dst[%i], 16 : memref<16x16xf32> to memref<16xf32, 3>
  return
}
// -----

func.func @async_cp_num_dst_indices(%dst : memref<16x16xf32, 3>, %src : memref<16xf32>, %i : index) -> () {
  // expected-error @+1 {{expected 2 destination indices, got 1}}
  nvgpu.device_async_copy %src[%i], %dst[%i], 16 : memref<16xf32> to memref<16x16xf32, 3>
  return
}
// -----

func.func @async_cp_num_src_stride(
  %dst : memref<200x100xf32, 3>,
  %src : memref<200x100xf32, affine_map<(d0, d1) -> (200*d0 + 2*d1)>>,
  %i : index) -> () {
  // expected-error @+1 {{source memref most minor dim must have unit stride}}
  nvgpu.device_async_copy %src[%i, %i], %dst[%i, %i], 16 :
    memref<200x100xf32, affine_map<(d0, d1) -> (200*d0 + 2*d1)>> to memref<200x100xf32, 3>
  return
}
// -----

func.func @async_cp_num_dst_stride(
  %dst : memref<200x100xf32, affine_map<(d0, d1) -> (200*d0 + 2*d1)>, 3>,
  %src : memref<200x100xf32>,
  %i : index) -> () {
  // expected-error @+1 {{destination memref most minor dim must have unit stride}}
  nvgpu.device_async_copy %src[%i, %i], %dst[%i, %i], 16 :
    memref<200x100xf32> to memref<200x100xf32, affine_map<(d0, d1) -> (200*d0 + 2*d1)>, 3>
  return
}
// -----

// 42 is never the answer!
func.func @mma_sp_sync_f16_16816(%arg0: vector<2x2xf16>,
                                 %arg1: vector<2x2xf16>,
                                 %arg2: vector<2x2xf16>,
                                 %arg3: vector<2xi16>) -> vector<2x2xf16> {
  // expected-error @+1 {{'nvgpu.mma.sp.sync' op sparsity selector should be 0 or 1}}
  %d = nvgpu.mma.sp.sync(%arg0, %arg1, %arg2) metadata(%arg3) {mmaShape = [16, 8, 16], sparsitySelector = 42 : i32} :
       (vector<2x2xf16>, vector<2x2xf16>, vector<2x2xf16>) -> vector<2x2xf16>
  return %d : vector<2x2xf16>
}
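// The same op with a legal selector, as a hedged sketch: per the diagnostic
// above, sparsitySelector must be 0 or 1. Illustrative only.
func.func @mma_sp_sync_f16_16816_valid(%arg0: vector<2x2xf16>,
                                       %arg1: vector<2x2xf16>,
                                       %arg2: vector<2x2xf16>,
                                       %arg3: vector<2xi16>) -> vector<2x2xf16> {
  %d = nvgpu.mma.sp.sync(%arg0, %arg1, %arg2) metadata(%arg3) {mmaShape = [16, 8, 16], sparsitySelector = 0 : i32} :
       (vector<2x2xf16>, vector<2x2xf16>, vector<2x2xf16>) -> vector<2x2xf16>
  return %d : vector<2x2xf16>
}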

// -----

func.func @async_cp_zfill_f32_align1(
  %src: memref<128x128xf32>, %dst: memref<3x16x128xf32, 3>, %i : index, %srcElements : index) {
  // expected-error @+1 {{'nvgpu.device_async_copy' op bypassL1 does not satify alignment for 'memref<3x16x128xf32, 3>' with destination element 1. Unset bypassL1, or set destination element to 4}}
  %0 = nvgpu.device_async_copy %src[%i, %i], %dst[%i, %i, %i], 1, %srcElements {bypassL1} : memref<128x128xf32> to memref<3x16x128xf32, 3>
  return
}
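// Following the diagnostic's own suggestion, a hedged sketch of the fixed op:
// with bypassL1, the destination element count must be 4 for f32 so the copy
// is 16-byte aligned. Illustrative only, not an upstream test case.
func.func @async_cp_zfill_f32_align4(
  %src: memref<128x128xf32>, %dst: memref<3x16x128xf32, 3>, %i : index, %srcElements : index) {
  %0 = nvgpu.device_async_copy %src[%i, %i], %dst[%i, %i, %i], 4, %srcElements {bypassL1} : memref<128x128xf32> to memref<3x16x128xf32, 3>
  return
}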

// -----

func.func @async_cp_size_invalid_f32(
  %src: memref<128x128xf32>, %dst: memref<3x16x128xf32, 3>, %i : index) {
  // expected-error @+1 {{Requested copy elements is 3 with width 32. But copy elements could be one of 1, 2, 4.}}
  %0 = nvgpu.device_async_copy %src[%i, %i], %dst[%i, %i, %i], 3 : memref<128x128xf32> to memref<3x16x128xf32, 3>
  return
}
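// A hedged sketch with a legal element count: per the diagnostic above, for
// f32 the per-thread copy may be 1, 2, or 4 elements. Illustrative only.
func.func @async_cp_size_valid_f32(
  %src: memref<128x128xf32>, %dst: memref<3x16x128xf32, 3>, %i : index) {
  %0 = nvgpu.device_async_copy %src[%i, %i], %dst[%i, %i, %i], 2 : memref<128x128xf32> to memref<3x16x128xf32, 3>
  return
}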

// -----

func.func @async_cp_size_invalid_f16(
  %src: memref<128x128xf16>, %dst: memref<3x16x128xf16, 3>, %i : index) {
  // expected-error @+1 {{Requested copy elements is 3 with width 16. But copy elements could be one of 2, 4, 8.}}
  %0 = nvgpu.device_async_copy %src[%i, %i], %dst[%i, %i, %i], 3 : memref<128x128xf16> to memref<3x16x128xf16, 3>
  return
}

// -----

func.func @async_cp_size_invalid_f64(
  %src: memref<128x128xf64>, %dst: memref<3x16x128xf64, 3>, %i : index) {
  // expected-error @+1 {{Requested copy elements is 3 with width 64. But copy elements could be one of 1, 2.}}
  %0 = nvgpu.device_async_copy %src[%i, %i], %dst[%i, %i, %i], 3 : memref<128x128xf64> to memref<3x16x128xf64, 3>
  return
}

// -----

!tResult = !nvgpu.warpgroup.accumulator<fragmented = vector<128x128xf32>>
!tDescA  = !nvgpu.warpgroup.descriptor<tensor = memref<128x64xf16, 3>>
!tDescB  = !nvgpu.warpgroup.descriptor<tensor = memref<64x121xf16, 3>>

func.func @warpgroup_mma_wrong_input(%descA: !tDescA, %descB: !tDescB, %acc: !tResult) {
  // expected-error @+1 {{'nvgpu.warpgroup.mma' op 2nd dim matrix-B ( 121 ) != 2nd dim matrix-C ( 128 )}}
  %0 = nvgpu.warpgroup.mma %descA, %descB, %acc: !tDescA, !tDescB, !tResult -> !tResult
  return
}
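// A hedged sketch of shapes that line up, reusing the types these tests
// perturb: A = 128x64, B = 64x128, C = 128x128, all 2-D, with matching f16
// inputs and an f32 accumulator. Illustrative only, not an upstream test case.
!tResultOk = !nvgpu.warpgroup.accumulator<fragmented = vector<128x128xf32>>
!tDescAOk  = !nvgpu.warpgroup.descriptor<tensor = memref<128x64xf16, 3>>
!tDescBOk  = !nvgpu.warpgroup.descriptor<tensor = memref<64x128xf16, 3>>
func.func @warpgroup_mma_valid(%descA: !tDescAOk, %descB: !tDescBOk, %acc: !tResultOk) {
  %0 = nvgpu.warpgroup.mma %descA, %descB, %acc: !tDescAOk, !tDescBOk, !tResultOk -> !tResultOk
  return
}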

// -----

!tResult = !nvgpu.warpgroup.accumulator<fragmented = vector<128xf32>>
!tDescA  = !nvgpu.warpgroup.descriptor<tensor = memref<128x64xf16, 3>>
!tDescB  = !nvgpu.warpgroup.descriptor<tensor = memref<64x128xf16, 3>>
func.func @warpgroup_mma_wrong_input(%descA: !tDescA, %descB: !tDescB, %acc: !tResult) {
  // expected-error @+1 {{'nvgpu.warpgroup.mma' op has matrices A, B, C and D, they must be 2 dimensional}}
  %0 = nvgpu.warpgroup.mma %descA, %descB, %acc: !tDescA, !tDescB, !tResult -> !tResult
  return
}

// -----

!tResult = !nvgpu.warpgroup.accumulator<fragmented = vector<128x128xf32>>
!tDescA  = !nvgpu.warpgroup.descriptor<tensor = memref<128x64xf16, 3>>
!tDescB  = !nvgpu.warpgroup.descriptor<tensor = memref<64x128xf32, 3>>
func.func @warpgroup_mma_wrong_input(%descA: !tDescA, %descB: !tDescB, %acc: !tResult) {
  // expected-error @+1 {{'nvgpu.warpgroup.mma' op 'f32' += 'f16' * 'f32', it is not supported.}}
  %0 = nvgpu.warpgroup.mma %descA, %descB, %acc: !tDescA, !tDescB, !tResult -> !tResult
  return
}

// -----

!tResult = !nvgpu.warpgroup.accumulator<fragmented = vector<128x128xf32>>
!tDescA  = !nvgpu.warpgroup.descriptor<tensor = memref<128x64xf16, 3>>
!tDescB  = !nvgpu.warpgroup.descriptor<tensor = memref<64x512xf16, 3>>
func.func @warpgroup_mma_wrong_input(%descA: !tDescA, %descB: !tDescB, %acc: !tResult) {
  // expected-error @+1 {{'nvgpu.warpgroup.mma' op 2nd dim matrix-B ( 512 ) != 2nd dim matrix-C ( 128 )}}
  %0 = nvgpu.warpgroup.mma %descA, %descB, %acc: !tDescA, !tDescB, !tResult -> !tResult
  return
}

// -----

!desc = !nvgpu.tensormap.descriptor<tensor = memref<32x32xf32,3>, swizzle=swizzle_32b, l2promo = none, oob = zero, interleave = none>
!mbarrier = !nvgpu.mbarrier.group<memorySpace = #gpu.address_space<workgroup>>
func.func @tma_load_1(%desc: !desc, %buffer1: memref<128xf32,3>, %buffer2: memref<32x32xf32,3>, %buffer3: memref<32x32xf32>, %mbarrier: !mbarrier) {
  %c0 = arith.constant 0 : index
  // Passes fine: two coordinates are within the 5-coordinate limit.
  nvgpu.tma.async.load %desc[%c0, %c0], %mbarrier[%c0] to %buffer2 : !desc, !mbarrier -> memref<32x32xf32,3>
  // expected-error @+1 {{Maximum 5 coordinates are supported.}}
  nvgpu.tma.async.load %desc[%c0, %c0, %c0, %c0, %c0, %c0], %mbarrier[%c0] to %buffer2 : !desc, !mbarrier -> memref<32x32xf32,3>
  return
}
// -----

!desc = !nvgpu.tensormap.descriptor<tensor = memref<32x32xf32>, swizzle=swizzle_32b, l2promo = none, oob = zero, interleave = none>
!mbarrier = !nvgpu.mbarrier.group<memorySpace = #gpu.address_space<workgroup>>
func.func @tma_load_2(%desc: !desc, %buffer1: memref<128xf32,3>, %buffer2: memref<32x32xf32,3>, %buffer3: memref<32x32xf32>, %mbarrier: !mbarrier) {
  %c0 = arith.constant 0 : index
  // expected-error @+1 {{the tensor map descriptor has incorrect address space, it must be shared memory address space.}}
  nvgpu.tma.async.load %desc[%c0, %c0], %mbarrier[%c0] to %buffer2 : !desc, !mbarrier -> memref<32x32xf32,3>
  return
}
// -----

!desc = !nvgpu.tensormap.descriptor<tensor = memref<32x32xf32,3>, swizzle=swizzle_32b, l2promo = none, oob = zero, interleave = none>
!mbarrier = !nvgpu.mbarrier.group<memorySpace = #gpu.address_space<workgroup>>
func.func @tma_load_3(%desc: !desc, %buffer1: memref<128xf32,3>, %buffer2: memref<32x32xf32,3>, %buffer3: memref<32x32xf32>, %mbarrier: !mbarrier) {
  %c0 = arith.constant 0 : index
  // expected-error @+1 {{the destination memref has incorrect address space, it must be shared memory address space}}
  nvgpu.tma.async.load %desc[%c0, %c0], %mbarrier[%c0] to %buffer3 : !desc, !mbarrier -> memref<32x32xf32>
  return
}
// -----

!desc = !nvgpu.tensormap.descriptor<tensor = memref<32x32xf32,3>, swizzle=swizzle_32b, l2promo = none, oob = zero, interleave = none>
!mbarrier = !nvgpu.mbarrier.group<memorySpace = #gpu.address_space<workgroup>>
func.func @tma_load_4(%desc: !desc, %buffer1: memref<128xf32,3>, %buffer2: memref<32x32xf32,3>, %buffer3: memref<32x32xf32>, %mbarrier: !mbarrier) {
  %c0 = arith.constant 0 : index
  // expected-error @+1 {{the shape of tensor map descriptor and memref must have same rank}}
  nvgpu.tma.async.load %desc[%c0, %c0], %mbarrier[%c0] to %buffer1 : !desc, !mbarrier -> memref<128xf32,3>
  return
}

// -----

!desc = !nvgpu.tensormap.descriptor<tensor = memref<64x128xf16,3>, swizzle=swizzle_32b, l2promo = none, oob = zero, interleave = none>
func.func @tma_generate_descriptor_incorrect_last_dim(%b0 : index, %b1 : index, %mem : memref<*xf16>) {
  // expected-error @+1 {{the tensormap descriptor must have last dimension of 128 bytes but it is 256 bytes}}
  %descA = nvgpu.tma.create.descriptor %mem box[%b0, %b1] : memref<*xf16> -> !desc
  return
}
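// A hedged sketch of a descriptor that satisfies the rule above, assuming the
// 128-byte last-dimension check is the only constraint being exercised here:
// 64 f16 elements is exactly 128 bytes. Illustrative only.
!descOk = !nvgpu.tensormap.descriptor<tensor = memref<64x64xf16,3>, swizzle=swizzle_32b, l2promo = none, oob = zero, interleave = none>
func.func @tma_generate_descriptor_valid_last_dim(%b0 : index, %b1 : index, %mem : memref<*xf16>) {
  %descA = nvgpu.tma.create.descriptor %mem box[%b0, %b1] : memref<*xf16> -> !descOk
  return
}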
// -----

!desc = !nvgpu.tensormap.descriptor<tensor = memref<64x128xf32,3>, swizzle=swizzle_32b, l2promo = none, oob = zero, interleave = none>
!mbarrier = !nvgpu.mbarrier.group<memorySpace = #gpu.address_space<workgroup>>
func.func @tma_load_incorrect_last_dim(%desc: !desc, %buffer2: memref<64x128xf32,3>, %mbarrier: !mbarrier) {
  %c0 = arith.constant 0 : index
  // expected-error @+1 {{the tensormap descriptor must have last dimension of 128 bytes but it is 512 bytes}}
  nvgpu.tma.async.load %desc[%c0, %c0], %mbarrier[%c0] to %buffer2 : !desc, !mbarrier -> memref<64x128xf32,3>
  return
}
// -----

func.func @rcp_unsupported_rounding_0(%in : vector<16xf32>) {
  // expected-error @+1 {{'nvgpu.rcp' op has a limitation. #nvgpu<rcp_rounding_mode rn> or non-ftz is not supported yet.}}
  %out = nvgpu.rcp %in {rounding = rn, ftz} : vector<16xf32>
  return
}
// -----

func.func @rcp_unsupported_rounding_1(%in : vector<16xf32>) {
  // expected-error @+1 {{'nvgpu.rcp' op has a limitation. #nvgpu<rcp_rounding_mode rz> or non-ftz is not supported yet.}}
  %out = nvgpu.rcp %in {rounding = rz} : vector<16xf32>
  return
}
// -----

func.func @rcp_unsupported_ftz(%in : vector<16xf32>) {
  // expected-error @+1 {{'nvgpu.rcp' op has a limitation. #nvgpu<rcp_rounding_mode approx> or non-ftz is not supported yet.}}
  %out = nvgpu.rcp %in {rounding = approx} : vector<16xf32>
  return
}
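// The combination the diagnostics above imply is supported: approx rounding
// together with ftz. A hedged sketch, not an upstream test case.
func.func @rcp_approx_ftz(%in : vector<16xf32>) {
  %out = nvgpu.rcp %in {rounding = approx, ftz} : vector<16xf32>
  return
}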

// -----

func.func @check_matrixA_dim(%arg0: vector<16xf16>, %arg1: vector<2x2xf16>, %arg2: vector<2x2xf16>) -> vector<2x2xf16> {
  // expected-error @+1 {{matrixA must be 2 dimensional vector}}
  %d = nvgpu.mma.sync (%arg0, %arg1, %arg2) {mmaShape = [16, 8, 16]} : (vector<16xf16>, vector<2x2xf16>, vector<2x2xf16>) -> vector<2x2xf16>
  return %d : vector<2x2xf16>
}

// -----

func.func @check_matrixB_dim(%arg0: vector<4x4xf16>, %arg1: vector<4xf16>, %arg2: vector<2x2xf16>) -> vector<2x2xf16> {
  // expected-error @+1 {{matrixB must be 2 dimensional vector}}
  %d = nvgpu.mma.sync (%arg0, %arg1, %arg2) {mmaShape = [16, 8, 16]} : (vector<4x4xf16>, vector<4xf16>, vector<2x2xf16>) -> vector<2x2xf16>
  return %d : vector<2x2xf16>
}

// -----

func.func @check_matrixC_dim(%arg0: vector<4x4xf16>, %arg1: vector<2x2xf16>, %arg2: vector<4xf16>) -> vector<2x2xf16> {
  // expected-error @+1 {{matrixC must be 2 dimensional vector}}
  %d = nvgpu.mma.sync (%arg0, %arg1, %arg2) {mmaShape = [16, 8, 16]} : (vector<4x4xf16>, vector<2x2xf16>, vector<4xf16>) -> vector<2x2xf16>
  return %d : vector<2x2xf16>
}