// RUN: mlir-opt %s -split-input-file -pass-pipeline="builtin.module(func.func(convert-vector-to-gpu{use-nvgpu=true}))" | FileCheck %s

//#########################################################
// INT8 row-row-row
//#########################################################

// CHECK-DAG: [[$strided_map:#.+]] = affine_map<()[s0] -> (s0 mod 16)>
// CHECK-DAG: [[$contiguous_map:#.+]] = affine_map<()[s0] -> ((s0 floordiv 16) * 16)>

// CHECK-DAG: [[$rowB0_map:#.+]] = affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 4) * 16 + 39)>
// CHECK-DAG: [[$colB0_map:#.+]] = affine_map<()[s0] -> (s0 floordiv 4 + 40)>
// CHECK-DAG: [[$rowB1_map:#.+]] = affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 4) * 16 + 40)>
// CHECK-DAG: [[$rowB2_map:#.+]] = affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 4) * 16 + 41)>
// CHECK-DAG: [[$rowB3_map:#.+]] = affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 4) * 16 + 42)>
// CHECK-DAG: [[$rowB4_map:#.+]] = affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 4) * 16 + 55)>
// CHECK-DAG: [[$rowB5_map:#.+]] = affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 4) * 16 + 56)>
// CHECK-DAG: [[$rowB6_map:#.+]] = affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 4) * 16 + 57)>
// CHECK-DAG: [[$rowB7_map:#.+]] = affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 4) * 16 + 58)>

// CHECK-DAG: [[$rowC0_map:#.+]] = affine_map<()[s0] -> (s0 floordiv 4 + 49)>
// CHECK-DAG: [[$colC0_map:#.+]] = affine_map<()[s0] -> (s0 * 2 - (s0 floordiv 4) * 8 + 40)>
// CHECK-DAG: [[$rowC8_map:#.+]] = affine_map<()[s0] -> (s0 floordiv 4 + 57)>
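
// Illustrative arithmetic (not part of the checked output): for lane 17 the
// operand A maps above give row 17 mod 16 = 1 and column (17 floordiv 16) * 16 = 16,
// i.e. each lane addresses one 16-byte row of the ldmatrix tile. The constant
// offsets in the B and C maps (+39, +40, +49, ...) come from the base indices
// %c39, %c40 and %c49 used by the transfer ops below.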


#map0 = affine_map<(d0, d1) -> (d1, d0)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d2)>
#map2 = affine_map<(d0, d1, d2) -> (d1, d2)>
#map3 = affine_map<(d0, d1, d2) -> (d0, d1)>

// CHECK-LABEL: func @m16n8k32_int8_row_row_row
func.func @m16n8k32_int8_row_row_row(%arg0: memref<128x128xi8, #gpu.address_space<workgroup>>, %arg1: memref<128x128xi8, #gpu.address_space<workgroup>>, %arg2: memref<128x128xi32>) {
  %cst_0 = arith.constant dense<0> : vector<32x8xi8>
  %c0 = arith.constant 0 : index
  %c1 = arith.constant 1 : index
  %c17 = arith.constant 17 : index
  %c39 = arith.constant 39 : index
  %c40 = arith.constant 40 : index
  %c49 = arith.constant 49 : index
  %c50 = arith.constant 50 : index
  %cst = arith.constant 0 : i8
  %cst0 = arith.constant 0 : i32

  // Verify that the operandA load is lowered to a warp-wide ldmatrix.

  // CHECK: [[m_coord:%.+]] = affine.apply [[$strided_map]]()[{{%.+}}]
  // CHECK: [[k_coord:%.+]] = affine.apply [[$contiguous_map]]()[{{%.+}}]
  // CHECK: nvgpu.ldmatrix %arg0[[[m_coord]], [[k_coord]]] {numTiles = 4 : i32, transpose = false} : memref<128x128xi8, #gpu.address_space<workgroup>> -> vector<4x4xi8>

  // Verify that the operandB load is lowered to scalar loads so that the
  // transpose can be done at 8-bit granularity; ldmatrix can only transpose at
  // 16-bit granularity.
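
  // Illustrative note (not part of the checked output): operand B is an 8x32 i8
  // tile, i.e. 8x32 / 32 lanes = 8 scalars per lane, which is why exactly eight
  // memref.load ops are expected below; operand A distributes 16x32 / 32 = 16 i8
  // per lane, matching the vector<4x4xi8> ldmatrix result above.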

  // CHECK-DAG: [[row:%.+]] = affine.apply [[$rowB0_map]]()[{{%.+}}]
  // CHECK-DAG: [[col:%.+]] = affine.apply [[$colB0_map]]()[{{%.+}}]
  // CHECK: memref.load %arg1[[[row]], [[col]]] : memref<128x128xi8, #gpu.address_space<workgroup>>
  // CHECK-DAG: [[row:%.+]] = affine.apply [[$rowB1_map]]()[{{%.+}}]
  // CHECK-DAG: [[col:%.+]] = affine.apply [[$colB0_map]]()[{{%.+}}]
  // CHECK: memref.load %arg1[[[row]], [[col]]] : memref<128x128xi8, #gpu.address_space<workgroup>>
  // CHECK-DAG: [[row:%.+]] = affine.apply [[$rowB2_map]]()[{{%.+}}]
  // CHECK-DAG: [[col:%.+]] = affine.apply [[$colB0_map]]()[{{%.+}}]
  // CHECK: memref.load %arg1[[[row]], [[col]]] : memref<128x128xi8, #gpu.address_space<workgroup>>
  // CHECK-DAG: [[row:%.+]] = affine.apply [[$rowB3_map]]()[{{%.+}}]
  // CHECK-DAG: [[col:%.+]] = affine.apply [[$colB0_map]]()[{{%.+}}]
  // CHECK: memref.load %arg1[[[row]], [[col]]] : memref<128x128xi8, #gpu.address_space<workgroup>>
  // CHECK-DAG: [[col:%.+]] = affine.apply [[$colB0_map]]()[{{%.+}}]
  // CHECK-DAG: [[row:%.+]] = affine.apply [[$rowB4_map]]()[{{%.+}}]
  // CHECK: memref.load %arg1[[[row]], [[col]]] : memref<128x128xi8, #gpu.address_space<workgroup>>
  // CHECK-DAG: [[row:%.+]] = affine.apply [[$rowB5_map]]()[{{%.+}}]
  // CHECK-DAG: [[col:%.+]] = affine.apply [[$colB0_map]]()[{{%.+}}]
  // CHECK: memref.load %arg1[[[row]], [[col]]] : memref<128x128xi8, #gpu.address_space<workgroup>>
  // CHECK-DAG: [[row:%.+]] = affine.apply [[$rowB6_map]]()[{{%.+}}]
  // CHECK-DAG: [[col:%.+]] = affine.apply [[$colB0_map]]()[{{%.+}}]
  // CHECK: memref.load %arg1[[[row]], [[col]]] : memref<128x128xi8, #gpu.address_space<workgroup>>
  // CHECK-DAG: [[row:%.+]] = affine.apply [[$rowB7_map]]()[{{%.+}}]
  // CHECK-DAG: [[col:%.+]] = affine.apply [[$colB0_map]]()[{{%.+}}]
  // CHECK: memref.load %arg1[[[row]], [[col]]] : memref<128x128xi8, #gpu.address_space<workgroup>>
  // CHECK-NOT: memref.load %arg1

  // Verify that the operand C is distributed to loads correctly.
  // CHECK: [[row:%.+]] = affine.apply [[$rowC0_map]]()[{{%.+}}]
  // CHECK: [[col:%.+]] = affine.apply [[$colC0_map]]()[{{%.+}}]
  // CHECK: vector.load %arg2[[[row]], [[col]]] : memref<128x128xi32>, vector<2xi32>
  // CHECK: [[row:%.+]] = affine.apply [[$rowC8_map]]()[{{%.+}}]
  // CHECK: [[col:%.+]] = affine.apply [[$colC0_map]]()[{{%.+}}]
  // CHECK: vector.load %arg2[[[row]], [[col]]] : memref<128x128xi32>, vector<2xi32>
  // CHECK-NOT: vector.load %arg2{{.*}}

  %A = vector.transfer_read %arg0[%c0, %c0], %cst {in_bounds = [true, true]} : memref<128x128xi8, #gpu.address_space<workgroup>>, vector<16x32xi8>
  %B = vector.transfer_read %arg1[%c39, %c40], %cst {in_bounds = [true, true], permutation_map = #map0} : memref<128x128xi8, #gpu.address_space<workgroup>>, vector<8x32xi8>
  %C = vector.transfer_read %arg2[%c49, %c40], %cst0 {in_bounds = [true, true]} : memref<128x128xi32>, vector<16x8xi32>
  // CHECK: [[d:%.+]] = nvgpu.mma.sync({{.*}}) {mmaShape = [16, 8, 32]} : (vector<4x4xi8>, vector<2x4xi8>, vector<2x2xi32>) -> vector<2x2xi32>
  %D = vector.contract {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %A, %B, %C : vector<16x32xi8>, vector<8x32xi8> into vector<16x8xi32>

  // CHECK: [[row:%.+]] = affine.apply [[$rowC0_map]]()[{{%.+}}]
  // CHECK: [[col:%.+]] = affine.apply [[$colC0_map]]()[{{%.+}}]
  // CHECK: vector.store {{%.+}}, %arg2[[[row]], [[col]]] : memref<128x128xi32>, vector<2xi32>
  // CHECK: [[row:%.+]] = affine.apply [[$rowC8_map]]()[{{%.+}}]
  // CHECK: [[col:%.+]] = affine.apply [[$colC0_map]]()[{{%.+}}]
  // CHECK: vector.store {{%.+}}, %arg2[[[row]], [[col]]] : memref<128x128xi32>, vector<2xi32>
  vector.transfer_write %D, %arg2[%c49, %c40] {in_bounds = [true, true]} : vector<16x8xi32>, memref<128x128xi32>
  return
}

// -----

//#########################################################
// f64 row-row-row
//#########################################################
// CHECK-DAG: [[$rowA0_map:#.+]] = affine_map<()[s0] -> (s0 floordiv 4 + 1)>
// CHECK-DAG: [[$colA0_map:#.+]] = affine_map<()[s0] -> (s0 mod 4 + 1)>

// CHECK-DAG: [[$rowb0_map:#.+]] = affine_map<()[s0] -> (s0 mod 4 + 39)>
// CHECK-DAG: [[$colb0_map:#.+]] = affine_map<()[s0] -> (s0 floordiv 4 + 40)>

// CHECK-DAG: [[$rowC0_map:#.+]] = affine_map<()[s0] -> (s0 floordiv 4 + 49)>
// CHECK-DAG: [[$colC0_map:#.+]] = affine_map<()[s0] -> (s0 * 2 - (s0 floordiv 4) * 8 + 40)
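
// Illustrative note (not part of the checked output): for m8n8k4 f64 each of the
// 32 lanes holds 8x4 / 32 = 1 element of A and of B and 8x8 / 32 = 2 elements of
// the accumulator, which is why A and B are loaded one element at a time below
// and the accumulator with vector<2xf64>.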

#map0 = affine_map<(d0, d1) -> (d1, d0)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d2)>
#map2 = affine_map<(d0, d1, d2) -> (d1, d2)>
#map3 = affine_map<(d0, d1, d2) -> (d0, d1)>

// CHECK-LABEL: func @m8n8k4_f64_row_row_row
func.func @m8n8k4_f64_row_row_row(%arg0: memref<128x128xf64>, %arg1: memref<128x128xf64>, %arg2: memref<128x128xf64>) {
  %cst_0 = arith.constant dense<0.0> : vector<4x8xf64>
  %c0 = arith.constant 0 : index
  %c1 = arith.constant 1 : index
  %c17 = arith.constant 17 : index
  %c39 = arith.constant 39 : index
  %c40 = arith.constant 40 : index
  %c49 = arith.constant 49 : index
  %c50 = arith.constant 50 : index
  %cst = arith.constant 0.0 : f64
  %cst0 = arith.constant 0.0 : f64

  // Verify that the operand A is distributed to loads correctly.

  // CHECK-DAG: [[row:%.+]] = affine.apply [[$rowA0_map]]
  // CHECK-DAG: [[col:%.+]] = affine.apply [[$colA0_map]]
  // CHECK: vector.load %arg0[[[row]], [[col]]] : memref<128x128xf64>, vector<1xf64>

  // Verify that the operand B is distributed to loads correctly. Its elements
  // must be loaded in a non-vectorized manner to do the transpose.

  // CHECK-DAG: [[row:%.+]] = affine.apply [[$rowb0_map]]
  // CHECK-DAG: [[col:%.+]] = affine.apply [[$colb0_map]]
  // CHECK: memref.load %arg1[[[row]], [[col]]] : memref<128x128xf64>

  // CHECK-DAG: [[row:%.+]] = affine.apply [[$rowC0_map]]
  // CHECK-DAG: [[col:%.+]] = affine.apply [[$colC0_map]]
  // CHECK: vector.load %arg2[[[row]], [[col]]] : memref<128x128xf64>, vector<2xf64>

  %A = vector.transfer_read %arg0[%c1, %c1], %cst {in_bounds = [true, true]} : memref<128x128xf64>, vector<8x4xf64>
  %B = vector.transfer_read %arg1[%c39, %c40], %cst {in_bounds = [true, true], permutation_map = #map0} : memref<128x128xf64>, vector<8x4xf64>
  %C = vector.transfer_read %arg2[%c49, %c40], %cst0 {in_bounds = [true, true]} : memref<128x128xf64>, vector<8x8xf64>
  // CHECK: [[d:%.+]] = nvgpu.mma.sync({{.*}}) {mmaShape = [8, 8, 4]} : (vector<1x1xf64>, vector<1x1xf64>, vector<1x2xf64>) -> vector<1x2xf64>
  %D = vector.contract {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %A, %B, %C : vector<8x4xf64>, vector<8x4xf64> into vector<8x8xf64>

  // CHECK-DAG: [[row:%.+]] = affine.apply [[$rowC0_map]]
  // CHECK-DAG: [[col:%.+]] = affine.apply [[$colC0_map]]
  // CHECK: vector.store {{%.+}}, %arg2[[[row]], [[col]]] : memref<128x128xf64>, vector<2xf64>
  vector.transfer_write %D, %arg2[%c49, %c40] {in_bounds = [true, true]} : vector<8x8xf64>, memref<128x128xf64>
  return
}

// -----

//#########################################################################
// FP16 row-row-row (ldmatrix x4 for matrixA and ldmatrix x2 for matrixB)
//#########################################################################

#map0 = affine_map<(d0, d1) -> (d1, d0)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d2)>
#map2 = affine_map<(d0, d1, d2) -> (d1, d2)>
#map3 = affine_map<(d0, d1, d2) -> (d0, d1)>

// CHECK-DAG: [[$strided_map:#.+]] = affine_map<()[s0] -> (s0 mod 16)>
// CHECK-DAG: [[$contiguous_map:#.+]] = affine_map<()[s0] -> ((s0 floordiv 16) * 8)>
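
// Illustrative note (not part of the checked output): the contiguous map steps by
// 8 f16 elements (16 bytes) per group of 16 lanes, mirroring the step of 16 in
// the i8 tests and 32 in the i4 test; each ldmatrix lane is assumed to supply the
// address of one 16-byte row.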

// CHECK-LABEL: func @m16n8k16_fp16_row_row_row
func.func @m16n8k16_fp16_row_row_row(%arg0: memref<20x20xf16, #gpu.address_space<workgroup>>, %arg1: memref<20x20xf16, #gpu.address_space<workgroup>>, %arg2: memref<20x20xf16, #gpu.address_space<workgroup>>) {
  %cst_0 = arith.constant dense<0.000000e+00> : vector<16x8xf16>
  %c0 = arith.constant 0 : index
  %cst = arith.constant 0.000000e+00 : f16

  // CHECK-DAG: [[m_coord:%.+]] = affine.apply [[$strided_map]]
  // CHECK-DAG: [[k_coord:%.+]] = affine.apply [[$contiguous_map]]
  // CHECK: nvgpu.ldmatrix %arg0[[[m_coord]], [[k_coord]]] {numTiles = 4 : i32, transpose = false}
  // CHECK-DAG: [[n_coord:%.+]] = affine.apply [[$contiguous_map]]
  // CHECK-DAG: [[k_coord:%.+]] = affine.apply [[$strided_map]]
  // CHECK: nvgpu.ldmatrix %arg1[[[k_coord]], [[n_coord]]] {numTiles = 2 : i32, transpose = true}
  %A = vector.transfer_read %arg0[%c0, %c0], %cst {in_bounds = [true, true]} : memref<20x20xf16, #gpu.address_space<workgroup>>, vector<16x16xf16>
  %B = vector.transfer_read %arg1[%c0, %c0], %cst {permutation_map = #map0, in_bounds = [true, true]} : memref<20x20xf16, #gpu.address_space<workgroup>>, vector<8x16xf16>
  %C = vector.transfer_read %arg2[%c0, %c0], %cst {in_bounds = [true, true]} : memref<20x20xf16, #gpu.address_space<workgroup>>, vector<16x8xf16>
  %D = vector.contract {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %A, %B, %C : vector<16x16xf16>, vector<8x16xf16> into vector<16x8xf16>
  vector.transfer_write %D, %arg2[%c0, %c0] {in_bounds = [true, true]} : vector<16x8xf16>, memref<20x20xf16, #gpu.address_space<workgroup>>
  return
}

// -----

//#########################################################################
// FP16 row-row-row (ldmatrix x4 for matrixA and ldmatrix x4 for matrixB)
//#########################################################################

// CHECK-DAG: [[$strided_map:#.+]] = affine_map<()[s0] -> (s0 mod 16)>
// CHECK-DAG: [[$contiguous_map:#.+]] = affine_map<()[s0] -> ((s0 floordiv 16) * 8)>
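
// Illustrative note (not part of the checked output): B and C are read as full
// 16x16 tiles with a single ldmatrix x4 each (vector<4x2xf16> per lane) and then
// split with vector.extract_strided_slice into 2x2 halves, one per m16n8k16
// mma.sync below.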

#map0 = affine_map<(d0, d1) -> (d1, d0)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d2)>
#map2 = affine_map<(d0, d1, d2) -> (d1, d2)>
#map3 = affine_map<(d0, d1, d2) -> (d0, d1)>

// CHECK-LABEL: func @m16n16k16_mmasync16816_fp16_f16_row_row_row
func.func @m16n16k16_mmasync16816_fp16_f16_row_row_row(%arg0: memref<42x32xf16, #gpu.address_space<workgroup>>, %arg1: memref<32x64xf16, #gpu.address_space<workgroup>>, %arg2: memref<42x64xf16, #gpu.address_space<workgroup>>) {
  %c0 = arith.constant 0 : index
  %c8 = arith.constant 8 : index
  %cst = arith.constant 0.000000e+00 : f16

  // CHECK-DAG: [[m_coord:%.+]] = affine.apply [[$strided_map]]
  // CHECK-DAG: [[k_coord:%.+]] = affine.apply [[$contiguous_map]]
  // CHECK: [[fragmentA:%.+]] = nvgpu.ldmatrix %arg0[[[m_coord]], [[k_coord]]] {numTiles = 4 : i32, transpose = false}
  %A = vector.transfer_read %arg0[%c0, %c0], %cst {in_bounds = [true, true]} : memref<42x32xf16, #gpu.address_space<workgroup>>, vector<16x16xf16>

  // CHECK-DAG: [[n_coord:%.+]] = affine.apply [[$contiguous_map]]
  // CHECK-DAG: [[k_coord:%.+]] = affine.apply [[$strided_map]]
  // CHECK-DAG: [[fragmentB:%.+]] = nvgpu.ldmatrix %arg1[[[k_coord]], [[n_coord]]] {numTiles = 4 : i32, transpose = true}
  %B = vector.transfer_read %arg1[%c0, %c0], %cst {permutation_map = #map0, in_bounds = [true, true]} : memref<32x64xf16, #gpu.address_space<workgroup>>, vector<16x16xf16>

  // CHECK-DAG: [[m_coord:%.+]] = affine.apply [[$strided_map]]
  // CHECK-DAG: [[n_coord:%.+]] = affine.apply [[$contiguous_map]]
  // CHECK-DAG: [[fragmentC:%.*]] = nvgpu.ldmatrix %arg2[[[m_coord]], [[n_coord]]] {numTiles = 4 : i32, transpose = false}
  %C = vector.transfer_read %arg2[%c0, %c0], %cst {in_bounds = [true, true]} : memref<42x64xf16, #gpu.address_space<workgroup>>, vector<16x16xf16>

  // CHECK-DAG: [[fragmentB0:%.+]] = vector.extract_strided_slice [[fragmentB]] {offsets = [0, 0], sizes = [2, 2], strides = [1, 1]} : vector<4x2xf16> to vector<2x2xf16>
  // CHECK-DAG: [[fragmentC0:%.+]] = vector.extract_strided_slice [[fragmentC]] {offsets = [0, 0], sizes = [2, 2], strides = [1, 1]} : vector<4x2xf16> to vector<2x2xf16>
  // CHECK: nvgpu.mma.sync([[fragmentA]], [[fragmentB0]], [[fragmentC0]]) {mmaShape = [16, 8, 16]} : (vector<4x2xf16>, vector<2x2xf16>, vector<2x2xf16>) -> vector<2x2xf16>
  %B0 = vector.extract_strided_slice %B {offsets = [0, 0], sizes = [8, 16], strides = [1, 1]} : vector<16x16xf16> to vector<8x16xf16>
  %C0 = vector.extract_strided_slice %C {offsets = [0, 0], sizes = [16, 8], strides = [1, 1]} : vector<16x16xf16> to vector<16x8xf16>
  %D0 = vector.contract {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %A, %B0, %C0 : vector<16x16xf16>, vector<8x16xf16> into vector<16x8xf16>
  vector.transfer_write %D0, %arg2[%c0, %c0] {in_bounds = [true, true]} : vector<16x8xf16>, memref<42x64xf16, #gpu.address_space<workgroup>>

  // CHECK-DAG: [[fragmentB1:%.+]] = vector.extract_strided_slice [[fragmentB]] {offsets = [2, 0], sizes = [2, 2], strides = [1, 1]} : vector<4x2xf16> to vector<2x2xf16>
  // CHECK-DAG: [[fragmentC1:%.+]] = vector.extract_strided_slice [[fragmentC]] {offsets = [2, 0], sizes = [2, 2], strides = [1, 1]} : vector<4x2xf16> to vector<2x2xf16>
  // CHECK: nvgpu.mma.sync([[fragmentA]], [[fragmentB1]], [[fragmentC1]]) {mmaShape = [16, 8, 16]} : (vector<4x2xf16>, vector<2x2xf16>, vector<2x2xf16>) -> vector<2x2xf16>
  %B1 = vector.extract_strided_slice %B {offsets = [8, 0], sizes = [8, 16], strides = [1, 1]} : vector<16x16xf16> to vector<8x16xf16>
  %C1 = vector.extract_strided_slice %C {offsets = [0, 8], sizes = [16, 8], strides = [1, 1]} : vector<16x16xf16> to vector<16x8xf16>
  %D1 = vector.contract {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %A, %B1, %C1 : vector<16x16xf16>, vector<8x16xf16> into vector<16x8xf16>
  vector.transfer_write %D1, %arg2[%c0, %c0] {in_bounds = [true, true]} : vector<16x8xf16>, memref<42x64xf16, #gpu.address_space<workgroup>>

  return
}
// -----

//#################################################################################################################
// FP16 row-row-row (Determine the transpose for multi-dimensional vector.transfer_read in vector-to-gpu lowering)
//#################################################################################################################

// CHECK-DAG: [[$strided_map:#.+]] = affine_map<()[s0] -> (s0 mod 16)>
// CHECK-DAG: [[$contiguous_map:#.+]] = affine_map<()[s0] -> ((s0 floordiv 16) * 8)>

#map0 = affine_map<(d0, d1, d2) -> (d2, d1)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d2)>
#map2 = affine_map<(d0, d1, d2) -> (d1, d2)>
#map3 = affine_map<(d0, d1, d2) -> (d0, d1)>
#map_a = affine_map<(d0, d1, d2, d3) -> (d1, d3)>
#map_b = affine_map<(d0, d1, d2, d3) -> (d3, d2)>
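
// Illustrative note (not part of the checked output): for these multi-dimensional
// reads the transpose is inferred from the permutation of the two innermost
// result dimensions (#map_a keeps A row-major, #map_b swaps B), while the
// remaining leading indices stay fixed at the constant 0 index.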

// CHECK-LABEL: func @multi_dim_m16n8k16_fp16_row_row_row
func.func @multi_dim_m16n8k16_fp16_row_row_row(%arg0: memref<4x32x1x32xf16, #gpu.address_space<workgroup>>, %arg1: memref<4x1x32x32xf16, #gpu.address_space<workgroup>>, %arg2:  memref<1x32x40xf16, #gpu.address_space<workgroup>>) {

  // CHECK-DAG: [[c0:%.+]] = arith.constant 0 : index
  %c0 = arith.constant 0 : index
  %cst = arith.constant 0.000000e+00 : f16

  // CHECK-DAG: [[m_coord:%.+]] = affine.apply [[$strided_map]]
  // CHECK-DAG: [[k_coord:%.+]] = affine.apply [[$contiguous_map]]
  // CHECK: [[fragmentA:%.+]] = nvgpu.ldmatrix %arg0[[[c0]], [[m_coord]], [[c0]], [[k_coord]]] {numTiles = 4 : i32, transpose = false}
  %A = vector.transfer_read %arg0[%c0, %c0, %c0, %c0], %cst {in_bounds = [true, true], permutation_map = #map_a} : memref<4x32x1x32xf16, #gpu.address_space<workgroup>>, vector<16x16xf16>

  // CHECK-DAG: [[n_coord:%.+]] = affine.apply [[$contiguous_map]]
  // CHECK-DAG: [[k_coord:%.+]] = affine.apply [[$strided_map]]
  // CHECK-DAG: [[fragmentB:%.+]] = nvgpu.ldmatrix %arg1[[[c0]], [[c0]], [[k_coord]], [[n_coord]]] {numTiles = 4 : i32, transpose = true}
  %B = vector.transfer_read %arg1[%c0, %c0, %c0, %c0], %cst {in_bounds = [true, true], permutation_map = #map_b} : memref<4x1x32x32xf16, #gpu.address_space<workgroup>>, vector<16x16xf16>

  // CHECK-DAG: [[m_coord:%.+]] = affine.apply [[$strided_map]]
  // CHECK-DAG: [[n_coord:%.+]] = affine.apply [[$contiguous_map]]
  // CHECK-DAG: [[fragmentC:%.*]] = nvgpu.ldmatrix %arg2[[[c0]], [[m_coord]], [[n_coord]]] {numTiles = 4 : i32, transpose = false}
  %C = vector.transfer_read %arg2[%c0, %c0, %c0], %cst {in_bounds = [true, true]} : memref<1x32x40xf16, #gpu.address_space<workgroup>>, vector<16x16xf16>

  // CHECK-DAG: [[fragmentB0:%.+]] = vector.extract_strided_slice [[fragmentB]] {offsets = [0, 0], sizes = [2, 2], strides = [1, 1]} : vector<4x2xf16> to vector<2x2xf16>
  // CHECK-DAG: [[fragmentC0:%.+]] = vector.extract_strided_slice [[fragmentC]] {offsets = [0, 0], sizes = [2, 2], strides = [1, 1]} : vector<4x2xf16> to vector<2x2xf16>
  // CHECK: nvgpu.mma.sync([[fragmentA]], [[fragmentB0]], [[fragmentC0]]) {mmaShape = [16, 8, 16]} : (vector<4x2xf16>, vector<2x2xf16>, vector<2x2xf16>) -> vector<2x2xf16>
  %B0 = vector.extract_strided_slice %B {offsets = [0, 0], sizes = [8, 16], strides = [1, 1]} : vector<16x16xf16> to vector<8x16xf16>
  %C0 = vector.extract_strided_slice %C {offsets = [0, 0], sizes = [16, 8], strides = [1, 1]} : vector<16x16xf16> to vector<16x8xf16>
  %D0 = vector.contract {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %A, %B0, %C0 : vector<16x16xf16>, vector<8x16xf16> into vector<16x8xf16>
  vector.transfer_write %D0, %arg2[%c0, %c0, %c0] {in_bounds = [true, true]} : vector<16x8xf16>, memref<1x32x40xf16, #gpu.address_space<workgroup>>

  return
}

// -----

// CHECK-DAG: [[$strided_map:#.+]] = affine_map<()[s0] -> (s0 mod 16)>
// CHECK-DAG: [[$contiguous_map:#.+]] = affine_map<()[s0] -> ((s0 floordiv 16) * 8)>

#map0 = affine_map<(d0, d1, d2) -> (d2, d1)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d2)>
#map2 = affine_map<(d0, d1, d2) -> (d1, d2)>
#map3 = affine_map<(d0, d1, d2) -> (d0, d1)>

// CHECK-LABEL: func @batch_m16n8k16_fp16_row_row_row
func.func @batch_m16n8k16_fp16_row_row_row(%arg0: memref<2x20x20xf16, #gpu.address_space<workgroup>>, %arg1: memref<2x20x20xf16, #gpu.address_space<workgroup>>, %arg2: memref<2x20x20xf16, #gpu.address_space<workgroup>>) {
  %cst_0 = arith.constant dense<0.000000e+00> : vector<20x20xf16>
  // CHECK: [[C0:%.+]] = arith.constant 0 : index
  %c0 = arith.constant 0 : index
  %cst = arith.constant 0.000000e+00 : f16

  // CHECK-DAG: [[m_coord:%.+]] = affine.apply [[$strided_map]]
  // CHECK-DAG: [[k_coord:%.+]] = affine.apply [[$contiguous_map]]
  // CHECK: nvgpu.ldmatrix %arg0[[[C0]], [[m_coord]], [[k_coord]]] {numTiles = 4 : i32, transpose = false} : memref<2x20x20xf16, #gpu.address_space<workgroup>> -> vector<4x2xf16>
  %A = vector.transfer_read %arg0[%c0, %c0, %c0], %cst {in_bounds = [true, true]} : memref<2x20x20xf16, #gpu.address_space<workgroup>>, vector<16x16xf16>

  // CHECK-DAG: [[n_coord:%.+]] = affine.apply [[$contiguous_map]]
  // CHECK-DAG: [[k_coord:%.+]] = affine.apply [[$strided_map]]
  // CHECK: nvgpu.ldmatrix %arg1[[[C0]], [[k_coord]], [[n_coord]]] {numTiles = 2 : i32, transpose = true} : memref<2x20x20xf16, #gpu.address_space<workgroup>> -> vector<2x2xf16>
  %B = vector.transfer_read %arg1[%c0, %c0, %c0], %cst {permutation_map = #map0, in_bounds = [true, true]} : memref<2x20x20xf16, #gpu.address_space<workgroup>>, vector<8x16xf16>

  // CHECK-DAG: [[m_coord:%.+]] = affine.apply [[$strided_map]]
  // CHECK-DAG: [[n_coord:%.+]] = affine.apply [[$contiguous_map]]
  // CHECK: nvgpu.ldmatrix %arg2[[[C0]], [[m_coord]], [[n_coord]]] {numTiles = 2 : i32, transpose = false} : memref<2x20x20xf16, #gpu.address_space<workgroup>> -> vector<2x2xf16>
  %C = vector.transfer_read %arg2[%c0, %c0, %c0], %cst {in_bounds = [true, true]} : memref<2x20x20xf16, #gpu.address_space<workgroup>>, vector<16x8xf16>
  %D = vector.contract {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %A, %B, %C : vector<16x16xf16>, vector<8x16xf16> into vector<16x8xf16>
  vector.transfer_write %D, %arg2[%c0, %c0, %c0] {in_bounds = [true, true]} : vector<16x8xf16>, memref<2x20x20xf16, #gpu.address_space<workgroup>>
  return
}

// -----

//#########################################################
// FP16 row-col-row
//#########################################################

#map1 = affine_map<(d0, d1, d2) -> (d0, d2)>
#map2 = affine_map<(d0, d1, d2) -> (d1, d2)>
#map3 = affine_map<(d0, d1, d2) -> (d0, d1)>

// Affine maps for ldmatrix x4 tile of `16 x 16` f16 elements in `strided x contiguous` dimensions.
// CHECK: [[$strided_ldmatrix_x4_map:#.+]] = affine_map<()[s0] -> (s0 mod 16)>
// CHECK: [[$contiguous_ldmatrix_x4_map:#.+]] = affine_map<()[s0] -> ((s0 floordiv 16) * 8)>

// CHECK: [[$strided_ldmatrix_x2_map:#.+]] = affine_map<()[s0] -> (s0 mod 8)>
// CHECK: [[$contiguous_ldmatrix_x2_map:#.+]] = affine_map<()[s0] -> ((s0 floordiv 8) * 8)>
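
// Illustrative note (not part of the checked output): with B supplied in n x k
// (col-major) form, the mma.sync B operand needs no transpose, so A, B and C are
// all loaded with transpose = false; the x2 maps above distribute lanes over the
// 8-row B tile.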

// CHECK-LABEL: func @m16n8k16_fp16_row_col_row
func.func @m16n8k16_fp16_row_col_row(%arg0: memref<20x20xf16, #gpu.address_space<workgroup>>, %arg1: memref<20x20xf16, #gpu.address_space<workgroup>>, %arg2: memref<20x20xf16, #gpu.address_space<workgroup>>) {
  %cst_0 = arith.constant dense<0.000000e+00> : vector<16x8xf16>
  %c0 = arith.constant 0 : index

  %cst = arith.constant 0.000000e+00 : f16
  // CHECK-DAG: [[m_coord:%.+]] = affine.apply [[$strided_ldmatrix_x4_map]]
  // CHECK-DAG: [[k_coord:%.+]] = affine.apply [[$contiguous_ldmatrix_x4_map]]
  // CHECK: nvgpu.ldmatrix %arg0[[[m_coord]], [[k_coord]]] {numTiles = 4 : i32
  // CHECK-SAME: transpose = false

  // CHECK-DAG: [[n_coord:%.+]] = affine.apply [[$strided_ldmatrix_x2_map]]
  // CHECK-DAG: [[k_coord:%.+]] = affine.apply [[$contiguous_ldmatrix_x2_map]]
  // CHECK: nvgpu.ldmatrix %arg1[[[n_coord]], [[k_coord]]] {numTiles = 2 : i32
  // CHECK-SAME: transpose = false

  // CHECK-DAG: [[m_coord:%.+]] = affine.apply [[$strided_ldmatrix_x4_map]]
  // CHECK-DAG: [[n_coord:%.+]] = affine.apply [[$contiguous_ldmatrix_x4_map]]
  // CHECK: nvgpu.ldmatrix %arg2[[[m_coord]], [[n_coord]]] {numTiles = 2 : i32
  // CHECK-SAME: transpose = false
  %A = vector.transfer_read %arg0[%c0, %c0], %cst {in_bounds = [true, true]} : memref<20x20xf16, #gpu.address_space<workgroup>>, vector<16x16xf16>
  %B = vector.transfer_read %arg1[%c0, %c0], %cst {in_bounds = [true, true]} : memref<20x20xf16, #gpu.address_space<workgroup>>, vector<8x16xf16>
  %C = vector.transfer_read %arg2[%c0, %c0], %cst {in_bounds = [true, true]} : memref<20x20xf16, #gpu.address_space<workgroup>>, vector<16x8xf16>
  %D = vector.contract {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %A, %B, %C : vector<16x16xf16>, vector<8x16xf16> into vector<16x8xf16>
  vector.transfer_write %D, %arg2[%c0, %c0] {in_bounds = [true, true]} : vector<16x8xf16>, memref<20x20xf16, #gpu.address_space<workgroup>>
  return
}

// -----

//#########################################################
// TF32 row-row-row
//#########################################################

#map0 = affine_map<(d0, d1) -> (d1, d0)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d2)>
#map2 = affine_map<(d0, d1, d2) -> (d1, d2)>
#map3 = affine_map<(d0, d1, d2) -> (d0, d1)>

// CHECK-DAG: [[$rowA_map:#.+]] = affine_map<()[s0] -> (s0 mod 16 + 1)>
// CHECK-DAG: [[$colA_map:#.+]] = affine_map<()[s0] -> ((s0 floordiv 16) * 4 + 3)>

// CHECK-DAG: [[$rowB_map:#.+]] = affine_map<()[s0] -> (s0 mod 4 + 3)>
// CHECK-DAG: [[$colB_map:#.+]] = affine_map<()[s0] -> (s0 floordiv 4 + 3)>

// CHECK-DAG: [[$rowC_map:#.+]] = affine_map<()[s0] -> (s0 floordiv 4)>
// CHECK-DAG: [[$rowC8_map:#.+]] = affine_map<()[s0] -> (s0 floordiv 4 + 8)>
// CHECK-DAG: [[$colC_map:#.+]] = affine_map<()[s0] -> (s0 * 2 - (s0 floordiv 4) * 8)>
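
// Illustrative note (not part of the checked output): for m16n8k4 tf32 each lane
// holds 16x4 / 32 = 2 elements of A (loaded with ldmatrix x2) and 8x4 / 32 = 1
// element of B, which is gathered with a scalar memref.load and vector.insert
// into a vector<1x1xf32> fragment.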

// CHECK-LABEL: func @m16n8k4_tf32_f32_row_row_row
func.func @m16n8k4_tf32_f32_row_row_row(%arg0: memref<20x20xf32, #gpu.address_space<workgroup>>, %arg1: memref<20x20xf32, #gpu.address_space<workgroup>>, %arg2: memref<20x20xf32>) {
  %cst_0 = arith.constant dense<0.000000e+00> : vector<16x8xf32>
  %c0 = arith.constant 0 : index
  %c1 = arith.constant 1 : index
  %c3 = arith.constant 3 : index
  %cst = arith.constant 0.000000e+00 : f32

  // CHECK: [[c_frag:%.+]] = arith.constant {{.*}} : vector<2x2xf32>

  // CHECK-DAG: [[row:%.+]] = affine.apply [[$rowA_map]]
  // CHECK-DAG: [[col:%.+]] = affine.apply [[$colA_map]]
  // CHECK: [[a_frag:%.+]] = nvgpu.ldmatrix %arg0[[[row]], [[col]]] {numTiles = 2 : i32, transpose = false}

  // b and c are not loaded by ldmatrix in this test.
  // CHECK-NOT: nvgpu.ldmatrix

  // CHECK-DAG: [[row:%.+]] = affine.apply [[$rowB_map]]
  // CHECK-DAG: [[col:%.+]] = affine.apply [[$colB_map]]
  // CHECK: [[b_el:%.+]] = memref.load {{%.+}} : memref<20x20xf32, #gpu.address_space<workgroup>>
  // CHECK: [[b_frag:%.+]] = vector.insert [[b_el]], {{.*}} : f32 into vector<1x1xf32>

  // CHECK: [[d_frag:%.+]] = nvgpu.mma.sync([[a_frag]], [[b_frag]], [[c_frag]])
  // CHECK-SAME: mmaShape = [16, 8, 4]
  // CHECK-SAME: -> vector<2x2xf32>
  %A = vector.transfer_read %arg0[%c1, %c3], %cst {in_bounds = [true, true]} : memref<20x20xf32, #gpu.address_space<workgroup>>, vector<16x4xf32>
  %B = vector.transfer_read %arg1[%c3, %c3], %cst {permutation_map = #map0, in_bounds = [true, true]} : memref<20x20xf32, #gpu.address_space<workgroup>>, vector<8x4xf32>
  %D = vector.contract {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %A, %B, %cst_0 : vector<16x4xf32>, vector<8x4xf32> into vector<16x8xf32>

  // CHECK: vector.extract [[d_frag]][0] : vector<2xf32> from vector<2x2xf32>
  // CHECK: affine.apply [[$rowC_map]]
  // CHECK: affine.apply [[$colC_map]]
  // CHECK: vector.store
  // CHECK: vector.extract [[d_frag]][1] : vector<2xf32> from vector<2x2xf32>
  // CHECK: affine.apply [[$rowC8_map]]
  // CHECK: affine.apply [[$colC_map]]
  // CHECK: vector.store
  vector.transfer_write %D, %arg2[%c0, %c0] {in_bounds = [true, true]} : vector<16x8xf32>, memref<20x20xf32>
  return
}

// -----

//#########################################################
// TF32 row-row-row
//#########################################################
#map0 = affine_map<(d0, d1) -> (d1, d0)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d2)>
#map2 = affine_map<(d0, d1, d2) -> (d1, d2)>
#map3 = affine_map<(d0, d1, d2) -> (d0, d1)>

// CHECK-DAG: [[$rowA_map:#.+]] = affine_map<()[s0] -> (s0 mod 16 + 1)>
// CHECK-DAG: [[$colA_map:#.+]] = affine_map<()[s0] -> ((s0 floordiv 16) * 4 + 3)>

// CHECK-DAG: [[$rowB_map:#.+]] = affine_map<()[s0] -> (s0 mod 4 + 3)>
// CHECK-DAG: [[$colB_map:#.+]] = affine_map<()[s0] -> (s0 floordiv 4 + 3)>

// CHECK-DAG: [[$rowC_map:#.+]] = affine_map<()[s0] -> (s0 floordiv 4)>
// CHECK-DAG: [[$rowC8_map:#.+]] = affine_map<()[s0] -> (s0 floordiv 4 + 8)>
// CHECK-DAG: [[$colC_map:#.+]] = affine_map<()[s0] -> (s0 * 2 - (s0 floordiv 4) * 8)>
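
// Illustrative note (not part of the checked output): compared to m16n8k4 above,
// the k=8 variant doubles the per-lane operand sizes: A is loaded with ldmatrix
// x4, and B needs two scalar memref.load + vector.insert pairs into a
// vector<2x1xf32> fragment.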

// CHECK-LABEL: func @m16n8k8_tf32_f32_row_row_row
func.func @m16n8k8_tf32_f32_row_row_row(%arg0: memref<20x20xf32, #gpu.address_space<workgroup>>, %arg1: memref<20x20xf32, #gpu.address_space<workgroup>>, %arg2: memref<20x20xf32>) {
  %cst_0 = arith.constant dense<0.000000e+00> : vector<16x8xf32>
  %c0 = arith.constant 0 : index
  %c1 = arith.constant 1 : index
  %c3 = arith.constant 3 : index
  %cst = arith.constant 0.000000e+00 : f32

  // CHECK: [[c_frag:%.+]] = arith.constant {{.*}} : vector<2x2xf32>

  // CHECK-DAG: [[row:%.+]] = affine.apply [[$rowA_map]]
  // CHECK-DAG: [[col:%.+]] = affine.apply [[$colA_map]]
  // CHECK: [[a_frag:%.+]] = nvgpu.ldmatrix %arg0[[[row]], [[col]]] {numTiles = 4 : i32, transpose = false}

  // b and c are not loaded by ldmatrix in this test.
  // CHECK-NOT: nvgpu.ldmatrix

  // CHECK-DAG: [[row:%.+]] = affine.apply [[$rowB_map]]
  // CHECK-DAG: [[col:%.+]] = affine.apply [[$colB_map]]
  // CHECK: [[b_el0:%.+]] = memref.load {{%.+}} : memref<20x20xf32, #gpu.address_space<workgroup>>
  // CHECK: [[b_frag0:%.+]] = vector.insert [[b_el0]], {{.*}} : f32 into vector<2x1xf32>
  // CHECK: [[b_el1:%.+]] = memref.load {{%.+}} : memref<20x20xf32, #gpu.address_space<workgroup>>
  // CHECK: [[b_frag1:%.+]] = vector.insert [[b_el1]], {{.*}} : f32 into vector<2x1xf32>

  // CHECK: [[d_frag:%.+]] = nvgpu.mma.sync([[a_frag]], [[b_frag1]], [[c_frag]])
  // CHECK-SAME: mmaShape = [16, 8, 8]
  // CHECK-SAME: -> vector<2x2xf32>
  %A = vector.transfer_read %arg0[%c1, %c3], %cst {in_bounds = [true, true]} : memref<20x20xf32, #gpu.address_space<workgroup>>, vector<16x8xf32>
  %B = vector.transfer_read %arg1[%c3, %c3], %cst {permutation_map = #map0, in_bounds = [true, true]} : memref<20x20xf32, #gpu.address_space<workgroup>>, vector<8x8xf32>
  %D = vector.contract {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %A, %B, %cst_0 : vector<16x8xf32>, vector<8x8xf32> into vector<16x8xf32>

  // CHECK: vector.extract [[d_frag]][0] : vector<2xf32> from vector<2x2xf32>
  // CHECK: affine.apply [[$rowC_map]]
  // CHECK: affine.apply [[$colC_map]]
  // CHECK: vector.store
  // CHECK: vector.extract [[d_frag]][1] : vector<2xf32> from vector<2x2xf32>
  // CHECK: affine.apply [[$rowC8_map]]
  // CHECK: affine.apply [[$colC_map]]
  // CHECK: vector.store
  vector.transfer_write %D, %arg2[%c0, %c0] {in_bounds = [true, true]} : vector<16x8xf32>, memref<20x20xf32>
  return
}

// -----

//#########################################################
// TF32 col-col-row
//#########################################################
#map0 = affine_map<(d0, d1) -> (d1, d0)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d2)>
#map2 = affine_map<(d0, d1, d2) -> (d1, d2)>
#map3 = affine_map<(d0, d1, d2) -> (d0, d1)>

// CHECK-DAG: [[$rowA0_map:#.+]] = affine_map<()[s0] -> (s0 floordiv 4)>
// CHECK-DAG: [[$colA0_map:#.+]] = affine_map<()[s0] -> (s0 mod 4)>
// CHECK-DAG: [[$rowA8_map:#.+]] = affine_map<()[s0] -> (s0 floordiv 4 + 8)>
// CHECK-DAG: [[$colA4_map:#.+]] = affine_map<()[s0] -> (s0 mod 4 + 4)>

// CHECK-DAG: [[$rowB0_map:#.+]] = affine_map<()[s0] -> (s0 mod 8)>
// CHECK-DAG: [[$colB0_map:#.+]] = affine_map<()[s0] -> ((s0 floordiv 8) * 4)>

// CHECK-DAG: [[$rowC_map:#.+]] = affine_map<()[s0] -> (s0 floordiv 4 + 16)>
// CHECK-DAG: [[$rowC8_map:#.+]] = affine_map<()[s0] -> (s0 floordiv 4 + 24)>
// CHECK-DAG: [[$colC_map:#.+]] = affine_map<()[s0] -> (s0 * 2 - (s0 floordiv 4) * 8 + 8)>
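
// Illustrative note (not part of the checked output): a col-major A operand would
// need a transposed load, and ldmatrix only transposes at 16-bit granularity, so
// the four f32 values per lane are assembled with scalar loads and vector.insert
// into a vector<4x1xf32>; B is fetched with ldmatrix (transpose = false).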

// CHECK-LABEL: func @m16n8k8_tf32_f32_col_col_row
func.func @m16n8k8_tf32_f32_col_col_row(%arg0: memref<20x20xf32, #gpu.address_space<workgroup>>, %arg1: memref<20x20xf32, #gpu.address_space<workgroup>>, %arg2: memref<20x20xf32>) {
  %cst_0 = arith.constant dense<0.000000e+00> : vector<16x8xf32>
  %c0 = arith.constant 0 : index
  %c16 = arith.constant 16 : index
  %c8 = arith.constant 8 : index
  %cst = arith.constant 0.000000e+00 : f32

  // CHECK: [[c_frag:%.+]] = arith.constant {{.*}} : vector<2x2xf32>

  // CHECK-DAG: [[row:%.+]] = affine.apply [[$rowA0_map]]
  // CHECK-DAG: [[col:%.+]] = affine.apply [[$colA0_map]]
  // CHECK: [[a_el0:%.+]] = memref.load {{%.+}} : memref<20x20xf32, #gpu.address_space<workgroup>>
  // CHECK: [[a_frag0:%.+]] = vector.insert [[a_el0]], {{.*}} [0, 0] : f32 into vector<4x1xf32>

  // CHECK-DAG: [[row:%.+]] = affine.apply [[$rowA8_map]]
  // CHECK-DAG: [[col:%.+]] = affine.apply [[$colA0_map]]
  // CHECK: [[a_el0:%.+]] = memref.load {{%.+}} : memref<20x20xf32, #gpu.address_space<workgroup>>
  // CHECK: [[a_frag0:%.+]] = vector.insert [[a_el0]], {{.*}} [1, 0] : f32 into vector<4x1xf32>

  // CHECK: [[a_el:%.+]] = memref.load {{%.+}} : memref<20x20xf32, #gpu.address_space<workgroup>>
  // CHECK: [[a_frag:%.+]] = vector.insert [[a_el]], {{.*}} [2, 0] : f32 into vector<4x1xf32>
  // CHECK: [[a_el:%.+]] = memref.load {{%.+}} : memref<20x20xf32, #gpu.address_space<workgroup>>
  // CHECK: [[a_frag:%.+]] = vector.insert [[a_el]], {{.*}} [3, 0] : f32 into vector<4x1xf32>

  // CHECK-DAG: [[row:%.+]] = affine.apply [[$rowB0_map]]
  // CHECK-DAG: [[col:%.+]] = affine.apply [[$colB0_map]]
  // CHECK: [[b_frag:%.+]] = nvgpu.ldmatrix %arg1[[[row]], [[col]]] {numTiles = 2 : i32, transpose = false}

  // CHECK: [[d_frag:%.+]] = nvgpu.mma.sync([[a_frag]], [[b_frag]], [[c_frag]])
  // CHECK-SAME: mmaShape = [16, 8, 8]
  // CHECK-SAME: -> vector<2x2xf32>
  %A = vector.transfer_read %arg0[%c0, %c0], %cst {in_bounds = [true, true], permutation_map = #map0} : memref<20x20xf32, #gpu.address_space<workgroup>>, vector<16x8xf32>
  %B = vector.transfer_read %arg1[%c0, %c0], %cst {in_bounds = [true, true]} : memref<20x20xf32, #gpu.address_space<workgroup>>, vector<8x8xf32>
  %D = vector.contract {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "reduction"],
    kind = #vector.kind<add>} %A, %B, %cst_0 : vector<16x8xf32>, vector<8x8xf32> into vector<16x8xf32>

  // CHECK: vector.extract [[d_frag]][0] : vector<2xf32> from vector<2x2xf32>
  // CHECK: affine.apply [[$rowC_map]]
  // CHECK: affine.apply [[$colC_map]]
  // CHECK: vector.store
  // CHECK: vector.extract [[d_frag]][1] : vector<2xf32> from vector<2x2xf32>
  // CHECK: affine.apply [[$rowC8_map]]
  // CHECK: affine.apply [[$colC_map]]
  // CHECK: vector.store
  vector.transfer_write %D, %arg2[%c16, %c8] {in_bounds = [true, true]} : vector<16x8xf32>, memref<20x20xf32>
  return
}

// -----

//#########################################################
// INT4 row-col-row
//#########################################################
// Affine maps for loading operandA and operandB.
// Each map takes a lane id to the coordinate addressed by that lane in the ldmatrix operand tile.
// CHECK-DAG: [[$strided_ldmatrix_x4_map:#.+]] = affine_map<()[s0] -> (s0 mod 16)>
// CHECK-DAG: [[$contiguous_ldmatrix_x4_map:#.+]] = affine_map<()[s0] -> ((s0 floordiv 16) * 32)>
// CHECK-DAG: [[$strided_ldmatrix_x2_map:#.+]] = affine_map<()[s0] -> (s0 mod 8)>
// CHECK-DAG: [[$contiguous_ldmatrix_x2_map:#.+]] = affine_map<()[s0] -> ((s0 floordiv 8) * 32)>

// Affine maps for accumulator registers.
// Each map takes a lane id to the coordinate owned by that lane in the accumulator register tile.
// CHECK-DAG: [[$rowC0_map:#.+]] = affine_map<()[s0] -> (s0 floordiv 4)>
// CHECK-DAG: [[$colC0_map:#.+]] = affine_map<()[s0] -> (s0 * 2 - (s0 floordiv 4) * 8
// CHECK-DAG: [[$rowC8_map:#.+]] = affine_map<()[s0] -> (s0 floordiv 4 + 8)>
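
// Illustrative note (not part of the checked output): for m16n8k64 i4 each lane
// holds 16x64 / 32 = 32 elements of A (vector<4x8xi4>) and 8x64 / 32 = 16
// elements of B (vector<2x8xi4>); the contiguous maps step by 32 i4 elements,
// i.e. 16 bytes, per lane group.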

#map1 = affine_map<(d0, d1, d2) -> (d0, d2)>
#map2 = affine_map<(d0, d1, d2) -> (d1, d2)>
#map3 = affine_map<(d0, d1, d2) -> (d0, d1)>

// CHECK-LABEL: func @m16n8k64_int4_row_col_row
func.func @m16n8k64_int4_row_col_row(%arg0: memref<128x128xi4, #gpu.address_space<workgroup>>, %arg1: memref<128x128xi4, #gpu.address_space<workgroup>>, %arg2: memref<128x128xi32>) {
  %cst  = arith.constant 0 : i4
  %cst0  = arith.constant 0 : i32
  %cst_0 = arith.constant dense<0> : vector<32x8xi4>
  %c0 = arith.constant 0 : index

  // CHECK: [[lane:%.+]] = gpu.lane_id
  // CHECK: [[m_coord:%.+]] = affine.apply [[$strided_ldmatrix_x4_map]]()[[[lane]]]
  // CHECK: [[k_coord:%.+]] = affine.apply [[$contiguous_ldmatrix_x4_map]]()[[[lane]]]
  // CHECK: nvgpu.ldmatrix %arg0[[[m_coord]], [[k_coord]]] {numTiles = 4 : i32, transpose = false} : memref<128x128xi4, #gpu.address_space<workgroup>> -> vector<4x8xi4>

  // CHECK: [[lane:%.+]] = gpu.lane_id
  // CHECK: [[n_coord:%.+]] = affine.apply [[$strided_ldmatrix_x2_map]]()[[[lane]]]
  // CHECK: [[k_coord:%.+]] = affine.apply [[$contiguous_ldmatrix_x2_map]]()[[[lane]]]
  // CHECK: nvgpu.ldmatrix %arg1[[[n_coord]], [[k_coord]]] {numTiles = 2 : i32, transpose = false} : memref<128x128xi4, #gpu.address_space<workgroup>> -> vector<2x8xi4>

  // CHECK: [[lane:%.+]] = gpu.lane_id
  // CHECK: [[row:%.+]] = affine.apply [[$rowC0_map]]()[{{%.+}}]
  // CHECK: [[col:%.+]] = affine.apply [[$colC0_map]]()[{{%.+}}]
  // CHECK: vector.load %arg2[[[row]], [[col]]] : memref<128x128xi32>, vector<2xi32>

  // CHECK: [[row:%.+]] = affine.apply [[$rowC8_map]]()[{{%.+}}]
  // CHECK: [[col:%.+]] = affine.apply [[$colC0_map]]()[{{%.+}}]
  // CHECK: vector.load %arg2[[[row]], [[col]]] : memref<128x128xi32>, vector<2xi32>
  // CHECK-NOT: vector.load

  %A = vector.transfer_read %arg0[%c0, %c0], %cst {in_bounds = [true, true]} : memref<128x128xi4, #gpu.address_space<workgroup>>, vector<16x64xi4>
  %B = vector.transfer_read %arg1[%c0, %c0], %cst {in_bounds = [true, true]} : memref<128x128xi4, #gpu.address_space<workgroup>>, vector<8x64xi4>
  %C = vector.transfer_read %arg2[%c0, %c0], %cst0 {in_bounds = [true, true]} : memref<128x128xi32>, vector<16x8xi32>
  // CHECK: [[d:%.+]] = nvgpu.mma.sync({{.*}}) {mmaShape = [16, 8, 64]} : (vector<4x8xi4>, vector<2x8xi4>, vector<2x2xi32>) -> vector<2x2xi32>
  %D = vector.contract {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %A, %B, %C : vector<16x64xi4>, vector<8x64xi4> into vector<16x8xi32>

  // CHECK: [[lane:%.+]] = gpu.lane_id
  // CHECK: [[v:%.+]] = vector.extract [[d]][0] : vector<2xi32> from vector<2x2xi32>
  // CHECK: [[row:%.+]] = affine.apply [[$rowC0_map]]()[[[lane]]]
  // CHECK: [[col:%.+]] = affine.apply [[$colC0_map]]()[[[lane]]]
  // CHECK: vector.store [[v]], %arg2[[[row]], [[col]]] : memref<128x128xi32>, vector<2xi32>

  // CHECK: [[v:%.+]] = vector.extract [[d]][1] : vector<2xi32> from vector<2x2xi32>
  // CHECK: [[row:%.+]] = affine.apply [[$rowC8_map]]()[[[lane]]]
  // CHECK: [[col:%.+]] = affine.apply [[$colC0_map]]()[[[lane]]]
  // CHECK: vector.store [[v]], %arg2[[[row]], [[col]]] : memref<128x128xi32>, vector<2xi32>
  vector.transfer_write %D, %arg2[%c0, %c0] {in_bounds = [true, true]} : vector<16x8xi32>, memref<128x128xi32>
  return
}

// -----

//#########################################################
// INT8 row-col-row
//#########################################################
// Affine maps for loading operandA and operandB.
// Each map takes a lane id to the coordinate addressed by that lane in the ldmatrix operand tile.
// CHECK-DAG: [[$strided_ldmatrix_x4_map:#.+]] = affine_map<()[s0] -> (s0 mod 16)>
// CHECK-DAG: [[$contiguous_ldmatrix_x4_map:#.+]] = affine_map<()[s0] -> ((s0 floordiv 16) * 16)>
// CHECK-DAG: [[$strided_ldmatrix_x2_map:#.+]] = affine_map<()[s0] -> (s0 mod 8)>
// CHECK-DAG: [[$contiguous_ldmatrix_x2_map:#.+]] = affine_map<()[s0] -> ((s0 floordiv 8) * 16)>

// Affine maps for accumulator registers.
// Each map takes a lane id to the coordinate owned by that lane in the accumulator register tile.
// CHECK-DAG: [[$rowC0_map:#.+]] = affine_map<()[s0] -> (s0 floordiv 4)>
// CHECK-DAG: [[$colC0_map:#.+]] = affine_map<()[s0] -> (s0 * 2 - (s0 floordiv 4) * 8)>
// CHECK-DAG: [[$rowC8_map:#.+]] = affine_map<()[s0] -> (s0 floordiv 4 + 8)>
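
// Illustrative note (not part of the checked output): this is the row-col variant
// of the first i8 test; because B is already stored as n x k, both operands can
// be fetched with ldmatrix and transpose = false, and no per-element scalar loads
// are needed for B.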


#map0 = affine_map<(d0, d1) -> (d1, d0)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d2)>
#map2 = affine_map<(d0, d1, d2) -> (d1, d2)>
#map3 = affine_map<(d0, d1, d2) -> (d0, d1)>

// CHECK-LABEL: func @m16n8k32_int8_row_col_row
func.func @m16n8k32_int8_row_col_row(%arg0: memref<128x128xi8, #gpu.address_space<workgroup>>, %arg1: memref<128x128xi8, #gpu.address_space<workgroup>>, %arg2: memref<128x128xi32>) {
  %cst_0 = arith.constant dense<0> : vector<32x8xi8>
  %c0 = arith.constant 0 : index
  %cst = arith.constant 0 : i8
  %cst0 = arith.constant 0 : i32

  // CHECK: [[lane:%.+]] = gpu.lane_id
  // CHECK: [[m_coord:%.+]] = affine.apply [[$strided_ldmatrix_x4_map]]()[[[lane]]]
  // CHECK: [[k_coord:%.+]] = affine.apply [[$contiguous_ldmatrix_x4_map]]()[[[lane]]]
  // CHECK: nvgpu.ldmatrix %arg0[[[m_coord]], [[k_coord]]] {numTiles = 4 : i32, transpose = false} : memref<128x128xi8, #gpu.address_space<workgroup>> -> vector<4x4xi8>

  // CHECK: [[lane:%.+]] = gpu.lane_id
  // CHECK: [[n_coord:%.+]] = affine.apply [[$strided_ldmatrix_x2_map]]()[[[lane]]]
  // CHECK: [[k_coord:%.+]] = affine.apply [[$contiguous_ldmatrix_x2_map]]()[[[lane]]]
  // CHECK: nvgpu.ldmatrix %arg1[[[n_coord]], [[k_coord]]] {numTiles = 2 : i32, transpose = false} : memref<128x128xi8, #gpu.address_space<workgroup>> -> vector<2x4xi8>

  // CHECK: [[lane:%.+]] = gpu.lane_id
  // CHECK: [[m_coord:%.+]] = affine.apply [[$rowC0_map]]()[[[lane]]]
  // CHECK: [[n_coord:%.+]] = affine.apply [[$colC0_map]]()[[[lane]]]
  // CHECK: vector.load %arg2[[[m_coord]], [[n_coord]]] : memref<128x128xi32>, vector<2xi32>
  // CHECK: [[m_coord:%.+]] = affine.apply [[$rowC8_map]]()[[[lane]]]
  // CHECK: [[n_coord:%.+]] = affine.apply [[$colC0_map]]()[[[lane]]]
  // CHECK: vector.load %arg2[[[m_coord]], [[n_coord]]] : memref<128x128xi32>, vector<2xi32>
  // CHECK-NOT: vector.load %arg2{{.*}}

  %A = vector.transfer_read %arg0[%c0, %c0], %cst {in_bounds = [true, true]} : memref<128x128xi8, #gpu.address_space<workgroup>>, vector<16x32xi8>
  %B = vector.transfer_read %arg1[%c0, %c0], %cst {in_bounds = [true, true]} : memref<128x128xi8, #gpu.address_space<workgroup>>, vector<8x32xi8>
  %C = vector.transfer_read %arg2[%c0, %c0], %cst0 {in_bounds = [true, true]} : memref<128x128xi32>, vector<16x8xi32>
  // CHECK: [[d:%.+]] = nvgpu.mma.sync({{.*}}) {mmaShape = [16, 8, 32]} : (vector<4x4xi8>, vector<2x4xi8>, vector<2x2xi32>) -> vector<2x2xi32>
  %D = vector.contract {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %A, %B, %C : vector<16x32xi8>, vector<8x32xi8> into vector<16x8xi32>

  // CHECK: [[lane:%.+]] = gpu.lane_id
  // CHECK: [[v:%.+]] = vector.extract [[d]][0] : vector<2xi32> from vector<2x2xi32>
  // CHECK: [[row:%.+]] = affine.apply [[$rowC0_map]]()[[[lane]]]
  // CHECK: [[col:%.+]] = affine.apply [[$colC0_map]]()[[[lane]]]
  // CHECK: vector.store [[v]], %arg2[[[row]], [[col]]] : memref<128x128xi32>, vector<2xi32>
  // CHECK: [[v:%.+]] = vector.extract [[d]][1] : vector<2xi32> from vector<2x2xi32>
  // CHECK: [[row:%.+]] = affine.apply [[$rowC8_map]]()[[[lane]]]
  // CHECK: [[col:%.+]] = affine.apply [[$colC0_map]]()[[[lane]]]
  // CHECK: vector.store [[v]], %arg2[[[row]], [[col]]] : memref<128x128xi32>, vector<2xi32>
  vector.transfer_write %D, %arg2[%c0, %c0] {in_bounds = [true, true]} : vector<16x8xi32>, memref<128x128xi32>
  return
}

// -----


#map0 = affine_map<(d0, d1) -> (d1, d0)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d2)>
#map2 = affine_map<(d0, d1, d2) -> (d1, d2)>
#map3 = affine_map<(d0, d1, d2) -> (d0, d1)>
!smem_type = memref<20x20xf16, strided<[?, 1], offset: ?>, #gpu.address_space<workgroup>>

// This test case is identical to the m16n8k16 test case, but it verifies that a
// row dimension with unknown stride is handled correctly.

// CHECK-DAG: [[$strided_map:#.+]] = affine_map<()[s0] -> (s0 mod 16)>
// CHECK-DAG: [[$contiguous_map:#.+]] = affine_map<()[s0] -> ((s0 floordiv 16) * 8)>
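
// Illustrative note (not part of the checked output): a strided<[?, 1]> layout
// (dynamic leading stride, unit innermost stride) still lowers to ldmatrix,
// suggesting only the innermost dimension needs to be contiguous; the fully
// dynamic strided<[?, ?]> case further below is left untouched.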
// CHECK-LABEL: func @strided_memref_read_write
func.func @strided_memref_read_write(%arg0: !smem_type,
                                     %arg1: !smem_type,
                                     %arg2: !smem_type) {
  %cst_0 = arith.constant dense<0.000000e+00> : vector<16x8xf16>
  %c0 = arith.constant 0 : index
  %cst = arith.constant 0.000000e+00 : f16

  // CHECK-DAG: [[m_coord:%.+]] = affine.apply [[$strided_map]]
  // CHECK-DAG: [[k_coord:%.+]] = affine.apply [[$contiguous_map]]
  // CHECK: nvgpu.ldmatrix %arg0[[[m_coord]], [[k_coord]]] {numTiles = 4 : i32, transpose = false}
  // CHECK-DAG: [[n_coord:%.+]] = affine.apply [[$contiguous_map]]
  // CHECK-DAG: [[k_coord:%.+]] = affine.apply [[$strided_map]]
  // CHECK: nvgpu.ldmatrix %arg1[[[k_coord]], [[n_coord]]] {numTiles = 2 : i32, transpose = true}
  %A = vector.transfer_read %arg0[%c0, %c0], %cst {in_bounds = [true, true]} : !smem_type, vector<16x16xf16>
  %B = vector.transfer_read %arg1[%c0, %c0], %cst {permutation_map = #map0, in_bounds = [true, true]} : !smem_type, vector<8x16xf16>
  %C = vector.transfer_read %arg2[%c0, %c0], %cst {in_bounds = [true, true]} : !smem_type, vector<16x8xf16>
  %D = vector.contract {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>}
    %A, %B, %C : vector<16x16xf16>, vector<8x16xf16> into vector<16x8xf16>
  vector.transfer_write %D, %arg2[%c0, %c0] {in_bounds = [true, true]} : vector<16x8xf16>, !smem_type
  return
}

// -----


#map0 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>
#map2 = affine_map<(d0, d1, d2, d3) -> (d2, d0, d3)>
#map3 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>
!smem_type = memref<20x20x20xf16, strided<[?, ?, 1], offset: ?>, #gpu.address_space<workgroup>>

// CHECK-LABEL: func @unsupported_non_2d_load_store
func.func @unsupported_non_2d_load_store(%arg0: !smem_type,
                           %arg1: !smem_type,
                           %arg2: !smem_type) {
  %cst_0 = arith.constant dense<0.000000e+00> : vector<16x8xf16>
  %c0 = arith.constant 0 : index
  %cst = arith.constant 0.000000e+00 : f16

  // CHECK-NOT: nvgpu.ldmatrix
  // CHECK-NOT: nvgpu.mma
  %A = vector.transfer_read %arg0[%c0, %c0, %c0], %cst {in_bounds = [true, true, true]} : !smem_type, vector<1x16x16xf16>
  %B = vector.transfer_read %arg1[%c0, %c0, %c0], %cst {permutation_map = #map0, in_bounds = [true, true, true]} : !smem_type, vector<8x1x16xf16>
  %C = vector.transfer_read %arg2[%c0, %c0, %c0], %cst {in_bounds = [true, true, true]} : !smem_type, vector<1x16x8xf16>
  %D = vector.contract {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "reduction"], kind = #vector.kind<add>}
    %A, %B, %C : vector<1x16x16xf16>, vector<8x1x16xf16> into vector<1x16x8xf16>
  vector.transfer_write %D, %arg2[%c0, %c0, %c0] {in_bounds = [true, true, true]} : vector<1x16x8xf16>, !smem_type
  return
}

// -----

#map0 = affine_map<(d0, d1) -> (d1, d0)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d2)>
#map2 = affine_map<(d0, d1, d2) -> (d1, d2)>
#map3 = affine_map<(d0, d1, d2) -> (d0, d1)>

!smem_type = memref<20x20xf16, strided<[?, ?], offset: ?>, #gpu.address_space<workgroup>>

// CHECK-LABEL: func @unsupported_fully_dynamic_strides
func.func @unsupported_fully_dynamic_strides(%arg0: !smem_type,
                                         %arg1: !smem_type,
                                         %arg2: !smem_type) {
  %cst_0 = arith.constant dense<0.000000e+00> : vector<16x8xf16>
  %c0 = arith.constant 0 : index
  %cst = arith.constant 0.000000e+00 : f16

  // CHECK-NOT: nvgpu.ldmatrix
  // CHECK-NOT: nvgpu.mma
  %A = vector.transfer_read %arg0[%c0, %c0], %cst {in_bounds = [true, true]} : !smem_type, vector<16x16xf16>
  %B = vector.transfer_read %arg1[%c0, %c0], %cst {permutation_map = #map0, in_bounds = [true, true]} : !smem_type, vector<8x16xf16>
  %C = vector.transfer_read %arg2[%c0, %c0], %cst {in_bounds = [true, true]} : !smem_type, vector<16x8xf16>
  %D = vector.contract {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>}
    %A, %B, %C : vector<16x16xf16>, vector<8x16xf16> into vector<16x8xf16>
  vector.transfer_write %D, %arg2[%c0, %c0] {in_bounds = [true, true]} : vector<16x8xf16>, !smem_type
  return
}

// -----

#map0 = affine_map<(d0, d1) -> (d1, d0)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d2)>
#map2 = affine_map<(d0, d1, d2) -> (d1, d2)>
#map3 = affine_map<(d0, d1, d2) -> (d0, d1)>


!smem_type = memref<20x20xf16, strided<[?, 1], offset: ?>, #gpu.address_space<workgroup>>

// CHECK-LABEL: func @unsupported_transposed_store
func.func @unsupported_transposed_store(%arg0: !smem_type,
                            %arg1: !smem_type,
                            %arg2: !smem_type) {
  %cst_0 = arith.constant dense<0.000000e+00> : vector<16x8xf16>
  %c0 = arith.constant 0 : index
  %cst = arith.constant 0.000000e+00 : f16

  // CHECK-NOT: nvgpu.ldmatrix
  // CHECK-NOT: nvgpu.mma
  %A = vector.transfer_read %arg0[%c0, %c0], %cst {in_bounds = [true, true]} : !smem_type, vector<16x16xf16>
  %B = vector.transfer_read %arg1[%c0, %c0], %cst {permutation_map = #map0, in_bounds = [true, true]} : !smem_type, vector<8x16xf16>
  %C = vector.transfer_read %arg2[%c0, %c0], %cst {in_bounds = [true, true]} : !smem_type, vector<16x8xf16>
  %D = vector.contract {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>}
    %A, %B, %C : vector<16x16xf16>, vector<8x16xf16> into vector<16x8xf16>
  vector.transfer_write %D, %arg2[%c0, %c0] {in_bounds = [true, true], permutation_map = affine_map<(d0, d1)->(d1, d0)>} : vector<16x8xf16>, !smem_type
  return
}