// RUN: mlir-opt %s -split-input-file -pass-pipeline="builtin.module(func.func(convert-vector-to-gpu{use-nvgpu=true}))" | FileCheck %s

//#########################################################
// INT8 row-row-row
//#########################################################

// CHECK-DAG: [[$strided_map:#.+]] = affine_map<()[s0] -> (s0 mod 16)>
// CHECK-DAG: [[$contiguous_map:#.+]] = affine_map<()[s0] -> ((s0 floordiv 16) * 16)>

// CHECK-DAG: [[$rowB0_map:#.+]] = affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 4) * 16 + 39)>
// CHECK-DAG: [[$colB0_map:#.+]] = affine_map<()[s0] -> (s0 floordiv 4 + 40)>
// CHECK-DAG: [[$rowB1_map:#.+]] = affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 4) * 16 + 40)>
// CHECK-DAG: [[$rowB2_map:#.+]] = affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 4) * 16 + 41)>
// CHECK-DAG: [[$rowB3_map:#.+]] = affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 4) * 16 + 42)>
// CHECK-DAG: [[$rowB4_map:#.+]] = affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 4) * 16 + 55)>
// CHECK-DAG: [[$rowB5_map:#.+]] = affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 4) * 16 + 56)>
// CHECK-DAG: [[$rowB6_map:#.+]] = affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 4) * 16 + 57)>
// CHECK-DAG: [[$rowB7_map:#.+]] = affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 4) * 16 + 58)>

// CHECK-DAG: [[$rowC0_map:#.+]] = affine_map<()[s0] -> (s0 floordiv 4 + 49)>
// CHECK-DAG: [[$colC0_map:#.+]] = affine_map<()[s0] -> (s0 * 2 - (s0 floordiv 4) * 8 + 40)>
// CHECK-DAG: [[$rowC8_map:#.+]] = affine_map<()[s0] -> (s0 floordiv 4 + 57)>


#map0 = affine_map<(d0, d1) -> (d1, d0)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d2)>
#map2 = affine_map<(d0, d1, d2) -> (d1, d2)>
#map3 = affine_map<(d0, d1, d2) -> (d0, d1)>

// CHECK-LABEL: func @m16n8k32_int8_row_row_row
func.func @m16n8k32_int8_row_row_row(%arg0: memref<128x128xi8, #gpu.address_space<workgroup>>, %arg1: memref<128x128xi8, #gpu.address_space<workgroup>>, %arg2: memref<128x128xi32>) {
  %cst_0 = arith.constant dense<0> : vector<32x8xi8>
  %c0 = arith.constant 0 : index
  %c1 = arith.constant 1 : index
  %c17 = arith.constant 17 : index
  %c39 = arith.constant 39 : index
  %c40 = arith.constant 40 : index
  %c49 = arith.constant 49 : index
  %c50 = arith.constant 50 : index
  %cst = arith.constant 0 : i8
  %cst0 = arith.constant 0 : i32

  // Verify that the operandA load is lowered to warp-wide ldmatrix.

  // CHECK: [[m_coord:%.+]] = affine.apply [[$strided_map]]()[{{%.+}}]
  // CHECK: [[k_coord:%.+]] = affine.apply [[$contiguous_map]]()[{{%.+}}]
  // CHECK: nvgpu.ldmatrix %arg0[[[m_coord]], [[k_coord]]] {numTiles = 4 : i32, transpose = false} : memref<128x128xi8, #gpu.address_space<workgroup>> -> vector<4x4xi8>

  // Verify that the operandB load is lowered to scalar load to be able
  // to transpose at 8-bit granularity. ldmatrix can only transpose at
  // 16-bit granularity.
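  //
  // As an informal aid for reading the maps above: with the base indices
  // (39, 40) used by the transfer_read of %B below, element j of k-group g of
  // lane l's B fragment is expected at row (l mod 4) * 4 + g * 16 + j + 39 and
  // column l floordiv 4 + 40, e.g. lane 5 starts at (43, 41).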

  // CHECK-DAG: [[row:%.+]] = affine.apply [[$rowB0_map]]()[{{%.+}}]
  // CHECK-DAG: [[col:%.+]] = affine.apply [[$colB0_map]]()[{{%.+}}]
  // CHECK: memref.load %arg1[[[row]], [[col]]] : memref<128x128xi8, #gpu.address_space<workgroup>>
  // CHECK-DAG: [[row:%.+]] = affine.apply [[$rowB1_map]]()[{{%.+}}]
  // CHECK-DAG: [[col:%.+]] = affine.apply [[$colB0_map]]()[{{%.+}}]
  // CHECK: memref.load %arg1[[[row]], [[col]]] : memref<128x128xi8, #gpu.address_space<workgroup>>
  // CHECK-DAG: [[row:%.+]] = affine.apply [[$rowB2_map]]()[{{%.+}}]
  // CHECK-DAG: [[col:%.+]] = affine.apply [[$colB0_map]]()[{{%.+}}]
  // CHECK: memref.load %arg1[[[row]], [[col]]] : memref<128x128xi8, #gpu.address_space<workgroup>>
  // CHECK-DAG: [[row:%.+]] = affine.apply [[$rowB3_map]]()[{{%.+}}]
  // CHECK-DAG: [[col:%.+]] = affine.apply [[$colB0_map]]()[{{%.+}}]
  // CHECK: memref.load %arg1[[[row]], [[col]]] : memref<128x128xi8, #gpu.address_space<workgroup>>
  // CHECK-DAG: [[col:%.+]] = affine.apply [[$colB0_map]]()[{{%.+}}]
  // CHECK-DAG: [[row:%.+]] = affine.apply [[$rowB4_map]]()[{{%.+}}]
  // CHECK: memref.load %arg1[[[row]], [[col]]] : memref<128x128xi8, #gpu.address_space<workgroup>>
  // CHECK-DAG: [[row:%.+]] = affine.apply [[$rowB5_map]]()[{{%.+}}]
  // CHECK-DAG: [[col:%.+]] = affine.apply [[$colB0_map]]()[{{%.+}}]
  // CHECK: memref.load %arg1[[[row]], [[col]]] : memref<128x128xi8, #gpu.address_space<workgroup>>
  // CHECK-DAG: [[row:%.+]] = affine.apply [[$rowB6_map]]()[{{%.+}}]
  // CHECK-DAG: [[col:%.+]] = affine.apply [[$colB0_map]]()[{{%.+}}]
  // CHECK: memref.load %arg1[[[row]], [[col]]] : memref<128x128xi8, #gpu.address_space<workgroup>>
  // CHECK-DAG: [[row:%.+]] = affine.apply [[$rowB7_map]]()[{{%.+}}]
  // CHECK-DAG: [[col:%.+]] = affine.apply [[$colB0_map]]()[{{%.+}}]
  // CHECK: memref.load %arg1[[[row]], [[col]]] : memref<128x128xi8, #gpu.address_space<workgroup>>
  // CHECK-NOT: memref.load %arg1

  // Verify that the operand C is distributed to loads correctly.
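  // As an informal aid: each lane holds two vector<2xi32> rows of the 16x8
  // accumulator; with the base indices (49, 40) used below they live at rows
  // lane floordiv 4 + 49 and lane floordiv 4 + 57, column (lane mod 4) * 2 + 40
  // (e.g. lane 6 reads rows 50 and 58 at column 44).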
  // CHECK: [[row:%.+]] = affine.apply [[$rowC0_map]]()[{{%.+}}]
  // CHECK: [[col:%.+]] = affine.apply [[$colC0_map]]()[{{%.+}}]
  // CHECK: vector.load %arg2[[[row]], [[col]]] : memref<128x128xi32>, vector<2xi32>
  // CHECK: [[row:%.+]] = affine.apply [[$rowC8_map]]()[{{%.+}}]
  // CHECK: [[col:%.+]] = affine.apply [[$colC0_map]]()[{{%.+}}]
  // CHECK: vector.load %arg2[[[row]], [[col]]] : memref<128x128xi32>, vector<2xi32>
  // CHECK-NOT: vector.load %arg2{{.*}}

  %A = vector.transfer_read %arg0[%c0, %c0], %cst {in_bounds = [true, true]} : memref<128x128xi8, #gpu.address_space<workgroup>>, vector<16x32xi8>
  %B = vector.transfer_read %arg1[%c39, %c40], %cst {in_bounds = [true, true], permutation_map = #map0} : memref<128x128xi8, #gpu.address_space<workgroup>>, vector<8x32xi8>
  %C = vector.transfer_read %arg2[%c49, %c40], %cst0 {in_bounds = [true, true]} : memref<128x128xi32>, vector<16x8xi32>
  // CHECK: [[d:%.+]] = nvgpu.mma.sync({{.*}}) {mmaShape = [16, 8, 32]} : (vector<4x4xi8>, vector<2x4xi8>, vector<2x2xi32>) -> vector<2x2xi32>
  %D = vector.contract {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %A, %B, %C : vector<16x32xi8>, vector<8x32xi8> into vector<16x8xi32>

  // CHECK: [[row:%.+]] = affine.apply [[$rowC0_map]]()[{{%.+}}]
  // CHECK: [[col:%.+]] = affine.apply [[$colC0_map]]()[{{%.+}}]
  // CHECK: vector.store {{%.+}}, %arg2[[[row]], [[col]]] : memref<128x128xi32>, vector<2xi32>
  // CHECK: [[row:%.+]] = affine.apply [[$rowC8_map]]()[{{%.+}}]
  // CHECK: [[col:%.+]] = affine.apply [[$colC0_map]]()[{{%.+}}]
  // CHECK: vector.store {{%.+}}, %arg2[[[row]], [[col]]] : memref<128x128xi32>, vector<2xi32>
  vector.transfer_write %D, %arg2[%c49, %c40] {in_bounds = [true, true]} : vector<16x8xi32>, memref<128x128xi32>
  return
}

// -----

//#########################################################
// f64 row-row-row
//#########################################################
// CHECK-DAG: [[$rowA0_map:#.+]] = affine_map<()[s0] -> (s0 floordiv 4 + 1)>
// CHECK-DAG: [[$colA0_map:#.+]] = affine_map<()[s0] -> (s0 mod 4 + 1)>

// CHECK-DAG: [[$rowb0_map:#.+]] = affine_map<()[s0] -> (s0 mod 4 + 39)>
// CHECK-DAG: [[$colb0_map:#.+]] = affine_map<()[s0] -> (s0 floordiv 4 + 40)>

// CHECK-DAG: [[$rowC0_map:#.+]] = affine_map<()[s0] -> (s0 floordiv 4 + 49)>
// CHECK-DAG: [[$colC0_map:#.+]] = affine_map<()[s0] -> (s0 * 2 - (s0 floordiv 4) * 8 + 40)

#map0 = affine_map<(d0, d1) -> (d1, d0)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d2)>
#map2 = affine_map<(d0, d1, d2) -> (d1, d2)>
#map3 = affine_map<(d0, d1, d2) -> (d0, d1)>

// CHECK-LABEL: func @m8n8k4_f64_row_row_row
func.func @m8n8k4_f64_row_row_row(%arg0: memref<128x128xf64>, %arg1: memref<128x128xf64>, %arg2: memref<128x128xf64>) {
  %cst_0 = arith.constant dense<0.0> : vector<4x8xf64>
  %c0 = arith.constant 0 : index
  %c1 = arith.constant 1 : index
  %c17 = arith.constant 17 : index
  %c39 = arith.constant 39 : index
  %c40 = arith.constant 40 : index
  %c49 = arith.constant 49 : index
  %c50 = arith.constant 50 : index
  %cst = arith.constant 0.0 : f64
  %cst0 = arith.constant 0.0 : f64

  // Verify that the operand A is distributed to loads correctly.
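  // As an informal aid: for the 8x8x4 f64 tile each lane owns a single A
  // element, loaded as a vector<1xf64>; with the base index (1, 1) used below
  // it sits at row lane floordiv 4 + 1, column lane mod 4 + 1 (e.g. lane 10
  // loads from (3, 3)).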

  // CHECK-DAG: [[row:%.+]] = affine.apply [[$rowA0_map]]
  // CHECK-DAG: [[col:%.+]] = affine.apply [[$colA0_map]]
  // CHECK: vector.load %arg0[[[row]], [[col]]] : memref<128x128xf64>, vector<1xf64>

  // Verify that the operand B is distributed to loads correctly. Its elements
  // must be loaded in a non-vectorized manner to do the transpose.

  // CHECK-DAG: [[row:%.+]] = affine.apply [[$rowb0_map]]
  // CHECK-DAG: [[col:%.+]] = affine.apply [[$colb0_map]]
  // CHECK: memref.load %arg1[[[row]], [[col]]] : memref<128x128xf64>

  // CHECK-DAG: [[row:%.+]] = affine.apply [[$rowC0_map]]
  // CHECK-DAG: [[col:%.+]] = affine.apply [[$colC0_map]]
  // CHECK: vector.load %arg2[[[row]], [[col]]] : memref<128x128xf64>, vector<2xf64>

  %A = vector.transfer_read %arg0[%c1, %c1], %cst {in_bounds = [true, true]} : memref<128x128xf64>, vector<8x4xf64>
  %B = vector.transfer_read %arg1[%c39, %c40], %cst {in_bounds = [true, true], permutation_map = #map0} : memref<128x128xf64>, vector<8x4xf64>
  %C = vector.transfer_read %arg2[%c49, %c40], %cst0 {in_bounds = [true, true]} : memref<128x128xf64>, vector<8x8xf64>
  // CHECK: [[d:%.+]] = nvgpu.mma.sync({{.*}}) {mmaShape = [8, 8, 4]} : (vector<1x1xf64>, vector<1x1xf64>, vector<1x2xf64>) -> vector<1x2xf64>
  %D = vector.contract {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %A, %B, %C : vector<8x4xf64>, vector<8x4xf64> into vector<8x8xf64>

  // CHECK-DAG: [[row:%.+]] = affine.apply [[$rowC0_map]]
  // CHECK-DAG: [[col:%.+]] = affine.apply [[$colC0_map]]
  // CHECK: vector.store {{%.+}}, %arg2[[[row]], [[col]]] : memref<128x128xf64>, vector<2xf64>
  vector.transfer_write %D, %arg2[%c49, %c40] {in_bounds = [true, true]} : vector<8x8xf64>, memref<128x128xf64>
  return
}

// -----

//#########################################################################
// FP16 row-row-row (ldmatrix x4 for matrixA and ldmatrix x2 for matrixB)
//#########################################################################

#map0 = affine_map<(d0, d1) -> (d1, d0)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d2)>
#map2 = affine_map<(d0, d1, d2) -> (d1, d2)>
#map3 = affine_map<(d0, d1, d2) -> (d0, d1)>

// CHECK-DAG: [[$strided_map:#.+]] = affine_map<()[s0] -> (s0 mod 16)>
// CHECK-DAG: [[$contiguous_map:#.+]] = affine_map<()[s0] -> ((s0 floordiv 16) * 8)>

// CHECK-LABEL: func @m16n8k16_fp16_row_row_row
func.func @m16n8k16_fp16_row_row_row(%arg0: memref<20x20xf16, #gpu.address_space<workgroup>>, %arg1: memref<20x20xf16, #gpu.address_space<workgroup>>, %arg2: memref<20x20xf16, #gpu.address_space<workgroup>>) {
  %cst_0 = arith.constant dense<0.000000e+00> : vector<16x8xf16>
  %c0 = arith.constant 0 : index
  %cst = arith.constant 0.000000e+00 : f16

  // CHECK-DAG: [[m_coord:%.+]] = affine.apply [[$strided_map]]
  // CHECK-DAG: [[k_coord:%.+]] = affine.apply [[$contiguous_map]]
  // CHECK: nvgpu.ldmatrix %arg0[[[m_coord]], [[k_coord]]] {numTiles = 4 : i32, transpose = false}
  // CHECK-DAG: [[n_coord:%.+]] = affine.apply [[$contiguous_map]]
  // CHECK-DAG: [[k_coord:%.+]] = affine.apply [[$strided_map]]
  // CHECK: nvgpu.ldmatrix %arg1[[[k_coord]], [[n_coord]]] {numTiles = 2 : i32, transpose = true}
  %A = vector.transfer_read %arg0[%c0, %c0], %cst {in_bounds = [true, true]} : memref<20x20xf16, #gpu.address_space<workgroup>>, vector<16x16xf16>
  %B = vector.transfer_read %arg1[%c0, %c0], %cst {permutation_map = #map0, in_bounds = [true, true]} : memref<20x20xf16, #gpu.address_space<workgroup>>, vector<8x16xf16>
  %C = vector.transfer_read %arg2[%c0, %c0], %cst {in_bounds = [true, true]} : memref<20x20xf16, #gpu.address_space<workgroup>>, vector<16x8xf16>
  %D = vector.contract {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %A, %B, %C : vector<16x16xf16>, vector<8x16xf16> into vector<16x8xf16>
  vector.transfer_write %D, %arg2[%c0, %c0] {in_bounds = [true, true]} : vector<16x8xf16>, memref<20x20xf16, #gpu.address_space<workgroup>>
  return
}

// -----

//#########################################################################
// FP16 row-row-row (ldmatrix x4 for matrixA and ldmatrix x4 for matrixB)
//#########################################################################

// CHECK-DAG: [[$strided_map:#.+]] = affine_map<()[s0] -> (s0 mod 16)>
// CHECK-DAG: [[$contiguous_map:#.+]] = affine_map<()[s0] -> ((s0 floordiv 16) * 8)>

#map0 = affine_map<(d0, d1) -> (d1, d0)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d2)>
#map2 = affine_map<(d0, d1, d2) -> (d1, d2)>
#map3 = affine_map<(d0, d1, d2) -> (d0, d1)>

// CHECK-LABEL: func @m16n16k16_mmasync16816_fp16_f16_row_row_row
func.func @m16n16k16_mmasync16816_fp16_f16_row_row_row(%arg0: memref<42x32xf16, #gpu.address_space<workgroup>>, %arg1: memref<32x64xf16, #gpu.address_space<workgroup>>, %arg2: memref<42x64xf16, #gpu.address_space<workgroup>>) {
  %c0 = arith.constant 0 : index
  %c8 = arith.constant 8 : index
  %cst = arith.constant 0.000000e+00 : f16

  // CHECK-DAG: [[m_coord:%.+]] = affine.apply [[$strided_map]]
  // CHECK-DAG: [[k_coord:%.+]] = affine.apply [[$contiguous_map]]
  // CHECK: [[fragmentA:%.+]] = nvgpu.ldmatrix %arg0[[[m_coord]], [[k_coord]]] {numTiles = 4 : i32, transpose = false}
  %A = vector.transfer_read %arg0[%c0, %c0], %cst {in_bounds = [true, true]} : memref<42x32xf16, #gpu.address_space<workgroup>>, vector<16x16xf16>

  // CHECK-DAG: [[n_coord:%.+]] = affine.apply [[$contiguous_map]]
  // CHECK-DAG: [[k_coord:%.+]] = affine.apply [[$strided_map]]
  // CHECK-DAG: [[fragmentB:%.+]] = nvgpu.ldmatrix %arg1[[[k_coord]], [[n_coord]]] {numTiles = 4 : i32, transpose = true}
  %B = vector.transfer_read %arg1[%c0, %c0], %cst {permutation_map = #map0, in_bounds = [true, true]} : memref<32x64xf16, #gpu.address_space<workgroup>>, vector<16x16xf16>

  // CHECK-DAG: [[m_coord:%.+]] = affine.apply [[$strided_map]]
  // CHECK-DAG: [[n_coord:%.+]] = affine.apply [[$contiguous_map]]
  // CHECK-DAG: [[fragmentC:%.*]] = nvgpu.ldmatrix %arg2[[[m_coord]], [[n_coord]]] {numTiles = 4 : i32, transpose = false}
  %C = vector.transfer_read %arg2[%c0, %c0], %cst {in_bounds = [true, true]} : memref<42x64xf16, #gpu.address_space<workgroup>>, vector<16x16xf16>

  // CHECK-DAG: [[fragmentB0:%.+]] = vector.extract_strided_slice [[fragmentB]] {offsets = [0, 0], sizes = [2, 2], strides = [1, 1]} : vector<4x2xf16> to vector<2x2xf16>
  // CHECK-DAG: [[fragmentC0:%.+]] = vector.extract_strided_slice [[fragmentC]] {offsets = [0, 0], sizes = [2, 2], strides = [1, 1]} : vector<4x2xf16> to vector<2x2xf16>
  // CHECK: nvgpu.mma.sync([[fragmentA]], [[fragmentB0]], [[fragmentC0]]) {mmaShape = [16, 8, 16]} : (vector<4x2xf16>, vector<2x2xf16>, vector<2x2xf16>) -> vector<2x2xf16>
  %B0 = vector.extract_strided_slice %B {offsets = [0, 0], sizes = [8, 16], strides = [1, 1]} : vector<16x16xf16> to vector<8x16xf16>
  %C0 = vector.extract_strided_slice %C {offsets = [0, 0], sizes = [16, 8], strides = [1, 1]} : vector<16x16xf16> to vector<16x8xf16>
  %D0 = vector.contract {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %A, %B0, %C0 : vector<16x16xf16>, vector<8x16xf16> into vector<16x8xf16>
  vector.transfer_write %D0, %arg2[%c0, %c0] {in_bounds = [true, true]} : vector<16x8xf16>, memref<42x64xf16, #gpu.address_space<workgroup>>

  // CHECK-DAG: [[fragmentB1:%.+]] = vector.extract_strided_slice [[fragmentB]] {offsets = [2, 0], sizes = [2, 2], strides = [1, 1]} : vector<4x2xf16> to vector<2x2xf16>
  // CHECK-DAG: [[fragmentC1:%.+]] = vector.extract_strided_slice [[fragmentC]] {offsets = [2, 0], sizes = [2, 2], strides = [1, 1]} : vector<4x2xf16> to vector<2x2xf16>
  // CHECK: nvgpu.mma.sync([[fragmentA]], [[fragmentB1]], [[fragmentC1]]) {mmaShape = [16, 8, 16]} : (vector<4x2xf16>, vector<2x2xf16>, vector<2x2xf16>) -> vector<2x2xf16>
  %B1 = vector.extract_strided_slice %B {offsets = [8, 0], sizes = [8, 16], strides = [1, 1]} : vector<16x16xf16> to vector<8x16xf16>
  %C1 = vector.extract_strided_slice %C {offsets = [0, 8], sizes = [16, 8], strides = [1, 1]} : vector<16x16xf16> to vector<16x8xf16>
  %D1 = vector.contract {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %A, %B1, %C1 : vector<16x16xf16>, vector<8x16xf16> into vector<16x8xf16>
  vector.transfer_write %D1, %arg2[%c0, %c0] {in_bounds = [true, true]} : vector<16x8xf16>, memref<42x64xf16, #gpu.address_space<workgroup>>

  return
}
// -----

//#################################################################################################################
// FP16 row-row-row (Determine the transpose for multi-dimensional vector.transfer_read in vector-to-gpu lowering)
//#################################################################################################################

// CHECK-DAG: [[$strided_map:#.+]] = affine_map<()[s0] -> (s0 mod 16)>
// CHECK-DAG: [[$contiguous_map:#.+]] = affine_map<()[s0] -> ((s0 floordiv 16) * 8)>

#map0 = affine_map<(d0, d1, d2) -> (d2, d1)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d2)>
#map2 = affine_map<(d0, d1, d2) -> (d1, d2)>
#map3 = affine_map<(d0, d1, d2) -> (d0, d1)>
#map_a = affine_map<(d0, d1, d2, d3) -> (d1, d3)>
#map_b = affine_map<(d0, d1, d2, d3) -> (d3, d2)>

// CHECK-LABEL: func @multi_dim_m16n8k16_fp16_row_row_row
func.func @multi_dim_m16n8k16_fp16_row_row_row(%arg0: memref<4x32x1x32xf16, #gpu.address_space<workgroup>>, %arg1: memref<4x1x32x32xf16, #gpu.address_space<workgroup>>, %arg2: memref<1x32x40xf16, #gpu.address_space<workgroup>>) {

  // CHECK-DAG: [[c0:%.+]] = arith.constant 0 : index
  %c0 = arith.constant 0 : index
  %cst = arith.constant 0.000000e+00 : f16

  // CHECK-DAG: [[m_coord:%.+]] = affine.apply [[$strided_map]]
  // CHECK-DAG: [[k_coord:%.+]] = affine.apply [[$contiguous_map]]
  // CHECK: [[fragmentA:%.+]] = nvgpu.ldmatrix %arg0[[[c0]], [[m_coord]], [[c0]], [[k_coord]]] {numTiles = 4 : i32, transpose = false}
  %A = vector.transfer_read %arg0[%c0, %c0, %c0, %c0], %cst {in_bounds = [true, true], permutation_map = #map_a} : memref<4x32x1x32xf16, #gpu.address_space<workgroup>>, vector<16x16xf16>

  // CHECK-DAG: [[n_coord:%.+]] = affine.apply [[$contiguous_map]]
  // CHECK-DAG: [[k_coord:%.+]] = affine.apply [[$strided_map]]
  // CHECK-DAG: [[fragmentB:%.+]] = nvgpu.ldmatrix %arg1[[[c0]], [[c0]], [[k_coord]], [[n_coord]]] {numTiles = 4 : i32, transpose = true}
  %B = vector.transfer_read %arg1[%c0, %c0, %c0, %c0], %cst {in_bounds = [true, true], permutation_map = #map_b} : memref<4x1x32x32xf16, #gpu.address_space<workgroup>>, vector<16x16xf16>

  // CHECK-DAG: [[m_coord:%.+]] = affine.apply [[$strided_map]]
  // CHECK-DAG: [[n_coord:%.+]] = affine.apply [[$contiguous_map]]
  // CHECK-DAG: [[fragmentC:%.*]] = nvgpu.ldmatrix %arg2[[[c0]], [[m_coord]], [[n_coord]]] {numTiles = 4 : i32, transpose = false}
  %C = vector.transfer_read %arg2[%c0, %c0, %c0], %cst {in_bounds = [true, true]} : memref<1x32x40xf16, #gpu.address_space<workgroup>>, vector<16x16xf16>

  // CHECK-DAG: [[fragmentB0:%.+]] = vector.extract_strided_slice [[fragmentB]] {offsets = [0, 0], sizes = [2, 2], strides = [1, 1]} : vector<4x2xf16> to vector<2x2xf16>
  // CHECK-DAG: [[fragmentC0:%.+]] = vector.extract_strided_slice [[fragmentC]] {offsets = [0, 0], sizes = [2, 2], strides = [1, 1]} : vector<4x2xf16> to vector<2x2xf16>
  // CHECK: nvgpu.mma.sync([[fragmentA]], [[fragmentB0]], [[fragmentC0]]) {mmaShape = [16, 8, 16]} : (vector<4x2xf16>, vector<2x2xf16>, vector<2x2xf16>) -> vector<2x2xf16>
  %B0 = vector.extract_strided_slice %B {offsets = [0, 0], sizes = [8, 16], strides = [1, 1]} : vector<16x16xf16> to vector<8x16xf16>
  %C0 = vector.extract_strided_slice %C {offsets = [0, 0], sizes = [16, 8], strides = [1, 1]} : vector<16x16xf16> to vector<16x8xf16>
  %D0 = vector.contract {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %A, %B0, %C0 : vector<16x16xf16>, vector<8x16xf16> into vector<16x8xf16>
  vector.transfer_write %D0, %arg2[%c0, %c0, %c0] {in_bounds = [true, true]} : vector<16x8xf16>, memref<1x32x40xf16, #gpu.address_space<workgroup>>

  return
}

// -----

// CHECK-DAG: [[$strided_map:#.+]] = affine_map<()[s0] -> (s0 mod 16)>
// CHECK-DAG: [[$contiguous_map:#.+]] = affine_map<()[s0] -> ((s0 floordiv 16) * 8)>

#map0 = affine_map<(d0, d1, d2) -> (d2, d1)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d2)>
#map2 = affine_map<(d0, d1, d2) -> (d1, d2)>
#map3 = affine_map<(d0, d1, d2) -> (d0, d1)>

// CHECK-LABEL: func @batch_m16n8k16_fp16_row_row_row
func.func @batch_m16n8k16_fp16_row_row_row(%arg0: memref<2x20x20xf16, #gpu.address_space<workgroup>>, %arg1: memref<2x20x20xf16, #gpu.address_space<workgroup>>, %arg2: memref<2x20x20xf16, #gpu.address_space<workgroup>>) {
  %cst_0 = arith.constant dense<0.000000e+00> : vector<20x20xf16>
  // CHECK: [[C0:%.+]] = arith.constant 0 : index
  %c0 = arith.constant 0 : index
  %cst = arith.constant 0.000000e+00 : f16

  // CHECK-DAG: [[m_coord:%.+]] = affine.apply [[$strided_map]]
  // CHECK-DAG: [[k_coord:%.+]] = affine.apply [[$contiguous_map]]
  // CHECK: nvgpu.ldmatrix %arg0[[[C0]], [[m_coord]], [[k_coord]]] {numTiles = 4 : i32, transpose = false} : memref<2x20x20xf16, #gpu.address_space<workgroup>> -> vector<4x2xf16>
  %A = vector.transfer_read %arg0[%c0, %c0, %c0], %cst {in_bounds = [true, true]} : memref<2x20x20xf16, #gpu.address_space<workgroup>>, vector<16x16xf16>

  // CHECK-DAG: [[n_coord:%.+]] = affine.apply [[$contiguous_map]]
  // CHECK-DAG: [[k_coord:%.+]] = affine.apply [[$strided_map]]
  // CHECK: nvgpu.ldmatrix %arg1[[[C0]], [[k_coord]], [[n_coord]]] {numTiles = 2 : i32, transpose = true} : memref<2x20x20xf16, #gpu.address_space<workgroup>> -> vector<2x2xf16>
  %B = vector.transfer_read %arg1[%c0, %c0, %c0], %cst {permutation_map = #map0, in_bounds = [true, true]} : memref<2x20x20xf16, #gpu.address_space<workgroup>>, vector<8x16xf16>

  // CHECK-DAG: [[m_coord:%.+]] = affine.apply [[$strided_map]]
  // CHECK-DAG: [[n_coord:%.+]] = affine.apply [[$contiguous_map]]
  // CHECK: nvgpu.ldmatrix %arg2[[[C0]], [[m_coord]], [[n_coord]]] {numTiles = 2 : i32, transpose = false} : memref<2x20x20xf16, #gpu.address_space<workgroup>> -> vector<2x2xf16>
  %C = vector.transfer_read %arg2[%c0, %c0, %c0], %cst {in_bounds = [true, true]} : memref<2x20x20xf16, #gpu.address_space<workgroup>>, vector<16x8xf16>
  %D = vector.contract {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %A, %B, %C : vector<16x16xf16>, vector<8x16xf16> into vector<16x8xf16>
  vector.transfer_write %D, %arg2[%c0, %c0, %c0] {in_bounds = [true, true]} : vector<16x8xf16>, memref<2x20x20xf16, #gpu.address_space<workgroup>>
  return
}

// -----

//#########################################################
// FP16 row-col-row
//#########################################################

#map1 = affine_map<(d0, d1, d2) -> (d0, d2)>
#map2 = affine_map<(d0, d1, d2) -> (d1, d2)>
#map3 = affine_map<(d0, d1, d2) -> (d0, d1)>

// Affine maps for ldmatrix x4 tile of `16 x 16` f16 elements in `strided x contiguous` dimensions.
// CHECK: [[$strided_ldmatrix_x4_map:#.+]] = affine_map<()[s0] -> (s0 mod 16)>
// CHECK: [[$contiguous_ldmatrix_x4_map:#.+]] = affine_map<()[s0] -> ((s0 floordiv 16) * 8)>

// CHECK: [[$strided_ldmatrix_x2_map:#.+]] = affine_map<()[s0] -> (s0 mod 8)>
// CHECK: [[$contiguous_ldmatrix_x2_map:#.+]] = affine_map<()[s0] -> ((s0 floordiv 8) * 8)>

// CHECK-LABEL: func @m16n8k16_fp16_row_col_row
func.func @m16n8k16_fp16_row_col_row(%arg0: memref<20x20xf16, #gpu.address_space<workgroup>>, %arg1: memref<20x20xf16, #gpu.address_space<workgroup>>, %arg2: memref<20x20xf16, #gpu.address_space<workgroup>>) {
  %cst_0 = arith.constant dense<0.000000e+00> : vector<16x8xf16>
  %c0 = arith.constant 0 : index

  %cst = arith.constant 0.000000e+00 : f16
  // CHECK-DAG: [[m_coord:%.+]] = affine.apply [[$strided_ldmatrix_x4_map]]
  // CHECK-DAG: [[k_coord:%.+]] = affine.apply [[$contiguous_ldmatrix_x4_map]]
  // CHECK: nvgpu.ldmatrix %arg0[[[m_coord]], [[k_coord]]] {numTiles = 4 : i32
  // CHECK-SAME: transpose = false

  // CHECK-DAG: [[n_coord:%.+]] = affine.apply [[$strided_ldmatrix_x2_map]]
  // CHECK-DAG: [[k_coord:%.+]] = affine.apply [[$contiguous_ldmatrix_x2_map]]
  // CHECK: nvgpu.ldmatrix %arg1[[[n_coord]], [[k_coord]]] {numTiles = 2 : i32
  // CHECK-SAME: transpose = false

  // CHECK-DAG: [[m_coord:%.+]] = affine.apply [[$strided_ldmatrix_x4_map]]
  // CHECK-DAG: [[n_coord:%.+]] = affine.apply [[$contiguous_ldmatrix_x4_map]]
  // CHECK: nvgpu.ldmatrix %arg2[[[m_coord]], [[n_coord]]] {numTiles = 2 : i32
  // CHECK-SAME: transpose = false
  %A = vector.transfer_read %arg0[%c0, %c0], %cst {in_bounds = [true, true]} : memref<20x20xf16, #gpu.address_space<workgroup>>, vector<16x16xf16>
  %B = vector.transfer_read %arg1[%c0, %c0], %cst {in_bounds = [true, true]} : memref<20x20xf16, #gpu.address_space<workgroup>>, vector<8x16xf16>
  %C = vector.transfer_read %arg2[%c0, %c0], %cst {in_bounds = [true, true]} : memref<20x20xf16, #gpu.address_space<workgroup>>, vector<16x8xf16>
  %D = vector.contract {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %A, %B, %C : vector<16x16xf16>, vector<8x16xf16> into vector<16x8xf16>
  vector.transfer_write %D, %arg2[%c0, %c0] {in_bounds = [true, true]} : vector<16x8xf16>, memref<20x20xf16, #gpu.address_space<workgroup>>
  return
}

// -----

//#########################################################
// TF32 row-row-row
//#########################################################

#map0 = affine_map<(d0, d1) -> (d1, d0)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d2)>
#map2 = affine_map<(d0, d1, d2) -> (d1, d2)>
#map3 = affine_map<(d0, d1, d2) -> (d0, d1)>

// CHECK-DAG: [[$rowA_map:#.+]] = affine_map<()[s0] -> (s0 mod 16 + 1)>
// CHECK-DAG: [[$colA_map:#.+]] = affine_map<()[s0] -> ((s0 floordiv 16) * 4 + 3)>

// CHECK-DAG: [[$rowB_map:#.+]] = affine_map<()[s0] -> (s0 mod 4 + 3)>
// CHECK-DAG: [[$colB_map:#.+]] = affine_map<()[s0] -> (s0 floordiv 4 + 3)>

// CHECK-DAG: [[$rowC_map:#.+]] = affine_map<()[s0] -> (s0 floordiv 4)>
// CHECK-DAG: [[$rowC8_map:#.+]] = affine_map<()[s0] -> (s0 floordiv 4 + 8)>
// CHECK-DAG: [[$colC_map:#.+]] = affine_map<()[s0] -> (s0 * 2 - (s0 floordiv 4) * 8)>

// CHECK-LABEL: func @m16n8k4_tf32_f32_row_row_row
func.func @m16n8k4_tf32_f32_row_row_row(%arg0: memref<20x20xf32, #gpu.address_space<workgroup>>, %arg1: memref<20x20xf32, #gpu.address_space<workgroup>>, %arg2: memref<20x20xf32>) {
  %cst_0 = arith.constant dense<0.000000e+00> : vector<16x8xf32>
  %c0 = arith.constant 0 : index
  %c1 = arith.constant 1 : index
  %c3 = arith.constant 3 : index
  %cst = arith.constant 0.000000e+00 : f32

  // CHECK: [[c_frag:%.+]] = arith.constant {{.*}} : vector<2x2xf32>

  // CHECK-DAG: [[row:%.+]] = affine.apply [[$rowA_map]]
  // CHECK-DAG: [[col:%.+]] = affine.apply [[$colA_map]]
  // CHECK: [[a_frag:%.+]] = nvgpu.ldmatrix %arg0[[[row]], [[col]]] {numTiles = 2 : i32, transpose = false}

  // b and c are not loaded by ldmatrix in this test.
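  // Informal note: for the 16x8x4 tf32 shape the per-lane B fragment is a
  // single f32 (vector<1x1xf32>) and ldmatrix transpose only handles 16-bit
  // elements, so B is assembled from a scalar load at row lane mod 4 + 3,
  // column lane floordiv 4 + 3 (the +3s come from the base indices below),
  // followed by a vector.insert; the accumulator comes from the %cst_0
  // constant rather than from memory.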
  // CHECK-NOT: nvgpu.ldmatrix

  // CHECK-DAG: [[row:%.+]] = affine.apply [[$rowB_map]]
  // CHECK-DAG: [[col:%.+]] = affine.apply [[$colB_map]]
  // CHECK: [[b_el:%.+]] = memref.load {{%.+}} : memref<20x20xf32, #gpu.address_space<workgroup>>
  // CHECK: [[b_frag:%.+]] = vector.insert [[b_el]], {{.*}} : f32 into vector<1x1xf32>

  // CHECK: [[d_frag:%.+]] = nvgpu.mma.sync([[a_frag]], [[b_frag]], [[c_frag]])
  // CHECK-SAME: mmaShape = [16, 8, 4]
  // CHECK-SAME: -> vector<2x2xf32>
  %A = vector.transfer_read %arg0[%c1, %c3], %cst {in_bounds = [true, true]} : memref<20x20xf32, #gpu.address_space<workgroup>>, vector<16x4xf32>
  %B = vector.transfer_read %arg1[%c3, %c3], %cst {permutation_map = #map0, in_bounds = [true, true]} : memref<20x20xf32, #gpu.address_space<workgroup>>, vector<8x4xf32>
  %D = vector.contract {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %A, %B, %cst_0 : vector<16x4xf32>, vector<8x4xf32> into vector<16x8xf32>

  // CHECK: vector.extract [[d_frag]][0] : vector<2xf32> from vector<2x2xf32>
  // CHECK: affine.apply [[$rowC_map]]
  // CHECK: affine.apply [[$colC_map]]
  // CHECK: vector.store
  // CHECK: vector.extract [[d_frag]][1] : vector<2xf32> from vector<2x2xf32>
  // CHECK: affine.apply [[$rowC8_map]]
  // CHECK: affine.apply [[$colC_map]]
  // CHECK: vector.store
  vector.transfer_write %D, %arg2[%c0, %c0] {in_bounds = [true, true]} : vector<16x8xf32>, memref<20x20xf32>
  return
}

// -----

//#########################################################
// TF32 row-row-row
//#########################################################
#map0 = affine_map<(d0, d1) -> (d1, d0)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d2)>
#map2 = affine_map<(d0, d1, d2) -> (d1, d2)>
#map3 = affine_map<(d0, d1, d2) -> (d0, d1)>

// CHECK-DAG: [[$rowA_map:#.+]] = affine_map<()[s0] -> (s0 mod 16 + 1)>
// CHECK-DAG: [[$colA_map:#.+]] = affine_map<()[s0] -> ((s0 floordiv 16) * 4 + 3)>

// CHECK-DAG: [[$rowB_map:#.+]] = affine_map<()[s0] -> (s0 mod 4 + 3)>
// CHECK-DAG: [[$colB_map:#.+]] = affine_map<()[s0] -> (s0 floordiv 4 + 3)>

// CHECK-DAG: [[$rowC_map:#.+]] = affine_map<()[s0] -> (s0 floordiv 4)>
// CHECK-DAG: [[$rowC8_map:#.+]] = affine_map<()[s0] -> (s0 floordiv 4 + 8)>
// CHECK-DAG: [[$colC_map:#.+]] = affine_map<()[s0] -> (s0 * 2 - (s0 floordiv 4) * 8)>

// CHECK-LABEL: func @m16n8k8_tf32_f32_row_row_row
func.func @m16n8k8_tf32_f32_row_row_row(%arg0: memref<20x20xf32, #gpu.address_space<workgroup>>, %arg1: memref<20x20xf32, #gpu.address_space<workgroup>>, %arg2: memref<20x20xf32>) {
  %cst_0 = arith.constant dense<0.000000e+00> : vector<16x8xf32>
  %c0 = arith.constant 0 : index
  %c1 = arith.constant 1 : index
  %c3 = arith.constant 3 : index
  %cst = arith.constant 0.000000e+00 : f32

  // CHECK: [[c_frag:%.+]] = arith.constant {{.*}} : vector<2x2xf32>

  // CHECK-DAG: [[row:%.+]] = affine.apply [[$rowA_map]]
  // CHECK-DAG: [[col:%.+]] = affine.apply [[$colA_map]]
  // CHECK: [[a_frag:%.+]] = nvgpu.ldmatrix %arg0[[[row]], [[col]]] {numTiles = 4 : i32, transpose = false}

  // b and c are not loaded by ldmatrix in this test.
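  // Informal note: for the 16x8x8 shape each lane needs two f32 B elements
  // (vector<2x1xf32>), so two scalar loads and two vector.inserts are expected,
  // starting from row lane mod 4 + 3, column lane floordiv 4 + 3; the
  // accumulator again comes from the %cst_0 constant rather than from memory.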
  // CHECK-NOT: nvgpu.ldmatrix

  // CHECK-DAG: [[row:%.+]] = affine.apply [[$rowB_map]]
  // CHECK-DAG: [[col:%.+]] = affine.apply [[$colB_map]]
  // CHECK: [[b_el0:%.+]] = memref.load {{%.+}} : memref<20x20xf32, #gpu.address_space<workgroup>>
  // CHECK: [[b_frag0:%.+]] = vector.insert [[b_el0]], {{.*}} : f32 into vector<2x1xf32>
  // CHECK: [[b_el1:%.+]] = memref.load {{%.+}} : memref<20x20xf32, #gpu.address_space<workgroup>>
  // CHECK: [[b_frag1:%.+]] = vector.insert [[b_el1]], {{.*}} : f32 into vector<2x1xf32>

  // CHECK: [[d_frag:%.+]] = nvgpu.mma.sync([[a_frag]], [[b_frag1]], [[c_frag]])
  // CHECK-SAME: mmaShape = [16, 8, 8]
  // CHECK-SAME: -> vector<2x2xf32>
  %A = vector.transfer_read %arg0[%c1, %c3], %cst {in_bounds = [true, true]} : memref<20x20xf32, #gpu.address_space<workgroup>>, vector<16x8xf32>
  %B = vector.transfer_read %arg1[%c3, %c3], %cst {permutation_map = #map0, in_bounds = [true, true]} : memref<20x20xf32, #gpu.address_space<workgroup>>, vector<8x8xf32>
  %D = vector.contract {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %A, %B, %cst_0 : vector<16x8xf32>, vector<8x8xf32> into vector<16x8xf32>

  // CHECK: vector.extract [[d_frag]][0] : vector<2xf32> from vector<2x2xf32>
  // CHECK: affine.apply [[$rowC_map]]
  // CHECK: affine.apply [[$colC_map]]
  // CHECK: vector.store
  // CHECK: vector.extract [[d_frag]][1] : vector<2xf32> from vector<2x2xf32>
  // CHECK: affine.apply [[$rowC8_map]]
  // CHECK: affine.apply [[$colC_map]]
  // CHECK: vector.store
  vector.transfer_write %D, %arg2[%c0, %c0] {in_bounds = [true, true]} : vector<16x8xf32>, memref<20x20xf32>
  return
}

// -----

//#########################################################
// TF32 col-col-row
//#########################################################
#map0 = affine_map<(d0, d1) -> (d1, d0)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d2)>
#map2 = affine_map<(d0, d1, d2) -> (d1, d2)>
#map3 = affine_map<(d0, d1, d2) -> (d0, d1)>

// CHECK-DAG: [[$rowA0_map:#.+]] = affine_map<()[s0] -> (s0 floordiv 4)>
// CHECK-DAG: [[$colA0_map:#.+]] = affine_map<()[s0] -> (s0 mod 4)>
// CHECK-DAG: [[$rowA8_map:#.+]] = affine_map<()[s0] -> (s0 floordiv 4 + 8)>
// CHECK-DAG: [[$colA4_map:#.+]] = affine_map<()[s0] -> (s0 mod 4 + 4)>

// CHECK-DAG: [[$rowB0_map:#.+]] = affine_map<()[s0] -> (s0 mod 8)>
// CHECK-DAG: [[$colB0_map:#.+]] = affine_map<()[s0] -> ((s0 floordiv 8) * 4)>

// CHECK-DAG: [[$rowC_map:#.+]] = affine_map<()[s0] -> (s0 floordiv 4 + 16)>
// CHECK-DAG: [[$rowC8_map:#.+]] = affine_map<()[s0] -> (s0 floordiv 4 + 24)>
// CHECK-DAG: [[$colC_map:#.+]] = affine_map<()[s0] -> (s0 * 2 - (s0 floordiv 4) * 8 + 8)>

// CHECK-LABEL: func @m16n8k8_tf32_f32_col_col_row
func.func @m16n8k8_tf32_f32_col_col_row(%arg0: memref<20x20xf32, #gpu.address_space<workgroup>>, %arg1: memref<20x20xf32, #gpu.address_space<workgroup>>, %arg2: memref<20x20xf32>) {
  %cst_0 = arith.constant dense<0.000000e+00> : vector<16x8xf32>
  %c0 = arith.constant 0 : index
  %c16 = arith.constant 16 : index
  %c8 = arith.constant 8 : index
  %cst = arith.constant 0.000000e+00 : f32

  // CHECK: [[c_frag:%.+]] = arith.constant {{.*}} : vector<2x2xf32>

  // CHECK-DAG: [[row:%.+]] = affine.apply [[$rowA0_map]]
  // CHECK-DAG: [[col:%.+]] = affine.apply [[$colA0_map]]
  // CHECK: [[a_el0:%.+]] = memref.load {{%.+}} : memref<20x20xf32, #gpu.address_space<workgroup>>
  // CHECK: [[a_frag0:%.+]] = vector.insert [[a_el0]], {{.*}} [0, 0] : f32 into vector<4x1xf32>

  // CHECK-DAG: [[row:%.+]] = affine.apply [[$rowA8_map]]
  // CHECK-DAG: [[col:%.+]] = affine.apply [[$colA0_map]]
  // CHECK: [[a_el0:%.+]] = memref.load {{%.+}} : memref<20x20xf32, #gpu.address_space<workgroup>>
  // CHECK: [[a_frag0:%.+]] = vector.insert [[a_el0]], {{.*}} [1, 0] : f32 into vector<4x1xf32>

  // CHECK: [[a_el:%.+]] = memref.load {{%.+}} : memref<20x20xf32, #gpu.address_space<workgroup>>
  // CHECK: [[a_frag:%.+]] = vector.insert [[a_el]], {{.*}} [2, 0] : f32 into vector<4x1xf32>
  // CHECK: [[a_el:%.+]] = memref.load {{%.+}} : memref<20x20xf32, #gpu.address_space<workgroup>>
  // CHECK: [[a_frag:%.+]] = vector.insert [[a_el]], {{.*}} [3, 0] : f32 into vector<4x1xf32>

  // CHECK-DAG: [[row:%.+]] = affine.apply [[$rowB0_map]]
  // CHECK-DAG: [[col:%.+]] = affine.apply [[$colB0_map]]
  // CHECK: [[b_frag:%.+]] = nvgpu.ldmatrix %arg1[[[row]], [[col]]] {numTiles = 2 : i32, transpose = false}

  // CHECK: [[d_frag:%.+]] = nvgpu.mma.sync([[a_frag]], [[b_frag]], [[c_frag]])
  // CHECK-SAME: mmaShape = [16, 8, 8]
  // CHECK-SAME: -> vector<2x2xf32>
  %A = vector.transfer_read %arg0[%c0, %c0], %cst {in_bounds = [true, true], permutation_map = #map0} : memref<20x20xf32, #gpu.address_space<workgroup>>, vector<16x8xf32>
  %B = vector.transfer_read %arg1[%c0, %c0], %cst {in_bounds = [true, true]} : memref<20x20xf32, #gpu.address_space<workgroup>>, vector<8x8xf32>
  %D = vector.contract {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "reduction"],
       kind = #vector.kind<add>} %A, %B, %cst_0 : vector<16x8xf32>, vector<8x8xf32> into vector<16x8xf32>

  // CHECK: vector.extract [[d_frag]][0] : vector<2xf32> from vector<2x2xf32>
  // CHECK: affine.apply [[$rowC_map]]
  // CHECK: affine.apply [[$colC_map]]
  // CHECK: vector.store
  // CHECK: vector.extract [[d_frag]][1] : vector<2xf32> from vector<2x2xf32>
  // CHECK: affine.apply [[$rowC8_map]]
  // CHECK: affine.apply [[$colC_map]]
  // CHECK: vector.store
  vector.transfer_write %D, %arg2[%c16, %c8] {in_bounds = [true, true]} : vector<16x8xf32>, memref<20x20xf32>
  return
}

// -----

//#########################################################
// INT4 row-col-row
//#########################################################
// Affine maps for loading operandA and operandB
// maps (laneid -> coordinate pointed by the lane in the ldmatrix operand tile)
// CHECK-DAG: [[$strided_ldmatrix_x4_map:#.+]] = affine_map<()[s0] -> (s0 mod 16)>
// CHECK-DAG: [[$contiguous_ldmatrix_x4_map:#.+]] = affine_map<()[s0] -> ((s0 floordiv 16) * 32)>
// CHECK-DAG: [[$strided_ldmatrix_x2_map:#.+]] = affine_map<()[s0] -> (s0 mod 8)>
// CHECK-DAG: [[$contiguous_ldmatrix_x2_map:#.+]] = affine_map<()[s0] -> ((s0 floordiv 8) * 32)>

// Affine maps for accumulator registers
// maps (laneid -> coordinate pointed by the lane in accumulator register tile)
// CHECK-DAG: [[$rowC0_map:#.+]] = affine_map<()[s0] -> (s0 floordiv 4)>
// CHECK-DAG: [[$colC0_map:#.+]] = affine_map<()[s0] -> (s0 * 2 - (s0 floordiv 4) * 8
// CHECK-DAG: [[$rowC8_map:#.+]] = affine_map<()[s0] -> (s0 floordiv 4 + 8)>

#map1 = affine_map<(d0, d1, d2) -> (d0, d2)>
#map2 = affine_map<(d0, d1, d2) -> (d1, d2)>
#map3 = affine_map<(d0, d1, d2) -> (d0, d1)>

// CHECK-LABEL: func @m16n8k64_int4_row_col_row
func.func @m16n8k64_int4_row_col_row(%arg0: memref<128x128xi4, #gpu.address_space<workgroup>>, %arg1: memref<128x128xi4, #gpu.address_space<workgroup>>, %arg2: memref<128x128xi32>) {
  %cst = arith.constant 0 : i4
  %cst0 = arith.constant 0 : i32
  %cst_0 = arith.constant dense<0> : vector<32x8xi4>
  %c0 = arith.constant 0 : index

  // CHECK: [[lane:%.+]] = gpu.lane_id
  // CHECK: [[m_coord:%.+]] = affine.apply [[$strided_ldmatrix_x4_map]]()[[[lane]]]
  // CHECK: [[k_coord:%.+]] = affine.apply [[$contiguous_ldmatrix_x4_map]]()[[[lane]]]
  // CHECK: nvgpu.ldmatrix %arg0[[[m_coord]], [[k_coord]]] {numTiles = 4 : i32, transpose = false} : memref<128x128xi4, #gpu.address_space<workgroup>> -> vector<4x8xi4>

  // CHECK: [[lane:%.+]] = gpu.lane_id
  // CHECK: [[n_coord:%.+]] = affine.apply [[$strided_ldmatrix_x2_map]]()[[[lane]]]
  // CHECK: [[k_coord:%.+]] = affine.apply [[$contiguous_ldmatrix_x2_map]]()[[[lane]]]
  // CHECK: nvgpu.ldmatrix %arg1[[[n_coord]], [[k_coord]]] {numTiles = 2 : i32, transpose = false} : memref<128x128xi4, #gpu.address_space<workgroup>> -> vector<2x8xi4>

  // CHECK: [[lane:%.+]] = gpu.lane_id
  // CHECK: [[row:%.+]] = affine.apply [[$rowC0_map]]()[{{%.+}}]
  // CHECK: [[col:%.+]] = affine.apply [[$colC0_map]]()[{{%.+}}]
  // CHECK: vector.load %arg2[[[row]], [[col]]] : memref<128x128xi32>, vector<2xi32>

  // CHECK: [[row:%.+]] = affine.apply [[$rowC8_map]]()[{{%.+}}]
  // CHECK: [[col:%.+]] = affine.apply [[$colC0_map]]()[{{%.+}}]
  // CHECK: vector.load %arg2[[[row]], [[col]]] : memref<128x128xi32>, vector<2xi32>
  // CHECK-NOT: vector.load

  %A = vector.transfer_read %arg0[%c0, %c0], %cst {in_bounds = [true, true]} : memref<128x128xi4, #gpu.address_space<workgroup>>, vector<16x64xi4>
  %B = vector.transfer_read %arg1[%c0, %c0], %cst {in_bounds = [true, true]} : memref<128x128xi4, #gpu.address_space<workgroup>>, vector<8x64xi4>
  %C = vector.transfer_read %arg2[%c0, %c0], %cst0 {in_bounds = [true, true]} : memref<128x128xi32>, vector<16x8xi32>
  // CHECK: [[d:%.+]] = nvgpu.mma.sync({{.*}}) {mmaShape = [16, 8, 64]} : (vector<4x8xi4>, vector<2x8xi4>, vector<2x2xi32>) -> vector<2x2xi32>
  %D = vector.contract {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %A, %B, %C : vector<16x64xi4>, vector<8x64xi4> into vector<16x8xi32>

  // CHECK: [[lane:%.+]] = gpu.lane_id
  // CHECK: [[v:%.+]] = vector.extract [[d]][0] : vector<2xi32> from vector<2x2xi32>
  // CHECK: [[row:%.+]] = affine.apply [[$rowC0_map]]()[[[lane]]]
  // CHECK: [[col:%.+]] = affine.apply [[$colC0_map]]()[[[lane]]]
  // CHECK: vector.store [[v]], %arg2[[[row]], [[col]]] : memref<128x128xi32>, vector<2xi32>

  // CHECK: [[v:%.+]] = vector.extract [[d]][1] : vector<2xi32> from vector<2x2xi32>
  // CHECK: [[row:%.+]] = affine.apply [[$rowC8_map]]()[[[lane]]]
  // CHECK: [[col:%.+]] = affine.apply [[$colC0_map]]()[[[lane]]]
  // CHECK: vector.store [[v]], %arg2[[[row]], [[col]]] : memref<128x128xi32>, vector<2xi32>
  vector.transfer_write %D, %arg2[%c0, %c0] {in_bounds = [true, true]} : vector<16x8xi32>, memref<128x128xi32>
  return
}

// -----

//#########################################################
// INT8 row-col-row
//#########################################################
// Affine maps for loading operandA and operandB
// maps (laneid -> coordinate pointed by the lane in the ldmatrix operand tile)
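// Informal note: each ldmatrix tile row is 128 bits wide, i.e. 16 i8 elements,
// which is why the contiguous coordinate below advances in multiples of 16
// (compare the multiples of 32 for i4 and 8 for f16 in the tests above).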
// CHECK-DAG: [[$strided_ldmatrix_x4_map:#.+]] = affine_map<()[s0] -> (s0 mod 16)>
// CHECK-DAG: [[$contiguous_ldmatrix_x4_map:#.+]] = affine_map<()[s0] -> ((s0 floordiv 16) * 16)>
// CHECK-DAG: [[$strided_ldmatrix_x2_map:#.+]] = affine_map<()[s0] -> (s0 mod 8)>
// CHECK-DAG: [[$contiguous_ldmatrix_x2_map:#.+]] = affine_map<()[s0] -> ((s0 floordiv 8) * 16)>

// Affine maps for accumulator registers
// maps (laneid -> coordinate pointed by the lane in accumulator register tile)
// CHECK-DAG: [[$rowC0_map:#.+]] = affine_map<()[s0] -> (s0 floordiv 4)>
// CHECK-DAG: [[$colC0_map:#.+]] = affine_map<()[s0] -> (s0 * 2 - (s0 floordiv 4) * 8)>
// CHECK-DAG: [[$rowC8_map:#.+]] = affine_map<()[s0] -> (s0 floordiv 4 + 8)>


#map0 = affine_map<(d0, d1) -> (d1, d0)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d2)>
#map2 = affine_map<(d0, d1, d2) -> (d1, d2)>
#map3 = affine_map<(d0, d1, d2) -> (d0, d1)>

// CHECK-LABEL: func @m16n8k32_int8_row_col_row
func.func @m16n8k32_int8_row_col_row(%arg0: memref<128x128xi8, #gpu.address_space<workgroup>>, %arg1: memref<128x128xi8, #gpu.address_space<workgroup>>, %arg2: memref<128x128xi32>) {
  %cst_0 = arith.constant dense<0> : vector<32x8xi8>
  %c0 = arith.constant 0 : index
  %cst = arith.constant 0 : i8
  %cst0 = arith.constant 0 : i32

  // CHECK: [[lane:%.+]] = gpu.lane_id
  // CHECK: [[m_coord:%.+]] = affine.apply [[$strided_ldmatrix_x4_map]]()[[[lane]]]
  // CHECK: [[k_coord:%.+]] = affine.apply [[$contiguous_ldmatrix_x4_map]]()[[[lane]]]
  // CHECK: nvgpu.ldmatrix %arg0[[[m_coord]], [[k_coord]]] {numTiles = 4 : i32, transpose = false} : memref<128x128xi8, #gpu.address_space<workgroup>> -> vector<4x4xi8>

  // CHECK: [[lane:%.+]] = gpu.lane_id
  // CHECK: [[n_coord:%.+]] = affine.apply [[$strided_ldmatrix_x2_map]]()[[[lane]]]
  // CHECK: [[k_coord:%.+]] = affine.apply [[$contiguous_ldmatrix_x2_map]]()[[[lane]]]
  // CHECK: nvgpu.ldmatrix %arg1[[[n_coord]], [[k_coord]]] {numTiles = 2 : i32, transpose = false} : memref<128x128xi8, #gpu.address_space<workgroup>> -> vector<2x4xi8>

  // CHECK: [[lane:%.+]] = gpu.lane_id
  // CHECK: [[m_coord:%.+]] = affine.apply [[$rowC0_map]]()[[[lane]]]
  // CHECK: [[n_coord:%.+]] = affine.apply [[$colC0_map]]()[[[lane]]]
  // CHECK: vector.load %arg2[[[m_coord]], [[n_coord]]] : memref<128x128xi32>, vector<2xi32>
  // CHECK: [[m_coord:%.+]] = affine.apply [[$rowC8_map]]()[[[lane]]]
  // CHECK: [[n_coord:%.+]] = affine.apply [[$colC0_map]]()[[[lane]]]
  // CHECK: vector.load %arg2[[[m_coord]], [[n_coord]]] : memref<128x128xi32>, vector<2xi32>
  // CHECK-NOT: vector.load %arg2{{.*}}

  %A = vector.transfer_read %arg0[%c0, %c0], %cst {in_bounds = [true, true]} : memref<128x128xi8, #gpu.address_space<workgroup>>, vector<16x32xi8>
  %B = vector.transfer_read %arg1[%c0, %c0], %cst {in_bounds = [true, true]} : memref<128x128xi8, #gpu.address_space<workgroup>>, vector<8x32xi8>
  %C = vector.transfer_read %arg2[%c0, %c0], %cst0 {in_bounds = [true, true]} : memref<128x128xi32>, vector<16x8xi32>
  // CHECK: [[d:%.+]] = nvgpu.mma.sync({{.*}}) {mmaShape = [16, 8, 32]} : (vector<4x4xi8>, vector<2x4xi8>, vector<2x2xi32>) -> vector<2x2xi32>
  %D = vector.contract {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %A, %B, %C : vector<16x32xi8>, vector<8x32xi8> into vector<16x8xi32>

  // CHECK: [[lane:%.+]] = gpu.lane_id
  // CHECK: [[v:%.+]] = vector.extract [[d]][0] : vector<2xi32> from vector<2x2xi32>
  // CHECK: [[row:%.+]] = affine.apply [[$rowC0_map]]()[[[lane]]]
  // CHECK: [[col:%.+]] = affine.apply [[$colC0_map]]()[[[lane]]]
  // CHECK: vector.store [[v]], %arg2[[[row]], [[col]]] : memref<128x128xi32>, vector<2xi32>
  // CHECK: [[v:%.+]] = vector.extract [[d]][1] : vector<2xi32> from vector<2x2xi32>
  // CHECK: [[row:%.+]] = affine.apply [[$rowC8_map]]()[[[lane]]]
  // CHECK: [[col:%.+]] = affine.apply [[$colC0_map]]()[[[lane]]]
  // CHECK: vector.store [[v]], %arg2[[[row]], [[col]]] : memref<128x128xi32>, vector<2xi32>
  vector.transfer_write %D, %arg2[%c0, %c0] {in_bounds = [true, true]} : vector<16x8xi32>, memref<128x128xi32>
  return
}

// -----


#map0 = affine_map<(d0, d1) -> (d1, d0)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d2)>
#map2 = affine_map<(d0, d1, d2) -> (d1, d2)>
#map3 = affine_map<(d0, d1, d2) -> (d0, d1)>
!smem_type = memref<20x20xf16, strided<[?, 1], offset: ?>, #gpu.address_space<workgroup>>

// This test case is identical to the m16n8k16 test case, but it tests that
// a row dimension with an unknown stride is handled correctly.

// CHECK-DAG: [[$strided_map:#.+]] = affine_map<()[s0] -> (s0 mod 16)>
// CHECK-DAG: [[$contiguous_map:#.+]] = affine_map<()[s0] -> ((s0 floordiv 16) * 8)>
// CHECK-LABEL: func @strided_memref_read_write
func.func @strided_memref_read_write(%arg0: !smem_type,
                                     %arg1: !smem_type,
                                     %arg2: !smem_type) {
  %cst_0 = arith.constant dense<0.000000e+00> : vector<16x8xf16>
  %c0 = arith.constant 0 : index
  %cst = arith.constant 0.000000e+00 : f16

  // CHECK-DAG: [[m_coord:%.+]] = affine.apply [[$strided_map]]
  // CHECK-DAG: [[k_coord:%.+]] = affine.apply [[$contiguous_map]]
  // CHECK: nvgpu.ldmatrix %arg0[[[m_coord]], [[k_coord]]] {numTiles = 4 : i32, transpose = false}
  // CHECK-DAG: [[n_coord:%.+]] = affine.apply [[$contiguous_map]]
  // CHECK-DAG: [[k_coord:%.+]] = affine.apply [[$strided_map]]
  // CHECK: nvgpu.ldmatrix %arg1[[[k_coord]], [[n_coord]]] {numTiles = 2 : i32, transpose = true}
  %A = vector.transfer_read %arg0[%c0, %c0], %cst {in_bounds = [true, true]} : !smem_type, vector<16x16xf16>
  %B = vector.transfer_read %arg1[%c0, %c0], %cst {permutation_map = #map0, in_bounds = [true, true]} : !smem_type, vector<8x16xf16>
  %C = vector.transfer_read %arg2[%c0, %c0], %cst {in_bounds = [true, true]} : !smem_type, vector<16x8xf16>
  %D = vector.contract {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>}
    %A, %B, %C : vector<16x16xf16>, vector<8x16xf16> into vector<16x8xf16>
  vector.transfer_write %D, %arg2[%c0, %c0] {in_bounds = [true, true]} : vector<16x8xf16>, !smem_type
  return
}

// -----


#map0 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>
#map2 = affine_map<(d0, d1, d2, d3) -> (d2, d0, d3)>
#map3 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>
!smem_type = memref<20x20x20xf16, strided<[?, ?, 1], offset: ?>, #gpu.address_space<workgroup>>

// CHECK-LABEL: func @unsupported_non_2d_load_store
func.func @unsupported_non_2d_load_store(%arg0: !smem_type,
                                         %arg1: !smem_type,
                                         %arg2: !smem_type) {
  %cst_0 = arith.constant dense<0.000000e+00> : vector<16x8xf16>
  %c0 = arith.constant 0 : index
  %cst = arith.constant 0.000000e+00 : f16

  // CHECK-NOT: nvgpu.ldmatrix
  // CHECK-NOT: nvgpu.mma
  %A = vector.transfer_read %arg0[%c0, %c0, %c0], %cst {in_bounds = [true, true, true]} : !smem_type, vector<1x16x16xf16>
  %B = vector.transfer_read %arg1[%c0, %c0, %c0], %cst {permutation_map = #map0, in_bounds = [true, true, true]} : !smem_type, vector<8x1x16xf16>
  %C = vector.transfer_read %arg2[%c0, %c0, %c0], %cst {in_bounds = [true, true, true]} : !smem_type, vector<1x16x8xf16>
  %D = vector.contract {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "reduction"], kind = #vector.kind<add>}
    %A, %B, %C : vector<1x16x16xf16>, vector<8x1x16xf16> into vector<1x16x8xf16>
  vector.transfer_write %D, %arg2[%c0, %c0, %c0] {in_bounds = [true, true, true]} : vector<1x16x8xf16>, !smem_type
  return
}

// -----

#map0 = affine_map<(d0, d1) -> (d1, d0)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d2)>
#map2 = affine_map<(d0, d1, d2) -> (d1, d2)>
#map3 = affine_map<(d0, d1, d2) -> (d0, d1)>

!smem_type = memref<20x20xf16, strided<[?, ?], offset: ?>, #gpu.address_space<workgroup>>

// CHECK-LABEL: func @unsupported_fully_dynamic_strides
func.func @unsupported_fully_dynamic_strides(%arg0: !smem_type,
                                             %arg1: !smem_type,
                                             %arg2: !smem_type) {
  %cst_0 = arith.constant dense<0.000000e+00> : vector<16x8xf16>
  %c0 = arith.constant 0 : index
  %cst = arith.constant 0.000000e+00 : f16

  // CHECK-NOT: nvgpu.ldmatrix
  // CHECK-NOT: nvgpu.mma
  %A = vector.transfer_read %arg0[%c0, %c0], %cst {in_bounds = [true, true]} : !smem_type, vector<16x16xf16>
  %B = vector.transfer_read %arg1[%c0, %c0], %cst {permutation_map = #map0, in_bounds = [true, true]} : !smem_type, vector<8x16xf16>
  %C = vector.transfer_read %arg2[%c0, %c0], %cst {in_bounds = [true, true]} : !smem_type, vector<16x8xf16>
  %D = vector.contract {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>}
    %A, %B, %C : vector<16x16xf16>, vector<8x16xf16> into vector<16x8xf16>
  vector.transfer_write %D, %arg2[%c0, %c0] {in_bounds = [true, true]} : vector<16x8xf16>, !smem_type
  return
}

// -----

#map0 = affine_map<(d0, d1) -> (d1, d0)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d2)>
#map2 = affine_map<(d0, d1, d2) -> (d1, d2)>
#map3 = affine_map<(d0, d1, d2) -> (d0, d1)>


!smem_type = memref<20x20xf16, strided<[?, 1], offset: ?>, #gpu.address_space<workgroup>>

// CHECK-LABEL: func @unsupported_transposed_store
func.func @unsupported_transposed_store(%arg0: !smem_type,
                                        %arg1: !smem_type,
                                        %arg2: !smem_type) {
  %cst_0 = arith.constant dense<0.000000e+00> : vector<16x8xf16>
  %c0 = arith.constant 0 : index
  %cst = arith.constant 0.000000e+00 : f16

  // CHECK-NOT: nvgpu.ldmatrix
  // CHECK-NOT: nvgpu.mma
  %A = vector.transfer_read %arg0[%c0, %c0], %cst {in_bounds = [true, true]} : !smem_type, vector<16x16xf16>
  %B = vector.transfer_read %arg1[%c0, %c0], %cst {permutation_map = #map0, in_bounds = [true, true]} : !smem_type, vector<8x16xf16>
  %C = vector.transfer_read %arg2[%c0, %c0], %cst {in_bounds = [true, true]} : !smem_type, vector<16x8xf16>
  %D = vector.contract {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>}
    %A, %B, %C : vector<16x16xf16>, vector<8x16xf16> into vector<16x8xf16>
  vector.transfer_write %D, %arg2[%c0, %c0] {in_bounds = [true, true], permutation_map = affine_map<(d0, d1)->(d1, d0)>} : vector<16x8xf16>, !smem_type
  return
}