// RUN: mlir-opt %s --transform-interpreter -canonicalize -cse -split-input-file -verify-diagnostics | FileCheck %s

// Offset per thread:
// CHECK-DAG: affine_map<(d0)[s0] -> (d0 * (s0 ceildiv 10))>
// Per thread tile size.
// CHECK-DAG: affine_map<(d0)[s0] -> (-(d0 * (s0 ceildiv 10)) + s0, s0 ceildiv 10)>
// CHECK-DAG: affine_map<(d0)[s0] -> (d0 * (s0 ceildiv 20))>
// CHECK-DAG: affine_map<(d0)[s0] -> (-(d0 * (s0 ceildiv 20)) + s0, s0 ceildiv 20)>

module {
// CHECK-LABEL: matmul(
// CHECK-SAME:    %[[A:[0-9a-z]+]]: tensor<?x?xf32>
// CHECK-SAME:    %[[B:[0-9a-z]+]]: tensor<?x?xf32>
// CHECK-SAME:    %[[C:[0-9a-z]+]]: tensor<?x?xf32>
  func.func @matmul(%A: tensor<?x?xf32>, %B: tensor<?x?xf32>, %C: tensor<?x?xf32>) -> tensor<?x?xf32> {
    // CHECK: scf.forall ({{.*}}) in (10, 20) shared_outs(%[[C_BLK:.*]] = %[[C]]) -> (tensor<?x?xf32>) {
    // CHECK:   %[[tA:.*]] = tensor.extract_slice %[[A]]{{.*}} : tensor<?x?xf32> to tensor<?x?xf32>
    // CHECK:   %[[tB:.*]] = tensor.extract_slice %[[B]]{{.*}} : tensor<?x?xf32> to tensor<?x?xf32>
    // CHECK:   %[[tC:.*]] = tensor.extract_slice %[[C_BLK]]{{.*}} : tensor<?x?xf32> to tensor<?x?xf32>
    // CHECK:   %[[RES:.*]] = linalg.matmul
    // CHECK-SAME:    ins(%[[tA]], %[[tB]] : tensor<?x?xf32>, tensor<?x?xf32>)
    // CHECK-SAME:    outs(%[[tC]] : tensor<?x?xf32>) -> tensor<?x?xf32>
    // CHECK:   scf.forall.in_parallel {
    // CHECK-NEXT:    tensor.parallel_insert_slice %[[RES]] into %[[C_BLK]]{{.*}} :
    // CHECK-SAME:      tensor<?x?xf32> into tensor<?x?xf32>
    // CHECK-NEXT:  }
    // CHECK-NEXT: } {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
    %0 = linalg.matmul ins(%A, %B : tensor<?x?xf32>, tensor<?x?xf32>)
                       outs(%C : tensor<?x?xf32>) -> (tensor<?x?xf32>)
    return %0 : tensor<?x?xf32>
  }

  module attributes {transform.with_named_sequence} {
    transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
      %0 = transform.structured.match ops{["linalg.matmul"]} in %arg1 : (!transform.any_op) -> !transform.any_op
      %1:2 = transform.structured.tile_using_forall %0 num_threads [10, 20] (mapping = [ #gpu.thread<y>, #gpu.thread<x> ] )
            : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
      transform.yield
    }
  }
}

// -----

module {
  // CHECK-LABEL: func @matmul_memref(
  // CHECK:       scf.forall (%{{.*}}, %{{.*}}) in (10, 20) {
  // CHECK:         memref.subview
  // CHECK:         memref.subview
  // CHECK:         memref.subview
  // CHECK:         linalg.matmul
  // CHECK:       } {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
  func.func @matmul_memref(%A: memref<?x?xf32>, %B: memref<?x?xf32>, %C: memref<?x?xf32>) {
    linalg.matmul ins(%A, %B : memref<?x?xf32>, memref<?x?xf32>)
                  outs(%C : memref<?x?xf32>)
    return
  }

  module attributes {transform.with_named_sequence} {
    transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
      %0 = transform.structured.match ops{["linalg.matmul"]} in %arg1 : (!transform.any_op) -> !transform.any_op
      %1:2 = transform.structured.tile_using_forall %0 num_threads [10, 20] (mapping = [ #gpu.thread<y>, #gpu.thread<x> ] )
            : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
      transform.yield
    }
  }
}

// -----

module {
  // CHECK-LABEL: func @copy_memref(
  // CHECK:       scf.forall (%{{.*}}, %{{.*}}) in (10, 20) {
  // CHECK:         memref.subview
  // CHECK:         memref.subview
  // CHECK:         linalg.copy
  // CHECK:       } {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
  func.func @copy_memref(%A: memref<?x?xf32>, %B: memref<?x?xf32>) {
    linalg.copy ins(%A: memref<?x?xf32>)
                outs(%B : memref<?x?xf32>)
    return
  }

  module attributes {transform.with_named_sequence} {
    transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
      %0 = transform.structured.match ops{["linalg.copy"]} in %arg1 : (!transform.any_op) -> !transform.any_op
      %1:2 = transform.structured.tile_using_forall %0 num_threads [10, 20] (mapping = [ #gpu.thread<y>, #gpu.thread<x> ] )
            : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
      transform.yield
    }
  }
}

// -----

// In this test case, matmul dims and tile size are dynamic.

// CHECK-DAG: #[[$map0:.+]] = affine_map<()[s0, s1] -> (s0 ceildiv s1)>
// CHECK-DAG: #[[$map2:.+]] = affine_map<(d0)[s0, s1] -> (-(d0 * s1) + s0, s1)>
// CHECK-DAG: #[[$map4:.+]] = affine_map<(d0)[s0] -> (d0 * s0)>

// CHECK-LABEL: matmul_tile_size_dynamic_dynamic(
// CHECK-SAME:    %[[A:[0-9a-z]+]]: tensor<?x?xf32>
// CHECK-SAME:    %[[B:[0-9a-z]+]]: tensor<?x?xf32>
// CHECK-SAME:    %[[C:[0-9a-z]+]]: tensor<?x?xf32>
func.func @matmul_tile_size_dynamic_dynamic(%A: tensor<?x?xf32>, %B: tensor<?x?xf32>, %C: tensor<?x?xf32>) -> tensor<?x?xf32> {
  // CHECK-DAG: %[[c0:.*]] = arith.constant 0 : index
  // CHECK-DAG: %[[c1:.*]] = arith.constant 1 : index
  // CHECK-DAG: %[[tile_size_1:.*]] = "test.dummy"()
  // CHECK-DAG: %[[tile_size_2:.*]] = "test.dummy"()
  // CHECK-DAG: %[[M:.+]] = tensor.dim %[[A]], %[[c0]] :
  // CHECK-DAG: %[[N:.+]] = tensor.dim %[[B]], %c1 :
  // CHECK-DAG: %[[NT0:.+]] = affine.apply #[[$map0]]()[%[[M]], %[[tile_size_1]]]
  // CHECK-DAG: %[[NT1:.+]] = affine.apply #[[$map0]]()[%[[N]], %[[tile_size_2]]]
  // CHECK: scf.forall (%[[IV0:.+]], %[[IV1:.+]]) in (%[[NT0]], %[[NT1]]) shared_outs(%[[C_BLK:.*]] = %[[C]])
  // CHECK:   tensor.extract_slice %[[A]]
  // CHECK:   tensor.extract_slice %[[B]]
  // CHECK:   tensor.extract_slice %[[C_BLK]]
  // CHECK:   linalg.matmul
  // CHECK:   scf.forall.in_parallel
  // CHECK-NEXT:   tensor.parallel_insert_slice
  %tile_size_1 = "test.dummy"() : () -> (index)
  %tile_size_2 = "test.dummy"() : () -> (index)
  %0 = linalg.matmul ins(%A, %B : tensor<?x?xf32>, tensor<?x?xf32>)
                     outs(%C : tensor<?x?xf32>) -> (tensor<?x?xf32>)
  return %0 : tensor<?x?xf32>
}

module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
    %0 = transform.structured.match ops{["linalg.matmul"]} in %arg1 : (!transform.any_op) -> !transform.any_op
    %sz = transform.structured.match ops{["test.dummy"]} in %arg1 : (!transform.any_op) -> !transform.any_op
    %1:2 = transform.structured.tile_using_forall %0 tile_sizes *(%sz)
          : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
    transform.yield
  }
}

// -----

// Tests that dimension 0 can eliminate affine.min/max, dimension 1 cannot.

// CHECK-DAG: #[[$map0:.+]] = affine_map<(d0) -> (d0 * -15 + 300, 15)>
// CHECK-DAG: #[[$map1:.+]] = affine_map<(d0) -> (0, d0)>
// CHECK-DAG: #[[$map2:.+]] = affine_map<(d0) -> (d0 * 10)>
// CHECK-DAG: #[[$map3:.+]] = affine_map<(d0) -> (d0 * 15)>

// CHECK-LABEL: matmul_static(
// CHECK-SAME:    %[[A:[0-9a-z]+]]: tensor
// CHECK-SAME:    %[[B:[0-9a-z]+]]: tensor
// CHECK-SAME:    %[[C:[0-9a-z]+]]: tensor
func.func @matmul_static(%A: tensor<100x200xf32>, %B: tensor<200x300xf32>, %C: tensor<100x300xf32>) -> tensor<100x300xf32> {
  // CHECK: scf.forall (%[[IV0:.+]], %[[IV1:.+]]) in (10, 21) shared_outs(%[[C_BLK:.*]] = %[[C]])
  // CHECK:   %[[TSMIN:.+]] = affine.min #[[$map0]](%[[IV1]])
  // CHECK:   %[[TS:.+]] = affine.max #[[$map1]](%[[TSMIN]])
  // CHECK-NOT:   affine.min
  // CHECK-NOT:   affine.max
  // CHECK:   %[[LB0:.+]] = affine.apply #[[$map2]](%[[IV0]])
  // CHECK:   %[[LB1:.+]] = affine.apply #[[$map3]](%[[IV1]])
  // CHECK:   %[[tA:.+]] = tensor.extract_slice %[[A]][%[[LB0]], 0] [10, 200] [1, 1] :
  // CHECK:   %[[tB:.+]] = tensor.extract_slice %[[B]][0, %[[LB1]]] [200, %[[TS]]] [1, 1] :
  // CHECK:   %[[tC:.+]] = tensor.extract_slice %[[C_BLK]][%[[LB0]], %[[LB1]]] [10, %[[TS]]] [1, 1] :
  // CHECK:   linalg.matmul
  // CHECK:   scf.forall.in_parallel
  // CHECK-NEXT:   tensor.parallel_insert_slice
  %0 = linalg.matmul ins(%A, %B : tensor<100x200xf32>, tensor<200x300xf32>)
                     outs(%C : tensor<100x300xf32>) -> (tensor<100x300xf32>)
  return %0 : tensor<100x300xf32>
}

module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
    %0 = transform.structured.match ops{["linalg.matmul"]} in %arg1 : (!transform.any_op) -> !transform.any_op
    %1:2 = transform.structured.tile_using_forall %0 num_threads [10, 21]
          : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
    transform.yield
  }
}

// -----

// CHECK-DAG: #[[$map0:.+]] = affine_map<()[s0] -> (s0 ceildiv 10)>
// CHECK-DAG: #[[$map1:.+]] = affine_map<()[s0] -> (s0 ceildiv 20)>
// CHECK-DAG: #[[$map2:.+]] = affine_map<(d0)[s0] -> (d0 * -10 + s0, 10)>
// CHECK-DAG: #[[$map4:.+]] = affine_map<(d0)[s0] -> (d0 * -20 + s0, 20)>
// CHECK-DAG: #[[$map5:.+]] = affine_map<(d0) -> (d0 * 10)>
// CHECK-DAG: #[[$map6:.+]] = affine_map<(d0) -> (d0 * 20)>

// CHECK-LABEL: matmul_tile_size_dynamic(
// CHECK-SAME:    %[[A:[0-9a-z]+]]: tensor<?x?xf32>
// CHECK-SAME:    %[[B:[0-9a-z]+]]: tensor<?x?xf32>
// CHECK-SAME:    %[[C:[0-9a-z]+]]: tensor<?x?xf32>
func.func @matmul_tile_size_dynamic(%A: tensor<?x?xf32>, %B: tensor<?x?xf32>, %C: tensor<?x?xf32>) -> tensor<?x?xf32> {
  // CHECK: %[[M:.+]] = tensor.dim %[[A]], %c0 :
  // CHECK: %[[N:.+]] = tensor.dim %[[B]], %c1 :
  // CHECK: %[[NT0:.+]] = affine.apply #[[$map0]]()[%[[M]]]
  // CHECK: %[[NT1:.+]] = affine.apply #[[$map1]]()[%[[N]]]
  // CHECK: scf.forall (%[[IV0:.+]], %[[IV1:.+]]) in (%[[NT0]], %[[NT1]]) shared_outs(%[[C_BLK:.*]] = %[[C]])
  // CHECK-DAG:   %[[TS0:.+]] = affine.min #[[$map2]](%[[IV0]])[%[[M]]]
  // CHECK-DAG:   %[[TS1:.+]] = affine.min #[[$map4]](%[[IV1]])[%[[N]]]
  // CHECK-DAG:   %[[LB0:.+]] = affine.apply #[[$map5]](%[[IV0]])
  // CHECK-DAG:   %[[LB1:.+]] = affine.apply #[[$map6]](%[[IV1]])
  // CHECK:   tensor.extract_slice %[[A]]
  // CHECK:   tensor.extract_slice %[[B]]
  // CHECK:   tensor.extract_slice %[[C_BLK]]
  // CHECK:   linalg.matmul
  // CHECK:   scf.forall.in_parallel
  // CHECK-NEXT:   tensor.parallel_insert_slice
  %0 = linalg.matmul ins(%A, %B : tensor<?x?xf32>, tensor<?x?xf32>)
                     outs(%C : tensor<?x?xf32>) -> (tensor<?x?xf32>)
  return %0 : tensor<?x?xf32>
}

module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
    %0 = transform.structured.match ops{["linalg.matmul"]} in %arg1 : (!transform.any_op) -> !transform.any_op
    %1:2 = transform.structured.tile_using_forall %0 tile_sizes [10, 20]
          : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
    transform.yield
  }
}
// -----

// Tests that dimension 0 can eliminate affine.min/max, dimension 1 cannot.

// CHECK-DAG: #[[$map0:.+]] = affine_map<(d0) -> (d0 * -21 + 300, 21)>
// CHECK-DAG: #[[$map2:.+]] = affine_map<(d0) -> (d0 * 10)>
// CHECK-DAG: #[[$map3:.+]] = affine_map<(d0) -> (d0 * 21)>

// CHECK-LABEL: matmul_tile_size_static(
// CHECK-SAME:    %[[A:[0-9a-z]+]]: tensor
// CHECK-SAME:    %[[B:[0-9a-z]+]]: tensor
// CHECK-SAME:    %[[C:[0-9a-z]+]]: tensor
func.func @matmul_tile_size_static(%A: tensor<100x200xf32>, %B: tensor<200x300xf32>, %C: tensor<100x300xf32>) -> tensor<100x300xf32> {
  // CHECK: scf.forall (%[[IV0:.+]], %[[IV1:.+]]) in (10, 15) shared_outs(%[[C_BLK:.*]] = %[[C]])
  // CHECK-DAG:   %[[TS:.+]] = affine.min #[[$map0]](%[[IV1]])
  // CHECK-DAG:   %[[LB0:.+]] = affine.apply #[[$map2]](%[[IV0]])
  // CHECK-DAG:   %[[LB1:.+]] = affine.apply #[[$map3]](%[[IV1]])
  // CHECK-NOT:   affine.max
  // CHECK-NOT:   affine.min
  // CHECK:   %[[tA:.+]] = tensor.extract_slice %[[A]][%[[LB0]], 0] [10, 200] [1, 1] :
  // CHECK:   %[[tB:.+]] = tensor.extract_slice %[[B]][0, %[[LB1]]] [200, %[[TS]]] [1, 1] :
  // CHECK:   %[[tC:.+]] = tensor.extract_slice %[[C_BLK]][%[[LB0]], %[[LB1]]] [10, %[[TS]]] [1, 1] :
  // CHECK:   linalg.matmul
  // CHECK:   scf.forall.in_parallel
  // CHECK-NEXT:   tensor.parallel_insert_slice
  %0 = linalg.matmul ins(%A, %B : tensor<100x200xf32>, tensor<200x300xf32>)
                     outs(%C : tensor<100x300xf32>) -> (tensor<100x300xf32>)
  return %0 : tensor<100x300xf32>
}

module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
    %0 = transform.structured.match ops{["linalg.matmul"]} in %arg1 : (!transform.any_op) -> !transform.any_op
    %1:2 = transform.structured.tile_using_forall %0 tile_sizes [10, 21]
          : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
    transform.yield
  }
}

// -----

module {
  func.func @extract_source(%A: tensor<4xf32>, %B: tensor<16xf32>) -> tensor<4xf32> {
    %B1 = tensor.extract_slice %B[10] [4] [1] : tensor<16xf32> to tensor<4xf32>
    %result = linalg.generic {indexing_maps = [
      affine_map<(d0) -> (d0)>,affine_map<(d0) -> (d0)>],
      iterator_types = ["parallel"]}
      ins(%A : tensor<4xf32>) outs(%B1 : tensor<4xf32>) {
    ^bb0(%arg3: f32, %arg4: f32):  // no predecessors
      %2 = arith.addf %arg3, %arg3 : f32
      linalg.yield %2 : f32
    } -> tensor<4xf32>
    return %result : tensor<4xf32>
  }

  module attributes {transform.with_named_sequence} {
    transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
      %0 = transform.structured.match ops{["linalg.generic"]} in %arg1 : (!transform.any_op) -> !transform.any_op
      %1:2 = transform.structured.tile_using_forall %0 num_threads [2] ( mapping = [#gpu.thread<x>])
            : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
      transform.yield
    }
  }
}
// CHECK-DAG: #[[$map0:.+]] = affine_map<(d0) -> (d0 * 2)>

// CHECK-LABEL: extract_source(
// CHECK: scf.forall (%[[ARG:.*]]) in (2) shared_outs(%{{.*}} = %{{.*}}) -> (tensor<4xf32>) {
// CHECK:   %[[OFF:.*]] = affine.apply #[[$map0]](%[[ARG]])
// CHECK:   scf.forall.in_parallel {
// CHECK:     tensor.parallel_insert_slice %{{.*}} into %{{.*}}[%[[OFF]]] [2] [1] : tensor<2xf32> into tensor<4xf32>

// -----

// In this test case, matmul dims and tile size are dynamic.

// CHECK-DAG: #[[$map0:.+]] = affine_map<()[s0, s1] -> (s0 ceildiv s1)>
// CHECK-DAG: #[[$map1:.+]] = affine_map<()[s0] -> (s0 ceildiv 20)>
// CHECK-DAG: #[[$map2:.+]] = affine_map<(d0)[s0, s1] -> (-(d0 * s1) + s0, s1)>
// CHECK-DAG: #[[$map3:.+]] = affine_map<(d0)[s0] -> (d0 * -20 + s0, 20)>
// CHECK-DAG: #[[$map4:.+]] = affine_map<(d0)[s0] -> (d0 * s0)>
// CHECK-DAG: #[[$map5:.+]] = affine_map<(d0) -> (d0 * 20)>

// CHECK-LABEL: matmul_tile_size_dynamic_dynamic(
// CHECK-SAME:    %[[A:[0-9a-z]+]]: tensor<?x?xf32>
// CHECK-SAME:    %[[B:[0-9a-z]+]]: tensor<?x?xf32>
// CHECK-SAME:    %[[C:[0-9a-z]+]]: tensor<?x?xf32>
func.func @matmul_tile_size_dynamic_dynamic(%A: tensor<?x?xf32>, %B: tensor<?x?xf32>, %C: tensor<?x?xf32>) -> tensor<?x?xf32> {
  // CHECK-DAG: %[[c0:.*]] = arith.constant 0 : index
  // CHECK-DAG: %[[c1:.*]] = arith.constant 1 : index
  // CHECK-DAG: %[[tile_size:.*]] = "test.dummy"()
  // CHECK-DAG: %[[M:.+]] = tensor.dim %[[A]], %[[c0]] :
  // CHECK-DAG: %[[N:.+]] = tensor.dim %[[B]], %c1 :
  // CHECK-DAG: %[[NT0:.+]] = affine.apply #[[$map0]]()[%[[M]], %[[tile_size]]]
  // CHECK-DAG: %[[NT1:.+]] = affine.apply #[[$map1]]()[%[[N]]]
  // CHECK: scf.forall (%[[IV0:.+]], %[[IV1:.+]]) in (%[[NT0]], %[[NT1]]) shared_outs(%[[C_BLK:.*]] = %[[C]])
  // CHECK:   tensor.extract_slice %[[A]]
  // CHECK:   tensor.extract_slice %[[B]]
  // CHECK:   tensor.extract_slice %[[C_BLK]]
  // CHECK:   linalg.matmul
  // CHECK:   scf.forall.in_parallel
  // CHECK-NEXT:   tensor.parallel_insert_slice
  %tile_size = "test.dummy"() : () -> (index)
  %0 = linalg.matmul ins(%A, %B : tensor<?x?xf32>, tensor<?x?xf32>)
                     outs(%C : tensor<?x?xf32>) -> (tensor<?x?xf32>)
  return %0 : tensor<?x?xf32>
}

module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
    %0 = transform.structured.match ops{["linalg.matmul"]} in %arg1 : (!transform.any_op) -> !transform.any_op
    %sz = transform.structured.match ops{["test.dummy"]} in %arg1 : (!transform.any_op) -> !transform.any_op
    %1:2 = transform.structured.tile_using_forall %0 tile_sizes [%sz, 20]
          : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
    transform.yield
  }
}

// -----

// CHECK-DAG: #[[$map0:.+]] = affine_map<(d0) -> (d0 * -15 + 100, 15)>
// CHECK-DAG: #[[$map2:.+]] = affine_map<(d0) -> (d0 * 15)>
// CHECK-DAG: #[[$map3:.+]] = affine_map<(d0) -> (d0)>

// CHECK-LABEL: tile_output_multi_1d_static(
// CHECK-SAME:    %[[IN1:[0-9a-z]+]]: tensor<100xf32>
// CHECK-SAME:    %[[IN2:[0-9a-z]+]]: tensor<100xf32>
// CHECK-SAME:    %[[ORGOUT1:[0-9a-z]+]]: tensor<100xf32>
// CHECK-SAME:    %[[ORGOUT2:[0-9a-z]+]]: tensor<100xf32>
  func.func @tile_output_multi_1d_static(%IN1: tensor<100xf32>, %IN2: tensor<100xf32>,
                                         %OUT1: tensor<100xf32>, %OUT2: tensor<100xf32>)
                                         -> (tensor<100xf32>, tensor<100xf32>) {
// CHECK: scf.forall (%[[IV0:.+]]) in (7) shared_outs(%[[OUT1:[0-9a-z]+]] = %[[ORGOUT1]], %[[OUT2:[0-9a-z]+]] = %[[ORGOUT2]])
// CHECK:   %[[TS:.+]] = affine.min #[[$map0]](%[[IV0]])
// CHECK-NOT:   affine.min
// CHECK-NOT:   affine.max
// CHECK:   %[[LB:.+]] = affine.apply #[[$map2]](%[[IV0]])
// CHECK:   %[[tIN1:.+]] = tensor.extract_slice %[[IN1]][%[[LB]]] [%[[TS]]] [1] :
// CHECK:   %[[tIN2:.+]] = tensor.extract_slice %[[IN2]][%[[LB]]] [%[[TS]]] [1] :
// CHECK:   %[[tOUT1:.+]] = tensor.extract_slice %[[OUT1]][%[[LB]]] [%[[TS]]] [1] :
// CHECK:   %[[tOUT2:.+]] = tensor.extract_slice %[[OUT2]][%[[LB]]] [%[[TS]]] [1] :
// CHECK:   %[[RES1:[0-9]+]]:[[RES2:[0-9]+]] = linalg.generic
// CHECK:   scf.forall.in_parallel
// CHECK-NEXT:   tensor.parallel_insert_slice %[[RES1]]#0 into %[[OUT1]][%[[LB]]] [%[[TS]]] [1] :
// CHECK-NEXT:   tensor.parallel_insert_slice %[[RES1]]#1 into %[[OUT2]][%[[LB]]] [%[[TS]]] [1] :
    %res1, %res2 = linalg.generic
    {
      indexing_maps = [affine_map<(d0) -> (d0)>,
                       affine_map<(d0) -> (d0)>,
                       affine_map<(d0) -> (d0)>,
                       affine_map<(d0) -> (d0)>],
      iterator_types = ["parallel"]
    } ins(%IN1, %IN2 : tensor<100xf32>, tensor<100xf32>)
      outs(%OUT1, %OUT2 : tensor<100xf32>, tensor<100xf32>)
    {
      ^bb0(%a1: f32, %a2: f32, %a3: f32, %a4: f32):
        %1 = arith.addf %a1, %a3 : f32
        %2 = arith.addf %a2, %a4 : f32
        linalg.yield %1, %2 : f32,f32
    } -> (tensor<100xf32>, tensor<100xf32>)
    return %res1, %res2 : tensor<100xf32>, tensor<100xf32>
  }

  module attributes {transform.with_named_sequence} {
    transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
      %0 = transform.structured.match ops{["linalg.generic"]} in %arg1 : (!transform.any_op) -> !transform.any_op
      %tiled_generic, %forall = transform.structured.tile_using_forall %0 num_threads [7]
            : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
      transform.yield
    }
  }

// -----

// CHECK-DAG: #[[$map0:.+]] = affine_map<(d0) -> (d0 * 75)>
// CHECK-DAG: #[[$map1:.+]] = affine_map<(d0, d1) -> (d1)>
// CHECK-DAG: #[[$map2:.+]] = affine_map<(d0, d1) -> (d1, d0)
// CHECK-DAG: #[[$map3:.+]] = affine_map<(d0, d1) -> (d0)>
// CHECK-DAG: #[[$map4:.+]] = affine_map<(d0, d1) -> (d0, d1)>

// CHECK-LABEL: tile_output_multi_1d2d_static(
// CHECK-SAME:    %[[IN1:[0-9a-z]+]]: tensor<100xf32>
// CHECK-SAME:    %[[IN2:[0-9a-z]+]]: tensor<100x300xf32>
// CHECK-SAME:    %[[IN3:[0-9a-z]+]]: tensor<300xf32>
// CHECK-SAME:    %[[ORGOUT1:[0-9a-z]+]]: tensor<300x100xf32>
// CHECK-SAME:    %[[ORGOUT2:[0-9a-z]+]]: tensor<300xf32>
  func.func @tile_output_multi_1d2d_static(%IN1: tensor<100xf32>, %IN2: tensor<100x300xf32>, %IN3: tensor<300xf32>,
                                           %OUT1: tensor<300x100xf32>, %OUT2: tensor<300xf32>)
                                           -> (tensor<300x100xf32>, tensor<300xf32>) {
// CHECK: scf.forall (%[[IV0:.+]]) in (4) shared_outs(%[[OUT1:[0-9a-z]+]] = %[[ORGOUT1]], %[[OUT2:[0-9a-z]+]] = %[[ORGOUT2]])
// CHECK:   %[[LB:.+]] = affine.apply #[[$map0]](%[[IV0]])
// CHECK:   %[[tIN1:.+]] = tensor.extract_slice %[[IN2]][0, %[[LB]]] [100, 75]
// CHECK:   %[[tIN2:.+]] = tensor.extract_slice %[[IN3]][%[[LB]]] [75]
// CHECK:   %[[tOUT1:.+]] = tensor.extract_slice %[[OUT1]][%[[LB]], 0] [75, 100]
// CHECK:   %[[tOUT2:.+]] = tensor.extract_slice %[[OUT2]][%[[LB]]] [75]
// CHECK:   %[[RES1:[0-9]+]]:[[RES2:[0-9]+]] = linalg.generic
// CHECK:   scf.forall.in_parallel
// CHECK-NEXT:   tensor.parallel_insert_slice %[[RES1]]#0 into %[[OUT1]][%[[LB]], 0] [75, 100]
// CHECK-NEXT:   tensor.parallel_insert_slice %[[RES1]]#1 into %[[OUT2]][%[[LB]]] [75]
    %res2, %res3 = linalg.generic {
      indexing_maps = [affine_map<(d0,d1) -> (d1)>,
                       affine_map<(d0,d1) -> (d1,d0)>,
                       affine_map<(d0,d1) -> (d0)>,
                       affine_map<(d0,d1) -> (d0,d1)>,
                       affine_map<(d0,d1) -> (d0)>
                       ],
      iterator_types = ["parallel", "parallel"]
    } ins(%IN1, %IN2, %IN3 : tensor<100xf32>, tensor<100x300xf32>, tensor<300xf32>)
      outs(%OUT1, %OUT2: tensor<300x100xf32>, tensor<300xf32>) {
      ^bb0(%i1: f32, %i2: f32, %i3: f32, %o1: f32, %o2: f32):
        %1 = arith.addf %i1, %o1 : f32
        %2 = arith.addf %i2, %1 : f32
        %3 = arith.addf %i3, %2 : f32
        linalg.yield %3, %i3 : f32, f32
    } -> (tensor<300x100xf32>, tensor<300xf32>)

    return %res2, %res3 : tensor<300x100xf32>, tensor<300xf32>
  }

  module attributes {transform.with_named_sequence} {
    transform.named_sequence @__transform_main(%IN_MAT2: !transform.any_op {transform.readonly}) {
      %0 = transform.structured.match ops{["linalg.generic"]} in %IN_MAT2 : (!transform.any_op) -> !transform.any_op
      %tiled_generic, %forall = transform.structured.tile_using_forall %0 num_threads [4]
            : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
      transform.yield
    }
  }

// -----

// CHECK-DAG: #[[$map0:.+]] = affine_map<()[s0] -> (s0 ceildiv 10)>
// CHECK-DAG: #[[$map1:.+]] = affine_map<()[s0] -> (s0 ceildiv 20)>
// CHECK-DAG: #[[$map2:.+]] = affine_map<(d0)[s0] -> (d0 * -10 + s0, 10)>
// CHECK-DAG: #[[$map3:.+]] = affine_map<(d0)[s0] -> (d0 * -20 + s0, 20)>
// CHECK-DAG: #[[$map4:.+]] = affine_map<(d0) -> (d0 * 10)>
// CHECK-DAG: #[[$map5:.+]] = affine_map<(d0) -> (d0 * 20)>

// CHECK-LABEL: matmul_tile_size_dynamic(
// CHECK-SAME:    %[[A:[0-9a-z]+]]: tensor<?x?xf32>
// CHECK-SAME:    %[[B:[0-9a-z]+]]: tensor<?x?xf32>
// CHECK-SAME:    %[[C:[0-9a-z]+]]: tensor<?x?xf32>
func.func @matmul_tile_size_dynamic(%A: tensor<?x?xf32>, %B: tensor<?x?xf32>, %C: tensor<?x?xf32>) -> tensor<?x?xf32> {
  // CHECK: %[[c1:.*]] = arith.constant 1 : index
  // CHECK: %[[c0:.*]] = arith.constant 0 : index
  // CHECK-DAG: %[[M:.+]] = tensor.dim %[[A]], %[[c0]] :
  // CHECK-DAG: %[[N:.+]] = tensor.dim %[[B]], %[[c1]] :
  // CHECK-DAG: %[[NT0:.+]] = affine.apply #map()[%[[M]]]
  // CHECK-DAG: %[[NT1:.+]] = affine.apply #map1()[%[[N]]]
  // CHECK-DAG: %[[K:.+]] = tensor.dim %[[A]], %[[c1]] :
  // CHECK: scf.forall (%[[IV0:.+]], %[[IV1:.+]]) in (%[[NT0]], %[[NT1]]) shared_outs(%[[C_BLK:.*]] = %[[C]])
  // CHECK-DAG:   %[[TS0:.+]] = affine.min #[[$map2]](%[[IV0]])[%[[M]]]
  // CHECK-DAG:   %[[TS1:.+]] = affine.min #[[$map3]](%[[IV1]])[%[[N]]]
  // CHECK-DAG:   %[[LB0:.+]] = affine.apply #[[$map4]](%[[IV0]])
  // CHECK-DAG:   %[[LB1:.+]] = affine.apply #[[$map5]](%[[IV1]])
  // CHECK:   tensor.extract_slice %[[A]][%[[LB0]], 0] [%[[TS0]], %[[K]]] [1, 1] :
  // CHECK:   tensor.extract_slice %[[B]][0, %[[LB1]]] [%[[K]], %[[TS1]]] [1, 1] :
  // CHECK:   tensor.extract_slice %[[C_BLK]][%[[LB0]], %[[LB1]]] [%[[TS0]], %[[TS1]]] [1, 1] :
  // CHECK:   linalg.matmul
  // CHECK:   scf.forall.in_parallel
  // CHECK-NEXT:   tensor.parallel_insert_slice
  %0 = linalg.matmul ins(%A, %B : tensor<?x?xf32>, tensor<?x?xf32>)
                     outs(%C : tensor<?x?xf32>) -> (tensor<?x?xf32>)
  return %0 : tensor<?x?xf32>
}

module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
    %0 = transform.structured.match ops{["linalg.matmul"]} in %arg1 : (!transform.any_op) -> !transform.any_op
    %sz = transform.param.constant 10 : i64 -> !transform.param<i64>
    %1:2 = transform.structured.tile_using_forall %0 tile_sizes [%sz, 20]
          : (!transform.any_op, !transform.param<i64>) -> (!transform.any_op, !transform.any_op)
    transform.yield
  }
}

// -----

func.func @matmul_tile_size_dynamic(%A: tensor<?x?xf32>, %B: tensor<?x?xf32>, %C: tensor<?x?xf32>) -> tensor<?x?xf32> {
  %0 = linalg.matmul ins(%A, %B : tensor<?x?xf32>, tensor<?x?xf32>)
                     outs(%C : tensor<?x?xf32>) -> (tensor<?x?xf32>)
  return %0 : tensor<?x?xf32>
}

module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
    %0 = transform.structured.match ops{["linalg.matmul_transpose_b"]} in %arg1 : (!transform.any_op) -> !transform.any_op
    %c10 = transform.param.constant 10 : i64 -> !transform.param<i64>
    %c20 = transform.param.constant 20 : i64 -> !transform.param<i64>
    %sz = transform.merge_handles %c10, %c20 : !transform.param<i64>
    // expected-error @below {{requires exactly one parameter associated}}
    %1:2 = transform.structured.tile_using_forall %0 tile_sizes [%sz, 20]
          : (!transform.any_op, !transform.param<i64>) -> (!transform.any_op, !transform.any_op)
    transform.yield
  }
}

// -----

// CHECK-DAG: #[[$map0:.+]] = affine_map<()[s0] -> (s0 ceildiv 10)>
// CHECK-DAG: #[[$map1:.+]] = affine_map<()[s0] -> (s0 ceildiv 20)>
// CHECK-DAG: #[[$map2:.+]] = affine_map<(d0)[s0] -> (d0 * -10 + s0, 10)>
// CHECK-DAG: #[[$map3:.+]] = affine_map<(d0)[s0] -> (d0 * -20 + s0, 20)>
// CHECK-DAG: #[[$map4:.+]] = affine_map<(d0) -> (d0 * 10)>
// CHECK-DAG: #[[$map5:.+]] = affine_map<(d0) -> (d0 * 20)>

// CHECK-LABEL: matmul_tile_size_dynamic(
// CHECK-SAME:    %[[A:[0-9a-z]+]]: tensor<?x?xf32>
// CHECK-SAME:    %[[B:[0-9a-z]+]]: tensor<?x?xf32>
// CHECK-SAME:    %[[C:[0-9a-z]+]]: tensor<?x?xf32>
func.func @matmul_tile_size_dynamic(%A: tensor<?x?xf32>, %B: tensor<?x?xf32>, %C: tensor<?x?xf32>) -> tensor<?x?xf32> {
  // CHECK: %[[c1:.*]] = arith.constant 1 : index
  // CHECK: %[[c0:.*]] = arith.constant 0 : index
  // CHECK-DAG: %[[M:.+]] = tensor.dim %[[A]], %[[c0]] :
  // CHECK-DAG: %[[N:.+]] = tensor.dim %[[B]], %[[c1]] :
  // CHECK-DAG: %[[NT0:.+]] = affine.apply #map()[%[[M]]]
  // CHECK-DAG: %[[NT1:.+]] = affine.apply #map1()[%[[N]]]
  // CHECK-DAG: %[[K:.+]] = tensor.dim %[[A]], %[[c1]] :
  // CHECK: scf.forall (%[[IV0:.+]], %[[IV1:.+]]) in (%[[NT0]], %[[NT1]]) shared_outs(%[[C_BLK:.*]] = %[[C]])
  // CHECK-DAG:   %[[TS0:.+]] = affine.min #[[$map2]](%[[IV0]])[%[[M]]]
  // CHECK-DAG:   %[[TS1:.+]] = affine.min #[[$map3]](%[[IV1]])[%[[N]]]
  // CHECK-DAG:   %[[LB0:.+]] = affine.apply #[[$map4]](%[[IV0]])
  // CHECK-DAG:   %[[LB1:.+]] = affine.apply #[[$map5]](%[[IV1]])
  // CHECK:   tensor.extract_slice %[[A]][%[[LB0]], 0] [%[[TS0]], %[[K]]] [1, 1] :
  // CHECK:   tensor.extract_slice %[[B]][0, %[[LB1]]] [%[[K]], %[[TS1]]] [1, 1] :
  // CHECK:   tensor.extract_slice %[[C_BLK]][%[[LB0]], %[[LB1]]] [%[[TS0]], %[[TS1]]] [1, 1] :
  // CHECK:   linalg.matmul
  // CHECK:   scf.forall.in_parallel
  // CHECK-NEXT:   tensor.parallel_insert_slice
  %0 = linalg.matmul ins(%A, %B : tensor<?x?xf32>, tensor<?x?xf32>)
                     outs(%C : tensor<?x?xf32>) -> (tensor<?x?xf32>)
  return %0 : tensor<?x?xf32>
}

module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
    %0 = transform.structured.match ops{["linalg.matmul"]} in %arg1 : (!transform.any_op) -> !transform.any_op
    %c10 = transform.param.constant 10 : i64 -> !transform.any_param
    %c20 = transform.param.constant 20 : i64 -> !transform.any_param
    %sz = transform.merge_handles %c10, %c20 : !transform.any_param
    %1:2 = transform.structured.tile_using_forall %0 tile_sizes *(%sz)
          : (!transform.any_op, !transform.any_param) -> (!transform.any_op, !transform.any_op)
    transform.yield
  }
}

// -----

func.func @matmul_tile_size_dynamic(%A: tensor<?x?xf32>, %B: tensor<?x?xf32>, %C: tensor<?x?xf32>) -> tensor<?x?xf32> {
  %0 = linalg.matmul ins(%A, %B : tensor<?x?xf32>, tensor<?x?xf32>)
                     outs(%C : tensor<?x?xf32>) -> (tensor<?x?xf32>)
  return %0 : tensor<?x?xf32>
}

module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
    %0 = transform.structured.match ops{["linalg.matmul"]} in %arg1 : (!transform.any_op) -> !transform.any_op
    %sz = transform.param.constant "[10 : i64, 20 : i64]" -> !transform.any_param
    // expected-error @below {{expected the parameter to be associated with an integer attribute}}
    %1:2 = transform.structured.tile_using_forall %0 tile_sizes *(%sz)
          : (!transform.any_op, !transform.any_param) -> (!transform.any_op, !transform.any_op)
    transform.yield
  }
}

// -----

#map = affine_map<(d0, d1) -> (d0, d1)>
#map1 = affine_map<(d0, d1) -> (d0)>

func.func @tile_thread_safety1(%arg0: tensor<100x300xf32>, %arg1: tensor<100xf32>) -> tensor<100xf32> {
  // expected-warning@below {{tiling is not thread safe at axis #1}}
  %0 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "reduction"]} ins(%arg0 : tensor<100x300xf32>) outs(%arg1 : tensor<100xf32>) {
  ^bb0(%in: f32, %out: f32):
    %1 = arith.addf %in, %out : f32
    linalg.yield %1 : f32
  } -> tensor<100xf32>
  return %0 : tensor<100xf32>
}

module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
    %0 = transform.structured.match ops{["linalg.generic"]} in %arg0 : (!transform.any_op) -> !transform.any_op
    %forall, %tiled_generic = transform.structured.tile_using_forall %0 num_threads [4, 2]
          : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
    transform.yield
  }
}

// -----

#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d1, d2)>

func.func @tile_thread_safety2(%arg0: tensor<100x300x8xf32>, %arg1: tensor<300x8xf32>) -> tensor<300x8xf32> {
  // expected-warning@below {{tiling is not thread safe at axis #0}}
  %0 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["reduction", "parallel", "parallel"]} ins(%arg0 : tensor<100x300x8xf32>) outs(%arg1 : tensor<300x8xf32>) {
  ^bb0(%in: f32, %out: f32):
    %1 = arith.addf %in, %out : f32
    linalg.yield %1 : f32
  } -> tensor<300x8xf32>
  return %0 : tensor<300x8xf32>
}

module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
    %0 = transform.structured.match ops{["linalg.generic"]} in %arg0 : (!transform.any_op) -> !transform.any_op
    %forall, %tiled_generic = transform.structured.tile_using_forall %0 num_threads [8]
          : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
    transform.yield
  }
}

// -----

#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d2)>

func.func @tile_thread_safety3(%arg0: tensor<100x300x8xf32>, %arg1: tensor<100x8xf32>) -> tensor<100x8xf32> {
  // expected-warning@below {{tiling is not thread safe at axis #1}}
  %0 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "reduction", "parallel"]} ins(%arg0 : tensor<100x300x8xf32>) outs(%arg1 : tensor<100x8xf32>) {
  ^bb0(%in: f32, %out: f32):
    %1 = arith.addf %in, %out : f32
    linalg.yield %1 : f32
  } -> tensor<100x8xf32>
  return %0 : tensor<100x8xf32>
}

module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
    %0 = transform.structured.match ops{["linalg.generic"]} in %arg0 : (!transform.any_op) -> !transform.any_op
    %forall, %tiled_generic = transform.structured.tile_using_forall %0 num_threads [8, 4, 2]
          : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
    transform.yield
  }
}

// -----

#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d2)>
#map2 = affine_map<(d0, d1, d2) -> (d2)>

func.func @tile_thread_safety4(%arg0: tensor<100x300x8xf32>, %arg1: tensor<100x8xf32>, %arg2 : tensor<8xf32>) -> (tensor<100x8xf32>, tensor<8xf32>) {
  // expected-warning@+2 {{tiling is not thread safe at axis #0}}
  // expected-warning@below {{tiling is not thread safe at axis #1}}
  %0:2 = linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["reduction", "reduction", "parallel"]} ins(%arg0 : tensor<100x300x8xf32>) outs(%arg1, %arg2 : tensor<100x8xf32>, tensor<8xf32>) {
  ^bb0(%in: f32, %out1: f32, %out2: f32):
    %1 = arith.addf %in, %out1 : f32
    %2 = arith.addf %in, %out2 : f32
    linalg.yield %1, %2 : f32, f32
  } -> (tensor<100x8xf32>, tensor<8xf32>)
  return %0#0, %0#1 : tensor<100x8xf32>, tensor<8xf32>
}

module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%arg0:
!transform.any_op {transform.readonly}) { 678 %0 = transform.structured.match ops{["linalg.generic"]} in %arg0 : (!transform.any_op) -> !transform.any_op 679 %forall, %tiled_generic = transform.structured.tile_using_forall %0 num_threads [8, 4, 2] 680 : (!transform.any_op) -> (!transform.any_op, !transform.any_op) 681 transform.yield 682 } 683} 684 685// ----- 686 687#map = affine_map<(d0, d1) -> (d0, d1)> 688#map1 = affine_map<(d0, d1) -> (d0)> 689 690func.func @tile_thread_safety5(%arg0: tensor<100x300xf32>, %arg1: tensor<100xf32>) -> tensor<100xf32> { 691 // expected-warning@below {{tiling is not thread safe at axis #1}} 692 %0 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "reduction"]} ins(%arg0 : tensor<100x300xf32>) outs(%arg1 : tensor<100xf32>) { 693 ^bb0(%in: f32, %out: f32): 694 %1 = arith.addf %in, %out : f32 695 linalg.yield %1 : f32 696 } -> tensor<100xf32> 697 return %0 : tensor<100xf32> 698} 699 700module attributes {transform.with_named_sequence} { 701 transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) { 702 %0 = transform.structured.match ops{["linalg.generic"]} in %arg0 : (!transform.any_op) -> !transform.any_op 703 %forall, %tiled_generic = transform.structured.tile_using_forall %0 tile_sizes [10, 1] 704 : (!transform.any_op) -> (!transform.any_op, !transform.any_op) 705 transform.yield 706 } 707} 708 709// ----- 710 711func.func @tile_thread_safety6(%A: tensor<?x?xf32>, %B: tensor<?x?xf32>, %C: tensor<?x?xf32>) -> tensor<?x?xf32> { 712 // expected-warning@below {{tiling is not thread safe at axis #2}} 713 %0 = linalg.matmul ins(%A, %B : tensor<?x?xf32>, tensor<?x?xf32>) 714 outs(%C : tensor<?x?xf32>) -> (tensor<?x?xf32>) 715 return %0 : tensor<?x?xf32> 716} 717 718module attributes {transform.with_named_sequence} { 719 transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) { 720 %0 = transform.structured.match ops{["linalg.matmul"]} in %arg0 
: (!transform.any_op) -> !transform.any_op 721 %forall, %tiled_generic = transform.structured.tile_using_forall %0 num_threads [2, 0, 8] 722 : (!transform.any_op) -> (!transform.any_op, !transform.any_op) 723 transform.yield 724 } 725} 726