// RUN: mlir-opt %s -transform-interpreter --cse --canonicalize -split-input-file -verify-diagnostics | FileCheck %s
// RUN: mlir-opt %s -transform-interpreter -split-input-file -verify-diagnostics | FileCheck %s --check-prefix CHECK-NOCLEANUP

// Tests for transform.loop.fuse_sibling: fusing two independent sibling
// scf.for / scf.forall loops (in either direction), plus negative tests for
// the dominance preconditions the transform must enforce.

// CHECK: func.func @fuse_1st_for_into_2nd([[A:%.*]]: {{.*}}, [[B:%.*]]: {{.*}}
func.func @fuse_1st_for_into_2nd(%A: tensor<128xf32>, %B: tensor<128xf32>) -> (tensor<128xf32>, tensor<128xf32>) {
  // CHECK-DAG: [[C0:%.*]] = arith.constant 0 : index
  // CHECK-DAG: [[C16:%.*]] = arith.constant 16 : index
  // CHECK-DAG: [[C128:%.*]] = arith.constant 128 : index
  // CHECK-DAG: [[ZERO:%.*]] = arith.constant 0.000000e+00 : f32
  %c0 = arith.constant 0 : index
  %c16 = arith.constant 16 : index
  %c128 = arith.constant 128 : index
  %cst = arith.constant 0.000000e+00 : f32
  // CHECK: [[R0:%.*]]:2 = scf.for [[IV:%.*]] = [[C0]] to [[C128]] step [[C16]] iter_args([[IA:%.*]] = [[A]], [[IB:%.*]] = [[B]]) {{.*}}
  %1 = scf.for %arg3 = %c0 to %c128 step %c16 iter_args(%arg4 = %A) -> (tensor<128xf32>) {
    // CHECK-DAG: [[ASLICE:%.*]] = vector.transfer_read [[A]][[[IV]]], [[ZERO]]
    // CHECK-DAG: [[SLICE0:%.*]] = vector.transfer_read [[IA]][[[IV]]], [[ZERO]]
    // CHECK: [[OUT1:%.*]] = arith.addf [[SLICE0]], [[ASLICE]]
    // CHECK-NEXT: [[WRT0:%.*]] = vector.transfer_write [[OUT1]], [[IA]][[[IV]]]
    %2 = vector.transfer_read %A[%arg3], %cst {in_bounds = [true]} : tensor<128xf32>, vector<16xf32>
    %3 = vector.transfer_read %arg4[%arg3], %cst {in_bounds = [true]} : tensor<128xf32>, vector<16xf32>
    %5 = arith.addf %3, %2 : vector<16xf32>
    %6 = vector.transfer_write %5, %arg4[%arg3] {in_bounds = [true]} : vector<16xf32>, tensor<128xf32>
    scf.yield %6 : tensor<128xf32>
  }
  %dup1 = scf.for %arg3 = %c0 to %c128 step %c16 iter_args(%arg4 = %B) -> (tensor<128xf32>) {
    // CHECK-DAG: [[SLICE1:%.*]] = vector.transfer_read [[IB]][[[IV]]], [[ZERO]]
    // CHECK: [[OUT2:%.*]] = arith.addf [[SLICE1]], [[ASLICE]]
    // CHECK-NEXT: [[WRT1:%.*]] = vector.transfer_write [[OUT2]], [[IB]][[[IV]]]
    %dup2 = vector.transfer_read %A[%arg3], %cst {in_bounds = [true]} : tensor<128xf32>, vector<16xf32>
    %dup3 = vector.transfer_read %arg4[%arg3], %cst {in_bounds = [true]} : tensor<128xf32>, vector<16xf32>
    %dup5 = arith.addf %dup3, %dup2 : vector<16xf32>
    %dup6 = vector.transfer_write %dup5, %arg4[%arg3] {in_bounds = [true]} : vector<16xf32>, tensor<128xf32>
    // CHECK: scf.yield [[WRT0]], [[WRT1]] : {{.*}}
    scf.yield %dup6 : tensor<128xf32>
  }
  return %1, %dup1 : tensor<128xf32>, tensor<128xf32>
}
module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
    %0 = transform.structured.match ops{["scf.for"]} in %arg0 : (!transform.any_op) -> !transform.any_op
    %for:2 = transform.split_handle %0 : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
    %fused = transform.loop.fuse_sibling %for#0 into %for#1 : (!transform.any_op,!transform.any_op) -> !transform.any_op
    transform.yield
  }
}

// -----

// CHECK: func.func @fuse_2nd_for_into_1st([[A:%.*]]: {{.*}}, [[B:%.*]]: {{.*}}
func.func @fuse_2nd_for_into_1st(%A: tensor<128xf32>, %B: tensor<128xf32>) -> (tensor<128xf32>, tensor<128xf32>) {
  // CHECK-DAG: [[C0:%.*]] = arith.constant 0 : index
  // CHECK-DAG: [[C16:%.*]] = arith.constant 16 : index
  // CHECK-DAG: [[C128:%.*]] = arith.constant 128 : index
  // CHECK-DAG: [[ZERO:%.*]] = arith.constant 0.000000e+00 : f32
  %c0 = arith.constant 0 : index
  %c16 = arith.constant 16 : index
  %c128 = arith.constant 128 : index
  %cst = arith.constant 0.000000e+00 : f32
  // CHECK: [[R0:%.*]]:2 = scf.for [[IV:%.*]] = [[C0]] to [[C128]] step [[C16]] iter_args([[IB:%.*]] = [[B]], [[IA:%.*]] = [[A]]) {{.*}}
  %1 = scf.for %arg3 = %c0 to %c128 step %c16 iter_args(%arg4 = %A) -> (tensor<128xf32>) {
    // CHECK-DAG: [[ASLICE:%.*]] = vector.transfer_read [[A]][[[IV]]], [[ZERO]]
    // CHECK-DAG: [[SLICE0:%.*]] = vector.transfer_read [[IB]][[[IV]]], [[ZERO]]
    // CHECK: [[OUT1:%.*]] = arith.addf [[SLICE0]], [[ASLICE]]
    // CHECK-NEXT: [[WRT0:%.*]] = vector.transfer_write [[OUT1]], [[IB]][[[IV]]]
    %2 = vector.transfer_read %A[%arg3], %cst {in_bounds = [true]} : tensor<128xf32>, vector<16xf32>
    %3 = vector.transfer_read %arg4[%arg3], %cst {in_bounds = [true]} : tensor<128xf32>, vector<16xf32>
    %5 = arith.addf %3, %2 : vector<16xf32>
    %6 = vector.transfer_write %5, %arg4[%arg3] {in_bounds = [true]} : vector<16xf32>, tensor<128xf32>
    scf.yield %6 : tensor<128xf32>
  }
  %dup1 = scf.for %arg3 = %c0 to %c128 step %c16 iter_args(%arg4 = %B) -> (tensor<128xf32>) {
    // CHECK-DAG: [[SLICE1:%.*]] = vector.transfer_read [[IA]][[[IV]]], [[ZERO]]
    // CHECK: [[OUT2:%.*]] = arith.addf [[SLICE1]], [[ASLICE]]
    // CHECK-NEXT: [[WRT1:%.*]] = vector.transfer_write [[OUT2]], [[IA]][[[IV]]]
    %dup2 = vector.transfer_read %A[%arg3], %cst {in_bounds = [true]} : tensor<128xf32>, vector<16xf32>
    // NB: the dominance check used to fail on the following line,
    // however the defining op for the value of %arg3 occurs above the source loop and hence is safe
    // and %arg4 is a block argument of the scope of the loops and hence is safe
    %dup3 = vector.transfer_read %arg4[%arg3], %cst {in_bounds = [true]} : tensor<128xf32>, vector<16xf32>
    %dup5 = arith.addf %dup3, %dup2 : vector<16xf32>
    %dup6 = vector.transfer_write %dup5, %arg4[%arg3] {in_bounds = [true]} : vector<16xf32>, tensor<128xf32>
    // CHECK: scf.yield [[WRT0]], [[WRT1]] : {{.*}}
    scf.yield %dup6 : tensor<128xf32>
  }
  return %1, %dup1 : tensor<128xf32>, tensor<128xf32>
}
module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
    %0 = transform.structured.match ops{["scf.for"]} in %arg0 : (!transform.any_op) -> !transform.any_op
    %for:2 = transform.split_handle %0 : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
    %fused = transform.loop.fuse_sibling %for#1 into %for#0 : (!transform.any_op,!transform.any_op) -> !transform.any_op
    transform.yield
  }
}

// -----

// CHECK: func.func @matmul_fuse_1st_forall_into_2nd([[A1:%.*]]: {{.*}}, [[A2:%.*]]: {{.*}}, [[B:%.*]]: {{.*}}
func.func @matmul_fuse_1st_forall_into_2nd(%A1 : tensor<128x128xf32>, %A2 : tensor<128x128xf32>, %B : tensor<128x128xf32>) -> (tensor<128x128xf32>, tensor<128x128xf32>) {
  %zero = arith.constant 0.0 : f32
  %out_alloc = tensor.empty() : tensor<128x128xf32>
  %out = linalg.fill ins(%zero : f32) outs(%out_alloc : tensor<128x128xf32>) -> tensor<128x128xf32>

  // CHECK: scf.forall ([[I:%.*]]) in (4) shared_outs([[S1:%.*]] = [[IN1:%.*]], [[S2:%.*]] = [[IN2:%.*]]) -> (tensor<128x128xf32>, tensor<128x128xf32>) {
  // CHECK: [[T:%.*]] = affine.apply
  // CHECK: tensor.extract_slice [[A2]][[[T]], 0] [32, 128] [1, 1]
  // CHECK: tensor.extract_slice [[S1]][[[T]], 0] [32, 128] [1, 1]
  // CHECK: [[OUT1:%.*]] = linalg.matmul
  // CHECK: tensor.extract_slice [[A1]][[[T]], 0] [32, 128] [1, 1]
  // CHECK: tensor.extract_slice [[S2]][[[T]], 0] [32, 128] [1, 1]
  // CHECK: [[OUT2:%.*]] = linalg.matmul
  // CHECK: scf.forall.in_parallel {
  // CHECK: tensor.parallel_insert_slice [[OUT1]] into [[S1]][[[T]], 0] [32, 128] [1, 1]
  // CHECK: tensor.parallel_insert_slice [[OUT2]] into [[S2]][[[T]], 0] [32, 128] [1, 1]
  // CHECK: }
  // CHECK: }
  %out1 = linalg.matmul ins(%A1, %B : tensor<128x128xf32>, tensor<128x128xf32>) outs(%out : tensor<128x128xf32>) -> tensor<128x128xf32>
  %out2 = linalg.matmul ins(%A2, %B : tensor<128x128xf32>, tensor<128x128xf32>) outs(%out : tensor<128x128xf32>) -> tensor<128x128xf32>

  func.return %out1, %out2 : tensor<128x128xf32>, tensor<128x128xf32>
}
module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%variant_op : !transform.any_op {transform.readonly}) {
    %matched = transform.structured.match ops{["linalg.matmul"]} in %variant_op : (!transform.any_op) -> (!transform.any_op)

    %mm1, %mm2 = transform.split_handle %matched : (!transform.any_op) -> (!transform.any_op, !transform.any_op)

    %tiled_mm1, %loop1 = transform.structured.tile_using_forall %mm1 tile_sizes [32] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
    %tiled_mm2, %loop2 = transform.structured.tile_using_forall %mm2 tile_sizes [32] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)

    %fused_loop = transform.loop.fuse_sibling %loop2 into %loop1 : (!transform.any_op, !transform.any_op) -> !transform.any_op
    transform.yield
  }
}

// -----

// CHECK: func.func @matmul_fuse_2nd_forall_into_1st([[A1:%.*]]: {{.*}}, [[A2:%.*]]: {{.*}}, [[B:%.*]]: {{.*}}
func.func @matmul_fuse_2nd_forall_into_1st(%A1 : tensor<128x128xf32>, %A2 : tensor<128x128xf32>, %B : tensor<128x128xf32>) -> (tensor<128x128xf32>, tensor<128x128xf32>) {
  %zero = arith.constant 0.0 : f32
  %out_alloc = tensor.empty() : tensor<128x128xf32>
  %out = linalg.fill ins(%zero : f32) outs(%out_alloc : tensor<128x128xf32>) -> tensor<128x128xf32>

  // CHECK: scf.forall ([[I:%.*]]) in (4) shared_outs([[S1:%.*]] = [[IN1:%.*]], [[S2:%.*]] = [[IN2:%.*]]) -> (tensor<128x128xf32>, tensor<128x128xf32>) {
  // CHECK: [[T:%.*]] = affine.apply
  // CHECK: tensor.extract_slice [[A1]][[[T]], 0] [32, 128] [1, 1]
  // CHECK: tensor.extract_slice [[S1]][[[T]], 0] [32, 128] [1, 1]
  // CHECK: [[OUT1:%.*]] = linalg.matmul
  // CHECK: tensor.extract_slice [[A2]][[[T]], 0] [32, 128] [1, 1]
  // CHECK: tensor.extract_slice [[S2]][[[T]], 0] [32, 128] [1, 1]
  // CHECK: [[OUT2:%.*]] = linalg.matmul
  // CHECK: scf.forall.in_parallel {
  // CHECK: tensor.parallel_insert_slice [[OUT1]] into [[S1]][[[T]], 0] [32, 128] [1, 1]
  // CHECK: tensor.parallel_insert_slice [[OUT2]] into [[S2]][[[T]], 0] [32, 128] [1, 1]
  // CHECK: }
  // CHECK: }
  %out1 = linalg.matmul ins(%A1, %B : tensor<128x128xf32>, tensor<128x128xf32>) outs(%out : tensor<128x128xf32>) -> tensor<128x128xf32>
  %out2 = linalg.matmul ins(%A2, %B : tensor<128x128xf32>, tensor<128x128xf32>) outs(%out : tensor<128x128xf32>) -> tensor<128x128xf32>

  func.return %out1, %out2 : tensor<128x128xf32>, tensor<128x128xf32>
}
module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%variant_op : !transform.any_op {transform.readonly}) {
    %matched = transform.structured.match ops{["linalg.matmul"]} in %variant_op : (!transform.any_op) -> (!transform.any_op)

    %mm1, %mm2 = transform.split_handle %matched : (!transform.any_op) -> (!transform.any_op, !transform.any_op)

    %tiled_mm1, %loop1 = transform.structured.tile_using_forall %mm1 tile_sizes [32] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
    %tiled_mm2, %loop2 = transform.structured.tile_using_forall %mm2 tile_sizes [32] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)

    %fused_loop = transform.loop.fuse_sibling %loop1 into %loop2 : (!transform.any_op, !transform.any_op) -> !transform.any_op
    transform.yield
  }
}

// -----

// CHECK-NOCLEANUP: func.func @fuse_no_iter_args([[A:%.*]]: {{.*}}, [[B:%.*]]: {{.*}}
func.func @fuse_no_iter_args(%A: tensor<128xf32>, %B: tensor<128xf32>) {
  // CHECK-NOCLEANUP: [[C0:%.*]] = arith.constant 0 : index
  // CHECK-NOCLEANUP: [[C16:%.*]] = arith.constant 16 : index
  // CHECK-NOCLEANUP: [[C128:%.*]] = arith.constant 128 : index
  // CHECK-NOCLEANUP: [[ZERO:%.*]] = arith.constant 0.000000e+00 : f32
  %c0 = arith.constant 0 : index
  %c16 = arith.constant 16 : index
  %c128 = arith.constant 128 : index
  %cst = arith.constant 0.000000e+00 : f32
  // CHECK-NOCLEANUP: scf.for [[IV:%.*]] = [[C0]] to [[C128]] step [[C16]] {{.*}}
  scf.for %arg0 = %c0 to %c128 step %c16 {
    // CHECK-NOCLEANUP: [[ASLICE:%.*]] = vector.transfer_read [[A]][[[IV]]], [[ZERO]]
    %2 = vector.transfer_read %A[%arg0], %cst {in_bounds = [true]} : tensor<128xf32>, vector<16xf32>
    scf.yield
  }
  scf.for %arg0 = %c0 to %c128 step %c16 {
    // CHECK-NOCLEANUP: [[BSLICE:%.*]] = vector.transfer_read [[B]][[[IV]]], [[ZERO]]
    %dup2 = vector.transfer_read %B[%arg0], %cst {in_bounds = [true]} : tensor<128xf32>, vector<16xf32>
    scf.yield
  }
  return
}
module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
    %0 = transform.structured.match ops{["scf.for"]} in %arg0 : (!transform.any_op) -> !transform.any_op
    %for:2 = transform.split_handle %0 : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
    %fused = transform.loop.fuse_sibling %for#0 into %for#1 : (!transform.any_op,!transform.any_op) -> !transform.any_op
    transform.yield
  }
}

// -----

func.func @source_for_uses_result_of_target_for_err(%A: tensor<128xf32>, %B: tensor<128xf32>) -> (tensor<128xf32>, tensor<128xf32>) {
  %c0 = arith.constant 0 : index
  %c16 = arith.constant 16 : index
  %c128 = arith.constant 128 : index
  %cst = arith.constant 0.000000e+00 : f32
  // expected-error @below {{user of results of target should be properly dominated by source}}
  %1 = scf.for %arg3 = %c0 to %c128 step %c16 iter_args(%arg4 = %A) -> (tensor<128xf32>) {
    %2 = vector.transfer_read %A[%arg3], %cst {in_bounds = [true]} : tensor<128xf32>, vector<16xf32>
    %3 = vector.transfer_read %arg4[%arg3], %cst {in_bounds = [true]} : tensor<128xf32>, vector<16xf32>
    %5 = arith.addf %3, %2 : vector<16xf32>
    %6 = vector.transfer_write %5, %arg4[%arg3] {in_bounds = [true]} : vector<16xf32>, tensor<128xf32>
    scf.yield %6 : tensor<128xf32>
  }
  %dup1 = scf.for %arg3 = %c0 to %c128 step %c16 iter_args(%arg4 = %1) -> (tensor<128xf32>) {
    %dup2 = vector.transfer_read %A[%arg3], %cst {in_bounds = [true]} : tensor<128xf32>, vector<16xf32>
    %dup3 = vector.transfer_read %arg4[%arg3], %cst {in_bounds = [true]} : tensor<128xf32>, vector<16xf32>
    %dup5 = arith.addf %dup3, %dup2 : vector<16xf32>
    %dup6 = vector.transfer_write %dup5, %arg4[%arg3] {in_bounds = [true]} : vector<16xf32>, tensor<128xf32>
    scf.yield %dup6 : tensor<128xf32>
  }
  return %1, %dup1 : tensor<128xf32>, tensor<128xf32>
}
module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
    %0 = transform.structured.match ops{["scf.for"]} in %arg0 : (!transform.any_op) -> !transform.any_op
    %for:2 = transform.split_handle %0 : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
    %fused = transform.loop.fuse_sibling %for#0 into %for#1 : (!transform.any_op,!transform.any_op) -> !transform.any_op
    transform.yield
  }
}

// -----

func.func @source_forall_uses_result_of_target_forall_err(%A : tensor<128x128xf32>, %B1 : tensor<128x128xf32>, %B2 : tensor<128x128xf32>) -> (tensor<128x128xf32>, tensor<128x128xf32>) {
  %zero = arith.constant 0.0 : f32
  %out_alloc = tensor.empty() : tensor<128x128xf32>
  %out = linalg.fill ins(%zero : f32) outs(%out_alloc : tensor<128x128xf32>) -> tensor<128x128xf32>

  // expected-error @below {{user of results of target should be properly dominated by source}}
  %out1 = linalg.matmul ins(%A, %B1 : tensor<128x128xf32>, tensor<128x128xf32>) outs(%out : tensor<128x128xf32>) -> tensor<128x128xf32>
  %out2 = linalg.matmul ins(%A, %out1 : tensor<128x128xf32>, tensor<128x128xf32>) outs(%out : tensor<128x128xf32>) -> tensor<128x128xf32>

  func.return %out1, %out2 : tensor<128x128xf32>, tensor<128x128xf32>
}
module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%variant_op : !transform.any_op {transform.readonly}) {
    %matched = transform.structured.match ops{["linalg.matmul"]} in %variant_op : (!transform.any_op) -> (!transform.any_op)

    %mm1, %mm2 = transform.split_handle %matched : (!transform.any_op) -> (!transform.any_op, !transform.any_op)

    %tiled_mm1, %loop1 = transform.structured.tile_using_forall %mm1 tile_sizes [32] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
    %tiled_mm2, %loop2 = transform.structured.tile_using_forall %mm2 tile_sizes [32] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)

    %fused_loop = transform.loop.fuse_sibling %loop1 into %loop2 : (!transform.any_op, !transform.any_op) -> !transform.any_op
    transform.yield
  }
}

// -----

func.func @target_for_region_uses_result_of_source_for_err(%A: tensor<128xf32>, %B: tensor<128xf32>) -> (tensor<128xf32>, tensor<128xf32>) {
  %c0 = arith.constant 0 : index
  %c16 = arith.constant 16 : index
  %c128 = arith.constant 128 : index
  %cst = arith.constant 0.000000e+00 : f32
  %1 = scf.for %arg3 = %c0 to %c128 step %c16 iter_args(%arg4 = %A) -> (tensor<128xf32>) {
    %2 = vector.transfer_read %A[%arg3], %cst {in_bounds = [true]} : tensor<128xf32>, vector<16xf32>
    %3 = vector.transfer_read %arg4[%arg3], %cst {in_bounds = [true]} : tensor<128xf32>, vector<16xf32>
    %5 = arith.addf %3, %2 : vector<16xf32>
    %6 = vector.transfer_write %5, %arg4[%arg3] {in_bounds = [true]} : vector<16xf32>, tensor<128xf32>
    scf.yield %6 : tensor<128xf32>
  }
  %dup1 = scf.for %arg3 = %c0 to %c128 step %c16 iter_args(%arg4 = %B) -> (tensor<128xf32>) {
    // expected-error @below {{values used inside regions of target should be properly dominated by source}}
    %dup2 = vector.transfer_read %1[%arg3], %cst {in_bounds = [true]} : tensor<128xf32>, vector<16xf32>
    %dup3 = vector.transfer_read %arg4[%arg3], %cst {in_bounds = [true]} : tensor<128xf32>, vector<16xf32>
    %dup5 = arith.addf %dup3, %dup2 : vector<16xf32>
    %dup6 = vector.transfer_write %dup5, %arg4[%arg3] {in_bounds = [true]} : vector<16xf32>, tensor<128xf32>
    scf.yield %dup6 : tensor<128xf32>
  }
  return %1, %dup1 : tensor<128xf32>, tensor<128xf32>
}
module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
    %0 = transform.structured.match ops{["scf.for"]} in %arg0 : (!transform.any_op) -> !transform.any_op
    %for:2 = transform.split_handle %0 : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
    %fused = transform.loop.fuse_sibling %for#1 into %for#0 : (!transform.any_op,!transform.any_op) -> !transform.any_op
    transform.yield
  }
}

// -----

func.func @target_forall_depends_on_value_not_dominated_by_source_forall_err(%A1 : tensor<128x128xf32>, %A2 : tensor<128x128xf32>, %B : tensor<128x128xf32>) -> (tensor<128x128xf32>, tensor<128x128xf32>) {
  %zero = arith.constant 0.0 : f32
  %buf1_alloc = tensor.empty() : tensor<128x128xf32>
  %buf1 = linalg.fill ins(%zero : f32) outs(%buf1_alloc : tensor<128x128xf32>) -> tensor<128x128xf32>
  %out1 = linalg.matmul ins(%A1, %B : tensor<128x128xf32>, tensor<128x128xf32>) outs(%buf1 : tensor<128x128xf32>) -> tensor<128x128xf32>
  %out_alloc2 = tensor.empty() : tensor<128x128xf32>
  %buf2 = linalg.fill ins(%zero : f32) outs(%buf1_alloc : tensor<128x128xf32>) -> tensor<128x128xf32>
  // expected-error @below {{operands of target should be properly dominated by source}}
  %out2 = linalg.matmul ins(%A2, %B : tensor<128x128xf32>, tensor<128x128xf32>) outs(%buf2 : tensor<128x128xf32>) -> tensor<128x128xf32>

  func.return %out1, %out2 : tensor<128x128xf32>, tensor<128x128xf32>
}
module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%variant_op : !transform.any_op {transform.readonly}) {
    %matched = transform.structured.match ops{["linalg.matmul"]} in %variant_op : (!transform.any_op) -> (!transform.any_op)

    %mm1, %mm2 = transform.split_handle %matched : (!transform.any_op) -> (!transform.any_op, !transform.any_op)

    %tiled_mm1, %loop1 = transform.structured.tile_using_forall %mm1 tile_sizes [32] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
    %tiled_mm2, %loop2 = transform.structured.tile_using_forall %mm2 tile_sizes [32] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)

    %fused_loop = transform.loop.fuse_sibling %loop2 into %loop1 : (!transform.any_op, !transform.any_op) -> !transform.any_op
    transform.yield
  }
}
// -----

// CHECK: func.func @foreach_loop_pair_fuse([[A:%.*]]: {{.*}}, [[B:%.*]]: {{.*}}
func.func @foreach_loop_pair_fuse(%arg1: tensor<128xf32>, %arg2: tensor<128xf32>) -> (tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>) {
  // CHECK-DAG: [[C0:%.*]] = arith.constant 0 : index
  // CHECK-DAG: [[C16:%.*]] = arith.constant 16 : index
  // CHECK-DAG: [[C128:%.*]] = arith.constant 128 : index
  // CHECK-DAG: [[ZERO:%.*]] = arith.constant 0.000000e+00 : f32
  %c0 = arith.constant 0 : index
  %c16 = arith.constant 16 : index
  %c32 = arith.constant 32 : index
  %c128 = arith.constant 128 : index
  %cst = arith.constant 0.000000e+00 : f32
  // CHECK: [[RST:%.*]]:2 = scf.for [[IV:%.*]] = [[C0]] to [[C128]] step [[C16]] iter_args([[IB0:%.*]] = [[B]], [[IB1:%.*]] = [[B]]) {{.*}}
  %1 = scf.for %arg3 = %c0 to %c128 step %c16 iter_args(%arg4 = %arg2) -> (tensor<128xf32>) {
    // CHECK-DAG: [[ASLICE:%.*]] = vector.transfer_read [[A]][[[IV]]], [[ZERO]]
    // CHECK-DAG: [[SLICE0:%.*]] = vector.transfer_read [[IB0]][[[IV]]], [[ZERO]]
    // CHECK: [[OUT1:%.*]] = arith.addf [[SLICE0]], [[ASLICE]]
    // CHECK-NEXT: [[WRT0:%.*]] = vector.transfer_write [[OUT1]], [[IB0]][[[IV]]]
    %2 = vector.transfer_read %arg1[%arg3], %cst {in_bounds = [true]} : tensor<128xf32>, vector<16xf32>
    %3 = vector.transfer_read %arg4[%arg3], %cst {in_bounds = [true]} : tensor<128xf32>, vector<16xf32>
    %5 = arith.addf %3, %2 : vector<16xf32>
    %6 = vector.transfer_write %5, %arg4[%arg3] {in_bounds = [true]} : vector<16xf32>, tensor<128xf32>
    scf.yield %6 : tensor<128xf32>
  } {target_loops}
  %dup1 = scf.for %arg3 = %c0 to %c128 step %c16 iter_args(%arg4 = %arg2) -> (tensor<128xf32>) {
    // CHECK-DAG: [[SLICE1:%.*]] = vector.transfer_read [[IB1]][[[IV]]], [[ZERO]]
    // CHECK: [[OUT2:%.*]] = arith.addf [[SLICE1]], [[ASLICE]]
    // CHECK-NEXT: [[WRT1:%.*]] = vector.transfer_write [[OUT2]], [[IB1]][[[IV]]]
    %dup2 = vector.transfer_read %arg1[%arg3], %cst {in_bounds = [true]} : tensor<128xf32>, vector<16xf32>
    %dup3 = vector.transfer_read %arg4[%arg3], %cst {in_bounds = [true]} : tensor<128xf32>, vector<16xf32>
    %dup5 = arith.addf %dup3, %dup2 : vector<16xf32>
    %dup6 = vector.transfer_write %dup5, %arg4[%arg3] {in_bounds = [true]} : vector<16xf32>, tensor<128xf32>
    // CHECK: scf.yield [[WRT0]], [[WRT1]] : {{.*}}
    scf.yield %dup6 : tensor<128xf32>
  } {source_loops}
  %2 = scf.for %arg3 = %c0 to %c128 step %c32 iter_args(%arg4 = %arg2) -> (tensor<128xf32>) {
    // CHECK-DAG: [[ASLICE:%.*]] = vector.transfer_read [[A]][[[IV]]], [[ZERO]]
    // CHECK-DAG: [[SLICE0:%.*]] = vector.transfer_read [[IB0]][[[IV]]], [[ZERO]]
    // CHECK: [[OUT1:%.*]] = arith.addf [[SLICE0]], [[ASLICE]]
    // CHECK-NEXT: [[WRT0:%.*]] = vector.transfer_write [[OUT1]], [[IB0]][[[IV]]]
    %2 = vector.transfer_read %arg1[%arg3], %cst {in_bounds = [true]} : tensor<128xf32>, vector<32xf32>
    %3 = vector.transfer_read %arg4[%arg3], %cst {in_bounds = [true]} : tensor<128xf32>, vector<32xf32>
    %5 = arith.addf %3, %2 : vector<32xf32>
    %6 = vector.transfer_write %5, %arg4[%arg3] {in_bounds = [true]} : vector<32xf32>, tensor<128xf32>
    scf.yield %6 : tensor<128xf32>
  } {target_loops}
  %dup2 = scf.for %arg3 = %c0 to %c128 step %c32 iter_args(%arg4 = %arg2) -> (tensor<128xf32>) {
    // CHECK-DAG: [[SLICE1:%.*]] = vector.transfer_read [[IB1]][[[IV]]], [[ZERO]]
    // CHECK: [[OUT2:%.*]] = arith.addf [[SLICE1]], [[ASLICE]]
    // CHECK-NEXT: [[WRT1:%.*]] = vector.transfer_write [[OUT2]], [[IB1]][[[IV]]]
    %dup2 = vector.transfer_read %arg1[%arg3], %cst {in_bounds = [true]} : tensor<128xf32>, vector<32xf32>
    %dup3 = vector.transfer_read %arg4[%arg3], %cst {in_bounds = [true]} : tensor<128xf32>, vector<32xf32>
    %dup5 = arith.addf %dup3, %dup2 : vector<32xf32>
    %dup6 = vector.transfer_write %dup5, %arg4[%arg3] {in_bounds = [true]} : vector<32xf32>, tensor<128xf32>
    // CHECK: scf.yield [[WRT0]], [[WRT1]] : {{.*}}
    scf.yield %dup6 : tensor<128xf32>
  } {source_loops}
  return %1, %dup1, %2, %dup2 : tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>
}


module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
    %target_loops = transform.structured.match ops{["scf.for"]} attributes {target_loops} in %arg0 : (!transform.any_op) -> !transform.any_op
    %source_loops = transform.structured.match ops{["scf.for"]} attributes {source_loops} in %arg0 : (!transform.any_op) -> !transform.any_op
    transform.foreach %target_loops, %source_loops : !transform.any_op, !transform.any_op {
    ^bb0(%target_loop: !transform.any_op, %source_loop: !transform.any_op):
      %fused = transform.loop.fuse_sibling %target_loop into %source_loop : (!transform.any_op,!transform.any_op) -> !transform.any_op
    }
    transform.yield
  }
}