// RUN: mlir-opt -allow-unregistered-dialect %s -pass-pipeline='builtin.module(func.func(affine-loop-fusion))' -split-input-file | FileCheck %s
// RUN: mlir-opt -allow-unregistered-dialect %s -pass-pipeline='builtin.module(func.func(affine-loop-fusion{fusion-maximal}))' -split-input-file | FileCheck %s --check-prefix=MAXIMAL

// Part I of fusion tests in mlir/test/Transforms/loop-fusion.mlir.
// Part III of fusion tests in mlir/test/Transforms/loop-fusion-3.mlir.
// Part IV of fusion tests in mlir/test/Transforms/loop-fusion-4.mlir.

// -----

// CHECK-LABEL: func @should_fuse_at_depth_above_loop_carried_dependence(%{{.*}}: memref<64x4xf32>, %{{.*}}: memref<64x4xf32>) {
func.func @should_fuse_at_depth_above_loop_carried_dependence(%arg0: memref<64x4xf32>, %arg1: memref<64x4xf32>) {
  %out = memref.alloc() : memref<64x4xf32>
  %0 = arith.constant 0.0 : f32
  affine.for %i0 = 0 to 64 {
    affine.for %i1 = 0 to 4 {
      affine.store %0, %out[%i0, %i1] : memref<64x4xf32>
    }
  }
  affine.for %i2 = 0 to 4 {
    affine.for %i3 = 0 to 4 {
      affine.for %i4 = 0 to 16 {
        %v = affine.load %arg1[16 * %i3 - %i4 + 15, %i2] : memref<64x4xf32>
        "op0"(%v) : (f32) -> ()
      }
      affine.for %i5 = 0 to 4 {
        affine.for %i6 = 0 to 16 {
          %v = affine.load %arg0[16 * %i5 - %i6 + 15, %i3] : memref<64x4xf32>
          "op1"(%v) : (f32) -> ()
        }
        affine.for %i7 = 0 to 16 {
          %r = "op2"() : () -> (f32)
          %v = affine.load %out[16 * %i5 + %i7, %i2] : memref<64x4xf32>
          %s = arith.addf %v, %r : f32
          affine.store %s, %out[16 * %i5 + %i7, %i2] : memref<64x4xf32>
        }
      }
    }
  }

  // We can fuse the source loop nest '%i0' into the dst loop nest '%i2', but
  // the depth at which we can insert the src loop nest slice into the dst
  // loop nest must be decreased because of a loop-carried dependence on loop
  // '%i3'. As a result, the source loop nest is inserted at dst loop nest
  // depth 1, just above the loop with the carried dependence. In addition,
  // the source loop nest iteration bounds on its loop '%i1' are reduced to
  // 1, so the memref size can be reduced to 64x1xf32.
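  // The CHECK lines below reflect this: the private memref for '%out'
  // shrinks to 64x1xf32, and the source nest appears as a '0 to 64' loop at
  // depth 1 of the fused nest, above the '%i3' loop that carries the
  // dependence.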

  // CHECK:      memref.alloc() : memref<64x1xf32>
  // CHECK:      affine.for %{{.*}} = 0 to 4 {
  // CHECK-NEXT:   affine.for %{{.*}} = 0 to 64 {
  // CHECK-NEXT:     affine.store %{{.*}}, %{{.*}}[%{{.*}}, 0] : memref<64x1xf32>
  // CHECK-NEXT:   }
  // CHECK-NEXT:   affine.for %{{.*}} = 0 to 4 {
  // CHECK-NEXT:     affine.for %{{.*}} = 0 to 16 {
  // CHECK-NEXT:       affine.load %{{.*}}[%{{.*}} * 16 - %{{.*}} + 15, %{{.*}}] : memref<64x4xf32>
  // CHECK-NEXT:       "op0"(%{{.*}}) : (f32) -> ()
  // CHECK-NEXT:     }
  // CHECK-NEXT:     affine.for %{{.*}} = 0 to 4 {
  // CHECK-NEXT:       affine.for %{{.*}} = 0 to 16 {
  // CHECK-NEXT:         affine.load %{{.*}}[%{{.*}} * 16 - %{{.*}} + 15, %{{.*}}] : memref<64x4xf32>
  // CHECK-NEXT:         "op1"(%{{.*}}) : (f32) -> ()
  // CHECK-NEXT:       }
  // CHECK-NEXT:       affine.for %{{.*}} = 0 to 16 {
  // CHECK-NEXT:         %{{.*}} = "op2"() : () -> f32
  // CHECK:              affine.load %{{.*}}[%{{.*}} * 16 + %{{.*}}, 0] : memref<64x1xf32>
  // CHECK-NEXT:         arith.addf %{{.*}}, %{{.*}} : f32
  // CHECK:              affine.store %{{.*}}, %{{.*}}[%{{.*}} * 16 + %{{.*}}, 0] : memref<64x1xf32>
  // CHECK-NEXT:       }
  // CHECK-NEXT:     }
  // CHECK-NEXT:   }
  // CHECK-NEXT: }
  // CHECK-NEXT: return
  return
}

// -----

// CHECK-LABEL: func @should_fuse_only_two_loops_and_remove_producer() {
func.func @should_fuse_only_two_loops_and_remove_producer() {
  %a = memref.alloc() : memref<10xf32>
  %b = memref.alloc() : memref<10xf32>

  %cf7 = arith.constant 7.0 : f32

  affine.for %i0 = 0 to 10 {
    affine.store %cf7, %a[%i0] : memref<10xf32>
  }
  affine.for %i1 = 0 to 10 {
    %v0 = affine.load %a[%i1] : memref<10xf32>
    affine.store %v0, %b[%i1] : memref<10xf32>
  }
  affine.for %i2 = 0 to 10 {
    %v1 = affine.load %a[%i2] : memref<10xf32>
    affine.store %v1, %b[%i2] : memref<10xf32>
  }

  // On the first visit to '%i2', the fusion algorithm cannot fuse loop nest
  // '%i0' into '%i2' because of the dependences '%i0' and '%i2' each have on
  // '%i1'. Then, '%i0' is fused into '%i1' without creating a private memref
  // for '%a', so that '%i0' can be removed while still preserving the
  // dependence on '%a' with '%i2'.
  // TODO: Alternatively, we could fuse '%i0' into '%i1' with a private
  // memref; the dependence between '%i0' and '%i1' on memref '%a' would then
  // no longer exist, and '%i0' could be fused into '%i2' as well. Note that
  // this approach would duplicate the computation in loop nest '%i0' into
  // loop nests '%i1' and '%i2', which would limit its profitability.
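  // In the fused output below, '%i0' has been merged into '%i1' and removed,
  // while '%i2' remains a separate loop nest that still reads '%a'.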
  // CHECK:      affine.for %{{.*}} = 0 to 10 {
  // CHECK-NEXT:   affine.store %{{.*}}, %{{.*}}[%{{.*}}] : memref<10xf32>
  // CHECK-NEXT:   affine.load %{{.*}}[%{{.*}}] : memref<10xf32>
  // CHECK-NEXT:   affine.store %{{.*}}, %{{.*}}[%{{.*}}] : memref<10xf32>
  // CHECK-NEXT: }
  // CHECK-NEXT: affine.for %{{.*}} = 0 to 10 {
  // CHECK-NEXT:   affine.load %{{.*}}[%{{.*}}] : memref<10xf32>
  // CHECK-NEXT:   affine.store %{{.*}}, %{{.*}}[%{{.*}}] : memref<10xf32>
  // CHECK-NEXT: }
  // CHECK-NEXT: return
  return
}

// -----

// CHECK-LABEL: func @should_fuse_after_one_loop_interchange() {
func.func @should_fuse_after_one_loop_interchange() {
  %a = memref.alloc() : memref<10xf32>

  %cf0 = arith.constant 0.0 : f32
  affine.for %i0 = 0 to 10 {
    affine.store %cf0, %a[%i0] : memref<10xf32>
  }

  affine.for %i1 = 0 to 5 {
    affine.for %i2 = 0 to 10 {
      %v0 = affine.load %a[%i2] : memref<10xf32>
      affine.store %v0, %a[%i2] : memref<10xf32>
    }
  }

  // The dependence between the load and the affine.store is carried on loop
  // '%i1', so loop '%i0' cannot be fused into this nest without violating
  // that dependence. Once loops '%i1' and '%i2' are interchanged, loop '%i0'
  // can be fused at loop depth 1, because the loop carrying the dependence
  // has been interchanged and is now at depth 2.

  // CHECK:      affine.for %{{.*}} = 0 to 10 {
  // CHECK-NEXT:   affine.store %{{.*}}, %{{.*}}[0] : memref<1xf32>
  // CHECK-NEXT:   affine.for %{{.*}} = 0 to 5 {
  // CHECK-NEXT:     affine.load %{{.*}}[0] : memref<1xf32>
  // CHECK-NEXT:     affine.store %{{.*}}, %{{.*}}[0] : memref<1xf32>
  // CHECK-NEXT:   }
  // CHECK-NEXT: }
  // CHECK-NEXT: return
  return
}

// -----

// CHECK-LABEL: func @should_fuse_after_two_loop_interchanges() {
func.func @should_fuse_after_two_loop_interchanges() {
  %a = memref.alloc() : memref<6x8xf32>

  %cf0 = arith.constant 0.0 : f32
  affine.for %i0 = 0 to 6 {
    affine.for %i1 = 0 to 8 {
      affine.store %cf0, %a[%i0, %i1] : memref<6x8xf32>
    }
  }

  affine.for %i2 = 0 to 4 {
    affine.for %i3 = 0 to 6 {
      affine.for %i4 = 0 to 2 {
        affine.for %i5 = 0 to 8 {
          %v0 = affine.load %a[%i3, %i5] : memref<6x8xf32>
          %v1 = arith.addf %v0, %v0 : f32
          affine.store %v1, %a[%i3, %i5] : memref<6x8xf32>
        }
      }
    }
  }

  // The dependence between the load and the affine.store is carried on loops
  // '%i2' and '%i4', so loop '%i0' cannot be fused into this nest without
  // violating that dependence. Once loop '%i2' is interchanged with loop
  // '%i3', and again with loop '%i5', loop '%i0' can be fused at loop depth
  // 2, because the loops carrying the dependences have been interchanged to
  // depths greater than 2.
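  // After fusion (checked below), the private memref shrinks to 1x1xf32, and
  // the dependence-carrying loops end up at depths 3 and 4 of the fused
  // nest.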

  // CHECK:      affine.for %{{.*}} = 0 to 6 {
  // CHECK-NEXT:   affine.for %{{.*}} = 0 to 8 {
  // CHECK-NEXT:     affine.store %{{.*}}, %{{.*}}[0, 0] : memref<1x1xf32>
  // CHECK-NEXT:     affine.for %{{.*}} = 0 to 4 {
  // CHECK-NEXT:       affine.for %{{.*}} = 0 to 2 {
  // CHECK-NEXT:         affine.load %{{.*}}[0, 0] : memref<1x1xf32>
  // CHECK-NEXT:         arith.addf %{{.*}}, %{{.*}} : f32
  // CHECK-NEXT:         affine.store %{{.*}}, %{{.*}}[0, 0] : memref<1x1xf32>
  // CHECK-NEXT:       }
  // CHECK-NEXT:     }
  // CHECK-NEXT:   }
  // CHECK-NEXT: }
  // CHECK-NEXT: return
  return
}

// -----

// '%arg0' is a function argument that is also returned, so its values are
// live out; fusion must keep writing to '%arg0' itself rather than to a
// private memref.
func.func @should_fuse_live_out_writer(%arg0 : memref<10xf32>) -> memref<10xf32> {
  %cst = arith.constant 0.000000e+00 : f32
  affine.for %i0 = 0 to 10 {
    affine.store %cst, %arg0[%i0] : memref<10xf32>
  }
  affine.for %i1 = 0 to 10 {
    %1 = affine.load %arg0[%i1] : memref<10xf32>
    affine.store %1, %arg0[%i1] : memref<10xf32>
  }
  return %arg0 : memref<10xf32>

  // CHECK:      %{{.*}} = arith.constant 0.000000e+00 : f32
  // CHECK-NEXT: affine.for %{{.*}} = 0 to 10 {
  // CHECK-NEXT:   affine.store %{{.*}}, %{{.*}}[%{{.*}}] : memref<10xf32>
  // CHECK-NEXT:   affine.load %{{.*}}[%{{.*}}] : memref<10xf32>
  // CHECK-NEXT:   affine.store %{{.*}}, %{{.*}}[%{{.*}}] : memref<10xf32>
  // CHECK-NEXT: }
  // CHECK-NEXT: return %{{.*}} : memref<10xf32>
}

// -----

// The fused slice has 16 iterations along '%i0'.

// CHECK-DAG: [[$MAP_LB:#map[0-9]*]] = affine_map<(d0) -> (d0 * 16)>
// CHECK-DAG: [[$MAP_UB:#map[0-9]*]] = affine_map<(d0) -> (d0 * 16 + 16)>

// CHECK-LABEL: slice_tile
func.func @slice_tile(%arg0: memref<128x8xf32>, %arg1: memref<32x8xf32>, %0 : f32) -> memref<32x8xf32> {
  affine.for %i0 = 0 to 32 {
    affine.for %i1 = 0 to 8 {
      affine.store %0, %arg1[%i0, %i1] : memref<32x8xf32>
    }
  }
  affine.for %i = 0 to 2 {
    affine.for %j = 0 to 8 {
      affine.for %k = 0 to 8 {
        affine.for %kk = 0 to 16 {
          %v = affine.load %arg0[16 * %k + %kk, %j] : memref<128x8xf32>
          %r = "foo"(%v) : (f32) -> f32
        }
        affine.for %ii = 0 to 16 {
          %v = affine.load %arg1[16 * %i + %ii, %j] : memref<32x8xf32>
          %s = arith.addf %v, %v : f32
          affine.store %s, %arg1[16 * %i + %ii, %j] : memref<32x8xf32>
        }
      }
    }
  }
  return %arg1 : memref<32x8xf32>
}
// CHECK:      affine.for %{{.*}} = 0 to 2 {
// CHECK-NEXT:   affine.for %{{.*}} = 0 to 8 {
// CHECK-NEXT:     affine.for %{{.*}} = [[$MAP_LB]](%{{.*}}) to [[$MAP_UB]](%{{.*}}) {
// CHECK-NEXT:       affine.store %{{.*}}, %{{.*}}[%{{.*}}, %{{.*}}] : memref<32x8xf32>
// CHECK-NEXT:     }
// CHECK-NEXT:     affine.for %{{.*}} = 0 to 8 {
// CHECK-NEXT:       affine.for %{{.*}} = 0 to 16 {
// CHECK-NEXT:         affine.load %{{.*}}[%{{.*}} * 16 + %{{.*}}, %{{.*}}] : memref<128x8xf32>
// CHECK-NEXT:         "foo"(%{{.*}}) : (f32) -> f32
// CHECK-NEXT:       }
// CHECK-NEXT:       affine.for %{{.*}} = 0 to 16 {
// CHECK-NEXT:         affine.load %{{.*}}[%{{.*}} * 16 + %{{.*}}, %{{.*}}] : memref<32x8xf32>
// CHECK-NEXT:         arith.addf %{{.*}}, %{{.*}} : f32
// CHECK-NEXT:         affine.store %{{.*}}, %{{.*}}[%{{.*}} * 16 + %{{.*}}, %{{.*}}] : memref<32x8xf32>
// CHECK-NEXT:       }
// CHECK-NEXT:     }
// CHECK-NEXT:   }
// CHECK-NEXT: }
// CHECK-NEXT: return %{{.*}} : memref<32x8xf32>
// CHECK-NEXT: }

// -----

// Test case that illustrates the fix for b/126454413.
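// The producer's store index always evaluates to 0 (its affine.apply chain
// computes %i0 - %i0) and the consumer loads only '%c0'; computing slice
// bounds for this pair used to be problematic. The CHECK lines below verify
// that the loop nests are left unfused.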
func.func @test_add_slice_bounds() {
  %a = memref.alloc() : memref<10xf32>
  %b = memref.alloc() : memref<10xf32>
  %cf7 = arith.constant 7.0 : f32
  %c0 = arith.constant 0 : index

  affine.for %i0 = 0 to 10 {
    affine.for %i1 = 0 to 10 {
      affine.for %i2 = 0 to 10 {
        %a0 = affine.apply affine_map<(d0) -> (d0)> (%i0)
        %a1 = affine.apply affine_map<(d0) -> (d0)> (%i0)
        %a2 = affine.apply affine_map<(d0, d1) -> (d0 - d1)> (%a0, %a1)
        affine.store %cf7, %a[%a2] : memref<10xf32>
      }
    }
  }
  affine.for %i3 = 0 to 10 {
    affine.for %i4 = 0 to 10 {
      affine.for %i5 = 0 to 10 {
        %v0 = affine.load %a[%c0] : memref<10xf32>
      }
    }
  }

// CHECK:      affine.for %{{.*}} = 0 to 10 {
// CHECK-NEXT:   affine.for %{{.*}} = 0 to 10 {
// CHECK-NEXT:     affine.for %{{.*}} = 0 to 10 {
// CHECK-NEXT:       affine.apply #map(%{{.*}})
// CHECK-NEXT:       affine.apply #map(%{{.*}})
// CHECK-NEXT:       affine.apply #map1(%{{.*}}, %{{.*}})
// CHECK-NEXT:       affine.store %{{.*}}, %{{.*}}[%{{.*}}] : memref<10xf32>
// CHECK-NEXT:     }
// CHECK-NEXT:   }
// CHECK-NEXT: }
// CHECK-NEXT: affine.for %{{.*}} = 0 to 10 {
// CHECK-NEXT:   affine.for %{{.*}} = 0 to 10 {
// CHECK-NEXT:     affine.for %{{.*}} = 0 to 10 {
// CHECK-NEXT:       affine.load %{{.*}}[%{{.*}}] : memref<10xf32>
// CHECK-NEXT:     }
// CHECK-NEXT:   }
// CHECK-NEXT: }
  return
}

// -----

func.func @should_fuse_init_loops_siblings_then_shared_producer(%arg0: memref<10x10xf32>, %arg1: memref<10x10xf32>) {
  %0 = memref.alloc() : memref<10x10xf32>
  %cst = arith.constant 0.000000e+00 : f32
  %cst_0 = arith.constant 1.000000e+00 : f32
  %cst_1 = arith.constant 7.000000e+00 : f32
  affine.for %i0 = 0 to 10 {
    affine.for %i1 = 0 to 10 {
      affine.store %cst_1, %0[%i0, %i1] : memref<10x10xf32>
    }
  }
  affine.for %i2 = 0 to 3 {
    affine.for %i3 = 0 to 3 {
      affine.store %cst, %arg0[%i2, %i3] : memref<10x10xf32>
    }
  }
  affine.for %i4 = 0 to 3 {
    affine.for %i5 = 0 to 3 {
      %1 = affine.load %0[%i4, %i5] : memref<10x10xf32>
      %2 = affine.load %arg0[%i4, %i5] : memref<10x10xf32>
      %3 = arith.mulf %1, %2 : f32
      affine.store %3, %arg0[%i4, %i5] : memref<10x10xf32>
    }
  }
  affine.for %i6 = 0 to 3 {
    affine.for %i7 = 0 to 3 {
      affine.store %cst_0, %arg1[%i6, %i7] : memref<10x10xf32>
    }
  }
  affine.for %i8 = 0 to 3 {
    affine.for %i9 = 0 to 3 {
      %4 = affine.load %0[%i8, %i9] : memref<10x10xf32>
      %5 = affine.load %arg1[%i8, %i9] : memref<10x10xf32>
      %6 = arith.addf %4, %5 : f32
      affine.store %6, %arg1[%i8, %i9] : memref<10x10xf32>
    }
  }

  // Pass 1: should fuse single-use producer loop nests into their unique
  // user, so '%i2' will fuse into '%i4' and '%i6' will fuse into '%i8'.
  // Pass 2: should fuse sibling loop nests which share no dependence edges,
  // so '%i4' will fuse into '%i8'.
  // Pass 3: should fuse single-use producer loop nest '%i0' into '%i8'. Note
  // that loop nest '%i0' now has a single user after Pass 2 fused its two
  // users together.
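  // The net result (checked below) is a single 3x3 fused nest in which the
  // shared producer '%i0' writes to a 1x1 private memref.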

// CHECK:      affine.for %{{.*}} = 0 to 3 {
// CHECK-NEXT:   affine.for %{{.*}} = 0 to 3 {
// CHECK-NEXT:     affine.store %{{.*}}, %{{.*}}[0, 0] : memref<1x1xf32>
// CHECK-NEXT:     affine.store %{{.*}}, %{{.*}}[%{{.*}}, %{{.*}}] : memref<10x10xf32>
// CHECK-NEXT:     affine.load %{{.*}}[0, 0] : memref<1x1xf32>
// CHECK-NEXT:     affine.load %{{.*}}[%{{.*}}, %{{.*}}] : memref<10x10xf32>
// CHECK-NEXT:     arith.mulf %{{.*}}, %{{.*}} : f32
// CHECK-NEXT:     affine.store %{{.*}}, %{{.*}}[%{{.*}}, %{{.*}}] : memref<10x10xf32>
// CHECK-NEXT:     affine.store %{{.*}}, %{{.*}}[%{{.*}}, %{{.*}}] : memref<10x10xf32>
// CHECK-NEXT:     affine.load %{{.*}}[0, 0] : memref<1x1xf32>
// CHECK-NEXT:     affine.load %{{.*}}[%{{.*}}, %{{.*}}] : memref<10x10xf32>
// CHECK-NEXT:     arith.addf %{{.*}}, %{{.*}} : f32
// CHECK-NEXT:     affine.store %{{.*}}, %{{.*}}[%{{.*}}, %{{.*}}] : memref<10x10xf32>
// CHECK-NEXT:   }
// CHECK-NEXT: }
// CHECK-NEXT: return

  return
}

// -----

func.func @two_matrix_vector_products() {
  %in_matrix = memref.alloc() : memref<10x10xf32>
  %in_vec0 = memref.alloc() : memref<10xf32>
  %in_vec1 = memref.alloc() : memref<10xf32>
  %out_vec0 = memref.alloc() : memref<10xf32>
  %out_vec1 = memref.alloc() : memref<10xf32>
  %cf7 = arith.constant 7.0 : f32

  // Populate the input matrix.
  affine.for %i0 = 0 to 10 {
    affine.for %i1 = 0 to 10 {
      affine.store %cf7, %in_matrix[%i0, %i1] : memref<10x10xf32>
    }
  }
  // out_vec0 = in_matrix x in_vec0
  affine.for %i2 = 0 to 10 {
    affine.for %i3 = 0 to 10 {
      %v0 = affine.load %in_matrix[%i2, %i3] : memref<10x10xf32>
      %v1 = affine.load %in_vec0[%i3] : memref<10xf32>
      %v2 = arith.mulf %v0, %v1 : f32
      %v3 = affine.load %out_vec0[%i3] : memref<10xf32>
      %v4 = arith.addf %v2, %v3 : f32
      affine.store %v4, %out_vec0[%i3] : memref<10xf32>
    }
  }
  // out_vec1 = in_matrix x in_vec1
  affine.for %i4 = 0 to 10 {
    affine.for %i5 = 0 to 10 {
      %v5 = affine.load %in_matrix[%i4, %i5] : memref<10x10xf32>
      %v6 = affine.load %in_vec1[%i5] : memref<10xf32>
      %v7 = arith.mulf %v5, %v6 : f32
      %v8 = affine.load %out_vec1[%i5] : memref<10xf32>
      %v9 = arith.addf %v7, %v8 : f32
      affine.store %v9, %out_vec1[%i5] : memref<10xf32>
    }
  }

  // Both products read the same '%in_matrix', so its initialization loop is
  // fused into the shared outer loop, writing to a 10x1 private memref that
  // both fused consumers read.
// CHECK:      affine.for %{{.*}} = 0 to 10 {
// CHECK-NEXT:   affine.for %{{.*}} = 0 to 10 {
// CHECK-NEXT:     affine.store %{{.*}}, %{{.*}}[%{{.*}}, 0] : memref<10x1xf32>
// CHECK-NEXT:   }
// CHECK-NEXT:   affine.for %{{.*}} = 0 to 10 {
// CHECK-NEXT:     affine.load %{{.*}}[%{{.*}}, 0] : memref<10x1xf32>
// CHECK-NEXT:     affine.load %{{.*}}[%{{.*}}] : memref<10xf32>
// CHECK-NEXT:     arith.mulf %{{.*}}, %{{.*}} : f32
// CHECK-NEXT:     affine.load %{{.*}}[%{{.*}}] : memref<10xf32>
// CHECK-NEXT:     arith.addf %{{.*}}, %{{.*}} : f32
// CHECK-NEXT:     affine.store %{{.*}}, %{{.*}}[%{{.*}}] : memref<10xf32>
// CHECK-NEXT:   }
// CHECK-NEXT:   affine.for %{{.*}} = 0 to 10 {
// CHECK-NEXT:     affine.load %{{.*}}[%{{.*}}, 0] : memref<10x1xf32>
// CHECK-NEXT:     affine.load %{{.*}}[%{{.*}}] : memref<10xf32>
// CHECK-NEXT:     arith.mulf %{{.*}}, %{{.*}} : f32
// CHECK-NEXT:     affine.load %{{.*}}[%{{.*}}] : memref<10xf32>
// CHECK-NEXT:     arith.addf %{{.*}}, %{{.*}} : f32
// CHECK-NEXT:     affine.store %{{.*}}, %{{.*}}[%{{.*}}] : memref<10xf32>
// CHECK-NEXT:   }
// CHECK-NEXT: }
// CHECK-NEXT: return
  return
}

// -----
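
// The 'slice_fusion_barrier' below keeps fusion at depth 1: since '%i1'
// itself cannot be sliced, the private memref retains its full 16-element
// inner dimension (1x16 rather than 1x1).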
func.func @should_not_slice_past_slice_barrier() {
  %0 = memref.alloc() : memref<100x16xf32>
  affine.for %i0 = 0 to 100 {
    affine.for %i1 = 0 to 16 {
      %1 = "op1"() : () -> f32
      affine.store %1, %0[%i0, %i1] : memref<100x16xf32>
    } {slice_fusion_barrier = true}
  }
  affine.for %i2 = 0 to 100 {
    affine.for %i3 = 0 to 16 {
      %2 = affine.load %0[%i2, %i3] : memref<100x16xf32>
      "op2"(%2) : (f32) -> ()
    }
  }
  // The 'slice_fusion_barrier' attribute on '%i1' prevents slicing the
  // iteration space of '%i1' and any enclosing loop nests.
// CHECK:      affine.for %{{.*}} = 0 to 100 {
// CHECK-NEXT:   affine.for %{{.*}} = 0 to 16 {
// CHECK-NEXT:     %{{.*}} = "op1"() : () -> f32
// CHECK-NEXT:     affine.store %{{.*}}, %{{.*}}[0, %{{.*}}] : memref<1x16xf32>
// CHECK-NEXT:   } {slice_fusion_barrier = true}
// CHECK-NEXT:   affine.for %{{.*}} = 0 to 16 {
// CHECK-NEXT:     affine.load %{{.*}}[0, %{{.*}}] : memref<1x16xf32>
// CHECK-NEXT:     "op2"(%{{.*}}) : (f32) -> ()
// CHECK-NEXT:   }
// CHECK-NEXT: }
  return
}

// -----

// The producer and consumer loop nests have mismatched depths (three loops
// vs. four). Under maximal fusion ('fusion-maximal'), the producer is still
// fused into the innermost consumer loop, with the private memref shrunk to
// 1x1 (see the MAXIMAL checks below).
#map = affine_map<(d0, d1) -> (d0 * 16 + d1)>
func.func @fuse_across_dim_mismatch(%arg0: memref<4x4x16x1xf32>, %arg1: memref<144x9xf32>, %arg2: memref<9xf32>) {
  %1 = memref.alloc() : memref<144x4xf32>
  %2 = arith.constant 0.0 : f32
  affine.for %i2 = 0 to 9 {
    affine.for %i3 = 0 to 4 {
      affine.for %i5 = 0 to 16 {
        %7 = affine.apply #map(%i2, %i5)
        affine.store %2, %1[%7, %i3] : memref<144x4xf32>
      }
    }
  }
  affine.for %i6 = 0 to 9 {
    affine.for %i7 = 0 to 9 {
      affine.for %i8 = 0 to 4 {
        affine.for %i10 = 0 to 16 {
          %10 = affine.apply #map(%i6, %i10)
          %11 = affine.load %1[%10, %i8] : memref<144x4xf32>
        }
      }
    }
  }
  return
}
// MAXIMAL:       #[[$MAP:.*]] = affine_map<(d0, d1) -> (d0 * 16 + d1)>
// MAXIMAL-LABEL: func @fuse_across_dim_mismatch
// MAXIMAL:       memref.alloc() : memref<1x1xf32>
// MAXIMAL:       affine.for %{{.*}} = 0 to 9 {
// MAXIMAL-NEXT:    affine.for %{{.*}} = 0 to 9 {
// MAXIMAL-NEXT:      affine.for %{{.*}} = 0 to 4 {
// MAXIMAL-NEXT:        affine.for %{{.*}} = 0 to 16 {
// MAXIMAL-NEXT:          affine.apply #[[$MAP]](%{{.*}}, %{{.*}})
// MAXIMAL-NEXT:          affine.store %{{.*}}, %{{.*}}[0, 0] : memref<1x1xf32>
// MAXIMAL-NEXT:          affine.apply #[[$MAP]](%{{.*}}, %{{.*}})
// MAXIMAL-NEXT:          affine.load %{{.*}}[0, 0] : memref<1x1xf32>
// MAXIMAL-NEXT:        }
// MAXIMAL-NEXT:      }
// MAXIMAL-NEXT:    }
// MAXIMAL-NEXT:  }

// -----
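
// Under maximal fusion, the nest producing '%1' and both of its consumer
// nests below are fused into a single nest, and '%1' shrinks from 64x9 to a
// 64x1 private memref (see the MAXIMAL checks after the function).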
#map3 = affine_map<(d0, d1) -> ((d0 * 72 + d1) floordiv 2304)>
#map4 = affine_map<(d0, d1) -> (((d0 * 72 + d1) mod 2304) floordiv 1152)>
#map5 = affine_map<(d0, d1) -> (((((d0 * 72 + d1) mod 2304) mod 1152) floordiv 9) floordiv 8)>
#map6 = affine_map<(d0, d1) -> (((((d0 * 72 + d1) mod 2304) mod 1152) mod 9) floordiv 3)>
#map7 = affine_map<(d0, d1) -> (((((d0 * 72 + d1) mod 2304) mod 1152) mod 9) mod 3)>
#map10 = affine_map<(d0, d1) -> (d0 * 16 + d1)>
#map11 = affine_map<(d0, d1) -> (d0 * 16 + d1)>
#map12 = affine_map<(d0, d1) -> (d0 * 16 - d1 + 15)>
func.func @fuse_across_varying_dims_complex(%arg0: f32) {
  %c0 = arith.constant 0 : index
  %0 = memref.alloc() : memref<2x2x3x3x16x1xf32>
  %1 = memref.alloc() : memref<64x9xf32>
  %2 = memref.alloc() : memref<144x4xf32>
  affine.for %i0 = 0 to 64 {
    affine.for %i1 = 0 to 9 {
      %4 = affine.apply #map3(%i0, %i1)
      %5 = affine.apply #map4(%i0, %i1)
      %6 = affine.apply #map5(%i0, %i1)
      %7 = affine.apply #map6(%i0, %i1)
      %8 = affine.apply #map7(%i0, %i1)
      %9 = affine.load %0[%4, %5, %7, %8, %6, %c0] : memref<2x2x3x3x16x1xf32>
      affine.store %9, %1[%i0, %i1] : memref<64x9xf32>
    }
  }
  affine.for %i2 = 0 to 9 {
    affine.for %i3 = 0 to 4 {
      affine.for %i4 = 0 to 16 {
        %10 = affine.apply #map10(%i3, %i4)
        %11 = affine.load %1[%10, %i2] : memref<64x9xf32>
      }
      affine.for %i5 = 0 to 16 {
        %14 = affine.apply #map11(%i2, %i5)
        affine.store %arg0, %2[%14, %i3] : memref<144x4xf32>
      }
    }
  }
  affine.for %i6 = 0 to 9 {
    affine.for %i7 = 0 to 9 {
      affine.for %i8 = 0 to 4 {
        affine.for %i9 = 0 to 16 {
          %15 = affine.apply #map12(%i8, %i9)
          %16 = affine.load %1[%15, %i7] : memref<64x9xf32>
        }
      }
    }
  }
  return
}
// MAXIMAL-DAG:   [[$MAP0:#map[0-9]*]] = affine_map<(d0, d1) -> ((d0 * 72 + d1) floordiv 2304)>
// MAXIMAL-DAG:   [[$MAP1:#map[0-9]*]] = affine_map<(d0, d1) -> (((d0 * 72 + d1) mod 2304) floordiv 1152)>
// MAXIMAL-DAG:   [[$MAP2:#map[0-9]*]] = affine_map<(d0, d1) -> ((((d0 * 72 + d1) mod 1152) floordiv 9) floordiv 8)>
// MAXIMAL-DAG:   [[$MAP3:#map[0-9]*]] = affine_map<(d0, d1) -> ((d1 mod 9) floordiv 3)>
// MAXIMAL-DAG:   [[$MAP4:#map[0-9]*]] = affine_map<(d0, d1) -> (d1 mod 3)>
// MAXIMAL-DAG:   [[$MAP7:#map[0-9]*]] = affine_map<(d0, d1) -> (d0 * 16 + d1)>
// MAXIMAL-DAG:   [[$MAP8:#map[0-9]*]] = affine_map<(d0, d1) -> (d0 * 16 - d1 + 15)>
// MAXIMAL-LABEL: func @fuse_across_varying_dims_complex
// MAXIMAL-NEXT:  memref.alloc() : memref<64x1xf32>
// MAXIMAL-NEXT:  arith.constant 0 : index
// MAXIMAL-NEXT:  memref.alloc() : memref<2x2x3x3x16x1xf32>
// MAXIMAL-NEXT:  memref.alloc() : memref<144x4xf32>
// MAXIMAL-NEXT:  affine.for %{{.*}} = 0 to 9 {
// MAXIMAL-NEXT:    affine.for %{{.*}} = 0 to 4 {
// MAXIMAL-NEXT:      affine.for %{{.*}} = 0 to 16 {
// MAXIMAL-NEXT:        affine.for %{{.*}} = 0 to 64 {
// MAXIMAL-NEXT:          affine.apply [[$MAP0]](%{{.*}}, %{{.*}})
// MAXIMAL-NEXT:          affine.apply [[$MAP1]](%{{.*}}, %{{.*}})
// MAXIMAL-NEXT:          affine.apply [[$MAP2]](%{{.*}}, %{{.*}})
// MAXIMAL-NEXT:          affine.apply [[$MAP3]](%{{.*}}, %{{.*}})
// MAXIMAL-NEXT:          affine.apply [[$MAP4]](%{{.*}}, %{{.*}})
// MAXIMAL-NEXT:          affine.load %{{.*}}[%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}] : memref<2x2x3x3x16x1xf32>
// MAXIMAL-NEXT:          affine.store %{{.*}}, %{{.*}}[%{{.*}}, 0] : memref<64x1xf32>
// MAXIMAL-NEXT:        }
// MAXIMAL-NEXT:        affine.apply [[$MAP7]](%{{.*}}, %{{.*}})
// MAXIMAL-NEXT:        affine.load %{{.*}}[%{{.*}} * 16 + %{{.*}}, 0] : memref<64x1xf32>
// MAXIMAL-NEXT:        affine.for %{{.*}} = 0 to 9 {
// MAXIMAL-NEXT:          affine.for %{{.*}} = 0 to 4 {
// MAXIMAL-NEXT:            affine.for %{{.*}} = 0 to 16 {
// MAXIMAL-NEXT:              affine.apply [[$MAP8]](%{{.*}}, %{{.*}})
// MAXIMAL-NEXT:              affine.load %{{.*}}[%{{.*}} * 16 - %{{.*}} + 15, 0] : memref<64x1xf32>
// MAXIMAL-NEXT:            }
// MAXIMAL-NEXT:          }
// MAXIMAL-NEXT:        }
// MAXIMAL-NEXT:      }
// MAXIMAL-NEXT:      affine.for %{{.*}} = 0 to 16 {
// MAXIMAL-NEXT:        affine.apply [[$MAP7]](%{{.*}}, %{{.*}})
// MAXIMAL-NEXT:        affine.store %{{.*}}, %{{.*}}[%{{.*}}, %{{.*}}] : memref<144x4xf32>
// MAXIMAL-NEXT:      }
// MAXIMAL-NEXT:    }
// MAXIMAL-NEXT:  }

// -----

func.func @should_fuse_with_slice_union() {
  %a = memref.alloc() : memref<100xf32>
  %c0 = arith.constant 0 : index
  %cf0 = arith.constant 0.0 : f32

  affine.for %i0 = 0 to 100 {
    affine.store %cf0, %a[%i0]: memref<100xf32>
  }

  affine.for %i1 = 10 to 20 {
    %v0 = affine.load %a[%i1]: memref<100xf32>
    affine.for %i2 = 15 to 25 {
      %v1 = affine.load %a[%i2]: memref<100xf32>
    }
  }
  // The union of the two slice bounds (computed between the store and each
  // of the loads) is used in the fusion cost calculation, index remapping,
  // and private memref sizing. As a result, the temporary memref is reduced
  // from 100xf32 to 15xf32 and is properly indexed by the fused loops based
  // on the union calculation.
// CHECK:      affine.for %{{.*}} = 10 to 20 {
// CHECK-NEXT:   affine.for %{{.*}} = 10 to 25 {
// CHECK-NEXT:     affine.store %{{.*}}, %{{.*}}[%{{.*}} - 10] : memref<15xf32>
// CHECK-NEXT:   }
// CHECK-NEXT:   affine.load %{{.*}}[%{{.*}} - 10] : memref<15xf32>
// CHECK-NEXT:   affine.for %{{.*}} = 15 to 25 {
// CHECK-NEXT:     affine.load %{{.*}}[%{{.*}} - 10] : memref<15xf32>
// CHECK-NEXT:   }
// CHECK-NEXT: }
// CHECK-NEXT: return
  return
}

// -----

func.func @affine_add_mm_fused(%arg0: memref<1024x1024xf32>, %arg1: memref<1024x1024xf32>, %arg2: memref<1024x1024xf32>, %arg3: memref<1024x1024xf32>) {
  affine.for %i2 = 0 to 1024 {
    affine.for %i3 = 0 to 1024 {
      %0 = affine.load %arg3[%i2, %i3] : memref<1024x1024xf32>
      %1 = affine.load %arg2[%i2, %i3] : memref<1024x1024xf32>
      %2 = arith.addf %1, %0 : f32
      affine.store %2, %arg2[%i2, %i3] : memref<1024x1024xf32>
    }
  }
  affine.for %i4 = 0 to 1024 {
    affine.for %i5 = 0 to 1024 {
      affine.for %i6 = 0 to 1024 {
        %3 = affine.load %arg1[%i6, %i5] : memref<1024x1024xf32>
        %4 = affine.load %arg0[%i4, %i6] : memref<1024x1024xf32>
        %5 = arith.mulf %4, %3 : f32
        %6 = affine.load %arg2[%i4, %i5] : memref<1024x1024xf32>
        %7 = arith.addf %6, %5 : f32
        affine.store %7, %arg2[%i4, %i5] : memref<1024x1024xf32>
      }
    }
  }
  // Should fuse the elementwise add loop nest at loop depth 2, above the
  // loop-carried dependence between the load and store on '%arg2' that is
  // carried on the reduction loop '%i6'.
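  // In the fused nest below, the add runs in the matmul's (i, j) body
  // directly above the reduction loop over k.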
  // CHECK:      affine.for %{{.*}} = 0 to 1024 {
  // CHECK-NEXT:   affine.for %{{.*}} = 0 to 1024 {
  // CHECK-NEXT:     affine.load %{{.*}}[%{{.*}}, %{{.*}}] : memref<1024x1024xf32>
  // CHECK-NEXT:     affine.load %{{.*}}[%{{.*}}, %{{.*}}] : memref<1024x1024xf32>
  // CHECK-NEXT:     arith.addf %{{.*}}, %{{.*}} : f32
  // CHECK-NEXT:     affine.store %{{.*}}, %{{.*}}[%{{.*}}, %{{.*}}] : memref<1024x1024xf32>
  // CHECK-NEXT:     affine.for %{{.*}} = 0 to 1024 {
  // CHECK-NEXT:       affine.load %{{.*}}[%{{.*}}, %{{.*}}] : memref<1024x1024xf32>
  // CHECK-NEXT:       affine.load %{{.*}}[%{{.*}}, %{{.*}}] : memref<1024x1024xf32>
  // CHECK-NEXT:       arith.mulf %{{.*}}, %{{.*}} : f32
  // CHECK-NEXT:       affine.load %{{.*}}[%{{.*}}, %{{.*}}] : memref<1024x1024xf32>
  // CHECK-NEXT:       arith.addf %{{.*}}, %{{.*}} : f32
  // CHECK-NEXT:       affine.store %{{.*}}, %{{.*}}[%{{.*}}, %{{.*}}] : memref<1024x1024xf32>
  // CHECK-NEXT:     }
  // CHECK-NEXT:   }
  // CHECK-NEXT: }
  return
}

// -----

func.func @affine_2mm_fused(%arg0: memref<1024x1024xf32>, %arg1: memref<1024x1024xf32>, %arg2: memref<1024x1024xf32>, %arg3: memref<1024x1024xf32>, %arg4: memref<1024x1024xf32>) {
  %cst = arith.constant 0.000000e+00 : f32
  affine.for %i0 = 0 to 1024 {
    affine.for %i1 = 0 to 1024 {
      affine.store %cst, %arg2[%i0, %i1] : memref<1024x1024xf32>
    }
  }
  affine.for %i2 = 0 to 1024 {
    affine.for %i3 = 0 to 1024 {
      affine.store %cst, %arg4[%i2, %i3] : memref<1024x1024xf32>
    }
  }
  affine.for %i4 = 0 to 1024 {
    affine.for %i5 = 0 to 1024 {
      affine.for %i6 = 0 to 1024 {
        %0 = affine.load %arg1[%i6, %i5] : memref<1024x1024xf32>
        %1 = affine.load %arg0[%i4, %i6] : memref<1024x1024xf32>
        %2 = arith.mulf %1, %0 : f32
        %3 = affine.load %arg2[%i4, %i5] : memref<1024x1024xf32>
        %4 = arith.addf %3, %2 : f32
        affine.store %4, %arg2[%i4, %i5] : memref<1024x1024xf32>
      }
    }
  }
  affine.for %i7 = 0 to 1024 {
    affine.for %i8 = 0 to 1024 {
      affine.for %i9 = 0 to 1024 {
        %5 = affine.load %arg1[%i9, %i8] : memref<1024x1024xf32>
        %6 = affine.load %arg0[%i7, %i9] : memref<1024x1024xf32>
        %7 = arith.mulf %6, %5 : f32
        %8 = affine.load %arg4[%i7, %i8] : memref<1024x1024xf32>
        %9 = arith.addf %8, %7 : f32
        affine.store %9, %arg4[%i7, %i8] : memref<1024x1024xf32>
      }
    }
  }

  // Should fuse the MM initialization loops into their consumers, then fuse
  // the two matmul loops together for input reuse on '%arg0'/'%arg1'.
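  // In the fused output below, both matmuls share one outermost loop, and
  // each initialization store sits inside its matmul's second loop, directly
  // above the reduction loop.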

  // CHECK:      affine.for %{{.*}} = 0 to 1024 {
  // CHECK-NEXT:   affine.for %{{.*}} = 0 to 1024 {
  // CHECK-NEXT:     affine.store %{{.*}}, %{{.*}}[%{{.*}}, %{{.*}}] : memref<1024x1024xf32>
  // CHECK-NEXT:     affine.for %{{.*}} = 0 to 1024 {
  // CHECK-NEXT:       affine.load %{{.*}}[%{{.*}}, %{{.*}}] : memref<1024x1024xf32>
  // CHECK-NEXT:       affine.load %{{.*}}[%{{.*}}, %{{.*}}] : memref<1024x1024xf32>
  // CHECK-NEXT:       arith.mulf %{{.*}}, %{{.*}} : f32
  // CHECK-NEXT:       affine.load %{{.*}}[%{{.*}}, %{{.*}}] : memref<1024x1024xf32>
  // CHECK-NEXT:       arith.addf %{{.*}}, %{{.*}} : f32
  // CHECK-NEXT:       affine.store %{{.*}}, %{{.*}}[%{{.*}}, %{{.*}}] : memref<1024x1024xf32>
  // CHECK-NEXT:     }
  // CHECK-NEXT:   }
  // CHECK-NEXT:   affine.for %{{.*}} = 0 to 1024 {
  // CHECK-NEXT:     affine.store %{{.*}}, %{{.*}}[%{{.*}}, %{{.*}}] : memref<1024x1024xf32>
  // CHECK-NEXT:     affine.for %{{.*}} = 0 to 1024 {
  // CHECK-NEXT:       affine.load %{{.*}}[%{{.*}}, %{{.*}}] : memref<1024x1024xf32>
  // CHECK-NEXT:       affine.load %{{.*}}[%{{.*}}, %{{.*}}] : memref<1024x1024xf32>
  // CHECK-NEXT:       arith.mulf %{{.*}}, %{{.*}} : f32
  // CHECK-NEXT:       affine.load %{{.*}}[%{{.*}}, %{{.*}}] : memref<1024x1024xf32>
  // CHECK-NEXT:       arith.addf %{{.*}}, %{{.*}} : f32
  // CHECK-NEXT:       affine.store %{{.*}}, %{{.*}}[%{{.*}}, %{{.*}}] : memref<1024x1024xf32>
  // CHECK-NEXT:     }
  // CHECK-NEXT:   }
  // CHECK-NEXT: }

  return
}

// -----

func.func @affine_2_dependent_mm_fused(%arg0: memref<1024x1024xf32>, %arg1: memref<1024x1024xf32>, %arg2: memref<1024x1024xf32>, %arg3: memref<1024x1024xf32>, %arg4: memref<1024x1024xf32>) {
  affine.for %i0 = 0 to 1024 {
    affine.for %i1 = 0 to 1024 {
      affine.for %i2 = 0 to 1024 {
        %0 = affine.load %arg1[%i2, %i1] : memref<1024x1024xf32>
        %1 = affine.load %arg0[%i0, %i2] : memref<1024x1024xf32>
        %2 = arith.mulf %1, %0 : f32
        %3 = affine.load %arg2[%i0, %i1] : memref<1024x1024xf32>
        %4 = arith.addf %3, %2 : f32
        affine.store %4, %arg2[%i0, %i1] : memref<1024x1024xf32>
      }
    }
  }
  affine.for %i3 = 0 to 1024 {
    affine.for %i4 = 0 to 1024 {
      affine.for %i5 = 0 to 1024 {
        %5 = affine.load %arg3[%i5, %i4] : memref<1024x1024xf32>
        %6 = affine.load %arg2[%i3, %i5] : memref<1024x1024xf32>
        %7 = arith.mulf %6, %5 : f32
        %8 = affine.load %arg4[%i3, %i4] : memref<1024x1024xf32>
        %9 = arith.addf %8, %7 : f32
        affine.store %9, %arg4[%i3, %i4] : memref<1024x1024xf32>
      }
    }
  }
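
  // The second matmul reads '%arg2', which the first matmul accumulates
  // into, so the two nests can only be fused at the outermost loop (checked
  // below): a row of '%arg2' is only complete once the first matmul finishes
  // a full iteration of its outer loop.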
  // CHECK:      affine.for %{{.*}} = 0 to 1024 {
  // CHECK-NEXT:   affine.for %{{.*}} = 0 to 1024 {
  // CHECK-NEXT:     affine.for %{{.*}} = 0 to 1024 {
  // CHECK-NEXT:       affine.load %{{.*}}[%{{.*}}, %{{.*}}] : memref<1024x1024xf32>
  // CHECK-NEXT:       affine.load %{{.*}}[%{{.*}}, %{{.*}}] : memref<1024x1024xf32>
  // CHECK-NEXT:       arith.mulf %{{.*}}, %{{.*}} : f32
  // CHECK-NEXT:       affine.load %{{.*}}[%{{.*}}, %{{.*}}] : memref<1024x1024xf32>
  // CHECK-NEXT:       arith.addf %{{.*}}, %{{.*}} : f32
  // CHECK-NEXT:       affine.store %{{.*}}, %{{.*}}[%{{.*}}, %{{.*}}] : memref<1024x1024xf32>
  // CHECK-NEXT:     }
  // CHECK-NEXT:   }
  // CHECK-NEXT:   affine.for %{{.*}} = 0 to 1024 {
  // CHECK-NEXT:     affine.for %{{.*}} = 0 to 1024 {
  // CHECK-NEXT:       affine.load %{{.*}}[%{{.*}}, %{{.*}}] : memref<1024x1024xf32>
  // CHECK-NEXT:       affine.load %{{.*}}[%{{.*}}, %{{.*}}] : memref<1024x1024xf32>
  // CHECK-NEXT:       arith.mulf %{{.*}}, %{{.*}} : f32
  // CHECK-NEXT:       affine.load %{{.*}}[%{{.*}}, %{{.*}}] : memref<1024x1024xf32>
  // CHECK-NEXT:       arith.addf %{{.*}}, %{{.*}} : f32
  // CHECK-NEXT:       affine.store %{{.*}}, %{{.*}}[%{{.*}}, %{{.*}}] : memref<1024x1024xf32>
  // CHECK-NEXT:     }
  // CHECK-NEXT:   }
  // CHECK-NEXT: }
  return
}

// -----

// The producer writes both '%local_m' (read back within the same nest) and
// '%m' (read by the consumer); fusion must handle the self-dependence on
// '%local_m'.
// CHECK-LABEL: func @should_fuse_self_dependence_multi_store_producer() {
func.func @should_fuse_self_dependence_multi_store_producer() {
  %m = memref.alloc() : memref<10xf32>
  %local_m = memref.alloc() : memref<10xf32>
  %cf7 = arith.constant 7.0 : f32

  affine.for %i0 = 0 to 10 {
    affine.store %cf7, %local_m[%i0] : memref<10xf32>
    %v0 = affine.load %local_m[%i0] : memref<10xf32>
    affine.store %v0, %m[%i0] : memref<10xf32>
  }
  affine.for %i1 = 0 to 10 {
    %v1 = affine.load %m[%i1] : memref<10xf32>
  }
  // CHECK:      affine.for %[[i0:.*]] = 0 to 10 {
  // CHECK-NEXT:   affine.store %{{.*}}, [[LOCAL_M:%.*]][%[[i0]]] : memref<10xf32>
  // CHECK-NEXT:   [[v0:%.*]] = affine.load [[LOCAL_M]][%[[i0]]] : memref<10xf32>
  // CHECK-NEXT:   affine.store [[v0]], %{{.*}}[0] : memref<1xf32>
  // CHECK-NEXT:   affine.load %{{.*}}[0] : memref<1xf32>
  // CHECK-NEXT: }
  // CHECK-NEXT: return
  return
}

// -----

// The store to '%dead_m' has no consumers, but the producer can still be
// fused into its single user and the original loop nest removed.
// CHECK-LABEL: func @should_fuse_dead_multi_store_producer() {
func.func @should_fuse_dead_multi_store_producer() {
  %m = memref.alloc() : memref<10xf32>
  %dead_m = memref.alloc() : memref<10xf32>
  %cf7 = arith.constant 7.0 : f32

  affine.for %i0 = 0 to 10 {
    affine.store %cf7, %dead_m[%i0] : memref<10xf32>
    affine.store %cf7, %m[%i0] : memref<10xf32>
  }
  affine.for %i1 = 0 to 10 {
    %v0 = affine.load %m[%i1] : memref<10xf32>
  }
  // CHECK:      affine.for %[[i0:.*]] = 0 to 10 {
  // CHECK-NEXT:   affine.store %{{.*}}, %{{.*}}[%[[i0]]] : memref<10xf32>
  // CHECK-NEXT:   affine.store %{{.*}}, %{{.*}}[0] : memref<1xf32>
  // CHECK-NEXT:   affine.load %{{.*}}[0] : memref<1xf32>
  // CHECK-NEXT: }
  // CHECK-NEXT: return
  return
}

// -----

// '%live_in_out_m' is a function argument, so the store to it must be
// preserved in the fused loop.
// CHECK-LABEL: func @should_fuse_function_live_out_multi_store_producer
func.func @should_fuse_function_live_out_multi_store_producer(%live_in_out_m : memref<10xf32>) {
  %m = memref.alloc() : memref<10xf32>
  %cf7 = arith.constant 7.0 : f32

  affine.for %i0 = 0 to 10 {
    affine.store %cf7, %live_in_out_m[%i0] : memref<10xf32>
    affine.store %cf7, %m[%i0] : memref<10xf32>
  }
  affine.for %i1 = 0 to 10 {
    %v0 = affine.load %m[%i1] : memref<10xf32>
  }
  // CHECK:      affine.for %[[i0:.*]] = 0 to 10 {
  // CHECK-NEXT:   affine.store %{{.*}}, %{{.*}}[%[[i0]]] : memref<10xf32>
  // CHECK-NEXT:   affine.store %{{.*}}, %{{.*}}[0] : memref<1xf32>
  // CHECK-NEXT:   affine.load %{{.*}}[0] : memref<1xf32>
  // CHECK-NEXT: }
  // CHECK-NEXT: return
  return
}

// Add further tests in mlir/test/Transforms/loop-fusion-4.mlir.