// RUN: mlir-opt -allow-unregistered-dialect %s -pass-pipeline='builtin.module(func.func(affine-loop-fusion))' -split-input-file | FileCheck %s

// Part II of fusion tests in mlir/test/Transforms/loop-fusion-2.mlir.
// Part III of fusion tests in mlir/test/Transforms/loop-fusion-3.mlir.
// Part IV of fusion tests in mlir/test/Transforms/loop-fusion-4.mlir.

// TODO: Add more tests:
// *) Add nested fusion test cases when non-constant loop bound support is
//    added to iteration domain in dependence check.
// *) Add a test w/ floordiv/ceildiv/mod when supported in dependence check.
// *) Add tests which check fused computation slice indexing and loop bounds.
// TODO: Test clean up: move memref allocs to func args.

// -----

// CHECK-LABEL: func @should_fuse_raw_dep_for_locality() {
func.func @should_fuse_raw_dep_for_locality() {
  %m = memref.alloc() : memref<10xf32>
  %cf7 = arith.constant 7.0 : f32

  affine.for %i0 = 0 to 10 {
    affine.store %cf7, %m[%i0] : memref<10xf32>
  }
  affine.for %i1 = 0 to 10 {
    %v0 = affine.load %m[%i1] : memref<10xf32>
  }
  // CHECK: affine.for %{{.*}} = 0 to 10 {
  // CHECK-NEXT: affine.store %{{.*}}, %{{.*}}[0] : memref<1xf32>
  // CHECK-NEXT: affine.load %{{.*}}[0] : memref<1xf32>
  // CHECK-NEXT: }
  // CHECK-NEXT: return
  return
}

// -----

// CHECK-LABEL: func @should_fuse_reduction_to_pointwise() {
func.func @should_fuse_reduction_to_pointwise() {
  %a = memref.alloc() : memref<10x10xf32>
  %b = memref.alloc() : memref<10xf32>
  %c = memref.alloc() : memref<10xf32>

  %cf7 = arith.constant 7.0 : f32

  affine.for %i0 = 0 to 10 {
    affine.for %i1 = 0 to 10 {
      %v0 = affine.load %b[%i0] : memref<10xf32>
      %v1 = affine.load %a[%i0, %i1] : memref<10x10xf32>
      %v3 = arith.addf %v0, %v1 : f32
      affine.store %v3, %b[%i0] : memref<10xf32>
    }
  }
  affine.for %i2 = 0 to 10 {
    %v4 = affine.load %b[%i2] : memref<10xf32>
    affine.store %v4, %c[%i2] : memref<10xf32>
  }

  // Should fuse in entire inner loop on %i1 from source loop nest, as %i1
  // is not used in the access function of the store/load on '%b'.
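  // Because the '%b' accesses are indexed only by '%i0', each fused iteration
  // touches a single element of '%b', so the private memref can shrink to
  // memref<1xf32>, as the CHECK lines below verify.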
  // CHECK: affine.for %{{.*}} = 0 to 10 {
  // CHECK-NEXT: affine.for %{{.*}} = 0 to 10 {
  // CHECK-NEXT: affine.load %{{.*}}[0] : memref<1xf32>
  // CHECK-NEXT: affine.load %{{.*}}[%{{.*}}, %{{.*}}] : memref<10x10xf32>
  // CHECK-NEXT: arith.addf %{{.*}}, %{{.*}} : f32
  // CHECK-NEXT: affine.store %{{.*}}, %{{.*}}[0] : memref<1xf32>
  // CHECK-NEXT: }
  // CHECK-NEXT: affine.load %{{.*}}[0] : memref<1xf32>
  // CHECK-NEXT: affine.store %{{.*}}, %{{.*}}[%{{.*}}] : memref<10xf32>
  // CHECK-NEXT: }
  // CHECK-NEXT: return
  return
}

// -----

// CHECK-DAG: [[$MAP_SHIFT_MINUS_ONE_R1:#map[0-9a-zA-Z_]*]] = affine_map<(d0) -> (d0 - 1)>
// CHECK-DAG: [[$MAP_SHIFT_D0_BY_ONE:#map[0-9a-zA-Z_]*]] = affine_map<(d0, d1) -> (d0 + 1)>
// CHECK-DAG: [[$MAP_SHIFT_D1_BY_ONE:#map[0-9a-zA-Z_]*]] = affine_map<(d0, d1) -> (d1 + 1)>

// CHECK-LABEL: func @should_fuse_loop_nests_with_shifts() {
func.func @should_fuse_loop_nests_with_shifts() {
  %a = memref.alloc() : memref<10x10xf32>
  %cf7 = arith.constant 7.0 : f32

  affine.for %i0 = 0 to 9 {
    affine.for %i1 = 0 to 9 {
      affine.store %cf7, %a[%i0 + 1, %i1 + 1] : memref<10x10xf32>
    }
  }
  affine.for %i2 = 1 to 10 {
    affine.for %i3 = 1 to 10 {
      %v0 = affine.load %a[%i2, %i3] : memref<10x10xf32>
    }
  }

  // Source slice affine apply sequence:
  // *) First two affine applies map from the dst to the src iteration space.
  // *) Third affine apply is the access function around the src store.
  // *) Fourth affine apply shifts the store's access function by '-1', because
  //    of the offset induced by reducing the memref shape from 10x10 to 9x9.
  // *) Fifth affine apply shifts the load's access function by '-1', because
  //    of the offset induced by reducing the memref shape from 10x10 to 9x9.
  // NOTE: Should create a private memref with reduced shape 9x9xf32.
  // CHECK: affine.for %{{.*}} = 1 to 10 {
  // CHECK-NEXT: affine.for %{{.*}} = 1 to 10 {
  // CHECK-NEXT: %[[I:.*]] = affine.apply [[$MAP_SHIFT_MINUS_ONE_R1]](%{{.*}})
  // CHECK-NEXT: %[[J:.*]] = affine.apply [[$MAP_SHIFT_MINUS_ONE_R1]](%{{.*}})
  // CHECK-NEXT: affine.apply [[$MAP_SHIFT_D0_BY_ONE]](%[[I]], %[[J]])
  // CHECK-NEXT: affine.apply [[$MAP_SHIFT_D1_BY_ONE]](%[[I]], %[[J]])
  // CHECK-NEXT: affine.store %{{.*}}, %{{.*}}[0, 0] : memref<1x1xf32>
  // CHECK-NEXT: affine.load %{{.*}}[0, 0] : memref<1x1xf32>
  // CHECK-NEXT: }
  // CHECK-NEXT: }
  // CHECK-NEXT: return
  return
}

// -----

// CHECK-LABEL: func @should_fuse_loop_nest() {
func.func @should_fuse_loop_nest() {
  %a = memref.alloc() : memref<10x10xf32>
  %b = memref.alloc() : memref<10x10xf32>
  %cf7 = arith.constant 7.0 : f32

  affine.for %i0 = 0 to 10 {
    affine.for %i1 = 0 to 10 {
      affine.store %cf7, %a[%i0, %i1] : memref<10x10xf32>
    }
  }
  affine.for %i2 = 0 to 10 {
    affine.for %i3 = 0 to 10 {
      %v0 = affine.load %a[%i3, %i2] : memref<10x10xf32>
      affine.store %v0, %b[%i2, %i3] : memref<10x10xf32>
    }
  }
  affine.for %i4 = 0 to 10 {
    affine.for %i5 = 0 to 10 {
      %v1 = affine.load %b[%i4, %i5] : memref<10x10xf32>
    }
  }
  // Expecting private memref for '%a' first, then private memref for '%b'.
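  // Both the '%a' and '%b' producer/consumer pairs access a single point per
  // (i, j) iteration of the fused nest, so both private memrefs collapse to
  // memref<1x1xf32>.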
  // CHECK-DAG: [[NEWA:%[0-9a-zA-Z_]+]] = memref.alloc() : memref<1x1xf32>
  // CHECK-DAG: [[NEWB:%[0-9a-zA-Z_]+]] = memref.alloc() : memref<1x1xf32>
  // CHECK: affine.for %{{.*}} = 0 to 10 {
  // CHECK-NEXT: affine.for %{{.*}} = 0 to 10 {
  // CHECK-NEXT: affine.store %{{.*}}, [[NEWA]][0, 0] : memref<1x1xf32>
  // CHECK-NEXT: affine.load [[NEWA]][0, 0] : memref<1x1xf32>
  // CHECK-NEXT: affine.store %{{.*}}, [[NEWB]][0, 0] : memref<1x1xf32>
  // CHECK-NEXT: affine.load [[NEWB]][0, 0] : memref<1x1xf32>
  // CHECK-NEXT: }
  // CHECK-NEXT: }
  // CHECK-NEXT: return
  return
}

// -----

// CHECK-LABEL: func @should_fuse_across_intermediate_loop_with_no_deps() {
func.func @should_fuse_across_intermediate_loop_with_no_deps() {
  %a = memref.alloc() : memref<10xf32>
  %b = memref.alloc() : memref<10xf32>
  %c = memref.alloc() : memref<10xf32>

  %cf7 = arith.constant 7.0 : f32

  affine.for %i0 = 0 to 10 {
    %v0 = affine.load %a[%i0] : memref<10xf32>
    affine.store %v0, %b[%i0] : memref<10xf32>
  }
  affine.for %i1 = 0 to 10 {
    affine.store %cf7, %c[%i1] : memref<10xf32>
  }
  affine.for %i2 = 0 to 10 {
    %v1 = affine.load %b[%i2] : memref<10xf32>
  }

  // Should fuse first loop (past second loop with no dependences) into third.
  // Note that fusion creates a private memref '%2' for the fused loop nest.
  // CHECK: affine.for %{{.*}} = 0 to 10 {
  // CHECK-NEXT: affine.store %{{.*}}, %{{.*}}[%{{.*}}] : memref<10xf32>
  // CHECK-NEXT: }
  // CHECK: affine.for %{{.*}} = 0 to 10 {
  // CHECK-NEXT: affine.load %{{.*}}[%{{.*}}] : memref<10xf32>
  // CHECK-NEXT: affine.store %{{.*}}, %{{.*}}[0] : memref<1xf32>
  // CHECK-NEXT: affine.load %{{.*}}[0] : memref<1xf32>
  // CHECK-NEXT: }
  // CHECK-NEXT: return
  return
}

// -----

// CHECK-LABEL: func @should_fuse_all_loops() {
func.func @should_fuse_all_loops() {
  %a = memref.alloc() : memref<10xf32>
  %b = memref.alloc() : memref<10xf32>
  %cf7 = arith.constant 7.0 : f32

  // Set up flow dependences from first and second loops to third.
  affine.for %i0 = 0 to 10 {
    affine.store %cf7, %a[%i0] : memref<10xf32>
  }
  affine.for %i1 = 0 to 10 {
    affine.store %cf7, %b[%i1] : memref<10xf32>
  }
  affine.for %i2 = 0 to 10 {
    %v0 = affine.load %a[%i2] : memref<10xf32>
    %v1 = affine.load %b[%i2] : memref<10xf32>
  }

  // Should fuse first and second loops into third.
  // Expecting private memref for '%a' first, then private memref for '%b'.
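  // Each producer writes a memref read only by the third loop, so both can be
  // fused into it, yielding one loop that performs both stores and then both
  // loads on 1-element private memrefs.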
  // CHECK-DAG: [[NEWA:%[0-9a-zA-Z_]+]] = memref.alloc() : memref<1xf32>
  // CHECK-DAG: [[NEWB:%[0-9a-zA-Z_]+]] = memref.alloc() : memref<1xf32>
  // CHECK: affine.for %{{.*}} = 0 to 10 {
  // CHECK-NEXT: affine.store %{{.*}}, [[NEWA]][0] : memref<1xf32>
  // CHECK-NEXT: affine.store %{{.*}}, [[NEWB]][0] : memref<1xf32>
  // CHECK-NEXT: affine.load [[NEWA]][0] : memref<1xf32>
  // CHECK-NEXT: affine.load [[NEWB]][0] : memref<1xf32>
  // CHECK-NEXT: }
  // CHECK-NEXT: return
  return
}

// -----

// CHECK-LABEL: func @should_fuse_first_and_second_loops() {
func.func @should_fuse_first_and_second_loops() {
  %a = memref.alloc() : memref<10xf32>
  %b = memref.alloc() : memref<10xf32>
  %c = memref.alloc() : memref<10xf32>

  %cf7 = arith.constant 7.0 : f32

  affine.for %i0 = 0 to 10 {
    affine.store %cf7, %a[%i0] : memref<10xf32>
  }
  affine.for %i1 = 0 to 10 {
    %v0 = affine.load %a[%i1] : memref<10xf32>
    affine.store %cf7, %b[%i1] : memref<10xf32>
  }
  affine.for %i2 = 0 to 10 {
    %v1 = affine.load %c[%i2] : memref<10xf32>
  }

  // Should fuse first loop into the second (last loop should not be fused).
  // Should create private memref '%2' for the fused loop nest.
  // CHECK: affine.for %{{.*}} = 0 to 10 {
  // CHECK-NEXT: affine.store %{{.*}}, %{{.*}}[0] : memref<1xf32>
  // CHECK-NEXT: affine.load %{{.*}}[0] : memref<1xf32>
  // CHECK-NEXT: affine.store %{{.*}}, %{{.*}}[%{{.*}}] : memref<10xf32>
  // CHECK-NEXT: }
  // CHECK: affine.for %{{.*}} = 0 to 10 {
  // CHECK-NEXT: affine.load %{{.*}}[%{{.*}}] : memref<10xf32>
  // CHECK-NEXT: }
  // CHECK-NEXT: return

  return
}

// -----

// CHECK-LABEL: func @should_not_fuse_would_create_cycle() {
func.func @should_not_fuse_would_create_cycle() {
  %a = memref.alloc() : memref<10xf32>
  %b = memref.alloc() : memref<10xf32>
  %c = memref.alloc() : memref<10xf32>

  %cf7 = arith.constant 7.0 : f32

  // Set up the following dependences:
  // 1) loop0 -> loop1 on memref '%a'
  // 2) loop0 -> loop2 on memref '%b'
  // 3) loop1 -> loop2 on memref '%c'
  affine.for %i0 = 0 to 10 {
    %v0 = affine.load %a[%i0] : memref<10xf32>
    affine.store %cf7, %b[%i0] : memref<10xf32>
  }
  affine.for %i1 = 0 to 10 {
    affine.store %cf7, %a[%i1] : memref<10xf32>
    %v1 = affine.load %c[%i1] : memref<10xf32>
  }
  affine.for %i2 = 0 to 10 {
    %v2 = affine.load %b[%i2] : memref<10xf32>
    affine.store %cf7, %c[%i2] : memref<10xf32>
  }
  // Should not fuse: fusing the first loop into the last would create a cycle.
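  // Concretely: the fused loop would have to be placed before '%i1' (to
  // preserve the WAR dep on '%a' from loop0 to loop1) and also after '%i1'
  // (to preserve the WAR dep on '%c' from loop1 to loop2), which is
  // impossible.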
  // CHECK: affine.for %{{.*}} = 0 to 10 {
  // CHECK-NEXT: affine.load %{{.*}}[%{{.*}}] : memref<10xf32>
  // CHECK-NEXT: affine.store %{{.*}}, %{{.*}}[%{{.*}}] : memref<10xf32>
  // CHECK-NEXT: }
  // CHECK: affine.for %{{.*}} = 0 to 10 {
  // CHECK-NEXT: affine.store %{{.*}}, %{{.*}}[%{{.*}}] : memref<10xf32>
  // CHECK-NEXT: affine.load %{{.*}}[%{{.*}}] : memref<10xf32>
  // CHECK-NEXT: }
  // CHECK: affine.for %{{.*}} = 0 to 10 {
  // CHECK-NEXT: affine.load %{{.*}}[%{{.*}}] : memref<10xf32>
  // CHECK-NEXT: affine.store %{{.*}}, %{{.*}}[%{{.*}}] : memref<10xf32>
  // CHECK-NEXT: }
  // CHECK-NEXT: return
  return
}

// -----

// CHECK-LABEL: func @should_fuse_producer_consumer() {
func.func @should_fuse_producer_consumer() {
  %m = memref.alloc() : memref<10xf32>
  %cf7 = arith.constant 7.0 : f32

  affine.for %i0 = 0 to 10 {
    affine.store %cf7, %m[%i0] : memref<10xf32>
  }
  affine.for %i1 = 0 to 10 {
    affine.store %cf7, %m[%i1] : memref<10xf32>
  }
  affine.for %i2 = 0 to 10 {
    %v1 = affine.load %m[%i2] : memref<10xf32>
  }
  // Fusing loop %i0 to %i2 would violate the WAW dependence between %i0 and
  // %i1, but OK to fuse %i1 into %i2.
  // TODO: When the fusion pass is run to a fixed-point, it should
  // fuse all three of these loop nests.
  // CHECK: memref.alloc() : memref<1xf32>
  // CHECK: affine.for %{{.*}} = 0 to 10 {
  // CHECK-NEXT: affine.store %{{.*}}, %{{.*}}[0] : memref<1xf32>
  // CHECK-NEXT: affine.store %{{.*}}, %{{.*}}[0] : memref<1xf32>
  // CHECK-NEXT: affine.load %{{.*}}[0] : memref<1xf32>
  // CHECK-NEXT: }
  // CHECK-NEXT: return
  return
}

// -----

// CHECK-LABEL: func @should_fuse_and_move_to_preserve_war_dep() {
func.func @should_fuse_and_move_to_preserve_war_dep() {
  %a = memref.alloc() : memref<10xf32>
  %b = memref.alloc() : memref<10xf32>
  %cf7 = arith.constant 7.0 : f32

  affine.for %i0 = 0 to 10 {
    %v0 = affine.load %a[%i0] : memref<10xf32>
    affine.store %v0, %b[%i0] : memref<10xf32>
  }
  affine.for %i1 = 0 to 10 {
    affine.store %cf7, %a[%i1] : memref<10xf32>
  }
  affine.for %i2 = 0 to 10 {
    %v1 = affine.load %b[%i2] : memref<10xf32>
  }
  // Loops '%i1' and '%i2' have no dependences. We can fuse a slice of '%i0'
  // into '%i2' if we move the fused loop nest before '%i1', which preserves
  // the WAR dependence from load '%a' in '%i0' to the store '%a' in loop '%i1'.
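  // The CHECK lines below verify this placement: the fused nest with its
  // 1-element private memref comes first, and the '%a' store loop follows it.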
  // CHECK: affine.for %{{.*}} = 0 to 10 {
  // CHECK-NEXT: affine.load %{{.*}}[%{{.*}}] : memref<10xf32>
  // CHECK-NEXT: affine.store %{{.*}}, %{{.*}}[0] : memref<1xf32>
  // CHECK-NEXT: affine.load %{{.*}}[0] : memref<1xf32>
  // CHECK-NEXT: }
  // CHECK-NEXT: affine.for %{{.*}} = 0 to 10 {
  // CHECK-NEXT: affine.store %{{.*}}, %{{.*}}[%{{.*}}] : memref<10xf32>
  // CHECK-NEXT: }
  // CHECK-NEXT: return
  return
}

// -----

// CHECK-LABEL: func @should_fuse_if_top_level_access() {
func.func @should_fuse_if_top_level_access() {
  %m = memref.alloc() : memref<10xf32>
  %cf7 = arith.constant 7.0 : f32

  affine.for %i0 = 0 to 10 {
    affine.store %cf7, %m[%i0] : memref<10xf32>
  }
  affine.for %i1 = 0 to 10 {
    %v0 = affine.load %m[%i1] : memref<10xf32>
  }

  %c0 = arith.constant 4 : index
  %v1 = affine.load %m[%c0] : memref<10xf32>
  // Top-level load to '%m' should prevent creating a private memref but
  // loop nests should be fused and '%i0' should be removed.
  // CHECK: %[[m:.*]] = memref.alloc() : memref<10xf32>
  // CHECK-NOT: memref.alloc

  // CHECK: affine.for %[[i1:.*]] = 0 to 10 {
  // CHECK-NEXT: affine.store %{{.*}}, %[[m]][%[[i1]]] : memref<10xf32>
  // CHECK-NEXT: affine.load %[[m]][%[[i1]]] : memref<10xf32>
  // CHECK-NEXT: }
  // CHECK: affine.load %[[m]][%{{.*}}] : memref<10xf32>
  return
}

// -----

// CHECK-LABEL: func @should_fuse_but_not_remove_src() {
func.func @should_fuse_but_not_remove_src() {
  %m = memref.alloc() : memref<100xf32>
  %cf7 = arith.constant 7.0 : f32

  affine.for %i0 = 0 to 100 {
    affine.store %cf7, %m[%i0] : memref<100xf32>
  }
  affine.for %i1 = 0 to 17 {
    %v0 = affine.load %m[%i1] : memref<100xf32>
  }
  %v1 = affine.load %m[99] : memref<100xf32>

  // Loops '%i0' and '%i1' should be fused but '%i0' shouldn't be removed to
  // preserve the dependence with the top-level access.
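  // The fused slice covers only elements 0..16 of '%m', while the top-level
  // load reads '%m[99]', so the original '%i0' loop must remain to write the
  // full 100-element region.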
  // CHECK: affine.for %{{.*}} = 0 to 100 {
  // CHECK-NEXT: affine.store %{{.*}}, %{{.*}}[%{{.*}}] : memref<100xf32>
  // CHECK-NEXT: }
  // CHECK-NEXT: affine.for %{{.*}} = 0 to 17 {
  // CHECK-NEXT: affine.store %{{.*}}, %{{.*}}[0] : memref<1xf32>
  // CHECK-NEXT: affine.load %{{.*}}[0] : memref<1xf32>
  // CHECK-NEXT: }
  // CHECK-NEXT: affine.load %{{.*}}[99] : memref<100xf32>
  // CHECK-NEXT: return
  return
}

// -----

// CHECK-LABEL: func @should_fuse_no_top_level_access() {
func.func @should_fuse_no_top_level_access() {
  %m = memref.alloc() : memref<10xf32>
  %cf7 = arith.constant 7.0 : f32

  affine.for %i0 = 0 to 10 {
    affine.store %cf7, %m[%i0] : memref<10xf32>
  }
  affine.for %i1 = 0 to 10 {
    %v0 = affine.load %m[%i1] : memref<10xf32>
  }
  // CHECK: affine.for %{{.*}} = 0 to 10 {
  // CHECK-NEXT: affine.store %{{.*}}, %{{.*}}[0] : memref<1xf32>
  // CHECK-NEXT: affine.load %{{.*}}[0] : memref<1xf32>
  // CHECK-NEXT: }
  // CHECK-NEXT: return
  return
}

// -----

#set0 = affine_set<(d0) : (1 == 0)>

// CHECK-LABEL: func @should_not_fuse_if_op_at_top_level() {
func.func @should_not_fuse_if_op_at_top_level() {
  %m = memref.alloc() : memref<10xf32>
  %cf7 = arith.constant 7.0 : f32

  affine.for %i0 = 0 to 10 {
    affine.store %cf7, %m[%i0] : memref<10xf32>
  }
  affine.for %i1 = 0 to 10 {
    %v0 = affine.load %m[%i1] : memref<10xf32>
  }
  %c0 = arith.constant 4 : index
  affine.if #set0(%c0) {
  }
  // Top-level IfOp should prevent fusion.
  // CHECK: affine.for %{{.*}} = 0 to 10 {
  // CHECK-NEXT: affine.store %{{.*}}, %{{.*}}[%{{.*}}] : memref<10xf32>
  // CHECK-NEXT: }
  // CHECK: affine.for %{{.*}} = 0 to 10 {
  // CHECK-NEXT: affine.load %{{.*}}[%{{.*}}] : memref<10xf32>
  // CHECK-NEXT: }
  return
}

// -----

#set0 = affine_set<(d0) : (1 == 0)>

// CHECK-LABEL: func @should_not_fuse_if_op_in_loop_nest() {
func.func @should_not_fuse_if_op_in_loop_nest() {
  %m = memref.alloc() : memref<10xf32>
  %cf7 = arith.constant 7.0 : f32
  %c4 = arith.constant 4 : index

  affine.for %i0 = 0 to 10 {
    affine.store %cf7, %m[%i0] : memref<10xf32>
  }
  affine.for %i1 = 0 to 10 {
    affine.if #set0(%c4) {
    }
    %v0 = affine.load %m[%i1] : memref<10xf32>
  }

  // IfOp in ForOp should prevent fusion.
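  // The pass conservatively refuses to fuse into a destination loop nest that
  // contains an affine.if, even one with an empty body as above.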
  // CHECK: affine.for %{{.*}} = 0 to 10 {
  // CHECK-NEXT: affine.store %{{.*}}, %{{.*}}[%{{.*}}] : memref<10xf32>
  // CHECK-NEXT: }
  // CHECK: affine.for %{{.*}} = 0 to 10 {
  // CHECK-NEXT: affine.if #set(%{{.*}}) {
  // CHECK-NEXT: }
  // CHECK-NEXT: affine.load %{{.*}}[%{{.*}}] : memref<10xf32>
  // CHECK-NEXT: }
  return
}

// -----

#set = affine_set<(d0) : (d0 - 1 >= 0)>

// CHECK-LABEL: func @should_fuse_if_op_in_loop_nest_not_sandwiched() -> memref<10xf32> {
func.func @should_fuse_if_op_in_loop_nest_not_sandwiched() -> memref<10xf32> {
  %a = memref.alloc() : memref<10xf32>
  %b = memref.alloc() : memref<10xf32>
  %cf7 = arith.constant 7.0 : f32

  affine.for %i0 = 0 to 10 {
    affine.store %cf7, %a[%i0] : memref<10xf32>
  }
  affine.for %i1 = 0 to 10 {
    %v0 = affine.load %a[%i1] : memref<10xf32>
    affine.store %v0, %b[%i1] : memref<10xf32>
  }
  affine.for %i2 = 0 to 10 {
    affine.if #set(%i2) {
      %v0 = affine.load %b[%i2] : memref<10xf32>
    }
  }

  // An IfOp in a ForOp should not prevent fusion if it does not lie between
  // the source and dest ForOps.

  // CHECK: affine.for
  // CHECK-NEXT: affine.store
  // CHECK-NEXT: affine.load
  // CHECK-NEXT: affine.store
  // CHECK: affine.for
  // CHECK-NEXT: affine.if
  // CHECK-NEXT: affine.load
  // CHECK-NOT: affine.for
  // CHECK: return

  return %a : memref<10xf32>
}

// -----

#set = affine_set<(d0) : (d0 - 1 >= 0)>

// CHECK-LABEL: func @should_not_fuse_if_op_in_loop_nest_between_src_and_dest() -> memref<10xf32> {
func.func @should_not_fuse_if_op_in_loop_nest_between_src_and_dest() -> memref<10xf32> {
  %a = memref.alloc() : memref<10xf32>
  %b = memref.alloc() : memref<10xf32>
  %cf7 = arith.constant 7.0 : f32

  affine.for %i0 = 0 to 10 {
    affine.store %cf7, %a[%i0] : memref<10xf32>
  }
  affine.for %i1 = 0 to 10 {
    affine.if #set(%i1) {
      affine.store %cf7, %a[%i1] : memref<10xf32>
    }
  }
  affine.for %i3 = 0 to 10 {
    %v0 = affine.load %a[%i3] : memref<10xf32>
    affine.store %v0, %b[%i3] : memref<10xf32>
  }
  return %b : memref<10xf32>

  // An IfOp in a ForOp which modifies the memref should prevent fusion if it
  // lies between the source and dest ForOps.
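  // Here the conditional store to '%a' in '%i1' may overwrite values written
  // by '%i0', so fusing a slice of '%i0' into '%i3' past '%i1' could let the
  // consumer read stale data.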

  // CHECK: affine.for
  // CHECK-NEXT: affine.store
  // CHECK: affine.for
  // CHECK-NEXT: affine.if
  // CHECK-NEXT: affine.store
  // CHECK: affine.for
  // CHECK-NEXT: affine.load
  // CHECK-NEXT: affine.store
  // CHECK: return
}

// -----

// CHECK-LABEL: func @permute_and_fuse() {
func.func @permute_and_fuse() {
  %m = memref.alloc() : memref<10x20x30xf32>

  %cf7 = arith.constant 7.0 : f32
  affine.for %i0 = 0 to 10 {
    affine.for %i1 = 0 to 20 {
      affine.for %i2 = 0 to 30 {
        affine.store %cf7, %m[%i0, %i1, %i2] : memref<10x20x30xf32>
      }
    }
  }
  affine.for %i3 = 0 to 30 {
    affine.for %i4 = 0 to 10 {
      affine.for %i5 = 0 to 20 {
        %v0 = affine.load %m[%i4, %i5, %i3] : memref<10x20x30xf32>
        "foo"(%v0) : (f32) -> ()
      }
    }
  }
// CHECK: affine.for %{{.*}} = 0 to 30 {
// CHECK-NEXT: affine.for %{{.*}} = 0 to 10 {
// CHECK-NEXT: affine.for %{{.*}} = 0 to 20 {
// CHECK-NEXT: affine.store %{{.*}}, %{{.*}}[0, 0, 0] : memref<1x1x1xf32>
// CHECK-NEXT: affine.load %{{.*}}[0, 0, 0] : memref<1x1x1xf32>
// CHECK-NEXT: "foo"(%{{.*}}) : (f32) -> ()
// CHECK-NEXT: }
// CHECK-NEXT: }
// CHECK-NEXT: }
// CHECK-NEXT: return

  return
}

// -----

// CHECK-DAG: [[$MAP0:#map[0-9a-zA-Z_]*]] = affine_map<(d0, d1) -> (d0 * 4 + d1)>
// CHECK-DAG: [[$MAP1:#map[0-9a-zA-Z_]*]] = affine_map<(d0) -> (d0 floordiv 4)>
// CHECK-DAG: [[$MAP2:#map[0-9a-zA-Z_]*]] = affine_map<(d0) -> (d0 mod 4)>

// Reshape from a 64 x f32 to 16 x 4 x f32.
// CHECK-LABEL: func @fuse_reshape_64_16_4
func.func @fuse_reshape_64_16_4(%in : memref<64xf32>) {
  %out = memref.alloc() : memref<16x4xf32>

  affine.for %i0 = 0 to 64 {
    %v = affine.load %in[%i0] : memref<64xf32>
    affine.store %v, %out[%i0 floordiv 4, %i0 mod 4] : memref<16x4xf32>
  }

  affine.for %i1 = 0 to 16 {
    affine.for %i2 = 0 to 4 {
      %w = affine.load %out[%i1, %i2] : memref<16x4xf32>
      "foo"(%w) : (f32) -> ()
    }
  }
  return
  // CHECK: affine.for %{{.*}} =
  // CHECK-NEXT: affine.for %{{.*}} =
  // CHECK-NOT: for
  // CHECK: }
  // CHECK-NEXT: }
  // CHECK-NEXT: return
}

// -----
// CHECK-DAG: [[$MAP0:#map[0-9a-zA-Z_]*]] = affine_map<(d0) -> (d0 floordiv 4)>
// CHECK-DAG: [[$MAP1:#map[0-9a-zA-Z_]*]] = affine_map<(d0) -> (d0 mod 4)>
// CHECK-DAG: [[$MAP2:#map[0-9a-zA-Z_]*]] = affine_map<(d0, d1) -> (d0 * 4 + d1)>

// Reshape a 16x4xf32 to 64xf32.
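// After fusion (see the CHECK lines below), the producer is recomputed per
// consumer iteration %i2 by delinearizing i0 = i2 floordiv 4 and
// i1 = i2 mod 4, then relinearizing the store index as 4 * i0 + i1; e.g.
// i2 = 13 gives (i0, i1) = (3, 1), and 4 * 3 + 1 = 13. These are the maps
// [[$MAP0]], [[$MAP1]], and [[$MAP2]] above.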
// CHECK-LABEL: func @fuse_reshape_16_4_64
func.func @fuse_reshape_16_4_64() {
  %in = memref.alloc() : memref<16x4xf32>
  %out = memref.alloc() : memref<64xf32>

  affine.for %i0 = 0 to 16 {
    affine.for %i1 = 0 to 4 {
      %v = affine.load %in[%i0, %i1] : memref<16x4xf32>
      affine.store %v, %out[4*%i0 + %i1] : memref<64xf32>
    }
  }

  affine.for %i2 = 0 to 64 {
    %w = affine.load %out[%i2] : memref<64xf32>
    "foo"(%w) : (f32) -> ()
  }
// CHECK: affine.for %{{.*}} = 0 to 64 {
// CHECK-NEXT: affine.apply [[$MAP0]](%{{.*}})
// CHECK-NEXT: affine.apply [[$MAP1]](%{{.*}})
// CHECK-NEXT: affine.load %{{.*}}[%{{.*}}, %{{.*}}] : memref<16x4xf32>
// CHECK-NEXT: affine.apply [[$MAP2]](%{{.*}}, %{{.*}})
// CHECK-NEXT: affine.store %{{.*}}, %{{.*}}[0] : memref<1xf32>
// CHECK-NEXT: affine.load %{{.*}}[0] : memref<1xf32>
// CHECK-NEXT: "foo"(%{{.*}}) : (f32) -> ()
// CHECK-NEXT: }
// CHECK-NEXT: return
  return
}


// -----

// All three loop nests below (the 6-d one and the two 2-d ones) are fused
// into a single 2-d loop nest.
func.func @R6_to_R2_reshape_square() -> memref<64x9xi32> {
  %in = memref.alloc() : memref<2x2x3x3x16x1xi32>
  %out = memref.alloc() : memref<64x9xi32>
  %live_out = memref.alloc() : memref<64x9xi32>

  // Initialize input.
  affine.for %i0 = 0 to 2 {
    affine.for %i1 = 0 to 2 {
      affine.for %i2 = 0 to 3 {
        affine.for %i3 = 0 to 3 {
          affine.for %i4 = 0 to 16 {
            affine.for %i5 = 0 to 1 {
              %val = "foo"(%i0, %i1, %i2, %i3, %i4, %i5) : (index, index, index, index, index, index) -> i32
              affine.store %val, %in[%i0, %i1, %i2, %i3, %i4, %i5] : memref<2x2x3x3x16x1xi32>
            }
          }
        }
      }
    }
  }

  affine.for %ii = 0 to 64 {
    affine.for %jj = 0 to 9 {
      // Convert output coordinates to linear index.
      %a0 = affine.apply affine_map<(d0, d1) -> (d0 * 9 + d1)> (%ii, %jj)
      %0 = affine.apply affine_map<(d0) -> (d0 floordiv (2 * 3 * 3 * 16 * 1))>(%a0)
      %1 = affine.apply affine_map<(d0) -> ((d0 mod 288) floordiv (3 * 3 * 16 * 1))>(%a0)
      %2 = affine.apply affine_map<(d0) -> (((d0 mod 288) mod 144) floordiv (3 * 16 * 1))>(%a0)
      %3 = affine.apply affine_map<(d0) -> ((((d0 mod 288) mod 144) mod 48) floordiv (16 * 1))>(%a0)
      %4 = affine.apply affine_map<(d0) -> ((((d0 mod 288) mod 144) mod 48) mod 16)>(%a0)
      %5 = affine.apply affine_map<(d0) -> (((((d0 mod 288) mod 144) mod 48) mod 16) mod 1)>(%a0)
      %v = affine.load %in[%0, %1, %2, %3, %4, %5] : memref<2x2x3x3x16x1xi32>
      affine.store %v, %out[%ii, %jj] : memref<64x9xi32>
    }
  }

  affine.for %i = 0 to 64 {
    affine.for %j = 0 to 9 {
      %a = affine.load %out[%i, %j] : memref<64x9xi32>
      %b = arith.muli %a, %a : i32
      affine.store %b, %live_out[%i, %j] : memref<64x9xi32>
    }
  }
  return %live_out : memref<64x9xi32>
}
// Everything above is fused to a single 2-d loop nest, and the 6-d tensor %in
// is eliminated if -memref-dataflow-opt is also supplied.
//
// CHECK-DAG: [[$MAP0:#map[0-9a-zA-Z_]*]] = affine_map<(d0, d1) -> ((d0 * 9 + d1) floordiv 288)>
// CHECK-DAG: [[$MAP1:#map[0-9a-zA-Z_]*]] = affine_map<(d0, d1) -> (((d0 * 9 + d1) mod 288) floordiv 144)>
// CHECK-DAG: [[$MAP2:#map[0-9a-zA-Z_]*]] = affine_map<(d0, d1) -> (((d0 * 9 + d1) mod 144) floordiv 48)>
// CHECK-DAG: [[$MAP3:#map[0-9a-zA-Z_]*]] = affine_map<(d0, d1) -> (((d0 * 9 + d1) mod 48) floordiv 16)>
// CHECK-DAG: [[$MAP4:#map[0-9a-zA-Z_]*]] = affine_map<(d0, d1) -> ((d0 * 9 + d1) mod 16)>
// CHECK-DAG: [[$MAP11:#map[0-9a-zA-Z_]*]] = affine_map<(d0, d1) -> (d0 * 9 + d1)>
// CHECK-DAG: [[$MAP12:#map[0-9a-zA-Z_]*]] = affine_map<(d0) -> (d0 floordiv 288)>
// CHECK-DAG: [[$MAP13:#map[0-9a-zA-Z_]*]] = affine_map<(d0) -> ((d0 mod 288) floordiv 144)>
// CHECK-DAG: [[$MAP14:#map[0-9a-zA-Z_]*]] = affine_map<(d0) -> ((d0 mod 144) floordiv 48)>
// CHECK-DAG: [[$MAP15:#map[0-9a-zA-Z_]*]] = affine_map<(d0) -> ((d0 mod 48) floordiv 16)>
// CHECK-DAG: [[$MAP16:#map[0-9a-zA-Z_]*]] = affine_map<(d0) -> (d0 mod 16)>
// CHECK-DAG: [[$MAP17:#map[0-9a-zA-Z_]*]] = affine_map<(d0) -> (0)>

//
// CHECK-LABEL: func @R6_to_R2_reshape
// CHECK: memref.alloc() : memref<1x2x3x3x16x1xi32>
// CHECK: memref.alloc() : memref<1x1xi32>
// CHECK: memref.alloc() : memref<64x9xi32>
// CHECK-NEXT: affine.for %{{.*}} = 0 to 64 {
// CHECK-NEXT: affine.for %{{.*}} = 0 to 9 {
// CHECK-NEXT: affine.apply [[$MAP0]](%{{.*}}, %{{.*}})
// CHECK-NEXT: affine.apply [[$MAP1]](%{{.*}}, %{{.*}})
// CHECK-NEXT: affine.apply [[$MAP2]](%{{.*}}, %{{.*}})
// CHECK-NEXT: affine.apply [[$MAP3]](%{{.*}}, %{{.*}})
// CHECK-NEXT: affine.apply [[$MAP4]](%{{.*}}, %{{.*}})
// CHECK-NEXT: "foo"(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (index, index, index, index, index, index) -> i32
// CHECK-NEXT: affine.store %{{.*}}, %{{.*}}[0, ((%{{.*}} * 9 + %{{.*}}) mod 288) floordiv 144, ((%{{.*}} * 9 + %{{.*}}) mod 144) floordiv 48, ((%{{.*}} * 9 + %{{.*}}) mod 48) floordiv 16, (%{{.*}} * 9 + %{{.*}}) mod 16, 0] : memref<1x2x3x3x16x1xi32>
// CHECK-NEXT: affine.apply [[$MAP11]](%{{.*}}, %{{.*}})
// CHECK-NEXT: affine.apply [[$MAP12]](%{{.*}})
// CHECK-NEXT: affine.apply [[$MAP13]](%{{.*}})
// CHECK-NEXT: affine.apply [[$MAP14]](%{{.*}})
// CHECK-NEXT: affine.apply [[$MAP15]](%{{.*}})
// CHECK-NEXT: affine.apply [[$MAP16]](%{{.*}})
// CHECK-NEXT: affine.apply [[$MAP17]](%{{.*}})
// CHECK-NEXT: affine.load %{{.*}}[0, ((%{{.*}} * 9 + %{{.*}}) mod 288) floordiv 144, ((%{{.*}} * 9 + %{{.*}}) mod 144) floordiv 48, ((%{{.*}} * 9 + %{{.*}}) mod 48) floordiv 16, (%{{.*}} * 9 + %{{.*}}) mod 16, 0] : memref<1x2x3x3x16x1xi32>
// CHECK-NEXT: affine.store %{{.*}}, %{{.*}}[0, 0] : memref<1x1xi32>
// CHECK-NEXT: affine.load %{{.*}}[0, 0] : memref<1x1xi32>
// CHECK-NEXT: arith.muli %{{.*}}, %{{.*}} : i32
// CHECK-NEXT: affine.store %{{.*}}, %{{.*}}[%{{.*}}, %{{.*}}] : memref<64x9xi32>
// CHECK-NEXT: }
// CHECK-NEXT: }
// CHECK-NEXT: return %{{.*}} : memref<64x9xi32>

// -----

// CHECK-LABEL: func @fuse_symbolic_bounds
func.func @fuse_symbolic_bounds(%M : index, %N : index) {
  %N_plus_5 = affine.apply affine_map<(d0) -> (d0 + 5)>(%N)
  %m = memref.alloc(%M, %N_plus_5) : memref<? x ? x f32>

  %c0 = arith.constant 0.0 : f32
  %s = arith.constant 5 : index

  affine.for %i0 = 0 to %M {
    affine.for %i1 = 0 to affine_map<(d0) -> (d0 + 5)> (%N) {
      affine.store %c0, %m[%i0, %i1] : memref<? x ? x f32>
    }
  }

  affine.for %i2 = 0 to %M {
    affine.for %i3 = 0 to %N {
      %v = affine.load %m[%i2, %i3 + symbol(%s)] : memref<? x ? x f32>
    }
  }

  return
}

// -----

// CHECK-LABEL: func @should_fuse_reduction_at_depth_of_one
func.func @should_fuse_reduction_at_depth_of_one() {
  %a = memref.alloc() : memref<10x100xf32>
  %b = memref.alloc() : memref<10xf32>

  affine.for %i0 = 0 to 10 {
    affine.for %i1 = 0 to 100 {
      %v0 = affine.load %b[%i0] : memref<10xf32>
      %v1 = affine.load %a[%i0, %i1] : memref<10x100xf32>
      %v2 = "maxf"(%v0, %v1) : (f32, f32) -> f32
      affine.store %v2, %b[%i0] : memref<10xf32>
    }
  }
  affine.for %i2 = 0 to 10 {
    affine.for %i3 = 0 to 100 {
      %v3 = affine.load %b[%i2] : memref<10xf32>
      %v4 = affine.load %a[%i2, %i3] : memref<10x100xf32>
      %v5 = arith.subf %v4, %v3 : f32
      affine.store %v5, %b[%i2] : memref<10xf32>
    }
  }
  // This test should fuse the src reduction loop at depth 1 in the destination
  // loop nest, which improves locality and enables subsequent passes to
  // decrease the reduction memref size and possibly place it in a faster
  // memory space.
  // CHECK: affine.for %{{.*}} = 0 to 10 {
  // CHECK-NEXT: affine.for %{{.*}} = 0 to 100 {
  // CHECK-NEXT: affine.load %{{.*}}[0] : memref<1xf32>
  // CHECK-NEXT: affine.load %{{.*}}[%{{.*}}, %{{.*}}] : memref<10x100xf32>
  // CHECK-NEXT: "maxf"(%{{.*}}, %{{.*}}) : (f32, f32) -> f32
  // CHECK-NEXT: affine.store %{{.*}}, %{{.*}}[0] : memref<1xf32>
  // CHECK-NEXT: }
  // CHECK-NEXT: affine.for %{{.*}} = 0 to 100 {
  // CHECK-NEXT: affine.load %{{.*}}[0] : memref<1xf32>
  // CHECK-NEXT: affine.load %{{.*}}[%{{.*}}, %{{.*}}] : memref<10x100xf32>
  // CHECK-NEXT: arith.subf %{{.*}}, %{{.*}} : f32
  // CHECK-NEXT: affine.store %{{.*}}, %{{.*}}[0] : memref<1xf32>
  // CHECK-NEXT: }
  // CHECK-NEXT: }
  // CHECK-NEXT: return
  return
}

// -----

// CHECK-LABEL: func @should_fuse_at_src_depth1_and_dst_depth1
func.func @should_fuse_at_src_depth1_and_dst_depth1() {
  %a = memref.alloc() : memref<100x16xf32>
  %b = memref.alloc() : memref<100x16xf32>

  affine.for %i0 = 0 to 100 {
    affine.for %i1 = 0 to 16 {
      %v0 = affine.load %a[%i0, %i1] : memref<100x16xf32>
      "op0"(%v0) : (f32) -> ()
    }
    affine.for %i2 = 0 to 16 {
      %v1 = "op1"() : () -> (f32)
      affine.store %v1, %b[%i0, %i2] : memref<100x16xf32>
    }
  }

  affine.for %i3 = 0 to 100 {
    affine.for %i4 = 0 to 16 {
      %v2 = affine.load %b[%i3, %i4] : memref<100x16xf32>
      "op2"(%v2) : (f32) -> ()
    }
  }
  // We can slice iterations of the '%i0' and '%i1' loops in the source
  // loop nest, but slicing at depth 2 and inserting the slice in the
  // destination loop nest at depth 2 causes extra computation. Instead,
  // the fusion algorithm should detect that the source loop should be sliced
  // at depth 1 and the slice should be inserted at depth 1.
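  // Inserting the slice at depth 2 would re-execute source work once per
  // (%i3, %i4) iteration rather than once per '%i3' iteration, roughly a 16x
  // redundancy, which the depth-1 placement avoids.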
  // CHECK: affine.for %{{.*}} = 0 to 100 {
  // CHECK-NEXT: affine.for %{{.*}} = 0 to 16 {
  // CHECK-NEXT: affine.load %{{.*}}[%{{.*}}, %{{.*}}] : memref<100x16xf32>
  // CHECK-NEXT: "op0"(%{{.*}}) : (f32) -> ()
  // CHECK-NEXT: }
  // CHECK-NEXT: affine.for %{{.*}} = 0 to 16 {
  // CHECK-NEXT: %{{.*}} = "op1"() : () -> f32
  // CHECK-NEXT: affine.store %{{.*}}, %{{.*}}[0, %{{.*}}] : memref<1x16xf32>
  // CHECK-NEXT: }
  // CHECK-NEXT: affine.for %{{.*}} = 0 to 16 {
  // CHECK-NEXT: affine.load %{{.*}}[0, %{{.*}}] : memref<1x16xf32>
  // CHECK-NEXT: "op2"(%{{.*}}) : (f32) -> ()
  // CHECK-NEXT: }
  // CHECK-NEXT: }
  // CHECK-NEXT: return
  return
}

// -----
// CHECK: [[$MAP0:#map[0-9]*]] = affine_map<(d0, d1) -> (d0 * 10 + d1)>

// CHECK-LABEL: func @should_fuse_src_depth1_at_dst_depth2
func.func @should_fuse_src_depth1_at_dst_depth2() {
  %a = memref.alloc() : memref<100xf32>
  %c0 = arith.constant 0.0 : f32

  affine.for %i0 = 0 to 100 {
    affine.store %c0, %a[%i0] : memref<100xf32>
  }

  affine.for %i1 = 0 to 10 {
    affine.for %i2 = 0 to 10 {
      %a0 = affine.apply affine_map<(d0, d1) -> (d0 * 10 + d1)> (%i1, %i2)
      %v0 = affine.load %a[%a0] : memref<100xf32>
    }
  }
  // The source loop nest slice loop bound is a function of both destination
  // loop IVs, so we should slice at depth 1 and insert the slice at depth 2.
  // CHECK: affine.for %{{.*}} = 0 to 10 {
  // CHECK-NEXT: affine.for %{{.*}} = 0 to 10 {
  // CHECK-NEXT: affine.apply [[$MAP0]](%{{.*}}, %{{.*}})
  // CHECK-NEXT: affine.store %{{.*}}, %{{.*}}[0] : memref<1xf32>
  // CHECK-NEXT: affine.apply [[$MAP0]](%{{.*}}, %{{.*}})
  // CHECK-NEXT: affine.load %{{.*}}[0] : memref<1xf32>
  // CHECK-NEXT: }
  // CHECK-NEXT: }
  // CHECK-NEXT: return
  return
}

// -----

// CHECK-LABEL: func @fusion_at_depth0_not_currently_supported
func.func @fusion_at_depth0_not_currently_supported() {
  %0 = memref.alloc() : memref<10xf32>
  %c0 = arith.constant 0 : index
  %cst = arith.constant 0.000000e+00 : f32
  affine.for %i0 = 0 to 10 {
    affine.store %cst, %0[%i0] : memref<10xf32>
  }
  affine.for %i1 = 0 to 10 {
    %1 = affine.load %0[%c0] : memref<10xf32>
  }
  // NOTE: Should shrink memref size to 1 element access by load in dst loop
  // nest, and make the store in the slice store to the same element.
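  // Every destination iteration reads the same element '%0[%c0]', so the
  // fused slice stores to and loads from element 0 of a 1-element private
  // memref, as checked below.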
  // CHECK-DAG: memref.alloc() : memref<1xf32>
  // CHECK: affine.for %{{.*}} = 0 to 10 {
  // CHECK-NEXT: affine.store %{{.*}}, %{{.*}}[0] : memref<1xf32>
  // CHECK-NEXT: affine.load %{{.*}}[0] : memref<1xf32>
  // CHECK-NEXT: }
  // CHECK-NEXT: return
  return
}

// -----

// CHECK-LABEL: func @should_fuse_deep_loop_nests
func.func @should_fuse_deep_loop_nests() {
  %0 = memref.alloc() : memref<2x2x3x3x16x10xf32, 2>
  %1 = memref.alloc() : memref<2x2x3x3x16x10xf32, 2>
  %2 = memref.alloc() : memref<3x3x3x3x16x10xf32, 2>
  %c0 = arith.constant 0 : index
  %c1 = arith.constant 1 : index
  %c1_0 = arith.constant 1 : index
  %cst = arith.constant 0.000000e+00 : f32
  affine.for %i0 = 0 to 2 {
    affine.for %i1 = 0 to 2 {
      affine.for %i2 = 0 to 3 {
        affine.for %i3 = 0 to 3 {
          affine.for %i4 = 0 to 16 {
            affine.for %i5 = 0 to 10 {
              %3 = affine.load %0[%i0, %i1, %i2, %i3, %i4, %i5]
                : memref<2x2x3x3x16x10xf32, 2>
            }
          }
          affine.for %i6 = 0 to 16 {
            affine.for %i7 = 0 to 10 {
              affine.store %cst, %1[%i0, %i1, %i2, %i3, %i6, %i7]
                : memref<2x2x3x3x16x10xf32, 2>
            }
          }
        }
      }
    }
  }
  affine.for %i8 = 0 to 3 {
    affine.for %i9 = 0 to 3 {
      affine.for %i10 = 0 to 2 {
        affine.for %i11 = 0 to 2 {
          affine.for %i12 = 0 to 3 {
            affine.for %i13 = 0 to 3 {
              affine.for %i14 = 0 to 2 {
                affine.for %i15 = 0 to 2 {
                  affine.for %i16 = 0 to 16 {
                    affine.for %i17 = 0 to 10 {
                      %5 = affine.load %0[%i14, %i15, %i12, %i13, %i16, %i17]
                        : memref<2x2x3x3x16x10xf32, 2>
                    }
                  }
                  affine.for %i18 = 0 to 16 {
                    affine.for %i19 = 0 to 10 {
                      %6 = affine.load %1[%i10, %i11, %i8, %i9, %i18, %i19]
                        : memref<2x2x3x3x16x10xf32, 2>
                    }
                  }
                }
              }
            }
          }
        }
      }
    }
  }
// The first four loops of the source loop nest can be sliced with iteration
// bounds which are a function of the first four loops of the destination loop
// nest, where the destination loop nests have been interchanged.
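// Accordingly, the private copy of '%1' keeps only the unsliced trailing
// 16x10 dimensions and collapses the four sliced leading dimensions to 1,
// giving the memref<1x1x1x1x16x10xf32, 2> allocation checked below.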

// CHECK-DAG: memref.alloc() : memref<1x1x1x1x16x10xf32, 2>
// CHECK: affine.for %{{.*}} = 0 to 3 {
// CHECK-NEXT: affine.for %{{.*}} = 0 to 3 {
// CHECK-NEXT: affine.for %{{.*}} = 0 to 2 {
// CHECK-NEXT: affine.for %{{.*}} = 0 to 2 {
// CHECK-NEXT: affine.for %{{.*}} = 0 to 3 {
// CHECK-NEXT: affine.for %{{.*}} = 0 to 3 {
// CHECK-NEXT: affine.for %{{.*}} = 0 to 16 {
// CHECK-NEXT: affine.for %{{.*}} = 0 to 10 {
// CHECK-NEXT: affine.load %{{.*}}[%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}] : memref<2x2x3x3x16x10xf32, 2>
// CHECK-NEXT: }
// CHECK-NEXT: }
// CHECK-NEXT: affine.for %{{.*}} = 0 to 16 {
// CHECK-NEXT: affine.for %{{.*}} = 0 to 10 {
// CHECK-NEXT: affine.store %{{.*}}, %{{.*}}[0, 0, 0, 0, %{{.*}}, %{{.*}}] : memref<1x1x1x1x16x10xf32, 2>
// CHECK-NEXT: }
// CHECK-NEXT: }
// CHECK-NEXT: affine.for %{{.*}} = 0 to 2 {
// CHECK-NEXT: affine.for %{{.*}} = 0 to 2 {
// CHECK-NEXT: affine.for %{{.*}} = 0 to 16 {
// CHECK-NEXT: affine.for %{{.*}} = 0 to 10 {
// CHECK-NEXT: affine.load %{{.*}}[%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}] : memref<2x2x3x3x16x10xf32, 2>
// CHECK-NEXT: }
// CHECK-NEXT: }
// CHECK-NEXT: affine.for %{{.*}} = 0 to 16 {
// CHECK-NEXT: affine.for %{{.*}} = 0 to 10 {
// CHECK-NEXT: affine.load %{{.*}}[0, 0, 0, 0, %{{.*}}, %{{.*}}] : memref<1x1x1x1x16x10xf32, 2>
// CHECK-NEXT: }
// CHECK-NEXT: }
// CHECK-NEXT: }
// CHECK-NEXT: }
// CHECK-NEXT: }
// CHECK-NEXT: }
// CHECK-NEXT: }
// CHECK-NEXT: }
// CHECK-NEXT: }
// CHECK-NEXT: }
// CHECK-NEXT: return
  return
}

// -----

// CHECK-LABEL: func @should_fuse_at_depth1_and_reduce_slice_trip_count
func.func @should_fuse_at_depth1_and_reduce_slice_trip_count() {
  %a = memref.alloc() : memref<4x256xf32>
  %b = memref.alloc() : memref<4x256xf32>

  %c0 = arith.constant 0 : index
  %cf0 = arith.constant 0.0 : f32

  affine.for %i0 = 0 to 4 {
    affine.for %i1 = 0 to 256 {
      %v0 = affine.load %b[%i0, %i1] : memref<4x256xf32>
    }
    affine.for %i2 = 0 to 256 {
      affine.store %cf0, %a[%i0, %i2] : memref<4x256xf32>
    }
  }

  affine.for %d0 = 0 to 4 {
    affine.for %d1 = 0 to 16 {
      %v1 = affine.load %a[%d0, %d1] : memref<4x256xf32>
    }
  }
  // The cost of fusing at depth 2 is greater than the cost of fusing at depth 1
  // for two reasons:
  // 1) Inserting the unsliceable src loop %i1 at a higher depth removes
  //    redundant computation and reduces costs.
  // 2) Inserting the sliceable src loop %i2 at depth 1, we can still reduce
  //    its trip count to 16 (from 256), reducing costs.
  // NOTE: the size of the private memref created for the fused loop nest is
  // reduced from the original 4x256 shape to 1x16 (a 16-element slice per
  // '%d0' iteration) because of the data accessed by the load.
  // CHECK-DAG: memref.alloc() : memref<1x16xf32>
  // CHECK: affine.for %{{.*}} = 0 to 4 {
  // CHECK-NEXT: affine.for %{{.*}} = 0 to 256 {
  // CHECK-NEXT: affine.load %{{.*}}[%{{.*}}, %{{.*}}] : memref<4x256xf32>
  // CHECK-NEXT: }
  // CHECK-NEXT: affine.for %{{.*}} = 0 to 16 {
  // CHECK-NEXT: affine.store %{{.*}}, %{{.*}}[0, %{{.*}}] : memref<1x16xf32>
  // CHECK-NEXT: }
  // CHECK-NEXT: affine.for %{{.*}} = 0 to 16 {
  // CHECK-NEXT: affine.load %{{.*}}[0, %{{.*}}] : memref<1x16xf32>
  // CHECK-NEXT: }
  // CHECK-NEXT: }
  // CHECK-NEXT: return
  return
}

// -----

// CHECK-LABEL: func @should_fuse_at_depth1_with_trip_count_20
func.func @should_fuse_at_depth1_with_trip_count_20() {
  %a = memref.alloc() : memref<100xf32>
  %c0 = arith.constant 0 : index
  %cf0 = arith.constant 0.0 : f32

  affine.for %i0 = 0 to 100 {
    affine.store %cf0, %a[%i0]: memref<100xf32>
  }

  affine.for %i1 = 0 to 5 {
    affine.for %i2 = 0 to 10 {
      %v0 = affine.load %a[%i2]: memref<100xf32>
    }
    affine.for %i3 = 0 to 10 {
      affine.for %i4 = 0 to 20 {
        %v1 = affine.load %a[%i4]: memref<100xf32>
      }
    }
  }
  // NOTE: The size of the private memref created for fusion is shrunk to 20xf32.
  // CHECK-DAG: memref.alloc() : memref<20xf32>
  // CHECK: affine.for %{{.*}} = 0 to 5 {
  // CHECK-NEXT: affine.for %{{.*}} = 0 to 20 {
  // CHECK-NEXT: affine.store %{{.*}}, %{{.*}}[%{{.*}}] : memref<20xf32>
  // CHECK-NEXT: }
  // CHECK-NEXT: affine.for %{{.*}} = 0 to 10 {
  // CHECK-NEXT: affine.load %{{.*}}[%{{.*}}] : memref<20xf32>
  // CHECK-NEXT: }
  // CHECK-NEXT: affine.for %{{.*}} = 0 to 10 {
  // CHECK-NEXT: affine.for %{{.*}} = 0 to 20 {
  // CHECK-NEXT: affine.load %{{.*}}[%{{.*}}] : memref<20xf32>
  // CHECK-NEXT: }
  // CHECK-NEXT: }
  // CHECK-NEXT: }
  // CHECK-NEXT: return
  return
}

// -----

// CHECK-LABEL: func @should_fuse_at_depth1_with_trip_count_19
func.func @should_fuse_at_depth1_with_trip_count_19() {
  %a = memref.alloc() : memref<100xf32>
  %c0 = arith.constant 0 : index
  %cf0 = arith.constant 0.0 : f32

  affine.for %i0 = 0 to 100 {
    affine.store %cf0, %a[%i0]: memref<100xf32>
  }

  affine.for %i1 = 0 to 5 {
    affine.for %i2 = 0 to 19 {
      %v0 = affine.load %a[%i2]: memref<100xf32>
    }
    affine.for %i3 = 0 to 10 {
      affine.for %i4 = 0 to 10 {
        %v1 = affine.load %a[%i4]: memref<100xf32>
      }
    }
  }
  // NOTE: The size of the private memref created for fusion is shrunk to 19xf32.
  // CHECK-DAG: memref.alloc() : memref<19xf32>
  // CHECK: affine.for %{{.*}} = 0 to 5 {
  // CHECK-NEXT: affine.for %{{.*}} = 0 to 19 {
  // CHECK-NEXT: affine.store %{{.*}}, %{{.*}}[%{{.*}}] : memref<19xf32>
  // CHECK-NEXT: }
  // CHECK-NEXT: affine.for %{{.*}} = 0 to 19 {
  // CHECK-NEXT: affine.load %{{.*}}[%{{.*}}] : memref<19xf32>
  // CHECK-NEXT: }
  // CHECK-NEXT: affine.for %{{.*}} = 0 to 10 {
  // CHECK-NEXT: affine.for %{{.*}} = 0 to 10 {
  // CHECK-NEXT: affine.load %{{.*}}[%{{.*}}] : memref<19xf32>
  // CHECK-NEXT: }
  // CHECK-NEXT: }
  // CHECK-NEXT: }
  // CHECK-NEXT: return
  return
}


// -----

// CHECK-LABEL: func @should_fuse_with_private_memref() {
func.func @should_fuse_with_private_memref() {
  %m = memref.alloc() : memref<100xf32>
  %cf7 = arith.constant 7.0 : f32

  affine.for %i0 = 0 to 100 {
    affine.store %cf7, %m[%i0] : memref<100xf32>
  }
  affine.for %i1 = 0 to 17 {
    %v0 = affine.load %m[%i1] : memref<100xf32>
  }
  affine.for %i2 = 0 to 82 {
    %v1 = affine.load %m[%i2] : memref<100xf32>
  }
  // Should create a new private memref.
  // CHECK-DAG: memref.alloc() : memref<1xf32>
  // CHECK: affine.for %{{.*}} = 0 to 17 {
  // CHECK-NEXT: affine.store %{{.*}}, %{{.*}}[0] : memref<1xf32>
  // CHECK-NEXT: affine.load %{{.*}}[0] : memref<1xf32>
  // CHECK-NEXT: affine.load %{{.*}}[0] : memref<1xf32>
  // CHECK-NEXT: }
  // CHECK-NEXT: return
  return
}

// -----

// CHECK-LABEL: func @should_fuse_live_out_arg_but_preserve_src_loop(%{{.*}}: memref<10xf32>) {
func.func @should_fuse_live_out_arg_but_preserve_src_loop(%arg0: memref<10xf32>) {
  %cf7 = arith.constant 7.0 : f32

  affine.for %i0 = 0 to 10 {
    affine.store %cf7, %arg0[%i0] : memref<10xf32>
  }
  affine.for %i1 = 0 to 9 {
    %v0 = affine.load %arg0[%i1] : memref<10xf32>
  }
  // This tests that the loop nest '%i0' should not be removed after fusion
  // because it writes to memref argument '%arg0', and its read region
  // does not cover its write region (fusion would shrink the write region
  // in the fused loop nest, so the complete live-out data region would not
  // be written).
  // CHECK: affine.for %{{.*}} = 0 to 10 {
  // CHECK-NEXT: affine.store %{{.*}} : memref<10xf32>
  // CHECK-NEXT: }
  // CHECK-NEXT: affine.for %{{.*}} = 0 to 9 {
  // CHECK-NEXT: affine.store %{{.*}} : memref<1xf32>
  // CHECK-NEXT: affine.load %{{.*}} : memref<1xf32>
  // CHECK-NEXT: }
  // CHECK-NEXT: return
  return
}

// -----

// CHECK-LABEL: func @should_fuse_live_out_arg(%{{.*}}: memref<10xf32>) {
func.func @should_fuse_live_out_arg(%arg0: memref<10xf32>) {
  %cf7 = arith.constant 7.0 : f32

  affine.for %i0 = 0 to 10 {
    affine.store %cf7, %arg0[%i0] : memref<10xf32>
  }
  affine.for %i1 = 0 to 10 {
    %v0 = affine.load %arg0[%i1] : memref<10xf32>
  }
  // The read/write regions for memref '%arg0' are the same for both
  // loops, so they should fuse.

  // CHECK: affine.for %{{.*}} = 0 to 10 {
  // CHECK-NEXT: affine.store %{{.*}}, %{{.*}}[%{{.*}}] : memref<10xf32>
  // CHECK-NEXT: affine.load %{{.*}}[%{{.*}}] : memref<10xf32>
  // CHECK-NEXT: }
  // CHECK-NEXT: return
  return
}

// -----

// CHECK-LABEL: func @should_fuse_escaping_memref_but_preserve_src_loop() -> memref<10xf32>
func.func @should_fuse_escaping_memref_but_preserve_src_loop() -> memref<10xf32> {
  %cf7 = arith.constant 7.0 : f32
  %m = memref.alloc() : memref<10xf32>
  affine.for %i0 = 0 to 10 {
    affine.store %cf7, %m[%i0] : memref<10xf32>
  }
  affine.for %i1 = 0 to 9 {
    %v0 = affine.load %m[%i1] : memref<10xf32>
  }
  // This tests that the loop nest '%i0' should not be removed after fusion
  // because it writes to memref '%m', which is returned by the function, and
  // the '%i1' memory region does not cover the '%i0' memory region.

  // CHECK-DAG: memref.alloc() : memref<1xf32>
  // CHECK: affine.for %{{.*}} = 0 to 10 {
  // CHECK-NEXT: affine.store %{{.*}} : memref<10xf32>
  // CHECK-NEXT: }
  // CHECK-NEXT: affine.for %{{.*}} = 0 to 9 {
  // CHECK-NEXT: affine.store %{{.*}} : memref<1xf32>
  // CHECK-NEXT: affine.load %{{.*}} : memref<1xf32>
  // CHECK-NEXT: }
  // CHECK-NEXT: return %{{.*}} : memref<10xf32>
  return %m : memref<10xf32>
}
// -----

// This should fuse with the %in becoming a 1x1x1.
func.func @R3_to_R2_reshape() {
  %in = memref.alloc() : memref<2x3x16xi32>

  %c0 = arith.constant 0 : index

  affine.for %i0 = 0 to 2 {
    affine.for %i1 = 0 to 3 {
      affine.for %i2 = 0 to 16 {
        %val = "foo"(%i0, %i1, %i2) : (index, index, index) -> i32
        affine.store %val, %in[%i0, %i1, %i2] : memref<2x3x16xi32>
      }
    }
  }

  affine.for %ii = 0 to 32 {
    affine.for %jj = 0 to 3 {
      %a0 = affine.apply affine_map<(d0, d1) -> (d0 * 3 + d1)> (%ii, %jj)
      %idx = affine.apply affine_map<(d0) -> (d0 floordiv (3 * 16))> (%a0)
      %v = affine.load %in[%idx, %jj, %c0]
        : memref<2x3x16xi32>
    }
  }
  return
}
// CHECK-DAG: [[$MAP0:#map[0-9a-zA-Z_]*]] = affine_map<(d0, d1) -> ((d0 * 3 + d1) floordiv 48)>
// CHECK-DAG: [[$MAP1:#map[0-9a-zA-Z_]*]] = affine_map<(d0, d1) -> (d0 * 3 + d1)>
// CHECK-DAG: [[$MAP2:#map[0-9a-zA-Z_]*]] = affine_map<(d0) -> (d0 floordiv 48)>

// CHECK-LABEL: func @R3_to_R2_reshape()
// CHECK-DAG: memref.alloc() : memref<1x1x1xi32>
// CHECK: affine.for %{{.*}} = 0 to 32 {
// CHECK-NEXT: affine.for %{{.*}} = 0 to 3 {
// CHECK-NEXT: affine.apply [[$MAP0]](%{{.*}}, %{{.*}})
// CHECK-NEXT: "foo"(%{{.*}}, %{{.*}}, %{{.*}}) : (index, index, index) -> i32
// CHECK-NEXT: affine.store %{{.*}}, %{{.*}}[0, 0, 0] : memref<1x1x1xi32>
// CHECK-NEXT: affine.apply [[$MAP1]](%{{.*}}, %{{.*}})
// CHECK-NEXT: affine.apply [[$MAP2]](%{{.*}})
// CHECK-NEXT: affine.load %{{.*}}[0, 0, 0] : memref<1x1x1xi32>
// CHECK-NEXT: }
// CHECK-NEXT: }
// CHECK-NEXT: return

// -----

func.func @should_fuse_multi_output_producer() {
  %a = memref.alloc() : memref<10xf32>
  %b = memref.alloc() : memref<10xf32>

  %cf7 = arith.constant 7.0 : f32

  affine.for %i0 = 0 to 10 {
    affine.store %cf7, %a[%i0] : memref<10xf32>
    affine.store %cf7, %b[%i0] : memref<10xf32>
  }
  affine.for %i1 = 0 to 10 {
    %v0 = affine.load %a[%i1] : memref<10xf32>
    %v1 = affine.load %b[%i1] : memref<10xf32>
  }

  // CHECK: affine.for %{{.*}} = 0 to 10 {
  // CHECK-NEXT: affine.store %{{.*}}, %{{.*}}[0] : memref<1xf32>
  // CHECK-NEXT: affine.store %{{.*}}, %{{.*}}[0] : memref<1xf32>
  // CHECK-NEXT: affine.load %{{.*}}[0] : memref<1xf32>
  // CHECK-NEXT: affine.load %{{.*}}[0] : memref<1xf32>
  // CHECK-NEXT: }
  // CHECK-NEXT: return
  return
}

// -----

// CHECK-LABEL: func @fusion_preventing_deps_on_middle_loop() {
func.func @fusion_preventing_deps_on_middle_loop() {
  %a = memref.alloc() : memref<10xf32>
  %b = memref.alloc() : memref<10xf32>
  %c = memref.alloc() : memref<10xf32>

  %cf7 = arith.constant 7.0 : f32

  affine.for %i0 = 0 to 10 {
    %v0 = affine.load %a[%i0] : memref<10xf32>
    affine.store %v0, %b[%i0] : memref<10xf32>
  }
  affine.for %i1 = 0 to 10 {
    affine.store %cf7, %a[%i1] : memref<10xf32>
    %v1 = affine.load %c[%i1] : memref<10xf32>
  }
  affine.for %i2 = 0 to 10 {
    %v2 = affine.load %b[%i2] : memref<10xf32>
    affine.store %v2, %c[%i2] : memref<10xf32>
  }
  // Loops '%i0' and '%i2' cannot fuse along producer/consumer edge on memref
  // '%b', because of the WAR dep from '%i0' to '%i1' on memref '%a' and
  // because of the WAR dep from '%i1' to '%i2' on memref '%c'.
  // CHECK: affine.for %{{.*}} = 0 to 10 {
  // CHECK-NEXT: affine.load %{{.*}}[%{{.*}}] : memref<10xf32>
  // CHECK-NEXT: affine.store %{{.*}}, %{{.*}}[%{{.*}}] : memref<10xf32>
  // CHECK-NEXT: }
  // CHECK-NEXT: affine.for %{{.*}} = 0 to 10 {
  // CHECK-NEXT: affine.store %{{.*}}, %{{.*}}[%{{.*}}] : memref<10xf32>
  // CHECK-NEXT: affine.load %{{.*}}[%{{.*}}] : memref<10xf32>
  // CHECK-NEXT: }
  // CHECK-NEXT: affine.for %{{.*}} = 0 to 10 {
  // CHECK-NEXT: affine.load %{{.*}}[%{{.*}}] : memref<10xf32>
  // CHECK-NEXT: affine.store %{{.*}}, %{{.*}}[%{{.*}}] : memref<10xf32>
  // CHECK-NEXT: }
  // CHECK-NEXT: return
  return
}

// -----

// CHECK-LABEL: func @should_fuse_and_move_to_preserve_war_dep() {
func.func @should_fuse_and_move_to_preserve_war_dep() {
  %a = memref.alloc() : memref<10xf32>
  %b = memref.alloc() : memref<10xf32>
  %c = memref.alloc() : memref<10xf32>

  %cf7 = arith.constant 7.0 : f32

  affine.for %i0 = 0 to 10 {
    %v0 = affine.load %b[%i0] : memref<10xf32>
    affine.store %v0, %a[%i0] : memref<10xf32>
  }
  affine.for %i1 = 0 to 3 {
    %v2 = affine.load %c[%i1] : memref<10xf32>
  }
  affine.for %i2 = 0 to 5 {
    affine.store %cf7, %b[%i2] : memref<10xf32>
  }
  affine.for %i3 = 0 to 10 {
    %v1 = affine.load %a[%i3] : memref<10xf32>
    affine.store %cf7, %c[%i3] : memref<10xf32>
  }

  // Dependence graph:
  //
  //         %i0 ---------
  //           |         |
  //  --- %i1  | %b      | %a
  //  |        |         |
  //  | %c    %i2 <--    |
  //  |                  |
  //  --> %i3 <-----------
  //
  // It is possible to fuse loop '%i0' into '%i3' and preserve dependences
  // if the fused loop nest is inserted between loops '%i1' and '%i2'.
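  // That placement keeps the fused ('%i0' + '%i3') nest after '%i1',
  // preserving the WAR dep on '%c', and before '%i2', preserving the WAR
  // dep on '%b' from the '%i0' load to the '%i2' store.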

  // CHECK-DAG: memref.alloc() : memref<1xf32>
  // CHECK: affine.for %{{.*}} = 0 to 3 {
  // CHECK-NEXT: affine.load %{{.*}}[%{{.*}}] : memref<10xf32>
  // CHECK-NEXT: }
  // CHECK-NEXT: affine.for %{{.*}} = 0 to 10 {
  // CHECK-NEXT: affine.load %{{.*}}[%{{.*}}] : memref<10xf32>
  // CHECK-NEXT: affine.store %{{.*}}, %{{.*}}[0] : memref<1xf32>
  // CHECK-NEXT: affine.load %{{.*}}[0] : memref<1xf32>
  // CHECK-NEXT: affine.store %{{.*}}, %{{.*}}[%{{.*}}] : memref<10xf32>
  // CHECK-NEXT: }
  // CHECK-NEXT: affine.for %{{.*}} = 0 to 5 {
  // CHECK-NEXT: affine.store %{{.*}}, %{{.*}}[%{{.*}}] : memref<10xf32>
  // CHECK-NEXT: }
  // CHECK-NEXT: return
  return
}

// -----

// CHECK-LABEL: func @fusion_preventing_dep_on_constant() {
func.func @fusion_preventing_dep_on_constant() {
  %a = memref.alloc() : memref<10xf32>
  %b = memref.alloc() : memref<10xf32>
  %c = memref.alloc() : memref<10xf32>

  %cf7 = arith.constant 7.0 : f32

  affine.for %i0 = 0 to 10 {
    %v0 = affine.load %b[%i0] : memref<10xf32>
    affine.store %cf7, %a[%i0] : memref<10xf32>
  }
  affine.for %i1 = 0 to 10 {
    affine.store %cf7, %b[%i1] : memref<10xf32>
  }
  %cf11 = arith.constant 11.0 : f32
  affine.for %i2 = 0 to 10 {
    %v2 = affine.load %a[%i2] : memref<10xf32>
    affine.store %cf11, %c[%i2] : memref<10xf32>
  }
  // Loops '%i0' and '%i2' cannot fuse along producer/consumer edge on memref
  // '%a', because of the WAR dep from '%i0' to '%i1' on memref '%b' and
  // because of the SSA value dep from '%cf11' def to use in '%i2'.
  // CHECK: affine.for %{{.*}} = 0 to 10 {
  // CHECK-NEXT: affine.load %{{.*}}[%{{.*}}] : memref<10xf32>
  // CHECK-NEXT: affine.store %{{.*}}, %{{.*}}[%{{.*}}] : memref<10xf32>
  // CHECK-NEXT: }
  // CHECK-NEXT: affine.for %{{.*}} = 0 to 10 {
  // CHECK-NEXT: affine.store %{{.*}}, %{{.*}}[%{{.*}}] : memref<10xf32>
  // CHECK-NEXT: }
  // CHECK-NEXT: %{{.*}} = arith.constant 1.100000e+01 : f32
  // CHECK-NEXT: affine.for %{{.*}} = 0 to 10 {
  // CHECK-NEXT: affine.load %{{.*}}[%{{.*}}] : memref<10xf32>
  // CHECK-NEXT: affine.store %{{.*}}, %{{.*}}[%{{.*}}] : memref<10xf32>
  // CHECK-NEXT: }
  // CHECK-NEXT: return
  return
}

// -----

// CHECK-LABEL: func @should_fuse_and_preserve_dep_on_constant() {
func.func @should_fuse_and_preserve_dep_on_constant() {
  %a = memref.alloc() : memref<10xf32>
  %b = memref.alloc() : memref<10xf32>
  %c = memref.alloc() : memref<10xf32>

  %cf7 = arith.constant 7.0 : f32
  %cf11 = arith.constant 11.0 : f32
  affine.for %i0 = 0 to 10 {
    %v0 = affine.load %b[%i0] : memref<10xf32>
    affine.store %cf7, %a[%i0] : memref<10xf32>
  }
  affine.for %i1 = 0 to 10 {
    affine.store %cf7, %b[%i1] : memref<10xf32>
  }
  affine.for %i2 = 0 to 10 {
    %v2 = affine.load %a[%i2] : memref<10xf32>
    affine.store %cf11, %c[%i2] : memref<10xf32>
  }

  // Loops '%i0' and '%i2' can fuse along producer/consumer edge on memref
  // '%a', and preserve the WAR dep from '%i0' to '%i1' on memref '%b', and
  // the SSA value dep from '%cf11' def to use in '%i2'.
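  // Note in the CHECK lines below that the '%cf11' constant now appears above
  // the fused loop nest, so its use inside the fused loop remains dominated
  // by its definition.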

  // CHECK: arith.constant 1.100000e+01 : f32
  // CHECK-NEXT: affine.for %{{.*}} = 0 to 10 {
  // CHECK-NEXT: affine.load %{{.*}}[%{{.*}}] : memref<10xf32>
  // CHECK-NEXT: affine.store %{{.*}}, %{{.*}}[0] : memref<1xf32>
  // CHECK-NEXT: affine.load %{{.*}}[0] : memref<1xf32>
  // CHECK-NEXT: affine.store %{{.*}}, %{{.*}}[%{{.*}}] : memref<10xf32>
  // CHECK-NEXT: }
  // CHECK-NEXT: affine.for %{{.*}} = 0 to 10 {
  // CHECK-NEXT: affine.store %{{.*}}, %{{.*}}[%{{.*}}] : memref<10xf32>
  // CHECK-NEXT: }
  // CHECK-NEXT: return
  return
}

// -----

// CHECK-LABEL: @producer_consumer_with_outmost_user
func.func @producer_consumer_with_outmost_user(%arg0 : f16) {
  %c0 = arith.constant 0 : index
  %src = memref.alloc() : memref<f16, 1>
  %dst = memref.alloc() : memref<f16>
  %tag = memref.alloc() : memref<1xi32>
  affine.for %arg1 = 4 to 6 {
    affine.for %arg2 = 0 to 1 {
      %0 = arith.addf %arg0, %arg0 : f16
      affine.store %0, %src[] : memref<f16, 1>
    }
    affine.for %arg3 = 0 to 1 {
      %0 = affine.load %src[] : memref<f16, 1>
    }
  }
  affine.dma_start %src[], %dst[], %tag[%c0], %c0 : memref<f16, 1>, memref<f16>, memref<1xi32>
  // CHECK: %[[CST_INDEX:.*]] = arith.constant 0 : index
  // CHECK: %[[DMA_SRC:.*]] = memref.alloc() : memref<f16, 1>
  // CHECK: %[[DMA_DST:.*]] = memref.alloc() : memref<f16>
  // CHECK: %[[DMA_TAG:.*]] = memref.alloc() : memref<1xi32>
  // CHECK: affine.for %arg1 = 4 to 6
  // CHECK-NEXT: affine.for %arg2 = 0 to 1
  // CHECK-NEXT: %[[RESULT_ADD:.*]] = arith.addf %arg0, %arg0 : f16
  // CHECK-NEXT: affine.store %[[RESULT_ADD]], %[[DMA_SRC]][] : memref<f16, 1>
  // CHECK-NEXT: affine.load %[[DMA_SRC]][] : memref<f16, 1>
  // CHECK: affine.dma_start %[[DMA_SRC]][], %[[DMA_DST]][], %[[DMA_TAG]][%[[CST_INDEX]]], %[[CST_INDEX]] : memref<f16, 1>, memref<f16>, memref<1xi32>
  // CHECK-NEXT: return
  return
}

// Add further tests in mlir/test/Transforms/loop-fusion-4.mlir