// RUN: mlir-opt -allow-unregistered-dialect %s -pass-pipeline='builtin.module(func.func(affine-loop-fusion))' -split-input-file | FileCheck %s
// RUN: mlir-opt -allow-unregistered-dialect %s -pass-pipeline='builtin.module(func.func(affine-loop-fusion{fusion-maximal}))' -split-input-file | FileCheck %s --check-prefix=MAXIMAL

// Part I of fusion tests in mlir/test/Transforms/loop-fusion.mlir.
// Part II of fusion tests in mlir/test/Transforms/loop-fusion-2.mlir
// Part IV of fusion tests in mlir/test/Transforms/loop-fusion-4.mlir

// -----

// Test case from github bug 777.
// CHECK-LABEL: func @mul_add_0
func.func @mul_add_0(%arg0: memref<3x4xf32>, %arg1: memref<4x3xf32>, %arg2: memref<3x3xf32>, %arg3: memref<3x3xf32>) {
  %cst = arith.constant 0.000000e+00 : f32
  %0 = memref.alloc() : memref<3x3xf32>
  affine.for %arg4 = 0 to 3 {
    affine.for %arg5 = 0 to 3 {
      affine.store %cst, %0[%arg4, %arg5] : memref<3x3xf32>
    }
  }
  affine.for %arg4 = 0 to 3 {
    affine.for %arg5 = 0 to 3 {
      affine.for %arg6 = 0 to 4 {
        %1 = affine.load %arg1[%arg6, %arg5] : memref<4x3xf32>
        %2 = affine.load %arg0[%arg4, %arg6] : memref<3x4xf32>
        %3 = arith.mulf %2, %1 : f32
        %4 = affine.load %0[%arg4, %arg5] : memref<3x3xf32>
        %5 = arith.addf %4, %3 : f32
        affine.store %5, %0[%arg4, %arg5] : memref<3x3xf32>
      }
    }
  }
  affine.for %arg4 = 0 to 3 {
    affine.for %arg5 = 0 to 3 {
      %6 = affine.load %arg2[%arg4, %arg5] : memref<3x3xf32>
      %7 = affine.load %0[%arg4, %arg5] : memref<3x3xf32>
      %8 = arith.addf %7, %6 : f32
      affine.store %8, %arg3[%arg4, %arg5] : memref<3x3xf32>
    }
  }
  // CHECK: affine.for %[[i0:.*]] = 0 to 3 {
  // CHECK-NEXT: affine.for %[[i1:.*]] = 0 to 3 {
  // CHECK-NEXT: affine.store %{{.*}}, %{{.*}}[0, 0] : memref<1x1xf32>
  // CHECK-NEXT: affine.for %[[i2:.*]] = 0 to 4 {
  // CHECK-NEXT: affine.load %{{.*}}[%[[i2]], %[[i1]]] : memref<4x3xf32>
  // CHECK-NEXT: affine.load %{{.*}}[%[[i0]], %[[i2]]] : memref<3x4xf32>
  // CHECK-NEXT: arith.mulf %{{.*}}, %{{.*}} : f32
  // CHECK-NEXT: affine.load %{{.*}}[0, 0] : memref<1x1xf32>
  // CHECK-NEXT: arith.addf %{{.*}}, %{{.*}} : f32
  // CHECK-NEXT: affine.store %{{.*}}, %{{.*}}[0, 0] : memref<1x1xf32>
  // CHECK-NEXT: }
  // CHECK-NEXT: affine.load %{{.*}}[%[[i0]], %[[i1]]] : memref<3x3xf32>
  // CHECK-NEXT: affine.load %{{.*}}[0, 0] : memref<1x1xf32>
  // CHECK-NEXT: arith.addf %{{.*}}, %{{.*}} : f32
  // CHECK-NEXT: affine.store %{{.*}}, %{{.*}}[%[[i0]], %[[i1]]] : memref<3x3xf32>
  // CHECK-NEXT: }
  // CHECK-NEXT: }
  // CHECK-NEXT: return
  return
}

// -----

// Verify that 'fuseProducerConsumerNodes' fuse a producer loop with a store
// that has multiple outgoing edges.

// CHECK-LABEL: func @should_fuse_multi_outgoing_edge_store_producer
func.func @should_fuse_multi_outgoing_edge_store_producer(%a : memref<1xf32>) {
  %cst = arith.constant 0.000000e+00 : f32
  affine.for %arg0 = 0 to 1 {
    affine.store %cst, %a[%arg0] : memref<1xf32>
  }

  affine.for %arg0 = 0 to 1 {
    %0 = affine.load %a[%arg0] : memref<1xf32>
  }

  affine.for %arg0 = 0 to 1 {
    %0 = affine.load %a[%arg0] : memref<1xf32>
  }
  // CHECK: affine.for %{{.*}} = 0 to 1 {
  // CHECK-NEXT: affine.store
  // CHECK-NEXT: affine.load
  // CHECK-NEXT: affine.load
  // CHECK-NEXT: }

  return
}

// -----

// Verify that 'fuseProducerConsumerNodes' fuses a producer loop that: 1) has
// multiple outgoing edges, 2) producer store has a single outgoing edge.
// Sibling loop fusion should not fuse any of these loops due to
// dependencies on external memrefs '%a' and '%b'.

// CHECK-LABEL: func @should_fuse_producer_with_multi_outgoing_edges
func.func @should_fuse_producer_with_multi_outgoing_edges(%a : memref<1xf32>, %b : memref<1xf32>) {
  %cst = arith.constant 0.000000e+00 : f32
  affine.for %arg0 = 0 to 1 {
    %0 = affine.load %a[%arg0] : memref<1xf32>
    affine.store %cst, %b[%arg0] : memref<1xf32>
  }

  affine.for %arg0 = 0 to 1 {
    affine.store %cst, %a[%arg0] : memref<1xf32>
    %1 = affine.load %b[%arg0] : memref<1xf32>
  }
  // CHECK: affine.for %{{.*}} = 0 to 1
  // CHECK-NEXT: affine.load %[[A:.*]][{{.*}}]
  // CHECK-NEXT: affine.store %{{.*}}, %[[B:.*]][{{.*}}]
  // CHECK-NEXT: affine.store %{{.*}}, %[[A]]
  // CHECK-NEXT: affine.load %[[B]]
  // CHECK-NOT: affine.for %{{.*}}
  // CHECK: return
  return
}

// MAXIMAL-LABEL: func @reshape_into_matmul
func.func @reshape_into_matmul(%lhs : memref<1024x1024xf32>,
              %R: memref<16x64x1024xf32>, %out: memref<1024x1024xf32>) {
  %rhs = memref.alloc() : memref<1024x1024xf32>

  // Reshape from 3-d to 2-d.
  affine.for %i0 = 0 to 16 {
    affine.for %i1 = 0 to 64 {
      affine.for %k = 0 to 1024 {
        %v = affine.load %R[%i0, %i1, %k] : memref<16x64x1024xf32>
        affine.store %v, %rhs[64*%i0 + %i1, %k] : memref<1024x1024xf32>
      }
    }
  }

  // Matmul.
  affine.for %i = 0 to 1024 {
    affine.for %j = 0 to 1024 {
      affine.for %k = 0 to 1024 {
        %0 = affine.load %rhs[%k, %j] : memref<1024x1024xf32>
        %1 = affine.load %lhs[%i, %k] : memref<1024x1024xf32>
        %2 = arith.mulf %1, %0 : f32
        %3 = affine.load %out[%i, %j] : memref<1024x1024xf32>
        %4 = arith.addf %3, %2 : f32
        affine.store %4, %out[%i, %j] : memref<1024x1024xf32>
      }
    }
  }
  return
}
// MAXIMAL-NEXT: memref.alloc
// MAXIMAL-NEXT: affine.for
// MAXIMAL-NEXT: affine.for
// MAXIMAL-NEXT: affine.for
// MAXIMAL-NOT: affine.for
// MAXIMAL: return

// -----

// CHECK-LABEL: func @vector_loop
func.func @vector_loop(%a : memref<10x20xf32>, %b : memref<10x20xf32>,
                       %c : memref<10x20xf32>) {
  affine.for %j = 0 to 10 {
    affine.for %i = 0 to 5 {
      %ld0 = affine.vector_load %a[%j, %i*4] : memref<10x20xf32>, vector<4xf32>
      affine.vector_store %ld0, %b[%j, %i*4] : memref<10x20xf32>, vector<4xf32>
    }
  }

  affine.for %j = 0 to 10 {
    affine.for %i = 0 to 5 {
      %ld0 = affine.vector_load %b[%j, %i*4] : memref<10x20xf32>, vector<4xf32>
      affine.vector_store %ld0, %c[%j, %i*4] : memref<10x20xf32>, vector<4xf32>
    }
  }

  return
}
// CHECK: affine.for
// CHECK-NEXT: affine.for
// CHECK-NEXT: affine.vector_load
// CHECK-NEXT: affine.vector_store
// CHECK-NEXT: affine.vector_load
// CHECK-NEXT: affine.vector_store
// CHECK-NOT: affine.for

// -----

// CHECK-LABEL: func @multi_outgoing_edges
func.func @multi_outgoing_edges(%in0 : memref<32xf32>,
                                %in1 : memref<32xf32>) {
  affine.for %d = 0 to 32 {
    %lhs = affine.load %in0[%d] : memref<32xf32>
    %rhs = affine.load %in1[%d] : memref<32xf32>
    %add = arith.addf %lhs, %rhs : f32
    affine.store %add, %in0[%d] : memref<32xf32>
  }
  affine.for %d = 0 to 32 {
    %lhs = affine.load %in0[%d] : memref<32xf32>
    %rhs = affine.load %in1[%d] : memref<32xf32>
    %add = arith.subf %lhs, %rhs : f32
    affine.store %add, %in0[%d] : memref<32xf32>
  }
  affine.for %d = 0 to 32 {
    %lhs = affine.load %in0[%d] : memref<32xf32>
    %rhs = affine.load %in1[%d] : memref<32xf32>
    %add = arith.mulf %lhs, %rhs : f32
    affine.store %add, %in0[%d] : memref<32xf32>
  }
  affine.for %d = 0 to 32 {
    %lhs = affine.load %in0[%d] : memref<32xf32>
    %rhs = affine.load %in1[%d] : memref<32xf32>
    %add = arith.divf %lhs, %rhs : f32
    affine.store %add, %in0[%d] : memref<32xf32>
  }
  return
}

// CHECK: affine.for
// CHECK-NOT: affine.for
// CHECK: arith.addf
// CHECK-NOT: affine.for
// CHECK: arith.subf
// CHECK-NOT: affine.for
// CHECK: arith.mulf
// CHECK-NOT: affine.for
// CHECK: arith.divf

// -----

// Test fusion when dynamically shaped memrefs are used with constant trip count loops.

// CHECK-LABEL: func @calc
func.func @calc(%arg0: memref<?xf32>, %arg1: memref<?xf32>, %arg2: memref<?xf32>, %len: index) {
  %c1 = arith.constant 1 : index
  %1 = memref.alloc(%len) : memref<?xf32>
  affine.for %arg4 = 1 to 10 {
    %7 = affine.load %arg0[%arg4] : memref<?xf32>
    %8 = affine.load %arg1[%arg4] : memref<?xf32>
    %9 = arith.addf %7, %8 : f32
    affine.store %9, %1[%arg4] : memref<?xf32>
  }
  affine.for %arg4 = 1 to 10 {
    %7 = affine.load %1[%arg4] : memref<?xf32>
    %8 = affine.load %arg1[%arg4] : memref<?xf32>
    %9 = arith.mulf %7, %8 : f32
    affine.store %9, %arg2[%arg4] : memref<?xf32>
  }
  return
}
// CHECK: memref.alloc() : memref<1xf32>
// CHECK: affine.for %arg{{.*}} = 1 to 10 {
// CHECK-NEXT: affine.load %arg{{.*}}
// CHECK-NEXT: affine.load %arg{{.*}}
// CHECK-NEXT: arith.addf
// CHECK-NEXT: affine.store %{{.*}}, %{{.*}}[0] : memref<1xf32>
// CHECK-NEXT: affine.load %{{.*}}[0] : memref<1xf32>
// CHECK-NEXT: affine.load %arg{{.*}}[%arg{{.*}}] : memref<?xf32>
// CHECK-NEXT: arith.mulf
// CHECK-NEXT: affine.store %{{.*}}, %arg{{.*}}[%arg{{.*}}] : memref<?xf32>
// CHECK-NEXT: }
// CHECK-NEXT: return

// -----

// CHECK-LABEL: func @should_not_fuse_since_non_affine_users
func.func @should_not_fuse_since_non_affine_users(%in0 : memref<32xf32>,
                                                  %in1 : memref<32xf32>) {
  affine.for %d = 0 to 32 {
    %lhs = affine.load %in0[%d] : memref<32xf32>
    %rhs = affine.load %in1[%d] : memref<32xf32>
    %add = arith.addf %lhs, %rhs : f32
    affine.store %add, %in0[%d] : memref<32xf32>
  }
  affine.for %d = 0 to 32 {
    %lhs = memref.load %in0[%d] : memref<32xf32>
    %rhs = memref.load %in1[%d] : memref<32xf32>
    %add = arith.subf %lhs, %rhs : f32
    memref.store %add, %in0[%d] : memref<32xf32>
  }
  affine.for %d = 0 to 32 {
    %lhs = affine.load %in0[%d] : memref<32xf32>
    %rhs = affine.load %in1[%d] : memref<32xf32>
    %add = arith.mulf %lhs, %rhs : f32
    affine.store %add, %in0[%d] : memref<32xf32>
  }
  return
}

// CHECK: affine.for
// CHECK: arith.addf
// CHECK: affine.for
// CHECK: arith.subf
// CHECK: affine.for
// CHECK: arith.mulf

// -----

// CHECK-LABEL: func @should_not_fuse_since_top_level_non_affine_users
func.func @should_not_fuse_since_top_level_non_affine_users(%in0 : memref<32xf32>,
                                                            %in1 : memref<32xf32>) {
  %sum = memref.alloc() : memref<f32>
  affine.for %d = 0 to 32 {
    %lhs = affine.load %in0[%d] : memref<32xf32>
    %rhs = affine.load %in1[%d] : memref<32xf32>
    %add = arith.addf %lhs, %rhs : f32
    memref.store %add, %sum[] : memref<f32>
    affine.store %add, %in0[%d] : memref<32xf32>
  }
  %load_sum = memref.load %sum[] : memref<f32>
  affine.for %d = 0 to 32 {
    %lhs = affine.load %in0[%d] : memref<32xf32>
    %rhs = affine.load %in1[%d] : memref<32xf32>
    %add = arith.mulf %lhs, %rhs : f32
    %sub = arith.subf %add, %load_sum: f32
    affine.store %sub, %in0[%d] : memref<32xf32>
  }
  memref.dealloc %sum : memref<f32>
  return
}

// CHECK: affine.for
// CHECK: arith.addf
// CHECK: affine.for
// CHECK: arith.mulf
// CHECK: arith.subf

// -----

// CHECK-LABEL: func @should_not_fuse_since_top_level_non_affine_mem_write_users
func.func @should_not_fuse_since_top_level_non_affine_mem_write_users(
    %in0 : memref<32xf32>, %in1 : memref<32xf32>) {
  %c0 = arith.constant 0 : index
  %cst_0 = arith.constant 0.000000e+00 : f32

  affine.for %d = 0 to 32 {
    %lhs = affine.load %in0[%d] : memref<32xf32>
    %rhs = affine.load %in1[%d] : memref<32xf32>
    %add = arith.addf %lhs, %rhs : f32
    affine.store %add, %in0[%d] : memref<32xf32>
  }
  memref.store %cst_0, %in0[%c0] : memref<32xf32>
  affine.for %d = 0 to 32 {
    %lhs = affine.load %in0[%d] : memref<32xf32>
    %rhs = affine.load %in1[%d] : memref<32xf32>
    %add = arith.addf %lhs, %rhs: f32
    affine.store %add, %in0[%d] : memref<32xf32>
  }
  return
}

// CHECK: affine.for
// CHECK: arith.addf
// CHECK: affine.for
// CHECK: arith.addf

// -----

// MAXIMAL-LABEL: func @fuse_minor_affine_map
func.func @fuse_minor_affine_map(%in: memref<128xf32>, %out: memref<20x512xf32>) {
  %tmp = memref.alloc() : memref<128xf32>

  affine.for %arg4 = 0 to 128 {
    %ld = affine.load %in[%arg4] : memref<128xf32>
    affine.store %ld, %tmp[%arg4] : memref<128xf32>
  }

  affine.for %arg3 = 0 to 20 {
    affine.for %arg4 = 0 to 512 {
      %ld = affine.load %tmp[%arg4 mod 128] : memref<128xf32>
      affine.store %ld, %out[%arg3, %arg4] : memref<20x512xf32>
    }
  }

  return
}

// TODO: The size of the private memref is not properly computed in the presence
// of the 'mod' operation. It should be memref<1xf32> instead of
// memref<128xf32>: https://bugs.llvm.org/show_bug.cgi?id=46973
// MAXIMAL: memref.alloc() : memref<128xf32>
// MAXIMAL: affine.for
// MAXIMAL-NEXT: affine.for
// MAXIMAL-NOT: affine.for
// MAXIMAL: return

// -----

// CHECK-LABEL: func @should_fuse_multi_store_producer_and_privatize_memfefs
func.func @should_fuse_multi_store_producer_and_privatize_memfefs() {
  %a = memref.alloc() : memref<10xf32>
  %b = memref.alloc() : memref<10xf32>
  %c = memref.alloc() : memref<10xf32>
  %cst = arith.constant 0.000000e+00 : f32
  affine.for %arg0 = 0 to 10 {
    affine.store %cst, %a[%arg0] : memref<10xf32>
    affine.store %cst, %b[%arg0] : memref<10xf32>
    affine.store %cst, %c[%arg0] : memref<10xf32>
    %0 = affine.load %c[%arg0] : memref<10xf32>
  }

  affine.for %arg0 = 0 to 10 {
    %0 = affine.load %a[%arg0] : memref<10xf32>
  }

  affine.for %arg0 = 0 to 10 {
    %0 = affine.load %b[%arg0] : memref<10xf32>
  }

  // All the memrefs should be privatized except '%c', which is not involved in
  // the producer-consumer fusion.
  // CHECK: affine.for %{{.*}} = 0 to 10 {
  // CHECK-NEXT: affine.store %{{.*}}, %{{.*}}[0] : memref<1xf32>
  // CHECK-NEXT: affine.store %{{.*}}, %{{.*}}[0] : memref<1xf32>
  // CHECK-NEXT: affine.store %{{.*}}, %{{.*}}[%{{.*}}] : memref<10xf32>
  // CHECK-NEXT: affine.load %{{.*}}[%{{.*}}] : memref<10xf32>
  // CHECK-NEXT: affine.load %{{.*}}[0] : memref<1xf32>
  // CHECK-NEXT: affine.load %{{.*}}[0] : memref<1xf32>
  // CHECK-NEXT: }

  return
}


func.func @should_fuse_multi_store_producer_with_escaping_memrefs_and_remove_src(
    %a : memref<10xf32>, %b : memref<10xf32>) {
  %cst = arith.constant 0.000000e+00 : f32
  affine.for %i0 = 0 to 10 {
    affine.store %cst, %a[%i0] : memref<10xf32>
    affine.store %cst, %b[%i0] : memref<10xf32>
  }

  affine.for %i1 = 0 to 10 {
    %0 = affine.load %a[%i1] : memref<10xf32>
  }

  affine.for %i2 = 0 to 10 {
    %0 = affine.load %b[%i2] : memref<10xf32>
  }

  // Producer loop '%i0' should be removed after fusion since fusion is maximal.
  // No memref should be privatized since they escape the function, and the
  // producer is removed after fusion.
  // CHECK: affine.for %{{.*}} = 0 to 10 {
  // CHECK-NEXT: affine.store %{{.*}}, %{{.*}}[%{{.*}}] : memref<10xf32>
  // CHECK-NEXT: affine.store %{{.*}}, %{{.*}}[%{{.*}}] : memref<10xf32>
  // CHECK-NEXT: affine.load %{{.*}}[%{{.*}}] : memref<10xf32>
  // CHECK-NEXT: affine.load %{{.*}}[%{{.*}}] : memref<10xf32>
  // CHECK-NEXT: }
  // CHECK-NOT: affine.for

  return
}

// -----

func.func @should_fuse_multi_store_producer_with_escaping_memrefs_and_preserve_src(
    %a : memref<10xf32>, %b : memref<10xf32>) {
  %cst = arith.constant 0.000000e+00 : f32
  affine.for %i0 = 0 to 10 {
    affine.store %cst, %a[%i0] : memref<10xf32>
    affine.store %cst, %b[%i0] : memref<10xf32>
  }

  affine.for %i1 = 0 to 5 {
    %0 = affine.load %a[%i1] : memref<10xf32>
  }

  affine.for %i2 = 0 to 10 {
    %0 = affine.load %b[%i2] : memref<10xf32>
  }

  // Loops '%i0' and '%i2' should be fused first and '%i0' should be removed
  // since fusion is maximal. Then the fused loop and '%i1' should be fused
  // and the fused loop shouldn't be removed since fusion is not maximal.
  // CHECK: affine.for %{{.*}} = 0 to 10 {
  // CHECK-NEXT: affine.store %{{.*}}, %{{.*}}[%{{.*}}] : memref<10xf32>
  // CHECK-NEXT: affine.store %{{.*}}, %{{.*}}[%{{.*}}] : memref<10xf32>
  // CHECK-NEXT: affine.load %{{.*}}[%{{.*}}] : memref<10xf32>
  // CHECK-NEXT: }
  // CHECK: affine.for %{{.*}} = 0 to 5 {
  // CHECK-NEXT: affine.store %{{.*}} : memref<1xf32>
  // CHECK-NEXT: affine.store %{{.*}} : memref<10xf32>
  // CHECK-NEXT: affine.load %{{.*}} : memref<10xf32>
  // CHECK-NEXT: affine.load %{{.*}} : memref<1xf32>
  // CHECK-NEXT: }
  // CHECK-NOT: affine.for

  return
}


func.func @should_not_fuse_due_to_dealloc(%arg0: memref<16xf32>){
  %A = memref.alloc() : memref<16xf32>
  %C = memref.alloc() : memref<16xf32>
  %cst_1 = arith.constant 1.000000e+00 : f32
  affine.for %arg1 = 0 to 16 {
    %a = affine.load %arg0[%arg1] : memref<16xf32>
    affine.store %a, %A[%arg1] : memref<16xf32>
    affine.store %a, %C[%arg1] : memref<16xf32>
  }
  memref.dealloc %C : memref<16xf32>
  %B = memref.alloc() : memref<16xf32>
  affine.for %arg1 = 0 to 16 {
    %a = affine.load %A[%arg1] : memref<16xf32>
    %b = arith.addf %cst_1, %a : f32
    affine.store %b, %B[%arg1] : memref<16xf32>
  }
  memref.dealloc %A : memref<16xf32>
  return
}
// CHECK-LABEL: func @should_not_fuse_due_to_dealloc
// CHECK: affine.for
// CHECK-NEXT: affine.load
// CHECK-NEXT: affine.store
// CHECK-NEXT: affine.store
// CHECK: memref.dealloc
// CHECK: affine.for
// CHECK-NEXT: affine.load
// CHECK-NEXT: arith.addf
// CHECK-NEXT: affine.store

// -----

// CHECK-LABEL: func @should_fuse_defining_node_has_no_dependence_from_source_node
func.func @should_fuse_defining_node_has_no_dependence_from_source_node(
    %a : memref<10xf32>, %b : memref<f32>) -> () {
  affine.for %i0 = 0 to 10 {
    %0 = affine.load %b[] : memref<f32>
    affine.store %0, %a[%i0] : memref<10xf32>
  }
  %0 = affine.load %b[] : memref<f32>
  affine.for %i1 = 0 to 10 {
    %1 = affine.load %a[%i1] : memref<10xf32>
    %2 = arith.divf %0, %1 : f32
  }

  // Loops '%i0' and '%i1' should be fused even though there is a defining node
  // between the loops. It is because the node has no dependence from '%i0'.
  // CHECK: affine.load %{{.*}}[] : memref<f32>
  // CHECK-NEXT: affine.for %{{.*}} = 0 to 10 {
  // CHECK-NEXT: affine.load %{{.*}}[] : memref<f32>
  // CHECK-NEXT: affine.store %{{.*}}, %{{.*}}[%{{.*}}] : memref<10xf32>
  // CHECK-NEXT: affine.load %{{.*}}[%{{.*}}] : memref<10xf32>
  // CHECK-NEXT: arith.divf
  // CHECK-NEXT: }
  // CHECK-NOT: affine.for
  return
}

// -----

// CHECK-LABEL: func @should_not_fuse_defining_node_has_dependence_from_source_loop
func.func @should_not_fuse_defining_node_has_dependence_from_source_loop(
    %a : memref<10xf32>, %b : memref<f32>) -> () {
  %cst = arith.constant 0.000000e+00 : f32
  affine.for %i0 = 0 to 10 {
    affine.store %cst, %b[] : memref<f32>
    affine.store %cst, %a[%i0] : memref<10xf32>
  }
  %0 = affine.load %b[] : memref<f32>
  affine.for %i1 = 0 to 10 {
    %1 = affine.load %a[%i1] : memref<10xf32>
    %2 = arith.divf %0, %1 : f32
  }

  // Loops '%i0' and '%i1' should not be fused because the defining node of '%0'
  // used in '%i1' has dependence from loop '%i0'.
  // CHECK: affine.for %{{.*}} = 0 to 10 {
  // CHECK-NEXT: affine.store %{{.*}}, %{{.*}}[] : memref<f32>
  // CHECK-NEXT: affine.store %{{.*}}, %{{.*}}[%{{.*}}] : memref<10xf32>
  // CHECK-NEXT: }
  // CHECK-NEXT: affine.load %{{.*}}[] : memref<f32>
  // CHECK: affine.for %{{.*}} = 0 to 10 {
  // CHECK-NEXT: affine.load %{{.*}}[%{{.*}}] : memref<10xf32>
  // CHECK-NEXT: arith.divf
  // CHECK-NEXT: }
  return
}

// -----

// CHECK-LABEL: func @should_not_fuse_defining_node_has_transitive_dependence_from_source_loop
func.func @should_not_fuse_defining_node_has_transitive_dependence_from_source_loop(
    %a : memref<10xf32>, %b : memref<10xf32>, %c : memref<f32>) -> () {
  %cst = arith.constant 0.000000e+00 : f32
  affine.for %i0 = 0 to 10 {
    affine.store %cst, %a[%i0] : memref<10xf32>
    affine.store %cst, %b[%i0] : memref<10xf32>
  }
  affine.for %i1 = 0 to 10 {
    %1 = affine.load %b[%i1] : memref<10xf32>
    affine.store %1, %c[] : memref<f32>
  }
  %0 = affine.load %c[] : memref<f32>
  affine.for %i2 = 0 to 10 {
    %1 = affine.load %a[%i2] : memref<10xf32>
    %2 = arith.divf %0, %1 : f32
  }

  // When loops '%i0' and '%i2' are evaluated first, they should not be
  // fused. The defining node of '%0' in loop '%i2' has transitive dependence
  // from loop '%i0'. After that, loops '%i0' and '%i1' are evaluated, and they
  // will be fused as usual.
  // CHECK: affine.for %{{.*}} = 0 to 10 {
  // CHECK-NEXT: affine.store %{{.*}}, %{{.*}}[%{{.*}}] : memref<10xf32>
  // CHECK-NEXT: affine.store %{{.*}}, %{{.*}}[%{{.*}}] : memref<10xf32>
  // CHECK-NEXT: affine.load %{{.*}}[%{{.*}}] : memref<10xf32>
  // CHECK-NEXT: affine.store %{{.*}}, %{{.*}}[] : memref<f32>
  // CHECK-NEXT: }
  // CHECK-NEXT: affine.load %{{.*}}[] : memref<f32>
  // CHECK: affine.for %{{.*}} = 0 to 10 {
  // CHECK-NEXT: affine.load %{{.*}}[%{{.*}}] : memref<10xf32>
  // CHECK-NEXT: arith.divf
  // CHECK-NEXT: }
  // CHECK-NOT: affine.for
  return
}

// -----

// CHECK-LABEL: func @should_not_fuse_dest_loop_nest_return_value
func.func @should_not_fuse_dest_loop_nest_return_value(
    %a : memref<10xf32>) -> () {
  %cst = arith.constant 0.000000e+00 : f32
  affine.for %i0 = 0 to 10 {
    affine.store %cst, %a[%i0] : memref<10xf32>
  }
  %b = affine.for %i1 = 0 to 10 step 2 iter_args(%b_iter = %cst) -> f32 {
    %load_a = affine.load %a[%i1] : memref<10xf32>
    affine.yield %load_a: f32
  }

  // CHECK: affine.for %{{.*}} = 0 to 10 {
  // CHECK-NEXT: affine.store %{{.*}}, %{{.*}}[%{{.*}}] : memref<10xf32>
  // CHECK-NEXT: }
  // CHECK: affine.for %{{.*}} = 0 to 10 step 2 iter_args(%{{.*}} = %{{.*}}) -> (f32) {
  // CHECK-NEXT: affine.load
  // CHECK-NEXT: affine.yield
  // CHECK-NEXT: }

  return
}

// -----

// CHECK-LABEL: func @should_not_fuse_src_loop_nest_return_value
func.func @should_not_fuse_src_loop_nest_return_value(
    %a : memref<10xf32>) -> () {
  %cst = arith.constant 1.000000e+00 : f32
  %b = affine.for %i = 0 to 10 step 2 iter_args(%b_iter = %cst) -> f32 {
    %c = arith.addf %b_iter, %b_iter : f32
    affine.store %c, %a[%i] : memref<10xf32>
    affine.yield %c: f32
  }
  affine.for %i1 = 0 to 10 {
    %1 = affine.load %a[%i1] : memref<10xf32>
  }

  // CHECK: %{{.*}} = affine.for %{{.*}} = 0 to 10 step 2 iter_args(%{{.*}} = %{{.*}}) -> (f32) {
  // CHECK-NEXT: %{{.*}} = arith.addf %{{.*}}, %{{.*}} : f32
  // CHECK-NEXT: affine.store %{{.*}}, %{{.*}}[%{{.*}}] : memref<10xf32>
  // CHECK-NEXT: affine.yield %{{.*}} : f32
  // CHECK-NEXT: }
  // CHECK: affine.for %{{.*}} = 0 to 10 {
  // CHECK-NEXT: affine.load %{{.*}}[%{{.*}}] : memref<10xf32>
  // CHECK-NEXT: }

  return
}

// -----

func.func private @some_function(memref<16xf32>)
func.func @call_op_prevents_fusion(%arg0: memref<16xf32>){
  %A = memref.alloc() : memref<16xf32>
  %cst_1 = arith.constant 1.000000e+00 : f32
  affine.for %arg1 = 0 to 16 {
    %a = affine.load %arg0[%arg1] : memref<16xf32>
    affine.store %a, %A[%arg1] : memref<16xf32>
  }
  call @some_function(%A) : (memref<16xf32>) -> ()
  %B = memref.alloc() : memref<16xf32>
  affine.for %arg1 = 0 to 16 {
    %a = affine.load %A[%arg1] : memref<16xf32>
    %b = arith.addf %cst_1, %a : f32
    affine.store %b, %B[%arg1] : memref<16xf32>
  }
  return
}
// CHECK-LABEL: func @call_op_prevents_fusion
// CHECK: affine.for
// CHECK-NEXT: affine.load
// CHECK-NEXT: affine.store
// CHECK: call
// CHECK: affine.for
// CHECK-NEXT: affine.load
// CHECK-NEXT: arith.addf
// CHECK-NEXT: affine.store

// -----

func.func private @some_function()
func.func @call_op_does_not_prevent_fusion(%arg0: memref<16xf32>){
  %A = memref.alloc() : memref<16xf32>
  %cst_1 = arith.constant 1.000000e+00 : f32
  affine.for %arg1 = 0 to 16 {
    %a = affine.load %arg0[%arg1] : memref<16xf32>
    affine.store %a, %A[%arg1] : memref<16xf32>
  }
  call @some_function() : () -> ()
  %B = memref.alloc() : memref<16xf32>
  affine.for %arg1 = 0 to 16 {
    %a = affine.load %A[%arg1] : memref<16xf32>
    %b = arith.addf %cst_1, %a : f32
    affine.store %b, %B[%arg1] : memref<16xf32>
  }
  return
}
// CHECK-LABEL: func @call_op_does_not_prevent_fusion
// CHECK: affine.for
// CHECK-NOT: affine.for

// -----

// Test for source that writes to an escaping memref and has two consumers.
// Fusion should create private memrefs in place of `%arg0` since the source is
// not to be removed after fusion and the destinations do not write to `%arg0`.
// This should enable both the consumers to benefit from fusion, which would not
// be possible if private memrefs were not created.
func.func @should_fuse_with_both_consumers_separately(%arg0: memref<10xf32>) {
  %cf7 = arith.constant 7.0 : f32
  affine.for %i0 = 0 to 10 {
    affine.store %cf7, %arg0[%i0] : memref<10xf32>
  }
  affine.for %i1 = 0 to 7 {
    %v0 = affine.load %arg0[%i1] : memref<10xf32>
  }
  affine.for %i1 = 5 to 9 {
    %v0 = affine.load %arg0[%i1] : memref<10xf32>
  }
  return
}

// CHECK-LABEL: func @should_fuse_with_both_consumers_separately
// CHECK: affine.for
// CHECK-NEXT: affine.store
// CHECK: affine.for
// CHECK-NEXT: affine.store
// CHECK-NEXT: affine.load
// CHECK: affine.for
// CHECK-NEXT: affine.store
// CHECK-NEXT: affine.load

// -----

// Fusion is avoided when the slice computed is invalid. Comments below describe
// incorrect backward slice computation. Similar logic applies for forward slice
// as well.
func.func @no_fusion_cannot_compute_valid_slice() {
  %A = memref.alloc() : memref<5xf32>
  %B = memref.alloc() : memref<6xf32>
  %C = memref.alloc() : memref<5xf32>
  %cst = arith.constant 0. : f32

  affine.for %arg0 = 0 to 5 {
    %a = affine.load %A[%arg0] : memref<5xf32>
    affine.store %a, %B[%arg0 + 1] : memref<6xf32>
  }

  affine.for %arg0 = 0 to 5 {
    // Backward slice computed will be:
    // slice ( src loop: 0, dst loop: 1, depth: 1 : insert point: (1, 0)
    // loop bounds: [(d0) -> (d0 - 1), (d0) -> (d0)] )

    // Resulting fusion would be as below. It is easy to note the out-of-bounds
    // access by 'affine.load'.

    // #map0 = affine_map<(d0) -> (d0 - 1)>
    // #map1 = affine_map<(d0) -> (d0)>
    // affine.for %arg1 = #map0(%arg0) to #map1(%arg0) {
    //   %5 = affine.load %1[%arg1] : memref<5xf32>
    //   ...
    //   ...
    // }

    %a = affine.load %B[%arg0] : memref<6xf32>
    %b = arith.mulf %a, %cst : f32
    affine.store %b, %C[%arg0] : memref<5xf32>
  }
  return
}
// CHECK-LABEL: func @no_fusion_cannot_compute_valid_slice
// CHECK: affine.for
// CHECK-NEXT: affine.load
// CHECK-NEXT: affine.store
// CHECK: affine.for
// CHECK-NEXT: affine.load
// CHECK-NEXT: arith.mulf
// CHECK-NEXT: affine.store

// MAXIMAL-LABEL: func @reduce_add_f32_f32(
func.func @reduce_add_f32_f32(%arg0: memref<64x64xf32, 1>, %arg1: memref<1x64xf32, 1>, %arg2: memref<1x64xf32, 1>) {
  %cst_0 = arith.constant 0.000000e+00 : f32
  %cst_1 = arith.constant 1.000000e+00 : f32
  %0 = memref.alloca() : memref<f32, 1>
  %1 = memref.alloca() : memref<f32, 1>
  affine.for %arg3 = 0 to 1 {
    affine.for %arg4 = 0 to 64 {
      %accum = affine.for %arg5 = 0 to 64 iter_args (%prevAccum = %cst_0) -> f32 {
        %4 = affine.load %arg0[%arg5, %arg4] : memref<64x64xf32, 1>
        %5 = arith.addf %prevAccum, %4 : f32
        affine.yield %5 : f32
      }
      %accum_dbl = arith.addf %accum, %accum : f32
      affine.store %accum_dbl, %arg1[%arg3, %arg4] : memref<1x64xf32, 1>
    }
  }
  affine.for %arg3 = 0 to 1 {
    affine.for %arg4 = 0 to 64 {
      %accum = affine.for %arg5 = 0 to 64 iter_args (%prevAccum = %cst_1) -> f32 {
        %4 = affine.load %arg0[%arg5, %arg4] : memref<64x64xf32, 1>
        %5 = arith.mulf %prevAccum, %4 : f32
        affine.yield %5 : f32
      }
      %accum_sqr = arith.mulf %accum, %accum : f32
      affine.store %accum_sqr, %arg2[%arg3, %arg4] : memref<1x64xf32, 1>
    }
  }
  return
}
// The two loops here get maximally sibling-fused at the innermost
// insertion point. Test checks if the innermost reduction loop of the fused loop
// gets promoted into its outerloop.
// MAXIMAL-SAME: %[[arg_0:.*]]: memref<64x64xf32, 1>,
// MAXIMAL-SAME: %[[arg_1:.*]]: memref<1x64xf32, 1>,
// MAXIMAL-SAME: %[[arg_2:.*]]: memref<1x64xf32, 1>) {
// MAXIMAL: %[[cst:.*]] = arith.constant 0 : index
// MAXIMAL-NEXT: %[[cst_0:.*]] = arith.constant 0.000000e+00 : f32
// MAXIMAL-NEXT: %[[cst_1:.*]] = arith.constant 1.000000e+00 : f32
// MAXIMAL: affine.for %[[idx_0:.*]] = 0 to 1 {
// MAXIMAL-NEXT: affine.for %[[idx_1:.*]] = 0 to 64 {
// MAXIMAL-NEXT: %[[results:.*]]:2 = affine.for %[[idx_2:.*]] = 0 to 64 iter_args(%[[iter_0:.*]] = %[[cst_1]], %[[iter_1:.*]] = %[[cst_0]]) -> (f32, f32) {
// MAXIMAL-NEXT: %[[val_0:.*]] = affine.load %[[arg_0]][%[[idx_2]], %[[idx_1]]] : memref<64x64xf32, 1>
// MAXIMAL-NEXT: %[[reduc_0:.*]] = arith.addf %[[iter_1]], %[[val_0]] : f32
// MAXIMAL-NEXT: %[[val_1:.*]] = affine.load %[[arg_0]][%[[idx_2]], %[[idx_1]]] : memref<64x64xf32, 1>
// MAXIMAL-NEXT: %[[reduc_1:.*]] = arith.mulf %[[iter_0]], %[[val_1]] : f32
// MAXIMAL-NEXT: affine.yield %[[reduc_1]], %[[reduc_0]] : f32, f32
// MAXIMAL-NEXT: }
// MAXIMAL-NEXT: %[[reduc_0_dbl:.*]] = arith.addf %[[results:.*]]#1, %[[results]]#1 : f32
// MAXIMAL-NEXT: affine.store %[[reduc_0_dbl]], %[[arg_1]][%[[cst]], %[[idx_1]]] : memref<1x64xf32, 1>
// MAXIMAL-NEXT: %[[reduc_1_sqr:.*]] = arith.mulf %[[results]]#0, %[[results]]#0 : f32
// MAXIMAL-NEXT: affine.store %[[reduc_1_sqr]], %[[arg_2]][%[[idx_0]], %[[idx_1]]] : memref<1x64xf32, 1>
// MAXIMAL-NEXT: }
// MAXIMAL-NEXT: }
// MAXIMAL-NEXT: return
// MAXIMAL-NEXT: }

// -----

// CHECK-LABEL: func @reduce_add_non_innermost
func.func @reduce_add_non_innermost(%arg0: memref<64x64xf32, 1>, %arg1: memref<1x64xf32, 1>, %arg2: memref<1x64xf32, 1>) {
  %cst = arith.constant 0.000000e+00 : f32
  %cst_0 = arith.constant 1.000000e+00 : f32
  %0 = memref.alloca() : memref<f32, 1>
  %1 = memref.alloca() : memref<f32, 1>
  affine.for %arg3 = 0 to 1 {
    affine.for %arg4 = 0 to 64 {
      %accum = affine.for %arg5 = 0 to 64 iter_args (%prevAccum = %cst) -> f32 {
        %4 = affine.load %arg0[%arg5, %arg4] : memref<64x64xf32, 1>
        %5 = arith.addf %prevAccum, %4 : f32
        affine.yield %5 : f32
      }
      %accum_dbl = arith.addf %accum, %accum : f32
      affine.store %accum_dbl, %arg1[%arg3, %arg4] : memref<1x64xf32, 1>
    }
  }
  affine.for %arg3 = 0 to 1 {
    affine.for %arg4 = 0 to 64 {
      %accum = affine.for %arg5 = 0 to 64 iter_args (%prevAccum = %cst_0) -> f32 {
        %4 = affine.load %arg0[%arg5, %arg4] : memref<64x64xf32, 1>
        %5 = arith.mulf %prevAccum, %4 : f32
        affine.yield %5 : f32
      }
      %accum_sqr = arith.mulf %accum, %accum : f32
      affine.store %accum_sqr, %arg2[%arg3, %arg4] : memref<1x64xf32, 1>
    }
  }
  return
}
// Test checks the loop structure is preserved after sibling fusion.
// CHECK: affine.for
// CHECK-NEXT: affine.for
// CHECK-NEXT: affine.for
// CHECK: affine.for



// -----

// CHECK-LABEL: func @fuse_large_number_of_loops
func.func @fuse_large_number_of_loops(%arg0: memref<20x10xf32, 1>, %arg1: memref<20x10xf32, 1>, %arg2: memref<20x10xf32, 1>, %arg3: memref<20x10xf32, 1>, %arg4: memref<20x10xf32, 1>, %arg5: memref<f32, 1>, %arg6: memref<f32, 1>, %arg7: memref<f32, 1>, %arg8: memref<f32, 1>, %arg9: memref<20x10xf32, 1>, %arg10: memref<20x10xf32, 1>, %arg11: memref<20x10xf32, 1>, %arg12: memref<20x10xf32, 1>) {
  %cst = arith.constant 1.000000e+00 : f32
  %0 = memref.alloc() : memref<f32, 1>
  affine.store %cst, %0[] : memref<f32, 1>
  %1 = memref.alloc() : memref<20x10xf32, 1>
  affine.for %arg13 = 0 to 20 {
    affine.for %arg14 = 0 to 10 {
      %21 = affine.load %arg6[] : memref<f32, 1>
      affine.store %21, %1[%arg13, %arg14] : memref<20x10xf32, 1>
    }
  }
  %2 = memref.alloc() : memref<20x10xf32, 1>
  affine.for %arg13 = 0 to 20 {
    affine.for %arg14 = 0 to 10 {
      %21 = affine.load %1[%arg13, %arg14] : memref<20x10xf32, 1>
      %22 = affine.load %arg3[%arg13, %arg14] : memref<20x10xf32, 1>
      %23 = arith.mulf %22, %21 : f32
      affine.store %23, %2[%arg13, %arg14] : memref<20x10xf32, 1>
    }
  }
  %3 = memref.alloc() : memref<f32, 1>
  %4 = affine.load %arg6[] : memref<f32, 1>
  %5 = affine.load %0[] : memref<f32, 1>
  %6 = arith.subf %5, %4 : f32
  affine.store %6, %3[] : memref<f32, 1>
  %7 = memref.alloc() : memref<20x10xf32, 1>
  affine.for %arg13 = 0 to 20 {
    affine.for %arg14 = 0 to 10 {
      %21 = affine.load %3[] : memref<f32, 1>
      affine.store %21, %7[%arg13, %arg14] : memref<20x10xf32, 1>
    }
  }
  %8 = memref.alloc() : memref<20x10xf32, 1>
  affine.for %arg13 = 0 to 20 {
    affine.for %arg14 = 0 to 10 {
      %21 = affine.load %arg1[%arg13, %arg14] : memref<20x10xf32, 1>
      %22 = affine.load %7[%arg13, %arg14] : memref<20x10xf32, 1>
      %23 = arith.mulf %22, %21 : f32
      affine.store %23, %8[%arg13, %arg14] : memref<20x10xf32, 1>
    }
  }
  %9 = memref.alloc() : memref<20x10xf32, 1>
  affine.for %arg13 = 0 to 20 {
    affine.for %arg14 = 0 to 10 {
      %21 = affine.load %arg1[%arg13, %arg14] : memref<20x10xf32, 1>
      %22 = affine.load %8[%arg13, %arg14] : memref<20x10xf32, 1>
      %23 = arith.mulf %22, %21 : f32
      affine.store %23, %9[%arg13, %arg14] : memref<20x10xf32, 1>
    }
  }
  affine.for %arg13 = 0 to 20 {
    affine.for %arg14 = 0 to 10 {
      %21 = affine.load %9[%arg13, %arg14] : memref<20x10xf32, 1>
      %22 = affine.load %2[%arg13, %arg14] : memref<20x10xf32, 1>
      %23 = arith.addf %22, %21 : f32
      affine.store %23, %arg11[%arg13, %arg14] : memref<20x10xf32, 1>
    }
  }
  %10 = memref.alloc() : memref<20x10xf32, 1>
  affine.for %arg13 = 0 to 20 {
    affine.for %arg14 = 0 to 10 {
      %21 = affine.load %1[%arg13, %arg14] : memref<20x10xf32, 1>
      %22 = affine.load %arg2[%arg13, %arg14] : memref<20x10xf32,
1> 962 %23 = arith.mulf %22, %21 : f32 963 affine.store %23, %10[%arg13, %arg14] : memref<20x10xf32, 1> 964 } 965 } 966 affine.for %arg13 = 0 to 20 { 967 affine.for %arg14 = 0 to 10 { 968 %21 = affine.load %8[%arg13, %arg14] : memref<20x10xf32, 1> 969 %22 = affine.load %10[%arg13, %arg14] : memref<20x10xf32, 1> 970 %23 = arith.addf %22, %21 : f32 971 affine.store %23, %arg10[%arg13, %arg14] : memref<20x10xf32, 1> 972 } 973 } 974 %11 = memref.alloc() : memref<20x10xf32, 1> 975 affine.for %arg13 = 0 to 20 { 976 affine.for %arg14 = 0 to 10 { 977 %21 = affine.load %arg10[%arg13, %arg14] : memref<20x10xf32, 1> 978 %22 = affine.load %arg10[%arg13, %arg14] : memref<20x10xf32, 1> 979 %23 = arith.mulf %22, %21 : f32 980 affine.store %23, %11[%arg13, %arg14] : memref<20x10xf32, 1> 981 } 982 } 983 %12 = memref.alloc() : memref<20x10xf32, 1> 984 affine.for %arg13 = 0 to 20 { 985 affine.for %arg14 = 0 to 10 { 986 %21 = affine.load %11[%arg13, %arg14] : memref<20x10xf32, 1> 987 %22 = affine.load %arg11[%arg13, %arg14] : memref<20x10xf32, 1> 988 %23 = arith.subf %22, %21 : f32 989 affine.store %23, %12[%arg13, %arg14] : memref<20x10xf32, 1> 990 } 991 } 992 %13 = memref.alloc() : memref<20x10xf32, 1> 993 affine.for %arg13 = 0 to 20 { 994 affine.for %arg14 = 0 to 10 { 995 %21 = affine.load %arg7[] : memref<f32, 1> 996 affine.store %21, %13[%arg13, %arg14] : memref<20x10xf32, 1> 997 } 998 } 999 %14 = memref.alloc() : memref<20x10xf32, 1> 1000 affine.for %arg13 = 0 to 20 { 1001 affine.for %arg14 = 0 to 10 { 1002 %21 = affine.load %arg4[%arg13, %arg14] : memref<20x10xf32, 1> 1003 %22 = affine.load %13[%arg13, %arg14] : memref<20x10xf32, 1> 1004 %23 = arith.mulf %22, %21 : f32 1005 affine.store %23, %14[%arg13, %arg14] : memref<20x10xf32, 1> 1006 } 1007 } 1008 %15 = memref.alloc() : memref<20x10xf32, 1> 1009 affine.for %arg13 = 0 to 20 { 1010 affine.for %arg14 = 0 to 10 { 1011 %21 = affine.load %arg8[] : memref<f32, 1> 1012 affine.store %21, %15[%arg13, %arg14] : memref<20x10xf32, 1> 
1013 } 1014 } 1015 %16 = memref.alloc() : memref<20x10xf32, 1> 1016 affine.for %arg13 = 0 to 20 { 1017 affine.for %arg14 = 0 to 10 { 1018 %21 = affine.load %15[%arg13, %arg14] : memref<20x10xf32, 1> 1019 %22 = affine.load %12[%arg13, %arg14] : memref<20x10xf32, 1> 1020 %23 = arith.addf %22, %21 : f32 1021 affine.store %23, %16[%arg13, %arg14] : memref<20x10xf32, 1> 1022 } 1023 } 1024 %17 = memref.alloc() : memref<20x10xf32, 1> 1025 affine.for %arg13 = 0 to 20 { 1026 affine.for %arg14 = 0 to 10 { 1027 %21 = affine.load %16[%arg13, %arg14] : memref<20x10xf32, 1> 1028 %22 = math.sqrt %21 : f32 1029 affine.store %22, %17[%arg13, %arg14] : memref<20x10xf32, 1> 1030 } 1031 } 1032 %18 = memref.alloc() : memref<20x10xf32, 1> 1033 affine.for %arg13 = 0 to 20 { 1034 affine.for %arg14 = 0 to 10 { 1035 %21 = affine.load %arg5[] : memref<f32, 1> 1036 affine.store %21, %18[%arg13, %arg14] : memref<20x10xf32, 1> 1037 } 1038 } 1039 %19 = memref.alloc() : memref<20x10xf32, 1> 1040 affine.for %arg13 = 0 to 20 { 1041 affine.for %arg14 = 0 to 10 { 1042 %21 = affine.load %arg1[%arg13, %arg14] : memref<20x10xf32, 1> 1043 %22 = affine.load %18[%arg13, %arg14] : memref<20x10xf32, 1> 1044 %23 = arith.mulf %22, %21 : f32 1045 affine.store %23, %19[%arg13, %arg14] : memref<20x10xf32, 1> 1046 } 1047 } 1048 %20 = memref.alloc() : memref<20x10xf32, 1> 1049 affine.for %arg13 = 0 to 20 { 1050 affine.for %arg14 = 0 to 10 { 1051 %21 = affine.load %17[%arg13, %arg14] : memref<20x10xf32, 1> 1052 %22 = affine.load %19[%arg13, %arg14] : memref<20x10xf32, 1> 1053 %23 = arith.divf %22, %21 : f32 1054 affine.store %23, %20[%arg13, %arg14] : memref<20x10xf32, 1> 1055 } 1056 } 1057 affine.for %arg13 = 0 to 20 { 1058 affine.for %arg14 = 0 to 10 { 1059 %21 = affine.load %20[%arg13, %arg14] : memref<20x10xf32, 1> 1060 %22 = affine.load %14[%arg13, %arg14] : memref<20x10xf32, 1> 1061 %23 = arith.addf %22, %21 : f32 1062 affine.store %23, %arg12[%arg13, %arg14] : memref<20x10xf32, 1> 1063 } 1064 } 1065 
affine.for %arg13 = 0 to 20 { 1066 affine.for %arg14 = 0 to 10 { 1067 %21 = affine.load %arg12[%arg13, %arg14] : memref<20x10xf32, 1> 1068 %22 = affine.load %arg0[%arg13, %arg14] : memref<20x10xf32, 1> 1069 %23 = arith.subf %22, %21 : f32 1070 affine.store %23, %arg9[%arg13, %arg14] : memref<20x10xf32, 1> 1071 } 1072 } 1073 return 1074} 1075// CHECK: affine.for 1076// CHECK: affine.for 1077// CHECK-NOT: affine.for 1078 1079// CHECK-LABEL: func @alias_escaping_memref 1080func.func @alias_escaping_memref(%a : memref<2x5xf32>) { 1081 %cst = arith.constant 0.000000e+00 : f32 1082 %alias = memref.reinterpret_cast %a to offset: [0], sizes: [10], strides: [1] : memref<2x5xf32> to memref<10xf32> 1083 affine.for %i0 = 0 to 10 { 1084 affine.store %cst, %alias[%i0] : memref<10xf32> 1085 } 1086 1087 affine.for %i1 = 0 to 10 { 1088 %0 = affine.load %alias[%i1] : memref<10xf32> 1089 } 1090 // Fusion happens, but memref isn't privatized since %alias is an alias of a 1091 // function argument. 1092 // CHECK: memref.reinterpret_cast 1093 // CHECK-NEXT: affine.for 1094 // CHECK-NEXT: affine.store %{{.*}}, %{{.*}}[%{{.*}}] : memref<10xf32> 1095 // CHECK-NEXT: affine.load %{{.*}}[%{{.*}}] : memref<10xf32> 1096 // CHECK-NEXT: } 1097 // CHECK-NOT: affine.for 1098 1099 return 1100} 1101 1102// CHECK-LABEL: func @unknown_memref_def_op 1103func.func @unknown_memref_def_op() { 1104 %cst = arith.constant 0.000000e+00 : f32 1105 %may_alias = call @bar() : () -> memref<10xf32> 1106 affine.for %i0 = 0 to 10 { 1107 affine.store %cst, %may_alias[%i0] : memref<10xf32> 1108 } 1109 1110 affine.for %i1 = 0 to 10 { 1111 %0 = affine.load %may_alias[%i1] : memref<10xf32> 1112 } 1113 // Fusion happens, but memref isn't privatized since %may_alias's origin is 1114 // unknown. 
1115 // CHECK: call 1116 // CHECK-NEXT: affine.for 1117 // CHECK-NEXT: affine.store %{{.*}}, %{{.*}}[%{{.*}}] : memref<10xf32> 1118 // CHECK-NEXT: affine.load %{{.*}}[%{{.*}}] : memref<10xf32> 1119 // CHECK-NEXT: } 1120 // CHECK-NOT: affine.for 1121 1122 return 1123} 1124func.func private @bar() -> memref<10xf32> 1125 1126 1127// Add further tests in mlir/test/Transforms/loop-fusion-4.mlir 1128