// RUN: mlir-opt %s -one-shot-bufferize="allow-return-allocs-from-loops bufferize-function-boundaries test-analysis-only" -split-input-file | FileCheck %s --check-prefixes=CHECK,PARALLEL-CHECK
// RUN: mlir-opt %s -one-shot-bufferize="allow-return-allocs-from-loops bufferize-function-boundaries test-analysis-only check-parallel-regions=false" -split-input-file | FileCheck %s --check-prefixes=CHECK,NO-PARALLEL-CHECK

// Run fuzzer with different seeds.
// RUN: mlir-opt %s -one-shot-bufferize="allow-return-allocs-from-loops bufferize-function-boundaries test-analysis-only analysis-heuristic=fuzzer analysis-fuzzer-seed=23" -split-input-file -o /dev/null
// RUN: mlir-opt %s -one-shot-bufferize="allow-return-allocs-from-loops bufferize-function-boundaries test-analysis-only analysis-heuristic=fuzzer analysis-fuzzer-seed=59" -split-input-file -o /dev/null
// RUN: mlir-opt %s -one-shot-bufferize="allow-return-allocs-from-loops bufferize-function-boundaries test-analysis-only analysis-heuristic=fuzzer analysis-fuzzer-seed=91" -split-input-file -o /dev/null

// CHECK-LABEL: func @scf_for_yield_only
func.func @scf_for_yield_only(
    %A : tensor<?xf32> {bufferization.writable = false},
    %B : tensor<?xf32> {bufferization.writable = true},
    %lb : index,
    %ub : index,
    %step : index)
  -> (tensor<?xf32>, tensor<?xf32>)
{
  // CHECK: scf.for
  // CHECK-NEXT: scf.yield
  // CHECK-SAME: {__inplace_operands_attr__ = ["true"]}
  // CHECK: } {__inplace_operands_attr__ = ["none", "none", "none", "false"]}
  %r0 = scf.for %i = %lb to %ub step %step iter_args(%t = %A) -> (tensor<?xf32>) {
    scf.yield %t : tensor<?xf32>
  }

  // CHECK: scf.for
  // CHECK-NEXT: scf.yield
  // CHECK-SAME: {__inplace_operands_attr__ = ["true"]}
  // CHECK: } {__inplace_operands_attr__ = ["none", "none", "none", "true"]}
  %r1 = scf.for %i = %lb to %ub step %step iter_args(%t = %B) -> (tensor<?xf32>) {
    scf.yield %t : tensor<?xf32>
  }

  // CHECK: return
  // CHECK-SAME: __equivalent_func_args__ = [-1, 1]
  return %r0, %r1: tensor<?xf32>, tensor<?xf32>
}

// -----

// CHECK-LABEL: func @scf_for_with_tensor.insert_slice
func.func @scf_for_with_tensor.insert_slice(
    %A : tensor<?xf32> {bufferization.writable = false},
    %B : tensor<?xf32> {bufferization.writable = true},
    %C : tensor<4xf32> {bufferization.writable = false},
    %lb : index,
    %ub : index,
    %step : index)
  -> (tensor<?xf32>, tensor<?xf32>)
{
  // CHECK: scf.for
  // scf.for bbArgs are always inplaceable as seen from ops inside the body:
  // 1. Either the matching init tensor is not inplaceable, in which case an
  //    alloc (and copy) is inserted and the bbArg refers to that inplaceable
  //    allocation.
  // 2. Or the init tensor is already inplaceable, and so is the bbArg.
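  // In this test, %A is the non-writable case (hence the "false" init operand
  // in the trailing loop check below) and %B is the writable case ("true").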
  // CHECK-NEXT: tensor.insert_slice
  // CHECK-SAME: {__inplace_operands_attr__ = ["true", "true"]}
  // CHECK-NEXT: tensor.insert_slice
  // CHECK-SAME: {__inplace_operands_attr__ = ["true", "true"]}
  // CHECK-NEXT: scf.yield {__inplace_operands_attr__ = ["true", "true"]}
  // CHECK-NEXT: } {__inplace_operands_attr__ = ["none", "none", "none", "false", "true"]}
  %r0:2 = scf.for %i = %lb to %ub step %step iter_args(%tA = %A, %tB = %B)
      -> (tensor<?xf32>, tensor<?xf32>)
  {
    %ttA = tensor.insert_slice %C into %tA[0][4][1] : tensor<4xf32> into tensor<?xf32>
    %ttB = tensor.insert_slice %C into %tB[0][4][1] : tensor<4xf32> into tensor<?xf32>
    scf.yield %ttA, %ttB : tensor<?xf32>, tensor<?xf32>
  }

  // CHECK: return
  // CHECK-SAME: __equivalent_func_args__ = [-1, 1]
  return %r0#0, %r0#1: tensor<?xf32>, tensor<?xf32>
}

// -----

func.func private @some_use(tensor<?xf32>) -> ()

// CHECK-LABEL: func @scf_for_deps
func.func @scf_for_deps(
    %A : tensor<?xf32> {bufferization.writable = true},
    %B : tensor<?xf32> {bufferization.writable = true},
    %lb : index,
    %ub : index,
    %step : index)
  -> (tensor<?xf32>)
{
  // %r0 must bufferize out of place: %A (the init value of %t) is read again
  // by the second loop below, which produces %r1.
  // CHECK: scf.for
  // CHECK-NEXT: call
  // CHECK-SAME: {__inplace_operands_attr__ = ["false"]}
  // CHECK-NEXT: scf.yield
  // CHECK-SAME: {__inplace_operands_attr__ = ["true"]}
  // CHECK: } {__inplace_operands_attr__ = ["none", "none", "none", "false"]}
  %r0 = scf.for %i = %lb to %ub step %step iter_args(%t = %A) -> (tensor<?xf32>) {
    func.call @some_use(%t) : (tensor<?xf32>) -> ()
    scf.yield %t : tensor<?xf32>
  }

  // %r1 bufferizes in place just fine: %A is not read again afterwards.
  // CHECK: scf.for
  // CHECK-NEXT: call
  // CHECK-SAME: {__inplace_operands_attr__ = ["false"]}
  // CHECK-NEXT: scf.yield
  // CHECK-SAME: {__inplace_operands_attr__ = ["true"]}
  // CHECK: } {__inplace_operands_attr__ = ["none", "none", "none", "true"]}
  %r1 = scf.for %i = %lb to %ub step %step iter_args(%t = %A) -> (tensor<?xf32>) {
    func.call @some_use(%t) : (tensor<?xf32>) -> ()
    scf.yield %t : tensor<?xf32>
  }

  // CHECK: return
  // CHECK-SAME: __equivalent_func_args__ = [0]
  return %r1: tensor<?xf32>
}

// -----

#accesses = [
  affine_map<(i) -> (i)>
]
#trait = {
  indexing_maps = #accesses,
  iterator_types = ["parallel"]
}

// CHECK-LABEL: func @reading_scf_for
func.func @reading_scf_for(%t1: tensor<?xf32> {bufferization.writable = true},
                           %s: index, %v: vector<5xf32>) -> (tensor<?xf32>, vector<5xf32>) {

  %c0 = arith.constant 0 : index
  %c1 = arith.constant 1 : index
  %cst = arith.constant 0.0 : f32

  // Write to %t1.
  // CHECK: vector.transfer_write
  // CHECK-SAME: __inplace_operands_attr__ = ["none", "false", "none"]
  %t3 = vector.transfer_write %v, %t1[%s] : vector<5xf32>, tensor<?xf32>

  // Read the old value of %t1 inside the loop via an alias.
  // CHECK: scf.for {{.*}} {
  %r, %v3 = scf.for %i = %c0 to %s step %c1 iter_args(%t2 = %t1, %v0 = %v) -> (tensor<?xf32>, vector<5xf32>) {
    // CHECK: tensor.extract_slice
    // CHECK-SAME: __inplace_operands_attr__ = ["true", "none", "none"]
    %e = tensor.extract_slice %t2[%s][%s][1] : tensor<?xf32> to tensor<?xf32>

    // Read from %t1 via alias %e.
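    // This read is what forces the vector.transfer_write above to bufferize
    // out-of-place ("false"): the old value of %t1 is still live here.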
    %v2 = vector.transfer_read %e[%s], %cst : tensor<?xf32>, vector<5xf32>
    scf.yield %t2, %v2 : tensor<?xf32>, vector<5xf32>
  }
  // CHECK: } {__inplace_operands_attr__ = ["none", "none", "none", "true", "none"]}

  // Use %t3 in some way without reading it, so that it does not get DCE'd.
  // CHECK: linalg.generic
  // CHECK-SAME: __inplace_operands_attr__ = ["true"]
  %o = linalg.generic #trait outs (%t3 : tensor<?xf32>) {
      ^bb(%0: f32) :
        linalg.yield %cst : f32
    } -> (tensor<?xf32>)

  return %o, %v3 : tensor<?xf32>, vector<5xf32>
}

// -----

#accesses = [
  affine_map<(i) -> (i)>
]
#trait = {
  indexing_maps = #accesses,
  iterator_types = ["parallel"]
}

// CHECK-LABEL: func @non_reading_scf_for
func.func @non_reading_scf_for(%t1: tensor<?xf32> {bufferization.writable = true},
                               %s: index, %v: vector<5xf32>) -> (tensor<?xf32>, vector<5xf32>) {

  %c0 = arith.constant 0 : index
  %c1 = arith.constant 1 : index
  %c10 = arith.constant 10 : index
  %cst = arith.constant 0.0 : f32

  // Write to %t1.
  // CHECK: vector.transfer_write
  // CHECK-SAME: __inplace_operands_attr__ = ["none", "true", "none"]
  %t3 = vector.transfer_write %v, %t1[%s] : vector<5xf32>, tensor<?xf32>

  // This loop does not read from %t1. It only writes to it.
  // CHECK: scf.for
  %r, %v3 = scf.for %i = %c0 to %c10 step %c1 iter_args(%t2 = %t1, %v0 = %v) -> (tensor<?xf32>, vector<5xf32>) {
    // Write to %t1 via %t2. (Overwrite %t3.)
    // CHECK: linalg.generic
    // CHECK-SAME: __inplace_operands_attr__ = ["true"]
    %o2 = linalg.generic #trait outs (%t2 : tensor<?xf32>) {
        ^bb(%0: f32) :
          linalg.yield %cst : f32
      } -> (tensor<?xf32>)

    // Read overwritten value. This is not a read of %t1.
    %v2 = vector.transfer_read %o2[%s], %cst : tensor<?xf32>, vector<5xf32>
    scf.yield %o2, %v2 : tensor<?xf32>, vector<5xf32>
  }

  // Use %t3 in some way without reading it, so that it does not get DCE'd.
  // CHECK: linalg.generic
  // CHECK-SAME: __inplace_operands_attr__ = ["true"]
  %o = linalg.generic #trait outs (%t3 : tensor<?xf32>) {
      ^bb(%0: f32) :
        linalg.yield %cst : f32
    } -> (tensor<?xf32>)

  // CHECK: return
  // CHECK-SAME: __equivalent_func_args__ = [0, -1]
  return %o, %v3 : tensor<?xf32>, vector<5xf32>
}

// -----

//===----------------------------------------------------------------------===//
// scf.if cases
//===----------------------------------------------------------------------===//

// This example passes analysis, but it fails when bufferizing.
// CHECK-LABEL: func @scf_if_inplace1
func.func @scf_if_inplace1(%t1: tensor<?xf32> {bufferization.writable = true},
                           %t2: tensor<?xf32> {bufferization.writable = true},
                           %cond: i1) -> tensor<?xf32> {
  %r = scf.if %cond -> (tensor<?xf32>) {
    // CHECK: scf.yield
    // CHECK-SAME: {__inplace_operands_attr__ = ["true"]}
    scf.yield %t1 : tensor<?xf32>
  } else {
    // CHECK: scf.yield
    // CHECK-SAME: {__inplace_operands_attr__ = ["true"]}
    scf.yield %t2 : tensor<?xf32>
  }
  return %r : tensor<?xf32>
}

// -----

// CHECK-LABEL: func @scf_if_inplace2
func.func @scf_if_inplace2(%t1: tensor<?xf32> {bufferization.writable = true},
                           %v: vector<5xf32>, %idx: index,
                           %cond: i1) -> tensor<?xf32> {
  %r = scf.if %cond -> (tensor<?xf32>) {
    // CHECK: scf.yield
    // CHECK-SAME: {__inplace_operands_attr__ = ["true"]}
    scf.yield %t1 : tensor<?xf32>
  } else {
    // CHECK: vector.transfer_write
    // CHECK-SAME: {__inplace_operands_attr__ = ["none", "true", "none"]
    %t2 = vector.transfer_write %v, %t1[%idx] : vector<5xf32>, tensor<?xf32>
    scf.yield %t2 : tensor<?xf32>
  }
  // CHECK: return
  // CHECK-SAME: __equivalent_func_args__ = [0]
  return %r : tensor<?xf32>
}

// -----

// CHECK-LABEL: func @scf_if_inplace3
func.func @scf_if_inplace3(%t1: tensor<?xf32> {bufferization.writable = true},
                           %v1: vector<5xf32>, %v2: vector<5xf32>, %idx: index,
                           %cond: i1) -> tensor<?xf32> {
  // CHECK: tensor.extract_slice
  // CHECK-SAME: {__inplace_operands_attr__ = ["true", "none", "none"]
  %e = tensor.extract_slice %t1[%idx][%idx][1] : tensor<?xf32> to tensor<?xf32>
  %r = scf.if %cond -> (tensor<?xf32>) {
    // CHECK: vector.transfer_write
    // CHECK-SAME: {__inplace_operands_attr__ = ["none", "true", "none"]
    %t2 = vector.transfer_write %v1, %e[%idx] : vector<5xf32>, tensor<?xf32>
    // CHECK: scf.yield
    // CHECK-SAME: {__inplace_operands_attr__ = ["true"]}
    scf.yield %t2 : tensor<?xf32>
  } else {
    // Writing the same tensor through an alias. This is OK.
    // CHECK: vector.transfer_write
    // CHECK-SAME: {__inplace_operands_attr__ = ["none", "true", "none"]
    %t3 = vector.transfer_write %v2, %t1[%idx] : vector<5xf32>, tensor<?xf32>
    // CHECK: scf.yield
    // CHECK-SAME: {__inplace_operands_attr__ = ["true"]}
    scf.yield %t3 : tensor<?xf32>
  }
  return %r : tensor<?xf32>
}

// -----

// CHECK-LABEL: func @scf_if_in_place4
func.func @scf_if_in_place4(%t1: tensor<?xf32> {bufferization.writable = true},
                            %v: vector<5xf32>, %idx: index,
                            %cond: i1, %cond2: i1) -> (tensor<?xf32>, vector<10xf32>) {
  %cst = arith.constant 0.0 : f32
  %r = scf.if %cond -> (tensor<?xf32>) {
    // CHECK: scf.yield
    // CHECK-SAME: {__inplace_operands_attr__ = ["true"]}
    scf.yield %t1 : tensor<?xf32>
  } else {
    // CHECK: vector.transfer_write
    // CHECK-SAME: {__inplace_operands_attr__ = ["none", "true", "none"]
    %t2 = vector.transfer_write %v, %t1[%idx] : vector<5xf32>, tensor<?xf32>
    // CHECK: scf.yield
    // CHECK-SAME: {__inplace_operands_attr__ = ["true"]}
    scf.yield %t2 : tensor<?xf32>
  }
  %r_alias = scf.if %cond2 -> (tensor<?xf32>) {
    // Reading %r is OK. No conflict.
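    // Both branches merely forward %r, so %r_alias is equivalent to %r and no
    // copy is needed.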
    // CHECK: scf.yield
    // CHECK-SAME: {__inplace_operands_attr__ = ["true"]}
    scf.yield %r : tensor<?xf32>
  } else {
    // CHECK: scf.yield
    // CHECK-SAME: {__inplace_operands_attr__ = ["true"]}
    scf.yield %r : tensor<?xf32>
  }
  %v2 = vector.transfer_read %r_alias[%idx], %cst : tensor<?xf32>, vector<10xf32>

  // CHECK: return
  // CHECK-SAME: __equivalent_func_args__ = [0, -1]
  return %r_alias, %v2 : tensor<?xf32>, vector<10xf32>
}

// -----

// CHECK-LABEL: func @scf_if_inplace5
func.func @scf_if_inplace5(%t1: tensor<?xf32> {bufferization.writable = true},
                           %idx: index, %cond: i1) -> tensor<?xf32> {
  %r = scf.if %cond -> (tensor<?xf32>) {
    // CHECK: tensor.extract_slice
    // CHECK-SAME: {__inplace_operands_attr__ = ["true", "none", "none"]
    %e = tensor.extract_slice %t1[%idx][%idx][1] : tensor<?xf32> to tensor<?xf32>
    // CHECK: scf.yield
    // CHECK-SAME: {__inplace_operands_attr__ = ["true"]}
    scf.yield %e : tensor<?xf32>
  } else {
    // CHECK: tensor.extract_slice
    // CHECK-SAME: {__inplace_operands_attr__ = ["true", "none", "none"]
    %f = tensor.extract_slice %t1[%idx][%idx][1] : tensor<?xf32> to tensor<?xf32>
    // CHECK: scf.yield
    // CHECK-SAME: {__inplace_operands_attr__ = ["true"]}
    scf.yield %f : tensor<?xf32>
  }

  // Inserting into an equivalent tensor at the same offset. This bufferizes
  // inplace.
  // CHECK: tensor.insert_slice
  // CHECK-SAME: {__inplace_operands_attr__ = ["true", "true", "none", "none"]
  %r2 = tensor.insert_slice %r into %t1[%idx][%idx][1] : tensor<?xf32> into tensor<?xf32>

  // CHECK: return
  // CHECK-SAME: __equivalent_func_args__ = [0]
  return %r2 : tensor<?xf32>
}

// -----

// CHECK-LABEL: func @scf_if_inplace6
func.func @scf_if_inplace6(%t1: tensor<?xf32> {bufferization.writable = true},
                           %v1: vector<5xf32>, %v2: vector<5xf32>,
                           %v3: vector<5xf32>, %idx: index,
                           %cond: i1, %cond2: i1) -> tensor<?xf32> {
  // Test nested scf.if ops.
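  // All three vector.transfer_writes can be in-place: the branches are
  // mutually exclusive and the original value of %t1 is not read afterwards.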
  %r = scf.if %cond -> (tensor<?xf32>) {
    %t2 = scf.if %cond2 -> (tensor<?xf32>) {
      // CHECK: vector.transfer_write
      // CHECK-SAME: {__inplace_operands_attr__ = ["none", "true", "none"]
      %t3 = vector.transfer_write %v1, %t1[%idx] : vector<5xf32>, tensor<?xf32>
      // CHECK: scf.yield
      // CHECK-SAME: {__inplace_operands_attr__ = ["true"]}
      scf.yield %t3 : tensor<?xf32>
    } else {
      // CHECK: vector.transfer_write
      // CHECK-SAME: {__inplace_operands_attr__ = ["none", "true", "none"]
      %t4 = vector.transfer_write %v3, %t1[%idx] : vector<5xf32>, tensor<?xf32>
      // CHECK: scf.yield
      // CHECK-SAME: {__inplace_operands_attr__ = ["true"]}
      scf.yield %t4 : tensor<?xf32>
    }
    // CHECK: scf.yield
    // CHECK-SAME: {__inplace_operands_attr__ = ["true"]}
    scf.yield %t2 : tensor<?xf32>
  } else {
    // CHECK: vector.transfer_write
    // CHECK-SAME: {__inplace_operands_attr__ = ["none", "true", "none"]
    %t3 = vector.transfer_write %v2, %t1[%idx] : vector<5xf32>, tensor<?xf32>
    // CHECK: scf.yield
    // CHECK-SAME: {__inplace_operands_attr__ = ["true"]}
    scf.yield %t3 : tensor<?xf32>
  }

  // CHECK: return
  // CHECK-SAME: __equivalent_func_args__ = [0]
  return %r : tensor<?xf32>
}

// -----

// CHECK-LABEL: func @scf_if_inplace7
func.func @scf_if_inplace7(%t1: tensor<?xf32> {bufferization.writable = true},
                           %v1: vector<5xf32>, %v2: vector<5xf32>, %idx: index,
                           %idx2: index, %cond: i1) -> (tensor<?xf32>, vector<5xf32>) {
  %cst = arith.constant 0.0 : f32
  %r, %v_r2 = scf.if %cond -> (tensor<?xf32>, vector<5xf32>) {
    // CHECK: vector.transfer_write
    // CHECK-SAME: {__inplace_operands_attr__ = ["none", "true", "none"]
    %t2 = vector.transfer_write %v1, %t1[%idx] : vector<5xf32>, tensor<?xf32>
    // CHECK: scf.yield
    // CHECK-SAME: {__inplace_operands_attr__ = ["true", "none"]}
    scf.yield %t2, %v1 : tensor<?xf32>, vector<5xf32>
  } else {
    // Writing the same tensor through an alias.
    // CHECK: vector.transfer_write
    // CHECK-SAME: {__inplace_operands_attr__ = ["none", "false", "none"]
    %t3 = vector.transfer_write %v2, %t1[%idx] : vector<5xf32>, tensor<?xf32>
    // Read the original value of %t1. This requires the write in this branch
    // to be out-of-place. But the write in the other branch can still be
    // inplace.
    %v_r = vector.transfer_read %t1[%idx2], %cst : tensor<?xf32>, vector<5xf32>
    // CHECK: scf.yield
    // CHECK-SAME: {__inplace_operands_attr__ = ["true", "none"]}
    scf.yield %t3, %v_r : tensor<?xf32>, vector<5xf32>
  }
  return %r, %v_r2 : tensor<?xf32>, vector<5xf32>
}

// -----

// CHECK-LABEL: func @scf_if_out_of_place1a
func.func @scf_if_out_of_place1a(%t1: tensor<?xf32> {bufferization.writable = true},
                                 %idx: index, %idx2: index,
                                 %cond: i1) -> tensor<?xf32> {
  %r = scf.if %cond -> (tensor<?xf32>) {
    // CHECK: tensor.extract_slice
    // CHECK-SAME: {__inplace_operands_attr__ = ["true", "none", "none"]
    %e = tensor.extract_slice %t1[%idx][%idx][1] : tensor<?xf32> to tensor<?xf32>
    // CHECK: scf.yield
    // CHECK-SAME: {__inplace_operands_attr__ = ["true"]}
    scf.yield %e : tensor<?xf32>
  } else {
    // CHECK: scf.yield
    // CHECK-SAME: {__inplace_operands_attr__ = ["true"]}
    scf.yield %t1 : tensor<?xf32>
  }

  // Reading from and writing to the same tensor via different args. This is a
  // conflict.
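  // %r aliases %t1 (the "else" branch yields %t1 directly), so the destination
  // operand of the tensor.insert_slice below is "false".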
  // CHECK: tensor.insert_slice
  // CHECK-SAME: {__inplace_operands_attr__ = ["true", "false", "none", "none"]
  %r2 = tensor.insert_slice %r into %t1[%idx2][%idx2][1] : tensor<?xf32> into tensor<?xf32>
  return %r2 : tensor<?xf32>
}

// -----

// CHECK-LABEL: func @scf_if_out_of_place1b
func.func @scf_if_out_of_place1b(%t1: tensor<?xf32> {bufferization.writable = true},
                                 %idx: index, %idx2: index, %idx3: index,
                                 %cond: i1) -> tensor<?xf32> {
  %r = scf.if %cond -> (tensor<?xf32>) {
    // CHECK: tensor.extract_slice
    // CHECK-SAME: {__inplace_operands_attr__ = ["false", "none", "none"]
    %e = tensor.extract_slice %t1[%idx][%idx][1] : tensor<?xf32> to tensor<?xf32>
    // CHECK: scf.yield
    // CHECK-SAME: {__inplace_operands_attr__ = ["true"]}
    scf.yield %e : tensor<?xf32>
  } else {
    // CHECK: tensor.extract_slice
    // CHECK-SAME: {__inplace_operands_attr__ = ["false", "none", "none"]
    %f = tensor.extract_slice %t1[%idx2][%idx2][1] : tensor<?xf32> to tensor<?xf32>
    // CHECK: scf.yield
    // CHECK-SAME: {__inplace_operands_attr__ = ["true"]}
    scf.yield %f : tensor<?xf32>
  }

  // Reading from and writing to the same tensor via different args. This is a
  // conflict. In contrast to scf_if_out_of_place1a, the fact that %r aliases
  // with %t1 is only detected when analyzing the tensor.extract_slices. That's
  // why the tensor.insert_slice is inplace and the two extract_slices are
  // out-of-place.
  // CHECK: tensor.insert_slice
  // CHECK-SAME: {__inplace_operands_attr__ = ["true", "true", "none", "none"]
  %r2 = tensor.insert_slice %r into %t1[%idx3][%idx3][1] : tensor<?xf32> into tensor<?xf32>

  // CHECK: return
  // CHECK-SAME: __equivalent_func_args__ = [0]
  return %r2 : tensor<?xf32>
}

// -----

// CHECK-LABEL: func @scf_if_out_of_place1c
func.func @scf_if_out_of_place1c(%t1: tensor<?xf32> {bufferization.writable = true},
                                 %idx: index, %idx2: index, %cond: i1) -> tensor<?xf32> {
  %r = scf.if %cond -> (tensor<?xf32>) {
    // CHECK: tensor.extract_slice
    // CHECK-SAME: {__inplace_operands_attr__ = ["false", "none", "none"]
    %e = tensor.extract_slice %t1[%idx][%idx][1] : tensor<?xf32> to tensor<?xf32>
    // CHECK: scf.yield
    // CHECK-SAME: {__inplace_operands_attr__ = ["true"]}
    scf.yield %e : tensor<?xf32>
  } else {
    // TODO: This one could bufferize inplace, but the analysis is too restrictive.
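    // (It extracts the same [%idx2][%idx2] slice of %t1 that the
    // tensor.insert_slice below writes back to, similar to @scf_if_inplace5.)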
    // CHECK: tensor.extract_slice
    // CHECK-SAME: {__inplace_operands_attr__ = ["false", "none", "none"]
    %f = tensor.extract_slice %t1[%idx2][%idx2][1] : tensor<?xf32> to tensor<?xf32>
    // CHECK: scf.yield
    // CHECK-SAME: {__inplace_operands_attr__ = ["true"]}
    scf.yield %f : tensor<?xf32>
  }

  // CHECK: tensor.insert_slice
  // CHECK-SAME: {__inplace_operands_attr__ = ["true", "true", "none", "none"]
  %r2 = tensor.insert_slice %r into %t1[%idx2][%idx2][1] : tensor<?xf32> into tensor<?xf32>

  // CHECK: return
  // CHECK-SAME: __equivalent_func_args__ = [0]
  return %r2 : tensor<?xf32>
}

// -----

// CHECK-LABEL: func @scf_if_out_of_place2
func.func @scf_if_out_of_place2(%t1: tensor<?xf32> {bufferization.writable = true},
                                %v: vector<5xf32>, %idx: index,
                                %cond: i1) -> (tensor<?xf32>, vector<10xf32>) {
  %cst = arith.constant 0.0 : f32
  %r = scf.if %cond -> (tensor<?xf32>) {
    scf.yield %t1 : tensor<?xf32>
  } else {
    // CHECK: vector.transfer_write
    // CHECK-SAME: {__inplace_operands_attr__ = ["none", "false", "none"]
    %t2 = vector.transfer_write %v, %t1[%idx] : vector<5xf32>, tensor<?xf32>
    // CHECK: scf.yield
    // CHECK-SAME: {__inplace_operands_attr__ = ["true"]}
    scf.yield %t2 : tensor<?xf32>
  }

  // Read the old value of %t1. Forces the transfer_write to bufferize
  // out-of-place.
  %v2 = vector.transfer_read %t1[%idx], %cst : tensor<?xf32>, vector<10xf32>
  return %r, %v2 : tensor<?xf32>, vector<10xf32>
}

// -----

// CHECK-LABEL: func @scf_if_out_of_place3
func.func @scf_if_out_of_place3(%t1: tensor<?xf32> {bufferization.writable = true},
                                %v: vector<5xf32>, %idx: index,
                                %cond: i1, %cond2: i1) -> (tensor<?xf32>, vector<10xf32>) {
  %cst = arith.constant 0.0 : f32
  %r = scf.if %cond -> (tensor<?xf32>) {
    scf.yield %t1 : tensor<?xf32>
  } else {
    // CHECK: vector.transfer_write
    // CHECK-SAME: {__inplace_operands_attr__ = ["none", "false", "none"]
    %t2 = vector.transfer_write %v, %t1[%idx] : vector<5xf32>, tensor<?xf32>
    // CHECK: scf.yield
    // CHECK-SAME: {__inplace_operands_attr__ = ["true"]}
    scf.yield %t2 : tensor<?xf32>
  }
  %t1_alias = scf.if %cond2 -> (tensor<?xf32>) {
    // scf.yield bufferizes to a read. That is a conflict in this example.
    // CHECK: scf.yield
    // CHECK-SAME: {__inplace_operands_attr__ = ["true"]}
    scf.yield %t1 : tensor<?xf32>
  } else {
    // CHECK: scf.yield
    // CHECK-SAME: {__inplace_operands_attr__ = ["true"]}
    scf.yield %t1 : tensor<?xf32>
  }
  %v2 = vector.transfer_read %t1_alias[%idx], %cst : tensor<?xf32>, vector<10xf32>
  return %r, %v2 : tensor<?xf32>, vector<10xf32>
}

// -----

// CHECK-LABEL: func @write_to_same_tensor_in_loop_in_place(
func.func @write_to_same_tensor_in_loop_in_place(
    %A : tensor<?xf32> {bufferization.writable = true},
    %lb : index, %ub : index, %step : index, %sz: index)
  -> (tensor<?xf32>)
{
  // CHECK: scf.for {{.*}} {
  %r0 = scf.for %i = %lb to %ub step %step iter_args(%t = %A) -> (tensor<?xf32>) {
    %B = bufferization.alloc_tensor(%sz) : tensor<?xf32>
    %i2 = arith.index_cast %i : index to i32
    %i3 = arith.sitofp %i2 : i32 to f32
    // The tensor.insert is in-place because %B is defined inside the loop.
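    // Each iteration allocates a fresh %B, so writing into it cannot conflict
    // with a read in any other iteration.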
    // CHECK: tensor.insert
    // CHECK-SAME: {__inplace_operands_attr__ = ["none", "true", "none"]}
    %B2 = tensor.insert %i3 into %B[%i] : tensor<?xf32>
    // CHECK: tensor.insert_slice
    // CHECK-SAME: {__inplace_operands_attr__ = ["true", "true", "none", "none"]}
    %A2 = tensor.insert_slice %B2 into %t[%i][%sz][1] : tensor<?xf32> into tensor<?xf32>
    scf.yield %A2 : tensor<?xf32>
  }
  // CHECK: } {__inplace_operands_attr__ = ["none", "none", "none", "true"]}

  return %r0 : tensor<?xf32>
}

// -----

// This is a regression test. Everything can bufferize in-place because %7 and
// %arg1 are in the same repetitive region.

// CHECK-LABEL: func @same_enclosing_repetitive_region
func.func @same_enclosing_repetitive_region(%2: tensor<320xf32>,
                                            %3: tensor<320x10240xf32>)
  -> tensor<320xf32>
{
  %c0 = arith.constant 0 : index
  %cst = arith.constant -0.000000e+00 : f32
  %c320 = arith.constant 320 : index
  %4 = scf.forall (%arg0) in (%c320) shared_outs(%arg1 = %2) -> (tensor<320xf32>) {
    // CHECK: tensor.extract_slice {{.*}} {__inplace_operands_attr__ = ["true", "none"]}
    %5 = tensor.extract_slice %3[%arg0, 0] [1, 10240] [1, 1] : tensor<320x10240xf32> to tensor<1x10240xf32>
    // CHECK: tensor.extract_slice {{.*}} {__inplace_operands_attr__ = ["true", "none"]}
    %6 = tensor.extract_slice %arg1[%arg0] [1] [1] : tensor<320xf32> to tensor<1xf32>
    // CHECK: linalg.fill {__inplace_operands_attr__ = ["none", "true"]}
    %7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<1xf32>) -> tensor<1xf32>
    // CHECK: linalg.fill {__inplace_operands_attr__ = ["none", "true"]}
    %8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<1xf32>) -> tensor<1xf32>

    scf.forall.in_parallel {
      // CHECK: tensor.parallel_insert_slice {{.*}} {__inplace_operands_attr__ = ["true", "true", "none"]}
      tensor.parallel_insert_slice %8 into %arg1[%arg0] [1] [1] : tensor<1xf32> into tensor<320xf32>
    }
  }
  return %4 : tensor<320xf32>
}

// -----

// CHECK-LABEL: different_repetitive_region_via_alias
func.func @different_repetitive_region_via_alias(%arg0: tensor<4xf32>,
                                                 %arg1: tensor<4xf32>,
                                                 %arg2: index,
                                                 %arg3: index,
                                                 %arg4: index)
  -> (tensor<4xf32>)
{
  %cst = arith.constant 0.000000e+00 : f32
  %cst2 = arith.constant 1.000000e+00 : f32
  %0 = bufferization.alloc_tensor() : tensor<4xf32>

  // CHECK: linalg.fill {__inplace_operands_attr__ = ["none", "false"]}
  %1 = linalg.fill ins(%cst : f32) outs(%0 : tensor<4xf32>) -> tensor<4xf32>

  %2 = scf.for %arg5 = %arg2 to %arg3 step %arg4 iter_args(%arg6 = %arg1) -> (tensor<4xf32>) {
    // CHECK: tensor.extract {{.*}} {__inplace_operands_attr__ = ["true", "none"]}
    %4 = tensor.extract %1[%arg4] : tensor<4xf32>
    vector.print %4 : f32
    // CHECK: linalg.fill {__inplace_operands_attr__ = ["none", "true"]}
    %5 = linalg.fill ins(%cst2 : f32) outs(%0 : tensor<4xf32>) -> tensor<4xf32>
    scf.yield %5 : tensor<4xf32>
  }

  return %2 : tensor<4xf32>
}

// -----

// CHECK-LABEL: no_raw_conflict_after_repetitive_use
func.func @no_raw_conflict_after_repetitive_use(%arg0: tensor<4xf32>,
                                                %arg1: tensor<4xf32>,
                                                %arg2: index,
                                                %arg3: index,
                                                %arg4: index)
  -> (tensor<4xf32>, tensor<4xf32>)
{
  %cst = arith.constant 0.000000e+00 : f32
  %cst2 = arith.constant 1.000000e+00 : f32
  %cst3 = arith.constant 2.000000e+00 : f32
  %0 = bufferization.alloc_tensor() : tensor<4xf32>

  // CHECK: linalg.fill {__inplace_operands_attr__ = ["none", "true"]}
  %1 = linalg.fill ins(%cst : f32) outs(%0 : tensor<4xf32>) -> tensor<4xf32>

  %2 = scf.for %arg5 = %arg2 to %arg3 step %arg4 iter_args(%arg6 = %arg1) -> (tensor<4xf32>) {
    // CHECK: tensor.extract {{.*}} {__inplace_operands_attr__ = ["true", "none"]}
    %4 = tensor.extract %1[%arg4] : tensor<4xf32>
    vector.print %4 : f32
    // CHECK: linalg.fill {__inplace_operands_attr__ = ["none", "false"]}
    %5 = linalg.fill ins(%cst2 : f32) outs(%1 : tensor<4xf32>) -> tensor<4xf32>
    scf.yield %5 : tensor<4xf32>
  }

  // The following is *not* a RaW conflict.
  // CHECK: tensor.extract {{.*}} {__inplace_operands_attr__ = ["true", "none"]}
  %6 = tensor.extract %1[%arg4] : tensor<4xf32>
  vector.print %6 : f32
  // CHECK: linalg.fill {__inplace_operands_attr__ = ["none", "true"]}
  %7 = linalg.fill ins(%cst3 : f32) outs(%1 : tensor<4xf32>) -> tensor<4xf32>

  return %2, %7 : tensor<4xf32>, tensor<4xf32>
}

// -----

// CHECK-LABEL: func @read_of_bbarg_in_repetitive_region(
func.func @read_of_bbarg_in_repetitive_region(
    %t: tensor<10xf32>, %a: index, %b: index, %c: index, %cst: f32) {
  // CHECK: scf.for
  scf.for %iv = %a to %b step %c {
    // Must bufferize out-of-place because the definition of the read value is
    // in a different repetitive region.
    // CHECK: tensor.extract_slice {{.*}} {__inplace_operands_attr__ = ["true"]}
    %2 = tensor.extract_slice %t[0][4][1] : tensor<10xf32> to tensor<4xf32>
    %3 = tensor.extract %2[%a] : tensor<4xf32>
    vector.print %3 : f32
    // CHECK: tensor.insert {{.*}} {__inplace_operands_attr__ = ["none", "false", "none"]}
    %4 = tensor.insert %cst into %2[%a] : tensor<4xf32>
    %5 = tensor.extract %4[%a] : tensor<4xf32>
    vector.print %5 : f32
  }
  return
}

// -----

// CHECK-LABEL: func @read_definition_in_same_repetitive_region_as_write(
func.func @read_definition_in_same_repetitive_region_as_write(
    %t: tensor<10xf32>, %a: index, %b: index, %c: index, %cst: f32) {
  // CHECK: tensor.insert {{.*}} {__inplace_operands_attr__ = ["none", "true", "none"]}
  %1 = tensor.insert %cst into %t[%a] : tensor<10xf32>
  // CHECK: scf.for
  scf.for %iv = %a to %b step %c {
    // Can bufferize in-place.
    // CHECK: tensor.extract_slice {{.*}} {__inplace_operands_attr__ = ["true"]}
    %2 = tensor.extract_slice %1[0][4][1] : tensor<10xf32> to tensor<4xf32>
    %3 = tensor.extract %2[%a] : tensor<4xf32>
    vector.print %3 : f32
  }
  return
}

// -----

// CHECK-LABEL: func @read_definition_in_same_repetitive_region_as_conflicting_write(
func.func @read_definition_in_same_repetitive_region_as_conflicting_write(
    %t: tensor<10xf32>, %a: index, %b: index, %c: index, %cst: f32) {
  // Cannot bufferize in-place according to normal op dominance rules.
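  // The tensor.insert writes %t before the loop, but the loop keeps reading
  // %t; an in-place write would leak the inserted value into those reads.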
  // CHECK: tensor.insert {{.*}} {__inplace_operands_attr__ = ["none", "false", "none"]}
  %1 = tensor.insert %cst into %t[%a] : tensor<10xf32>
  // CHECK: scf.for
  scf.for %iv = %a to %b step %c {
    // CHECK: tensor.extract_slice {{.*}} {__inplace_operands_attr__ = ["true"]}
    %2 = tensor.extract_slice %t[0][4][1] : tensor<10xf32> to tensor<4xf32>
    %3 = tensor.extract %2[%a] : tensor<4xf32>
    vector.print %3 : f32
  }
  return
}

// -----

// CHECK: func @write_value_in_repetitive_region(
func.func @write_value_in_repetitive_region(
    %t: tensor<10xf32>, %a: index, %b: index, %c: index, %cst: f32) {
  %0 = tensor.extract %t[%a] : tensor<10xf32>
  vector.print %0 : f32

  scf.for %iv = %a to %b step %c {
    // No further read of %0, so this can bufferize in-place.
    // CHECK: tensor.extract_slice {{.*}} {__inplace_operands_attr__ = ["true"]}
    %2 = tensor.extract_slice %t[0][4][1] : tensor<10xf32> to tensor<4xf32>
    // CHECK: linalg.fill {__inplace_operands_attr__ = ["none", "true"]}
    %filled = linalg.fill ins(%cst : f32) outs(%2 : tensor<4xf32>) -> tensor<4xf32>
    %3 = tensor.extract %filled[%a] : tensor<4xf32>
    vector.print %3 : f32
  }
  return
}

// -----

// CHECK-LABEL: func @nesting_op_repetitive_regions(
func.func @nesting_op_repetitive_regions(
    %t: tensor<10xf32>, %a: index, %b: index, %c: index, %cst: f32) {
  // Cannot bufferize in-place according to normal op dominance rules.
  // CHECK: tensor.insert {{.*}} {__inplace_operands_attr__ = ["none", "false", "none"]}
  %1 = tensor.insert %cst into %t[%a] : tensor<10xf32>
  // CHECK: scf.for
  scf.for %iv1 = %a to %b step %c {
    // CHECK: scf.for
    scf.for %iv2 = %a to %b step %c {
      // CHECK: scf.for
      scf.for %iv3 = %a to %b step %c {
        // CHECK: tensor.extract_slice {{.*}} {__inplace_operands_attr__ = ["true"]}
        %2 = tensor.extract_slice %t[0][4][1] : tensor<10xf32> to tensor<4xf32>
        %3 = tensor.extract %2[%a] : tensor<4xf32>
        vector.print %3 : f32
      }
    }
  }
  return
}

// -----

// CHECK-LABEL: func @parallel_region()
func.func @parallel_region() -> tensor<320xf32>
{
  %alloc0 = bufferization.alloc_tensor() : tensor<320xf32>
  %alloc1 = bufferization.alloc_tensor() : tensor<1xf32>
  %c320 = arith.constant 320 : index
  // CHECK: scf.forall
  %0 = scf.forall (%arg0) in (%c320) shared_outs(%arg1 = %alloc0) -> (tensor<320xf32>) {
    %val = "test.foo"() : () -> (f32)
    // linalg.fill must bufferize out-of-place because every thread needs a
    // private copy of %alloc1. If not accounting for parallel regions, the fill
    // can bufferize in place.
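    // (check-parallel-regions=false restores the old behavior; that mode is
    // exercised by the NO-PARALLEL-CHECK prefix.)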
    // PARALLEL-CHECK: linalg.fill {__inplace_operands_attr__ = ["none", "false"]}
    // NO-PARALLEL-CHECK: linalg.fill {__inplace_operands_attr__ = ["none", "true"]}
    %fill = linalg.fill ins(%val : f32) outs(%alloc1 : tensor<1xf32>) -> tensor<1xf32>
    scf.forall.in_parallel {
      // CHECK: tensor.parallel_insert_slice {{.*}} {__inplace_operands_attr__ = ["true", "true", "none"]}
      tensor.parallel_insert_slice %fill into %arg1[%arg0] [1] [1] : tensor<1xf32> into tensor<320xf32>
    }
  }
  // CHECK: } {__inplace_operands_attr__ = ["none", "true"]}
  return %0 : tensor<320xf32>
}

// -----

// CHECK-LABEL: func @parallel_region_mixed_def(
func.func @parallel_region_mixed_def(%c: i1) -> tensor<320xf32>
{
  %alloc0 = bufferization.alloc_tensor() : tensor<320xf32>
  %alloc1 = bufferization.alloc_tensor() : tensor<1xf32>
  %c320 = arith.constant 320 : index
  // CHECK: scf.forall
  %0 = scf.forall (%arg0) in (%c320) shared_outs(%arg1 = %alloc0) -> (tensor<320xf32>) {
    %alloc2 = bufferization.alloc_tensor() : tensor<1xf32>
    %selected = scf.if %c -> tensor<1xf32> {
      scf.yield %alloc1 : tensor<1xf32>
    } else {
      scf.yield %alloc2 : tensor<1xf32>
    }
    %val = "test.foo"() : () -> (f32)
    // linalg.fill must bufferize out-of-place because every thread needs a
    // private copy of %alloc1. If not accounting for parallel regions, the fill
    // can bufferize in place.
    // PARALLEL-CHECK: linalg.fill {__inplace_operands_attr__ = ["none", "false"]}
    // NO-PARALLEL-CHECK: linalg.fill {__inplace_operands_attr__ = ["none", "true"]}
    %fill = linalg.fill ins(%val : f32) outs(%selected : tensor<1xf32>) -> tensor<1xf32>
    scf.forall.in_parallel {
      // CHECK: tensor.parallel_insert_slice {{.*}} {__inplace_operands_attr__ = ["true", "true", "none"]}
      tensor.parallel_insert_slice %fill into %arg1[%arg0] [1] [1] : tensor<1xf32> into tensor<320xf32>
    }
  }
  // CHECK: } {__inplace_operands_attr__ = ["none", "true"]}
  return %0 : tensor<320xf32>
}

// -----

// CHECK-LABEL: func @parallel_region_two_writes(
func.func @parallel_region_two_writes(%f: f32) -> tensor<320xf32>
{
  %alloc0 = bufferization.alloc_tensor() : tensor<320xf32>
  %alloc1 = bufferization.alloc_tensor() : tensor<1xf32>
  %c320 = arith.constant 320 : index
  %c0 = arith.constant 0 : index
  // CHECK: scf.forall
  %0 = scf.forall (%arg0) in (%c320) shared_outs(%arg1 = %alloc0) -> (tensor<320xf32>) {
    %val = "test.foo"() : () -> (f32)
    // linalg.fill must bufferize out-of-place because every thread needs a
    // private copy of %alloc1. If not accounting for parallel regions, the fill
    // can bufferize in place.
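    // The tensor.insert below is in-place under both prefixes: no alias of
    // %fill is read after it, so overwriting %fill is safe.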
    // PARALLEL-CHECK: linalg.fill {__inplace_operands_attr__ = ["none", "false"]}
    // NO-PARALLEL-CHECK: linalg.fill {__inplace_operands_attr__ = ["none", "true"]}
    %fill = linalg.fill ins(%val : f32) outs(%alloc1 : tensor<1xf32>) -> tensor<1xf32>
    // CHECK: tensor.insert
    // CHECK-SAME: __inplace_operands_attr__ = ["none", "true", "none"]
    %inserted = tensor.insert %f into %fill[%c0] : tensor<1xf32>

    scf.forall.in_parallel {
      // CHECK: tensor.parallel_insert_slice {{.*}} {__inplace_operands_attr__ = ["true", "true", "none"]}
      tensor.parallel_insert_slice %inserted into %arg1[%arg0] [1] [1] : tensor<1xf32> into tensor<320xf32>
    }
  }
  // CHECK: } {__inplace_operands_attr__ = ["none", "true"]}
  return %0 : tensor<320xf32>
}

// -----

// CHECK-LABEL: func @parallel_region_no_read()
func.func @parallel_region_no_read()
{
  %alloc0 = bufferization.alloc_tensor() : tensor<320xf32>
  %alloc1 = bufferization.alloc_tensor() : tensor<1xf32>
  %c320 = arith.constant 320 : index
  // CHECK: scf.forall
  scf.forall (%arg0) in (%c320) {
    %val = "test.foo"() : () -> (f32)
    // linalg.fill can bufferize in-place because no alias of %alloc1 is read.
    // CHECK: linalg.fill {__inplace_operands_attr__ = ["none", "true"]}
    %fill = linalg.fill ins(%val : f32) outs(%alloc1 : tensor<1xf32>) -> tensor<1xf32>
    scf.forall.in_parallel {
    }
  }
  return
}