// RUN: mlir-opt %s -one-shot-bufferize="bufferize-function-boundaries test-analysis-only" -split-input-file | FileCheck %s

// CHECK-LABEL: @elementwise_no_conflict
func.func @elementwise_no_conflict(%a: tensor<5xf32>,
                                   %b: tensor<5xf32>) -> tensor<5xf32> {
  // CHECK: linalg.elemwise_binary
  // CHECK-SAME: {__inplace_operands_attr__ = ["true", "true", "true"], fun = #linalg.binary_fn<add>}
  %0 = linalg.elemwise_binary {fun = #linalg.binary_fn<add>}
      ins(%a, %b : tensor<5xf32>, tensor<5xf32>)
      outs(%a : tensor<5xf32>) -> tensor<5xf32>
  return %0 : tensor<5xf32>
}

// -----

// CHECK-LABEL: @elementwise_no_conflict_2
func.func @elementwise_no_conflict_2(%a: tensor<5xf32>) -> tensor<5xf32> {
  // CHECK: linalg.elemwise_binary
  // CHECK-SAME: {__inplace_operands_attr__ = ["true", "true", "true"], fun = #linalg.binary_fn<add>}
  %0 = linalg.elemwise_binary {fun = #linalg.binary_fn<add>}
      ins(%a, %a : tensor<5xf32>, tensor<5xf32>)
      outs(%a : tensor<5xf32>) -> tensor<5xf32>
  return %0 : tensor<5xf32>
}

// -----

// CHECK-LABEL: @elementwise_no_conflict_3
func.func @elementwise_no_conflict_3(%a: tensor<5xf32>) -> tensor<5xf32> {
  %c1f = arith.constant 1.0 : f32
  // CHECK: linalg.elemwise_binary
  // CHECK-SAME: {__inplace_operands_attr__ = ["true", "none", "true"], fun = #linalg.binary_fn<add>}
  %0 = linalg.elemwise_binary {fun = #linalg.binary_fn<add>}
      ins(%a, %c1f : tensor<5xf32>, f32)
      outs(%a : tensor<5xf32>) -> tensor<5xf32>
  return %0 : tensor<5xf32>
}

// -----

// CHECK-LABEL: @not_elementwise
func.func @not_elementwise(%a: tensor<5x6xf32>) -> tensor<5x6xf32> {
  %cst = arith.constant 5.0 : f32
  // CHECK: tensor.extract_slice
  // CHECK-SAME: {__inplace_operands_attr__ = ["false"]}
  %b = tensor.extract_slice %a[0, 0] [1, 6] [1, 1]
      : tensor<5x6xf32> to tensor<6xf32>
  // CHECK: linalg.generic
  // CHECK-SAME: {__inplace_operands_attr__ = ["true", "true"]}
  %0 = linalg.generic
    { iterator_types = ["parallel", "parallel"],
      indexing_maps = [ affine_map<(d0, d1) -> (d1)>,
                        affine_map<(d0, d1) -> (d0, d1)>] }
    ins(%b: tensor<6xf32>) outs(%a: tensor<5x6xf32>) {
    ^bb0(%arg0: f32, %arg1: f32):
      %r = arith.addf %arg0, %arg1 : f32
      linalg.yield %r : f32
  } -> tensor<5x6xf32>
  return %0 : tensor<5x6xf32>
}

// -----

#map = affine_map<(d0, d1) -> (d0, d1)>
#map1 = affine_map<(d0, d1) -> (d1)>

// CHECK-LABEL: @elementwise_no_conflict_4
func.func @elementwise_no_conflict_4(%arg0: tensor<8x32x32x32xf32>, %arg1: tensor<32x32x32xf32>) -> tensor<8x32x32x32xf32> {
  %cst = arith.constant dense<3.000000e-02> : tensor<32x32x32xf32>
  %cst_0 = arith.constant dense<6.000000e-01> : tensor<32xf32>
  %cst_1 = arith.constant 0.000000e+00 : f32
  %r = scf.forall (%arg2, %arg3) in (8, 32) shared_outs(%arg4 = %arg0) -> (tensor<8x32x32x32xf32>) {
    // CHECK: tensor.extract_slice
    // CHECK-SAME: {__inplace_operands_attr__ = ["true", "none", "none"]}
    %extracted_slice = tensor.extract_slice %arg4[%arg2, %arg3, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<8x32x32x32xf32> to tensor<32x32xf32>

    // CHECK: linalg.fill
    // CHECK-SAME: {__inplace_operands_attr__ = ["none", "true"]}
    %4 = linalg.fill ins(%cst_1 : f32) outs(%extracted_slice : tensor<32x32xf32>) -> tensor<32x32xf32>

    // CHECK: linalg.batch_reduce_matmul
    // CHECK-SAME: {__inplace_operands_attr__ = ["true", "true", "true"]}
    %5 = linalg.batch_reduce_matmul ins(%arg1, %cst : tensor<32x32x32xf32>, tensor<32x32x32xf32>) outs(%4 : tensor<32x32xf32>) -> tensor<32x32xf32>
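    // Writing %4 in place is safe: the matmul's ins (%arg1 and %cst) do not
    // alias the output buffer, and the later users of that buffer (%6 and %7
    // below) access it element-wise.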

    // CHECK: linalg.generic
    // CHECK-SAME: {__inplace_operands_attr__ = ["true", "true", "true"]}
    // %cst_0 has a non-identity indexing map, but %5 and %extracted_slice
    // still bufferize to element-wise access.
    %6 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel"]} ins(%5, %cst_0 : tensor<32x32xf32>, tensor<32xf32>) outs(%extracted_slice : tensor<32x32xf32>) {
    ^bb0(%in: f32, %in_4: f32, %out: f32):
      %8 = arith.addf %in, %in_4 : f32
      linalg.yield %8 : f32
    } -> tensor<32x32xf32>

    // CHECK: linalg.generic
    // CHECK-SAME: {__inplace_operands_attr__ = ["true", "true"]}
    // %6 and %extracted_slice are different SSA values, but they bufferize to
    // equivalent buffers.
    %7 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%6 : tensor<32x32xf32>) outs(%extracted_slice : tensor<32x32xf32>) {
    ^bb0(%in: f32, %out: f32):
      %8 = arith.maximumf %in, %cst_1 : f32
      linalg.yield %8 : f32
    } -> tensor<32x32xf32>
    scf.forall.in_parallel {
      // CHECK: tensor.parallel_insert_slice
      // CHECK-SAME: {__inplace_operands_attr__ = ["true", "true", "none", "none"]}
      tensor.parallel_insert_slice %7 into %arg4[%arg2, %arg3, 0, 0] [1, 1, 32, 32] [1, 1, 1, 1] : tensor<32x32xf32> into tensor<8x32x32x32xf32>
    }
  }
  return %r : tensor<8x32x32x32xf32>
}

// -----

// CHECK-LABEL: func @elementwise_access_regression(
// CHECK: linalg.fill {__inplace_operands_attr__ = ["none", "false"]}
// CHECK: linalg.map
// CHECK-SAME: {__inplace_operands_attr__ = ["true", "true", "true"]}
// CHECK: linalg.map
// CHECK-SAME: {__inplace_operands_attr__ = ["true", "true", "true"]}
func.func private @f(%arg: tensor<32x1xf32>) -> ()
func.func @elementwise_access_regression(%arg0: i32, %arg2: tensor<32x1xf32>, %arg3: tensor<32x1xf32>) {
  %cst_0 = arith.constant 0.000000e+00 : f32
  %c0_i32 = arith.constant 0 : i32
  %c1_i32 = arith.constant 1 : i32
  %0 = tensor.empty() : tensor<32x1xf32>

  // This op must bufferize out-of-place so that the filled tensor is not
  // overwritten by the ops inside of the loop.
  %1 = linalg.fill ins(%cst_0 : f32) outs(%0 : tensor<32x1xf32>) -> tensor<32x1xf32>

  scf.for %arg1 = %c0_i32 to %arg0 step %c1_i32 : i32 {
    %2 = linalg.map { arith.subf } ins(%1, %arg2 : tensor<32x1xf32>, tensor<32x1xf32>) outs(%0 : tensor<32x1xf32>)
    %3 = tensor.empty() : tensor<32x1xf32>
    %4 = linalg.map { arith.subf } ins(%2, %arg3 : tensor<32x1xf32>, tensor<32x1xf32>) outs(%3 : tensor<32x1xf32>)
    func.call @f(%4) : (tensor<32x1xf32>) -> ()
  }
  return
}
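
// -----

// Added example (a sketch, not part of the original test): the element-wise
// exception only covers reads made by the writing op itself. When a *later*
// op reads %a, the in-place write would clobber a value with a pending read,
// so the outs operand is expected to bufferize out-of-place (same RaW-conflict
// reasoning as @elementwise_access_regression above). The function name and
// the expected attributes are this sketch's assumptions and have not been
// re-verified by running mlir-opt.

// CHECK-LABEL: @elementwise_raw_conflict
func.func @elementwise_raw_conflict(%a: tensor<5xf32>,
                                    %b: tensor<5xf32>) -> (tensor<5xf32>, tensor<5xf32>) {
  // CHECK: linalg.elemwise_binary
  // CHECK-SAME: {__inplace_operands_attr__ = ["true", "true", "false"], fun = #linalg.binary_fn<add>}
  %0 = linalg.elemwise_binary {fun = #linalg.binary_fn<add>}
      ins(%a, %b : tensor<5xf32>, tensor<5xf32>)
      outs(%a : tensor<5xf32>) -> tensor<5xf32>
  // %a is read again here, after the write above, so %0 cannot reuse %a's
  // buffer. Writing %b in place is fine: its only other read is by this op
  // itself, which is element-wise.
  // CHECK: linalg.elemwise_binary
  // CHECK-SAME: {__inplace_operands_attr__ = ["true", "true", "true"], fun = #linalg.binary_fn<mul>}
  %1 = linalg.elemwise_binary {fun = #linalg.binary_fn<mul>}
      ins(%a, %b : tensor<5xf32>, tensor<5xf32>)
      outs(%b : tensor<5xf32>) -> tensor<5xf32>
  return %0, %1 : tensor<5xf32>, tensor<5xf32>
}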