// RUN: mlir-opt %s -pass-pipeline="builtin.module(func.func(canonicalize,cse),one-shot-bufferize{bufferize-function-boundaries})" |\
// RUN: mlir-opt -pass-pipeline="builtin.module(buffer-deallocation-pipeline,convert-bufferization-to-memref,func.func(convert-vector-to-scf,lower-affine,convert-linalg-to-loops))" |\
// RUN: mlir-opt -pass-pipeline="builtin.module(func.func(canonicalize,convert-scf-to-cf),convert-vector-to-llvm,expand-strided-metadata,lower-affine,convert-arith-to-llvm,finalize-memref-to-llvm,convert-func-to-llvm,convert-cf-to-llvm,reconcile-unrealized-casts)" | \

// RUN: mlir-runner -O3 -e main -entry-point-result=void \
// RUN: -shared-libs=%mlir_runner_utils,%mlir_c_runner_utils |\
// RUN: FileCheck %s

// End-to-end test: tensor IR is bufferized across function boundaries, lowered
// to the LLVM dialect, executed, and the printed scalar result is verified.

// (lower bound, upper bound)[step] -> number of tiles covering the range.
#num_tiles = affine_map<(d0, d1)[s0] -> ((d1 - d0) ceildiv s0)>
// (induction variable, lower bound)[step] -> index of the current tile.
#tile_idx = affine_map<(d0, d1)[s0] -> ((d0 - d1) ceildiv s0)>

// Zero-fills %arg2, packs both 64-element inputs into ?x2 tensors (one row per
// 2-element tile), then accumulates the dot product of matching rows into the
// zero-filled scalar. Returns the accumulated scalar tensor.
func.func @init_and_dot(%arg0: tensor<64xf32>, %arg1: tensor<64xf32>, %arg2: tensor<f32>) -> tensor<f32> {
  %c0 = arith.constant 0 : index
  %c2 = arith.constant 2 : index
  %c64 = arith.constant 64 : index
  %f0 = arith.constant 0.000000e+00 : f32

  // The accumulator starts at zero regardless of what the caller passed in.
  %acc_init = linalg.fill ins(%f0 : f32) outs(%arg2 : tensor<f32>) -> tensor<f32>

  // Pack %arg1: row i of the packed tensor holds elements [2*i, 2*i + 2).
  %rhs_rows = affine.apply #num_tiles(%c0, %c64)[%c2]
  %rhs_empty = bufferization.alloc_tensor(%rhs_rows) : tensor<?x2xf32>
  %rhs_packed = scf.for %i = %c0 to %c64 step %c2 iter_args(%rhs_acc = %rhs_empty) -> (tensor<?x2xf32>) {
    %rhs_row = affine.apply #tile_idx(%i, %c0)[%c2]
    %rhs_slice = tensor.extract_slice %arg1[%i] [2] [1] : tensor<64xf32> to tensor<2xf32>
    %rhs_dyn = tensor.cast %rhs_slice : tensor<2xf32> to tensor<?xf32>
    // Low and high padding are both zero; the pad only restores the static
    // shape after the cast to a dynamic one.
    %rhs_pad = tensor.pad %rhs_dyn low[%c0] high[%c0] {
    ^bb0(%rhs_pos: index):
      tensor.yield %f0 : f32
    } : tensor<?xf32> to tensor<2xf32>
    %rhs_next = tensor.insert_slice %rhs_pad into %rhs_acc[%rhs_row, 0] [1, 2] [1, 1] : tensor<2xf32> into tensor<?x2xf32>
    scf.yield %rhs_next : tensor<?x2xf32>
  }

  // Debugging aid (disabled): dump the packed %arg1.
  // %rhs_dump = tensor.cast %rhs_packed : tensor<?x2xf32> to tensor<*xf32>
  // call @printMemrefF32(%rhs_dump) : (tensor<*xf32>) -> ()

  // Pack %arg0 the same way.
  %lhs_rows = affine.apply #num_tiles(%c0, %c64)[%c2]
  %lhs_empty = bufferization.alloc_tensor(%lhs_rows) : tensor<?x2xf32>
  %lhs_packed = scf.for %j = %c0 to %c64 step %c2 iter_args(%lhs_acc = %lhs_empty) -> (tensor<?x2xf32>) {
    %lhs_row = affine.apply #tile_idx(%j, %c0)[%c2]
    %lhs_slice = tensor.extract_slice %arg0[%j] [2] [1] : tensor<64xf32> to tensor<2xf32>
    %lhs_dyn = tensor.cast %lhs_slice : tensor<2xf32> to tensor<?xf32>
    %lhs_pad = tensor.pad %lhs_dyn low[%c0] high[%c0] {
    ^bb0(%lhs_pos: index):
      tensor.yield %f0 : f32
    } : tensor<?xf32> to tensor<2xf32>
    %lhs_next = tensor.insert_slice %lhs_pad into %lhs_acc[%lhs_row, 0] [1, 2] [1, 1] : tensor<2xf32> into tensor<?x2xf32>
    scf.yield %lhs_next : tensor<?x2xf32>
  }

  // Debugging aid (disabled): dump the packed %arg0 and the zeroed accumulator.
  // %lhs_dump = tensor.cast %lhs_packed : tensor<?x2xf32> to tensor<*xf32>
  // call @printMemrefF32(%lhs_dump) : (tensor<*xf32>) -> ()

  // %acc_dump = tensor.cast %acc_init : tensor<f32> to tensor<*xf32>
  // call @printMemrefF32(%acc_dump) : (tensor<*xf32>) -> ()

  // Reduce: one 2-element dot product per tile, chained through the iter_arg.
  %dot = scf.for %k = %c0 to %c64 step %c2 iter_args(%partial = %acc_init) -> (tensor<f32>) {
    // These direct slices of the inputs have no users below; the dot product
    // reads the packed copies instead.
    %lhs_raw = tensor.extract_slice %arg0[%k] [2] [1] : tensor<64xf32> to tensor<2xf32>
    %lhs_raw_dyn = tensor.cast %lhs_raw : tensor<2xf32> to tensor<?xf32>
    %rhs_raw = tensor.extract_slice %arg1[%k] [2] [1] : tensor<64xf32> to tensor<2xf32>
    %rhs_raw_dyn = tensor.cast %rhs_raw : tensor<2xf32> to tensor<?xf32>

    // Rank-reducing slices: a single row of each packed tensor as tensor<2xf32>.
    %lhs_tile_row = affine.apply #tile_idx(%k, %c0)[%c2]
    %lhs_tile = tensor.extract_slice %lhs_packed[%lhs_tile_row, 0] [1, 2] [1, 1] : tensor<?x2xf32> to tensor<2xf32>
    %rhs_tile_row = affine.apply #tile_idx(%k, %c0)[%c2]
    %rhs_tile = tensor.extract_slice %rhs_packed[%rhs_tile_row, 0] [1, 2] [1, 1] : tensor<?x2xf32> to tensor<2xf32>
    %sum = linalg.dot ins(%lhs_tile, %rhs_tile : tensor<2xf32>, tensor<2xf32>) outs(%partial : tensor<f32>) -> tensor<f32>

    // Debugging aid (disabled): dump both tiles and the running sum.
    // %lhs_tile_dump = tensor.cast %lhs_tile : tensor<2xf32> to tensor<*xf32>
    // call @printMemrefF32(%lhs_tile_dump) : (tensor<*xf32>) -> ()
    // %rhs_tile_dump = tensor.cast %rhs_tile : tensor<2xf32> to tensor<*xf32>
    // call @printMemrefF32(%rhs_tile_dump) : (tensor<*xf32>) -> ()
    // %sum_dump = tensor.cast %sum : tensor<f32> to tensor<*xf32>
    // call @printMemrefF32(%sum_dump) : (tensor<*xf32>) -> ()

    scf.yield %sum : tensor<f32>
  }
  return %dot : tensor<f32>
}

// Entry point: fills the two inputs with 1.0 and 2.0, runs @init_and_dot and
// prints the scalar result.
func.func @main() {
  %f0 = arith.constant 0.0 : f32
  %f1 = arith.constant 1.0 : f32
  %f2 = arith.constant 2.0 : f32

  %lhs_empty = bufferization.alloc_tensor() : tensor<64xf32>
  %rhs_empty = bufferization.alloc_tensor() : tensor<64xf32>
  %acc_empty = bufferization.alloc_tensor() : tensor<f32>
  %lhs = linalg.fill ins(%f1 : f32) outs(%lhs_empty : tensor<64xf32>) -> tensor<64xf32>
  %rhs = linalg.fill ins(%f2 : f32) outs(%rhs_empty : tensor<64xf32>) -> tensor<64xf32>
  %acc = linalg.fill ins(%f0 : f32) outs(%acc_empty : tensor<f32>) -> tensor<f32>

  %result = call @init_and_dot(%lhs, %rhs, %acc) :
    (tensor<64xf32>, tensor<64xf32>, tensor<f32>) -> tensor<f32>

  %result_unranked = tensor.cast %result : tensor<f32> to tensor<*xf32>

  // Expected value: 64 elements * (1.0 * 2.0) = 128.
//      CHECK: Unranked Memref base@ = {{.*}} rank = 0 offset = 0 sizes = [] strides = [] data =
// CHECK-NEXT: [128]
  call @printMemrefF32(%result_unranked) : (tensor<*xf32>) -> ()

  return
}

// External printing routine; its definition is expected to come from the
// shared libraries passed to the runner above.
func.func private @printMemrefF32(tensor<*xf32>) attributes { llvm.emit_c_interface }