// DEFINE: %{compile} = mlir-opt %s \
// DEFINE:    -transform-interpreter -test-transform-dialect-erase-schedule |\
// DEFINE: mlir-opt \
// DEFINE:    -test-lower-to-llvm -o %t
// DEFINE: %{entry_point} = main
// DEFINE: %{run} = mlir-runner %t -e %{entry_point} -entry-point-result=void \
// DEFINE:    -shared-libs=%mlir_runner_utils,%mlir_c_runner_utils

// RUN: rm -f %t && %{compile} && %{run} | FileCheck %s

/// End-to-end test for tensor.pack where one of the inner tile sizes is
/// dynamic.
///
/// Layout of this file:
///   * @main            - builds the 7x16 input and calls @pack.
///   * @pack            - packs the input with a tile size passed as an SSA
///                        value and prints the result.
///   * @transforms      - the Transform dialect schedule applied by the
///                        compile step above.
///   * @printMemrefI32  - declaration of the printing helper called by @pack.

/// Entry point: materialises the input matrix and hands it over to @pack.
func.func @main() {
  // Initialise the input. Element (row, col) holds `col * 7 + row + 1`, i.e.
  // the values 1..112 laid out column by column.
  %A = arith.constant dense<[
    [ 1,  8, 15, 22, 29, 36, 43, 50, 57, 64, 71, 78, 85, 92,  99, 106],
    [ 2,  9, 16, 23, 30, 37, 44, 51, 58, 65, 72, 79, 86, 93, 100, 107],
    [ 3, 10, 17, 24, 31, 38, 45, 52, 59, 66, 73, 80, 87, 94, 101, 108],
    [ 4, 11, 18, 25, 32, 39, 46, 53, 60, 67, 74, 81, 88, 95, 102, 109],
    [ 5, 12, 19, 26, 33, 40, 47, 54, 61, 68, 75, 82, 89, 96, 103, 110],
    [ 6, 13, 20, 27, 34, 41, 48, 55, 62, 69, 76, 83, 90, 97, 104, 111],
    [ 7, 14, 21, 28, 35, 42, 49, 56, 63, 70, 77, 84, 91, 98, 105, 112]
  ]> : tensor<7x16xi32>

  func.call @pack(%A) : (tensor<7x16xi32>) -> ()

  return
}

/// Packs %A into (8 x 1) inner tiles, padding with 123, and prints the packed
/// tensor. The first inner tile size is supplied as an SSA value, so the
/// corresponding dimensions of the result type are dynamic (`?`).
func.func private @pack(%A: tensor<7x16xi32>) {
  %c1 = arith.constant 1 : index
  %pad_val = arith.constant 123 : i32

  // Dynamic tile size
  %tile_size = arith.constant 8 : index
  // Destination sizes: %c1 is the outer dim along the tiled rows, %tile_size
  // is the inner tile dim.
  %A_pack_empty = tensor.empty(%c1, %tile_size) : tensor<?x16x?x1xi32>

  %A_pack = tensor.pack %A
    padding_value(%pad_val : i32)
    inner_dims_pos = [0, 1]
    inner_tiles = [%tile_size, 1]
    into %A_pack_empty : tensor<7x16xi32> -> tensor<?x16x?x1xi32>
  // The printing helper takes an unranked tensor.
  %A_cast = tensor.cast %A_pack : tensor<?x16x?x1xi32> to tensor<*xi32>

  // Print the results
  // CHECK: Unranked Memref base@ = 0x{{.*}} rank = 4 offset = 0 sizes = [1, 16, 8, 1] strides = [128, 8, 1, 1] data =
  // Tile 1: (8 x 1)
  // CHECK-NEXT:  1
  // CHECK-NEXT:  2
  // CHECK-NEXT:  3
  // CHECK-NEXT:  4
  // CHECK-NEXT:  5
  // CHECK-NEXT:  6
  // CHECK-NEXT:  7
  // Expect pad value after 7 elements
  // CHECK-NEXT:  123
  // Tile 2: (8 x 1)
  // CHECK-NEXT:  8
  // CHECK-NEXT:  9
  // CHECK-NEXT:  10
  // CHECK-NEXT:  11
  // CHECK-NEXT:  12
  // CHECK-NEXT:  13
  // CHECK-NEXT:  14
  // Expect pad value after further 7 elements
  // CHECK-NEXT:  123
  // Tile 3: (8 x 1)
  // CHECK-NEXT:  15
  // CHECK-NEXT:  16
  // ...
  func.call @printMemrefI32(%A_cast) : (tensor<*xi32>) -> ()

  return
}

/// Transform dialect schedule: tile the pack Op, decompose it, vectorize the
/// resulting linalg.fill, bufferize and canonicalize.
module @transforms attributes { transform.with_named_sequence } {
  transform.named_sequence @__transform_main(%module: !transform.any_op {transform.consume}) {
    %pack = transform.structured.match ops{["tensor.pack"]} in %module : (!transform.any_op) -> !transform.any_op

    // 1. Tile so that we can decompose tensor.pack into tensor.pad and other
    // Ops (see step 2)
    %tiled_pack_op_p, %loops:2 = transform.structured.tile_using_for %pack tile_sizes [1, 1]
       : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op)

    // 2. Decompose the tiled pack Op into (trimmed for brevity):
    //
    //  %padded = tensor.pad %slice_of_A (..) :
    //      tensor<?x?xi32> to tensor<8x1xi32>
    //  %inserted_slice = tensor.insert_slice %padded into %slice_of_A_pack (...) :
    //      tensor<8x1xi32> into tensor<1x1x?x1xi32>
    //
    // (NOTE: no tile is transposed, hence no linalg.transpose)
    //
    // This is followed by this decomposition of the pad Op:
    //
    //  %c123_i32 = arith.constant 123 : i32
    //  %slice_of_A = tensor.extract_slice %A[%3, %arg3] [%4, %5] [1, 1] :
    //    tensor<7x16xi32> to tensor<?x?xi32>
    //  %empty = tensor.empty() : tensor<8x1xi32>
    //  %fill = linalg.fill ins(%c123_i32 : i32) outs(%empty :
    //    tensor<8x1xi32>) -> tensor<8x1xi32>
    //  %inserted_slice = tensor.insert_slice %slice_of_A into %fill[0, 0] [%4, %5] [1, 1] :
    //    tensor<?x?xi32> into tensor<8x1xi32>
    //
    %func_op = transform.get_parent_op %tiled_pack_op_p {isolated_from_above} : (!transform.any_op) -> !transform.op<"func.func">
    transform.apply_patterns to %func_op {
      transform.apply_patterns.linalg.decompose_pack_unpack
      transform.apply_patterns.linalg.decompose_pad
    } : !transform.op<"func.func">

    // 3. Vectorize linalg.fill.
    // Vector sizes match the inner tiles in the payload IR.
    %fill = transform.structured.match ops{["linalg.fill"]} in %func_op : (!transform.op<"func.func">) -> !transform.any_op
    transform.structured.vectorize %fill vector_sizes [8, 1] : !transform.any_op

    // Clean up after vectorization: fold tensor subset Ops and canonicalize.
    transform.apply_patterns to %func_op {
      transform.apply_patterns.tensor.fold_tensor_subset_ops
      transform.apply_patterns.canonicalization
    } : !transform.op<"func.func">

    // 4. Bufferize before lowering to LLVM
    %bufferize = transform.bufferization.one_shot_bufferize %module
      {bufferize_function_boundaries=true} : (!transform.any_op) -> !transform.any_op

    // 5. Canonicalize
    %func_op_bufferized = transform.structured.match ops{["func.func"]} in %bufferize : (!transform.any_op) -> !transform.op<"func.func">
    transform.apply_patterns to %func_op_bufferized {
      transform.apply_patterns.canonicalization
    } : !transform.op<"func.func">

    transform.yield
  }
}

/// Printing helper called by @pack; only declared here (no body in this file).
func.func private @printMemrefI32(%ptr : tensor<*xi32>)