xref: /llvm-project/mlir/test/Integration/Dialect/Linalg/CPU/pack-dynamic-inner-tile.mlir (revision eb206e9ea84eff0a0596fed2de8316d924f946d1)
1// DEFINE: %{compile} =  mlir-opt %s \
2// DEFINE:  -transform-interpreter -test-transform-dialect-erase-schedule |\
3// DEFINE: mlir-opt \
4// DEFINE:  -test-lower-to-llvm -o %t
5// DEFINE: %{entry_point} = main
6// DEFINE: %{run} = mlir-runner %t -e %{entry_point} -entry-point-result=void \
7// DEFINE:    -shared-libs=%mlir_runner_utils,%mlir_c_runner_utils
8
9// RUN: rm -f %t && %{compile} && %{run} | FileCheck %s
10
11/// End-to-end test for tensor.pack where one of the inner tile sizes is
12/// dynamic.
13
14func.func @main() {
15  // Build the 7x16 input as a constant. Element (i, j) is 7*j + i + 1, i.e.
16  // consecutive values run down each column. @pack tiles dim 0, so its
17  // printed output is increasing apart from the interleaved pad values.
18  %A = arith.constant dense<[
19    [ 1,  8, 15, 22, 29, 36, 43, 50, 57, 64, 71, 78, 85, 92, 99 , 106],
20    [ 2,  9, 16, 23, 30, 37, 44, 51, 58, 65, 72, 79, 86, 93, 100, 107],
21    [ 3, 10, 17, 24, 31, 38, 45, 52, 59, 66, 73, 80, 87, 94, 101, 108],
22    [ 4, 11, 18, 25, 32, 39, 46, 53, 60, 67, 74, 81, 88, 95, 102, 109],
23    [ 5, 12, 19, 26, 33, 40, 47, 54, 61, 68, 75, 82, 89, 96, 103, 110],
24    [ 6, 13, 20, 27, 34, 41, 48, 55, 62, 69, 76, 83, 90, 97, 104, 111],
25    [ 7, 14, 21, 28, 35, 42, 49, 56, 63, 70, 77, 84, 91, 98, 105, 112]
26  ]> : tensor<7x16xi32>
27
28  func.call @pack(%A) : (tensor<7x16xi32>) -> ()
29
30  return
31}
32
33func.func private @pack(%A: tensor<7x16xi32>) {
34  %padding = arith.constant 123 : i32
35  %num_outer = arith.constant 1 : index
36
37  // Inner tile size for dim 0, passed to tensor.pack as an SSA value
38  %inner_tile = arith.constant 8 : index
39  %dest = tensor.empty(%num_outer, %inner_tile) : tensor<?x16x?x1xi32>
40
41  %packed = tensor.pack %A
42    padding_value(%padding : i32)
43    inner_dims_pos = [0, 1]
44    inner_tiles = [%inner_tile, 1]
45    into %dest : tensor<7x16xi32> -> tensor<?x16x?x1xi32>
46  %unranked = tensor.cast %packed : tensor<?x16x?x1xi32> to tensor<*xi32>
47
48  // Dump the packed tensor and verify its layout
49  // CHECK: Unranked Memref base@ = 0x{{.*}} rank = 4 offset = 0 sizes = [1, 16, 8, 1] strides = [128, 8, 1, 1] data =
50  // 1st tile (8 x 1): column 0 of the input
51  // CHECK-NEXT:  1
52  // CHECK-NEXT:  2
53  // CHECK-NEXT:  3
54  // CHECK-NEXT:  4
55  // CHECK-NEXT:  5
56  // CHECK-NEXT:  6
57  // CHECK-NEXT:  7
58  // The input has only 7 rows, so the 8th slot holds the pad value
59  // CHECK-NEXT:  123
60  // 2nd tile (8 x 1): column 1 of the input
61  // CHECK-NEXT:  8
62  // CHECK-NEXT:  9
63  // CHECK-NEXT:  10
64  // CHECK-NEXT:  11
65  // CHECK-NEXT:  12
66  // CHECK-NEXT:  13
67  // CHECK-NEXT:  14
68  // Again 7 real elements followed by the pad value
69  // CHECK-NEXT:  123
70  // 3rd tile (8 x 1): only its first two elements are verified
71  // CHECK-NEXT:  15
72  // CHECK-NEXT:  16
73  // (remaining output is not checked)
74  func.call @printMemrefI32(%unranked) : (tensor<*xi32>) -> ()
75
76  return
77}
78
79module @transforms attributes { transform.with_named_sequence } {
80  transform.named_sequence @__transform_main(%module: !transform.any_op {transform.consume}) {
81    %pack = transform.structured.match ops{["tensor.pack"]} in %module : (!transform.any_op) -> !transform.any_op
82
83    // 1. Tile tensor.pack with tile sizes [1, 1] (yielding two loops) so that
84    // it can be decomposed into tensor.pad and other Ops (see step 2).
85    %tiled_pack_op_p, %loops:2 = transform.structured.tile_using_for %pack tile_sizes [1, 1]
86       : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op)
87
88    // 2. Decompose the tiled pack Op into (trimmed for brevity):
89    //
90    //  %padded = tensor.pad %slice_of_A (..) :
91    //      tensor<?x?xi32> to tensor<8x1xi32>
92    //  %inserted_slice = tensor.insert_slice %padded into %slice_of_A_pack (...) :
93    //      tensor<8x1xi32> into tensor<1x1x?x1xi32>
94    //
95    // (NOTE: no tile is transposed, hence no linalg.transpose)
96    //
97    // The pad Op is then itself decomposed into:
98    //
99    //  %c123_i32 = arith.constant 123 : i32
100    //  %slice_of_A = tensor.extract_slice %A[%3, %arg3] [%4, %5] [1, 1] :
101    //    tensor<7x16xi32> to tensor<?x?xi32>
102    //  %empty = tensor.empty() : tensor<8x1xi32>
103    //  %fill = linalg.fill ins(%c123_i32 : i32) outs(%empty :
104    //    tensor<8x1xi32>) -> tensor<8x1xi32>
105    //  %inserted_slice = tensor.insert_slice %slice_of_A into %fill[0, 0] [%4, %5] [1, 1] :
106    //    tensor<?x?xi32> into tensor<8x1xi32>
107    //
108    %func_op = transform.get_parent_op %tiled_pack_op_p {isolated_from_above} : (!transform.any_op) -> !transform.op<"func.func">
109    transform.apply_patterns to %func_op {
110      transform.apply_patterns.linalg.decompose_pack_unpack
111      transform.apply_patterns.linalg.decompose_pad
112    } : !transform.op<"func.func">
113
114    // 3. Vectorize linalg.fill.
115    // Vector sizes [8, 1] match the inner tiles (8 and 1) in the payload IR.
116    %fill = transform.structured.match ops{["linalg.fill"]} in %func_op : (!transform.op<"func.func">) -> !transform.any_op
117    transform.structured.vectorize %fill vector_sizes [8, 1] : !transform.any_op
118
119    transform.apply_patterns to %func_op {
120      transform.apply_patterns.tensor.fold_tensor_subset_ops
121      transform.apply_patterns.canonicalization
122    } : !transform.op<"func.func">
123
124    // 4. Bufferize the whole module before lowering to LLVM
125    %bufferize = transform.bufferization.one_shot_bufferize %module
126      {bufferize_function_boundaries=true} : (!transform.any_op) -> !transform.any_op
127
128    // 5. Canonicalize every func.func matched in the bufferized module
129    %func_op_bufferized = transform.structured.match ops{["func.func"]} in %bufferize : (!transform.any_op) -> !transform.op<"func.func">
130    transform.apply_patterns to %func_op_bufferized {
131      transform.apply_patterns.canonicalization
132    } : !transform.op<"func.func">
133
134    transform.yield
135  }
136}
137
138func.func private @printMemrefI32(%ptr : tensor<*xi32>)  // Declaration only (no body); called from @pack to print the result.
139