xref: /llvm-project/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_reductions_prod.mlir (revision eb206e9ea84eff0a0596fed2de8316d924f946d1)
1//--------------------------------------------------------------------------------------------------
2// WHEN CREATING A NEW TEST, PLEASE JUST COPY & PASTE WITHOUT EDITS.
3//
4// Set-up that's shared across all tests in this directory. In principle, this
5// config could be moved to lit.local.cfg. However, there are downstream users that
6// do not use these LIT config files. Hence, the config is kept inline.
7//
8// DEFINE: %{sparsifier_opts} = enable-runtime-library=true
9// DEFINE: %{sparsifier_opts_sve} = enable-arm-sve=true %{sparsifier_opts}
10// DEFINE: %{compile} = mlir-opt %s --sparsifier="%{sparsifier_opts}"
11// DEFINE: %{compile_sve} = mlir-opt %s --sparsifier="%{sparsifier_opts_sve}"
12// DEFINE: %{run_libs} = -shared-libs=%mlir_c_runner_utils,%mlir_runner_utils
13// DEFINE: %{run_libs_sve} = -shared-libs=%native_mlir_runner_utils,%native_mlir_c_runner_utils
14// DEFINE: %{run_opts} = -e main -entry-point-result=void
15// DEFINE: %{run} = mlir-runner %{run_opts} %{run_libs}
16// DEFINE: %{run_sve} = %mcr_aarch64_cmd --march=aarch64 --mattr="+sve" %{run_opts} %{run_libs_sve}
17//
18// DEFINE: %{env} =
19//--------------------------------------------------------------------------------------------------
20
21// RUN: %{compile} | %{run} | FileCheck %s
22//
23// Do the same run, but now with direct IR generation.
24// REDEFINE: %{sparsifier_opts} = enable-runtime-library=false enable-buffer-initialization=true
25// RUN: %{compile} | %{run} | FileCheck %s
26//
27// Do the same run, but now with direct IR generation and vectorization.
28// REDEFINE: %{sparsifier_opts} = enable-runtime-library=false enable-buffer-initialization=true vl=2 reassociate-fp-reductions=true enable-index-optimizations=true
29// RUN: %{compile} | %{run} | FileCheck %s
30
// Storage layouts under test:
//   #SV - sparse vector: only nonzero entries are stored.
//   #DV - annotated "dense" vector: all 32 entries are stored.
#SV = #sparse_tensor.encoding<{ map = (d0) -> (d0 : compressed) }>
#DV = #sparse_tensor.encoding<{ map = (d0) -> (d0 : dense) }>

// Trait shared by every reduction kernel below: a single 1-d input
// reduced into a 0-d (scalar) output tensor.
#trait_reduction = {
  indexing_maps = [
    affine_map<(i) -> (i)>,  // a
    affine_map<(i) -> ()>    // x (scalar out)
  ],
  iterator_types = ["reduction"],
  doc = "x += PROD_CUSTOM_i a(i)"
}
42
43// An example of vector reductions.
44module {
45
  // Custom prod reduction over an annotated all-"dense" vector (#DV):
  // all 32 entries are stored, so every element enters the product.
  func.func @prod_dreduction_i32(%arga: tensor<32xi32, #DV>,
                                 %argx: tensor<i32>) -> tensor<i32> {
    // Scalar extracted from the 0-d output tensor; passed to
    // sparse_tensor.reduce as the custom reduction's identity operand.
    %c = tensor.extract %argx[] : tensor<i32>
    %0 = linalg.generic #trait_reduction
      ins(%arga: tensor<32xi32, #DV>)
      outs(%argx: tensor<i32>) {
        ^bb(%a: i32, %b: i32):
          %1 = sparse_tensor.reduce %a, %b, %c : i32 {
            // Combine two partial results with an i32 multiply.
            ^bb0(%x: i32, %y: i32):
              %2 = arith.muli %x, %y : i32
              sparse_tensor.yield %2 : i32
          }
          linalg.yield %1 : i32
    } -> tensor<i32>
    return %0 : tensor<i32>
  }
63
  // Custom prod reduction over an annotated all-"dense" vector (#DV):
  // all 32 entries are stored, so every element enters the product.
  func.func @prod_dreduction_f32(%arga: tensor<32xf32, #DV>,
                                 %argx: tensor<f32>) -> tensor<f32> {
    // Scalar extracted from the 0-d output tensor; passed to
    // sparse_tensor.reduce as the custom reduction's identity operand.
    %c = tensor.extract %argx[] : tensor<f32>
    %0 = linalg.generic #trait_reduction
      ins(%arga: tensor<32xf32, #DV>)
      outs(%argx: tensor<f32>) {
        ^bb(%a: f32, %b: f32):
          %1 = sparse_tensor.reduce %a, %b, %c : f32 {
            // Combine two partial results with an f32 multiply.
            ^bb0(%x: f32, %y: f32):
              %2 = arith.mulf %x, %y : f32
              sparse_tensor.yield %2 : f32
          }
          linalg.yield %1 : f32
    } -> tensor<f32>
    return %0 : tensor<f32>
  }
81
  // Custom prod reduction over a sparse vector (#SV): only stored
  // (nonzero) entries are visited, so implicit zeros are skipped and
  // do not nullify the product (unlike a standard product reduction).
  func.func @prod_sreduction_i32(%arga: tensor<32xi32, #SV>,
                                 %argx: tensor<i32>) -> tensor<i32> {
    // Scalar extracted from the 0-d output tensor; passed to
    // sparse_tensor.reduce as the custom reduction's identity operand.
    %c = tensor.extract %argx[] : tensor<i32>
    %0 = linalg.generic #trait_reduction
      ins(%arga: tensor<32xi32, #SV>)
      outs(%argx: tensor<i32>) {
        ^bb(%a: i32, %b: i32):
          %1 = sparse_tensor.reduce %a, %b, %c : i32 {
            // Combine two partial results with an i32 multiply.
            ^bb0(%x: i32, %y: i32):
              %2 = arith.muli %x, %y : i32
              sparse_tensor.yield %2 : i32
          }
          linalg.yield %1 : i32
    } -> tensor<i32>
    return %0 : tensor<i32>
  }
99
  // Custom prod reduction over a sparse vector (#SV): only stored
  // (nonzero) entries are visited, so implicit zeros are skipped and
  // do not nullify the product (unlike a standard product reduction).
  func.func @prod_sreduction_f32(%arga: tensor<32xf32, #SV>,
                                 %argx: tensor<f32>) -> tensor<f32> {
    // Scalar extracted from the 0-d output tensor; passed to
    // sparse_tensor.reduce as the custom reduction's identity operand.
    %c = tensor.extract %argx[] : tensor<f32>
    %0 = linalg.generic #trait_reduction
      ins(%arga: tensor<32xf32, #SV>)
      outs(%argx: tensor<f32>) {
        ^bb(%a: f32, %b: f32):
          %1 = sparse_tensor.reduce %a, %b, %c : f32 {
            // Combine two partial results with an f32 multiply.
            ^bb0(%x: f32, %y: f32):
              %2 = arith.mulf %x, %y : f32
              sparse_tensor.yield %2 : f32
          }
          linalg.yield %1 : f32
    } -> tensor<f32>
    return %0 : tensor<f32>
  }
117
  // Custom prod reduction: stored i32 elements and implicit zeros.
  //
  // NOTE: this is a somewhat strange operation, since for most sparse
  //       situations the outcome would always be zero; it is added
  //       to test full functionality and illustrate the subtle differences
  //       between the various custom operations; it would make a bit more
  //       sense for e.g. a min/max reductions, although it still would
  //       "densify" the iteration space.
  //
  func.func @prod_xreduction_i32(%arga: tensor<32xi32, #SV>,
                                 %argx: tensor<i32>) -> tensor<i32> {
    // Scalar extracted from the 0-d output tensor; passed to
    // sparse_tensor.reduce as the custom reduction's identity operand.
    %c = tensor.extract %argx[] : tensor<i32>
    %0 = linalg.generic #trait_reduction
      ins(%arga: tensor<32xi32, #SV>)
      outs(%argx: tensor<i32>) {
        ^bb(%a: i32, %b: i32):
           // The unary op "densifies" the iteration: stored values pass
           // through unchanged (present region), while unstored positions
           // produce an explicit 0 (absent region) that then participates
           // in the product, so any implicit zero nullifies the result.
           %u = sparse_tensor.unary %a : i32 to i32
           present={
             ^bb0(%x: i32):
             sparse_tensor.yield %x : i32
           } absent={
             ^bb0:
             %c0 = arith.constant 0 : i32
             sparse_tensor.yield %c0 : i32
          }
          %1 = sparse_tensor.reduce %u, %b, %c : i32 {
            // Combine two partial results with an i32 multiply.
            ^bb0(%x: i32, %y: i32):
              %2 = arith.muli %x, %y : i32
              sparse_tensor.yield %2 : i32
          }
          linalg.yield %1 : i32
    } -> tensor<i32>
    return %0 : tensor<i32>
  }
152
153
154  func.func @dump_i32(%arg0 : tensor<i32>) {
155    %v = tensor.extract %arg0[] : tensor<i32>
156    vector.print %v : i32
157    return
158  }
159
160  func.func @dump_f32(%arg0 : tensor<f32>) {
161    %v = tensor.extract %arg0[] : tensor<f32>
162    vector.print %v : f32
163    return
164  }
165
  func.func @main() {
    // Note: Constants bufferize to read-only buffers.
    // Initial reduction values: 7 for the i32 kernels, 2.0 for f32.
    %ri = arith.constant dense< 7   > : tensor<i32>
    %rf = arith.constant dense< 2.0 > : tensor<f32>

    // Vectors with a few zeros.
    %c_0_i32 = arith.constant dense<[
      1, 1, 7, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1,
      1, 1, 1, 1, 3, 0, 1, 1, 1, 1, 1, 0, 1, 1, 7, 3
    ]> : tensor<32xi32>

    %c_0_f32 = arith.constant dense<[
      1.0, 1.0, 1.0, 3.5, 1.0, 1.0, 1.0, 1.0,
      1.0, 0.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0,
      1.0, 0.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0,
      1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0
    ]> : tensor<32xf32>

    // Vectors with no zeros.
    %c_1_i32 = arith.constant dense<[
      1, 1, 7, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
      1, 1, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 7, 3
    ]> : tensor<32xi32>

    %c_1_f32 = arith.constant dense<[
      1.0, 1.0, 1.0, 3.5, 1.0, 1.0, 1.0, 1.0,
      1.0, 1.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0,
      1.0, 1.0, 1.0, 1.0, 3.0, 1.0, 1.0, 1.0,
      1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 4.0
    ]> : tensor<32xf32>

    // Convert constants to annotated tensors. Note that this
    // particular conversion only stores nonzero elements,
    // so we will have no explicit zeros, only implicit zeros.
    %d0_i32 = sparse_tensor.convert %c_0_i32
      : tensor<32xi32> to tensor<32xi32, #DV>
    %d0_f32 = sparse_tensor.convert %c_0_f32
      : tensor<32xf32> to tensor<32xf32, #DV>
    %s0_i32 = sparse_tensor.convert %c_0_i32
      : tensor<32xi32> to tensor<32xi32, #SV>
    %s0_f32 = sparse_tensor.convert %c_0_f32
      : tensor<32xf32> to tensor<32xf32, #SV>
    %d1_i32 = sparse_tensor.convert %c_1_i32
      : tensor<32xi32> to tensor<32xi32, #DV>
    %d1_f32 = sparse_tensor.convert %c_1_f32
      : tensor<32xf32> to tensor<32xf32, #DV>
    %s1_i32 = sparse_tensor.convert %c_1_i32
      : tensor<32xi32> to tensor<32xi32, #SV>
    %s1_f32 = sparse_tensor.convert %c_1_f32
      : tensor<32xf32> to tensor<32xf32, #SV>

    // Special case, construct a sparse vector with an explicit zero.
    %v0 = arith.constant sparse< [ [1] ], [ 0 ] > : tensor<32xi32>
    %s0 = sparse_tensor.convert %v0: tensor<32xi32> to tensor<32xi32, #SV>

    // Call the kernels.
    %0 = call @prod_dreduction_i32(%d0_i32, %ri) : (tensor<32xi32, #DV>, tensor<i32>) -> tensor<i32>
    %1 = call @prod_dreduction_f32(%d0_f32, %rf) : (tensor<32xf32, #DV>, tensor<f32>) -> tensor<f32>
    %2 = call @prod_sreduction_i32(%s0_i32, %ri) : (tensor<32xi32, #SV>, tensor<i32>) -> tensor<i32>
    %3 = call @prod_sreduction_f32(%s0_f32, %rf) : (tensor<32xf32, #SV>, tensor<f32>) -> tensor<f32>
    %4 = call @prod_dreduction_i32(%d1_i32, %ri) : (tensor<32xi32, #DV>, tensor<i32>) -> tensor<i32>
    %5 = call @prod_dreduction_f32(%d1_f32, %rf) : (tensor<32xf32, #DV>, tensor<f32>) -> tensor<f32>
    %6 = call @prod_sreduction_i32(%s1_i32, %ri) : (tensor<32xi32, #SV>, tensor<i32>) -> tensor<i32>
    %7 = call @prod_sreduction_f32(%s1_f32, %rf) : (tensor<32xf32, #SV>, tensor<f32>) -> tensor<f32>
    %8 = call @prod_sreduction_i32(%s0,     %ri) : (tensor<32xi32, #SV>, tensor<i32>) -> tensor<i32>
    %9 = call @prod_xreduction_i32(%s0_i32, %ri) : (tensor<32xi32, #SV>, tensor<i32>) -> tensor<i32>
    %10 = call @prod_xreduction_i32(%s1_i32, %ri) : (tensor<32xi32, #SV>, tensor<i32>) -> tensor<i32>

    // Verify results. Note that the custom reduction gave permission
    // to treat an explicit vs implicit zero differently to compute the
    // full product reduction over stored elements. A "standard" product
    // reduction would have to return 0 for any implicit zero occurrence
    // too. An explicit zero nullifies the product, though, as requested.
    //
    // Derivation of the expected values (init 7 for i32, 2.0 for f32):
    //   %0: dense i32 input contains zeros               -> 0
    //   %1: dense f32 input contains zeros               -> 0
    //   %2: stored nonzeros 7*3*7*3, times init 7        -> 3087
    //   %3: stored nonzeros 3.5*2.0, times init 2.0      -> 14
    //   %4: no zeros, 7*3*7*3, times init 7              -> 3087
    //   %5: no zeros, 3.5*2.0*3.0*4.0, times init 2.0    -> 168
    //   %6: same data as %4 but sparse                   -> 3087
    //   %7: same data as %5 but sparse                   -> 168
    //   %8: explicit stored zero nullifies the product   -> 0
    //   %9: densified, implicit zeros become explicit    -> 0
    //  %10: densified, but input has no zeros            -> 3087
    //
    // CHECK: 0
    // CHECK: 0
    // CHECK: 3087
    // CHECK: 14
    // CHECK: 3087
    // CHECK: 168
    // CHECK: 3087
    // CHECK: 168
    // CHECK: 0
    // CHECK: 0
    // CHECK: 3087
    //
    call @dump_i32(%0) : (tensor<i32>) -> ()
    call @dump_f32(%1) : (tensor<f32>) -> ()
    call @dump_i32(%2) : (tensor<i32>) -> ()
    call @dump_f32(%3) : (tensor<f32>) -> ()
    call @dump_i32(%4) : (tensor<i32>) -> ()
    call @dump_f32(%5) : (tensor<f32>) -> ()
    call @dump_i32(%6) : (tensor<i32>) -> ()
    call @dump_f32(%7) : (tensor<f32>) -> ()
    call @dump_i32(%8) : (tensor<i32>) -> ()
    call @dump_i32(%9) : (tensor<i32>) -> ()
    call @dump_i32(%10) : (tensor<i32>) -> ()

    // Release the resources.
    bufferization.dealloc_tensor %d0_i32 : tensor<32xi32, #DV>
    bufferization.dealloc_tensor %d0_f32 : tensor<32xf32, #DV>
    bufferization.dealloc_tensor %s0_i32 : tensor<32xi32, #SV>
    bufferization.dealloc_tensor %s0_f32 : tensor<32xf32, #SV>
    bufferization.dealloc_tensor %d1_i32 : tensor<32xi32, #DV>
    bufferization.dealloc_tensor %d1_f32 : tensor<32xf32, #DV>
    bufferization.dealloc_tensor %s1_i32 : tensor<32xi32, #SV>
    bufferization.dealloc_tensor %s1_f32 : tensor<32xf32, #SV>
    bufferization.dealloc_tensor %s0     : tensor<32xi32, #SV>
    bufferization.dealloc_tensor %0 : tensor<i32>
    bufferization.dealloc_tensor %1 : tensor<f32>
    bufferization.dealloc_tensor %2 : tensor<i32>
    bufferization.dealloc_tensor %3 : tensor<f32>
    bufferization.dealloc_tensor %4 : tensor<i32>
    bufferization.dealloc_tensor %5 : tensor<f32>
    bufferization.dealloc_tensor %6 : tensor<i32>
    bufferization.dealloc_tensor %7 : tensor<f32>
    bufferization.dealloc_tensor %8 : tensor<i32>
    bufferization.dealloc_tensor %9 : tensor<i32>
    bufferization.dealloc_tensor %10 : tensor<i32>

    return
  }
288}
289