//--------------------------------------------------------------------------------------------------
// WHEN CREATING A NEW TEST, PLEASE JUST COPY & PASTE WITHOUT EDITS.
//
// Set-up that's shared across all tests in this directory. In principle, this
// config could be moved to lit.local.cfg. However, there are downstream users
// that do not use these LIT config files. Hence this config is kept inline.
//
// DEFINE: %{sparsifier_opts} = enable-runtime-library=true
// DEFINE: %{sparsifier_opts_sve} = enable-arm-sve=true %{sparsifier_opts}
// DEFINE: %{compile} = mlir-opt %s --sparsifier="%{sparsifier_opts}"
// DEFINE: %{compile_sve} = mlir-opt %s --sparsifier="%{sparsifier_opts_sve}"
// DEFINE: %{run_libs} = -shared-libs=%mlir_c_runner_utils,%mlir_runner_utils
// DEFINE: %{run_libs_sve} = -shared-libs=%native_mlir_runner_utils,%native_mlir_c_runner_utils
// DEFINE: %{run_opts} = -e main -entry-point-result=void
// DEFINE: %{run} = mlir-runner %{run_opts} %{run_libs}
// DEFINE: %{run_sve} = %mcr_aarch64_cmd --march=aarch64 --mattr="+sve" %{run_opts} %{run_libs_sve}
//
// DEFINE: %{env} =
//--------------------------------------------------------------------------------------------------

// RUN: %{compile} | %{run} | FileCheck %s
//
// Do the same run, but now with direct IR generation.
// REDEFINE: %{sparsifier_opts} = enable-runtime-library=false enable-buffer-initialization=true
// RUN: %{compile} | %{run} | FileCheck %s
//
// Do the same run, but now with direct IR generation and vectorization.
// REDEFINE: %{sparsifier_opts} = enable-runtime-library=false enable-buffer-initialization=true vl=2 reassociate-fp-reductions=true enable-index-optimizations=true
// RUN: %{compile} | %{run} | FileCheck %s

#SV = #sparse_tensor.encoding<{ map = (d0) -> (d0 : compressed) }>
#DV = #sparse_tensor.encoding<{ map = (d0) -> (d0 : dense) }>

#trait_reduction = {
  indexing_maps = [
    affine_map<(i) -> (i)>, // a
    affine_map<(i) -> ()>   // x (scalar out)
  ],
  iterator_types = ["reduction"],
  doc = "x += PROD_CUSTOM_i a(i)"
}

// An example of vector reductions.
module {

  // Custom prod reduction: stored i32 elements only.
  func.func @prod_dreduction_i32(%arga: tensor<32xi32, #DV>,
                                 %argx: tensor<i32>) -> tensor<i32> {
    %c = tensor.extract %argx[] : tensor<i32>
    %0 = linalg.generic #trait_reduction
      ins(%arga: tensor<32xi32, #DV>)
      outs(%argx: tensor<i32>) {
      ^bb(%a: i32, %b: i32):
        %1 = sparse_tensor.reduce %a, %b, %c : i32 {
            ^bb0(%x: i32, %y: i32):
              %2 = arith.muli %x, %y : i32
              sparse_tensor.yield %2 : i32
          }
        linalg.yield %1 : i32
    } -> tensor<i32>
    return %0 : tensor<i32>
  }

  // Custom prod reduction: stored f32 elements only.
  func.func @prod_dreduction_f32(%arga: tensor<32xf32, #DV>,
                                 %argx: tensor<f32>) -> tensor<f32> {
    %c = tensor.extract %argx[] : tensor<f32>
    %0 = linalg.generic #trait_reduction
      ins(%arga: tensor<32xf32, #DV>)
      outs(%argx: tensor<f32>) {
      ^bb(%a: f32, %b: f32):
        %1 = sparse_tensor.reduce %a, %b, %c : f32 {
            ^bb0(%x: f32, %y: f32):
              %2 = arith.mulf %x, %y : f32
              sparse_tensor.yield %2 : f32
          }
        linalg.yield %1 : f32
    } -> tensor<f32>
    return %0 : tensor<f32>
  }
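  // For contrast (a sketch only, not exercised by this test): a "standard"
  // product reduction would multiply directly in the linalg.generic body,
  //
  //   ^bb(%a: i32, %b: i32):
  //     %1 = arith.muli %a, %b : i32
  //     linalg.yield %1 : i32
  //
  // which for sparse inputs would have to fold in every implicit zero as
  // well. The custom sparse_tensor.reduce used above restricts the
  // multiplication to stored elements only.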
  // Custom prod reduction: stored i32 elements only.
  func.func @prod_sreduction_i32(%arga: tensor<32xi32, #SV>,
                                 %argx: tensor<i32>) -> tensor<i32> {
    %c = tensor.extract %argx[] : tensor<i32>
    %0 = linalg.generic #trait_reduction
      ins(%arga: tensor<32xi32, #SV>)
      outs(%argx: tensor<i32>) {
      ^bb(%a: i32, %b: i32):
        %1 = sparse_tensor.reduce %a, %b, %c : i32 {
            ^bb0(%x: i32, %y: i32):
              %2 = arith.muli %x, %y : i32
              sparse_tensor.yield %2 : i32
          }
        linalg.yield %1 : i32
    } -> tensor<i32>
    return %0 : tensor<i32>
  }

  // Custom prod reduction: stored f32 elements only.
  func.func @prod_sreduction_f32(%arga: tensor<32xf32, #SV>,
                                 %argx: tensor<f32>) -> tensor<f32> {
    %c = tensor.extract %argx[] : tensor<f32>
    %0 = linalg.generic #trait_reduction
      ins(%arga: tensor<32xf32, #SV>)
      outs(%argx: tensor<f32>) {
      ^bb(%a: f32, %b: f32):
        %1 = sparse_tensor.reduce %a, %b, %c : f32 {
            ^bb0(%x: f32, %y: f32):
              %2 = arith.mulf %x, %y : f32
              sparse_tensor.yield %2 : f32
          }
        linalg.yield %1 : f32
    } -> tensor<f32>
    return %0 : tensor<f32>
  }

  // Custom prod reduction: stored i32 elements and implicit zeros.
  //
  // NOTE: this is a somewhat strange operation, since for most sparse
  //       situations the outcome would always be zero; it is added
  //       to test full functionality and illustrate the subtle differences
  //       between the various custom operations; it would make a bit more
  //       sense for e.g. min/max reductions, although it still would
  //       "densify" the iteration space.
  //
  func.func @prod_xreduction_i32(%arga: tensor<32xi32, #SV>,
                                 %argx: tensor<i32>) -> tensor<i32> {
    %c = tensor.extract %argx[] : tensor<i32>
    %0 = linalg.generic #trait_reduction
      ins(%arga: tensor<32xi32, #SV>)
      outs(%argx: tensor<i32>) {
      ^bb(%a: i32, %b: i32):
        %u = sparse_tensor.unary %a : i32 to i32
          present={
            ^bb0(%x: i32):
              sparse_tensor.yield %x : i32
          } absent={
            ^bb0:
              %c0 = arith.constant 0 : i32
              sparse_tensor.yield %c0 : i32
          }
        %1 = sparse_tensor.reduce %u, %b, %c : i32 {
            ^bb0(%x: i32, %y: i32):
              %2 = arith.muli %x, %y : i32
              sparse_tensor.yield %2 : i32
          }
        linalg.yield %1 : i32
    } -> tensor<i32>
    return %0 : tensor<i32>
  }

  func.func @dump_i32(%arg0 : tensor<i32>) {
    %v = tensor.extract %arg0[] : tensor<i32>
    vector.print %v : i32
    return
  }

  func.func @dump_f32(%arg0 : tensor<f32>) {
    %v = tensor.extract %arg0[] : tensor<f32>
    vector.print %v : f32
    return
  }

  func.func @main() {
    // Note: Constants bufferize to read-only buffers.
    %ri = arith.constant dense< 7 > : tensor<i32>
    %rf = arith.constant dense< 2.0 > : tensor<f32>

    // Vectors with a few zeros.
    %c_0_i32 = arith.constant dense<[
      1, 1, 7, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1,
      1, 1, 1, 1, 3, 0, 1, 1, 1, 1, 1, 0, 1, 1, 7, 3
    ]> : tensor<32xi32>

    %c_0_f32 = arith.constant dense<[
      1.0, 1.0, 1.0, 3.5, 1.0, 1.0, 1.0, 1.0,
      1.0, 0.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0,
      1.0, 0.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0,
      1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0
    ]> : tensor<32xf32>
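    // For reference, the nonzero entries above multiply to
    // 7 * 3 * 7 * 3 = 441 (i32) and 3.5 * 2.0 = 7.0 (f32). Combined with
    // the initial values %ri = 7 and %rf = 2.0, the sparse kernels below
    // produce 441 * 7 = 3087 and 7.0 * 2.0 = 14.0, while the dense kernels
    // also visit the stored zeros and thus produce 0.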
    // Vectors with no zeros.
    %c_1_i32 = arith.constant dense<[
      1, 1, 7, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
      1, 1, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 7, 3
    ]> : tensor<32xi32>

    %c_1_f32 = arith.constant dense<[
      1.0, 1.0, 1.0, 3.5, 1.0, 1.0, 1.0, 1.0,
      1.0, 1.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0,
      1.0, 1.0, 1.0, 1.0, 3.0, 1.0, 1.0, 1.0,
      1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 4.0
    ]> : tensor<32xf32>

    // Convert constants to annotated tensors. Note that the #SV conversions
    // only store nonzero elements, so those vectors have no explicit zeros,
    // only implicit zeros (the #DV conversions store all 32 elements).
    %d0_i32 = sparse_tensor.convert %c_0_i32
      : tensor<32xi32> to tensor<32xi32, #DV>
    %d0_f32 = sparse_tensor.convert %c_0_f32
      : tensor<32xf32> to tensor<32xf32, #DV>
    %s0_i32 = sparse_tensor.convert %c_0_i32
      : tensor<32xi32> to tensor<32xi32, #SV>
    %s0_f32 = sparse_tensor.convert %c_0_f32
      : tensor<32xf32> to tensor<32xf32, #SV>
    %d1_i32 = sparse_tensor.convert %c_1_i32
      : tensor<32xi32> to tensor<32xi32, #DV>
    %d1_f32 = sparse_tensor.convert %c_1_f32
      : tensor<32xf32> to tensor<32xf32, #DV>
    %s1_i32 = sparse_tensor.convert %c_1_i32
      : tensor<32xi32> to tensor<32xi32, #SV>
    %s1_f32 = sparse_tensor.convert %c_1_f32
      : tensor<32xf32> to tensor<32xf32, #SV>

    // Special case: construct a sparse vector with an explicit zero.
    %v0 = arith.constant sparse< [ [1] ], [ 0 ] > : tensor<32xi32>
    %s0 = sparse_tensor.convert %v0 : tensor<32xi32> to tensor<32xi32, #SV>

    // Call the kernels.
    %0 = call @prod_dreduction_i32(%d0_i32, %ri) : (tensor<32xi32, #DV>, tensor<i32>) -> tensor<i32>
    %1 = call @prod_dreduction_f32(%d0_f32, %rf) : (tensor<32xf32, #DV>, tensor<f32>) -> tensor<f32>
    %2 = call @prod_sreduction_i32(%s0_i32, %ri) : (tensor<32xi32, #SV>, tensor<i32>) -> tensor<i32>
    %3 = call @prod_sreduction_f32(%s0_f32, %rf) : (tensor<32xf32, #SV>, tensor<f32>) -> tensor<f32>
    %4 = call @prod_dreduction_i32(%d1_i32, %ri) : (tensor<32xi32, #DV>, tensor<i32>) -> tensor<i32>
    %5 = call @prod_dreduction_f32(%d1_f32, %rf) : (tensor<32xf32, #DV>, tensor<f32>) -> tensor<f32>
    %6 = call @prod_sreduction_i32(%s1_i32, %ri) : (tensor<32xi32, #SV>, tensor<i32>) -> tensor<i32>
    %7 = call @prod_sreduction_f32(%s1_f32, %rf) : (tensor<32xf32, #SV>, tensor<f32>) -> tensor<f32>
    %8 = call @prod_sreduction_i32(%s0, %ri) : (tensor<32xi32, #SV>, tensor<i32>) -> tensor<i32>
    %9 = call @prod_xreduction_i32(%s0_i32, %ri) : (tensor<32xi32, #SV>, tensor<i32>) -> tensor<i32>
    %10 = call @prod_xreduction_i32(%s1_i32, %ri) : (tensor<32xi32, #SV>, tensor<i32>) -> tensor<i32>
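    // For reference: the zero-free vectors multiply to 7 * 3 * 7 * 3 = 441
    // (i32) and 3.5 * 2.0 * 3.0 * 4.0 = 84.0 (f32), so %4 through %7 yield
    // 441 * 7 = 3087 and 84.0 * 2.0 = 168.0. The explicit zero stored in %s0
    // nullifies the product for %8, and densifying %s0_i32 in the xreduction
    // folds in its implicit zeros, so %9 is 0 as well.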
    // Verify results. Note that the custom reduction gives us license to
    // treat explicit and implicit zeros differently, computing the full
    // product reduction over stored elements only. A "standard" product
    // reduction would have to return 0 for any implicit zero occurrence
    // too. An explicit zero still nullifies the product, as requested.
    //
    // CHECK: 0
    // CHECK: 0
    // CHECK: 3087
    // CHECK: 14
    // CHECK: 3087
    // CHECK: 168
    // CHECK: 3087
    // CHECK: 168
    // CHECK: 0
    // CHECK: 0
    // CHECK: 3087
    //
    call @dump_i32(%0) : (tensor<i32>) -> ()
    call @dump_f32(%1) : (tensor<f32>) -> ()
    call @dump_i32(%2) : (tensor<i32>) -> ()
    call @dump_f32(%3) : (tensor<f32>) -> ()
    call @dump_i32(%4) : (tensor<i32>) -> ()
    call @dump_f32(%5) : (tensor<f32>) -> ()
    call @dump_i32(%6) : (tensor<i32>) -> ()
    call @dump_f32(%7) : (tensor<f32>) -> ()
    call @dump_i32(%8) : (tensor<i32>) -> ()
    call @dump_i32(%9) : (tensor<i32>) -> ()
    call @dump_i32(%10) : (tensor<i32>) -> ()

    // Release the resources.
    bufferization.dealloc_tensor %d0_i32 : tensor<32xi32, #DV>
    bufferization.dealloc_tensor %d0_f32 : tensor<32xf32, #DV>
    bufferization.dealloc_tensor %s0_i32 : tensor<32xi32, #SV>
    bufferization.dealloc_tensor %s0_f32 : tensor<32xf32, #SV>
    bufferization.dealloc_tensor %d1_i32 : tensor<32xi32, #DV>
    bufferization.dealloc_tensor %d1_f32 : tensor<32xf32, #DV>
    bufferization.dealloc_tensor %s1_i32 : tensor<32xi32, #SV>
    bufferization.dealloc_tensor %s1_f32 : tensor<32xf32, #SV>
    bufferization.dealloc_tensor %s0 : tensor<32xi32, #SV>
    bufferization.dealloc_tensor %0 : tensor<i32>
    bufferization.dealloc_tensor %1 : tensor<f32>
    bufferization.dealloc_tensor %2 : tensor<i32>
    bufferization.dealloc_tensor %3 : tensor<f32>
    bufferization.dealloc_tensor %4 : tensor<i32>
    bufferization.dealloc_tensor %5 : tensor<f32>
    bufferization.dealloc_tensor %6 : tensor<i32>
    bufferization.dealloc_tensor %7 : tensor<f32>
    bufferization.dealloc_tensor %8 : tensor<i32>
    bufferization.dealloc_tensor %9 : tensor<i32>
    bufferization.dealloc_tensor %10 : tensor<i32>

    return
  }
}