//--------------------------------------------------------------------------------------------------
// WHEN CREATING A NEW TEST, PLEASE JUST COPY & PASTE WITHOUT EDITS.
//
// Set-up that's shared across all tests in this directory. In principle, this
// config could be moved to lit.local.cfg. However, there are downstream users that
// do not use these LIT config files. Hence this is kept inline.
//
// DEFINE: %{sparsifier_opts} = enable-runtime-library=true
// DEFINE: %{sparsifier_opts_sve} = enable-arm-sve=true %{sparsifier_opts}
// DEFINE: %{compile} = mlir-opt %s --sparsifier="%{sparsifier_opts}"
// DEFINE: %{compile_sve} = mlir-opt %s --sparsifier="%{sparsifier_opts_sve}"
// DEFINE: %{run_libs} = -shared-libs=%mlir_c_runner_utils,%mlir_runner_utils
// DEFINE: %{run_libs_sve} = -shared-libs=%native_mlir_runner_utils,%native_mlir_c_runner_utils
// DEFINE: %{run_opts} = -e main -entry-point-result=void
// DEFINE: %{run} = mlir-runner %{run_opts} %{run_libs}
// DEFINE: %{run_sve} = %mcr_aarch64_cmd --march=aarch64 --mattr="+sve" %{run_opts} %{run_libs_sve}
//
// DEFINE: %{env} =
//--------------------------------------------------------------------------------------------------

// RUN: %{compile} | %{run} | FileCheck %s
//
// Do the same run, but now with direct IR generation.
// REDEFINE: %{sparsifier_opts} = enable-runtime-library=false enable-buffer-initialization=true
// RUN: %{compile} | %{run} | FileCheck %s
//
// Do the same run, but now with direct IR generation and vectorization.
// REDEFINE: %{sparsifier_opts} = enable-runtime-library=false vl=2 reassociate-fp-reductions=true enable-index-optimizations=true
// RUN: %{compile} | %{run} | FileCheck %s
//
// Do the same run, but now with direct IR generation and VLA vectorization.
// RUN: %if mlir_arm_sve_tests %{ %{compile_sve} | %{run_sve} | FileCheck %s %}

#SV = #sparse_tensor.encoding<{ map = (d0) -> (d0 : compressed) }>

#trait_cast = {
  indexing_maps = [
    affine_map<(i) -> (i)>,  // A (in)
    affine_map<(i) -> (i)>   // X (out)
  ],
  iterator_types = ["parallel"],
  doc = "X(i) = cast A(i)"
}

//
// Integration test that lowers a kernel annotated as sparse to actual sparse
// code, initializes a matching sparse storage scheme from a dense vector,
// and runs the resulting code with the JIT compiler.
//
module {
  //
  // Various kernels that cast a sparse vector from one type to another.
  // The arith dialect supports the following casts:
  //   sitofp
  //   uitofp
  //   fptosi
  //   fptoui
  //   extf
  //   truncf
  //   extsi
  //   extui
  //   trunci
  //   bitcast
  // Since all casts are "zero preserving" unary operations, lattice computation
  // and conversion to sparse code is straightforward.
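  // For example, sitofp maps the implicit zeros of a sparse vector to 0.0, so
  // each kernel below only needs to visit the stored entries of its input.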
  //
  func.func @sparse_cast_s32_to_f32(%arga: tensor<10xi32, #SV>,
                                    %argb: tensor<10xf32>) -> tensor<10xf32> {
    %0 = linalg.generic #trait_cast
      ins(%arga: tensor<10xi32, #SV>)
      outs(%argb: tensor<10xf32>) {
        ^bb(%a: i32, %x : f32):
          %cst = arith.sitofp %a : i32 to f32
          linalg.yield %cst : f32
    } -> tensor<10xf32>
    return %0 : tensor<10xf32>
  }
  func.func @sparse_cast_u32_to_f32(%arga: tensor<10xi32, #SV>,
                                    %argb: tensor<10xf32>) -> tensor<10xf32> {
    %0 = linalg.generic #trait_cast
      ins(%arga: tensor<10xi32, #SV>)
      outs(%argb: tensor<10xf32>) {
        ^bb(%a: i32, %x : f32):
          %cst = arith.uitofp %a : i32 to f32
          linalg.yield %cst : f32
    } -> tensor<10xf32>
    return %0 : tensor<10xf32>
  }
  func.func @sparse_cast_f32_to_s32(%arga: tensor<10xf32, #SV>,
                                    %argb: tensor<10xi32>) -> tensor<10xi32> {
    %0 = linalg.generic #trait_cast
      ins(%arga: tensor<10xf32, #SV>)
      outs(%argb: tensor<10xi32>) {
        ^bb(%a: f32, %x : i32):
          %cst = arith.fptosi %a : f32 to i32
          linalg.yield %cst : i32
    } -> tensor<10xi32>
    return %0 : tensor<10xi32>
  }
  func.func @sparse_cast_f64_to_u32(%arga: tensor<10xf64, #SV>,
                                    %argb: tensor<10xi32>) -> tensor<10xi32> {
    %0 = linalg.generic #trait_cast
      ins(%arga: tensor<10xf64, #SV>)
      outs(%argb: tensor<10xi32>) {
        ^bb(%a: f64, %x : i32):
          %cst = arith.fptoui %a : f64 to i32
          linalg.yield %cst : i32
    } -> tensor<10xi32>
    return %0 : tensor<10xi32>
  }
  func.func @sparse_cast_f32_to_f64(%arga: tensor<10xf32, #SV>,
                                    %argb: tensor<10xf64>) -> tensor<10xf64> {
    %0 = linalg.generic #trait_cast
      ins(%arga: tensor<10xf32, #SV>)
      outs(%argb: tensor<10xf64>) {
        ^bb(%a: f32, %x : f64):
          %cst = arith.extf %a : f32 to f64
          linalg.yield %cst : f64
    } -> tensor<10xf64>
    return %0 : tensor<10xf64>
  }
  func.func @sparse_cast_f64_to_f32(%arga: tensor<10xf64, #SV>,
                                    %argb: tensor<10xf32>) -> tensor<10xf32> {
    %0 = linalg.generic #trait_cast
      ins(%arga: tensor<10xf64, #SV>)
      outs(%argb: tensor<10xf32>) {
        ^bb(%a: f64, %x : f32):
          %cst = arith.truncf %a : f64 to f32
          linalg.yield %cst : f32
    } -> tensor<10xf32>
    return %0 : tensor<10xf32>
  }
  func.func @sparse_cast_s32_to_u64(%arga: tensor<10xi32, #SV>,
                                    %argb: tensor<10xi64>) -> tensor<10xi64> {
    %0 = linalg.generic #trait_cast
      ins(%arga: tensor<10xi32, #SV>)
      outs(%argb: tensor<10xi64>) {
        ^bb(%a: i32, %x : i64):
          %cst = arith.extsi %a : i32 to i64
          linalg.yield %cst : i64
    } -> tensor<10xi64>
    return %0 : tensor<10xi64>
  }
  func.func @sparse_cast_u32_to_s64(%arga: tensor<10xi32, #SV>,
                                    %argb: tensor<10xi64>) -> tensor<10xi64> {
    %0 = linalg.generic #trait_cast
      ins(%arga: tensor<10xi32, #SV>)
      outs(%argb: tensor<10xi64>) {
        ^bb(%a: i32, %x : i64):
          %cst = arith.extui %a : i32 to i64
          linalg.yield %cst : i64
    } -> tensor<10xi64>
    return %0 : tensor<10xi64>
  }
  func.func @sparse_cast_i32_to_i8(%arga: tensor<10xi32, #SV>,
                                   %argb: tensor<10xi8>) -> tensor<10xi8> {
    %0 = linalg.generic #trait_cast
      ins(%arga: tensor<10xi32, #SV>)
      outs(%argb: tensor<10xi8>) {
        ^bb(%a: i32, %x : i8):
          %cst = arith.trunci %a : i32 to i8
          linalg.yield %cst : i8
    } -> tensor<10xi8>
    return %0 : tensor<10xi8>
  }
  func.func @sparse_cast_f32_as_s32(%arga: tensor<10xf32, #SV>,
                                    %argb: tensor<10xi32>) -> tensor<10xi32> {
    %0 = linalg.generic #trait_cast
      ins(%arga: tensor<10xf32, #SV>)
      outs(%argb: tensor<10xi32>) {
        ^bb(%a: f32, %x : i32):
          %cst = arith.bitcast %a : f32 to i32
          linalg.yield %cst : i32
    } -> tensor<10xi32>
    return %0 : tensor<10xi32>
  }
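
  // Note that all of the kernels above share the same #trait_cast; they differ
  // only in the scalar cast operation and in the element types involved.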

  //
  // Main driver that converts dense tensors into sparse tensors
  // and then calls the various sparse casting kernels.
  //
  func.func @main() {
    %z = arith.constant 0 : index
    %b = arith.constant 0 : i8
    %i = arith.constant 0 : i32
    %l = arith.constant 0 : i64
    %f = arith.constant 0.0 : f32
    %d = arith.constant 0.0 : f64

    %zero_b = arith.constant dense<0> : tensor<10xi8>
    %zero_d = arith.constant dense<0.0> : tensor<10xf64>
    %zero_f = arith.constant dense<0.0> : tensor<10xf32>
    %zero_i = arith.constant dense<0> : tensor<10xi32>
    %zero_l = arith.constant dense<0> : tensor<10xi64>

    // Initialize dense tensors and convert them to sparse vectors.
    %0 = arith.constant dense<[ -4, -3, -2, -1, 0, 1, 2, 3, 4, 305 ]> : tensor<10xi32>
    %1 = sparse_tensor.convert %0 : tensor<10xi32> to tensor<10xi32, #SV>
    %2 = arith.constant dense<[ -4.4, -3.3, -2.2, -1.1, 0.0, 1.1, 2.2, 3.3, 4.4, 305.5 ]> : tensor<10xf32>
    %3 = sparse_tensor.convert %2 : tensor<10xf32> to tensor<10xf32, #SV>
    %4 = arith.constant dense<[ -4.4, -3.3, -2.2, -1.1, 0.0, 1.1, 2.2, 3.3, 4.4, 305.5 ]> : tensor<10xf64>
    %5 = sparse_tensor.convert %4 : tensor<10xf64> to tensor<10xf64, #SV>
    %6 = arith.constant dense<[ 4294967295.0, 4294967294.0, 4294967293.0, 4294967292.0,
                                0.0, 1.1, 2.2, 3.3, 4.4, 305.5 ]> : tensor<10xf64>
    %7 = sparse_tensor.convert %6 : tensor<10xf64> to tensor<10xf64, #SV>

    //
    // CHECK: ( -4, -3, -2, -1, 0, 1, 2, 3, 4, 305 )
    //
    %c0 = call @sparse_cast_s32_to_f32(%1, %zero_f) : (tensor<10xi32, #SV>, tensor<10xf32>) -> tensor<10xf32>
    %v0 = vector.transfer_read %c0[%z], %f: tensor<10xf32>, vector<10xf32>
    vector.print %v0 : vector<10xf32>

    //
    // CHECK: ( 4.29497e+09, 4.29497e+09, 4.29497e+09, 4.29497e+09, 0, 1, 2, 3, 4, 305 )
    //
    %c1 = call @sparse_cast_u32_to_f32(%1, %zero_f) : (tensor<10xi32, #SV>, tensor<10xf32>) -> tensor<10xf32>
    %v1 = vector.transfer_read %c1[%z], %f: tensor<10xf32>, vector<10xf32>
    vector.print %v1 : vector<10xf32>

    //
    // CHECK: ( -4, -3, -2, -1, 0, 1, 2, 3, 4, 305 )
    //
    %c2 = call @sparse_cast_f32_to_s32(%3, %zero_i) : (tensor<10xf32, #SV>, tensor<10xi32>) -> tensor<10xi32>
    %v2 = vector.transfer_read %c2[%z], %i: tensor<10xi32>, vector<10xi32>
    vector.print %v2 : vector<10xi32>

    //
    // CHECK: ( 4294967295, 4294967294, 4294967293, 4294967292, 0, 1, 2, 3, 4, 305 )
    //
    %c3 = call @sparse_cast_f64_to_u32(%7, %zero_i) : (tensor<10xf64, #SV>, tensor<10xi32>) -> tensor<10xi32>
    %v3 = vector.transfer_read %c3[%z], %i: tensor<10xi32>, vector<10xi32>
    // Print the i32 results as unsigned values.
    %vu = vector.bitcast %v3 : vector<10xi32> to vector<10xui32>
    vector.print %vu : vector<10xui32>

    //
    // CHECK: ( -4.4, -3.3, -2.2, -1.1, 0, 1.1, 2.2, 3.3, 4.4, 305.5 )
    //
    %c4 = call @sparse_cast_f32_to_f64(%3, %zero_d) : (tensor<10xf32, #SV>, tensor<10xf64>) -> tensor<10xf64>
    %v4 = vector.transfer_read %c4[%z], %d: tensor<10xf64>, vector<10xf64>
    vector.print %v4 : vector<10xf64>

    //
    // CHECK: ( -4.4, -3.3, -2.2, -1.1, 0, 1.1, 2.2, 3.3, 4.4, 305.5 )
    //
    %c5 = call @sparse_cast_f64_to_f32(%5, %zero_f) : (tensor<10xf64, #SV>, tensor<10xf32>) -> tensor<10xf32>
    %v5 = vector.transfer_read %c5[%z], %f: tensor<10xf32>, vector<10xf32>
    vector.print %v5 : vector<10xf32>
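
    // The next two kernels extend i32 to i64: extsi sign-extends, so the i64
    // results match the original values, while extui zero-extends, so negative
    // inputs reappear as large unsigned values (e.g. -4 becomes 2^32 - 4).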

    //
    // CHECK: ( -4, -3, -2, -1, 0, 1, 2, 3, 4, 305 )
    //
    %c6 = call @sparse_cast_s32_to_u64(%1, %zero_l) : (tensor<10xi32, #SV>, tensor<10xi64>) -> tensor<10xi64>
    %v6 = vector.transfer_read %c6[%z], %l: tensor<10xi64>, vector<10xi64>
    vector.print %v6 : vector<10xi64>

    //
    // CHECK: ( 4294967292, 4294967293, 4294967294, 4294967295, 0, 1, 2, 3, 4, 305 )
    //
    %c7 = call @sparse_cast_u32_to_s64(%1, %zero_l) : (tensor<10xi32, #SV>, tensor<10xi64>) -> tensor<10xi64>
    %v7 = vector.transfer_read %c7[%z], %l: tensor<10xi64>, vector<10xi64>
    vector.print %v7 : vector<10xi64>

    // Truncating 305 to i8 wraps around to 49 (305 mod 256).
    //
    // CHECK: ( -4, -3, -2, -1, 0, 1, 2, 3, 4, 49 )
    //
    %c8 = call @sparse_cast_i32_to_i8(%1, %zero_b) : (tensor<10xi32, #SV>, tensor<10xi8>) -> tensor<10xi8>
    %v8 = vector.transfer_read %c8[%z], %b: tensor<10xi8>, vector<10xi8>
    vector.print %v8 : vector<10xi8>

    // The bitcast kernel reinterprets the raw f32 bit patterns as i32 values.
    //
    // CHECK: ( -1064514355, -1068289229, -1072902963, -1081291571, 0, 1066192077, 1074580685, 1079194419, 1082969293, 1134084096 )
    //
    %c9 = call @sparse_cast_f32_as_s32(%3, %zero_i) : (tensor<10xf32, #SV>, tensor<10xi32>) -> tensor<10xi32>
    %v9 = vector.transfer_read %c9[%z], %i: tensor<10xi32>, vector<10xi32>
    vector.print %v9 : vector<10xi32>

    // Release the resources.
    bufferization.dealloc_tensor %1 : tensor<10xi32, #SV>
    bufferization.dealloc_tensor %3 : tensor<10xf32, #SV>
    bufferization.dealloc_tensor %5 : tensor<10xf64, #SV>
    bufferization.dealloc_tensor %7 : tensor<10xf64, #SV>
    bufferization.dealloc_tensor %c0 : tensor<10xf32>
    bufferization.dealloc_tensor %c1 : tensor<10xf32>
    bufferization.dealloc_tensor %c2 : tensor<10xi32>
    bufferization.dealloc_tensor %c3 : tensor<10xi32>
    bufferization.dealloc_tensor %c4 : tensor<10xf64>
    bufferization.dealloc_tensor %c5 : tensor<10xf32>
    bufferization.dealloc_tensor %c6 : tensor<10xi64>
    bufferization.dealloc_tensor %c7 : tensor<10xi64>
    bufferization.dealloc_tensor %c8 : tensor<10xi8>
    bufferization.dealloc_tensor %c9 : tensor<10xi32>

    return
  }
}