//--------------------------------------------------------------------------------------------------
// WHEN CREATING A NEW TEST, PLEASE JUST COPY & PASTE WITHOUT EDITS.
//
// Set-up that's shared across all tests in this directory. In principle, this
// config could be moved to lit.local.cfg. However, there are downstream users
// that do not use these LIT config files. Hence this is kept inline.
//
// DEFINE: %{sparsifier_opts} = enable-runtime-library=true
// DEFINE: %{sparsifier_opts_sve} = enable-arm-sve=true %{sparsifier_opts}
// DEFINE: %{compile} = mlir-opt %s --sparsifier="%{sparsifier_opts}"
// DEFINE: %{compile_sve} = mlir-opt %s --sparsifier="%{sparsifier_opts_sve}"
// DEFINE: %{run_libs} = -shared-libs=%mlir_c_runner_utils,%mlir_runner_utils
// DEFINE: %{run_libs_sve} = -shared-libs=%native_mlir_runner_utils,%native_mlir_c_runner_utils
// DEFINE: %{run_opts} = -e main -entry-point-result=void
// DEFINE: %{run} = mlir-runner %{run_opts} %{run_libs}
// DEFINE: %{run_sve} = %mcr_aarch64_cmd --march=aarch64 --mattr="+sve" %{run_opts} %{run_libs_sve}
//
// DEFINE: %{env} =
//--------------------------------------------------------------------------------------------------

// RUN: %{compile} | %{run} | FileCheck %s
//
// Do the same run, but now with direct IR generation.
// REDEFINE: %{sparsifier_opts} = enable-runtime-library=false
// RUN: %{compile} | %{run} | FileCheck %s
//
// Do the same run, but now with direct IR generation and vectorization.
// REDEFINE: %{sparsifier_opts} = enable-runtime-library=false vl=2 reassociate-fp-reductions=true enable-index-optimizations=true
// RUN: %{compile} | %{run} | FileCheck %s
//
// Do the same run, but now with direct IR generation and VLA vectorization.
// RUN: %if mlir_arm_sve_tests %{ %{compile_sve} | %{run_sve} | FileCheck %s %}

// Reductions in this file _are_ supported by the AArch64 SVE backend.

#SV = #sparse_tensor.encoding<{ map = (d0) -> (d0 : compressed) }>

#trait_reduction = {
  indexing_maps = [
    affine_map<(i) -> (i)>, // a
    affine_map<(i) -> ()>   // x (scalar out)
  ],
  iterator_types = ["reduction"],
  doc = "x += OPER_i a(i)"
}

// An example of vector reductions.
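//
// For reference (not part of the FileCheck directives): the printed values
// follow from the data declared in @main and the initial scalars %ri = 7 and
// %rf = 2.0, namely
//   sum i32: 7 + 2 + 1 + 4 + 3 + 9                          = 26
//   sum f32: 2.0 + 1.0 + 4.0 + 3.0 + 2.5 + 2.0 + 4.0 + 9.0  = 27.5
//   or  i32: 7 | 2 | 1 | 4 | 3 | 9                          = 15
//   xor i32: 7 ^ 2 ^ 1 ^ 4 ^ 3 ^ 9                          = 10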
module {

  func.func @sum_reduction_i32(%arga: tensor<32xi32, #SV>,
                               %argx: tensor<i32>) -> tensor<i32> {
    %0 = linalg.generic #trait_reduction
      ins(%arga: tensor<32xi32, #SV>)
      outs(%argx: tensor<i32>) {
        ^bb(%a: i32, %x: i32):
          %0 = arith.addi %x, %a : i32
          linalg.yield %0 : i32
    } -> tensor<i32>
    return %0 : tensor<i32>
  }

  func.func @sum_reduction_f32(%arga: tensor<32xf32, #SV>,
                               %argx: tensor<f32>) -> tensor<f32> {
    %0 = linalg.generic #trait_reduction
      ins(%arga: tensor<32xf32, #SV>)
      outs(%argx: tensor<f32>) {
        ^bb(%a: f32, %x: f32):
          %0 = arith.addf %x, %a : f32
          linalg.yield %0 : f32
    } -> tensor<f32>
    return %0 : tensor<f32>
  }

  func.func @or_reduction_i32(%arga: tensor<32xi32, #SV>,
                              %argx: tensor<i32>) -> tensor<i32> {
    %0 = linalg.generic #trait_reduction
      ins(%arga: tensor<32xi32, #SV>)
      outs(%argx: tensor<i32>) {
        ^bb(%a: i32, %x: i32):
          %0 = arith.ori %x, %a : i32
          linalg.yield %0 : i32
    } -> tensor<i32>
    return %0 : tensor<i32>
  }

  func.func @xor_reduction_i32(%arga: tensor<32xi32, #SV>,
                               %argx: tensor<i32>) -> tensor<i32> {
    %0 = linalg.generic #trait_reduction
      ins(%arga: tensor<32xi32, #SV>)
      outs(%argx: tensor<i32>) {
        ^bb(%a: i32, %x: i32):
          %0 = arith.xori %x, %a : i32
          linalg.yield %0 : i32
    } -> tensor<i32>
    return %0 : tensor<i32>
  }

  func.func @dump_i32(%arg0 : tensor<i32>) {
    %v = tensor.extract %arg0[] : tensor<i32>
    vector.print %v : i32
    return
  }

  func.func @dump_f32(%arg0 : tensor<f32>) {
    %v = tensor.extract %arg0[] : tensor<f32>
    vector.print %v : f32
    return
  }

  func.func @main() {
    %ri = arith.constant dense< 7   > : tensor<i32>
    %rf = arith.constant dense< 2.0 > : tensor<f32>

    %c_0_i32 = arith.constant dense<[
      0, 2, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 4, 0, 0, 0,
      0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9, 0
    ]> : tensor<32xi32>

    %c_0_f32 = arith.constant dense<[
      0.0, 1.0, 0.0, 0.0, 4.0, 0.0, 0.0, 0.0,
      0.0, 0.0, 3.0, 0.0, 0.0, 0.0, 0.0, 0.0,
      0.0, 0.0, 0.0, 0.0, 2.5, 0.0, 0.0, 0.0,
      2.0, 0.0, 0.0, 0.0, 0.0, 4.0, 0.0, 9.0
    ]> : tensor<32xf32>

    // Convert constants to annotated tensors.
    %sparse_input_i32 = sparse_tensor.convert %c_0_i32
      : tensor<32xi32> to tensor<32xi32, #SV>
    %sparse_input_f32 = sparse_tensor.convert %c_0_f32
      : tensor<32xf32> to tensor<32xf32, #SV>

    // Call the kernels.
    %0 = call @sum_reduction_i32(%sparse_input_i32, %ri)
       : (tensor<32xi32, #SV>, tensor<i32>) -> tensor<i32>
    %1 = call @sum_reduction_f32(%sparse_input_f32, %rf)
       : (tensor<32xf32, #SV>, tensor<f32>) -> tensor<f32>
    %2 = call @or_reduction_i32(%sparse_input_i32, %ri)
       : (tensor<32xi32, #SV>, tensor<i32>) -> tensor<i32>
    %3 = call @xor_reduction_i32(%sparse_input_i32, %ri)
       : (tensor<32xi32, #SV>, tensor<i32>) -> tensor<i32>

    // Verify results.
    //
    // CHECK: 26
    // CHECK: 27.5
    // CHECK: 15
    // CHECK: 10
    //
    call @dump_i32(%0) : (tensor<i32>) -> ()
    call @dump_f32(%1) : (tensor<f32>) -> ()
    call @dump_i32(%2) : (tensor<i32>) -> ()
    call @dump_i32(%3) : (tensor<i32>) -> ()

    // Release the resources.
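    // (Both the converted sparse inputs and the kernel result tensors are
    // freed explicitly with bufferization.dealloc_tensor so the test does not
    // leak the buffers allocated for them.)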
    bufferization.dealloc_tensor %sparse_input_i32 : tensor<32xi32, #SV>
    bufferization.dealloc_tensor %sparse_input_f32 : tensor<32xf32, #SV>
    bufferization.dealloc_tensor %0 : tensor<i32>
    bufferization.dealloc_tensor %1 : tensor<f32>
    bufferization.dealloc_tensor %2 : tensor<i32>
    bufferization.dealloc_tensor %3 : tensor<i32>

    return
  }
}