// NOTE: this test requires gpu-sm80
//
// The compile command deliberately leaves the --sparsifier option string
// unterminated; each RUN line below appends its enable-runtime-library
// setting and the closing quote.
//
// DEFINE: %{compile} = mlir-opt %s \
// DEFINE:   --sparsifier="enable-gpu-libgen gpu-triple=nvptx64-nvidia-cuda gpu-chip=sm_80 gpu-features=+ptx71 gpu-format=%gpu_compilation_format
// DEFINE: %{run} = \
// DEFINE:   env TENSOR0="%mlir_src_dir/test/Integration/data/test.mtx" \
// DEFINE:   mlir-runner \
// DEFINE:   --shared-libs=%mlir_cuda_runtime \
// DEFINE:   --shared-libs=%mlir_c_runner_utils \
// DEFINE:   --e main --entry-point-result=void \
// DEFINE: | FileCheck %s
//
// with RT lib:
//
// RUN: %{compile} enable-runtime-library=true" | %{run}
//
// without RT lib:
//
// RUN: %{compile} enable-runtime-library=false" | %{run}

// Opaque handle type for the tensor file name returned by the runtime.
!Filename = !llvm.ptr

// CSR storage: dense rows, compressed columns.
#CSR = #sparse_tensor.encoding<{
  map = (d0, d1) -> (d0 : dense, d1 : compressed)
}>

#trait_sampled_dense_dense = {
  indexing_maps = [
    affine_map<(i,j,k) -> (i,k)>,  // A
    affine_map<(i,j,k) -> (k,j)>,  // B
    affine_map<(i,j,k) -> (i,j)>   // S (in/out)
  ],
  iterator_types = ["parallel", "parallel", "reduction"],
  doc = "S(i,j) += spy[S(i,j)] x SUM_k A(i,k) B(k,j)"
}

//
// Integration test that lowers a kernel annotated as sparse to
// actual sparse code, initializes sparse storage schemes, and
// runs the resulting code with the JIT compiler.
//
module {
  // GPU sparse-runtime environment setup/teardown, provided by the
  // CUDA runtime wrappers library (%mlir_cuda_runtime).
  llvm.func @mgpuCreateSparseEnv()
  llvm.func @mgpuDestroySparseEnv()

  //
  // A kernel that computes a sampled dense matrix matrix multiplication
  // using a "spy" function and in-place update of the sampling sparse matrix.
  //
  func.func @sampled_dense_dense(%args: tensor<?x?xf32, #CSR>,
                                 %arga: tensor<?x?xf32>,
                                 %argb: tensor<?x?xf32>) -> tensor<?x?xf32, #CSR> {
    %result = linalg.generic #trait_sampled_dense_dense
      ins(%arga, %argb: tensor<?x?xf32>, tensor<?x?xf32>)
      outs(%args: tensor<?x?xf32, #CSR>) {
        ^bb(%a: f32, %b: f32, %s: f32):
           %f0 = arith.constant 0.0 : f32
           // "Spy" on the sampling matrix: the present region contributes
           // the product A(i,k) * B(k,j) only where S stores an entry;
           // the empty absent region means unstored positions contribute
           // nothing, keeping the output pattern identical to S.
           %u = sparse_tensor.unary %s : f32 to f32
             present={
                ^bb0(%p: f32):
                  %mul = arith.mulf %a, %b : f32
                  sparse_tensor.yield %mul : f32
             }
             absent={}
           // Custom reduction: sum the sampled products over k,
           // starting from identity %f0 (0.0).
           %r = sparse_tensor.reduce %s, %u, %f0 : f32 {
              ^bb0(%p: f32, %q: f32):
                %add = arith.addf %p, %q : f32
                sparse_tensor.yield %add : f32
            }
           linalg.yield %r : f32
    } -> tensor<?x?xf32, #CSR>
    return %result : tensor<?x?xf32, #CSR>
  }

  // Runtime helper; resolves tensor index 0 to the path in env var
  // TENSOR0 (set to test.mtx in the %{run} DEFINE above).
  func.func private @getTensorFilename(index) -> (!Filename)

  //
  // Main driver.
  //
  func.func @main() {
    llvm.call @mgpuCreateSparseEnv() : () -> ()
    // NOTE(review): %d0 is never used below — candidate for removal.
    %d0 = arith.constant 0.0 : f32
    %c0 = arith.constant 0 : index
    %c1 = arith.constant 1 : index
    %c5 = arith.constant 5 : index
    %c10 = arith.constant 10 : index

    // Initialize dense matrices.
    // %a is 5x10 with A(i,j) = i+1; %b is 10x5 with B(i,j) = j+1.
    %a = tensor.generate %c5, %c10 {
      ^bb0(%i: index, %j: index):
        %p = arith.addi %i, %c1 : index
        %q = arith.index_cast %p : index to i32
        %d = arith.sitofp %q : i32 to f32
        tensor.yield %d : f32
    } : tensor<?x?xf32>
    %b = tensor.generate %c10, %c5 {
      ^bb0(%i: index, %j: index):
        %p = arith.addi %j, %c1 : index
        %q = arith.index_cast %p : index to i32
        %d = arith.sitofp %q : i32 to f32
        tensor.yield %d : f32
    } : tensor<?x?xf32>

    // Read the sparse matrix from file, construct sparse storage.
    %fileName = call @getTensorFilename(%c0) : (index) -> (!Filename)
    %s = sparse_tensor.new %fileName : !Filename to tensor<?x?xf32, #CSR>

    // Call the kernel.
    %0 = call @sampled_dense_dense(%s, %a, %b)
       : (tensor<?x?xf32, #CSR>,
          tensor<?x?xf32>, tensor<?x?xf32>) -> tensor<?x?xf32, #CSR>

    //
    // Print the result for verification.
    //
    // CHECK: ---- Sparse Tensor ----
    // CHECK-NEXT: nse = 9
    // CHECK-NEXT: dim = ( 5, 5 )
    // CHECK-NEXT: lvl = ( 5, 5 )
    // CHECK-NEXT: pos[1] : ( 0, 2, 4, 5, 7, 9 )
    // CHECK-NEXT: crd[1] : ( 0, 3, 1, 4, 2, 0, 3, 1, 4 )
    // CHECK-NEXT: values : ( 11, 41.4, 42, 102.5, 93, 44.1, 164, 105.2, 255 )
    // CHECK-NEXT: ----
    sparse_tensor.print %0 : tensor<?x?xf32, #CSR>

    // Create a much sparser sampling matrix (5 entries in an 8x8).
    %t = arith.constant sparse<[[0,0], [0,1], [1,0], [3,4], [7,7]],
                               [1.0, 2.0, 3.0, 4.0, 5.0]
                              > : tensor<8x8xf32>
    %q = sparse_tensor.convert %t : tensor<8x8xf32> to tensor<?x?xf32, #CSR>
    %a2 = arith.constant dense<2.0> : tensor<8x8xf32>
    %b1 = arith.constant dense<1.0> : tensor<8x8xf32>
    // Cast to dynamic shapes to match the kernel's signature.
    %a2c = tensor.cast %a2 : tensor<8x8xf32> to tensor<?x?xf32>
    %b1c = tensor.cast %b1 : tensor<8x8xf32> to tensor<?x?xf32>

    // Call the kernel again.
    %1 = call @sampled_dense_dense(%q, %a2c, %b1c)
       : (tensor<?x?xf32, #CSR>,
          tensor<?x?xf32>, tensor<?x?xf32>) -> tensor<?x?xf32, #CSR>

    //
    // Print the result for verification.
    //
    // CHECK: ---- Sparse Tensor ----
    // CHECK-NEXT: nse = 5
    // CHECK-NEXT: dim = ( 8, 8 )
    // CHECK-NEXT: lvl = ( 8, 8 )
    // CHECK-NEXT: pos[1] : ( 0, 2, 3, 3, 4, 4, 4, 4, 5 )
    // CHECK-NEXT: crd[1] : ( 0, 1, 0, 4, 7 )
    // CHECK-NEXT: values : ( 17, 18, 19, 20, 21 )
    // CHECK-NEXT: ----
    //
    sparse_tensor.print %1 : tensor<?x?xf32, #CSR>

    // Release the resources.
    bufferization.dealloc_tensor %0 : tensor<?x?xf32, #CSR>
    bufferization.dealloc_tensor %1 : tensor<?x?xf32, #CSR>

    llvm.call @mgpuDestroySparseEnv() : () -> ()
    return
  }
}