// NOTE: this test requires gpu-sm80 and cusparselt
//
// DEFINE: %{compile} = mlir-opt %s \
// DEFINE:   --sparsifier="enable-gpu-libgen gpu-triple=nvptx64-nvidia-cuda gpu-chip=sm_80 gpu-features=+ptx71 gpu-format=%gpu_compilation_format
// DEFINE: %{run} = mlir-runner \
// DEFINE:   --shared-libs=%mlir_cuda_runtime \
// DEFINE:   --shared-libs=%mlir_c_runner_utils \
// DEFINE:   --e main --entry-point-result=void \
// DEFINE: | FileCheck %s
//
// with RT lib:
//
// RUN: %{compile} enable-runtime-library=true" | %{run}
//
// without RT lib:
//
// RUN: %{compile} enable-runtime-library=false" | %{run}

// The NV 2:4 encoding: rows are dense, and within every group of four
// consecutive elements along a row at most two may be nonzero.
#NV_24 = #sparse_tensor.encoding<{
  map = ( i, j ) ->
  ( i            : dense,
    j floordiv 4 : dense,
    j mod 4      : structured[2, 4]
  )
}>

module {

  llvm.func @mgpuCreateSparseLtEnv()
  llvm.func @mgpuDestroySparseLtEnv()

  // Computes D = A * B + C, with A converted to the 2:4 structured-sparse
  // layout so the GPU path can use cuSPARSELt.
  func.func @matmul24(%Ad: tensor<16x16xf16>,
                      %B: tensor<16x16xf16>,
                      %Cin: tensor<16x16xf16>) -> tensor<16x16xf16> {
    %A = sparse_tensor.convert %Ad : tensor<16x16xf16> to tensor<16x16xf16, #NV_24>
    %C = linalg.matmul
           ins(%A, %B: tensor<16x16xf16, #NV_24>, tensor<16x16xf16>)
           outs(%Cin: tensor<16x16xf16>) -> tensor<16x16xf16>
    return %C : tensor<16x16xf16>
  }

  func.func @main() {
    llvm.call @mgpuCreateSparseLtEnv() : () -> ()

    %c0  = arith.constant 0 : index
    %c1  = arith.constant 1 : index
    %c16 = arith.constant 16 : index

    %f0 = arith.constant 0.0 : f16
    %f1 = arith.constant 1.0 : f16
    %f4 = arith.constant 4.0 : f16

    // Initial A, B, C matrices:
    //   A has 4 in the even columns and 1 in the odd columns,
    //   B is the 16x16 identity, and C is all zeros.
    %A = tensor.generate {
    ^bb0(%i: index, %j: index):
      %val = arith.andi %j, %c1 : index
      %cmp = arith.cmpi eq, %val, %c0 : index
      %res = arith.select %cmp, %f4, %f1 : f16
      tensor.yield %res : f16
    } : tensor<16x16xf16>
    %B = tensor.generate {
    ^bb0(%i: index, %j: index):
      %cmp = arith.cmpi eq, %i, %j : index
      %res = arith.select %cmp, %f1, %f0 : f16
      tensor.yield %res : f16
    } : tensor<16x16xf16>
    %C = tensor.generate {
    ^bb0(%i: index, %j: index):
      tensor.yield %f0 : f16
    } : tensor<16x16xf16>

    // Call the kernel.
    //
    // By effectively computing D = A B + C with id(B) and zero(C),
    // the resulting matrix D is the pruned A, returned to the caller.
    //
    %D = call @matmul24(%A, %B, %C): (tensor<16x16xf16>,
                                      tensor<16x16xf16>,
                                      tensor<16x16xf16>) -> (tensor<16x16xf16>)

    //
    // This was the original matrix.
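    // (Rows of 4, 1, 4, 1, ...: 2:4 pruning keeps the two largest of every
    // four consecutive values, i.e. the two 4s in each group of four columns,
    // and zeroes out the 1s, as the second set of checks below verifies.)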
    //
    // CHECK:      ( 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1 )
    // CHECK-NEXT: ( 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1 )
    // CHECK-NEXT: ( 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1 )
    // CHECK-NEXT: ( 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1 )
    // CHECK-NEXT: ( 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1 )
    // CHECK-NEXT: ( 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1 )
    // CHECK-NEXT: ( 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1 )
    // CHECK-NEXT: ( 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1 )
    // CHECK-NEXT: ( 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1 )
    // CHECK-NEXT: ( 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1 )
    // CHECK-NEXT: ( 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1 )
    // CHECK-NEXT: ( 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1 )
    // CHECK-NEXT: ( 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1 )
    // CHECK-NEXT: ( 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1 )
    // CHECK-NEXT: ( 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1 )
    // CHECK-NEXT: ( 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1 )
    //
    scf.for %i = %c0 to %c16 step %c1 {
      %va = vector.transfer_read %A[%i, %c0], %f0 : tensor<16x16xf16>, vector<16xf16>
      vector.print %va : vector<16xf16>
    }

    //
    // This is the STRIP-pruned matrix.
    //
    // CHECK-NEXT: ( 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0 )
    // CHECK-NEXT: ( 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0 )
    // CHECK-NEXT: ( 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0 )
    // CHECK-NEXT: ( 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0 )
    // CHECK-NEXT: ( 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0 )
    // CHECK-NEXT: ( 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0 )
    // CHECK-NEXT: ( 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0 )
    // CHECK-NEXT: ( 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0 )
    // CHECK-NEXT: ( 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0 )
    // CHECK-NEXT: ( 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0 )
    // CHECK-NEXT: ( 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0 )
    // CHECK-NEXT: ( 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0 )
    // CHECK-NEXT: ( 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0 )
    // CHECK-NEXT: ( 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0 )
    // CHECK-NEXT: ( 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0 )
    // CHECK-NEXT: ( 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0 )
    //
    scf.for %i = %c0 to %c16 step %c1 {
      %vd = vector.transfer_read %D[%i, %c0], %f0 : tensor<16x16xf16>, vector<16xf16>
      vector.print %vd : vector<16xf16>
    }

    llvm.call @mgpuDestroySparseLtEnv() : () -> ()
    return
  }
}