// NOTE: this test requires gpu-sm80 and cusparselt
//
// DEFINE: %{compile} = mlir-opt %s \
// DEFINE:   --sparsifier="enable-gpu-libgen gpu-triple=nvptx64-nvidia-cuda gpu-chip=sm_80 gpu-features=+ptx71 gpu-format=%gpu_compilation_format
// DEFINE: %{run} = mlir-runner \
// DEFINE:   --shared-libs=%mlir_cuda_runtime \
// DEFINE:   --shared-libs=%mlir_c_runner_utils \
// DEFINE:   --e main --entry-point-result=void \
// DEFINE: | FileCheck %s
//
// with RT lib:
//
// RUN: %{compile} enable-runtime-library=true"  | %{run}
//
// without RT lib:
//
// RUN: %{compile} enable-runtime-library=false" | %{run}

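// The #NV_24 encoding below expresses NVIDIA 2:4 structured sparsity:
// rows stay dense, each row is split into groups of four consecutive
// columns (j floordiv 4), and within each group (j mod 4) at most two
// of the four values may be nonzero.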
#NV_24 = #sparse_tensor.encoding<{
  map = ( i, j ) ->
  ( i            : dense,
    j floordiv 4 : dense,
    j mod 4      : structured[2, 4]
  )
}>

module {

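  // Entry points from the MLIR CUDA runtime wrappers that set up and
  // tear down the cuSPARSELt environment used by the generated kernel.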
  llvm.func @mgpuCreateSparseLtEnv()
  llvm.func @mgpuDestroySparseLtEnv()

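  // Computes D = A * B + Cin in f16. Converting A to the 2:4 layout lets
  // the sparsifier lower this matmul onto the cuSPARSELt library path.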
  func.func @matmul24(%Ad: tensor<16x16xf16>,
                      %B: tensor<16x16xf16>,
                      %Cin: tensor<16x16xf16>) -> tensor<16x16xf16> {
    %A = sparse_tensor.convert %Ad : tensor<16x16xf16> to tensor<16x16xf16, #NV_24>
    %C = linalg.matmul
      ins(%A, %B: tensor<16x16xf16, #NV_24>, tensor<16x16xf16>)
      outs(%Cin: tensor<16x16xf16>) -> tensor<16x16xf16>
    return %C : tensor<16x16xf16>
  }

  func.func @main() {
    llvm.call @mgpuCreateSparseLtEnv() : () -> ()

    %c0 = arith.constant 0 : index
    %c1 = arith.constant 1 : index
    %c16 = arith.constant 16 : index

    %f0 = arith.constant 0.0 : f16
    %f1 = arith.constant 1.0 : f16
    %f4 = arith.constant 4.0 : f16

    // Initial A, B, C matrices: A alternates 4 (even j) and 1 (odd j)
    // along each row, B is the identity, and C is all zeros.
    %A = tensor.generate {
    ^bb0(%i: index, %j: index):
      %val = arith.andi %j, %c1 : index
      %cmp = arith.cmpi eq, %val, %c0 : index
      %res = arith.select %cmp, %f4, %f1 : f16
      tensor.yield %res : f16
    } : tensor<16x16xf16>
    %B = tensor.generate {
    ^bb0(%i: index, %j: index):
      %cmp = arith.cmpi eq, %i, %j : index
      %res = arith.select %cmp, %f1, %f0 : f16
      tensor.yield %res : f16
    } : tensor<16x16xf16>
    %C = tensor.generate {
    ^bb0(%i: index, %j: index):
      tensor.yield %f0 : f16
    } : tensor<16x16xf16>

    // Call the kernel.
    //
    // By computing D = A B + C with B the identity and C zero, the
    // kernel hands the pruned A back to the caller.
    //
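    // STRIP pruning keeps the two largest-magnitude values in every
    // 1x4 strip, so each row strip ( 4, 1, 4, 1 ) of A is expected to
    // come back as ( 4, 0, 4, 0 ) in D.
    //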
    %D = call @matmul24(%A, %B, %C): (tensor<16x16xf16>,
                                      tensor<16x16xf16>,
                                      tensor<16x16xf16>) -> (tensor<16x16xf16>)

    //
    // This was the original matrix.
    //
    // CHECK:      ( 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1 )
    // CHECK-NEXT: ( 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1 )
    // CHECK-NEXT: ( 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1 )
    // CHECK-NEXT: ( 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1 )
    // CHECK-NEXT: ( 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1 )
    // CHECK-NEXT: ( 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1 )
    // CHECK-NEXT: ( 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1 )
    // CHECK-NEXT: ( 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1 )
    // CHECK-NEXT: ( 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1 )
    // CHECK-NEXT: ( 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1 )
    // CHECK-NEXT: ( 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1 )
    // CHECK-NEXT: ( 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1 )
    // CHECK-NEXT: ( 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1 )
    // CHECK-NEXT: ( 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1 )
    // CHECK-NEXT: ( 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1 )
    // CHECK-NEXT: ( 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1 )
    //
    scf.for %i = %c0 to %c16 step %c1 {
      %va = vector.transfer_read %A[%i, %c0], %f0 : tensor<16x16xf16>, vector<16xf16>
      vector.print %va : vector<16xf16>
    }

    //
    // This is the STRIP-pruned matrix.
    //
    // CHECK-NEXT: ( 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0 )
    // CHECK-NEXT: ( 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0 )
    // CHECK-NEXT: ( 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0 )
    // CHECK-NEXT: ( 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0 )
    // CHECK-NEXT: ( 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0 )
    // CHECK-NEXT: ( 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0 )
    // CHECK-NEXT: ( 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0 )
    // CHECK-NEXT: ( 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0 )
    // CHECK-NEXT: ( 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0 )
    // CHECK-NEXT: ( 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0 )
    // CHECK-NEXT: ( 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0 )
    // CHECK-NEXT: ( 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0 )
    // CHECK-NEXT: ( 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0 )
    // CHECK-NEXT: ( 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0 )
    // CHECK-NEXT: ( 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0 )
    // CHECK-NEXT: ( 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0 )
    //
    scf.for %i = %c0 to %c16 step %c1 {
      %vd = vector.transfer_read %D[%i, %c0], %f0 : tensor<16x16xf16>, vector<16xf16>
      vector.print %vd : vector<16xf16>
    }

    llvm.call @mgpuDestroySparseLtEnv() : () -> ()
    return
  }
}