// RUN: mlir-opt %s --linalg-generalize-named-ops \
// RUN:             --pre-sparsification-rewrite \
// RUN:             --sparse-reinterpret-map \
// RUN:             --sparsification="parallelization-strategy=dense-outer-loop" \
// RUN:             --sparse-gpu-codegen | FileCheck %s

// Regression test: two chained matmuls with sparse right-hand operands are
// pushed through the pass pipeline above, and the output is expected to
// contain one GPU module with two kernels plus the host-side async
// alloc / copy / launch / dealloc / wait sequence around each launch.

// 2-D sparse encoding: dimension d0 is stored dense, d1 is stored compressed.
#CSR = #sparse_tensor.encoding<{ map = (d0, d1) -> (d0 : dense, d1 : compressed) }>

// Expected output, in order:
//   * a gpu.module named @sparse_kernels holding @kernel1 and then @kernel0;
//   * inside the host function @matmuls, two back-to-back groups, each made of
//     five async alloc + memcpy pairs, one async kernel launch, one memcpy
//     that depends on the launch token, five async deallocs and a final wait.
// The first group launches @kernel1, the second launches @kernel0.
//
// CHECK-LABEL: gpu.module @sparse_kernels
// CHECK:       gpu.func @kernel1
// CHECK:       gpu.func @kernel0
//
// CHECK-LABEL: func.func @matmuls
// CHECK:       gpu.alloc async
// CHECK:       gpu.memcpy async
// CHECK:       gpu.alloc async
// CHECK:       gpu.memcpy async
// CHECK:       gpu.alloc async
// CHECK:       gpu.memcpy async
// CHECK:       gpu.alloc async
// CHECK:       gpu.memcpy async
// CHECK:       gpu.alloc async
// CHECK:       gpu.memcpy async
// CHECK:       %[[T1:.*]] = gpu.launch_func async @sparse_kernels::@kernel1 blocks
// CHECK:       gpu.memcpy async [%[[T1]]]
// CHECK:       gpu.dealloc async
// CHECK:       gpu.dealloc async
// CHECK:       gpu.dealloc async
// CHECK:       gpu.dealloc async
// CHECK:       gpu.dealloc async
// CHECK:       gpu.wait
// CHECK:       gpu.alloc async
// CHECK:       gpu.memcpy async
// CHECK:       gpu.alloc async
// CHECK:       gpu.memcpy async
// CHECK:       gpu.alloc async
// CHECK:       gpu.memcpy async
// CHECK:       gpu.alloc async
// CHECK:       gpu.memcpy async
// CHECK:       gpu.alloc async
// CHECK:       gpu.memcpy async
// CHECK:       %[[T0:.*]] = gpu.launch_func async @sparse_kernels::@kernel0 blocks
// CHECK:       gpu.memcpy async [%[[T0]]]
// CHECK:       gpu.dealloc async
// CHECK:       gpu.dealloc async
// CHECK:       gpu.dealloc async
// CHECK:       gpu.dealloc async
// CHECK:       gpu.dealloc async
// CHECK:       gpu.wait
//
func.func @matmuls(%A: tensor<1024x8xf64>,
                   %B: tensor<8x1024xf64, #CSR>,
                   %C: tensor<1024x1024xf64, #CSR>) -> tensor<1024x1024xf64> {
  // Zero-filled dense tensor passed as the `outs` operand of both matmuls.
  %Z = arith.constant dense<0.0> : tensor<1024x1024xf64>
  // First product: dense %A (1024x8) times sparse %B (8x1024), dense result.
  %T = linalg.matmul
      ins(%A, %B: tensor<1024x8xf64>, tensor<8x1024xf64, #CSR>)
      outs(%Z: tensor<1024x1024xf64>) -> tensor<1024x1024xf64>
  // Second product: the dense intermediate %T times sparse %C (1024x1024).
  %D = linalg.matmul
      ins(%T, %C: tensor<1024x1024xf64>, tensor<1024x1024xf64, #CSR>)
      outs(%Z: tensor<1024x1024xf64>) -> tensor<1024x1024xf64>
  return %D : tensor<1024x1024xf64>
}