1// RUN: mlir-opt %s | mlir-opt | FileCheck %s 2 3// CHECK-LABEL: func @ldmatrix( 4func.func @ldmatrix(%arg0: memref<?x?xf16, 3>, %x: index, %y: index) { 5// CHECK: nvgpu.ldmatrix %{{.*}}[%{{.*}}, %{{.*}}] 6// CHECK-SAME: {numTiles = 4 : i32, transpose = false} : memref<?x?xf16, 3> -> vector<4x2xf16> 7 %l = nvgpu.ldmatrix %arg0[%x, %y] {numTiles = 4 : i32, transpose = false} : 8 memref<?x?xf16, 3> -> vector<4x2xf16> 9 return 10} 11 12// CHECK-LABEL: func @mma_sync( 13func.func @mma_sync(%arg0: vector<4x2xf16>, 14 %arg1: vector<2x2xf16>, 15 %arg2: vector<2x2xf16>) -> vector<2x2xf16> { 16// CHECK: nvgpu.mma.sync(%{{.*}}, %{{.*}}, %{{.*}}) {mmaShape = [16, 8, 16]} : (vector<4x2xf16>, vector<2x2xf16>, vector<2x2xf16>) -> vector<2x2xf16> 17 %d = nvgpu.mma.sync(%arg0, %arg1, %arg2) {mmaShape = [16, 8, 16]} : 18 (vector<4x2xf16>, vector<2x2xf16>, vector<2x2xf16>) -> vector<2x2xf16> 19 return %d : vector<2x2xf16> 20} 21 22// CHECK-LABEL: func @mma_sp_sync_f16_16832( 23func.func @mma_sp_sync_f16_16832(%arg0: vector<4x2xf16>, 24 %arg1: vector<4x2xf16>, 25 %arg2: vector<2x2xf16>, 26 %arg3: vector<2xi16>) -> vector<2x2xf16> { 27 // CHECK: nvgpu.mma.sp.sync(%{{.*}}, %{{.*}}, %{{.*}}) metadata(%{{.+}}) { 28 // CHECK-SAME: mmaShape = [16, 8, 32] 29 // CHECK-SAME: (vector<4x2xf16>, vector<4x2xf16>, vector<2x2xf16>) -> vector<2x2xf16> 30 %d = nvgpu.mma.sp.sync(%arg0, %arg1, %arg2) metadata(%arg3) {mmaShape = [16, 8, 32]} : 31 (vector<4x2xf16>, vector<4x2xf16>, vector<2x2xf16>) -> vector<2x2xf16> 32 return %d : vector<2x2xf16> 33} 34 35// CHECK-LABEL: func @mma_sp_sync_f16_16816( 36func.func @mma_sp_sync_f16_16816(%arg0: vector<2x2xf16>, 37 %arg1: vector<2x2xf16>, 38 %arg2: vector<2x2xf16>, 39 %arg3: vector<2xi16>) -> vector<2x2xf16> { 40 // CHECK: nvgpu.mma.sp.sync(%{{.*}}, %{{.*}}, %{{.*}}) metadata(%{{.+}}) { 41 // CHECK-SAME: mmaShape = [16, 8, 16] 42 // CHECK-SAME: (vector<2x2xf16>, vector<2x2xf16>, vector<2x2xf16>) -> vector<2x2xf16> 43 %d = nvgpu.mma.sp.sync(%arg0, %arg1, %arg2) metadata(%arg3) {mmaShape = [16, 8, 16]} : 44 (vector<2x2xf16>, vector<2x2xf16>, vector<2x2xf16>) -> vector<2x2xf16> 45 return %d : vector<2x2xf16> 46} 47 48// CHECK-LABEL: func @mma_sp_sync_i8_16864( 49func.func @mma_sp_sync_i8_16864(%arg0: vector<4x4xi8>, 50 %arg1: vector<4x4xi8>, 51 %arg2: vector<2x2xi32>, 52 %arg3: vector<2xi16>) -> vector<2x2xi32> { 53 // CHECK: nvgpu.mma.sp.sync(%{{.*}}, %{{.*}}, %{{.*}}) metadata(%{{.+}}) { 54 // CHECK-SAME: mmaShape = [16, 8, 64] 55 // CHECK-SAME: (vector<4x4xi8>, vector<4x4xi8>, vector<2x2xi32>) -> vector<2x2xi32> 56 %d = nvgpu.mma.sp.sync(%arg0, %arg1, %arg2) metadata(%arg3) {mmaShape = [16, 8, 64]} : 57 (vector<4x4xi8>, vector<4x4xi8>, vector<2x2xi32>) -> vector<2x2xi32> 58 return %d : vector<2x2xi32> 59} 60 61func.func @async_cp(%dst : memref<2x7x5xf32, 3>, %src : memref<4x5xf32>){ 62 // CHECK-LABEL: func @async_cp 63 %c0 = arith.constant 0 : index 64 // CHECK: nvgpu.device_async_copy %{{.*}}[{{.*}}, {{.*}}], %{{.*}}[{{.*}}, {{.*}}, {{.*}}], 4 : memref<4x5xf32> to memref<2x7x5xf32, 3> 65 %0 = nvgpu.device_async_copy %src[%c0, %c0], %dst[%c0, %c0, %c0], 4 : memref<4x5xf32> to memref<2x7x5xf32, 3> 66 // CHECK: %{{.*}} = nvgpu.device_async_create_group 67 %token = nvgpu.device_async_create_group %0 68 // CHECK: nvgpu.device_async_wait %{{.*}} {numGroups = 1 : i32} 69 nvgpu.device_async_wait %token {numGroups = 1 : i32} 70 return 71} 72