1// RUN: mlir-opt %s | mlir-opt -canonicalize -cse | FileCheck %s 2 3gpu.module @main_kernel { 4 5// CHECK-LABEL: @main_kernel( 6// CHECK-SAME: %[[arg0:.*]]: !nvgpu.tensormap.descriptor 7 gpu.func @main_kernel(%arg0: !nvgpu.tensormap.descriptor< 8 tensor = memref<128x32xf32, 3>, swizzle = none, l2promo = none, 9 oob = zero, interleave = none>) kernel attributes 10 { gpu.known_block_size = array<i32: 128, 1, 1>, 11 gpu.known_grid_size = array<i32: 1, 1, 1> 12 } 13 { 14 // CHECK: %[[c0:.+]] = arith.constant 0 : index 15 // CHECK: %[[S0:.+]] = gpu.thread_id x 16 // CHECK: %[[S1:.+]] = arith.cmpi eq, %[[S0]], %[[c0]] : index 17 // CHECK: %[[S2:.+]] = gpu.dynamic_shared_memory : memref<?xi8, #gpu.address_space<workgroup>> 18 // CHECK: %[[S3:.+]] = memref.view %[[S2]][%[[c0]]][] : memref<?xi8, #gpu.address_space<workgroup>> to memref<128x32xf32, #gpu.address_space<workgroup>> 19 // CHECK: nvgpu.tma.async.store %[[S3]] to %[[arg0]][%[[c0]], %[[c0]]], predicate = %[[S1]] : memref<128x32xf32, #gpu.address_space<workgroup>> -> <tensor = memref<128x32xf32, 3>, swizzle = none, l2promo = none, oob = zero, interleave = none> 20 %c0 = arith.constant 0 : index 21 %0 = gpu.thread_id x 22 %1 = arith.cmpi eq, %0, %c0 : index 23 %2 = gpu.dynamic_shared_memory : memref<?xi8, #gpu.address_space<workgroup>> 24 %view = memref.view %2[%c0][] : memref<?xi8, #gpu.address_space<workgroup>> to memref<128x32xf32, #gpu.address_space<workgroup>> 25 nvgpu.tma.async.store %view to %arg0[%c0, %c0], predicate = %1 : memref<128x32xf32, #gpu.address_space<workgroup>> -> <tensor = memref<128x32xf32, 3>, swizzle = none, l2promo = none, oob = zero, interleave = none> 26 nvvm.cp.async.bulk.commit.group 27 nvvm.cp.async.bulk.wait_group 0 28 gpu.return 29 } 30}