// RUN: mlir-translate -mlir-to-llvmir -split-input-file %s | FileCheck %s

// Checking the translation of the `gpu.binary` & `gpu.launch_func` ops.
module attributes {gpu.container_module} {
  // CHECK: [[ARGS_TY:%.*]] = type { i32, i32 }
  // CHECK: @kernel_module_bin_cst = internal constant [4 x i8] c"BLOB", align 8
  // CHECK: @kernel_module_kernel_kernel_name = private unnamed_addr constant [7 x i8] c"kernel\00", align 1
  gpu.binary @kernel_module [#gpu.object<#nvvm.target, "BLOB">]
  llvm.func @foo() {
    // CHECK: [[ARGS:%.*]] = alloca %{{.*}}, align 8
    // CHECK: [[ARGS_ARRAY:%.*]] = alloca ptr, i64 2, align 8
    // CHECK: [[ARG0:%.*]] = getelementptr inbounds nuw [[ARGS_TY]], ptr [[ARGS]], i32 0, i32 0
    // CHECK: store i32 32, ptr [[ARG0]], align 4
    // CHECK: %{{.*}} = getelementptr ptr, ptr [[ARGS_ARRAY]], i32 0
    // CHECK: store ptr [[ARG0]], ptr %{{.*}}, align 8
    // CHECK: [[ARG1:%.*]] = getelementptr inbounds nuw [[ARGS_TY]], ptr [[ARGS]], i32 0, i32 1
    // CHECK: store i32 32, ptr [[ARG1]], align 4
    // CHECK: %{{.*}} = getelementptr ptr, ptr [[ARGS_ARRAY]], i32 1
    // CHECK: store ptr [[ARG1]], ptr %{{.*}}, align 8
    // CHECK: [[MODULE:%.*]] = call ptr @mgpuModuleLoad(ptr @kernel_module_bin_cst, i64 4)
    // CHECK: [[FUNC:%.*]] = call ptr @mgpuModuleGetFunction(ptr [[MODULE]], ptr @kernel_module_kernel_kernel_name)
    // CHECK: [[STREAM:%.*]] = call ptr @mgpuStreamCreate()
    // CHECK: call void @mgpuLaunchKernel(ptr [[FUNC]], i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i32 256, ptr [[STREAM]], ptr [[ARGS_ARRAY]], ptr null, i64 2)
    // CHECK: call void @mgpuStreamSynchronize(ptr [[STREAM]])
    // CHECK: call void @mgpuStreamDestroy(ptr [[STREAM]])
    // CHECK: call void @mgpuModuleUnload(ptr [[MODULE]])
    %0 = llvm.mlir.constant(8 : index) : i64
    %1 = llvm.mlir.constant(32 : i32) : i32
    %2 = llvm.mlir.constant(256 : i32) : i32
    gpu.launch_func @kernel_module::@kernel blocks in (%0, %0, %0) threads in (%0, %0, %0) : i64 dynamic_shared_memory_size %2 args(%1 : i32, %1 : i32)
    llvm.return
  }
}

// -----

// Checking the correct selection of the second object using an index as a selector.
module {
  // CHECK: @kernel_module_bin_cst = internal constant [1 x i8] c"1", align 8
  gpu.binary @kernel_module <#gpu.select_object<1>> [#gpu.object<#nvvm.target, "0">, #gpu.object<#nvvm.target, "1">]
}

// -----

// Checking the correct selection of the second object using a target as a selector.
module {
  // CHECK: @kernel_module_bin_cst = internal constant [6 x i8] c"AMDGPU", align 8
  gpu.binary @kernel_module <#gpu.select_object<#rocdl.target>> [#gpu.object<#nvvm.target, "NVPTX">, #gpu.object<#rocdl.target, "AMDGPU">]
}

// -----

// Checking the correct selection of the second object using a SPIR-V target as a selector.
module {
  // CHECK: @kernel_module_bin_cst = internal constant [4 x i8] c"BLOB", align 8
  gpu.binary @kernel_module <#gpu.select_object<#spirv.target_env<#spirv.vce<v1.0, [Addresses, Int64, Kernel], []>, api=OpenCL, #spirv.resource_limits<>>>> [#gpu.object<#nvvm.target, "NVPTX">, #gpu.object<#spirv.target_env<#spirv.vce<v1.0, [Addresses, Int64, Kernel], []>, api=OpenCL, #spirv.resource_limits<>>, "BLOB">]
}

// -----

// Checking the translation of `gpu.launch_func` with an async dependency.
module attributes {gpu.container_module} {
  // CHECK: @kernel_module_bin_cst = internal constant [4 x i8] c"BLOB", align 8
  gpu.binary @kernel_module [#gpu.object<#rocdl.target, "BLOB">]
  llvm.func @foo() {
    %0 = llvm.mlir.constant(8 : index) : i64
    // CHECK: = call ptr @mgpuStreamCreate()
    // CHECK-NEXT: = alloca {{.*}}, align 8
    // CHECK-NEXT: [[ARGS:%.*]] = alloca ptr, i64 0, align 8
    // CHECK-NEXT: [[MODULE:%.*]] = call ptr @mgpuModuleLoad(ptr @kernel_module_bin_cst, i64 4)
    // CHECK-NEXT: [[FUNC:%.*]] = call ptr @mgpuModuleGetFunction(ptr [[MODULE]], ptr @kernel_module_kernel_kernel_name)
    // CHECK-NEXT: call void @mgpuLaunchKernel(ptr [[FUNC]], i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i32 0, ptr {{.*}}, ptr [[ARGS]], ptr null, i64 0)
    // CHECK-NEXT: call void @mgpuModuleUnload(ptr [[MODULE]])
    // CHECK-NEXT: call void @mgpuStreamSynchronize(ptr %{{.*}})
    // CHECK-NEXT: call void @mgpuStreamDestroy(ptr %{{.*}})
    %1 = llvm.call @mgpuStreamCreate() : () -> !llvm.ptr
    gpu.launch_func <%1 : !llvm.ptr> @kernel_module::@kernel blocks in (%0, %0, %0) threads in (%0, %0, %0) : i64
    llvm.call @mgpuStreamSynchronize(%1) : (!llvm.ptr) -> ()
    llvm.call @mgpuStreamDestroy(%1) : (!llvm.ptr) -> ()
    llvm.return
  }
  llvm.func @mgpuStreamCreate() -> !llvm.ptr
  llvm.func @mgpuStreamSynchronize(!llvm.ptr)
  llvm.func @mgpuStreamDestroy(!llvm.ptr)
}

// -----

// Test cluster/block/thread syntax.
module attributes {gpu.container_module} {
  // CHECK: @kernel_module_bin_cst = internal constant [4 x i8] c"BLOB", align 8
  gpu.binary @kernel_module [#gpu.object<#nvvm.target, "BLOB">]
  llvm.func @foo() {
    // CHECK: [[S2:%.*]] = alloca ptr, i64 0, align 8
    // CHECK: [[S3:%.*]] = call ptr @mgpuModuleLoad(ptr @kernel_module_bin_cst, i64 4)
    // CHECK: [[S4:%.*]] = call ptr @mgpuModuleGetFunction(ptr [[S3]], ptr @kernel_module_kernel_kernel_name)
    // CHECK: [[S5:%.*]] = call ptr @mgpuStreamCreate()
    // CHECK: call void @mgpuLaunchClusterKernel(ptr [[S4]], i64 2, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i32 0, ptr [[S5]], ptr [[S2]], ptr null)
    %0 = llvm.mlir.constant(1 : index) : i64
    %1 = llvm.mlir.constant(2 : index) : i64
    gpu.launch_func @kernel_module::@kernel clusters in (%1, %0, %0) blocks in (%0, %0, %0) threads in (%0, %0, %0) : i64
    llvm.return
  }
}

// -----

// Checking that the ELF section is populated.
module attributes {gpu.container_module} {
  // CHECK: @cuda_device_mod_bin_cst = internal constant [4 x i8] c"BLOB", section "__nv_rel_fatbin", align 8
  gpu.binary @cuda_device_mod [#gpu.object<#nvvm.target, properties = {section = "__nv_rel_fatbin"}, "BLOB">]
}