// RUN: mlir-translate -mlir-to-llvmir -split-input-file %s | FileCheck %s

// Checking the translation of the `gpu.binary` & `gpu.launch_func` ops.
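// The kernel arguments are packed into a parameter struct and an array of
// pointers to its fields, and the launch is lowered to calls into the runtime
// wrappers mgpuModuleLoad, mgpuModuleGetFunction, mgpuStreamCreate,
// mgpuLaunchKernel, mgpuStreamSynchronize, mgpuStreamDestroy, and
// mgpuModuleUnload, as the CHECK lines below spell out.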
module attributes {gpu.container_module} {
  // CHECK: [[ARGS_TY:%.*]] = type { i32, i32 }
  // CHECK: @kernel_module_bin_cst = internal constant [4 x i8] c"BLOB", align 8
  // CHECK: @kernel_module_kernel_kernel_name = private unnamed_addr constant [7 x i8] c"kernel\00", align 1
  gpu.binary @kernel_module  [#gpu.object<#nvvm.target, "BLOB">]
  llvm.func @foo() {
    // CHECK: [[ARGS:%.*]] = alloca %{{.*}}, align 8
    // CHECK: [[ARGS_ARRAY:%.*]] = alloca ptr, i64 2, align 8
    // CHECK: [[ARG0:%.*]] = getelementptr inbounds nuw [[ARGS_TY]], ptr [[ARGS]], i32 0, i32 0
    // CHECK: store i32 32, ptr [[ARG0]], align 4
    // CHECK: %{{.*}} = getelementptr ptr, ptr [[ARGS_ARRAY]], i32 0
    // CHECK: store ptr [[ARG0]], ptr %{{.*}}, align 8
    // CHECK: [[ARG1:%.*]] = getelementptr inbounds nuw [[ARGS_TY]], ptr [[ARGS]], i32 0, i32 1
    // CHECK: store i32 32, ptr [[ARG1]], align 4
    // CHECK: %{{.*}} = getelementptr ptr, ptr [[ARGS_ARRAY]], i32 1
    // CHECK: store ptr [[ARG1]], ptr %{{.*}}, align 8
    // CHECK: [[MODULE:%.*]] = call ptr @mgpuModuleLoad(ptr @kernel_module_bin_cst, i64 4)
    // CHECK: [[FUNC:%.*]] = call ptr @mgpuModuleGetFunction(ptr [[MODULE]], ptr @kernel_module_kernel_kernel_name)
    // CHECK: [[STREAM:%.*]] = call ptr @mgpuStreamCreate()
    // CHECK: call void @mgpuLaunchKernel(ptr [[FUNC]], i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i32 256, ptr [[STREAM]], ptr [[ARGS_ARRAY]], ptr null, i64 2)
    // CHECK: call void @mgpuStreamSynchronize(ptr [[STREAM]])
    // CHECK: call void @mgpuStreamDestroy(ptr [[STREAM]])
    // CHECK: call void @mgpuModuleUnload(ptr [[MODULE]])
    %0 = llvm.mlir.constant(8 : index) : i64
    %1 = llvm.mlir.constant(32 : i32) : i32
    %2 = llvm.mlir.constant(256 : i32) : i32
    gpu.launch_func @kernel_module::@kernel blocks in (%0, %0, %0) threads in (%0, %0, %0) : i64 dynamic_shared_memory_size %2 args(%1 : i32, %1 : i32)
    llvm.return
  }
}

// -----

// Checking the correct selection of the second object using an index as a selector.
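// With `#gpu.select_object<1>`, the object at index 1 ("1") is the one
// embedded as the binary constant.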
module {
  // CHECK: @kernel_module_bin_cst = internal constant [1 x i8] c"1", align 8
  gpu.binary @kernel_module <#gpu.select_object<1>> [#gpu.object<#nvvm.target, "0">, #gpu.object<#nvvm.target, "1">]
}

// -----

// Checking the correct selection of the second object using a target as a selector.
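// With `#gpu.select_object<#rocdl.target>`, the #rocdl.target object
// ("AMDGPU") is embedded rather than the #nvvm.target one.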
module {
  // CHECK: @kernel_module_bin_cst = internal constant [6 x i8] c"AMDGPU", align 8
  gpu.binary @kernel_module <#gpu.select_object<#rocdl.target>> [#gpu.object<#nvvm.target, "NVPTX">, #gpu.object<#rocdl.target, "AMDGPU">]
}

// -----

// Checking the correct selection of the second object using a SPIR-V target as a selector.
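// With a `#spirv.target_env` selector, the matching SPIR-V object ("BLOB")
// is embedded rather than the #nvvm.target one.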
module {
  // CHECK: @kernel_module_bin_cst = internal constant [4 x i8] c"BLOB", align 8
  gpu.binary @kernel_module <#gpu.select_object<#spirv.target_env<#spirv.vce<v1.0, [Addresses, Int64, Kernel], []>, api=OpenCL, #spirv.resource_limits<>>>> [#gpu.object<#nvvm.target, "NVPTX">, #gpu.object<#spirv.target_env<#spirv.vce<v1.0, [Addresses, Int64, Kernel], []>, api=OpenCL, #spirv.resource_limits<>>, "BLOB">]
}

// -----
// Checking the translation of `gpu.launch_func` with an async dependency.
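// The stream produced by the explicit mgpuStreamCreate call is passed as the
// async dependency and forwarded to mgpuLaunchKernel; the lowering itself
// emits no stream creation, synchronization, or destruction.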
module attributes {gpu.container_module} {
  // CHECK: @kernel_module_bin_cst = internal constant [4 x i8] c"BLOB", align 8
  gpu.binary @kernel_module  [#gpu.object<#rocdl.target, "BLOB">]
  llvm.func @foo() {
    %0 = llvm.mlir.constant(8 : index) : i64
    // CHECK: = call ptr @mgpuStreamCreate()
    // CHECK-NEXT: = alloca {{.*}}, align 8
    // CHECK-NEXT: [[ARGS:%.*]] = alloca ptr, i64 0, align 8
    // CHECK-NEXT: [[MODULE:%.*]] = call ptr @mgpuModuleLoad(ptr @kernel_module_bin_cst, i64 4)
    // CHECK-NEXT: [[FUNC:%.*]] = call ptr @mgpuModuleGetFunction(ptr [[MODULE]], ptr @kernel_module_kernel_kernel_name)
    // CHECK-NEXT: call void @mgpuLaunchKernel(ptr [[FUNC]], i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i32 0, ptr {{.*}}, ptr [[ARGS]], ptr null, i64 0)
    // CHECK-NEXT: call void @mgpuModuleUnload(ptr [[MODULE]])
    // CHECK-NEXT: call void @mgpuStreamSynchronize(ptr %{{.*}})
    // CHECK-NEXT: call void @mgpuStreamDestroy(ptr %{{.*}})
    %1 = llvm.call @mgpuStreamCreate() : () -> !llvm.ptr
    gpu.launch_func <%1 : !llvm.ptr> @kernel_module::@kernel blocks in (%0, %0, %0) threads in (%0, %0, %0) : i64
    llvm.call @mgpuStreamSynchronize(%1) : (!llvm.ptr) -> ()
    llvm.call @mgpuStreamDestroy(%1) : (!llvm.ptr) -> ()
    llvm.return
  }
  llvm.func @mgpuStreamCreate() -> !llvm.ptr
  llvm.func @mgpuStreamSynchronize(!llvm.ptr)
  llvm.func @mgpuStreamDestroy(!llvm.ptr)
}

// -----

// Test cluster/block/thread syntax.
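// When a cluster size is given, the launch is lowered to
// mgpuLaunchClusterKernel, which takes the cluster, grid, and block
// dimensions in addition to the usual arguments.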
module attributes {gpu.container_module} {
  // CHECK: @kernel_module_bin_cst = internal constant [4 x i8] c"BLOB", align 8
  gpu.binary @kernel_module  [#gpu.object<#nvvm.target, "BLOB">]
  llvm.func @foo() {
  // CHECK: [[S2:%.*]] = alloca ptr, i64 0, align 8
  // CHECK: [[S3:%.*]] = call ptr @mgpuModuleLoad(ptr @kernel_module_bin_cst, i64 4)
  // CHECK: [[S4:%.*]] = call ptr @mgpuModuleGetFunction(ptr [[S3]], ptr @kernel_module_kernel_kernel_name)
  // CHECK: [[S5:%.*]] = call ptr @mgpuStreamCreate()
  // CHECK: call void @mgpuLaunchClusterKernel(ptr [[S4]], i64 2, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i32 0, ptr [[S5]], ptr [[S2]], ptr null)
    %0 = llvm.mlir.constant(1 : index) : i64
    %1 = llvm.mlir.constant(2 : index) : i64
    gpu.launch_func @kernel_module::@kernel clusters in (%1, %0, %0) blocks in (%0, %0, %0) threads in (%0, %0, %0) : i64
    llvm.return
  }
}

// -----

// Checking that the ELF section is populated.
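// The `section` property of the #gpu.object is attached to the embedded
// binary constant as its section.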
module attributes {gpu.container_module} {
  // CHECK: @cuda_device_mod_bin_cst = internal constant [4 x i8] c"BLOB", section "__nv_rel_fatbin", align 8
  gpu.binary @cuda_device_mod  [#gpu.object<#nvvm.target, properties = {section = "__nv_rel_fatbin"}, "BLOB">]
}