xref: /llvm-project/llvm/test/Analysis/KernelInfo/openmp/amdgpu.ll (revision 18f8106f310ee702046a11f360af47947c030d2e)
1; See ./README.md for how to maintain the LLVM IR in this test.
2
3; REQUIRES: amdgpu-registered-target
4
5; RUN: opt -pass-remarks=kernel-info -passes=kernel-info \
6; RUN:     -disable-output %s 2>&1 | \
7; RUN:   FileCheck -match-full-lines %s
8
9;  CHECK-NOT: remark:
10;      CHECK: remark: test.c:0:0: in artificial function '[[OFF_FUNC:__omp_offloading_[a-f0-9_]*_h_l12]]_debug__', artificial alloca ('%[[#]]') for 'dyn_ptr' with static size of 8 bytes
11; CHECK-NEXT: remark: test.c:14:9: in artificial function '[[OFF_FUNC]]_debug__', alloca ('%[[#]]') for 'i' with static size of 4 bytes
12; CHECK-NEXT: remark: test.c:15:9: in artificial function '[[OFF_FUNC]]_debug__', alloca ('%[[#]]') for 'a' with static size of 8 bytes
13; CHECK-NEXT: remark: <unknown>:0:0: in artificial function '[[OFF_FUNC]]_debug__', 'store' instruction accesses memory in flat address space
14; CHECK-NEXT: remark: test.c:13:3: in artificial function '[[OFF_FUNC]]_debug__', direct call, callee is '@__kmpc_target_init'
15; CHECK-NEXT: remark: test.c:16:5: in artificial function '[[OFF_FUNC]]_debug__', direct call, callee is '@f'
16; CHECK-NEXT: remark: test.c:17:5: in artificial function '[[OFF_FUNC]]_debug__', direct call to defined function, callee is 'g'
17; CHECK-NEXT: remark: test.c:18:3: in artificial function '[[OFF_FUNC]]_debug__', direct call, callee is '@__kmpc_target_deinit'
18; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', ExternalNotKernel = 0
19; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', amdgpu-max-num-workgroups[0] = 4294967295
20; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', amdgpu-max-num-workgroups[1] = 4294967295
21; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', amdgpu-max-num-workgroups[2] = 4294967295
22; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', amdgpu-flat-work-group-size[0] = 1
23; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', amdgpu-flat-work-group-size[1] = 1024
24; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', amdgpu-waves-per-eu[0] = 4
25; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', amdgpu-waves-per-eu[1] = 10
26; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', Allocas = 3
27; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', AllocasStaticSizeSum = 20
28; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', AllocasDyn = 0
29; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', DirectCalls = 4
30; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', IndirectCalls = 0
31; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', DirectCallsToDefinedFunctions = 1
32; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', InlineAssemblyCalls = 0
33; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', Invokes = 0
34; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', FlatAddrspaceAccesses = 1
35
36; CHECK-NEXT: remark: test.c:0:0: in artificial function '[[OFF_FUNC]]', artificial alloca ('%[[#]]') for 'dyn_ptr' with static size of 8 bytes
37; CHECK-NEXT: remark: <unknown>:0:0: in artificial function '[[OFF_FUNC]]', 'store' instruction accesses memory in flat address space
38; CHECK-NEXT: remark: test.c:12:1: in artificial function '[[OFF_FUNC]]', 'load' instruction ('%[[#]]') accesses memory in flat address space
39; CHECK-NEXT: remark: test.c:12:1: in artificial function '[[OFF_FUNC]]', direct call to defined function, callee is artificial '[[OFF_FUNC]]_debug__'
40; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', ExternalNotKernel = 0
41; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', omp_target_thread_limit = 256
42; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', amdgpu-max-num-workgroups[0] = 4294967295
43; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', amdgpu-max-num-workgroups[1] = 4294967295
44; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', amdgpu-max-num-workgroups[2] = 4294967295
45; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', amdgpu-flat-work-group-size[0] = 1
46; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', amdgpu-flat-work-group-size[1] = 256
47; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', amdgpu-waves-per-eu[0] = 1
48; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', amdgpu-waves-per-eu[1] = 10
49; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', Allocas = 1
50; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', AllocasStaticSizeSum = 8
51; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', AllocasDyn = 0
52; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', DirectCalls = 1
53; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', IndirectCalls = 0
54; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', DirectCallsToDefinedFunctions = 1
55; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', InlineAssemblyCalls = 0
56; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', Invokes = 0
57; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', FlatAddrspaceAccesses = 2
58
59; CHECK-NEXT: remark: test.c:4:7: in function 'g', alloca ('%[[#]]') for 'i' with static size of 4 bytes
60; CHECK-NEXT: remark: test.c:5:7: in function 'g', alloca ('%[[#]]') for 'a' with static size of 8 bytes
61; CHECK-NEXT: remark: test.c:6:3: in function 'g', direct call, callee is '@f'
62; CHECK-NEXT: remark: test.c:7:3: in function 'g', direct call to defined function, callee is 'g'
63; CHECK-NEXT: remark: test.c:3:0: in function 'g', ExternalNotKernel = 1
64; CHECK-NEXT: remark: test.c:3:0: in function 'g', amdgpu-max-num-workgroups[0] = 4294967295
65; CHECK-NEXT: remark: test.c:3:0: in function 'g', amdgpu-max-num-workgroups[1] = 4294967295
66; CHECK-NEXT: remark: test.c:3:0: in function 'g', amdgpu-max-num-workgroups[2] = 4294967295
67; CHECK-NEXT: remark: test.c:3:0: in function 'g', amdgpu-flat-work-group-size[0] = 1
68; CHECK-NEXT: remark: test.c:3:0: in function 'g', amdgpu-flat-work-group-size[1] = 1024
69; CHECK-NEXT: remark: test.c:3:0: in function 'g', amdgpu-waves-per-eu[0] = 4
70; CHECK-NEXT: remark: test.c:3:0: in function 'g', amdgpu-waves-per-eu[1] = 10
71; CHECK-NEXT: remark: test.c:3:0: in function 'g', Allocas = 2
72; CHECK-NEXT: remark: test.c:3:0: in function 'g', AllocasStaticSizeSum = 12
73; CHECK-NEXT: remark: test.c:3:0: in function 'g', AllocasDyn = 0
74; CHECK-NEXT: remark: test.c:3:0: in function 'g', DirectCalls = 2
75; CHECK-NEXT: remark: test.c:3:0: in function 'g', IndirectCalls = 0
76; CHECK-NEXT: remark: test.c:3:0: in function 'g', DirectCallsToDefinedFunctions = 1
77; CHECK-NEXT: remark: test.c:3:0: in function 'g', InlineAssemblyCalls = 0
78; CHECK-NEXT: remark: test.c:3:0: in function 'g', Invokes = 0
79; CHECK-NEXT: remark: test.c:3:0: in function 'g', FlatAddrspaceAccesses = 0
80;  CHECK-NOT: {{.}}
81
82; ModuleID = 'test-openmp-amdgcn-amd-amdhsa-gfx906.bc'
; Module-level setup shared by the three functions analysed in this test:
; target description, struct layouts, and the globals the kernel refers to.
83source_filename = "test.c"
; In the datalayout, "A5" selects address space 5 for allocas (every alloca in
; this file is addrspace(5)) and "p5:32:32" gives that space 32-bit pointers;
; the default address space uses 64-bit pointers ("p:64:64"), which is why an
; "alloca ptr" is expected to be reported with a static size of 8 bytes.
84target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9"
85target triple = "amdgcn-amd-amdhsa"
86
; Struct layouts used by the global initializers below.
87%struct.ident_t = type { i32, i32, i32, i32, ptr }
88%struct.DynamicEnvironmentTy = type { i16 }
89%struct.KernelEnvironmentTy = type { %struct.ConfigurationEnvironmentTy, ptr, ptr }
90%struct.ConfigurationEnvironmentTy = type { i8, i8, i8, i32, i32, i32, i32, i32, i32 }
91
; Constant i32 flags, all zero here and not referenced by any function in this
; file. NOTE(review): presumably OpenMP device-runtime configuration knobs.
92@__omp_rtl_debug_kind = weak_odr hidden addrspace(1) constant i32 0
93@__omp_rtl_assume_teams_oversubscription = weak_odr hidden addrspace(1) constant i32 0
94@__omp_rtl_assume_threads_oversubscription = weak_odr hidden addrspace(1) constant i32 0
95@__omp_rtl_assume_no_thread_state = weak_odr hidden addrspace(1) constant i32 0
96@__omp_rtl_assume_no_nested_parallelism = weak_odr hidden addrspace(1) constant i32 0
; @0 is a source-location string naming the _debug__ function at test.c 13:3.
; @1 wraps it in an ident_t; its i32 56 equals the length of @0 without the
; trailing NUL.
97@0 = private unnamed_addr constant [57 x i8] c";test.c;__omp_offloading_fd02_727e9_h_l12_debug__;13;3;;\00", align 1
98@1 = private unnamed_addr addrspace(1) constant %struct.ident_t { i32 0, i32 2, i32 0, i32 56, ptr @0 }, align 8
; Per-kernel environment objects. The kernel environment is passed (cast to a
; generic pointer) as the first argument of __kmpc_target_init in the _debug__
; function. NOTE(review): the i32 256 in the configuration coincides with the
; kernel's "omp_target_thread_limit"="256" attribute - the field meanings are
; not visible in this file; confirm against the OpenMP device runtime.
99@__omp_offloading_fd02_727e9_h_l12_dynamic_environment = weak_odr protected addrspace(1) global %struct.DynamicEnvironmentTy zeroinitializer
100@__omp_offloading_fd02_727e9_h_l12_kernel_environment = weak_odr protected addrspace(1) constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 1, i8 1, i8 1, i32 1, i32 256, i32 -1, i32 -1, i32 0, i32 0 }, ptr addrspacecast (ptr addrspace(1) @1 to ptr), ptr addrspacecast (ptr addrspace(1) @__omp_offloading_fd02_727e9_h_l12_dynamic_environment to ptr) }
101@__oclc_ABI_version = weak_odr hidden local_unnamed_addr addrspace(4) constant i32 500
102
103; Function Attrs: convergent noinline norecurse nounwind optnone
; Internal, non-kernel function that is called from the kernel entry point
; defined after it. It is the first function in the module, so its remarks are
; the first group in the FileCheck expectations at the top of the file.
; Expected totals for this body: 3 allocas (20 bytes), 4 direct calls (1 to a
; defined function) and 1 flat address space access.
104define internal void @__omp_offloading_fd02_727e9_h_l12_debug__(ptr noalias noundef %0) #0 !dbg !15 {
  ; Static allocas in addrspace(5): 8 (ptr) + 4 (i32) + 8 ([2 x i32]) = 20
  ; bytes. They must stay unnamed: the expected remarks match them as a
  ; numbered value ('%' followed by a number).
105  %2 = alloca ptr, align 8, addrspace(5)
106  %3 = alloca i32, align 4, addrspace(5)
107  %4 = alloca [2 x i32], align 4, addrspace(5)
  ; Generic-pointer views of the allocas; only %5 is used afterwards.
108  %5 = addrspacecast ptr addrspace(5) %2 to ptr
109  %6 = addrspacecast ptr addrspace(5) %3 to ptr
110  %7 = addrspacecast ptr addrspace(5) %4 to ptr
  ; The only memory access in this function, made through the generic pointer:
  ; this is the single flat address space access expected here. It carries no
  ; !dbg, which matches the "<unknown>:0:0" location of its expected remark.
111  store ptr %0, ptr %5, align 8
  ; Associates %2 with the artificial variable 'dyn_ptr' (!23) at line 0 (!24).
112    #dbg_declare(ptr addrspace(5) %2, !23, !DIExpression(), !24)
  ; Direct call 1 of 4 (callee is only declared in this module). The body in
  ; block 10 runs only when the call returns -1; otherwise block 11 returns.
113  %8 = call i32 @__kmpc_target_init(ptr addrspacecast (ptr addrspace(1) @__omp_offloading_fd02_727e9_h_l12_kernel_environment to ptr), ptr %0), !dbg !25
114  %9 = icmp eq i32 %8, -1, !dbg !25
115  br i1 %9, label %10, label %11, !dbg !25
116
11710:                                               ; preds = %1
  ; Associate %3 with 'i' (test.c 14:9) and %4 with 'a' (test.c 15:9).
118    #dbg_declare(ptr addrspace(5) %3, !26, !DIExpression(), !29)
119    #dbg_declare(ptr addrspace(5) %4, !30, !DIExpression(), !34)
  ; Direct calls 2-4. Of all four callees only g has a definition in this
  ; module, hence the expected DirectCallsToDefinedFunctions = 1.
120  call void @f() #4, !dbg !35
121  call void @g() #4, !dbg !36
122  call void @__kmpc_target_deinit(), !dbg !37
123  ret void, !dbg !38
124
12511:                                               ; preds = %1
  ; Early exit taken when __kmpc_target_init did not return -1.
126  ret void, !dbg !25
127}
128
129; Function Attrs: convergent mustprogress noinline norecurse nounwind optnone
; Kernel entry point (amdgpu_kernel calling convention, attribute group #1).
; It spills its argument to a stack slot, reloads it through a generic pointer
; and forwards it to the _debug__ function.
; Expected totals for this body: 1 alloca (8 bytes), 1 direct call (to a
; defined function) and 2 flat address space accesses.
130define weak_odr protected amdgpu_kernel void @__omp_offloading_fd02_727e9_h_l12(ptr noalias noundef %0) #1 !dbg !39 {
  ; Stack slot for the argument; tied to the artificial variable 'dyn_ptr'
  ; (!40) by the #dbg_declare below. Must stay unnamed for the expected remark.
131  %2 = alloca ptr, align 8, addrspace(5)
132  %3 = addrspacecast ptr addrspace(5) %2 to ptr
  ; Flat access 1 of 2: store through the generic pointer, no !dbg attached
  ; (expected remark location "<unknown>:0:0").
133  store ptr %0, ptr %3, align 8
134    #dbg_declare(ptr addrspace(5) %2, !40, !DIExpression(), !41)
  ; Flat access 2 of 2: load through the same generic pointer, located at
  ; test.c 12:1 via !42. The loaded value must also stay unnamed.
135  %4 = load ptr, ptr %3, align 8, !dbg !42
  ; The only call in the kernel; the callee is defined above in this module.
136  call void @__omp_offloading_fd02_727e9_h_l12_debug__(ptr %4) #5, !dbg !42
137  ret void, !dbg !42
138}
139
; Callees that have no body in this module. The expected remarks print calls
; to them as "direct call, callee is '@name'", and they do not count towards
; DirectCallsToDefinedFunctions.
140declare i32 @__kmpc_target_init(ptr, ptr)
141
142; Function Attrs: convergent
; f is declared variadic while every call site in this file is written as
; "call void @f()". NOTE(review): presumably the result of an unprototyped C
; declaration - confirm against the test.c described in README.md.
143declare void @f(...) #2
144
145declare void @__kmpc_target_deinit()
146
147; Function Attrs: convergent noinline nounwind optnone
; Ordinary device function: hidden visibility, default (external) linkage and
; no kernel calling convention - the only function in this file expected to
; report ExternalNotKernel = 1. It is not flagged artificial in its debug info
; (!43), so the expected remarks say "in function 'g'".
; Expected totals for this body: 2 allocas (12 bytes), 2 direct calls (1 to a
; defined function) and 0 flat address space accesses.
148define hidden void @g() #3 !dbg !43 {
  ; 4 (i32) + 8 ([2 x i32]) = 12 bytes of static allocas; both must stay
  ; unnamed because the expected remarks match them as numbered values.
149  %1 = alloca i32, align 4, addrspace(5)
150  %2 = alloca [2 x i32], align 4, addrspace(5)
  ; Generic-pointer casts that are never used: this function performs no load
  ; or store, so no flat address space access is expected.
151  %3 = addrspacecast ptr addrspace(5) %1 to ptr
152  %4 = addrspacecast ptr addrspace(5) %2 to ptr
  ; Associate %1 with 'i' (test.c 4:7) and %2 with 'a' (test.c 5:7).
153    #dbg_declare(ptr addrspace(5) %1, !46, !DIExpression(), !47)
154    #dbg_declare(ptr addrspace(5) %2, !48, !DIExpression(), !49)
  ; Call to the declared-only f, then a recursive call to g itself; the latter
  ; is the one call to a defined function.
155  call void @f() #4, !dbg !50
156  call void @g() #4, !dbg !51
157  ret void, !dbg !52
158}
159
; Attribute groups and where this file uses them:
;   #0 - the internal _debug__ function
;   #1 - the amdgpu_kernel entry point
;   #2 - the declaration of f
;   #3 - g
;   #4 - the call sites of f and g
;   #5 - the kernel's call to the _debug__ function
; Only #1 carries "amdgpu-flat-work-group-size"="1,256" and
; "omp_target_thread_limit"="256"; accordingly only the kernel is expected to
; report omp_target_thread_limit and a flat work group size of 1..256.
; NOTE(review): the 1..1024 range expected for the other two functions is not
; set anywhere in this file - presumably a target default; confirm.
160attributes #0 = { convergent noinline norecurse nounwind optnone "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx906" "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot7-insts,+dpp,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64" }
161attributes #1 = { convergent mustprogress noinline norecurse nounwind optnone "amdgpu-flat-work-group-size"="1,256" "frame-pointer"="all" "kernel" "no-trapping-math"="true" "omp_target_thread_limit"="256" "stack-protector-buffer-size"="8" "target-cpu"="gfx906" "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot7-insts,+dpp,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64" "uniform-work-group-size"="true" }
162attributes #2 = { convergent "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx906" "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot7-insts,+dpp,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64" }
163attributes #3 = { convergent noinline nounwind optnone "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx906" "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot7-insts,+dpp,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64" }
164attributes #4 = { convergent }
165attributes #5 = { nounwind }
166
; Named metadata. NOTE(review): !llvm.ident and !opencl.ocl.version list the
; same node many times - presumably one entry per bitcode module linked into
; this device image; check README.md before trimming them.
167!llvm.dbg.cu = !{!0}
168!omp_offload.info = !{!2}
169!llvm.module.flags = !{!3, !4, !5, !6, !7, !8, !9, !10, !11}
170!llvm.ident = !{!12, !13, !13, !13, !13, !13, !13, !13, !13, !13, !13, !13, !13, !13, !13, !13, !13}
171!opencl.ocl.version = !{!14, !14, !14, !14, !14, !14, !14, !14, !14, !14, !14, !14, !14, !14, !14, !14}
172
173!0 = distinct !DICompileUnit(language: DW_LANG_C11, file: !1, producer: "clang version 20.0.0git (/tmp/llvm/clang b9447c03a9ef2eed55b685a33511df86f7f94e89)", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, splitDebugInlining: false, nameTableKind: None)
174!1 = !DIFile(filename: "test.c", directory: "/tmp", checksumkind: CSK_MD5, checksum: "27a878d5e894ab6d41bfe96f997f8821")
; Offload entry info: 64770 = 0xfd02 and 468969 = 0x727e9 are the two hex
; fields of the kernel symbol name, which also ends in the "h" and the line
; number 12 recorded here.
175!2 = !{i32 0, i32 64770, i32 468969, !"h", i32 12, i32 0, i32 0}
; Module flags.
176!3 = !{i32 1, !"amdhsa_code_object_version", i32 500}
177!4 = !{i32 7, !"Dwarf Version", i32 5}
178!5 = !{i32 2, !"Debug Info Version", i32 3}
179!6 = !{i32 1, !"wchar_size", i32 4}
180!7 = !{i32 7, !"openmp", i32 51}
181!8 = !{i32 7, !"openmp-device", i32 51}
182!9 = !{i32 8, !"PIC Level", i32 2}
183!10 = !{i32 7, !"frame-pointer", i32 2}
184!11 = !{i32 4, !"amdgpu_hostcall", i32 1}
185!12 = !{!"clang version 20.0.0git (/tmp/llvm/clang b9447c03a9ef2eed55b685a33511df86f7f94e89)"}
186!13 = !{!"AMD clang version 17.0.0 (https://github.com/RadeonOpenCompute/llvm-project roc-6.0.2 24012 af27734ed982b52a9f1be0f035ac91726fc697e4)"}
187!14 = !{i32 2, i32 0}
; Debug info for the _debug__ function. DIFlagArtificial corresponds to the
; "artificial function" wording of its expected remarks, and line 13 to the
; test.c:13:0 location of its summary remarks.
188!15 = distinct !DISubprogram(name: "__omp_offloading_fd02_727e9_h_l12_debug__", scope: !16, file: !16, line: 13, type: !17, scopeLine: 13, flags: DIFlagArtificial | DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition, unit: !0, retainedNodes: !22)
189!16 = !DIFile(filename: "test.c", directory: "/tmp")
190!17 = !DISubroutineType(types: !18)
191!18 = !{null, !19}
192!19 = !DIDerivedType(tag: DW_TAG_const_type, baseType: !20)
193!20 = !DIDerivedType(tag: DW_TAG_restrict_type, baseType: !21)
194!21 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: null, size: 64)
195!22 = !{}
; 'dyn_ptr' is itself DIFlagArtificial and declared at line 0 (!24), matching
; the expected "artificial alloca ... for 'dyn_ptr'" remark at test.c:0:0.
196!23 = !DILocalVariable(name: "dyn_ptr", arg: 1, scope: !15, type: !19, flags: DIFlagArtificial)
197!24 = !DILocation(line: 0, scope: !15)
198!25 = !DILocation(line: 13, column: 3, scope: !15)
; Locals 'i' (14:9) and 'a' (15:9) of the _debug__ function, followed by the
; locations of its calls to f (16:5), g (17:5) and __kmpc_target_deinit (18:3).
199!26 = !DILocalVariable(name: "i", scope: !27, file: !16, line: 14, type: !28)
200!27 = distinct !DILexicalBlock(scope: !15, file: !16, line: 13, column: 3)
201!28 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
202!29 = !DILocation(line: 14, column: 9, scope: !27)
203!30 = !DILocalVariable(name: "a", scope: !27, file: !16, line: 15, type: !31)
204!31 = !DICompositeType(tag: DW_TAG_array_type, baseType: !28, size: 64, elements: !32)
205!32 = !{!33}
206!33 = !DISubrange(count: 2)
207!34 = !DILocation(line: 15, column: 9, scope: !27)
208!35 = !DILocation(line: 16, column: 5, scope: !27)
209!36 = !DILocation(line: 17, column: 5, scope: !27)
210!37 = !DILocation(line: 18, column: 3, scope: !27)
211!38 = !DILocation(line: 18, column: 3, scope: !15)
; Debug info for the kernel entry point: also DIFlagArtificial, at line 12.
212!39 = distinct !DISubprogram(name: "__omp_offloading_fd02_727e9_h_l12", scope: !16, file: !16, line: 12, type: !17, scopeLine: 12, flags: DIFlagArtificial | DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition, unit: !0, retainedNodes: !22)
213!40 = !DILocalVariable(name: "dyn_ptr", arg: 1, scope: !39, type: !19, flags: DIFlagArtificial)
214!41 = !DILocation(line: 0, scope: !39)
215!42 = !DILocation(line: 12, column: 1, scope: !39)
; Debug info for g: no DIFlagArtificial, line 3; locals 'i' (4:7) and 'a'
; (5:7), then the locations of its calls to f (6:3) and g (7:3).
216!43 = distinct !DISubprogram(name: "g", scope: !16, file: !16, line: 3, type: !44, scopeLine: 3, spFlags: DISPFlagDefinition, unit: !0, retainedNodes: !22)
217!44 = !DISubroutineType(types: !45)
218!45 = !{null}
219!46 = !DILocalVariable(name: "i", scope: !43, file: !16, line: 4, type: !28)
220!47 = !DILocation(line: 4, column: 7, scope: !43)
221!48 = !DILocalVariable(name: "a", scope: !43, file: !16, line: 5, type: !31)
222!49 = !DILocation(line: 5, column: 7, scope: !43)
223!50 = !DILocation(line: 6, column: 3, scope: !43)
224!51 = !DILocation(line: 7, column: 3, scope: !43)
225!52 = !DILocation(line: 8, column: 1, scope: !43)
226