xref: /llvm-project/llvm/test/CodeGen/AMDGPU/propagate-flat-work-group-size.ll (revision 7dbd6cd2946ec3a9b4ad2dfd7ead177baac15bd7)
1; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature --check-globals
2; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -amdgpu-attributor %s | FileCheck %s
3; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-attributor %s | FileCheck %s
4
5; Check propagation of amdgpu-flat-work-group-size attribute.
6
7; Called from a single kernel with 1,256
; No bounds are declared on this function; the checks expect it to end up in
; the same attribute group as its caller @kernel_1_256 (1,256).
8define internal void @default_to_1_256() {
9; CHECK-LABEL: define {{[^@]+}}@default_to_1_256
10; CHECK-SAME: () #[[ATTR0:[0-9]+]] {
11; CHECK-NEXT:    ret void
12;
13  ret void
14}
15
; Kernel declaring 1,256 (#0); its only call is to @default_to_1_256.
16define amdgpu_kernel void @kernel_1_256() #0 {
17; CHECK-LABEL: define {{[^@]+}}@kernel_1_256
18; CHECK-SAME: () #[[ATTR0]] {
19; CHECK-NEXT:    call void @default_to_1_256()
20; CHECK-NEXT:    ret void
21;
22  call void @default_to_1_256()
23  ret void
24}
25
26; Called from a single kernel with 64,128
; No bounds are declared on this function; the checks expect it to end up in
; the same attribute group as its caller @kernel_64_128 (64,128).
27define internal void @default_to_64_128() {
28; CHECK-LABEL: define {{[^@]+}}@default_to_64_128
29; CHECK-SAME: () #[[ATTR1:[0-9]+]] {
30; CHECK-NEXT:    ret void
31;
32  ret void
33}
34
; Kernel declaring 64,128 (#1). It calls a function with no declared bounds
; that has no other caller here (@default_to_64_128), one that is also called
; from @kernel_128_256 (@default_to_64_256), and two that declare their own
; bounds (@flat_group_64_64 and @flat_group_128_256).
35define amdgpu_kernel void @kernel_64_128() #1 {
36; CHECK-LABEL: define {{[^@]+}}@kernel_64_128
37; CHECK-SAME: () #[[ATTR1]] {
38; CHECK-NEXT:    call void @default_to_64_128()
39; CHECK-NEXT:    call void @flat_group_64_64()
40; CHECK-NEXT:    call void @default_to_64_256()
41; CHECK-NEXT:    call void @flat_group_128_256()
42; CHECK-NEXT:    ret void
43;
44  call void @default_to_64_128()
45  call void @flat_group_64_64()
46  call void @default_to_64_256()
47  call void @flat_group_128_256()
48  ret void
49}
50
51; Called from kernels with 128,512 and 512,512
; No bounds are declared here. The callers are @kernel_128_512 and
; @kernel_512_512; the checks expect the same attribute group as
; @kernel_128_512 (128,512).
52define internal void @default_to_128_512() {
53; CHECK-LABEL: define {{[^@]+}}@default_to_128_512
54; CHECK-SAME: () #[[ATTR2:[0-9]+]] {
55; CHECK-NEXT:    ret void
56;
57  ret void
58}
59
60; This already has strict bounds, but is called from kernels with wider
61; bounds, and should not be changed.
; Declared 64,64 (#2) and called from @kernel_64_128 and @kernel_128_512; the
; checks expect its attribute group to still read 64,64.
62define internal void @flat_group_64_64() #2 {
63; CHECK-LABEL: define {{[^@]+}}@flat_group_64_64
64; CHECK-SAME: () #[[ATTR3:[0-9]+]] {
65; CHECK-NEXT:    ret void
66;
67  ret void
68}
69
70; Declared 128,256 (#3) and called from @kernel_64_128 (64,128).
; Intersecting the two ranges would give 128,128, but the checks below expect
; the same attribute group as @kernel_128_256, i.e. 128,256 is kept.
71define internal void @flat_group_128_256() #3 {
72; CHECK-LABEL: define {{[^@]+}}@flat_group_128_256
73; CHECK-SAME: () #[[ATTR4:[0-9]+]] {
74; CHECK-NEXT:    ret void
75;
76  ret void
77}
78
; Declared 512,1024 (#4) and called from @kernel_512_512 (512,512).
; The checks expect an attribute group that still reads 512,1024.
79define internal void @flat_group_512_1024() #4 {
80; CHECK-LABEL: define {{[^@]+}}@flat_group_512_1024
81; CHECK-SAME: () #[[ATTR5:[0-9]+]] {
82; CHECK-NEXT:    ret void
83;
84  ret void
85}
86
; Kernel declaring 128,512 (#5); calls @default_to_128_512 and
; @flat_group_64_64. The checks expect it to share an attribute group with
; @default_to_128_512.
87define amdgpu_kernel void @kernel_128_512() #5 {
88; CHECK-LABEL: define {{[^@]+}}@kernel_128_512
89; CHECK-SAME: () #[[ATTR2]] {
90; CHECK-NEXT:    call void @default_to_128_512()
91; CHECK-NEXT:    call void @flat_group_64_64()
92; CHECK-NEXT:    ret void
93;
94  call void @default_to_128_512()
95  call void @flat_group_64_64()
96  ret void
97}
98
; Kernel declaring 512,512 (#6); the second caller of @default_to_128_512 and
; the caller of @flat_group_512_1024.
99define amdgpu_kernel void @kernel_512_512() #6 {
100; CHECK-LABEL: define {{[^@]+}}@kernel_512_512
101; CHECK-SAME: () #[[ATTR6:[0-9]+]] {
102; CHECK-NEXT:    call void @default_to_128_512()
103; CHECK-NEXT:    call void @flat_group_512_1024()
104; CHECK-NEXT:    ret void
105;
106  call void @default_to_128_512()
107  call void @flat_group_512_1024()
108  ret void
109}
110
111; Called from kernels with 128,256 and 64,128 => 64,256
; No bounds are declared here. The callers are @kernel_128_256 and
; @kernel_64_128; the expected 64,256 spans both of their ranges and is the
; same attribute group the checks expect for @kernel_64_256.
112define internal void @default_to_64_256() {
113; CHECK-LABEL: define {{[^@]+}}@default_to_64_256
114; CHECK-SAME: () #[[ATTR7:[0-9]+]] {
115; CHECK-NEXT:    ret void
116;
117  ret void
118}
119
120; The kernel's lower bound is higher than the callee's lower bound, so
121; this should probably be illegal.
; The kernel declares 128,256 (#3); its only call is to @default_to_64_256,
; which declares no bounds itself and is expected to end up with 64,256.
122define amdgpu_kernel void @kernel_128_256() #3 {
123; CHECK-LABEL: define {{[^@]+}}@kernel_128_256
124; CHECK-SAME: () #[[ATTR4]] {
125; CHECK-NEXT:    call void @default_to_64_256()
126; CHECK-NEXT:    ret void
127;
128  call void @default_to_64_256()
129  ret void
130}
131
132; 64,128 -> 64,128
; Declares 64,128 (#1) and forms a call cycle with @merge_cycle_1; the cycle
; is entered from @kernel_64_256. The checks expect it to stay at 64,128.
133define internal void @merge_cycle_0() #1 {
134; CHECK-LABEL: define {{[^@]+}}@merge_cycle_0
135; CHECK-SAME: () #[[ATTR1]] {
136; CHECK-NEXT:    call void @merge_cycle_1()
137; CHECK-NEXT:    ret void
138;
139  call void @merge_cycle_1()
140  ret void
141}
142
143; Declared 128,256 (#3); forms a call cycle with @merge_cycle_0 (64,128).
; Intersecting the two ranges would give 128,128, but the checks below expect
; the 128,256 attribute group to be kept.
144define internal void @merge_cycle_1() #3 {
145; CHECK-LABEL: define {{[^@]+}}@merge_cycle_1
146; CHECK-SAME: () #[[ATTR4]] {
147; CHECK-NEXT:    call void @merge_cycle_0()
148; CHECK-NEXT:    ret void
149;
150  call void @merge_cycle_0()
151  ret void
152}
153
; Kernel declaring 64,256 (#7). Its callees cover the remaining cases: a call
; cycle (@merge_cycle_0), a function whose own address is stored
; (@default_captured_address), a non-internal function
; (@externally_visible_default), and a call whose return type (float) differs
; from the callee's definition (@bitcasted_function is defined returning i32).
154define amdgpu_kernel void @kernel_64_256() #7 {
155; CHECK-LABEL: define {{[^@]+}}@kernel_64_256
156; CHECK-SAME: () #[[ATTR7]] {
157; CHECK-NEXT:    call void @merge_cycle_0()
158; CHECK-NEXT:    call void @default_captured_address()
159; CHECK-NEXT:    call void @externally_visible_default()
160; CHECK-NEXT:    [[F32:%.*]] = call float @bitcasted_function()
161; CHECK-NEXT:    ret void
162;
163  call void @merge_cycle_0()
164  call void @default_captured_address()
165  call void @externally_visible_default()
166  %f32 = call float @bitcasted_function()
167  ret void
168}
169
; Internal function that stores its own address (volatile) to an undef pointer.
; Although it is called from @kernel_64_256, the checks expect an attribute
; group with no amdgpu-flat-work-group-size entry -- presumably because the
; stored address means not every caller is known (TODO confirm).
170define internal void @default_captured_address() {
171; CHECK-LABEL: define {{[^@]+}}@default_captured_address
172; CHECK-SAME: () #[[ATTR8:[0-9]+]] {
173; CHECK-NEXT:    store volatile ptr @default_captured_address, ptr undef, align 8
174; CHECK-NEXT:    ret void
175;
176  store volatile ptr @default_captured_address, ptr undef, align 8
177  ret void
178}
179
; Not marked internal. The checks expect the same attribute group as
; @default_captured_address, which carries no amdgpu-flat-work-group-size
; entry -- presumably because callers outside this module may exist
; (TODO confirm).
179define void @externally_visible_default() {
180; CHECK-LABEL: define {{[^@]+}}@externally_visible_default
181; CHECK-SAME: () #[[ATTR8]] {
182; CHECK-NEXT:    ret void
183;
184  ret void
185}
187
188; 1,1024 -> 64,256
189define internal i32 @bitcasted_function() {
190; CHECK-LABEL: define {{[^@]+}}@bitcasted_function
191; CHECK-SAME: () #[[ATTR7]] {
192; CHECK-NEXT:    ret i32 0
193;
194  ret i32 0
195}
196
; Input attribute groups: the flat work-group size bounds (lower,upper)
; declared on the functions above. The attribute groups expected in the output
; are listed in the autogenerated checks that follow.
196attributes #0 = { "amdgpu-flat-work-group-size"="1,256" }
197attributes #1 = { "amdgpu-flat-work-group-size"="64,128" }
198attributes #2 = { "amdgpu-flat-work-group-size"="64,64" }
199attributes #3 = { "amdgpu-flat-work-group-size"="128,256" }
200attributes #4 = { "amdgpu-flat-work-group-size"="512,1024" }
201attributes #5 = { "amdgpu-flat-work-group-size"="128,512" }
202attributes #6 = { "amdgpu-flat-work-group-size"="512,512" }
203attributes #7 = { "amdgpu-flat-work-group-size"="64,256" }
205;.
206; CHECK: attributes #[[ATTR0]] = { "amdgpu-flat-work-group-size"="1,256" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
207; CHECK: attributes #[[ATTR1]] = { "amdgpu-flat-work-group-size"="64,128" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
208; CHECK: attributes #[[ATTR2]] = { "amdgpu-flat-work-group-size"="128,512" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="2,10" "uniform-work-group-size"="false" }
209; CHECK: attributes #[[ATTR3]] = { "amdgpu-flat-work-group-size"="64,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
210; CHECK: attributes #[[ATTR4]] = { "amdgpu-flat-work-group-size"="128,256" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
211; CHECK: attributes #[[ATTR5]] = { "amdgpu-flat-work-group-size"="512,1024" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
212; CHECK: attributes #[[ATTR6]] = { "amdgpu-flat-work-group-size"="512,512" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="2,10" "uniform-work-group-size"="false" }
213; CHECK: attributes #[[ATTR7]] = { "amdgpu-flat-work-group-size"="64,256" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
214; CHECK: attributes #[[ATTR8]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
215;.
216