xref: /llvm-project/llvm/test/CodeGen/AMDGPU/large-work-group-promote-alloca.ll (revision 50caf6936ba91b4cc45ffa4e3591f0dcf0c4e387)
1; RUN: opt -S -mtriple=amdgcn-unknown-unknown -passes=amdgpu-promote-alloca -disable-promote-alloca-to-vector < %s | FileCheck --check-prefixes=SI,SICI,ALL %s
2; RUN: opt -S -mcpu=tonga -mtriple=amdgcn-unknown-unknown -passes=amdgpu-promote-alloca -disable-promote-alloca-to-vector < %s | FileCheck --check-prefixes=CI,SICI,ALL %s
3; RUN: opt -S -mcpu=gfx1010 -mtriple=amdgcn-unknown-unknown -passes=amdgpu-promote-alloca -disable-promote-alloca-to-vector < %s | FileCheck --check-prefixes=GFX10PLUS,ALL %s
4; RUN: opt -S -mcpu=gfx1100 -mtriple=amdgcn-unknown-unknown -passes=amdgpu-promote-alloca -disable-promote-alloca-to-vector < %s | FileCheck --check-prefixes=GFX10PLUS,ALL %s
5
6; SI-NOT: @promote_alloca_size_63.stack = internal unnamed_addr addrspace(3) global [63 x [5 x i32]] poison, align 4
7; CI: @promote_alloca_size_63.stack = internal unnamed_addr addrspace(3) global [63 x [5 x i32]] poison, align 4
8
; Test kernel using attribute group #0 (flat work-group size "63,63").
; An addrspace(5) [5 x i32] array %stack is written at two indices loaded from
; %in, then elements 0 and 1 are read back and stored to %out.
; The check lines above match a global named @promote_alloca_size_63.stack,
; presumably built from the function name plus the value name %stack, so
; neither should be renamed.
9define amdgpu_kernel void @promote_alloca_size_63(ptr addrspace(1) nocapture %out, ptr addrspace(1) nocapture %in) #0 {
10entry:
11  %stack = alloca [5 x i32], align 4, addrspace(5)
  ; Store 4 at a runtime index read from in[0].
12  %0 = load i32, ptr addrspace(1) %in, align 4
13  %arrayidx1 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 %0
14  store i32 4, ptr addrspace(5) %arrayidx1, align 4
  ; Store 5 at a second runtime index read from in[1].
15  %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %in, i32 1
16  %1 = load i32, ptr addrspace(1) %arrayidx2, align 4
17  %arrayidx3 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 %1
18  store i32 5, ptr addrspace(5) %arrayidx3, align 4
  ; Copy stack[0] to out[0].
19  %2 = load i32, ptr addrspace(5) %stack, align 4
20  store i32 %2, ptr addrspace(1) %out, align 4
  ; Copy stack[1] to out[1]; this load/store pair carries no explicit align.
21  %arrayidx12 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 1
22  %3 = load i32, ptr addrspace(5) %arrayidx12
23  %arrayidx13 = getelementptr inbounds i32, ptr addrspace(1) %out, i32 1
24  store i32 %3, ptr addrspace(1) %arrayidx13
25  ret void
26}
27
28; ALL: @promote_alloca_size_256.stack = internal unnamed_addr addrspace(3) global [256 x [5 x i32]] poison, align 4
29
; Test kernel using attribute group #1 (waves-per-eu "1,3", flat work-group
; size "256,256"). Same body as the other promote_alloca_size_* kernels: an
; addrspace(5) [5 x i32] array is written at two runtime indices and its
; elements 0 and 1 are copied to %out.
; The check line above (ALL prefix) expects an addrspace(3) global
; [256 x [5 x i32]] named @promote_alloca_size_256.stack, so the value name
; %stack should not be changed.
30define amdgpu_kernel void @promote_alloca_size_256(ptr addrspace(1) nocapture %out, ptr addrspace(1) nocapture %in) #1 {
31entry:
32  %stack = alloca [5 x i32], align 4, addrspace(5)
  ; Store 4 at a runtime index read from in[0].
33  %0 = load i32, ptr addrspace(1) %in, align 4
34  %arrayidx1 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 %0
35  store i32 4, ptr addrspace(5) %arrayidx1, align 4
  ; Store 5 at a second runtime index read from in[1].
36  %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %in, i32 1
37  %1 = load i32, ptr addrspace(1) %arrayidx2, align 4
38  %arrayidx3 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 %1
39  store i32 5, ptr addrspace(5) %arrayidx3, align 4
  ; Copy stack[0] to out[0].
40  %2 = load i32, ptr addrspace(5) %stack, align 4
41  store i32 %2, ptr addrspace(1) %out, align 4
  ; Copy stack[1] to out[1]; this load/store pair carries no explicit align.
42  %arrayidx12 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 1
43  %3 = load i32, ptr addrspace(5) %arrayidx12
44  %arrayidx13 = getelementptr inbounds i32, ptr addrspace(1) %out, i32 1
45  store i32 %3, ptr addrspace(1) %arrayidx13
46  ret void
47}
48
49; SI-NOT: @promote_alloca_size_1600.stack
50; CI: @promote_alloca_size_1600.stack = internal unnamed_addr addrspace(3) global [1024 x [5 x i32]] poison, align 4
51; GFX10PLUS: @promote_alloca_size_1600.stack = internal unnamed_addr addrspace(3) global [1024 x [5 x i32]] poison, align 4
52
; Test kernel using attribute group #2 (waves-per-eu "1,9", flat work-group
; size "1024,1024"). Despite the "1600" in the function name, the requested
; work-group size and the expected global in the check lines above are both
; 1024 ([1024 x [5 x i32]]).
; The SI prefix expects no @promote_alloca_size_1600.stack global at all; the
; CI and GFX10PLUS prefixes expect the addrspace(3) global to be created.
; The value name %stack is part of the matched global name and should not be
; changed.
53define amdgpu_kernel void @promote_alloca_size_1600(ptr addrspace(1) nocapture %out, ptr addrspace(1) nocapture %in) #2 {
54entry:
55  %stack = alloca [5 x i32], align 4, addrspace(5)
  ; Store 4 at a runtime index read from in[0].
56  %0 = load i32, ptr addrspace(1) %in, align 4
57  %arrayidx1 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 %0
58  store i32 4, ptr addrspace(5) %arrayidx1, align 4
  ; Store 5 at a second runtime index read from in[1].
59  %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %in, i32 1
60  %1 = load i32, ptr addrspace(1) %arrayidx2, align 4
61  %arrayidx3 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 %1
62  store i32 5, ptr addrspace(5) %arrayidx3, align 4
  ; Copy stack[0] to out[0].
63  %2 = load i32, ptr addrspace(5) %stack, align 4
64  store i32 %2, ptr addrspace(1) %out, align 4
  ; Copy stack[1] to out[1]; this load/store pair carries no explicit align.
65  %arrayidx12 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 1
66  %3 = load i32, ptr addrspace(5) %arrayidx12
67  %arrayidx13 = getelementptr inbounds i32, ptr addrspace(1) %out, i32 1
68  store i32 %3, ptr addrspace(1) %arrayidx13
69  ret void
70}
71
72; ALL-LABEL: @occupancy_0(
73; CI-NOT: alloca [5 x i32]
74; SI: alloca [5 x i32]
; Test kernel using attribute group #3, which sets only
; "amdgpu-waves-per-eu"="1,10" and no flat work-group size.
; Body: an addrspace(5) [5 x i32] array is written at two runtime indices and
; its elements 0 and 1 are copied to %out.
; The check lines above expect the [5 x i32] alloca to remain under the SI
; prefix and to be absent under the CI prefix.
75define amdgpu_kernel void @occupancy_0(ptr addrspace(1) nocapture %out, ptr addrspace(1) nocapture %in) #3 {
76entry:
77  %stack = alloca [5 x i32], align 4, addrspace(5)
  ; Store 4 at a runtime index read from in[0].
78  %0 = load i32, ptr addrspace(1) %in, align 4
79  %arrayidx1 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 %0
80  store i32 4, ptr addrspace(5) %arrayidx1, align 4
  ; Store 5 at a second runtime index read from in[1].
81  %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %in, i32 1
82  %1 = load i32, ptr addrspace(1) %arrayidx2, align 4
83  %arrayidx3 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 %1
84  store i32 5, ptr addrspace(5) %arrayidx3, align 4
  ; Copy stack[0] to out[0].
85  %2 = load i32, ptr addrspace(5) %stack, align 4
86  store i32 %2, ptr addrspace(1) %out, align 4
  ; Copy stack[1] to out[1]; this load/store pair carries no explicit align.
87  %arrayidx12 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 1
88  %3 = load i32, ptr addrspace(5) %arrayidx12
89  %arrayidx13 = getelementptr inbounds i32, ptr addrspace(1) %out, i32 1
90  store i32 %3, ptr addrspace(1) %arrayidx13
91  ret void
92}
93
94; ALL-LABEL: @occupancy_max(
95; CI-NOT: alloca [5 x i32]
96; SI: alloca [5 x i32]
; Test kernel using attribute group #4 ("amdgpu-waves-per-eu"="1,10", no flat
; work-group size). NOTE(review): group #4 is textually identical to group #3
; used by @occupancy_0, so both kernels run with the same attributes; confirm
; whether that is intended.
; Body: an addrspace(5) [5 x i32] array is written at two runtime indices and
; its elements 0 and 1 are copied to %out.
; The check lines above expect the [5 x i32] alloca to remain under the SI
; prefix and to be absent under the CI prefix.
97define amdgpu_kernel void @occupancy_max(ptr addrspace(1) nocapture %out, ptr addrspace(1) nocapture %in) #4 {
98entry:
99  %stack = alloca [5 x i32], align 4, addrspace(5)
  ; Store 4 at a runtime index read from in[0].
100  %0 = load i32, ptr addrspace(1) %in, align 4
101  %arrayidx1 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 %0
102  store i32 4, ptr addrspace(5) %arrayidx1, align 4
  ; Store 5 at a second runtime index read from in[1].
103  %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %in, i32 1
104  %1 = load i32, ptr addrspace(1) %arrayidx2, align 4
105  %arrayidx3 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 %1
106  store i32 5, ptr addrspace(5) %arrayidx3, align 4
  ; Copy stack[0] to out[0].
107  %2 = load i32, ptr addrspace(5) %stack, align 4
108  store i32 %2, ptr addrspace(1) %out, align 4
  ; Copy stack[1] to out[1]; this load/store pair carries no explicit align.
109  %arrayidx12 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 1
110  %3 = load i32, ptr addrspace(5) %arrayidx12
111  %arrayidx13 = getelementptr inbounds i32, ptr addrspace(1) %out, i32 1
112  store i32 %3, ptr addrspace(1) %arrayidx13
113  ret void
114}
115
116; SI-LABEL: @occupancy_6(
117; CI-LABEL: @occupancy_6(
118; SI: alloca
119; CI-NOT: alloca
; Test kernel using attribute group #5 (waves-per-eu "1,6", flat work-group
; size "64,64") with a 42-byte addrspace(5) array ([42 x i8]).
; Body: the array is written at two runtime indices (bytes loaded from %in and
; sign-extended to i64) and its elements 0 and 1 are copied to %out.
; The check lines above expect an alloca to remain under the SI prefix and no
; alloca under the CI prefix.
120define amdgpu_kernel void @occupancy_6(ptr addrspace(1) nocapture %out, ptr addrspace(1) nocapture %in) #5 {
121entry:
122  %stack = alloca [42 x i8], align 4, addrspace(5)
  ; Store 4 at a runtime index: in[0] sign-extended to i64.
123  %tmp = load i8, ptr addrspace(1) %in, align 1
124  %tmp4 = sext i8 %tmp to i64
125  %arrayidx1 = getelementptr inbounds [42 x i8], ptr addrspace(5) %stack, i64 0, i64 %tmp4
126  store i8 4, ptr addrspace(5) %arrayidx1, align 1
  ; Store 5 at a second runtime index: in[1] sign-extended to i64.
127  %arrayidx2 = getelementptr inbounds i8, ptr addrspace(1) %in, i64 1
128  %tmp1 = load i8, ptr addrspace(1) %arrayidx2, align 1
129  %tmp5 = sext i8 %tmp1 to i64
130  %arrayidx3 = getelementptr inbounds [42 x i8], ptr addrspace(5) %stack, i64 0, i64 %tmp5
131  store i8 5, ptr addrspace(5) %arrayidx3, align 1
  ; Copy stack[0] to out[0].
132  %tmp2 = load i8, ptr addrspace(5) %stack, align 1
133  store i8 %tmp2, ptr addrspace(1) %out, align 1
  ; Copy stack[1] to out[1].
134  %arrayidx12 = getelementptr inbounds [42 x i8], ptr addrspace(5) %stack, i64 0, i64 1
135  %tmp3 = load i8, ptr addrspace(5) %arrayidx12, align 1
136  %arrayidx13 = getelementptr inbounds i8, ptr addrspace(1) %out, i64 1
137  store i8 %tmp3, ptr addrspace(1) %arrayidx13, align 1
138  ret void
139}
140
141; ALL-LABEL: @occupancy_6_over(
142; SICI: alloca [43 x i8]
143; GFX10PLUS-NOT: alloca
144
; Test kernel using attribute group #5 (waves-per-eu "1,6", flat work-group
; size "64,64"), like @occupancy_6 but with the array one byte larger
; ([43 x i8] instead of [42 x i8]).
; Body: the array is written at two runtime indices (bytes loaded from %in and
; sign-extended to i64) and its elements 0 and 1 are copied to %out.
; The check lines above expect "alloca [43 x i8]" to remain under the SICI
; prefix and no alloca under the GFX10PLUS prefix.
145define amdgpu_kernel void @occupancy_6_over(ptr addrspace(1) nocapture %out, ptr addrspace(1) nocapture %in) #5 {
146entry:
147  %stack = alloca [43 x i8], align 4, addrspace(5)
  ; Store 4 at a runtime index: in[0] sign-extended to i64.
148  %tmp = load i8, ptr addrspace(1) %in, align 1
149  %tmp4 = sext i8 %tmp to i64
150  %arrayidx1 = getelementptr inbounds [43 x i8], ptr addrspace(5) %stack, i64 0, i64 %tmp4
151  store i8 4, ptr addrspace(5) %arrayidx1, align 1
  ; Store 5 at a second runtime index: in[1] sign-extended to i64.
152  %arrayidx2 = getelementptr inbounds i8, ptr addrspace(1) %in, i64 1
153  %tmp1 = load i8, ptr addrspace(1) %arrayidx2, align 1
154  %tmp5 = sext i8 %tmp1 to i64
155  %arrayidx3 = getelementptr inbounds [43 x i8], ptr addrspace(5) %stack, i64 0, i64 %tmp5
156  store i8 5, ptr addrspace(5) %arrayidx3, align 1
  ; Copy stack[0] to out[0].
157  %tmp2 = load i8, ptr addrspace(5) %stack, align 1
158  store i8 %tmp2, ptr addrspace(1) %out, align 1
  ; Copy stack[1] to out[1].
159  %arrayidx12 = getelementptr inbounds [43 x i8], ptr addrspace(5) %stack, i64 0, i64 1
160  %tmp3 = load i8, ptr addrspace(5) %arrayidx12, align 1
161  %arrayidx13 = getelementptr inbounds i8, ptr addrspace(1) %out, i64 1
162  store i8 %tmp3, ptr addrspace(1) %arrayidx13, align 1
163  ret void
164}
165
166; SI-LABEL: @occupancy_8(
167; CI-LABEL: @occupancy_8(
168; SI: alloca
169; CI-NOT: alloca
; Test kernel using attribute group #6 (waves-per-eu "1,8", flat work-group
; size "64,64") with a 32-byte addrspace(5) array ([32 x i8]).
; Body: the array is written at two runtime indices (bytes loaded from %in and
; sign-extended to i64) and its elements 0 and 1 are copied to %out.
; The check lines above expect an alloca to remain under the SI prefix and no
; alloca under the CI prefix.
170define amdgpu_kernel void @occupancy_8(ptr addrspace(1) nocapture %out, ptr addrspace(1) nocapture %in) #6 {
171entry:
172  %stack = alloca [32 x i8], align 4, addrspace(5)
  ; Store 4 at a runtime index: in[0] sign-extended to i64.
173  %tmp = load i8, ptr addrspace(1) %in, align 1
174  %tmp4 = sext i8 %tmp to i64
175  %arrayidx1 = getelementptr inbounds [32 x i8], ptr addrspace(5) %stack, i64 0, i64 %tmp4
176  store i8 4, ptr addrspace(5) %arrayidx1, align 1
  ; Store 5 at a second runtime index: in[1] sign-extended to i64.
177  %arrayidx2 = getelementptr inbounds i8, ptr addrspace(1) %in, i64 1
178  %tmp1 = load i8, ptr addrspace(1) %arrayidx2, align 1
179  %tmp5 = sext i8 %tmp1 to i64
180  %arrayidx3 = getelementptr inbounds [32 x i8], ptr addrspace(5) %stack, i64 0, i64 %tmp5
181  store i8 5, ptr addrspace(5) %arrayidx3, align 1
  ; Copy stack[0] to out[0].
182  %tmp2 = load i8, ptr addrspace(5) %stack, align 1
183  store i8 %tmp2, ptr addrspace(1) %out, align 1
  ; Copy stack[1] to out[1].
184  %arrayidx12 = getelementptr inbounds [32 x i8], ptr addrspace(5) %stack, i64 0, i64 1
185  %tmp3 = load i8, ptr addrspace(5) %arrayidx12, align 1
186  %arrayidx13 = getelementptr inbounds i8, ptr addrspace(1) %out, i64 1
187  store i8 %tmp3, ptr addrspace(1) %arrayidx13, align 1
188  ret void
189}
190
191; ALL-LABEL: @occupancy_8_over(
192; SICI: alloca [33 x i8]
193; GFX10PLUS-NOT: alloca
194
; Test kernel using attribute group #6 (waves-per-eu "1,8", flat work-group
; size "64,64"), like @occupancy_8 but with the array one byte larger
; ([33 x i8] instead of [32 x i8]).
; Body: the array is written at two runtime indices (bytes loaded from %in and
; sign-extended to i64) and its elements 0 and 1 are copied to %out.
; The check lines above expect "alloca [33 x i8]" to remain under the SICI
; prefix and no alloca under the GFX10PLUS prefix.
195define amdgpu_kernel void @occupancy_8_over(ptr addrspace(1) nocapture %out, ptr addrspace(1) nocapture %in) #6 {
196entry:
197  %stack = alloca [33 x i8], align 4, addrspace(5)
  ; Store 4 at a runtime index: in[0] sign-extended to i64.
198  %tmp = load i8, ptr addrspace(1) %in, align 1
199  %tmp4 = sext i8 %tmp to i64
200  %arrayidx1 = getelementptr inbounds [33 x i8], ptr addrspace(5) %stack, i64 0, i64 %tmp4
201  store i8 4, ptr addrspace(5) %arrayidx1, align 1
  ; Store 5 at a second runtime index: in[1] sign-extended to i64.
202  %arrayidx2 = getelementptr inbounds i8, ptr addrspace(1) %in, i64 1
203  %tmp1 = load i8, ptr addrspace(1) %arrayidx2, align 1
204  %tmp5 = sext i8 %tmp1 to i64
205  %arrayidx3 = getelementptr inbounds [33 x i8], ptr addrspace(5) %stack, i64 0, i64 %tmp5
206  store i8 5, ptr addrspace(5) %arrayidx3, align 1
  ; Copy stack[0] to out[0].
207  %tmp2 = load i8, ptr addrspace(5) %stack, align 1
208  store i8 %tmp2, ptr addrspace(1) %out, align 1
  ; Copy stack[1] to out[1].
209  %arrayidx12 = getelementptr inbounds [33 x i8], ptr addrspace(5) %stack, i64 0, i64 1
210  %tmp3 = load i8, ptr addrspace(5) %arrayidx12, align 1
211  %arrayidx13 = getelementptr inbounds i8, ptr addrspace(1) %out, i64 1
212  store i8 %tmp3, ptr addrspace(1) %arrayidx13, align 1
213  ret void
214}
215
216; SI-LABEL: @occupancy_9(
217; CI-LABEL: @occupancy_9(
218; SI: alloca
219; CI-NOT: alloca
; Test kernel using attribute group #7 (waves-per-eu "1,9", flat work-group
; size "64,64") with a 28-byte addrspace(5) array ([28 x i8]).
; Body: the array is written at two runtime indices (bytes loaded from %in and
; sign-extended to i64) and its elements 0 and 1 are copied to %out.
; The check lines above expect an alloca to remain under the SI prefix and no
; alloca under the CI prefix.
220define amdgpu_kernel void @occupancy_9(ptr addrspace(1) nocapture %out, ptr addrspace(1) nocapture %in) #7 {
221entry:
222  %stack = alloca [28 x i8], align 4, addrspace(5)
  ; Store 4 at a runtime index: in[0] sign-extended to i64.
223  %tmp = load i8, ptr addrspace(1) %in, align 1
224  %tmp4 = sext i8 %tmp to i64
225  %arrayidx1 = getelementptr inbounds [28 x i8], ptr addrspace(5) %stack, i64 0, i64 %tmp4
226  store i8 4, ptr addrspace(5) %arrayidx1, align 1
  ; Store 5 at a second runtime index: in[1] sign-extended to i64.
227  %arrayidx2 = getelementptr inbounds i8, ptr addrspace(1) %in, i64 1
228  %tmp1 = load i8, ptr addrspace(1) %arrayidx2, align 1
229  %tmp5 = sext i8 %tmp1 to i64
230  %arrayidx3 = getelementptr inbounds [28 x i8], ptr addrspace(5) %stack, i64 0, i64 %tmp5
231  store i8 5, ptr addrspace(5) %arrayidx3, align 1
  ; Copy stack[0] to out[0].
232  %tmp2 = load i8, ptr addrspace(5) %stack, align 1
233  store i8 %tmp2, ptr addrspace(1) %out, align 1
  ; Copy stack[1] to out[1].
234  %arrayidx12 = getelementptr inbounds [28 x i8], ptr addrspace(5) %stack, i64 0, i64 1
235  %tmp3 = load i8, ptr addrspace(5) %arrayidx12, align 1
236  %arrayidx13 = getelementptr inbounds i8, ptr addrspace(1) %out, i64 1
237  store i8 %tmp3, ptr addrspace(1) %arrayidx13, align 1
238  ret void
239}
240
241; ALL-LABEL: @occupancy_9_over(
242; SICI: alloca [29 x i8]
243; GFX10PLUS-NOT: alloca
244
; Test kernel using attribute group #7 (waves-per-eu "1,9", flat work-group
; size "64,64"), like @occupancy_9 but with the array one byte larger
; ([29 x i8] instead of [28 x i8]).
; Body: the array is written at two runtime indices (bytes loaded from %in and
; sign-extended to i64) and its elements 0 and 1 are copied to %out.
; The check lines above expect "alloca [29 x i8]" to remain under the SICI
; prefix and no alloca under the GFX10PLUS prefix.
245define amdgpu_kernel void @occupancy_9_over(ptr addrspace(1) nocapture %out, ptr addrspace(1) nocapture %in) #7 {
246entry:
247  %stack = alloca [29 x i8], align 4, addrspace(5)
  ; Store 4 at a runtime index: in[0] sign-extended to i64.
248  %tmp = load i8, ptr addrspace(1) %in, align 1
249  %tmp4 = sext i8 %tmp to i64
250  %arrayidx1 = getelementptr inbounds [29 x i8], ptr addrspace(5) %stack, i64 0, i64 %tmp4
251  store i8 4, ptr addrspace(5) %arrayidx1, align 1
  ; Store 5 at a second runtime index: in[1] sign-extended to i64.
252  %arrayidx2 = getelementptr inbounds i8, ptr addrspace(1) %in, i64 1
253  %tmp1 = load i8, ptr addrspace(1) %arrayidx2, align 1
254  %tmp5 = sext i8 %tmp1 to i64
255  %arrayidx3 = getelementptr inbounds [29 x i8], ptr addrspace(5) %stack, i64 0, i64 %tmp5
256  store i8 5, ptr addrspace(5) %arrayidx3, align 1
  ; Copy stack[0] to out[0].
257  %tmp2 = load i8, ptr addrspace(5) %stack, align 1
258  store i8 %tmp2, ptr addrspace(1) %out, align 1
  ; Copy stack[1] to out[1].
259  %arrayidx12 = getelementptr inbounds [29 x i8], ptr addrspace(5) %stack, i64 0, i64 1
260  %tmp3 = load i8, ptr addrspace(5) %arrayidx12, align 1
261  %arrayidx13 = getelementptr inbounds i8, ptr addrspace(1) %out, i64 1
262  store i8 %tmp3, ptr addrspace(1) %arrayidx13, align 1
263  ret void
264}
265
; Attribute groups referenced by the kernels in this file.
;   #0  @promote_alloca_size_63    flat work-group size only
;   #1  @promote_alloca_size_256
;   #2  @promote_alloca_size_1600  (requests 1024,1024 despite the name)
;   #3  @occupancy_0               waves-per-eu only
;   #4  @occupancy_max             waves-per-eu only
;   #5  @occupancy_6, @occupancy_6_over
;   #6  @occupancy_8, @occupancy_8_over
;   #7  @occupancy_9, @occupancy_9_over
; NOTE(review): #3 and #4 are textually identical; confirm whether distinct
; waves-per-eu values were intended for @occupancy_0 and @occupancy_max.
266attributes #0 = { nounwind "amdgpu-flat-work-group-size"="63,63" }
267attributes #1 = { nounwind "amdgpu-waves-per-eu"="1,3" "amdgpu-flat-work-group-size"="256,256" }
268attributes #2 = { nounwind "amdgpu-waves-per-eu"="1,9" "amdgpu-flat-work-group-size"="1024,1024" }
269attributes #3 = { nounwind "amdgpu-waves-per-eu"="1,10" }
270attributes #4 = { nounwind "amdgpu-waves-per-eu"="1,10" }
271attributes #5 = { nounwind "amdgpu-waves-per-eu"="1,6" "amdgpu-flat-work-group-size"="64,64" }
272attributes #6 = { nounwind "amdgpu-waves-per-eu"="1,8" "amdgpu-flat-work-group-size"="64,64" }
273attributes #7 = { nounwind "amdgpu-waves-per-eu"="1,9" "amdgpu-flat-work-group-size"="64,64" }
274