xref: /llvm-project/llvm/test/CodeGen/AMDGPU/promote-alloca-to-lds-constantexpr-use.ll (revision 50caf6936ba91b4cc45ffa4e3591f0dcf0c4e387)
1; RUN: opt -S -disable-promote-alloca-to-vector -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -passes=amdgpu-promote-alloca < %s | FileCheck -check-prefix=IR %s
2; RUN: llc -disable-promote-alloca-to-vector -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -amdgpu-enable-lower-module-lds=false < %s | FileCheck -check-prefix=ASM %s
3
; Data layout component A5: objects created by alloca are placed in address
; space 5, matching the addrspace(5) allocas used by the kernels below.
4target datalayout = "A5"
5
; LDS (addrspace(3)) variables referenced by the kernels below:
;   @all_lds          - 16384 x i32, i.e. 65536 bytes
;   @some_lds         - 32 x i32, i.e. 128 bytes
;   @some_dynamic_lds - external zero-length array; its size is not known here
6@all_lds = internal unnamed_addr addrspace(3) global [16384 x i32] undef, align 4
7@some_lds = internal unnamed_addr addrspace(3) global [32 x i32] undef, align 4
8@some_dynamic_lds = external hidden addrspace(3) global [0 x i32], align 4
9
; addrspace(1) globals whose initializers take the address of an LDS variable
; through a ptrtoint constant expression. The *_global_initializer kernels
; reference these globals instead of referencing the LDS variables directly.
10@initializer_user_some = addrspace(1) global i32 ptrtoint (ptr addrspace(3) @some_lds to i32), align 4
11@initializer_user_all = addrspace(1) global i32 ptrtoint (ptr addrspace(3) @all_lds to i32), align 4
12
13; The alloca in this function cannot be promoted to LDS because @all_lds,
14; which is used through a constant expression in the function, already
15; takes up 65536 bytes of LDS. This use was previously not detected.
16; IR-LABEL: @constant_expression_uses_all_lds(
17; IR: alloca
18
19; ASM-LABEL: {{^}}constant_expression_uses_all_lds:
20; ASM: .amdhsa_group_segment_fixed_size 65536{{$}}
; Kernel: fills a 4 x i32 addrspace(5) stack array with constants, reads one
; element back at the dynamic index %idx and writes it to %out. It also stores
; the address of @all_lds (a ptrtoint constant expression), which gives the
; function a direct constant-expression use of that 65536-byte LDS variable.
; The checks placed before this function expect the alloca to remain.
21define amdgpu_kernel void @constant_expression_uses_all_lds(ptr addrspace(1) nocapture %out, i32 %idx) #0 {
22entry:
  ; The alloca under test, plus constant-index pointers to elements 1..3.
23  %stack = alloca [4 x i32], align 4, addrspace(5)
24  %gep1 = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i32 0, i32 1
25  %gep2 = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i32 0, i32 2
26  %gep3 = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i32 0, i32 3
  ; Initialize all four elements.
27  store i32 9, ptr addrspace(5) %stack
28  store i32 10, ptr addrspace(5) %gep1
29  store i32 99, ptr addrspace(5) %gep2
30  store i32 43, ptr addrspace(5) %gep3
  ; Dynamically indexed load; the loaded value is written to %out.
31  %arrayidx = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i32 0, i32 %idx
32  %load = load i32, ptr addrspace(5) %arrayidx, align 4
33  store i32 %load, ptr addrspace(1) %out
34
  ; Volatile store whose value operand is the constant expression using @all_lds.
35  store volatile i32 ptrtoint (ptr addrspace(3) @all_lds to i32), ptr addrspace(1) undef
36  ret void
37}
38
39; Has a constant expression use through a single level of constant
40; expression, but not enough LDS to block promotion
41
42; IR-LABEL: @constant_expression_uses_some_lds(
43; IR-NOT: alloca
44
45; ASM-LABEL: {{^}}constant_expression_uses_some_lds:
46; ASM: .amdhsa_group_segment_fixed_size 4224{{$}}
; Kernel: same stack-array pattern as the other kernels in this file (fill a
; 4 x i32 addrspace(5) array, load one element at %idx, store it to %out), but
; the constant-expression use refers to the 128-byte @some_lds. The checks
; placed before this function expect no alloca to remain.
; NOTE(review): the expected segment size of 4224 bytes would be consistent
; with 128 bytes for @some_lds plus 4096 bytes (256 work-items x 16 bytes) for
; the promoted array - confirm against the pass.
47define amdgpu_kernel void @constant_expression_uses_some_lds(ptr addrspace(1) nocapture %out, i32 %idx) #0 {
48entry:
  ; The alloca under test, plus constant-index pointers to elements 1..3.
49  %stack = alloca [4 x i32], align 4, addrspace(5)
50  %gep1 = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i32 0, i32 1
51  %gep2 = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i32 0, i32 2
52  %gep3 = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i32 0, i32 3
  ; Initialize all four elements.
53  store i32 9, ptr addrspace(5) %stack
54  store i32 10, ptr addrspace(5) %gep1
55  store i32 99, ptr addrspace(5) %gep2
56  store i32 43, ptr addrspace(5) %gep3
  ; Dynamically indexed load; the loaded value is written to %out.
57  %arrayidx = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i32 0, i32 %idx
58  %load = load i32, ptr addrspace(5) %arrayidx, align 4
59  store i32 %load, ptr addrspace(1) %out
  ; Volatile store whose value operand is the constant expression using @some_lds.
60  store volatile i32 ptrtoint (ptr addrspace(3) @some_lds to i32), ptr addrspace(1) undef
61  ret void
62}
63
64; Has a constant expression use through a single level of constant
65; expression, but usage of dynamic LDS should block promotion
66
67; IR-LABEL: @constant_expression_uses_some_dynamic_lds(
68; IR: alloca
69
70; ASM-LABEL: {{^}}constant_expression_uses_some_dynamic_lds:
71; ASM: .amdhsa_group_segment_fixed_size 0{{$}}
; Kernel: same stack-array pattern (fill a 4 x i32 addrspace(5) array, load one
; element at %idx, store it to %out), followed by a store through an
; addrspacecast of the external zero-length LDS array @some_dynamic_lds.
; The checks placed before this function expect the alloca to remain and a
; fixed group segment size of 0.
72define amdgpu_kernel void @constant_expression_uses_some_dynamic_lds(ptr addrspace(1) nocapture %out, i32 %idx) #0 {
73entry:
  ; The alloca under test, plus constant-index pointers to elements 1..3.
74  %stack = alloca [4 x i32], align 4, addrspace(5)
75  %gep1 = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i32 0, i32 1
76  %gep2 = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i32 0, i32 2
77  %gep3 = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i32 0, i32 3
  ; Initialize all four elements.
78  store i32 9, ptr addrspace(5) %stack
79  store i32 10, ptr addrspace(5) %gep1
80  store i32 99, ptr addrspace(5) %gep2
81  store i32 43, ptr addrspace(5) %gep3
  ; Dynamically indexed load; the loaded value is written to %out.
82  %arrayidx = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i32 0, i32 %idx
83  %load = load i32, ptr addrspace(5) %arrayidx, align 4
84  store i32 %load, ptr addrspace(1) %out
  ; Here the constant expression is the pointer operand of the store, not the value.
85  store i32 1234, ptr addrspacecast (ptr addrspace(3) @some_dynamic_lds to ptr), align 4
86  ret void
87}
88
; External function taking a plain `ptr`; the *_multi_level kernels call it
; with a constant expression derived from an LDS variable as the argument.
89declare void @callee(ptr)
90
91; IR-LABEL: @constant_expression_uses_all_lds_multi_level(
92; IR: alloca
93
94; ASM-LABEL: {{^}}constant_expression_uses_all_lds_multi_level:
95; ASM: .amdhsa_group_segment_fixed_size 65536{{$}}
; Kernel: same stack-array pattern (fill a 4 x i32 addrspace(5) array, load one
; element at %idx, store it to %out), followed by a call whose argument nests
; two constant expressions: an addrspacecast of a getelementptr into the
; 65536-byte @all_lds. The checks placed before this function expect the
; alloca to remain.
96define amdgpu_kernel void @constant_expression_uses_all_lds_multi_level(ptr addrspace(1) nocapture %out, i32 %idx) #0 {
97entry:
  ; The alloca under test, plus constant-index pointers to elements 1..3.
98  %stack = alloca [4 x i32], align 4, addrspace(5)
99  %gep1 = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i32 0, i32 1
100  %gep2 = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i32 0, i32 2
101  %gep3 = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i32 0, i32 3
  ; Initialize all four elements.
102  store i32 9, ptr addrspace(5) %stack
103  store i32 10, ptr addrspace(5) %gep1
104  store i32 99, ptr addrspace(5) %gep2
105  store i32 43, ptr addrspace(5) %gep3
  ; Dynamically indexed load; the loaded value is written to %out.
106  %arrayidx = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i32 0, i32 %idx
107  %load = load i32, ptr addrspace(5) %arrayidx, align 4
108  store i32 %load, ptr addrspace(1) %out
  ; @all_lds is reached only through the inner getelementptr of the cast operand.
109  call void @callee(ptr addrspacecast (ptr addrspace(3) getelementptr inbounds ([16384 x i32], ptr addrspace(3) @all_lds, i32 0, i32 8) to ptr))
110  ret void
111}
112
113; IR-LABEL: @constant_expression_uses_some_lds_multi_level(
114; IR-NOT: alloca
115; IR: llvm.amdgcn.workitem.id
116
117; ASM-LABEL: {{^}}constant_expression_uses_some_lds_multi_level:
118; ASM: .amdhsa_group_segment_fixed_size 4224{{$}}
; Kernel: same stack-array pattern (fill a 4 x i32 addrspace(5) array, load one
; element at %idx, store it to %out), followed by a call whose argument nests
; two constant expressions: an addrspacecast of a getelementptr into the
; 128-byte @some_lds. The checks placed before this function expect no alloca
; to remain and a use of llvm.amdgcn.workitem.id to appear.
119define amdgpu_kernel void @constant_expression_uses_some_lds_multi_level(ptr addrspace(1) nocapture %out, i32 %idx) #0 {
120entry:
  ; The alloca under test, plus constant-index pointers to elements 1..3.
121  %stack = alloca [4 x i32], align 4, addrspace(5)
122  %gep1 = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i32 0, i32 1
123  %gep2 = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i32 0, i32 2
124  %gep3 = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i32 0, i32 3
  ; Initialize all four elements.
125  store i32 9, ptr addrspace(5) %stack
126  store i32 10, ptr addrspace(5) %gep1
127  store i32 99, ptr addrspace(5) %gep2
128  store i32 43, ptr addrspace(5) %gep3
  ; Dynamically indexed load; the loaded value is written to %out.
129  %arrayidx = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i32 0, i32 %idx
130  %load = load i32, ptr addrspace(5) %arrayidx, align 4
131  store i32 %load, ptr addrspace(1) %out
  ; @some_lds is reached only through the inner getelementptr of the cast operand.
132  call void @callee(ptr addrspacecast (ptr addrspace(3) getelementptr inbounds ([32 x i32], ptr addrspace(3) @some_lds, i32 0, i32 8) to ptr))
133  ret void
134}
135
136; IR-LABEL: @constant_expression_uses_some_dynamic_lds_multi_level(
137; IR: alloca
138
139; ASM-LABEL: {{^}}constant_expression_uses_some_dynamic_lds_multi_level:
140; ASM: .amdhsa_group_segment_fixed_size 0{{$}}
; Kernel: same stack-array pattern (fill a 4 x i32 addrspace(5) array, load one
; element at %idx, store it to %out), followed by a call that passes an
; addrspacecast of the external zero-length LDS array @some_dynamic_lds.
; The checks placed before this function expect the alloca to remain and a
; fixed group segment size of 0.
; NOTE(review): despite the "multi_level" name, the call argument here is a
; single addrspacecast with no nested getelementptr - confirm this is intended.
141define amdgpu_kernel void @constant_expression_uses_some_dynamic_lds_multi_level(ptr addrspace(1) nocapture %out, i32 %idx) #0 {
142entry:
  ; The alloca under test, plus constant-index pointers to elements 1..3.
143  %stack = alloca [4 x i32], align 4, addrspace(5)
144  %gep1 = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i32 0, i32 1
145  %gep2 = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i32 0, i32 2
146  %gep3 = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i32 0, i32 3
  ; Initialize all four elements.
147  store i32 9, ptr addrspace(5) %stack
148  store i32 10, ptr addrspace(5) %gep1
149  store i32 99, ptr addrspace(5) %gep2
150  store i32 43, ptr addrspace(5) %gep3
  ; Dynamically indexed load; the loaded value is written to %out.
151  %arrayidx = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i32 0, i32 %idx
152  %load = load i32, ptr addrspace(5) %arrayidx, align 4
153  store i32 %load, ptr addrspace(1) %out
  ; The dynamic LDS variable is used as a call argument via a constant expression.
154  call void @callee(ptr addrspacecast (ptr addrspace(3) @some_dynamic_lds to ptr))
155  ret void
156}
157
158; IR-LABEL: @constant_expression_uses_some_lds_global_initializer(
159; IR-NOT: alloca
160; IR: llvm.amdgcn.workitem.id
161
162; ASM-LABEL: {{^}}constant_expression_uses_some_lds_global_initializer:
163; ASM: .amdhsa_group_segment_fixed_size 4096{{$}}
; Kernel: same stack-array pattern (fill a 4 x i32 addrspace(5) array, load one
; element at %idx, store it to %out). LDS is referenced only indirectly: the
; kernel stores the address of @initializer_user_some, an addrspace(1) global
; whose initializer is a ptrtoint of @some_lds. The checks placed before this
; function expect no alloca to remain.
; NOTE(review): the expected segment size here is 4096 bytes versus 4224 for
; the kernel that uses @some_lds directly, i.e. apparently no space is counted
; for @some_lds in this case - confirm.
164define amdgpu_kernel void @constant_expression_uses_some_lds_global_initializer(ptr addrspace(1) nocapture %out, i32 %idx) #0 {
165entry:
  ; The alloca under test, plus constant-index pointers to elements 1..3.
166  %stack = alloca [4 x i32], align 4, addrspace(5)
167  %gep1 = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i32 0, i32 1
168  %gep2 = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i32 0, i32 2
169  %gep3 = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i32 0, i32 3
  ; Initialize all four elements.
170  store i32 9, ptr addrspace(5) %stack
171  store i32 10, ptr addrspace(5) %gep1
172  store i32 99, ptr addrspace(5) %gep2
173  store i32 43, ptr addrspace(5) %gep3
  ; Dynamically indexed load; the loaded value is written to %out.
174  %arrayidx = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i32 0, i32 %idx
175  %load = load i32, ptr addrspace(5) %arrayidx, align 4
176  store i32 %load, ptr addrspace(1) %out
177
  ; The stored value is the address of the addrspace(1) global, not of the LDS variable.
178  store volatile i32 ptrtoint (ptr addrspace(1) @initializer_user_some to i32), ptr addrspace(1) undef
179  ret void
180}
181
182; We can't actually handle LDS addresses in global initializers,
183; but this should count as usage.
184
185; IR-LABEL: @constant_expression_uses_all_lds_global_initializer(
186; IR: alloca
187
188; ASM-LABEL: {{^}}constant_expression_uses_all_lds_global_initializer:
189; ASM: .group_segment_fixed_size: 65536
; Kernel: same stack-array pattern (fill a 4 x i32 addrspace(5) array, load one
; element at %idx, store it to %out). LDS is referenced only indirectly: the
; kernel stores the address of @initializer_user_all, an addrspace(1) global
; whose initializer is a ptrtoint of the 65536-byte @all_lds. The checks placed
; before this function expect the alloca to remain.
; NOTE(review): the llc check before this function matches the text
; `.group_segment_fixed_size: 65536`, whereas every other kernel in this file
; checks the `.amdhsa_group_segment_fixed_size` directive with a trailing
; end-of-line anchor - confirm that this difference is intentional.
190define amdgpu_kernel void @constant_expression_uses_all_lds_global_initializer(ptr addrspace(1) nocapture %out, i32 %idx) #0 {
191entry:
  ; The alloca under test, plus constant-index pointers to elements 1..3.
192  %stack = alloca [4 x i32], align 4, addrspace(5)
193  %gep1 = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i32 0, i32 1
194  %gep2 = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i32 0, i32 2
195  %gep3 = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i32 0, i32 3
  ; Initialize all four elements.
196  store i32 9, ptr addrspace(5) %stack
197  store i32 10, ptr addrspace(5) %gep1
198  store i32 99, ptr addrspace(5) %gep2
199  store i32 43, ptr addrspace(5) %gep3
  ; Dynamically indexed load; the loaded value is written to %out.
200  %arrayidx = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i32 0, i32 %idx
201  %load = load i32, ptr addrspace(5) %arrayidx, align 4
202  store i32 %load, ptr addrspace(1) %out
  ; The stored value is the address of the addrspace(1) global, not of the LDS variable.
203  store volatile i32 ptrtoint (ptr addrspace(1) @initializer_user_all to i32), ptr addrspace(1) undef
204  ret void
205}
206
; Attribute group shared by every kernel in this file: waves-per-EU given as
; the pair 1,5 and flat work-group size given as the pair 256,256 (minimum and
; maximum are equal, so the work-group size is fixed at 256).
207attributes #0 = { "amdgpu-waves-per-eu"="1,5" "amdgpu-flat-work-group-size"="256,256" }
208