xref: /llvm-project/llvm/test/CodeGen/AMDGPU/lds-alignment.ll (revision 4490003a22658dcd12527029b2c8682b63d8a9d6)
1; RUN: llc -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck -check-prefix=HSA %s
2
3@lds.align16.0 = internal unnamed_addr addrspace(3) global [38 x i8] undef, align 16
4@lds.align16.1 = internal unnamed_addr addrspace(3) global [38 x i8] undef, align 16
5
6@lds.align8.0 = internal unnamed_addr addrspace(3) global [38 x i8] undef, align 8
7@lds.align32.0 = internal unnamed_addr addrspace(3) global [38 x i8] undef, align 32
8
9@lds.missing.align.0 = internal unnamed_addr addrspace(3) global [39 x i32] undef
10@lds.missing.align.1 = internal unnamed_addr addrspace(3) global [7 x i64] undef
11
12declare void @llvm.memcpy.p3.p1.i32(ptr addrspace(3) nocapture, ptr addrspace(1) nocapture readonly, i32, i1) #0
13declare void @llvm.memcpy.p1.p3.i32(ptr addrspace(1) nocapture, ptr addrspace(3) nocapture readonly, i32, i1) #0
14
15
16; HSA-LABEL: {{^}}test_no_round_size_1:
17; HSA: .amdhsa_group_segment_fixed_size 38
18define amdgpu_kernel void @test_no_round_size_1(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
19  call void @llvm.memcpy.p3.p1.i32(ptr addrspace(3) align 4 @lds.align16.0, ptr addrspace(1) align 4 %in, i32 38, i1 false)
20  call void @llvm.memcpy.p1.p3.i32(ptr addrspace(1) align 4 %out, ptr addrspace(3) align 4 @lds.align16.0, i32 38, i1 false)
21  ret void
22}
23
24; There are two objects, so one requires padding to be correctly
25; aligned after the other.
26
27; (38 -> 48) + 38 = 92
28
29; I don't think it is necessary to add padding after since if there
30; were to be a dynamically sized LDS kernel arg, the runtime should
31; add the alignment padding if necessary alignment padding if needed.
32
33; HSA-LABEL: {{^}}test_round_size_2:
34; HSA: .amdhsa_group_segment_fixed_size 86
35define amdgpu_kernel void @test_round_size_2(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
36  call void @llvm.memcpy.p3.p1.i32(ptr addrspace(3) align 4 @lds.align16.0, ptr addrspace(1) align 4 %in, i32 38, i1 false)
37  call void @llvm.memcpy.p1.p3.i32(ptr addrspace(1) align 4 %out, ptr addrspace(3) align 4 @lds.align16.0, i32 38, i1 false)
38
39  call void @llvm.memcpy.p3.p1.i32(ptr addrspace(3) align 4 @lds.align16.1, ptr addrspace(1) align 4 %in, i32 38, i1 false)
40  call void @llvm.memcpy.p1.p3.i32(ptr addrspace(1) align 4 %out, ptr addrspace(3) align 4 @lds.align16.1, i32 38, i1 false)
41
42  ret void
43}
44
45; 38 + (10 pad) + 38  (= 86)
46; HSA-LABEL: {{^}}test_round_size_2_align_8:
47; HSA: .amdhsa_group_segment_fixed_size 86
48define amdgpu_kernel void @test_round_size_2_align_8(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
49  call void @llvm.memcpy.p3.p1.i32(ptr addrspace(3) align 8 @lds.align16.0, ptr addrspace(1) align 8 %in, i32 38, i1 false)
50  call void @llvm.memcpy.p1.p3.i32(ptr addrspace(1) align 8 %out, ptr addrspace(3) align 8 @lds.align16.0, i32 38, i1 false)
51
52  call void @llvm.memcpy.p3.p1.i32(ptr addrspace(3) align 8 @lds.align8.0, ptr addrspace(1) align 8 %in, i32 38, i1 false)
53  call void @llvm.memcpy.p1.p3.i32(ptr addrspace(1) align 8 %out, ptr addrspace(3) align 8 @lds.align8.0, i32 38, i1 false)
54
55  ret void
56}
57
58; HSA-LABEL: {{^}}test_round_local_lds_and_arg:
59; HSA: .amdhsa_group_segment_fixed_size 38
60define amdgpu_kernel void @test_round_local_lds_and_arg(ptr addrspace(1) %out, ptr addrspace(1) %in, ptr addrspace(3) %lds.arg) #1 {
61  call void @llvm.memcpy.p3.p1.i32(ptr addrspace(3) align 4 @lds.align16.0, ptr addrspace(1) align 4 %in, i32 38, i1 false)
62
63  call void @llvm.memcpy.p1.p3.i32(ptr addrspace(1) align 4 %out, ptr addrspace(3) align 4 @lds.align16.0, i32 38, i1 false)
64  call void @llvm.memcpy.p3.p1.i32(ptr addrspace(3) align 4 %lds.arg, ptr addrspace(1) align 4 %in, i32 38, i1 false)
65  call void @llvm.memcpy.p1.p3.i32(ptr addrspace(1) align 4 %out, ptr addrspace(3) align 4 %lds.arg, i32 38, i1 false)
66  ret void
67}
68
69; HSA-LABEL: {{^}}test_round_lds_arg:
70; HSA: .amdhsa_group_segment_fixed_size 0
71define amdgpu_kernel void @test_round_lds_arg(ptr addrspace(1) %out, ptr addrspace(1) %in, ptr addrspace(3) %lds.arg) #1 {
72  call void @llvm.memcpy.p3.p1.i32(ptr addrspace(3) align 4 %lds.arg, ptr addrspace(1) align 4 %in, i32 38, i1 false)
73  call void @llvm.memcpy.p1.p3.i32(ptr addrspace(1) align 4 %out, ptr addrspace(3) align 4 %lds.arg, i32 38, i1 false)
74  ret void
75}
76
77; FIXME: Parameter alignment not considered
78; HSA-LABEL: {{^}}test_high_align_lds_arg:
79; HSA: .amdhsa_group_segment_fixed_size 0
80define amdgpu_kernel void @test_high_align_lds_arg(ptr addrspace(1) %out, ptr addrspace(1) %in, ptr addrspace(3) align 64 %lds.arg) #1 {
81  call void @llvm.memcpy.p3.p1.i32(ptr addrspace(3) align 64 %lds.arg, ptr addrspace(1) align 64 %in, i32 38, i1 false)
82  call void @llvm.memcpy.p1.p3.i32(ptr addrspace(1) align 64 %out, ptr addrspace(3) align 64 %lds.arg, i32 38, i1 false)
83  ret void
84}
85
86; (39 * 4) + (4 pad) + (7 * 8) = 216
87; HSA-LABEL: {{^}}test_missing_alignment_size_2_order0:
88; HSA: .amdhsa_group_segment_fixed_size 216
89define amdgpu_kernel void @test_missing_alignment_size_2_order0(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
90  call void @llvm.memcpy.p3.p1.i32(ptr addrspace(3) align 4 @lds.missing.align.0, ptr addrspace(1) align 4 %in, i32 160, i1 false)
91  call void @llvm.memcpy.p1.p3.i32(ptr addrspace(1) align 4 %out, ptr addrspace(3) align 4 @lds.missing.align.0, i32 160, i1 false)
92
93  call void @llvm.memcpy.p3.p1.i32(ptr addrspace(3) align 8 @lds.missing.align.1, ptr addrspace(1) align 8 %in, i32 56, i1 false)
94  call void @llvm.memcpy.p1.p3.i32(ptr addrspace(1) align 8 %out, ptr addrspace(3) align 8 @lds.missing.align.1, i32 56, i1 false)
95
96  ret void
97}
98
99; (39 * 4) + (4 pad) + (7 * 8) = 216
100; HSA-LABEL: {{^}}test_missing_alignment_size_2_order1:
101; HSA: .amdhsa_group_segment_fixed_size 216
102define amdgpu_kernel void @test_missing_alignment_size_2_order1(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
103  call void @llvm.memcpy.p3.p1.i32(ptr addrspace(3) align 8 @lds.missing.align.1, ptr addrspace(1) align 8 %in, i32 56, i1 false)
104  call void @llvm.memcpy.p1.p3.i32(ptr addrspace(1) align 8 %out, ptr addrspace(3) align 8 @lds.missing.align.1, i32 56, i1 false)
105
106  call void @llvm.memcpy.p3.p1.i32(ptr addrspace(3) align 4 @lds.missing.align.0, ptr addrspace(1) align 4 %in, i32 160, i1 false)
107  call void @llvm.memcpy.p1.p3.i32(ptr addrspace(1) align 4 %out, ptr addrspace(3) align 4 @lds.missing.align.0, i32 160, i1 false)
108
109  ret void
110}
111
112; align 32, 16, 16
113; 38 + (10 pad) + 38 + (10 pad) + 38  ( = 134)
114; HSA-LABEL: {{^}}test_round_size_3_order0:
115; HSA: .amdhsa_group_segment_fixed_size 134
116define amdgpu_kernel void @test_round_size_3_order0(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
117  call void @llvm.memcpy.p3.p1.i32(ptr addrspace(3) align 8 @lds.align32.0, ptr addrspace(1) align 8 %in, i32 38, i1 false)
118  call void @llvm.memcpy.p1.p3.i32(ptr addrspace(1) align 8 %out, ptr addrspace(3) align 8 @lds.align32.0, i32 38, i1 false)
119
120  call void @llvm.memcpy.p3.p1.i32(ptr addrspace(3) align 8 @lds.align16.0, ptr addrspace(1) align 8 %in, i32 38, i1 false)
121  call void @llvm.memcpy.p1.p3.i32(ptr addrspace(1) align 8 %out, ptr addrspace(3) align 8 @lds.align16.0, i32 38, i1 false)
122
123  call void @llvm.memcpy.p3.p1.i32(ptr addrspace(3) align 8 @lds.align8.0, ptr addrspace(1) align 8 %in, i32 38, i1 false)
124  call void @llvm.memcpy.p1.p3.i32(ptr addrspace(1) align 8 %out, ptr addrspace(3) align 8 @lds.align8.0, i32 38, i1 false)
125
126  ret void
127}
128
129; align 32, 16, 16
130; 38 (+ 10 pad) + 38 + (10 pad) + 38 ( = 134)
131; HSA-LABEL: {{^}}test_round_size_3_order1:
132; HSA: .amdhsa_group_segment_fixed_size 134
133define amdgpu_kernel void @test_round_size_3_order1(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
134  call void @llvm.memcpy.p3.p1.i32(ptr addrspace(3) align 8 @lds.align32.0, ptr addrspace(1) align 8 %in, i32 38, i1 false)
135  call void @llvm.memcpy.p1.p3.i32(ptr addrspace(1) align 8 %out, ptr addrspace(3) align 8 @lds.align32.0, i32 38, i1 false)
136
137  call void @llvm.memcpy.p3.p1.i32(ptr addrspace(3) align 8 @lds.align8.0, ptr addrspace(1) align 8 %in, i32 38, i1 false)
138  call void @llvm.memcpy.p1.p3.i32(ptr addrspace(1) align 8 %out, ptr addrspace(3) align 8 @lds.align8.0, i32 38, i1 false)
139
140  call void @llvm.memcpy.p3.p1.i32(ptr addrspace(3) align 8 @lds.align16.0, ptr addrspace(1) align 8 %in, i32 38, i1 false)
141  call void @llvm.memcpy.p1.p3.i32(ptr addrspace(1) align 8 %out, ptr addrspace(3) align 8 @lds.align16.0, i32 38, i1 false)
142
143  ret void
144}
145
146; align 32, 16, 16
147; 38 + (10 pad) + 38 + (10 pad) + 38  ( = 126)
148; HSA-LABEL: {{^}}test_round_size_3_order2:
149; HSA: .amdhsa_group_segment_fixed_size 134
150define amdgpu_kernel void @test_round_size_3_order2(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
151  call void @llvm.memcpy.p3.p1.i32(ptr addrspace(3) align 8 @lds.align16.0, ptr addrspace(1) align 8 %in, i32 38, i1 false)
152  call void @llvm.memcpy.p1.p3.i32(ptr addrspace(1) align 8 %out, ptr addrspace(3) align 8 @lds.align16.0, i32 38, i1 false)
153
154  call void @llvm.memcpy.p3.p1.i32(ptr addrspace(3) align 8 @lds.align32.0, ptr addrspace(1) align 8 %in, i32 38, i1 false)
155  call void @llvm.memcpy.p1.p3.i32(ptr addrspace(1) align 8 %out, ptr addrspace(3) align 8 @lds.align32.0, i32 38, i1 false)
156
157  call void @llvm.memcpy.p3.p1.i32(ptr addrspace(3) align 8 @lds.align8.0, ptr addrspace(1) align 8 %in, i32 38, i1 false)
158  call void @llvm.memcpy.p1.p3.i32(ptr addrspace(1) align 8 %out, ptr addrspace(3) align 8 @lds.align8.0, i32 38, i1 false)
159
160  ret void
161}
162
163; align 32, 16, 16
164; 38 + (10 pad) + 38 + (10 pad) + 38 ( = 134)
165; HSA-LABEL: {{^}}test_round_size_3_order3:
166; HSA: .amdhsa_group_segment_fixed_size 134
167define amdgpu_kernel void @test_round_size_3_order3(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
168  call void @llvm.memcpy.p3.p1.i32(ptr addrspace(3) align 8 @lds.align16.0, ptr addrspace(1) align 8 %in, i32 38, i1 false)
169  call void @llvm.memcpy.p1.p3.i32(ptr addrspace(1) align 8 %out, ptr addrspace(3) align 8 @lds.align16.0, i32 38, i1 false)
170
171  call void @llvm.memcpy.p3.p1.i32(ptr addrspace(3) align 8 @lds.align8.0, ptr addrspace(1) align 8 %in, i32 38, i1 false)
172  call void @llvm.memcpy.p1.p3.i32(ptr addrspace(1) align 8 %out, ptr addrspace(3) align 8 @lds.align8.0, i32 38, i1 false)
173
174  call void @llvm.memcpy.p3.p1.i32(ptr addrspace(3) align 8 @lds.align32.0, ptr addrspace(1) align 8 %in, i32 38, i1 false)
175  call void @llvm.memcpy.p1.p3.i32(ptr addrspace(1) align 8 %out, ptr addrspace(3) align 8 @lds.align32.0, i32 38, i1 false)
176
177  ret void
178}
179
180; align 32, 16, 16
181; 38 + (10 pad) + 38 + (10 pad) + 38  (= 134)
182; HSA-LABEL: {{^}}test_round_size_3_order4:
183; HSA: .amdhsa_group_segment_fixed_size 134
184define amdgpu_kernel void @test_round_size_3_order4(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
185  call void @llvm.memcpy.p3.p1.i32(ptr addrspace(3) align 8 @lds.align8.0, ptr addrspace(1) align 8 %in, i32 38, i1 false)
186  call void @llvm.memcpy.p1.p3.i32(ptr addrspace(1) align 8 %out, ptr addrspace(3) align 8 @lds.align8.0, i32 38, i1 false)
187
188  call void @llvm.memcpy.p3.p1.i32(ptr addrspace(3) align 8 @lds.align32.0, ptr addrspace(1) align 8 %in, i32 38, i1 false)
189  call void @llvm.memcpy.p1.p3.i32(ptr addrspace(1) align 8 %out, ptr addrspace(3) align 8 @lds.align32.0, i32 38, i1 false)
190
191  call void @llvm.memcpy.p3.p1.i32(ptr addrspace(3) align 8 @lds.align16.0, ptr addrspace(1) align 8 %in, i32 38, i1 false)
192  call void @llvm.memcpy.p1.p3.i32(ptr addrspace(1) align 8 %out, ptr addrspace(3) align 8 @lds.align16.0, i32 38, i1 false)
193
194  ret void
195}
196
197; align 32, 16, 16
198; 38 + (10 pad) + 38 + (10 pad) + 38  (= 134)
199; HSA-LABEL: {{^}}test_round_size_3_order5:
200; HSA: .amdhsa_group_segment_fixed_size 134
201define amdgpu_kernel void @test_round_size_3_order5(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
202  call void @llvm.memcpy.p3.p1.i32(ptr addrspace(3) align 8 @lds.align8.0, ptr addrspace(1) align 8 %in, i32 38, i1 false)
203  call void @llvm.memcpy.p1.p3.i32(ptr addrspace(1) align 8 %out, ptr addrspace(3) align 8 @lds.align8.0, i32 38, i1 false)
204
205  call void @llvm.memcpy.p3.p1.i32(ptr addrspace(3) align 8 @lds.align16.0, ptr addrspace(1) align 8 %in, i32 38, i1 false)
206  call void @llvm.memcpy.p1.p3.i32(ptr addrspace(1) align 8 %out, ptr addrspace(3) align 8 @lds.align16.0, i32 38, i1 false)
207
208  call void @llvm.memcpy.p3.p1.i32(ptr addrspace(3) align 8 @lds.align32.0, ptr addrspace(1) align 8 %in, i32 38, i1 false)
209  call void @llvm.memcpy.p1.p3.i32(ptr addrspace(1) align 8 %out, ptr addrspace(3) align 8 @lds.align32.0, i32 38, i1 false)
210
211  ret void
212}
213
214attributes #0 = { argmemonly nounwind }
215attributes #1 = { nounwind }
216attributes #2 = { convergent nounwind }
217
218!llvm.module.flags = !{!0}
219!0 = !{i32 1, !"amdhsa_code_object_version", i32 400}
220