xref: /llvm-project/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-hybrid.ll (revision 41ed16c3b3362e51b7063eaef6461ab704c1ec7a)
1; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
2; RUN: opt -S -mtriple=amdgcn--amdhsa -passes=amdgpu-lower-module-lds < %s --amdgpu-lower-module-lds-strategy=hybrid | FileCheck -check-prefix=OPT %s
3; RUN: llc -mtriple=amdgcn--amdhsa -verify-machineinstrs < %s --amdgpu-lower-module-lds-strategy=hybrid | FileCheck -check-prefix=GCN %s
4
5; Opt checks from utils/update_test_checks.py, llc checks from utils/update_llc_test_checks.py
6
7; Define four variables and four non-kernel functions which access exactly one variable each
; (@v0 is accessed by @f0, @v1 by @f1, @v2 by @f2 and @v3 by @f3). A fifth variable, @unused,
; is also defined but is not referenced by any function in this file.
8@v0 = addrspace(3) global float poison
9@v1 = addrspace(3) global i16 poison, align 16
10@v2 = addrspace(3) global i64 poison
11@v3 = addrspace(3) global i8 poison
12@unused = addrspace(3) global i16 poison
13
14; OPT{LITERAL}: @llvm.amdgcn.lds.offset.table = internal addrspace(4) constant [2 x [1 x i32]] [[1 x i32] [i32 ptrtoint (ptr addrspace(3) @llvm.amdgcn.kernel.k123.lds to i32)], [1 x i32] [i32 ptrtoint (ptr addrspace(3) @llvm.amdgcn.kernel.k23.lds to i32)]]
15
; @f0 doubles the float stored in @v0. The OPT checks expect both accesses to be rewritten to
; @llvm.amdgcn.kernel.k01.lds; the GCN checks expect the DS read/write at offset 4.
16define void @f0() {
17; OPT-LABEL: @f0(
18; OPT-NEXT:    [[LD:%.*]] = load float, ptr addrspace(3) @llvm.amdgcn.kernel.k01.lds, align 4
19; OPT-NEXT:    [[MUL:%.*]] = fmul float [[LD]], 2.000000e+00
20; OPT-NEXT:    store float [[MUL]], ptr addrspace(3) @llvm.amdgcn.kernel.k01.lds, align 4
21; OPT-NEXT:    ret void
22;
23; GCN-LABEL: f0:
24; GCN:       ; %bb.0:
25; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26; GCN-NEXT:    v_mov_b32_e32 v0, 0
27; GCN-NEXT:    s_mov_b32 m0, -1
28; GCN-NEXT:    ds_read_b32 v1, v0 offset:4
29; GCN-NEXT:    s_waitcnt lgkmcnt(0)
30; GCN-NEXT:    v_add_f32_e32 v1, v1, v1
31; GCN-NEXT:    ds_write_b32 v0, v1 offset:4
32; GCN-NEXT:    s_waitcnt lgkmcnt(0)
33; GCN-NEXT:    s_setpc_b64 s[30:31]
34  %ld = load float, ptr addrspace(3) @v0
35  %mul = fmul float %ld, 2.
36  store float %mul, ptr  addrspace(3) @v0
37  ret void
38}
39
; @f1 multiplies the i16 stored in @v1 by 3. The OPT checks expect both accesses to be rewritten
; to @llvm.amdgcn.module.lds; the GCN checks expect the DS read/write with no offset.
40define void @f1() {
41; OPT-LABEL: @f1(
42; OPT-NEXT:    [[LD:%.*]] = load i16, ptr addrspace(3) @llvm.amdgcn.module.lds, align 16
43; OPT-NEXT:    [[MUL:%.*]] = mul i16 [[LD]], 3
44; OPT-NEXT:    store i16 [[MUL]], ptr addrspace(3) @llvm.amdgcn.module.lds, align 16
45; OPT-NEXT:    ret void
46;
47; GCN-LABEL: f1:
48; GCN:       ; %bb.0:
49; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
50; GCN-NEXT:    v_mov_b32_e32 v0, 0
51; GCN-NEXT:    s_mov_b32 m0, -1
52; GCN-NEXT:    ds_read_u16 v1, v0
53; GCN-NEXT:    s_waitcnt lgkmcnt(0)
54; GCN-NEXT:    v_mul_lo_u32 v1, v1, 3
55; GCN-NEXT:    ds_write_b16 v0, v1
56; GCN-NEXT:    s_waitcnt lgkmcnt(0)
57; GCN-NEXT:    s_setpc_b64 s[30:31]
58  %ld = load i16, ptr addrspace(3) @v1
59  %mul = mul i16 %ld, 3
60  store i16 %mul, ptr  addrspace(3) @v1
61  ret void
62}
63
; @f2 multiplies the i64 stored in @v2 by 4. The OPT checks expect the address of @v2 to be
; loaded from @llvm.amdgcn.lds.offset.table, indexed by the result of
; @llvm.amdgcn.lds.kernel.id(), once for the load and once for the store. The GCN checks
; expect s15 to be used as the index into that table.
64define void @f2() {
65; OPT-LABEL: @f2(
66; OPT-NEXT:    [[TMP1:%.*]] = call i32 @llvm.amdgcn.lds.kernel.id()
67; OPT-NEXT:    [[V22:%.*]] = getelementptr inbounds [2 x [1 x i32]], ptr addrspace(4) @llvm.amdgcn.lds.offset.table, i32 0, i32 [[TMP1]], i32 0
68; OPT-NEXT:    [[TMP2:%.*]] = load i32, ptr addrspace(4) [[V22]], align 4
69; OPT-NEXT:    [[V23:%.*]] = inttoptr i32 [[TMP2]] to ptr addrspace(3)
70; OPT-NEXT:    [[LD:%.*]] = load i64, ptr addrspace(3) [[V23]], align 8
71; OPT-NEXT:    [[MUL:%.*]] = mul i64 [[LD]], 4
72; OPT-NEXT:    [[V2:%.*]] = getelementptr inbounds [2 x [1 x i32]], ptr addrspace(4) @llvm.amdgcn.lds.offset.table, i32 0, i32 [[TMP1]], i32 0
73; OPT-NEXT:    [[TMP3:%.*]] = load i32, ptr addrspace(4) [[V2]], align 4
74; OPT-NEXT:    [[V21:%.*]] = inttoptr i32 [[TMP3]] to ptr addrspace(3)
75; OPT-NEXT:    store i64 [[MUL]], ptr addrspace(3) [[V21]], align 8
76; OPT-NEXT:    ret void
77;
78; GCN-LABEL: f2:
79; GCN:       ; %bb.0:
80; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
81; GCN-NEXT:    s_mov_b32 s4, s15
82; GCN-NEXT:    s_ashr_i32 s5, s15, 31
83; GCN-NEXT:    s_getpc_b64 s[6:7]
84; GCN-NEXT:    s_add_u32 s6, s6, llvm.amdgcn.lds.offset.table@rel32@lo+4
85; GCN-NEXT:    s_addc_u32 s7, s7, llvm.amdgcn.lds.offset.table@rel32@hi+12
86; GCN-NEXT:    s_lshl_b64 s[4:5], s[4:5], 2
87; GCN-NEXT:    s_add_u32 s4, s4, s6
88; GCN-NEXT:    s_addc_u32 s5, s5, s7
89; GCN-NEXT:    s_load_dword s4, s[4:5], 0x0
90; GCN-NEXT:    s_waitcnt lgkmcnt(0)
91; GCN-NEXT:    v_mov_b32_e32 v2, s4
92; GCN-NEXT:    s_mov_b32 m0, -1
93; GCN-NEXT:    ds_read_b64 v[0:1], v2
94; GCN-NEXT:    s_waitcnt lgkmcnt(0)
95; GCN-NEXT:    v_lshl_b64 v[0:1], v[0:1], 2
96; GCN-NEXT:    ds_write_b64 v2, v[0:1]
97; GCN-NEXT:    s_waitcnt lgkmcnt(0)
98; GCN-NEXT:    s_setpc_b64 s[30:31]
99  %ld = load i64, ptr addrspace(3) @v2
100  %mul = mul i64 %ld, 4
101  store i64 %mul, ptr  addrspace(3) @v2
102  ret void
103}
104
; @f3 multiplies the i8 stored in @v3 by 5. The OPT checks expect both accesses to be rewritten
; to field 1 of @llvm.amdgcn.kernel.k23.lds; the GCN checks expect the DS read/write at offset 8.
105define void @f3() {
106; OPT-LABEL: @f3(
107; OPT-NEXT:    [[LD:%.*]] = load i8, ptr addrspace(3) getelementptr inbounds ([[LLVM_AMDGCN_KERNEL_K23_LDS_T:%.*]], ptr addrspace(3) @llvm.amdgcn.kernel.k23.lds, i32 0, i32 1), align 8
108; OPT-NEXT:    [[MUL:%.*]] = mul i8 [[LD]], 5
109; OPT-NEXT:    store i8 [[MUL]], ptr addrspace(3) getelementptr inbounds ([[LLVM_AMDGCN_KERNEL_K23_LDS_T]], ptr addrspace(3) @llvm.amdgcn.kernel.k23.lds, i32 0, i32 1), align 8
110; OPT-NEXT:    ret void
111;
112; GCN-LABEL: f3:
113; GCN:       ; %bb.0:
114; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
115; GCN-NEXT:    v_mov_b32_e32 v0, 0
116; GCN-NEXT:    s_mov_b32 m0, -1
117; GCN-NEXT:    ds_read_u8 v1, v0 offset:8
118; GCN-NEXT:    s_waitcnt lgkmcnt(0)
119; GCN-NEXT:    v_mul_lo_u32 v1, v1, 5
120; GCN-NEXT:    ds_write_b8 v0, v1 offset:8
121; GCN-NEXT:    s_waitcnt lgkmcnt(0)
122; GCN-NEXT:    s_setpc_b64 s[30:31]
123  %ld = load i8, ptr addrspace(3) @v3
124  %mul = mul i8 %ld, 5
125  store i8 %mul, ptr  addrspace(3) @v3
126  ret void
127}
128
129; Doesn't access any via a function, won't be in the lookup table
; The kernel multiplies the i64 stored in @v2 by 8 directly, without calling any function.
; The OPT checks expect both accesses to be rewritten to
; @llvm.amdgcn.kernel.kernel_no_table.lds.
130define amdgpu_kernel void @kernel_no_table() {
131; OPT-LABEL: @kernel_no_table(
132; OPT-NEXT:    [[LD:%.*]] = load i64, ptr addrspace(3) @llvm.amdgcn.kernel.kernel_no_table.lds, align 8
133; OPT-NEXT:    [[MUL:%.*]] = mul i64 [[LD]], 8
134; OPT-NEXT:    store i64 [[MUL]], ptr addrspace(3) @llvm.amdgcn.kernel.kernel_no_table.lds, align 8
135; OPT-NEXT:    ret void
136;
137; GCN-LABEL: kernel_no_table:
138; GCN:       ; %bb.0:
139; GCN-NEXT:    v_mov_b32_e32 v2, 0
140; GCN-NEXT:    s_mov_b32 m0, -1
141; GCN-NEXT:    ds_read_b64 v[0:1], v2
142; GCN-NEXT:    s_waitcnt lgkmcnt(0)
143; GCN-NEXT:    v_lshl_b64 v[0:1], v[0:1], 3
144; GCN-NEXT:    ds_write_b64 v2, v[0:1]
145; GCN-NEXT:    s_endpgm
146  %ld = load i64, ptr addrspace(3) @v2
147  %mul = mul i64 %ld, 8
148  store i64 %mul, ptr  addrspace(3) @v2
149  ret void
150}
151
152; Access two variables, will allocate those two
; The kernel calls @f0 and then @f1. The OPT checks expect "ExplicitUse" markers for
; @llvm.amdgcn.kernel.k01.lds and @llvm.amdgcn.module.lds ahead of the two calls.
153define amdgpu_kernel void @k01() {
154; OPT-LABEL: @k01(
155; OPT-NEXT:    call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.kernel.k01.lds) ]
156; OPT-NEXT:    call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.module.lds) ]
157; OPT-NEXT:    call void @f0()
158; OPT-NEXT:    call void @f1()
159; OPT-NEXT:    ret void
160;
161; GCN-LABEL: k01:
162; GCN:       ; %bb.0:
163; GCN-NEXT:    s_mov_b32 s32, 0
164; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s13
165; GCN-NEXT:    s_add_i32 s12, s12, s17
166; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
167; GCN-NEXT:    s_add_u32 s0, s0, s17
168; GCN-NEXT:    s_addc_u32 s1, s1, 0
169; GCN-NEXT:    s_mov_b32 s13, s15
170; GCN-NEXT:    s_mov_b32 s12, s14
171; GCN-NEXT:    s_getpc_b64 s[14:15]
172; GCN-NEXT:    s_add_u32 s14, s14, f0@gotpcrel32@lo+4
173; GCN-NEXT:    s_addc_u32 s15, s15, f0@gotpcrel32@hi+12
174; GCN-NEXT:    s_load_dwordx2 s[18:19], s[14:15], 0x0
175; GCN-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
176; GCN-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
177; GCN-NEXT:    v_or_b32_e32 v0, v0, v1
178; GCN-NEXT:    v_or_b32_e32 v31, v0, v2
179; GCN-NEXT:    s_mov_b32 s14, s16
180; GCN-NEXT:    s_waitcnt lgkmcnt(0)
181; GCN-NEXT:    s_swappc_b64 s[30:31], s[18:19]
182; GCN-NEXT:    s_getpc_b64 s[14:15]
183; GCN-NEXT:    s_add_u32 s14, s14, f1@gotpcrel32@lo+4
184; GCN-NEXT:    s_addc_u32 s15, s15, f1@gotpcrel32@hi+12
185; GCN-NEXT:    s_load_dwordx2 s[18:19], s[14:15], 0x0
186; GCN-NEXT:    s_mov_b32 s14, s16
187; GCN-NEXT:    s_waitcnt lgkmcnt(0)
188; GCN-NEXT:    s_swappc_b64 s[30:31], s[18:19]
189; GCN-NEXT:    s_endpgm
190
191  call void @f0()
192  call void @f1()
193  ret void
194}
195
; The kernel calls @f2 and then @f3. The OPT checks expect an "ExplicitUse" marker for
; @llvm.amdgcn.kernel.k23.lds ahead of the calls. The GCN checks expect s15 to be set to 1
; before the first call (the GCN checks for f2 use s15 to index the offset table).
196define amdgpu_kernel void @k23() {
197; OPT-LABEL: @k23(
198; OPT-NEXT:    call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.kernel.k23.lds) ], !alias.scope [[META5:![0-9]+]], !noalias [[META8:![0-9]+]]
199; OPT-NEXT:    call void @f2()
200; OPT-NEXT:    call void @f3()
201; OPT-NEXT:    ret void
202;
203; GCN-LABEL: k23:
204; GCN:       ; %bb.0:
205; GCN-NEXT:    s_mov_b32 s32, 0
206; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s13
207; GCN-NEXT:    s_add_i32 s12, s12, s17
208; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
209; GCN-NEXT:    s_add_u32 s0, s0, s17
210; GCN-NEXT:    s_addc_u32 s1, s1, 0
211; GCN-NEXT:    s_mov_b32 s20, s16
212; GCN-NEXT:    s_mov_b32 s13, s15
213; GCN-NEXT:    s_mov_b32 s12, s14
214; GCN-NEXT:    s_mov_b64 s[16:17], s[6:7]
215; GCN-NEXT:    s_mov_b64 s[18:19], s[4:5]
216; GCN-NEXT:    s_getpc_b64 s[4:5]
217; GCN-NEXT:    s_add_u32 s4, s4, f2@gotpcrel32@lo+4
218; GCN-NEXT:    s_addc_u32 s5, s5, f2@gotpcrel32@hi+12
219; GCN-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
220; GCN-NEXT:    s_load_dwordx2 s[22:23], s[4:5], 0x0
221; GCN-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
222; GCN-NEXT:    v_or_b32_e32 v0, v0, v1
223; GCN-NEXT:    v_or_b32_e32 v31, v0, v2
224; GCN-NEXT:    s_mov_b32 s15, 1
225; GCN-NEXT:    s_mov_b64 s[4:5], s[18:19]
226; GCN-NEXT:    s_mov_b32 s14, s20
227; GCN-NEXT:    s_waitcnt lgkmcnt(0)
228; GCN-NEXT:    s_swappc_b64 s[30:31], s[22:23]
229; GCN-NEXT:    s_getpc_b64 s[4:5]
230; GCN-NEXT:    s_add_u32 s4, s4, f3@gotpcrel32@lo+4
231; GCN-NEXT:    s_addc_u32 s5, s5, f3@gotpcrel32@hi+12
232; GCN-NEXT:    s_load_dwordx2 s[22:23], s[4:5], 0x0
233; GCN-NEXT:    s_mov_b64 s[4:5], s[18:19]
234; GCN-NEXT:    s_mov_b64 s[6:7], s[16:17]
235; GCN-NEXT:    s_waitcnt lgkmcnt(0)
236; GCN-NEXT:    s_swappc_b64 s[30:31], s[22:23]
237; GCN-NEXT:    s_endpgm
238
239
240  call void @f2()
241  call void @f3()
242  ret void
243}
244
245; Access and allocate three variables
; The kernel calls @f1, multiplies the i8 stored in @v3 by 8 directly, then calls @f2.
; The OPT checks expect the direct @v3 access to be rewritten to field 1 of
; @llvm.amdgcn.kernel.k123.lds. The GCN checks expect s15 to be set to 0 before each call.
246define amdgpu_kernel void @k123() {
247; OPT-LABEL: @k123(
248; OPT-NEXT:    call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.kernel.k123.lds) ], !alias.scope [[META11:![0-9]+]], !noalias [[META14:![0-9]+]]
249; OPT-NEXT:    call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.module.lds) ]
250; OPT-NEXT:    call void @f1()
251; OPT-NEXT:    [[LD:%.*]] = load i8, ptr addrspace(3) getelementptr inbounds ([[LLVM_AMDGCN_KERNEL_K123_LDS_T:%.*]], ptr addrspace(3) @llvm.amdgcn.kernel.k123.lds, i32 0, i32 1), align 8, !alias.scope [[META14]], !noalias [[META11]]
252; OPT-NEXT:    [[MUL:%.*]] = mul i8 [[LD]], 8
253; OPT-NEXT:    store i8 [[MUL]], ptr addrspace(3) getelementptr inbounds ([[LLVM_AMDGCN_KERNEL_K123_LDS_T]], ptr addrspace(3) @llvm.amdgcn.kernel.k123.lds, i32 0, i32 1), align 8, !alias.scope [[META14]], !noalias [[META11]]
254; OPT-NEXT:    call void @f2()
255; OPT-NEXT:    ret void
256;
257; GCN-LABEL: k123:
258; GCN:       ; %bb.0:
259; GCN-NEXT:    s_mov_b32 s32, 0
260; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s13
261; GCN-NEXT:    s_add_i32 s12, s12, s17
262; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
263; GCN-NEXT:    s_add_u32 s0, s0, s17
264; GCN-NEXT:    s_addc_u32 s1, s1, 0
265; GCN-NEXT:    s_mov_b32 s13, s15
266; GCN-NEXT:    s_mov_b32 s12, s14
267; GCN-NEXT:    s_getpc_b64 s[14:15]
268; GCN-NEXT:    s_add_u32 s14, s14, f1@gotpcrel32@lo+4
269; GCN-NEXT:    s_addc_u32 s15, s15, f1@gotpcrel32@hi+12
270; GCN-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
271; GCN-NEXT:    s_load_dwordx2 s[18:19], s[14:15], 0x0
272; GCN-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
273; GCN-NEXT:    v_or_b32_e32 v0, v0, v1
274; GCN-NEXT:    v_or_b32_e32 v31, v0, v2
275; GCN-NEXT:    s_mov_b32 s15, 0
276; GCN-NEXT:    s_mov_b32 s14, s16
277; GCN-NEXT:    s_waitcnt lgkmcnt(0)
278; GCN-NEXT:    s_swappc_b64 s[30:31], s[18:19]
279; GCN-NEXT:    v_mov_b32_e32 v0, 0
280; GCN-NEXT:    s_mov_b32 m0, -1
281; GCN-NEXT:    ds_read_u8 v1, v0 offset:16
282; GCN-NEXT:    s_getpc_b64 s[14:15]
283; GCN-NEXT:    s_add_u32 s14, s14, f2@gotpcrel32@lo+4
284; GCN-NEXT:    s_addc_u32 s15, s15, f2@gotpcrel32@hi+12
285; GCN-NEXT:    s_load_dwordx2 s[18:19], s[14:15], 0x0
286; GCN-NEXT:    s_waitcnt lgkmcnt(0)
287; GCN-NEXT:    v_lshlrev_b32_e32 v1, 3, v1
288; GCN-NEXT:    ds_write_b8 v0, v1 offset:16
289; GCN-NEXT:    s_mov_b32 s15, 0
290; GCN-NEXT:    s_mov_b32 s14, s16
291; GCN-NEXT:    s_swappc_b64 s[30:31], s[18:19]
292; GCN-NEXT:    s_endpgm
293  call void @f1()
294  %ld = load i8, ptr addrspace(3) @v3
295  %mul = mul i8 %ld, 8
296  store i8 %mul, ptr  addrspace(3) @v3
297  call void @f2()
298  ret void
299}
300
301!0 = !{i32 0}
302!1 = !{i32 2}
303!2 = !{i32 1}
304
305; OPT: attributes #0 = { "amdgpu-lds-size"="8" }
306; OPT: attributes #1 = { "amdgpu-lds-size"="16" }
307; OPT: attributes #2 = { "amdgpu-lds-size"="24" }
308; OPT: attributes #3 = { nocallback nofree nosync nounwind willreturn memory(none) }
309; OPT: attributes #4 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
310
311; OPT: !0 = !{i32 0, i32 1}
312; OPT: !1 = !{i32 4, i32 5}
313; OPT: !2 = !{i32 8, i32 9}
314; OPT: !3 = !{i32 1, !"amdhsa_code_object_version", i32 500}
315; OPT: !4 = !{i32 1}
316; OPT: !5 = !{!6}
317; OPT: !6 = distinct !{!6, !7}
318; OPT: !7 = distinct !{!7}
319; OPT: !8 = !{!9}
320; OPT: !9 = distinct !{!9, !7}
321; OPT: !10 = !{i32 0}
322; OPT: !11 = !{!12}
323; OPT: !12 = distinct !{!12, !13}
324; OPT: !13 = distinct !{!13}
325; OPT: !14 = !{!15}
326; OPT: !15 = distinct !{!15, !13}
327
328attributes #0 = { "amdgpu-lds-size"="8" }
329attributes #1 = { "amdgpu-lds-size"="16" }
330attributes #2 = { "amdgpu-lds-size"="24" }
331attributes #3 = { nocallback nofree nosync nounwind willreturn memory(none) }
332attributes #4 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
333
334; Table size length number-kernels * number-variables * sizeof(uint32_t)
335; GCN:      .type	llvm.amdgcn.lds.offset.table,@object
336; GCN-NEXT: .section	.data.rel.ro,"aw"
337; GCN-NEXT: .p2align	2, 0x0
338; GCN-NEXT: llvm.amdgcn.lds.offset.table:
339; GCN-NEXT: .long	8
340; GCN-NEXT: .long	0
341; GCN-NEXT: .size	llvm.amdgcn.lds.offset.table, 8
342
343!llvm.module.flags = !{!3}
344!3 = !{i32 1, !"amdhsa_code_object_version", i32 500}
345