; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3
; RUN: opt -S -mtriple=amdgcn--amdhsa -passes=amdgpu-lower-module-lds < %s --amdgpu-lower-module-lds-strategy=table | FileCheck -check-prefix=OPT %s
; RUN: llc -mtriple=amdgcn--amdhsa -verify-machineinstrs < %s --amdgpu-lower-module-lds-strategy=table | FileCheck -check-prefix=GCN %s

; Opt checks from utils/update_test_checks.py, llc checks from utils/update_llc_test_checks.py, both modified.

; Define four variables (plus one that is unused) and four non-kernel functions, each of which accesses exactly one variable
@v0 = addrspace(3) global float poison
@v1 = addrspace(3) global i16 poison, align 16
@v2 = addrspace(3) global i64 poison
@v3 = addrspace(3) global i8 poison
@unused = addrspace(3) global i16 poison

; OPT: %llvm.amdgcn.kernel.kernel_no_table.lds.t = type { i64 }
; OPT: %llvm.amdgcn.kernel.k01.lds.t = type { i16, [2 x i8], float }
; OPT: %llvm.amdgcn.kernel.k23.lds.t = type { i64, i8 }
; OPT: %llvm.amdgcn.kernel.k123.lds.t = type { i16, i8, [5 x i8], i64 }
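;
; A hedged reading of the per-kernel byte layouts implied by the struct types
; above (v1 carries align 16, so it comes first where present):
;   k01.lds  : v1 at offset 0, pad 2..3, v0 at offset 4
;   k23.lds  : v2 at offset 0,           v3 at offset 8
;   k123.lds : v1 at offset 0, v3 at offset 2, pad 3..7, v2 at offset 8
; These offsets are what the lookup table below records per kernel.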


; Salient parts of the IR lookup table check:
; It has (top level) size 3 as there are 3 kernels that call functions which use lds
; The next level down has type [4 x i32] as there are 4 variables accessed by functions which use lds
; The structs being named after the kernels helps verify the placement of the poison entries
; The remainder are constant expressions pointing into the variable instances checked above

; OPT{LITERAL}: @llvm.amdgcn.lds.offset.table = internal addrspace(4) constant [3 x [4 x i32]] [[4 x i32] [i32 ptrtoint (ptr addrspace(3) getelementptr inbounds (%llvm.amdgcn.kernel.k01.lds.t, ptr addrspace(3) @llvm.amdgcn.kernel.k01.lds, i32 0, i32 2) to i32), i32 ptrtoint (ptr addrspace(3) @llvm.amdgcn.kernel.k01.lds to i32), i32 poison, i32 poison], [4 x i32] [i32 poison, i32 ptrtoint (ptr addrspace(3) @llvm.amdgcn.kernel.k123.lds to i32), i32 ptrtoint (ptr addrspace(3) getelementptr inbounds (%llvm.amdgcn.kernel.k123.lds.t, ptr addrspace(3) @llvm.amdgcn.kernel.k123.lds, i32 0, i32 3) to i32), i32 ptrtoint (ptr addrspace(3) getelementptr inbounds (%llvm.amdgcn.kernel.k123.lds.t, ptr addrspace(3) @llvm.amdgcn.kernel.k123.lds, i32 0, i32 1) to i32)], [4 x i32] [i32 poison, i32 poison, i32 ptrtoint (ptr addrspace(3) @llvm.amdgcn.kernel.k23.lds to i32), i32 ptrtoint (ptr addrspace(3) getelementptr inbounds (%llvm.amdgcn.kernel.k23.lds.t, ptr addrspace(3) @llvm.amdgcn.kernel.k23.lds, i32 0, i32 1) to i32)]]

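; Conceptually (a hedged sketch of the pattern the OPT checks below verify),
; each non-kernel function rewrites an access to LDS variable number <N> as a
; table lookup keyed by the calling kernel's id:
;   %id   = call i32 @llvm.amdgcn.lds.kernel.id()
;   %slot = getelementptr inbounds [3 x [4 x i32]], ptr addrspace(4) @llvm.amdgcn.lds.offset.table, i32 0, i32 %id, i32 <N>
;   %off  = load i32, ptr addrspace(4) %slot
;   %ptr  = inttoptr i32 %off to ptr addrspace(3)
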

define void @f0() {
; OPT-LABEL: define void @f0() {
; OPT-NEXT:    [[TMP1:%.*]] = call i32 @llvm.amdgcn.lds.kernel.id()
; OPT-NEXT:    [[V02:%.*]] = getelementptr inbounds [3 x [4 x i32]], ptr addrspace(4) @llvm.amdgcn.lds.offset.table, i32 0, i32 [[TMP1]], i32 0
; OPT-NEXT:    [[TMP2:%.*]] = load i32, ptr addrspace(4) [[V02]], align 4
; OPT-NEXT:    [[V03:%.*]] = inttoptr i32 [[TMP2]] to ptr addrspace(3)
; OPT-NEXT:    [[LD:%.*]] = load float, ptr addrspace(3) [[V03]], align 4
; OPT-NEXT:    [[MUL:%.*]] = fmul float [[LD]], 2.000000e+00
; OPT-NEXT:    [[V0:%.*]] = getelementptr inbounds [3 x [4 x i32]], ptr addrspace(4) @llvm.amdgcn.lds.offset.table, i32 0, i32 [[TMP1]], i32 0
; OPT-NEXT:    [[TMP3:%.*]] = load i32, ptr addrspace(4) [[V0]], align 4
; OPT-NEXT:    [[V01:%.*]] = inttoptr i32 [[TMP3]] to ptr addrspace(3)
; OPT-NEXT:    store float [[MUL]], ptr addrspace(3) [[V01]], align 4
; OPT-NEXT:    ret void
;
; GCN-LABEL: f0:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    s_mov_b32 s4, s15
; GCN-NEXT:    s_ashr_i32 s5, s15, 31
; GCN-NEXT:    s_getpc_b64 s[6:7]
; GCN-NEXT:    s_add_u32 s6, s6, llvm.amdgcn.lds.offset.table@rel32@lo+4
; GCN-NEXT:    s_addc_u32 s7, s7, llvm.amdgcn.lds.offset.table@rel32@hi+12
; GCN-NEXT:    s_lshl_b64 s[4:5], s[4:5], 4
; GCN-NEXT:    s_add_u32 s4, s4, s6
; GCN-NEXT:    s_addc_u32 s5, s5, s7
; GCN-NEXT:    s_load_dword s4, s[4:5], 0x0
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    v_mov_b32_e32 v0, s4
; GCN-NEXT:    s_mov_b32 m0, -1
; GCN-NEXT:    ds_read_b32 v1, v0
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    v_add_f32_e32 v1, v1, v1
; GCN-NEXT:    ds_write_b32 v0, v1
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    s_setpc_b64 s[30:31]
  %ld = load float, ptr addrspace(3) @v0
  %mul = fmul float %ld, 2.
  store float %mul, ptr  addrspace(3) @v0
  ret void
}
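
; A hedged reading of the f0..f3 GCN sequences: the kernel id arrives in s15,
; is sign extended and shifted left by 4 (one row is 4 variables * 4 bytes =
; 16 bytes), then added to the pc-relative table address; the column offset
; (4 * variable index) is folded into the rel32 addend (+4, +8, +12, +16), and
; s_load_dword fetches the per-kernel offset that the ds_read/ds_write use.
;   offset = load_i32(&llvm.amdgcn.lds.offset.table + (kernel_id << 4) + 4 * variable_index)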

define void @f1() {
; OPT-LABEL: define void @f1() {
; OPT-NEXT:    [[TMP1:%.*]] = call i32 @llvm.amdgcn.lds.kernel.id()
; OPT-NEXT:    [[V12:%.*]] = getelementptr inbounds [3 x [4 x i32]], ptr addrspace(4) @llvm.amdgcn.lds.offset.table, i32 0, i32 [[TMP1]], i32 1
; OPT-NEXT:    [[TMP2:%.*]] = load i32, ptr addrspace(4) [[V12]], align 4
; OPT-NEXT:    [[V13:%.*]] = inttoptr i32 [[TMP2]] to ptr addrspace(3)
; OPT-NEXT:    [[LD:%.*]] = load i16, ptr addrspace(3) [[V13]], align 2
; OPT-NEXT:    [[MUL:%.*]] = mul i16 [[LD]], 3
; OPT-NEXT:    [[V1:%.*]] = getelementptr inbounds [3 x [4 x i32]], ptr addrspace(4) @llvm.amdgcn.lds.offset.table, i32 0, i32 [[TMP1]], i32 1
; OPT-NEXT:    [[TMP3:%.*]] = load i32, ptr addrspace(4) [[V1]], align 4
; OPT-NEXT:    [[V11:%.*]] = inttoptr i32 [[TMP3]] to ptr addrspace(3)
; OPT-NEXT:    store i16 [[MUL]], ptr addrspace(3) [[V11]], align 2
; OPT-NEXT:    ret void
;
; GCN-LABEL: f1:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    s_mov_b32 s4, s15
; GCN-NEXT:    s_ashr_i32 s5, s15, 31
; GCN-NEXT:    s_getpc_b64 s[6:7]
; GCN-NEXT:    s_add_u32 s6, s6, llvm.amdgcn.lds.offset.table@rel32@lo+8
; GCN-NEXT:    s_addc_u32 s7, s7, llvm.amdgcn.lds.offset.table@rel32@hi+16
; GCN-NEXT:    s_lshl_b64 s[4:5], s[4:5], 4
; GCN-NEXT:    s_add_u32 s4, s4, s6
; GCN-NEXT:    s_addc_u32 s5, s5, s7
; GCN-NEXT:    s_load_dword s4, s[4:5], 0x0
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    v_mov_b32_e32 v0, s4
; GCN-NEXT:    s_mov_b32 m0, -1
; GCN-NEXT:    ds_read_u16 v1, v0
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    v_mul_lo_u32 v1, v1, 3
; GCN-NEXT:    ds_write_b16 v0, v1
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    s_setpc_b64 s[30:31]
  %ld = load i16, ptr addrspace(3) @v1
  %mul = mul i16 %ld, 3
  store i16 %mul, ptr  addrspace(3) @v1
  ret void
}

define void @f2() {
; OPT-LABEL: define void @f2() {
; OPT-NEXT:    [[TMP1:%.*]] = call i32 @llvm.amdgcn.lds.kernel.id()
; OPT-NEXT:    [[V22:%.*]] = getelementptr inbounds [3 x [4 x i32]], ptr addrspace(4) @llvm.amdgcn.lds.offset.table, i32 0, i32 [[TMP1]], i32 2
; OPT-NEXT:    [[TMP2:%.*]] = load i32, ptr addrspace(4) [[V22]], align 4
; OPT-NEXT:    [[V23:%.*]] = inttoptr i32 [[TMP2]] to ptr addrspace(3)
; OPT-NEXT:    [[LD:%.*]] = load i64, ptr addrspace(3) [[V23]], align 8
; OPT-NEXT:    [[MUL:%.*]] = mul i64 [[LD]], 4
; OPT-NEXT:    [[V2:%.*]] = getelementptr inbounds [3 x [4 x i32]], ptr addrspace(4) @llvm.amdgcn.lds.offset.table, i32 0, i32 [[TMP1]], i32 2
; OPT-NEXT:    [[TMP3:%.*]] = load i32, ptr addrspace(4) [[V2]], align 4
; OPT-NEXT:    [[V21:%.*]] = inttoptr i32 [[TMP3]] to ptr addrspace(3)
; OPT-NEXT:    store i64 [[MUL]], ptr addrspace(3) [[V21]], align 8
; OPT-NEXT:    ret void
;
; GCN-LABEL: f2:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    s_mov_b32 s4, s15
; GCN-NEXT:    s_ashr_i32 s5, s15, 31
; GCN-NEXT:    s_getpc_b64 s[6:7]
; GCN-NEXT:    s_add_u32 s6, s6, llvm.amdgcn.lds.offset.table@rel32@lo+12
; GCN-NEXT:    s_addc_u32 s7, s7, llvm.amdgcn.lds.offset.table@rel32@hi+20
; GCN-NEXT:    s_lshl_b64 s[4:5], s[4:5], 4
; GCN-NEXT:    s_add_u32 s4, s4, s6
; GCN-NEXT:    s_addc_u32 s5, s5, s7
; GCN-NEXT:    s_load_dword s4, s[4:5], 0x0
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    v_mov_b32_e32 v2, s4
; GCN-NEXT:    s_mov_b32 m0, -1
; GCN-NEXT:    ds_read_b64 v[0:1], v2
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    v_lshl_b64 v[0:1], v[0:1], 2
; GCN-NEXT:    ds_write_b64 v2, v[0:1]
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    s_setpc_b64 s[30:31]
  %ld = load i64, ptr addrspace(3) @v2
  %mul = mul i64 %ld, 4
  store i64 %mul, ptr  addrspace(3) @v2
  ret void
}

define void @f3() {
; OPT-LABEL: define void @f3() {
; OPT-NEXT:    [[TMP1:%.*]] = call i32 @llvm.amdgcn.lds.kernel.id()
; OPT-NEXT:    [[V32:%.*]] = getelementptr inbounds [3 x [4 x i32]], ptr addrspace(4) @llvm.amdgcn.lds.offset.table, i32 0, i32 [[TMP1]], i32 3
; OPT-NEXT:    [[TMP2:%.*]] = load i32, ptr addrspace(4) [[V32]], align 4
; OPT-NEXT:    [[V33:%.*]] = inttoptr i32 [[TMP2]] to ptr addrspace(3)
; OPT-NEXT:    [[LD:%.*]] = load i8, ptr addrspace(3) [[V33]], align 1
; OPT-NEXT:    [[MUL:%.*]] = mul i8 [[LD]], 5
; OPT-NEXT:    [[V3:%.*]] = getelementptr inbounds [3 x [4 x i32]], ptr addrspace(4) @llvm.amdgcn.lds.offset.table, i32 0, i32 [[TMP1]], i32 3
; OPT-NEXT:    [[TMP3:%.*]] = load i32, ptr addrspace(4) [[V3]], align 4
; OPT-NEXT:    [[V31:%.*]] = inttoptr i32 [[TMP3]] to ptr addrspace(3)
; OPT-NEXT:    store i8 [[MUL]], ptr addrspace(3) [[V31]], align 1
; OPT-NEXT:    ret void
;
; GCN-LABEL: f3:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    s_mov_b32 s4, s15
; GCN-NEXT:    s_ashr_i32 s5, s15, 31
; GCN-NEXT:    s_getpc_b64 s[6:7]
; GCN-NEXT:    s_add_u32 s6, s6, llvm.amdgcn.lds.offset.table@rel32@lo+16
; GCN-NEXT:    s_addc_u32 s7, s7, llvm.amdgcn.lds.offset.table@rel32@hi+24
; GCN-NEXT:    s_lshl_b64 s[4:5], s[4:5], 4
; GCN-NEXT:    s_add_u32 s4, s4, s6
; GCN-NEXT:    s_addc_u32 s5, s5, s7
; GCN-NEXT:    s_load_dword s4, s[4:5], 0x0
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    v_mov_b32_e32 v0, s4
; GCN-NEXT:    s_mov_b32 m0, -1
; GCN-NEXT:    ds_read_u8 v1, v0
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    v_mul_lo_u32 v1, v1, 5
; GCN-NEXT:    ds_write_b8 v0, v1
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    s_setpc_b64 s[30:31]
  %ld = load i8, ptr addrspace(3) @v3
  %mul = mul i8 %ld, 5
  store i8 %mul, ptr  addrspace(3) @v3
  ret void
}

; Doesn't access any LDS variable via a function, so it gets no row in the lookup table
define amdgpu_kernel void @kernel_no_table() {
; OPT-LABEL: define amdgpu_kernel void @kernel_no_table(
; OPT-SAME: ) #[[ATTR0:[0-9]+]] {
; OPT-NEXT:    [[LD:%.*]] = load i64, ptr addrspace(3) @llvm.amdgcn.kernel.kernel_no_table.lds, align 8
; OPT-NEXT:    [[MUL:%.*]] = mul i64 [[LD]], 8
; OPT-NEXT:    store i64 [[MUL]], ptr addrspace(3) @llvm.amdgcn.kernel.kernel_no_table.lds, align 8
; OPT-NEXT:    ret void
;
; GCN-LABEL: kernel_no_table:
; GCN:       ; %bb.0:
; GCN-NEXT:    v_mov_b32_e32 v2, 0
; GCN-NEXT:    s_mov_b32 m0, -1
; GCN-NEXT:    ds_read_b64 v[0:1], v2
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    v_lshl_b64 v[0:1], v[0:1], 3
; GCN-NEXT:    ds_write_b64 v2, v[0:1]
; GCN-NEXT:    s_endpgm
  %ld = load i64, ptr addrspace(3) @v2
  %mul = mul i64 %ld, 8
  store i64 %mul, ptr  addrspace(3) @v2
  ret void
}
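
; Hedged note: because kernel_no_table accesses @v2 directly rather than
; through f0..f3, the access is rewritten to the kernel's own
; @llvm.amdgcn.kernel.kernel_no_table.lds struct at a statically known address
; (0 in the GCN checks above), and no @llvm.amdgcn.lds.kernel.id call or table
; load is emitted.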

; Accesses two variables via calls to f0 and f1, so allocates exactly those two
define amdgpu_kernel void @k01() {
; OPT-LABEL: define amdgpu_kernel void @k01(
; OPT-SAME: ) #[[ATTR0]] !llvm.amdgcn.lds.kernel.id [[META2:![0-9]+]] {
; OPT-NEXT:    call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.kernel.k01.lds) ], !alias.scope [[META3:![0-9]+]], !noalias [[META6:![0-9]+]]
; OPT-NEXT:    call void @f0()
; OPT-NEXT:    call void @f1()
; OPT-NEXT:    ret void
;
; GCN-LABEL: k01:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_mov_b32 s32, 0
; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s13
; GCN-NEXT:    s_add_i32 s12, s12, s17
; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-NEXT:    s_add_u32 s0, s0, s17
; GCN-NEXT:    s_addc_u32 s1, s1, 0
; GCN-NEXT:    s_mov_b32 s20, s16
; GCN-NEXT:    s_mov_b32 s13, s15
; GCN-NEXT:    s_mov_b32 s12, s14
; GCN-NEXT:    s_mov_b64 s[16:17], s[6:7]
; GCN-NEXT:    s_mov_b64 s[18:19], s[4:5]
; GCN-NEXT:    s_getpc_b64 s[4:5]
; GCN-NEXT:    s_add_u32 s4, s4, f0@gotpcrel32@lo+4
; GCN-NEXT:    s_addc_u32 s5, s5, f0@gotpcrel32@hi+12
; GCN-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
; GCN-NEXT:    s_load_dwordx2 s[22:23], s[4:5], 0x0
; GCN-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
; GCN-NEXT:    v_or_b32_e32 v0, v0, v1
; GCN-NEXT:    v_or_b32_e32 v31, v0, v2
; GCN-NEXT:    s_mov_b32 s15, 0
; GCN-NEXT:    s_mov_b64 s[4:5], s[18:19]
; GCN-NEXT:    s_mov_b32 s14, s20
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    s_swappc_b64 s[30:31], s[22:23]
; GCN-NEXT:    s_getpc_b64 s[4:5]
; GCN-NEXT:    s_add_u32 s4, s4, f1@gotpcrel32@lo+4
; GCN-NEXT:    s_addc_u32 s5, s5, f1@gotpcrel32@hi+12
; GCN-NEXT:    s_load_dwordx2 s[22:23], s[4:5], 0x0
; GCN-NEXT:    s_mov_b64 s[4:5], s[18:19]
; GCN-NEXT:    s_mov_b64 s[6:7], s[16:17]
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    s_swappc_b64 s[30:31], s[22:23]
; GCN-NEXT:    s_endpgm



  call void @f0()
  call void @f1()
  ret void
}
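
; Hedged note: the s_mov_b32 s15, <id> before each call above is the kernel id
; the callees read back in f0..f3 (s_mov_b32 s4, s15); k01 passes 0, while
; k123 and k23 below pass 1 and 2, selecting the matching row of the table.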

define amdgpu_kernel void @k23() {
; OPT-LABEL: define amdgpu_kernel void @k23(
; OPT-SAME: ) #[[ATTR1:[0-9]+]] !llvm.amdgcn.lds.kernel.id [[META8:![0-9]+]] {
; OPT-NEXT:    call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.kernel.k23.lds) ], !alias.scope [[META9:![0-9]+]], !noalias [[META12:![0-9]+]]
; OPT-NEXT:    call void @f2()
; OPT-NEXT:    call void @f3()
; OPT-NEXT:    ret void
;
; GCN-LABEL: k23:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_mov_b32 s32, 0
; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s13
; GCN-NEXT:    s_add_i32 s12, s12, s17
; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-NEXT:    s_add_u32 s0, s0, s17
; GCN-NEXT:    s_addc_u32 s1, s1, 0
; GCN-NEXT:    s_mov_b32 s20, s16
; GCN-NEXT:    s_mov_b32 s13, s15
; GCN-NEXT:    s_mov_b32 s12, s14
; GCN-NEXT:    s_mov_b64 s[16:17], s[6:7]
; GCN-NEXT:    s_mov_b64 s[18:19], s[4:5]
; GCN-NEXT:    s_getpc_b64 s[4:5]
; GCN-NEXT:    s_add_u32 s4, s4, f2@gotpcrel32@lo+4
; GCN-NEXT:    s_addc_u32 s5, s5, f2@gotpcrel32@hi+12
; GCN-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
; GCN-NEXT:    s_load_dwordx2 s[22:23], s[4:5], 0x0
; GCN-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
; GCN-NEXT:    v_or_b32_e32 v0, v0, v1
; GCN-NEXT:    v_or_b32_e32 v31, v0, v2
; GCN-NEXT:    s_mov_b32 s15, 2
; GCN-NEXT:    s_mov_b64 s[4:5], s[18:19]
; GCN-NEXT:    s_mov_b32 s14, s20
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    s_swappc_b64 s[30:31], s[22:23]
; GCN-NEXT:    s_getpc_b64 s[4:5]
; GCN-NEXT:    s_add_u32 s4, s4, f3@gotpcrel32@lo+4
; GCN-NEXT:    s_addc_u32 s5, s5, f3@gotpcrel32@hi+12
; GCN-NEXT:    s_load_dwordx2 s[22:23], s[4:5], 0x0
; GCN-NEXT:    s_mov_b64 s[4:5], s[18:19]
; GCN-NEXT:    s_mov_b64 s[6:7], s[16:17]
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    s_swappc_b64 s[30:31], s[22:23]
; GCN-NEXT:    s_endpgm


  call void @f2()
  call void @f3()
  ret void
}

; Accesses three variables (v1 and v2 via calls to f1 and f2, v3 directly) and allocates all three
define amdgpu_kernel void @k123() {
; OPT-LABEL: define amdgpu_kernel void @k123(
; OPT-SAME: ) #[[ATTR1]] !llvm.amdgcn.lds.kernel.id [[META14:![0-9]+]] {
; OPT-NEXT:    call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.kernel.k123.lds) ], !alias.scope [[META15:![0-9]+]], !noalias [[META18:![0-9]+]]
; OPT-NEXT:    call void @f1()
; OPT-NEXT:    [[LD:%.*]] = load i8, ptr addrspace(3) getelementptr inbounds ([[LLVM_AMDGCN_KERNEL_K123_LDS_T:%.*]], ptr addrspace(3) @llvm.amdgcn.kernel.k123.lds, i32 0, i32 1), align 2, !alias.scope [[META21:![0-9]+]], !noalias [[META22:![0-9]+]]
; OPT-NEXT:    [[MUL:%.*]] = mul i8 [[LD]], 8
; OPT-NEXT:    store i8 [[MUL]], ptr addrspace(3) getelementptr inbounds ([[LLVM_AMDGCN_KERNEL_K123_LDS_T]], ptr addrspace(3) @llvm.amdgcn.kernel.k123.lds, i32 0, i32 1), align 2, !alias.scope [[META21]], !noalias [[META22]]
; OPT-NEXT:    call void @f2()
; OPT-NEXT:    ret void
;
; GCN-LABEL: k123:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_mov_b32 s32, 0
; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s13
; GCN-NEXT:    s_add_i32 s12, s12, s17
; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-NEXT:    s_add_u32 s0, s0, s17
; GCN-NEXT:    s_addc_u32 s1, s1, 0
; GCN-NEXT:    s_mov_b32 s20, s16
; GCN-NEXT:    s_mov_b32 s13, s15
; GCN-NEXT:    s_mov_b32 s12, s14
; GCN-NEXT:    s_mov_b64 s[16:17], s[6:7]
; GCN-NEXT:    s_mov_b64 s[18:19], s[4:5]
; GCN-NEXT:    s_getpc_b64 s[4:5]
; GCN-NEXT:    s_add_u32 s4, s4, f1@gotpcrel32@lo+4
; GCN-NEXT:    s_addc_u32 s5, s5, f1@gotpcrel32@hi+12
; GCN-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
; GCN-NEXT:    s_load_dwordx2 s[22:23], s[4:5], 0x0
; GCN-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
; GCN-NEXT:    v_or_b32_e32 v0, v0, v1
; GCN-NEXT:    v_or_b32_e32 v31, v0, v2
; GCN-NEXT:    s_mov_b32 s15, 1
; GCN-NEXT:    s_mov_b64 s[4:5], s[18:19]
; GCN-NEXT:    s_mov_b32 s14, s20
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    s_swappc_b64 s[30:31], s[22:23]
; GCN-NEXT:    v_mov_b32_e32 v0, 0
; GCN-NEXT:    s_mov_b32 m0, -1
; GCN-NEXT:    ds_read_u8 v1, v0 offset:2
; GCN-NEXT:    s_getpc_b64 s[4:5]
; GCN-NEXT:    s_add_u32 s4, s4, f2@gotpcrel32@lo+4
; GCN-NEXT:    s_addc_u32 s5, s5, f2@gotpcrel32@hi+12
; GCN-NEXT:    s_load_dwordx2 s[22:23], s[4:5], 0x0
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    v_lshlrev_b32_e32 v1, 3, v1
; GCN-NEXT:    ds_write_b8 v0, v1 offset:2
; GCN-NEXT:    s_mov_b64 s[4:5], s[18:19]
; GCN-NEXT:    s_mov_b64 s[6:7], s[16:17]
; GCN-NEXT:    s_swappc_b64 s[30:31], s[22:23]
; GCN-NEXT:    s_endpgm


  call void @f1()
  %ld = load i8, ptr addrspace(3) @v3
  %mul = mul i8 %ld, 8
  store i8 %mul, ptr  addrspace(3) @v3
  call void @f2()
  ret void
}
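
; Hedged note: the direct @v3 access inside k123 does not go through the table;
; it is rewritten to field 1 of @llvm.amdgcn.kernel.k123.lds, which sits at
; byte offset 2 of that struct, hence the ds_read_u8/ds_write_b8 ... offset:2
; in the GCN checks above.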



; OPT: attributes #0 = { "amdgpu-lds-size"="8" }
; OPT: attributes #1 = { "amdgpu-lds-size"="16" }

!0 = !{i64 0, i64 1}
!1 = !{i32 0}
!2 = !{i32 2}
!3 = !{i32 1}
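; Hedged note: !0..!3 above are part of the test input; !1..!3 hold 0, 2 and 1,
; the same kernel id values k01, k23 and k123 end up with in the GCN checks
; (see the s_mov_b32 s15, <id> before each call).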


; Table size is number-kernels (3) * number-variables (4) * sizeof(uint32_t) = 48 bytes
; GCN:      .type	llvm.amdgcn.lds.offset.table,@object
; GCN-NEXT: .section	.data.rel.ro,"aw"
; GCN-NEXT: .p2align	4, 0x0
; GCN-NEXT: llvm.amdgcn.lds.offset.table:
; GCN-NEXT: .long	0+4
; GCN-NEXT: .long	0
; GCN-NEXT: .zero	4
; GCN-NEXT: .zero	4
; GCN-NEXT: .zero	4
; GCN-NEXT: .long	0
; GCN-NEXT: .long	0+8
; GCN-NEXT: .long	0+2
; GCN-NEXT: .zero	4
; GCN-NEXT: .zero	4
; GCN-NEXT: .long	0
; GCN-NEXT: .long	0+8
; GCN-NEXT: .size	llvm.amdgcn.lds.offset.table, 48
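
; A hedged annotation of the twelve entries above, one row of four per kernel,
; with .zero 4 marking the poison slots:
;   row 0 (k01)  : v0 at 4, v1 at 0, -,       -
;   row 1 (k123) : -,       v1 at 0, v2 at 8, v3 at 2
;   row 2 (k23)  : -,       -,       v2 at 0, v3 at 8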

!llvm.module.flags = !{!4}
!4 = !{i32 1, !"amdhsa_code_object_version", i32 500}
