xref: /llvm-project/llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs.ll (revision 2e5c2982819625d84e0b61aea0ec00de859f0e95)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
2; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx940 < %s | FileCheck -check-prefixes=GFX940 %s
3; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX90a %s
4
5define amdgpu_kernel void @preload_block_count_x(ptr addrspace(1) inreg %out) #0 {
6; GFX940-LABEL: preload_block_count_x:
7; GFX940:       ; %bb.1:
8; GFX940-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
9; GFX940-NEXT:    s_load_dword s4, s[0:1], 0x8
10; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
11; GFX940-NEXT:    s_branch .LBB0_0
12; GFX940-NEXT:    .p2align 8
13; GFX940-NEXT:  ; %bb.2:
14; GFX940-NEXT:  .LBB0_0:
15; GFX940-NEXT:    v_mov_b32_e32 v0, 0
16; GFX940-NEXT:    v_mov_b32_e32 v1, s4
17; GFX940-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
18; GFX940-NEXT:    s_endpgm
19;
20; GFX90a-LABEL: preload_block_count_x:
21; GFX90a:       ; %bb.1:
22; GFX90a-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
23; GFX90a-NEXT:    s_load_dword s8, s[4:5], 0x8
24; GFX90a-NEXT:    s_waitcnt lgkmcnt(0)
25; GFX90a-NEXT:    s_branch .LBB0_0
26; GFX90a-NEXT:    .p2align 8
27; GFX90a-NEXT:  ; %bb.2:
28; GFX90a-NEXT:  .LBB0_0:
29; GFX90a-NEXT:    v_mov_b32_e32 v0, 0
30; GFX90a-NEXT:    v_mov_b32_e32 v1, s8
31; GFX90a-NEXT:    global_store_dword v0, v1, s[6:7]
32; GFX90a-NEXT:    s_endpgm
33  %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
34  %load = load i32, ptr addrspace(4) %imp_arg_ptr
35  store i32 %load, ptr addrspace(1) %out
36  ret void
37}
38
39define amdgpu_kernel void @preload_unused_arg_block_count_x(ptr addrspace(1) inreg %out, i32 inreg) #0 {
40; GFX940-LABEL: preload_unused_arg_block_count_x:
41; GFX940:       ; %bb.1:
42; GFX940-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
43; GFX940-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x8
44; GFX940-NEXT:    s_load_dword s6, s[0:1], 0x10
45; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
46; GFX940-NEXT:    s_branch .LBB1_0
47; GFX940-NEXT:    .p2align 8
48; GFX940-NEXT:  ; %bb.2:
49; GFX940-NEXT:  .LBB1_0:
50; GFX940-NEXT:    v_mov_b32_e32 v0, 0
51; GFX940-NEXT:    v_mov_b32_e32 v1, s6
52; GFX940-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
53; GFX940-NEXT:    s_endpgm
54;
55; GFX90a-LABEL: preload_unused_arg_block_count_x:
56; GFX90a:       ; %bb.1:
57; GFX90a-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
58; GFX90a-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x8
59; GFX90a-NEXT:    s_load_dword s10, s[4:5], 0x10
60; GFX90a-NEXT:    s_waitcnt lgkmcnt(0)
61; GFX90a-NEXT:    s_branch .LBB1_0
62; GFX90a-NEXT:    .p2align 8
63; GFX90a-NEXT:  ; %bb.2:
64; GFX90a-NEXT:  .LBB1_0:
65; GFX90a-NEXT:    v_mov_b32_e32 v0, 0
66; GFX90a-NEXT:    v_mov_b32_e32 v1, s10
67; GFX90a-NEXT:    global_store_dword v0, v1, s[6:7]
68; GFX90a-NEXT:    s_endpgm
69  %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
70  %load = load i32, ptr addrspace(4) %imp_arg_ptr
71  store i32 %load, ptr addrspace(1) %out
72  ret void
73}
74
75define amdgpu_kernel void @no_free_sgprs_block_count_x(ptr addrspace(1) inreg %out, i256 inreg) {
76; GFX940-LABEL: no_free_sgprs_block_count_x:
77; GFX940:       ; %bb.1:
78; GFX940-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x0
79; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
80; GFX940-NEXT:    s_branch .LBB2_0
81; GFX940-NEXT:    .p2align 8
82; GFX940-NEXT:  ; %bb.2:
83; GFX940-NEXT:  .LBB2_0:
84; GFX940-NEXT:    s_load_dword s0, s[4:5], 0x28
85; GFX940-NEXT:    v_mov_b32_e32 v0, 0
86; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
87; GFX940-NEXT:    v_mov_b32_e32 v1, s0
88; GFX940-NEXT:    global_store_dword v0, v1, s[8:9] sc0 sc1
89; GFX940-NEXT:    s_endpgm
90;
91; GFX90a-LABEL: no_free_sgprs_block_count_x:
92; GFX90a:       ; %bb.1:
93; GFX90a-NEXT:    s_load_dwordx4 s[12:15], s[8:9], 0x0
94; GFX90a-NEXT:    s_waitcnt lgkmcnt(0)
95; GFX90a-NEXT:    s_branch .LBB2_0
96; GFX90a-NEXT:    .p2align 8
97; GFX90a-NEXT:  ; %bb.2:
98; GFX90a-NEXT:  .LBB2_0:
99; GFX90a-NEXT:    s_load_dword s0, s[8:9], 0x28
100; GFX90a-NEXT:    v_mov_b32_e32 v0, 0
101; GFX90a-NEXT:    s_waitcnt lgkmcnt(0)
102; GFX90a-NEXT:    v_mov_b32_e32 v1, s0
103; GFX90a-NEXT:    global_store_dword v0, v1, s[12:13]
104; GFX90a-NEXT:    s_endpgm
105  %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
106  %load = load i32, ptr addrspace(4) %imp_arg_ptr
107  store i32 %load, ptr addrspace(1) %out
108  ret void
109}
110
111define amdgpu_kernel void @no_inreg_block_count_x(ptr addrspace(1) %out) #0 {
112; GFX940-LABEL: no_inreg_block_count_x:
113; GFX940:       ; %bb.0:
114; GFX940-NEXT:    s_load_dword s4, s[0:1], 0x8
115; GFX940-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
116; GFX940-NEXT:    v_mov_b32_e32 v0, 0
117; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
118; GFX940-NEXT:    v_mov_b32_e32 v1, s4
119; GFX940-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
120; GFX940-NEXT:    s_endpgm
121;
122; GFX90a-LABEL: no_inreg_block_count_x:
123; GFX90a:       ; %bb.0:
124; GFX90a-NEXT:    s_load_dword s2, s[4:5], 0x8
125; GFX90a-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
126; GFX90a-NEXT:    v_mov_b32_e32 v0, 0
127; GFX90a-NEXT:    s_waitcnt lgkmcnt(0)
128; GFX90a-NEXT:    v_mov_b32_e32 v1, s2
129; GFX90a-NEXT:    global_store_dword v0, v1, s[0:1]
130; GFX90a-NEXT:    s_endpgm
131  %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
132  %load = load i32, ptr addrspace(4) %imp_arg_ptr
133  store i32 %load, ptr addrspace(1) %out
134  ret void
135}
136
137; Implicit arg preloading is currently restricted to cases where all explicit
138; args are inreg (preloaded).
139
140define amdgpu_kernel void @mixed_inreg_block_count_x(ptr addrspace(1) %out, i32 inreg) #0 {
141; GFX940-LABEL: mixed_inreg_block_count_x:
142; GFX940:       ; %bb.0:
143; GFX940-NEXT:    s_load_dword s4, s[0:1], 0x10
144; GFX940-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
145; GFX940-NEXT:    v_mov_b32_e32 v0, 0
146; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
147; GFX940-NEXT:    v_mov_b32_e32 v1, s4
148; GFX940-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
149; GFX940-NEXT:    s_endpgm
150;
151; GFX90a-LABEL: mixed_inreg_block_count_x:
152; GFX90a:       ; %bb.0:
153; GFX90a-NEXT:    s_load_dword s2, s[4:5], 0x10
154; GFX90a-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
155; GFX90a-NEXT:    v_mov_b32_e32 v0, 0
156; GFX90a-NEXT:    s_waitcnt lgkmcnt(0)
157; GFX90a-NEXT:    v_mov_b32_e32 v1, s2
158; GFX90a-NEXT:    global_store_dword v0, v1, s[0:1]
159; GFX90a-NEXT:    s_endpgm
160  %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
161  %load = load i32, ptr addrspace(4) %imp_arg_ptr
162  store i32 %load, ptr addrspace(1) %out
163  ret void
164}
165
166define amdgpu_kernel void @incorrect_type_i64_block_count_x(ptr addrspace(1) inreg %out) #0 {
167; GFX940-LABEL: incorrect_type_i64_block_count_x:
168; GFX940:       ; %bb.1:
169; GFX940-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
170; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
171; GFX940-NEXT:    s_branch .LBB5_0
172; GFX940-NEXT:    .p2align 8
173; GFX940-NEXT:  ; %bb.2:
174; GFX940-NEXT:  .LBB5_0:
175; GFX940-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
176; GFX940-NEXT:    v_mov_b32_e32 v2, 0
177; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
178; GFX940-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
179; GFX940-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3] sc0 sc1
180; GFX940-NEXT:    s_endpgm
181;
182; GFX90a-LABEL: incorrect_type_i64_block_count_x:
183; GFX90a:       ; %bb.1:
184; GFX90a-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
185; GFX90a-NEXT:    s_waitcnt lgkmcnt(0)
186; GFX90a-NEXT:    s_branch .LBB5_0
187; GFX90a-NEXT:    .p2align 8
188; GFX90a-NEXT:  ; %bb.2:
189; GFX90a-NEXT:  .LBB5_0:
190; GFX90a-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
191; GFX90a-NEXT:    v_mov_b32_e32 v2, 0
192; GFX90a-NEXT:    s_waitcnt lgkmcnt(0)
193; GFX90a-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
194; GFX90a-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
195; GFX90a-NEXT:    s_endpgm
196  %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
197  %load = load i64, ptr addrspace(4) %imp_arg_ptr
198  store i64 %load, ptr addrspace(1) %out
199  ret void
200}
201
202define amdgpu_kernel void @incorrect_type_i16_block_count_x(ptr addrspace(1) inreg %out) #0 {
203; GFX940-LABEL: incorrect_type_i16_block_count_x:
204; GFX940:       ; %bb.1:
205; GFX940-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
206; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
207; GFX940-NEXT:    s_branch .LBB6_0
208; GFX940-NEXT:    .p2align 8
209; GFX940-NEXT:  ; %bb.2:
210; GFX940-NEXT:  .LBB6_0:
211; GFX940-NEXT:    s_load_dword s0, s[0:1], 0x8
212; GFX940-NEXT:    v_mov_b32_e32 v0, 0
213; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
214; GFX940-NEXT:    v_mov_b32_e32 v1, s0
215; GFX940-NEXT:    global_store_short v0, v1, s[2:3] sc0 sc1
216; GFX940-NEXT:    s_endpgm
217;
218; GFX90a-LABEL: incorrect_type_i16_block_count_x:
219; GFX90a:       ; %bb.1:
220; GFX90a-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
221; GFX90a-NEXT:    s_waitcnt lgkmcnt(0)
222; GFX90a-NEXT:    s_branch .LBB6_0
223; GFX90a-NEXT:    .p2align 8
224; GFX90a-NEXT:  ; %bb.2:
225; GFX90a-NEXT:  .LBB6_0:
226; GFX90a-NEXT:    s_load_dword s0, s[4:5], 0x8
227; GFX90a-NEXT:    v_mov_b32_e32 v0, 0
228; GFX90a-NEXT:    s_waitcnt lgkmcnt(0)
229; GFX90a-NEXT:    v_mov_b32_e32 v1, s0
230; GFX90a-NEXT:    global_store_short v0, v1, s[6:7]
231; GFX90a-NEXT:    s_endpgm
232  %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
233  %load = load i16, ptr addrspace(4) %imp_arg_ptr
234  store i16 %load, ptr addrspace(1) %out
235  ret void
236}
237
238define amdgpu_kernel void @preload_block_count_y(ptr addrspace(1) inreg %out) #0 {
239; GFX940-LABEL: preload_block_count_y:
240; GFX940:       ; %bb.1:
241; GFX940-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
242; GFX940-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x8
243; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
244; GFX940-NEXT:    s_branch .LBB7_0
245; GFX940-NEXT:    .p2align 8
246; GFX940-NEXT:  ; %bb.2:
247; GFX940-NEXT:  .LBB7_0:
248; GFX940-NEXT:    v_mov_b32_e32 v0, 0
249; GFX940-NEXT:    v_mov_b32_e32 v1, s5
250; GFX940-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
251; GFX940-NEXT:    s_endpgm
252;
253; GFX90a-LABEL: preload_block_count_y:
254; GFX90a:       ; %bb.1:
255; GFX90a-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
256; GFX90a-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x8
257; GFX90a-NEXT:    s_waitcnt lgkmcnt(0)
258; GFX90a-NEXT:    s_branch .LBB7_0
259; GFX90a-NEXT:    .p2align 8
260; GFX90a-NEXT:  ; %bb.2:
261; GFX90a-NEXT:  .LBB7_0:
262; GFX90a-NEXT:    v_mov_b32_e32 v0, 0
263; GFX90a-NEXT:    v_mov_b32_e32 v1, s9
264; GFX90a-NEXT:    global_store_dword v0, v1, s[6:7]
265; GFX90a-NEXT:    s_endpgm
266  %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
267  %gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 4
268  %load = load i32, ptr addrspace(4) %gep
269  store i32 %load, ptr addrspace(1) %out
270  ret void
271}
272
273define amdgpu_kernel void @random_incorrect_offset(ptr addrspace(1) inreg %out) #0 {
274; GFX940-LABEL: random_incorrect_offset:
275; GFX940:       ; %bb.1:
276; GFX940-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
277; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
278; GFX940-NEXT:    s_branch .LBB8_0
279; GFX940-NEXT:    .p2align 8
280; GFX940-NEXT:  ; %bb.2:
281; GFX940-NEXT:  .LBB8_0:
282; GFX940-NEXT:    s_mov_b32 s4, 8
283; GFX940-NEXT:    s_load_dword s0, s[0:1], s4 offset:0x2
284; GFX940-NEXT:    v_mov_b32_e32 v0, 0
285; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
286; GFX940-NEXT:    v_mov_b32_e32 v1, s0
287; GFX940-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
288; GFX940-NEXT:    s_endpgm
289;
290; GFX90a-LABEL: random_incorrect_offset:
291; GFX90a:       ; %bb.1:
292; GFX90a-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
293; GFX90a-NEXT:    s_waitcnt lgkmcnt(0)
294; GFX90a-NEXT:    s_branch .LBB8_0
295; GFX90a-NEXT:    .p2align 8
296; GFX90a-NEXT:  ; %bb.2:
297; GFX90a-NEXT:  .LBB8_0:
298; GFX90a-NEXT:    s_mov_b32 s0, 8
299; GFX90a-NEXT:    s_load_dword s0, s[4:5], s0 offset:0x2
300; GFX90a-NEXT:    v_mov_b32_e32 v0, 0
301; GFX90a-NEXT:    s_waitcnt lgkmcnt(0)
302; GFX90a-NEXT:    v_mov_b32_e32 v1, s0
303; GFX90a-NEXT:    global_store_dword v0, v1, s[6:7]
304; GFX90a-NEXT:    s_endpgm
305  %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
306  %gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 2
307  %load = load i32, ptr addrspace(4) %gep
308  store i32 %load, ptr addrspace(1) %out
309  ret void
310}
311
312define amdgpu_kernel void @preload_block_count_z(ptr addrspace(1) inreg %out) #0 {
313; GFX940-LABEL: preload_block_count_z:
314; GFX940:       ; %bb.1:
315; GFX940-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
316; GFX940-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x8
317; GFX940-NEXT:    s_load_dword s6, s[0:1], 0x10
318; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
319; GFX940-NEXT:    s_branch .LBB9_0
320; GFX940-NEXT:    .p2align 8
321; GFX940-NEXT:  ; %bb.2:
322; GFX940-NEXT:  .LBB9_0:
323; GFX940-NEXT:    v_mov_b32_e32 v0, 0
324; GFX940-NEXT:    v_mov_b32_e32 v1, s6
325; GFX940-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
326; GFX940-NEXT:    s_endpgm
327;
328; GFX90a-LABEL: preload_block_count_z:
329; GFX90a:       ; %bb.1:
330; GFX90a-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
331; GFX90a-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x8
332; GFX90a-NEXT:    s_load_dword s10, s[4:5], 0x10
333; GFX90a-NEXT:    s_waitcnt lgkmcnt(0)
334; GFX90a-NEXT:    s_branch .LBB9_0
335; GFX90a-NEXT:    .p2align 8
336; GFX90a-NEXT:  ; %bb.2:
337; GFX90a-NEXT:  .LBB9_0:
338; GFX90a-NEXT:    v_mov_b32_e32 v0, 0
339; GFX90a-NEXT:    v_mov_b32_e32 v1, s10
340; GFX90a-NEXT:    global_store_dword v0, v1, s[6:7]
341; GFX90a-NEXT:    s_endpgm
342  %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
343  %gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 8
344  %load = load i32, ptr addrspace(4) %gep
345  store i32 %load, ptr addrspace(1) %out
346  ret void
347}
348
349define amdgpu_kernel void @preload_block_count_x_imparg_align_ptr_i8(ptr addrspace(1) inreg %out, i8 inreg %val) #0 {
350; GFX940-LABEL: preload_block_count_x_imparg_align_ptr_i8:
351; GFX940:       ; %bb.1:
352; GFX940-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
353; GFX940-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x8
354; GFX940-NEXT:    s_load_dword s6, s[0:1], 0x10
355; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
356; GFX940-NEXT:    s_branch .LBB10_0
357; GFX940-NEXT:    .p2align 8
358; GFX940-NEXT:  ; %bb.2:
359; GFX940-NEXT:  .LBB10_0:
360; GFX940-NEXT:    s_and_b32 s0, s4, 0xff
361; GFX940-NEXT:    s_add_i32 s0, s6, s0
362; GFX940-NEXT:    v_mov_b32_e32 v0, 0
363; GFX940-NEXT:    v_mov_b32_e32 v1, s0
364; GFX940-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
365; GFX940-NEXT:    s_endpgm
366;
367; GFX90a-LABEL: preload_block_count_x_imparg_align_ptr_i8:
368; GFX90a:       ; %bb.1:
369; GFX90a-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
370; GFX90a-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x8
371; GFX90a-NEXT:    s_load_dword s10, s[4:5], 0x10
372; GFX90a-NEXT:    s_waitcnt lgkmcnt(0)
373; GFX90a-NEXT:    s_branch .LBB10_0
374; GFX90a-NEXT:    .p2align 8
375; GFX90a-NEXT:  ; %bb.2:
376; GFX90a-NEXT:  .LBB10_0:
377; GFX90a-NEXT:    s_and_b32 s0, s8, 0xff
378; GFX90a-NEXT:    s_add_i32 s0, s10, s0
379; GFX90a-NEXT:    v_mov_b32_e32 v0, 0
380; GFX90a-NEXT:    v_mov_b32_e32 v1, s0
381; GFX90a-NEXT:    global_store_dword v0, v1, s[6:7]
382; GFX90a-NEXT:    s_endpgm
383  %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
384  %load = load i32, ptr addrspace(4) %imp_arg_ptr
385  %ext = zext i8 %val to i32
386  %add = add i32 %load, %ext
387  store i32 %add, ptr addrspace(1) %out
388  ret void
389}
390
391define amdgpu_kernel void @preload_block_count_xyz(ptr addrspace(1) inreg %out) #0 {
392; GFX940-LABEL: preload_block_count_xyz:
393; GFX940:       ; %bb.1:
394; GFX940-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
395; GFX940-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x8
396; GFX940-NEXT:    s_load_dword s6, s[0:1], 0x10
397; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
398; GFX940-NEXT:    s_branch .LBB11_0
399; GFX940-NEXT:    .p2align 8
400; GFX940-NEXT:  ; %bb.2:
401; GFX940-NEXT:  .LBB11_0:
402; GFX940-NEXT:    v_mov_b32_e32 v3, 0
403; GFX940-NEXT:    v_mov_b32_e32 v0, s4
404; GFX940-NEXT:    v_mov_b32_e32 v1, s5
405; GFX940-NEXT:    v_mov_b32_e32 v2, s6
406; GFX940-NEXT:    global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1
407; GFX940-NEXT:    s_endpgm
408;
409; GFX90a-LABEL: preload_block_count_xyz:
410; GFX90a:       ; %bb.1:
411; GFX90a-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
412; GFX90a-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x8
413; GFX90a-NEXT:    s_load_dword s10, s[4:5], 0x10
414; GFX90a-NEXT:    s_waitcnt lgkmcnt(0)
415; GFX90a-NEXT:    s_branch .LBB11_0
416; GFX90a-NEXT:    .p2align 8
417; GFX90a-NEXT:  ; %bb.2:
418; GFX90a-NEXT:  .LBB11_0:
419; GFX90a-NEXT:    v_mov_b32_e32 v3, 0
420; GFX90a-NEXT:    v_mov_b32_e32 v0, s8
421; GFX90a-NEXT:    v_mov_b32_e32 v1, s9
422; GFX90a-NEXT:    v_mov_b32_e32 v2, s10
423; GFX90a-NEXT:    global_store_dwordx3 v3, v[0:2], s[6:7]
424; GFX90a-NEXT:    s_endpgm
425  %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
426  %gep_x = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 0
427  %load_x = load i32, ptr addrspace(4) %gep_x
428  %gep_y = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 4
429  %load_y = load i32, ptr addrspace(4) %gep_y
430  %gep_z = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 8
431  %load_z = load i32, ptr addrspace(4) %gep_z
432  %ins.0 =  insertelement <3 x i32> poison, i32 %load_x, i32 0
433  %ins.1 =  insertelement <3 x i32> %ins.0, i32 %load_y, i32 1
434  %ins.2 =  insertelement <3 x i32> %ins.1, i32 %load_z, i32 2
435  store <3 x i32> %ins.2, ptr addrspace(1) %out
436  ret void
437}
438
439define amdgpu_kernel void @preload_workgroup_size_x(ptr addrspace(1) inreg %out) #0 {
440; GFX940-LABEL: preload_workgroup_size_x:
441; GFX940:       ; %bb.1:
442; GFX940-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
443; GFX940-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x8
444; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
445; GFX940-NEXT:    s_branch .LBB12_0
446; GFX940-NEXT:    .p2align 8
447; GFX940-NEXT:  ; %bb.2:
448; GFX940-NEXT:  .LBB12_0:
449; GFX940-NEXT:    s_and_b32 s0, s7, 0xffff
450; GFX940-NEXT:    v_mov_b32_e32 v0, 0
451; GFX940-NEXT:    v_mov_b32_e32 v1, s0
452; GFX940-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
453; GFX940-NEXT:    s_endpgm
454;
455; GFX90a-LABEL: preload_workgroup_size_x:
456; GFX90a:       ; %bb.1:
457; GFX90a-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
458; GFX90a-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x8
459; GFX90a-NEXT:    s_waitcnt lgkmcnt(0)
460; GFX90a-NEXT:    s_branch .LBB12_0
461; GFX90a-NEXT:    .p2align 8
462; GFX90a-NEXT:  ; %bb.2:
463; GFX90a-NEXT:  .LBB12_0:
464; GFX90a-NEXT:    s_and_b32 s0, s11, 0xffff
465; GFX90a-NEXT:    v_mov_b32_e32 v0, 0
466; GFX90a-NEXT:    v_mov_b32_e32 v1, s0
467; GFX90a-NEXT:    global_store_dword v0, v1, s[6:7]
468; GFX90a-NEXT:    s_endpgm
469  %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
470  %gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 12
471  %load = load i16, ptr addrspace(4) %gep
472  %conv = zext i16 %load to i32
473  store i32 %conv, ptr addrspace(1) %out
474  ret void
475}
476
477define amdgpu_kernel void @preload_workgroup_size_y(ptr addrspace(1) inreg %out) #0 {
478; GFX940-LABEL: preload_workgroup_size_y:
479; GFX940:       ; %bb.1:
480; GFX940-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
481; GFX940-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x8
482; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
483; GFX940-NEXT:    s_branch .LBB13_0
484; GFX940-NEXT:    .p2align 8
485; GFX940-NEXT:  ; %bb.2:
486; GFX940-NEXT:  .LBB13_0:
487; GFX940-NEXT:    s_lshr_b32 s0, s7, 16
488; GFX940-NEXT:    v_mov_b32_e32 v0, 0
489; GFX940-NEXT:    v_mov_b32_e32 v1, s0
490; GFX940-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
491; GFX940-NEXT:    s_endpgm
492;
493; GFX90a-LABEL: preload_workgroup_size_y:
494; GFX90a:       ; %bb.1:
495; GFX90a-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
496; GFX90a-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x8
497; GFX90a-NEXT:    s_waitcnt lgkmcnt(0)
498; GFX90a-NEXT:    s_branch .LBB13_0
499; GFX90a-NEXT:    .p2align 8
500; GFX90a-NEXT:  ; %bb.2:
501; GFX90a-NEXT:  .LBB13_0:
502; GFX90a-NEXT:    s_lshr_b32 s0, s11, 16
503; GFX90a-NEXT:    v_mov_b32_e32 v0, 0
504; GFX90a-NEXT:    v_mov_b32_e32 v1, s0
505; GFX90a-NEXT:    global_store_dword v0, v1, s[6:7]
506; GFX90a-NEXT:    s_endpgm
507  %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
508  %gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 14
509  %load = load i16, ptr addrspace(4) %gep
510  %conv = zext i16 %load to i32
511  store i32 %conv, ptr addrspace(1) %out
512  ret void
513}
514
515define amdgpu_kernel void @preload_workgroup_size_z(ptr addrspace(1) inreg %out) #0 {
516; GFX940-LABEL: preload_workgroup_size_z:
517; GFX940:       ; %bb.1:
518; GFX940-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
519; GFX940-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x8
520; GFX940-NEXT:    s_load_dword s8, s[0:1], 0x18
521; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
522; GFX940-NEXT:    s_branch .LBB14_0
523; GFX940-NEXT:    .p2align 8
524; GFX940-NEXT:  ; %bb.2:
525; GFX940-NEXT:  .LBB14_0:
526; GFX940-NEXT:    s_and_b32 s0, s8, 0xffff
527; GFX940-NEXT:    v_mov_b32_e32 v0, 0
528; GFX940-NEXT:    v_mov_b32_e32 v1, s0
529; GFX940-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
530; GFX940-NEXT:    s_endpgm
531;
532; GFX90a-LABEL: preload_workgroup_size_z:
533; GFX90a:       ; %bb.1:
534; GFX90a-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
535; GFX90a-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x8
536; GFX90a-NEXT:    s_load_dword s12, s[4:5], 0x18
537; GFX90a-NEXT:    s_waitcnt lgkmcnt(0)
538; GFX90a-NEXT:    s_branch .LBB14_0
539; GFX90a-NEXT:    .p2align 8
540; GFX90a-NEXT:  ; %bb.2:
541; GFX90a-NEXT:  .LBB14_0:
542; GFX90a-NEXT:    s_and_b32 s0, s12, 0xffff
543; GFX90a-NEXT:    v_mov_b32_e32 v0, 0
544; GFX90a-NEXT:    v_mov_b32_e32 v1, s0
545; GFX90a-NEXT:    global_store_dword v0, v1, s[6:7]
546; GFX90a-NEXT:    s_endpgm
547  %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
548  %gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 16
549  %load = load i16, ptr addrspace(4) %gep
550  %conv = zext i16 %load to i32
551  store i32 %conv, ptr addrspace(1) %out
552  ret void
553}
554
555define amdgpu_kernel void @preload_workgroup_size_xyz(ptr addrspace(1) inreg %out) #0 {
556; GFX940-LABEL: preload_workgroup_size_xyz:
557; GFX940:       ; %bb.1:
558; GFX940-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
559; GFX940-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x8
560; GFX940-NEXT:    s_load_dword s8, s[0:1], 0x18
561; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
562; GFX940-NEXT:    s_branch .LBB15_0
563; GFX940-NEXT:    .p2align 8
564; GFX940-NEXT:  ; %bb.2:
565; GFX940-NEXT:  .LBB15_0:
566; GFX940-NEXT:    s_lshr_b32 s0, s7, 16
567; GFX940-NEXT:    s_and_b32 s1, s7, 0xffff
568; GFX940-NEXT:    s_and_b32 s4, s8, 0xffff
569; GFX940-NEXT:    v_mov_b32_e32 v3, 0
570; GFX940-NEXT:    v_mov_b32_e32 v0, s1
571; GFX940-NEXT:    v_mov_b32_e32 v1, s0
572; GFX940-NEXT:    v_mov_b32_e32 v2, s4
573; GFX940-NEXT:    global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1
574; GFX940-NEXT:    s_endpgm
575;
576; GFX90a-LABEL: preload_workgroup_size_xyz:
577; GFX90a:       ; %bb.1:
578; GFX90a-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
579; GFX90a-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x8
580; GFX90a-NEXT:    s_load_dword s12, s[4:5], 0x18
581; GFX90a-NEXT:    s_waitcnt lgkmcnt(0)
582; GFX90a-NEXT:    s_branch .LBB15_0
583; GFX90a-NEXT:    .p2align 8
584; GFX90a-NEXT:  ; %bb.2:
585; GFX90a-NEXT:  .LBB15_0:
586; GFX90a-NEXT:    s_lshr_b32 s0, s11, 16
587; GFX90a-NEXT:    s_and_b32 s1, s11, 0xffff
588; GFX90a-NEXT:    s_and_b32 s2, s12, 0xffff
589; GFX90a-NEXT:    v_mov_b32_e32 v3, 0
590; GFX90a-NEXT:    v_mov_b32_e32 v0, s1
591; GFX90a-NEXT:    v_mov_b32_e32 v1, s0
592; GFX90a-NEXT:    v_mov_b32_e32 v2, s2
593; GFX90a-NEXT:    global_store_dwordx3 v3, v[0:2], s[6:7]
594; GFX90a-NEXT:    s_endpgm
595  %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
596  %gep_x = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 12
597  %load_x = load i16, ptr addrspace(4) %gep_x
598  %conv_x = zext i16 %load_x to i32
599  %gep_y = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 14
600  %load_y = load i16, ptr addrspace(4) %gep_y
601  %conv_y = zext i16 %load_y to i32
602  %gep_z = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 16
603  %load_z = load i16, ptr addrspace(4) %gep_z
604  %conv_z = zext i16 %load_z to i32
605  %ins.0 =  insertelement <3 x i32> poison, i32 %conv_x, i32 0
606  %ins.1 =  insertelement <3 x i32> %ins.0, i32 %conv_y, i32 1
607  %ins.2 =  insertelement <3 x i32> %ins.1, i32 %conv_z, i32 2
608  store <3 x i32> %ins.2, ptr addrspace(1) %out
609  ret void
610}
611
612define amdgpu_kernel void @preload_remainder_x(ptr addrspace(1) inreg %out) #0 {
613; GFX940-LABEL: preload_remainder_x:
614; GFX940:       ; %bb.1:
615; GFX940-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
616; GFX940-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x8
617; GFX940-NEXT:    s_load_dword s8, s[0:1], 0x18
618; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
619; GFX940-NEXT:    s_branch .LBB16_0
620; GFX940-NEXT:    .p2align 8
621; GFX940-NEXT:  ; %bb.2:
622; GFX940-NEXT:  .LBB16_0:
623; GFX940-NEXT:    s_lshr_b32 s0, s8, 16
624; GFX940-NEXT:    v_mov_b32_e32 v0, 0
625; GFX940-NEXT:    v_mov_b32_e32 v1, s0
626; GFX940-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
627; GFX940-NEXT:    s_endpgm
628;
629; GFX90a-LABEL: preload_remainder_x:
630; GFX90a:       ; %bb.1:
631; GFX90a-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
632; GFX90a-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x8
633; GFX90a-NEXT:    s_load_dword s12, s[4:5], 0x18
634; GFX90a-NEXT:    s_waitcnt lgkmcnt(0)
635; GFX90a-NEXT:    s_branch .LBB16_0
636; GFX90a-NEXT:    .p2align 8
637; GFX90a-NEXT:  ; %bb.2:
638; GFX90a-NEXT:  .LBB16_0:
639; GFX90a-NEXT:    s_lshr_b32 s0, s12, 16
640; GFX90a-NEXT:    v_mov_b32_e32 v0, 0
641; GFX90a-NEXT:    v_mov_b32_e32 v1, s0
642; GFX90a-NEXT:    global_store_dword v0, v1, s[6:7]
643; GFX90a-NEXT:    s_endpgm
644  %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
645  %gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 18
646  %load = load i16, ptr addrspace(4) %gep
647  %conv = zext i16 %load to i32
648  store i32 %conv, ptr addrspace(1) %out
649  ret void
650}
651
652define amdgpu_kernel void @preloadremainder_y(ptr addrspace(1) inreg %out) #0 {
653; GFX940-LABEL: preloadremainder_y:
654; GFX940:       ; %bb.1:
655; GFX940-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
656; GFX940-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x8
657; GFX940-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x18
658; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
659; GFX940-NEXT:    s_branch .LBB17_0
660; GFX940-NEXT:    .p2align 8
661; GFX940-NEXT:  ; %bb.2:
662; GFX940-NEXT:  .LBB17_0:
663; GFX940-NEXT:    s_and_b32 s0, s9, 0xffff
664; GFX940-NEXT:    v_mov_b32_e32 v0, 0
665; GFX940-NEXT:    v_mov_b32_e32 v1, s0
666; GFX940-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
667; GFX940-NEXT:    s_endpgm
668;
669; GFX90a-LABEL: preloadremainder_y:
670; GFX90a:       ; %bb.1:
671; GFX90a-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
672; GFX90a-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x8
673; GFX90a-NEXT:    s_load_dwordx2 s[12:13], s[4:5], 0x18
674; GFX90a-NEXT:    s_waitcnt lgkmcnt(0)
675; GFX90a-NEXT:    s_branch .LBB17_0
676; GFX90a-NEXT:    .p2align 8
677; GFX90a-NEXT:  ; %bb.2:
678; GFX90a-NEXT:  .LBB17_0:
679; GFX90a-NEXT:    s_and_b32 s0, s13, 0xffff
680; GFX90a-NEXT:    v_mov_b32_e32 v0, 0
681; GFX90a-NEXT:    v_mov_b32_e32 v1, s0
682; GFX90a-NEXT:    global_store_dword v0, v1, s[6:7]
683; GFX90a-NEXT:    s_endpgm
684  %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
685  %gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 20
686  %load = load i16, ptr addrspace(4) %gep
687  %conv = zext i16 %load to i32
688  store i32 %conv, ptr addrspace(1) %out
689  ret void
690}
691
692define amdgpu_kernel void @preloadremainder_z(ptr addrspace(1) inreg %out) #0 {
693; GFX940-LABEL: preloadremainder_z:
694; GFX940:       ; %bb.1:
695; GFX940-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
696; GFX940-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x8
697; GFX940-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x18
698; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
699; GFX940-NEXT:    s_branch .LBB18_0
700; GFX940-NEXT:    .p2align 8
701; GFX940-NEXT:  ; %bb.2:
702; GFX940-NEXT:  .LBB18_0:
703; GFX940-NEXT:    s_lshr_b32 s0, s9, 16
704; GFX940-NEXT:    v_mov_b32_e32 v0, 0
705; GFX940-NEXT:    v_mov_b32_e32 v1, s0
706; GFX940-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
707; GFX940-NEXT:    s_endpgm
708;
709; GFX90a-LABEL: preloadremainder_z:
710; GFX90a:       ; %bb.1:
711; GFX90a-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
712; GFX90a-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x8
713; GFX90a-NEXT:    s_load_dwordx2 s[12:13], s[4:5], 0x18
714; GFX90a-NEXT:    s_waitcnt lgkmcnt(0)
715; GFX90a-NEXT:    s_branch .LBB18_0
716; GFX90a-NEXT:    .p2align 8
717; GFX90a-NEXT:  ; %bb.2:
718; GFX90a-NEXT:  .LBB18_0:
719; GFX90a-NEXT:    s_lshr_b32 s0, s13, 16
720; GFX90a-NEXT:    v_mov_b32_e32 v0, 0
721; GFX90a-NEXT:    v_mov_b32_e32 v1, s0
722; GFX90a-NEXT:    global_store_dword v0, v1, s[6:7]
723; GFX90a-NEXT:    s_endpgm
724  %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
725  %gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 22
726  %load = load i16, ptr addrspace(4) %gep
727  %conv = zext i16 %load to i32
728  store i32 %conv, ptr addrspace(1) %out
729  ret void
730}
731
732define amdgpu_kernel void @preloadremainder_xyz(ptr addrspace(1) inreg %out) #0 {
733; GFX940-LABEL: preloadremainder_xyz:
734; GFX940:       ; %bb.1:
735; GFX940-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
736; GFX940-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x8
737; GFX940-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x18
738; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
739; GFX940-NEXT:    s_branch .LBB19_0
740; GFX940-NEXT:    .p2align 8
741; GFX940-NEXT:  ; %bb.2:
742; GFX940-NEXT:  .LBB19_0:
743; GFX940-NEXT:    s_lshr_b32 s0, s9, 16
744; GFX940-NEXT:    s_lshr_b32 s1, s8, 16
745; GFX940-NEXT:    s_and_b32 s4, s9, 0xffff
746; GFX940-NEXT:    v_mov_b32_e32 v3, 0
747; GFX940-NEXT:    v_mov_b32_e32 v0, s1
748; GFX940-NEXT:    v_mov_b32_e32 v1, s4
749; GFX940-NEXT:    v_mov_b32_e32 v2, s0
750; GFX940-NEXT:    global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1
751; GFX940-NEXT:    s_endpgm
752;
753; GFX90a-LABEL: preloadremainder_xyz:
754; GFX90a:       ; %bb.1:
755; GFX90a-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
756; GFX90a-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x8
757; GFX90a-NEXT:    s_load_dwordx2 s[12:13], s[4:5], 0x18
758; GFX90a-NEXT:    s_waitcnt lgkmcnt(0)
759; GFX90a-NEXT:    s_branch .LBB19_0
760; GFX90a-NEXT:    .p2align 8
761; GFX90a-NEXT:  ; %bb.2:
762; GFX90a-NEXT:  .LBB19_0:
763; GFX90a-NEXT:    s_lshr_b32 s0, s13, 16
764; GFX90a-NEXT:    s_lshr_b32 s1, s12, 16
765; GFX90a-NEXT:    s_and_b32 s2, s13, 0xffff
766; GFX90a-NEXT:    v_mov_b32_e32 v3, 0
767; GFX90a-NEXT:    v_mov_b32_e32 v0, s1
768; GFX90a-NEXT:    v_mov_b32_e32 v1, s2
769; GFX90a-NEXT:    v_mov_b32_e32 v2, s0
770; GFX90a-NEXT:    global_store_dwordx3 v3, v[0:2], s[6:7]
771; GFX90a-NEXT:    s_endpgm
772  %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
773  %gep_x = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 18
774  %load_x = load i16, ptr addrspace(4) %gep_x
775  %conv_x = zext i16 %load_x to i32
776  %gep_y = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 20
777  %load_y = load i16, ptr addrspace(4) %gep_y
778  %conv_y = zext i16 %load_y to i32
779  %gep_z = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 22
780  %load_z = load i16, ptr addrspace(4) %gep_z
781  %conv_z = zext i16 %load_z to i32
782  %ins.0 =  insertelement <3 x i32> poison, i32 %conv_x, i32 0
783  %ins.1 =  insertelement <3 x i32> %ins.0, i32 %conv_y, i32 1
784  %ins.2 =  insertelement <3 x i32> %ins.1, i32 %conv_z, i32 2
785  store <3 x i32> %ins.2, ptr addrspace(1) %out
786  ret void
787}
788
789define amdgpu_kernel void @no_free_sgprs_preloadremainder_z(ptr addrspace(1) inreg %out) {
790; GFX940-LABEL: no_free_sgprs_preloadremainder_z:
791; GFX940:       ; %bb.1:
792; GFX940-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x0
793; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
794; GFX940-NEXT:    s_branch .LBB20_0
795; GFX940-NEXT:    .p2align 8
796; GFX940-NEXT:  ; %bb.2:
797; GFX940-NEXT:  .LBB20_0:
798; GFX940-NEXT:    s_lshr_b32 s0, s15, 16
799; GFX940-NEXT:    v_mov_b32_e32 v0, 0
800; GFX940-NEXT:    v_mov_b32_e32 v1, s0
801; GFX940-NEXT:    global_store_dword v0, v1, s[8:9] sc0 sc1
802; GFX940-NEXT:    s_endpgm
803;
804; GFX90a-LABEL: no_free_sgprs_preloadremainder_z:
805; GFX90a:       ; %bb.1:
806; GFX90a-NEXT:    s_load_dwordx2 s[12:13], s[8:9], 0x0
807; GFX90a-NEXT:    s_waitcnt lgkmcnt(0)
808; GFX90a-NEXT:    s_branch .LBB20_0
809; GFX90a-NEXT:    .p2align 8
810; GFX90a-NEXT:  ; %bb.2:
811; GFX90a-NEXT:  .LBB20_0:
812; GFX90a-NEXT:    s_load_dword s0, s[8:9], 0x1c
813; GFX90a-NEXT:    v_mov_b32_e32 v0, 0
814; GFX90a-NEXT:    s_waitcnt lgkmcnt(0)
815; GFX90a-NEXT:    s_lshr_b32 s0, s0, 16
816; GFX90a-NEXT:    v_mov_b32_e32 v1, s0
817; GFX90a-NEXT:    global_store_dword v0, v1, s[12:13]
818; GFX90a-NEXT:    s_endpgm
819  %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
820  %gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 22
821  %load = load i16, ptr addrspace(4) %gep
822  %conv = zext i16 %load to i32
823  store i32 %conv, ptr addrspace(1) %out
824  ret void
825}
826
827; Check for consistency between isel and earlier passes preload SGPR accounting with max preload SGPRs.
828
829define amdgpu_kernel void @preload_block_max_user_sgprs(ptr addrspace(1) inreg %out, i192 inreg %t0, i32 inreg %t1) #0 {
830; GFX940-LABEL: preload_block_max_user_sgprs:
831; GFX940:       ; %bb.1:
832; GFX940-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
833; GFX940-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x8
834; GFX940-NEXT:    s_load_dword s12, s[0:1], 0x28
835; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
836; GFX940-NEXT:    s_branch .LBB21_0
837; GFX940-NEXT:    .p2align 8
838; GFX940-NEXT:  ; %bb.2:
839; GFX940-NEXT:  .LBB21_0:
840; GFX940-NEXT:    v_mov_b32_e32 v0, 0
841; GFX940-NEXT:    v_mov_b32_e32 v1, s12
842; GFX940-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
843; GFX940-NEXT:    s_endpgm
844;
845; GFX90a-LABEL: preload_block_max_user_sgprs:
846; GFX90a:       ; %bb.1:
847; GFX90a-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
848; GFX90a-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x8
849; GFX90a-NEXT:    s_load_dwordx2 s[12:13], s[4:5], 0x18
850; GFX90a-NEXT:    s_load_dword s14, s[4:5], 0x20
851; GFX90a-NEXT:    s_waitcnt lgkmcnt(0)
852; GFX90a-NEXT:    s_branch .LBB21_0
853; GFX90a-NEXT:    .p2align 8
854; GFX90a-NEXT:  ; %bb.2:
855; GFX90a-NEXT:  .LBB21_0:
856; GFX90a-NEXT:    s_load_dword s0, s[4:5], 0x28
857; GFX90a-NEXT:    v_mov_b32_e32 v0, 0
858; GFX90a-NEXT:    s_waitcnt lgkmcnt(0)
859; GFX90a-NEXT:    v_mov_b32_e32 v1, s0
860; GFX90a-NEXT:    global_store_dword v0, v1, s[6:7]
861; GFX90a-NEXT:    s_endpgm
862  %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
863  %load = load i32, ptr addrspace(4) %imp_arg_ptr
864  store i32 %load, ptr addrspace(1) %out
865  ret void
866}
867
868define amdgpu_kernel void @preload_block_count_z_workgroup_size_z_remainder_z(ptr addrspace(1) inreg %out) #0 {
869; GFX940-LABEL: preload_block_count_z_workgroup_size_z_remainder_z:
870; GFX940:       ; %bb.1:
871; GFX940-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
872; GFX940-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x8
873; GFX940-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x18
874; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
875; GFX940-NEXT:    s_branch .LBB22_0
876; GFX940-NEXT:    .p2align 8
877; GFX940-NEXT:  ; %bb.2:
878; GFX940-NEXT:  .LBB22_0:
879; GFX940-NEXT:    s_lshr_b32 s0, s9, 16
880; GFX940-NEXT:    s_and_b32 s1, s8, 0xffff
881; GFX940-NEXT:    v_mov_b32_e32 v3, 0
882; GFX940-NEXT:    v_mov_b32_e32 v0, s6
883; GFX940-NEXT:    v_mov_b32_e32 v1, s1
884; GFX940-NEXT:    v_mov_b32_e32 v2, s0
885; GFX940-NEXT:    global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1
886; GFX940-NEXT:    s_endpgm
887;
888; GFX90a-LABEL: preload_block_count_z_workgroup_size_z_remainder_z:
889; GFX90a:       ; %bb.1:
890; GFX90a-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
891; GFX90a-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x8
892; GFX90a-NEXT:    s_load_dwordx2 s[12:13], s[4:5], 0x18
893; GFX90a-NEXT:    s_waitcnt lgkmcnt(0)
894; GFX90a-NEXT:    s_branch .LBB22_0
895; GFX90a-NEXT:    .p2align 8
896; GFX90a-NEXT:  ; %bb.2:
897; GFX90a-NEXT:  .LBB22_0:
898; GFX90a-NEXT:    s_lshr_b32 s0, s13, 16
899; GFX90a-NEXT:    s_and_b32 s1, s12, 0xffff
900; GFX90a-NEXT:    v_mov_b32_e32 v3, 0
901; GFX90a-NEXT:    v_mov_b32_e32 v0, s10
902; GFX90a-NEXT:    v_mov_b32_e32 v1, s1
903; GFX90a-NEXT:    v_mov_b32_e32 v2, s0
904; GFX90a-NEXT:    global_store_dwordx3 v3, v[0:2], s[6:7]
905; GFX90a-NEXT:    s_endpgm
906  %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
907  %gep0 = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 8
908  %gep1 = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 16
909  %gep2 = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 22
910  %load0 = load i32, ptr addrspace(4) %gep0
911  %load1 = load i16, ptr addrspace(4) %gep1
912  %load2 = load i16, ptr addrspace(4) %gep2
913  %conv1 = zext i16 %load1 to i32
914  %conv2 = zext i16 %load2 to i32
915  %ins.0 =  insertelement <3 x i32> poison, i32 %load0, i32 0
916  %ins.1 =  insertelement <3 x i32> %ins.0, i32 %conv1, i32 1
917  %ins.2 =  insertelement <3 x i32> %ins.1, i32 %conv2, i32 2
918  store <3 x i32> %ins.2, ptr addrspace(1) %out
919  ret void
920}
921
922attributes #0 = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
923