xref: /llvm-project/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll (revision 2e5c2982819625d84e0b61aea0ec00de859f0e95)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
2; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940 %s
3
4; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90a %s
5
; Zero-extends the inreg i8 %arg0 to i32 and stores it through the inreg pointer %out.
; Both targets are expected to emit a %bb.1 block that loads kernarg offsets 0x0 and
; 0x8 and then branches past .p2align 8 padding to the body; the body masks the value
; with 0xff and stores it without issuing another kernarg load.
6define amdgpu_kernel void @ptr1_i8(ptr addrspace(1) inreg %out, i8 inreg %arg0) #0 {
7; GFX940-LABEL: ptr1_i8:
8; GFX940:       ; %bb.1:
9; GFX940-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
10; GFX940-NEXT:    s_load_dword s4, s[0:1], 0x8
11; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
12; GFX940-NEXT:    s_branch .LBB0_0
13; GFX940-NEXT:    .p2align 8
14; GFX940-NEXT:  ; %bb.2:
15; GFX940-NEXT:  .LBB0_0:
16; GFX940-NEXT:    s_and_b32 s0, s4, 0xff
17; GFX940-NEXT:    v_mov_b32_e32 v0, 0
18; GFX940-NEXT:    v_mov_b32_e32 v1, s0
19; GFX940-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
20; GFX940-NEXT:    s_endpgm
21;
22; GFX90a-LABEL: ptr1_i8:
23; GFX90a:       ; %bb.1:
24; GFX90a-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
25; GFX90a-NEXT:    s_load_dword s8, s[4:5], 0x8
26; GFX90a-NEXT:    s_waitcnt lgkmcnt(0)
27; GFX90a-NEXT:    s_branch .LBB0_0
28; GFX90a-NEXT:    .p2align 8
29; GFX90a-NEXT:  ; %bb.2:
30; GFX90a-NEXT:  .LBB0_0:
31; GFX90a-NEXT:    s_and_b32 s0, s8, 0xff
32; GFX90a-NEXT:    v_mov_b32_e32 v0, 0
33; GFX90a-NEXT:    v_mov_b32_e32 v1, s0
34; GFX90a-NEXT:    global_store_dword v0, v1, s[6:7]
35; GFX90a-NEXT:    s_endpgm
36  %ext = zext i8 %arg0 to i32
37  store i32 %ext, ptr addrspace(1) %out
38  ret void
39}
40
; Variant where the inreg i8 %arg0 additionally carries the zeroext attribute; the value
; is zero-extended to i32 and stored through %out. The expected body on both targets
; still contains the s_and_b32 with 0xff before the store.
41define amdgpu_kernel void @ptr1_i8_zext_arg(ptr addrspace(1) inreg %out, i8 zeroext inreg %arg0) #0 {
42; GFX940-LABEL: ptr1_i8_zext_arg:
43; GFX940:       ; %bb.1:
44; GFX940-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
45; GFX940-NEXT:    s_load_dword s4, s[0:1], 0x8
46; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
47; GFX940-NEXT:    s_branch .LBB1_0
48; GFX940-NEXT:    .p2align 8
49; GFX940-NEXT:  ; %bb.2:
50; GFX940-NEXT:  .LBB1_0:
51; GFX940-NEXT:    s_and_b32 s0, s4, 0xff
52; GFX940-NEXT:    v_mov_b32_e32 v0, 0
53; GFX940-NEXT:    v_mov_b32_e32 v1, s0
54; GFX940-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
55; GFX940-NEXT:    s_endpgm
56;
57; GFX90a-LABEL: ptr1_i8_zext_arg:
58; GFX90a:       ; %bb.1:
59; GFX90a-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
60; GFX90a-NEXT:    s_load_dword s8, s[4:5], 0x8
61; GFX90a-NEXT:    s_waitcnt lgkmcnt(0)
62; GFX90a-NEXT:    s_branch .LBB1_0
63; GFX90a-NEXT:    .p2align 8
64; GFX90a-NEXT:  ; %bb.2:
65; GFX90a-NEXT:  .LBB1_0:
66; GFX90a-NEXT:    s_and_b32 s0, s8, 0xff
67; GFX90a-NEXT:    v_mov_b32_e32 v0, 0
68; GFX90a-NEXT:    v_mov_b32_e32 v1, s0
69; GFX90a-NEXT:    global_store_dword v0, v1, s[6:7]
70; GFX90a-NEXT:    s_endpgm
71  %ext = zext i8 %arg0 to i32
72  store i32 %ext, ptr addrspace(1) %out, align 4
73  ret void
74}
75
; Zero-extends the inreg i16 %arg0 to i32 and stores it through %out. The checks expect
; the dword at kernarg offset 0x8 to be loaded in %bb.1 and masked with 0xffff in the body.
76define amdgpu_kernel void @ptr1_i16_preload_arg(ptr addrspace(1) inreg %out, i16 inreg %arg0) #0 {
77; GFX940-LABEL: ptr1_i16_preload_arg:
78; GFX940:       ; %bb.1:
79; GFX940-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
80; GFX940-NEXT:    s_load_dword s4, s[0:1], 0x8
81; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
82; GFX940-NEXT:    s_branch .LBB2_0
83; GFX940-NEXT:    .p2align 8
84; GFX940-NEXT:  ; %bb.2:
85; GFX940-NEXT:  .LBB2_0:
86; GFX940-NEXT:    s_and_b32 s0, s4, 0xffff
87; GFX940-NEXT:    v_mov_b32_e32 v0, 0
88; GFX940-NEXT:    v_mov_b32_e32 v1, s0
89; GFX940-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
90; GFX940-NEXT:    s_endpgm
91;
92; GFX90a-LABEL: ptr1_i16_preload_arg:
93; GFX90a:       ; %bb.1:
94; GFX90a-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
95; GFX90a-NEXT:    s_load_dword s8, s[4:5], 0x8
96; GFX90a-NEXT:    s_waitcnt lgkmcnt(0)
97; GFX90a-NEXT:    s_branch .LBB2_0
98; GFX90a-NEXT:    .p2align 8
99; GFX90a-NEXT:  ; %bb.2:
100; GFX90a-NEXT:  .LBB2_0:
101; GFX90a-NEXT:    s_and_b32 s0, s8, 0xffff
102; GFX90a-NEXT:    v_mov_b32_e32 v0, 0
103; GFX90a-NEXT:    v_mov_b32_e32 v1, s0
104; GFX90a-NEXT:    global_store_dword v0, v1, s[6:7]
105; GFX90a-NEXT:    s_endpgm
106  %ext = zext i16 %arg0 to i32
107  store i32 %ext, ptr addrspace(1) %out, align 4
108  ret void
109}
110
; Stores the inreg i32 %arg0 unchanged through %out. With a full dword there is no
; masking in the expected body: the SGPR loaded from offset 0x8 in %bb.1 is copied to a
; VGPR and stored directly.
111define amdgpu_kernel void @ptr1_i32_preload_arg(ptr addrspace(1) inreg %out, i32 inreg %arg0) #0 {
112; GFX940-LABEL: ptr1_i32_preload_arg:
113; GFX940:       ; %bb.1:
114; GFX940-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
115; GFX940-NEXT:    s_load_dword s4, s[0:1], 0x8
116; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
117; GFX940-NEXT:    s_branch .LBB3_0
118; GFX940-NEXT:    .p2align 8
119; GFX940-NEXT:  ; %bb.2:
120; GFX940-NEXT:  .LBB3_0:
121; GFX940-NEXT:    v_mov_b32_e32 v0, 0
122; GFX940-NEXT:    v_mov_b32_e32 v1, s4
123; GFX940-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
124; GFX940-NEXT:    s_endpgm
125;
126; GFX90a-LABEL: ptr1_i32_preload_arg:
127; GFX90a:       ; %bb.1:
128; GFX90a-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
129; GFX90a-NEXT:    s_load_dword s8, s[4:5], 0x8
130; GFX90a-NEXT:    s_waitcnt lgkmcnt(0)
131; GFX90a-NEXT:    s_branch .LBB3_0
132; GFX90a-NEXT:    .p2align 8
133; GFX90a-NEXT:  ; %bb.2:
134; GFX90a-NEXT:  .LBB3_0:
135; GFX90a-NEXT:    v_mov_b32_e32 v0, 0
136; GFX90a-NEXT:    v_mov_b32_e32 v1, s8
137; GFX90a-NEXT:    global_store_dword v0, v1, s[6:7]
138; GFX90a-NEXT:    s_endpgm
139  store i32 %arg0, ptr addrspace(1) %out
140  ret void
141}
142
143
; Three inreg arguments in the order i32, pointer, i32: the two integers are added and
; the sum is stored through %out. The expected %bb.1 loads offsets 0x0, 0x8 and 0x10; the
; body adds the first SGPR of the 0x0 load to the SGPR from 0x10 and stores through the
; pair loaded from 0x8.
144define amdgpu_kernel void @i32_ptr1_i32_preload_arg(i32 inreg %arg0, ptr addrspace(1) inreg %out, i32 inreg %arg1) #0 {
145; GFX940-LABEL: i32_ptr1_i32_preload_arg:
146; GFX940:       ; %bb.1:
147; GFX940-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
148; GFX940-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x8
149; GFX940-NEXT:    s_load_dword s6, s[0:1], 0x10
150; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
151; GFX940-NEXT:    s_branch .LBB4_0
152; GFX940-NEXT:    .p2align 8
153; GFX940-NEXT:  ; %bb.2:
154; GFX940-NEXT:  .LBB4_0:
155; GFX940-NEXT:    s_add_i32 s0, s2, s6
156; GFX940-NEXT:    v_mov_b32_e32 v0, 0
157; GFX940-NEXT:    v_mov_b32_e32 v1, s0
158; GFX940-NEXT:    global_store_dword v0, v1, s[4:5] sc0 sc1
159; GFX940-NEXT:    s_endpgm
160;
161; GFX90a-LABEL: i32_ptr1_i32_preload_arg:
162; GFX90a:       ; %bb.1:
163; GFX90a-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
164; GFX90a-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x8
165; GFX90a-NEXT:    s_load_dword s10, s[4:5], 0x10
166; GFX90a-NEXT:    s_waitcnt lgkmcnt(0)
167; GFX90a-NEXT:    s_branch .LBB4_0
168; GFX90a-NEXT:    .p2align 8
169; GFX90a-NEXT:  ; %bb.2:
170; GFX90a-NEXT:  .LBB4_0:
171; GFX90a-NEXT:    s_add_i32 s0, s6, s10
172; GFX90a-NEXT:    v_mov_b32_e32 v0, 0
173; GFX90a-NEXT:    v_mov_b32_e32 v1, s0
174; GFX90a-NEXT:    global_store_dword v0, v1, s[8:9]
175; GFX90a-NEXT:    s_endpgm
176  %add = add i32 %arg0, %arg1
177  store i32 %add, ptr addrspace(1) %out
178  ret void
179}
180
; Two inreg i16 arguments after the pointer: both are zero-extended, added, and the i32
; sum is stored through %out. The checks expect a single dword load at offset 0x8 for
; both values; the body separates them with s_lshr_b32 by 16 and s_and_b32 with 0xffff
; on that same SGPR.
181define amdgpu_kernel void @ptr1_i16_i16_preload_arg(ptr addrspace(1) inreg %out, i16 inreg %arg0, i16 inreg %arg1) #0 {
182; GFX940-LABEL: ptr1_i16_i16_preload_arg:
183; GFX940:       ; %bb.1:
184; GFX940-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
185; GFX940-NEXT:    s_load_dword s4, s[0:1], 0x8
186; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
187; GFX940-NEXT:    s_branch .LBB5_0
188; GFX940-NEXT:    .p2align 8
189; GFX940-NEXT:  ; %bb.2:
190; GFX940-NEXT:  .LBB5_0:
191; GFX940-NEXT:    s_lshr_b32 s0, s4, 16
192; GFX940-NEXT:    s_and_b32 s1, s4, 0xffff
193; GFX940-NEXT:    s_add_i32 s0, s1, s0
194; GFX940-NEXT:    v_mov_b32_e32 v0, 0
195; GFX940-NEXT:    v_mov_b32_e32 v1, s0
196; GFX940-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
197; GFX940-NEXT:    s_endpgm
198;
199; GFX90a-LABEL: ptr1_i16_i16_preload_arg:
200; GFX90a:       ; %bb.1:
201; GFX90a-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
202; GFX90a-NEXT:    s_load_dword s8, s[4:5], 0x8
203; GFX90a-NEXT:    s_waitcnt lgkmcnt(0)
204; GFX90a-NEXT:    s_branch .LBB5_0
205; GFX90a-NEXT:    .p2align 8
206; GFX90a-NEXT:  ; %bb.2:
207; GFX90a-NEXT:  .LBB5_0:
208; GFX90a-NEXT:    s_lshr_b32 s0, s8, 16
209; GFX90a-NEXT:    s_and_b32 s1, s8, 0xffff
210; GFX90a-NEXT:    s_add_i32 s0, s1, s0
211; GFX90a-NEXT:    v_mov_b32_e32 v0, 0
212; GFX90a-NEXT:    v_mov_b32_e32 v1, s0
213; GFX90a-NEXT:    global_store_dword v0, v1, s[6:7]
214; GFX90a-NEXT:    s_endpgm
215  %ext = zext i16 %arg0 to i32
216  %ext1 = zext i16 %arg1 to i32
217  %add = add i32 %ext, %ext1
218  store i32 %add, ptr addrspace(1) %out, align 4
219  ret void
220}
221
; Stores the inreg <2 x i8> %in through %out. The expected body copies the SGPR loaded
; from offset 0x8 to a VGPR and writes it with a single global_store_short, with no
; repacking instructions in between.
222define amdgpu_kernel void @ptr1_v2i8_preload_arg(ptr addrspace(1) inreg %out, <2 x i8> inreg %in) #0 {
223; GFX940-LABEL: ptr1_v2i8_preload_arg:
224; GFX940:       ; %bb.1:
225; GFX940-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
226; GFX940-NEXT:    s_load_dword s4, s[0:1], 0x8
227; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
228; GFX940-NEXT:    s_branch .LBB6_0
229; GFX940-NEXT:    .p2align 8
230; GFX940-NEXT:  ; %bb.2:
231; GFX940-NEXT:  .LBB6_0:
232; GFX940-NEXT:    v_mov_b32_e32 v0, 0
233; GFX940-NEXT:    v_mov_b32_e32 v1, s4
234; GFX940-NEXT:    global_store_short v0, v1, s[2:3] sc0 sc1
235; GFX940-NEXT:    s_endpgm
236;
237; GFX90a-LABEL: ptr1_v2i8_preload_arg:
238; GFX90a:       ; %bb.1:
239; GFX90a-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
240; GFX90a-NEXT:    s_load_dword s8, s[4:5], 0x8
241; GFX90a-NEXT:    s_waitcnt lgkmcnt(0)
242; GFX90a-NEXT:    s_branch .LBB6_0
243; GFX90a-NEXT:    .p2align 8
244; GFX90a-NEXT:  ; %bb.2:
245; GFX90a-NEXT:  .LBB6_0:
246; GFX90a-NEXT:    v_mov_b32_e32 v0, 0
247; GFX90a-NEXT:    v_mov_b32_e32 v1, s8
248; GFX90a-NEXT:    global_store_short v0, v1, s[6:7]
249; GFX90a-NEXT:    s_endpgm
250  store <2 x i8> %in, ptr addrspace(1) %out
251  ret void
252}
253
254
; Only %out is inreg here; %in.byref is a byref(i32) pointer with align(256) and
; %after.offset is a plain i32. The kernel loads the i32 behind %in.byref and writes it
; and %after.offset to %out with two volatile stores. The checks expect %bb.1 to load
; only offset 0x0, and the body to fetch both remaining values with one s_load_dwordx2
; at offset 0x100, waiting on vmcnt(0) after each store.
255define amdgpu_kernel void @byref_preload_arg(ptr addrspace(1) inreg %out, ptr addrspace(4) byref(i32) align(256) %in.byref, i32 %after.offset) #0 {
256; GFX940-LABEL: byref_preload_arg:
257; GFX940:       ; %bb.1:
258; GFX940-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
259; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
260; GFX940-NEXT:    s_branch .LBB7_0
261; GFX940-NEXT:    .p2align 8
262; GFX940-NEXT:  ; %bb.2:
263; GFX940-NEXT:  .LBB7_0:
264; GFX940-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x100
265; GFX940-NEXT:    v_mov_b32_e32 v0, 0
266; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
267; GFX940-NEXT:    v_mov_b32_e32 v1, s4
268; GFX940-NEXT:    v_mov_b32_e32 v2, s5
269; GFX940-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
270; GFX940-NEXT:    s_waitcnt vmcnt(0)
271; GFX940-NEXT:    global_store_dword v0, v2, s[2:3] sc0 sc1
272; GFX940-NEXT:    s_waitcnt vmcnt(0)
273; GFX940-NEXT:    s_endpgm
274;
275; GFX90a-LABEL: byref_preload_arg:
276; GFX90a:       ; %bb.1:
277; GFX90a-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
278; GFX90a-NEXT:    s_waitcnt lgkmcnt(0)
279; GFX90a-NEXT:    s_branch .LBB7_0
280; GFX90a-NEXT:    .p2align 8
281; GFX90a-NEXT:  ; %bb.2:
282; GFX90a-NEXT:  .LBB7_0:
283; GFX90a-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x100
284; GFX90a-NEXT:    v_mov_b32_e32 v0, 0
285; GFX90a-NEXT:    s_waitcnt lgkmcnt(0)
286; GFX90a-NEXT:    v_mov_b32_e32 v1, s0
287; GFX90a-NEXT:    v_mov_b32_e32 v2, s1
288; GFX90a-NEXT:    global_store_dword v0, v1, s[6:7]
289; GFX90a-NEXT:    s_waitcnt vmcnt(0)
290; GFX90a-NEXT:    global_store_dword v0, v2, s[6:7]
291; GFX90a-NEXT:    s_waitcnt vmcnt(0)
292; GFX90a-NEXT:    s_endpgm
293  %in = load i32, ptr addrspace(4) %in.byref
294  store volatile i32 %in, ptr addrspace(1) %out, align 4
295  store volatile i32 %after.offset, ptr addrspace(1) %out, align 4
296  ret void
297}
298
299; The second inreg argument (%after.offset) follows a non-inreg byref argument, so it is not expected to be preloaded with the current behavior.
300
; %out and %after.offset are inreg, but the byref(i32) align(256) argument between them
; is not. The kernel stores the i32 behind %in.byref and then %after.offset to %out with
; volatile stores. The checks expect %bb.1 to load only offset 0x0; %after.offset is
; fetched in the body by the s_load_dwordx2 at offset 0x100 together with the byref value.
301define amdgpu_kernel void @byref_staggered_preload_arg(ptr addrspace(1) inreg %out, ptr addrspace(4) byref(i32) align(256) %in.byref, i32 inreg %after.offset) #0 {
302; GFX940-LABEL: byref_staggered_preload_arg:
303; GFX940:       ; %bb.1:
304; GFX940-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
305; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
306; GFX940-NEXT:    s_branch .LBB8_0
307; GFX940-NEXT:    .p2align 8
308; GFX940-NEXT:  ; %bb.2:
309; GFX940-NEXT:  .LBB8_0:
310; GFX940-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x100
311; GFX940-NEXT:    v_mov_b32_e32 v0, 0
312; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
313; GFX940-NEXT:    v_mov_b32_e32 v1, s4
314; GFX940-NEXT:    v_mov_b32_e32 v2, s5
315; GFX940-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
316; GFX940-NEXT:    s_waitcnt vmcnt(0)
317; GFX940-NEXT:    global_store_dword v0, v2, s[2:3] sc0 sc1
318; GFX940-NEXT:    s_waitcnt vmcnt(0)
319; GFX940-NEXT:    s_endpgm
320;
321; GFX90a-LABEL: byref_staggered_preload_arg:
322; GFX90a:       ; %bb.1:
323; GFX90a-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
324; GFX90a-NEXT:    s_waitcnt lgkmcnt(0)
325; GFX90a-NEXT:    s_branch .LBB8_0
326; GFX90a-NEXT:    .p2align 8
327; GFX90a-NEXT:  ; %bb.2:
328; GFX90a-NEXT:  .LBB8_0:
329; GFX90a-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x100
330; GFX90a-NEXT:    v_mov_b32_e32 v0, 0
331; GFX90a-NEXT:    s_waitcnt lgkmcnt(0)
332; GFX90a-NEXT:    v_mov_b32_e32 v1, s0
333; GFX90a-NEXT:    v_mov_b32_e32 v2, s1
334; GFX90a-NEXT:    global_store_dword v0, v1, s[6:7]
335; GFX90a-NEXT:    s_waitcnt vmcnt(0)
336; GFX90a-NEXT:    global_store_dword v0, v2, s[6:7]
337; GFX90a-NEXT:    s_waitcnt vmcnt(0)
338; GFX90a-NEXT:    s_endpgm
339  %in = load i32, ptr addrspace(4) %in.byref
340  store volatile i32 %in, ptr addrspace(1) %out, align 4
341  store volatile i32 %after.offset, ptr addrspace(1) %out, align 4
342  ret void
343}
344
345
; Stores the inreg <8 x i32> %in through %out. Although the vector is marked inreg, the
; checks expect %bb.1 to load only offset 0x0; the vector is fetched in the body with
; s_load_dwordx8 at offset 0x20 and written as two global_store_dwordx4 (offset:16 first,
; then offset 0), separated by s_nop 1 on gfx940 and s_nop 0 on gfx90a.
346define amdgpu_kernel void @v8i32_arg(ptr addrspace(1) nocapture inreg %out, <8 x i32> inreg %in) #0 {
347; GFX940-LABEL: v8i32_arg:
348; GFX940:       ; %bb.1:
349; GFX940-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
350; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
351; GFX940-NEXT:    s_branch .LBB9_0
352; GFX940-NEXT:    .p2align 8
353; GFX940-NEXT:  ; %bb.2:
354; GFX940-NEXT:  .LBB9_0:
355; GFX940-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x20
356; GFX940-NEXT:    v_mov_b32_e32 v4, 0
357; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
358; GFX940-NEXT:    v_mov_b32_e32 v0, s8
359; GFX940-NEXT:    v_mov_b32_e32 v1, s9
360; GFX940-NEXT:    v_mov_b32_e32 v2, s10
361; GFX940-NEXT:    v_mov_b32_e32 v3, s11
362; GFX940-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3] offset:16 sc0 sc1
363; GFX940-NEXT:    s_nop 1
364; GFX940-NEXT:    v_mov_b32_e32 v0, s4
365; GFX940-NEXT:    v_mov_b32_e32 v1, s5
366; GFX940-NEXT:    v_mov_b32_e32 v2, s6
367; GFX940-NEXT:    v_mov_b32_e32 v3, s7
368; GFX940-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3] sc0 sc1
369; GFX940-NEXT:    s_endpgm
370;
371; GFX90a-LABEL: v8i32_arg:
372; GFX90a:       ; %bb.1:
373; GFX90a-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
374; GFX90a-NEXT:    s_waitcnt lgkmcnt(0)
375; GFX90a-NEXT:    s_branch .LBB9_0
376; GFX90a-NEXT:    .p2align 8
377; GFX90a-NEXT:  ; %bb.2:
378; GFX90a-NEXT:  .LBB9_0:
379; GFX90a-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x20
380; GFX90a-NEXT:    v_mov_b32_e32 v4, 0
381; GFX90a-NEXT:    s_waitcnt lgkmcnt(0)
382; GFX90a-NEXT:    v_mov_b32_e32 v0, s12
383; GFX90a-NEXT:    v_mov_b32_e32 v1, s13
384; GFX90a-NEXT:    v_mov_b32_e32 v2, s14
385; GFX90a-NEXT:    v_mov_b32_e32 v3, s15
386; GFX90a-NEXT:    global_store_dwordx4 v4, v[0:3], s[6:7] offset:16
387; GFX90a-NEXT:    s_nop 0
388; GFX90a-NEXT:    v_mov_b32_e32 v0, s8
389; GFX90a-NEXT:    v_mov_b32_e32 v1, s9
390; GFX90a-NEXT:    v_mov_b32_e32 v2, s10
391; GFX90a-NEXT:    v_mov_b32_e32 v3, s11
392; GFX90a-NEXT:    global_store_dwordx4 v4, v[0:3], s[6:7]
393; GFX90a-NEXT:    s_endpgm
394  store <8 x i32> %in, ptr addrspace(1) %out, align 4
395  ret void
396}
397
; Stores the inreg <3 x i16> %in through %out. The checks expect the vector to arrive via
; the s_load_dwordx2 at offset 0x8 in %bb.1; the body writes the upper SGPR with
; global_store_short at offset:4 and the lower SGPR with global_store_dword at offset 0.
398define amdgpu_kernel void @v3i16_preload_arg(ptr addrspace(1) nocapture inreg %out, <3 x i16> inreg %in) #0 {
399; GFX940-LABEL: v3i16_preload_arg:
400; GFX940:       ; %bb.1:
401; GFX940-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
402; GFX940-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x8
403; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
404; GFX940-NEXT:    s_branch .LBB10_0
405; GFX940-NEXT:    .p2align 8
406; GFX940-NEXT:  ; %bb.2:
407; GFX940-NEXT:  .LBB10_0:
408; GFX940-NEXT:    v_mov_b32_e32 v0, 0
409; GFX940-NEXT:    v_mov_b32_e32 v1, s5
410; GFX940-NEXT:    global_store_short v0, v1, s[2:3] offset:4 sc0 sc1
411; GFX940-NEXT:    v_mov_b32_e32 v1, s4
412; GFX940-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
413; GFX940-NEXT:    s_endpgm
414;
415; GFX90a-LABEL: v3i16_preload_arg:
416; GFX90a:       ; %bb.1:
417; GFX90a-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
418; GFX90a-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x8
419; GFX90a-NEXT:    s_waitcnt lgkmcnt(0)
420; GFX90a-NEXT:    s_branch .LBB10_0
421; GFX90a-NEXT:    .p2align 8
422; GFX90a-NEXT:  ; %bb.2:
423; GFX90a-NEXT:  .LBB10_0:
424; GFX90a-NEXT:    v_mov_b32_e32 v0, 0
425; GFX90a-NEXT:    v_mov_b32_e32 v1, s9
426; GFX90a-NEXT:    global_store_short v0, v1, s[6:7] offset:4
427; GFX90a-NEXT:    v_mov_b32_e32 v1, s8
428; GFX90a-NEXT:    global_store_dword v0, v1, s[6:7]
429; GFX90a-NEXT:    s_endpgm
430  store <3 x i16> %in, ptr addrspace(1) %out, align 4
431  ret void
432}
433
; Stores the inreg <3 x i32> %in through %out. The expected %bb.1 loads offsets 0x0
; (dwordx2), 0x8 (dwordx4) and 0x18 (dwordx2). The body stores three consecutive SGPRs
; that straddle the last two loads (s6..s8 on gfx940, s10..s12 on gfx90a) with one
; global_store_dwordx3.
434define amdgpu_kernel void @v3i32_preload_arg(ptr addrspace(1) nocapture inreg %out, <3 x i32> inreg %in) #0 {
435; GFX940-LABEL: v3i32_preload_arg:
436; GFX940:       ; %bb.1:
437; GFX940-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
438; GFX940-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x8
439; GFX940-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x18
440; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
441; GFX940-NEXT:    s_branch .LBB11_0
442; GFX940-NEXT:    .p2align 8
443; GFX940-NEXT:  ; %bb.2:
444; GFX940-NEXT:  .LBB11_0:
445; GFX940-NEXT:    v_mov_b32_e32 v0, s6
446; GFX940-NEXT:    v_mov_b32_e32 v1, s7
447; GFX940-NEXT:    v_mov_b32_e32 v2, s8
448; GFX940-NEXT:    v_mov_b32_e32 v3, 0
449; GFX940-NEXT:    global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1
450; GFX940-NEXT:    s_endpgm
451;
452; GFX90a-LABEL: v3i32_preload_arg:
453; GFX90a:       ; %bb.1:
454; GFX90a-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
455; GFX90a-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x8
456; GFX90a-NEXT:    s_load_dwordx2 s[12:13], s[4:5], 0x18
457; GFX90a-NEXT:    s_waitcnt lgkmcnt(0)
458; GFX90a-NEXT:    s_branch .LBB11_0
459; GFX90a-NEXT:    .p2align 8
460; GFX90a-NEXT:  ; %bb.2:
461; GFX90a-NEXT:  .LBB11_0:
462; GFX90a-NEXT:    v_mov_b32_e32 v0, s10
463; GFX90a-NEXT:    v_mov_b32_e32 v1, s11
464; GFX90a-NEXT:    v_mov_b32_e32 v2, s12
465; GFX90a-NEXT:    v_mov_b32_e32 v3, 0
466; GFX90a-NEXT:    global_store_dwordx3 v3, v[0:2], s[6:7]
467; GFX90a-NEXT:    s_endpgm
468  store <3 x i32> %in, ptr addrspace(1) %out, align 4
469  ret void
470}
471
; Stores the inreg <3 x float> %in through %out. The expected %bb.1 loads offsets 0x0,
; 0x8 and 0x18. The body materializes the zero offset in v3 first, then copies three
; consecutive SGPRs (s6..s8 on gfx940, s10..s12 on gfx90a) and writes them with one
; global_store_dwordx3.
472define amdgpu_kernel void @v3f32_preload_arg(ptr addrspace(1) nocapture inreg %out, <3 x float> inreg %in) #0 {
473; GFX940-LABEL: v3f32_preload_arg:
474; GFX940:       ; %bb.1:
475; GFX940-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
476; GFX940-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x8
477; GFX940-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x18
478; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
479; GFX940-NEXT:    s_branch .LBB12_0
480; GFX940-NEXT:    .p2align 8
481; GFX940-NEXT:  ; %bb.2:
482; GFX940-NEXT:  .LBB12_0:
483; GFX940-NEXT:    v_mov_b32_e32 v3, 0
484; GFX940-NEXT:    v_mov_b32_e32 v0, s6
485; GFX940-NEXT:    v_mov_b32_e32 v1, s7
486; GFX940-NEXT:    v_mov_b32_e32 v2, s8
487; GFX940-NEXT:    global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1
488; GFX940-NEXT:    s_endpgm
489;
490; GFX90a-LABEL: v3f32_preload_arg:
491; GFX90a:       ; %bb.1:
492; GFX90a-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
493; GFX90a-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x8
494; GFX90a-NEXT:    s_load_dwordx2 s[12:13], s[4:5], 0x18
495; GFX90a-NEXT:    s_waitcnt lgkmcnt(0)
496; GFX90a-NEXT:    s_branch .LBB12_0
497; GFX90a-NEXT:    .p2align 8
498; GFX90a-NEXT:  ; %bb.2:
499; GFX90a-NEXT:  .LBB12_0:
500; GFX90a-NEXT:    v_mov_b32_e32 v3, 0
501; GFX90a-NEXT:    v_mov_b32_e32 v0, s10
502; GFX90a-NEXT:    v_mov_b32_e32 v1, s11
503; GFX90a-NEXT:    v_mov_b32_e32 v2, s12
504; GFX90a-NEXT:    global_store_dwordx3 v3, v[0:2], s[6:7]
505; GFX90a-NEXT:    s_endpgm
506  store <3 x float> %in, ptr addrspace(1) %out, align 4
507  ret void
508}
509
; Stores the inreg <5 x i8> %in through %out. The checks expect the vector to arrive via
; the s_load_dwordx2 at offset 0x8 in %bb.1. The body rebuilds the first dword from its
; pieces (s_lshr_b32 / s_bfe_u32 / s_lshl_b32 / s_or_b32), writes the second SGPR with
; global_store_byte at offset:4 and the rebuilt dword with global_store_dword. The two
; targets differ only in scalar instruction order and scratch register choice.
510define amdgpu_kernel void @v5i8_preload_arg(ptr addrspace(1) nocapture inreg %out, <5 x i8> inreg %in) #0 {
511; GFX940-LABEL: v5i8_preload_arg:
512; GFX940:       ; %bb.1:
513; GFX940-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
514; GFX940-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x8
515; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
516; GFX940-NEXT:    s_branch .LBB13_0
517; GFX940-NEXT:    .p2align 8
518; GFX940-NEXT:  ; %bb.2:
519; GFX940-NEXT:  .LBB13_0:
520; GFX940-NEXT:    s_lshr_b32 s1, s4, 24
521; GFX940-NEXT:    s_and_b32 s0, s4, 0xffff
522; GFX940-NEXT:    s_lshl_b32 s1, s1, 8
523; GFX940-NEXT:    s_bfe_u32 s4, s4, 0x80010
524; GFX940-NEXT:    s_or_b32 s1, s4, s1
525; GFX940-NEXT:    s_lshl_b32 s1, s1, 16
526; GFX940-NEXT:    s_or_b32 s0, s0, s1
527; GFX940-NEXT:    v_mov_b32_e32 v0, 0
528; GFX940-NEXT:    v_mov_b32_e32 v1, s5
529; GFX940-NEXT:    global_store_byte v0, v1, s[2:3] offset:4 sc0 sc1
530; GFX940-NEXT:    v_mov_b32_e32 v1, s0
531; GFX940-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
532; GFX940-NEXT:    s_endpgm
533;
534; GFX90a-LABEL: v5i8_preload_arg:
535; GFX90a:       ; %bb.1:
536; GFX90a-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
537; GFX90a-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x8
538; GFX90a-NEXT:    s_waitcnt lgkmcnt(0)
539; GFX90a-NEXT:    s_branch .LBB13_0
540; GFX90a-NEXT:    .p2align 8
541; GFX90a-NEXT:  ; %bb.2:
542; GFX90a-NEXT:  .LBB13_0:
543; GFX90a-NEXT:    s_lshr_b32 s1, s8, 24
544; GFX90a-NEXT:    s_lshl_b32 s1, s1, 8
545; GFX90a-NEXT:    s_bfe_u32 s2, s8, 0x80010
546; GFX90a-NEXT:    s_or_b32 s1, s2, s1
547; GFX90a-NEXT:    s_and_b32 s0, s8, 0xffff
548; GFX90a-NEXT:    s_lshl_b32 s1, s1, 16
549; GFX90a-NEXT:    s_or_b32 s0, s0, s1
550; GFX90a-NEXT:    v_mov_b32_e32 v0, 0
551; GFX90a-NEXT:    v_mov_b32_e32 v1, s9
552; GFX90a-NEXT:    global_store_byte v0, v1, s[6:7] offset:4
553; GFX90a-NEXT:    v_mov_b32_e32 v1, s0
554; GFX90a-NEXT:    global_store_dword v0, v1, s[6:7]
555; GFX90a-NEXT:    s_endpgm
556  store <5 x i8> %in, ptr addrspace(1) %out, align 4
557  ret void
558}
559
; Stores the inreg <5 x double> %in through %out. Although the vector is marked inreg,
; the checks expect %bb.1 to load only offset 0x0; the body fetches the vector with
; s_load_dwordx2 at 0x60 and s_load_dwordx8 at 0x40 and writes it as a dwordx2 store at
; offset:32 followed by dwordx4 stores at offset:16 and offset 0. The 64-bit SGPR pair
; is copied with v_mov_b64_e32 on gfx940 and with v_pk_mov_b32 on gfx90a.
560define amdgpu_kernel void @v5f64_arg(ptr addrspace(1) nocapture inreg %out, <5 x double> inreg %in) #0 {
561; GFX940-LABEL: v5f64_arg:
562; GFX940:       ; %bb.1:
563; GFX940-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
564; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
565; GFX940-NEXT:    s_branch .LBB14_0
566; GFX940-NEXT:    .p2align 8
567; GFX940-NEXT:  ; %bb.2:
568; GFX940-NEXT:  .LBB14_0:
569; GFX940-NEXT:    s_load_dwordx2 s[12:13], s[0:1], 0x60
570; GFX940-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x40
571; GFX940-NEXT:    v_mov_b32_e32 v4, 0
572; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
573; GFX940-NEXT:    v_mov_b64_e32 v[2:3], s[12:13]
574; GFX940-NEXT:    v_mov_b32_e32 v0, s8
575; GFX940-NEXT:    global_store_dwordx2 v4, v[2:3], s[2:3] offset:32 sc0 sc1
576; GFX940-NEXT:    v_mov_b32_e32 v1, s9
577; GFX940-NEXT:    v_mov_b32_e32 v2, s10
578; GFX940-NEXT:    v_mov_b32_e32 v3, s11
579; GFX940-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3] offset:16 sc0 sc1
580; GFX940-NEXT:    s_nop 1
581; GFX940-NEXT:    v_mov_b32_e32 v0, s4
582; GFX940-NEXT:    v_mov_b32_e32 v1, s5
583; GFX940-NEXT:    v_mov_b32_e32 v2, s6
584; GFX940-NEXT:    v_mov_b32_e32 v3, s7
585; GFX940-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3] sc0 sc1
586; GFX940-NEXT:    s_endpgm
587;
588; GFX90a-LABEL: v5f64_arg:
589; GFX90a:       ; %bb.1:
590; GFX90a-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
591; GFX90a-NEXT:    s_waitcnt lgkmcnt(0)
592; GFX90a-NEXT:    s_branch .LBB14_0
593; GFX90a-NEXT:    .p2align 8
594; GFX90a-NEXT:  ; %bb.2:
595; GFX90a-NEXT:  .LBB14_0:
596; GFX90a-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x60
597; GFX90a-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x40
598; GFX90a-NEXT:    v_mov_b32_e32 v4, 0
599; GFX90a-NEXT:    s_waitcnt lgkmcnt(0)
600; GFX90a-NEXT:    v_pk_mov_b32 v[2:3], s[0:1], s[0:1] op_sel:[0,1]
601; GFX90a-NEXT:    v_mov_b32_e32 v0, s12
602; GFX90a-NEXT:    global_store_dwordx2 v4, v[2:3], s[6:7] offset:32
603; GFX90a-NEXT:    v_mov_b32_e32 v1, s13
604; GFX90a-NEXT:    v_mov_b32_e32 v2, s14
605; GFX90a-NEXT:    v_mov_b32_e32 v3, s15
606; GFX90a-NEXT:    global_store_dwordx4 v4, v[0:3], s[6:7] offset:16
607; GFX90a-NEXT:    s_nop 0
608; GFX90a-NEXT:    v_mov_b32_e32 v0, s8
609; GFX90a-NEXT:    v_mov_b32_e32 v1, s9
610; GFX90a-NEXT:    v_mov_b32_e32 v2, s10
611; GFX90a-NEXT:    v_mov_b32_e32 v3, s11
612; GFX90a-NEXT:    global_store_dwordx4 v4, v[0:3], s[6:7]
613; GFX90a-NEXT:    s_endpgm
614  store <5 x double> %in, ptr addrspace(1) %out, align 8
615  ret void
616}
617
; Stores the inreg <8 x i8> %in through %out. The checks expect the vector to arrive via
; the s_load_dwordx2 at offset 0x8 in %bb.1. The body rebuilds each of the two dwords
; from its pieces (s_lshr_b32 / s_bfe_u32 / s_lshl_b32 / s_and_b32 / s_or_b32) and then
; writes both with a single global_store_dwordx2. The two targets schedule the scalar
; instructions differently and use different scratch SGPRs.
618define amdgpu_kernel void @v8i8_preload_arg(ptr addrspace(1) inreg %out, <8 x i8> inreg %in) #0 {
619; GFX940-LABEL: v8i8_preload_arg:
620; GFX940:       ; %bb.1:
621; GFX940-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
622; GFX940-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x8
623; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
624; GFX940-NEXT:    s_branch .LBB15_0
625; GFX940-NEXT:    .p2align 8
626; GFX940-NEXT:  ; %bb.2:
627; GFX940-NEXT:  .LBB15_0:
628; GFX940-NEXT:    s_lshr_b32 s1, s5, 24
629; GFX940-NEXT:    s_and_b32 s0, s5, 0xffff
630; GFX940-NEXT:    s_lshl_b32 s1, s1, 8
631; GFX940-NEXT:    s_bfe_u32 s5, s5, 0x80010
632; GFX940-NEXT:    s_or_b32 s1, s5, s1
633; GFX940-NEXT:    s_lshl_b32 s1, s1, 16
634; GFX940-NEXT:    s_lshr_b32 s5, s4, 24
635; GFX940-NEXT:    s_or_b32 s0, s0, s1
636; GFX940-NEXT:    s_and_b32 s1, s4, 0xffff
637; GFX940-NEXT:    s_lshl_b32 s5, s5, 8
638; GFX940-NEXT:    s_bfe_u32 s4, s4, 0x80010
639; GFX940-NEXT:    s_or_b32 s4, s4, s5
640; GFX940-NEXT:    s_lshl_b32 s4, s4, 16
641; GFX940-NEXT:    s_or_b32 s1, s1, s4
642; GFX940-NEXT:    v_mov_b32_e32 v0, s1
643; GFX940-NEXT:    v_mov_b32_e32 v1, s0
644; GFX940-NEXT:    v_mov_b32_e32 v2, 0
645; GFX940-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3] sc0 sc1
646; GFX940-NEXT:    s_endpgm
647;
648; GFX90a-LABEL: v8i8_preload_arg:
649; GFX90a:       ; %bb.1:
650; GFX90a-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
651; GFX90a-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x8
652; GFX90a-NEXT:    s_waitcnt lgkmcnt(0)
653; GFX90a-NEXT:    s_branch .LBB15_0
654; GFX90a-NEXT:    .p2align 8
655; GFX90a-NEXT:  ; %bb.2:
656; GFX90a-NEXT:  .LBB15_0:
657; GFX90a-NEXT:    s_lshr_b32 s1, s9, 24
658; GFX90a-NEXT:    s_lshl_b32 s1, s1, 8
659; GFX90a-NEXT:    s_bfe_u32 s2, s9, 0x80010
660; GFX90a-NEXT:    s_or_b32 s1, s2, s1
661; GFX90a-NEXT:    s_lshr_b32 s2, s8, 24
662; GFX90a-NEXT:    s_lshl_b32 s2, s2, 8
663; GFX90a-NEXT:    s_bfe_u32 s3, s8, 0x80010
664; GFX90a-NEXT:    s_and_b32 s0, s9, 0xffff
665; GFX90a-NEXT:    s_lshl_b32 s1, s1, 16
666; GFX90a-NEXT:    s_or_b32 s2, s3, s2
667; GFX90a-NEXT:    s_or_b32 s0, s0, s1
668; GFX90a-NEXT:    s_and_b32 s1, s8, 0xffff
669; GFX90a-NEXT:    s_lshl_b32 s2, s2, 16
670; GFX90a-NEXT:    s_or_b32 s1, s1, s2
671; GFX90a-NEXT:    v_mov_b32_e32 v0, s1
672; GFX90a-NEXT:    v_mov_b32_e32 v1, s0
673; GFX90a-NEXT:    v_mov_b32_e32 v2, 0
674; GFX90a-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
675; GFX90a-NEXT:    s_endpgm
676  store <8 x i8> %in, ptr addrspace(1) %out
677  ret void
678}
679
; Stores the inreg i64 %a through %out. The checks expect the value to arrive via the
; s_load_dwordx2 at offset 0x8 in %bb.1; the body copies the SGPR pair to VGPRs with
; v_mov_b64_e32 on gfx940 and v_pk_mov_b32 on gfx90a, then issues global_store_dwordx2.
680define amdgpu_kernel void @i64_kernel_preload_arg(ptr addrspace(1) inreg %out, i64 inreg %a) #0 {
681; GFX940-LABEL: i64_kernel_preload_arg:
682; GFX940:       ; %bb.1:
683; GFX940-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
684; GFX940-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x8
685; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
686; GFX940-NEXT:    s_branch .LBB16_0
687; GFX940-NEXT:    .p2align 8
688; GFX940-NEXT:  ; %bb.2:
689; GFX940-NEXT:  .LBB16_0:
690; GFX940-NEXT:    v_mov_b32_e32 v2, 0
691; GFX940-NEXT:    v_mov_b64_e32 v[0:1], s[4:5]
692; GFX940-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3] sc0 sc1
693; GFX940-NEXT:    s_endpgm
694;
695; GFX90a-LABEL: i64_kernel_preload_arg:
696; GFX90a:       ; %bb.1:
697; GFX90a-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
698; GFX90a-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x8
699; GFX90a-NEXT:    s_waitcnt lgkmcnt(0)
700; GFX90a-NEXT:    s_branch .LBB16_0
701; GFX90a-NEXT:    .p2align 8
702; GFX90a-NEXT:  ; %bb.2:
703; GFX90a-NEXT:  .LBB16_0:
704; GFX90a-NEXT:    v_mov_b32_e32 v2, 0
705; GFX90a-NEXT:    v_pk_mov_b32 v[0:1], s[8:9], s[8:9] op_sel:[0,1]
706; GFX90a-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
707; GFX90a-NEXT:    s_endpgm
708  store i64 %a, ptr addrspace(1) %out, align 8
709  ret void
710}
711
; Stores the inreg double %in through %out. The checks expect the value to arrive via the
; s_load_dwordx2 at offset 0x8 in %bb.1; the body copies the SGPR pair to VGPRs
; (v_mov_b64_e32 on gfx940, v_pk_mov_b32 on gfx90a) and issues global_store_dwordx2.
712define amdgpu_kernel void @f64_kernel_preload_arg(ptr addrspace(1) inreg %out, double inreg %in) #0 {
713; GFX940-LABEL: f64_kernel_preload_arg:
714; GFX940:       ; %bb.1:
715; GFX940-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
716; GFX940-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x8
717; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
718; GFX940-NEXT:    s_branch .LBB17_0
719; GFX940-NEXT:    .p2align 8
720; GFX940-NEXT:  ; %bb.2:
721; GFX940-NEXT:  .LBB17_0:
722; GFX940-NEXT:    v_mov_b32_e32 v2, 0
723; GFX940-NEXT:    v_mov_b64_e32 v[0:1], s[4:5]
724; GFX940-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3] sc0 sc1
725; GFX940-NEXT:    s_endpgm
726;
727; GFX90a-LABEL: f64_kernel_preload_arg:
728; GFX90a:       ; %bb.1:
729; GFX90a-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
730; GFX90a-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x8
731; GFX90a-NEXT:    s_waitcnt lgkmcnt(0)
732; GFX90a-NEXT:    s_branch .LBB17_0
733; GFX90a-NEXT:    .p2align 8
734; GFX90a-NEXT:  ; %bb.2:
735; GFX90a-NEXT:  .LBB17_0:
736; GFX90a-NEXT:    v_mov_b32_e32 v2, 0
737; GFX90a-NEXT:    v_pk_mov_b32 v[0:1], s[8:9], s[8:9] op_sel:[0,1]
738; GFX90a-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
739; GFX90a-NEXT:    s_endpgm
740  store double %in, ptr addrspace(1) %out
741  ret void
742}
743
; Stores the inreg half %in through %out. The checks expect the dword at offset 0x8 to be
; loaded in %bb.1 and the body to write it with global_store_short without masking it
; first.
744define amdgpu_kernel void @half_kernel_preload_arg(ptr addrspace(1) inreg %out, half inreg %in) #0 {
745; GFX940-LABEL: half_kernel_preload_arg:
746; GFX940:       ; %bb.1:
747; GFX940-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
748; GFX940-NEXT:    s_load_dword s4, s[0:1], 0x8
749; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
750; GFX940-NEXT:    s_branch .LBB18_0
751; GFX940-NEXT:    .p2align 8
752; GFX940-NEXT:  ; %bb.2:
753; GFX940-NEXT:  .LBB18_0:
754; GFX940-NEXT:    v_mov_b32_e32 v0, 0
755; GFX940-NEXT:    v_mov_b32_e32 v1, s4
756; GFX940-NEXT:    global_store_short v0, v1, s[2:3] sc0 sc1
757; GFX940-NEXT:    s_endpgm
758;
759; GFX90a-LABEL: half_kernel_preload_arg:
760; GFX90a:       ; %bb.1:
761; GFX90a-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
762; GFX90a-NEXT:    s_load_dword s8, s[4:5], 0x8
763; GFX90a-NEXT:    s_waitcnt lgkmcnt(0)
764; GFX90a-NEXT:    s_branch .LBB18_0
765; GFX90a-NEXT:    .p2align 8
766; GFX90a-NEXT:  ; %bb.2:
767; GFX90a-NEXT:  .LBB18_0:
768; GFX90a-NEXT:    v_mov_b32_e32 v0, 0
769; GFX90a-NEXT:    v_mov_b32_e32 v1, s8
770; GFX90a-NEXT:    global_store_short v0, v1, s[6:7]
771; GFX90a-NEXT:    s_endpgm
772  store half %in, ptr addrspace(1) %out
773  ret void
774}
775
; Stores the inreg bfloat %in through %out. The checks expect the dword at offset 0x8 to
; be loaded in %bb.1 and the body to copy that SGPR to a VGPR and write it with
; global_store_short.
776define amdgpu_kernel void @bfloat_kernel_preload_arg(ptr addrspace(1) inreg %out, bfloat inreg %in) #0 {
777; GFX940-LABEL: bfloat_kernel_preload_arg:
778; GFX940:       ; %bb.1:
779; GFX940-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
780; GFX940-NEXT:    s_load_dword s4, s[0:1], 0x8
781; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
782; GFX940-NEXT:    s_branch .LBB19_0
783; GFX940-NEXT:    .p2align 8
784; GFX940-NEXT:  ; %bb.2:
785; GFX940-NEXT:  .LBB19_0:
786; GFX940-NEXT:    v_mov_b32_e32 v0, 0
787; GFX940-NEXT:    v_mov_b32_e32 v1, s4
788; GFX940-NEXT:    global_store_short v0, v1, s[2:3] sc0 sc1
789; GFX940-NEXT:    s_endpgm
790;
791; GFX90a-LABEL: bfloat_kernel_preload_arg:
792; GFX90a:       ; %bb.1:
793; GFX90a-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
794; GFX90a-NEXT:    s_load_dword s8, s[4:5], 0x8
795; GFX90a-NEXT:    s_waitcnt lgkmcnt(0)
796; GFX90a-NEXT:    s_branch .LBB19_0
797; GFX90a-NEXT:    .p2align 8
798; GFX90a-NEXT:  ; %bb.2:
799; GFX90a-NEXT:  .LBB19_0:
800; GFX90a-NEXT:    v_mov_b32_e32 v0, 0
801; GFX90a-NEXT:    v_mov_b32_e32 v1, s8
802; GFX90a-NEXT:    global_store_short v0, v1, s[6:7]
803; GFX90a-NEXT:    s_endpgm
804  store bfloat %in, ptr addrspace(1) %out
805  ret void
806}
807
; Stores a <2 x bfloat> kernel argument marked inreg through %out. The checks
; expect the whole vector in one SGPR (loaded with s_load_dword at kernarg
; offset 0x8 in the %bb.1 block) and a single global_store_dword.
808define amdgpu_kernel void @v2bfloat_kernel_preload_arg(ptr addrspace(1) inreg %out, <2 x bfloat> inreg %in) #0 {
809; GFX940-LABEL: v2bfloat_kernel_preload_arg:
810; GFX940:       ; %bb.1:
811; GFX940-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
812; GFX940-NEXT:    s_load_dword s4, s[0:1], 0x8
813; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
814; GFX940-NEXT:    s_branch .LBB20_0
815; GFX940-NEXT:    .p2align 8
816; GFX940-NEXT:  ; %bb.2:
817; GFX940-NEXT:  .LBB20_0:
818; GFX940-NEXT:    v_mov_b32_e32 v0, 0
819; GFX940-NEXT:    v_mov_b32_e32 v1, s4
820; GFX940-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
821; GFX940-NEXT:    s_endpgm
822;
823; GFX90a-LABEL: v2bfloat_kernel_preload_arg:
824; GFX90a:       ; %bb.1:
825; GFX90a-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
826; GFX90a-NEXT:    s_load_dword s8, s[4:5], 0x8
827; GFX90a-NEXT:    s_waitcnt lgkmcnt(0)
828; GFX90a-NEXT:    s_branch .LBB20_0
829; GFX90a-NEXT:    .p2align 8
830; GFX90a-NEXT:  ; %bb.2:
831; GFX90a-NEXT:  .LBB20_0:
832; GFX90a-NEXT:    v_mov_b32_e32 v0, 0
833; GFX90a-NEXT:    v_mov_b32_e32 v1, s8
834; GFX90a-NEXT:    global_store_dword v0, v1, s[6:7]
835; GFX90a-NEXT:    s_endpgm
836  store <2 x bfloat> %in, ptr addrspace(1) %out
837  ret void
838}
839
; Stores a <3 x bfloat> kernel argument marked inreg through %out. The checks
; expect the value in an SGPR pair loaded with s_load_dwordx2 at kernarg offset
; 0x8 in the %bb.1 block, written as a dword at offset 0 plus a short at
; offset:4.
840define amdgpu_kernel void @v3bfloat_kernel_preload_arg(ptr addrspace(1) inreg %out, <3 x bfloat> inreg %in) #0 {
841; GFX940-LABEL: v3bfloat_kernel_preload_arg:
842; GFX940:       ; %bb.1:
843; GFX940-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
844; GFX940-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x8
845; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
846; GFX940-NEXT:    s_branch .LBB21_0
847; GFX940-NEXT:    .p2align 8
848; GFX940-NEXT:  ; %bb.2:
849; GFX940-NEXT:  .LBB21_0:
850; GFX940-NEXT:    v_mov_b32_e32 v0, 0
851; GFX940-NEXT:    v_mov_b32_e32 v1, s5
852; GFX940-NEXT:    global_store_short v0, v1, s[2:3] offset:4 sc0 sc1
853; GFX940-NEXT:    v_mov_b32_e32 v1, s4
854; GFX940-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
855; GFX940-NEXT:    s_endpgm
856;
857; GFX90a-LABEL: v3bfloat_kernel_preload_arg:
858; GFX90a:       ; %bb.1:
859; GFX90a-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
860; GFX90a-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x8
861; GFX90a-NEXT:    s_waitcnt lgkmcnt(0)
862; GFX90a-NEXT:    s_branch .LBB21_0
863; GFX90a-NEXT:    .p2align 8
864; GFX90a-NEXT:  ; %bb.2:
865; GFX90a-NEXT:  .LBB21_0:
866; GFX90a-NEXT:    v_mov_b32_e32 v0, 0
867; GFX90a-NEXT:    v_mov_b32_e32 v1, s9
868; GFX90a-NEXT:    global_store_short v0, v1, s[6:7] offset:4
869; GFX90a-NEXT:    v_mov_b32_e32 v1, s8
870; GFX90a-NEXT:    global_store_dword v0, v1, s[6:7]
871; GFX90a-NEXT:    s_endpgm
872  store <3 x bfloat> %in, ptr addrspace(1) %out
873  ret void
874}
875
; Stores a <6 x bfloat> kernel argument marked inreg through %out. In the
; checks the value is not adjacent to %out: the three dwords that are stored
; (global_store_dwordx3) come from the SGPRs loaded from kernarg offsets
; 0x10 through 0x18 in the %bb.1 block.
876define amdgpu_kernel void @v6bfloat_kernel_preload_arg(ptr addrspace(1) inreg %out, <6 x bfloat> inreg %in) #0 {
877; GFX940-LABEL: v6bfloat_kernel_preload_arg:
878; GFX940:       ; %bb.1:
879; GFX940-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
880; GFX940-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x8
881; GFX940-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x18
882; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
883; GFX940-NEXT:    s_branch .LBB22_0
884; GFX940-NEXT:    .p2align 8
885; GFX940-NEXT:  ; %bb.2:
886; GFX940-NEXT:  .LBB22_0:
887; GFX940-NEXT:    v_mov_b32_e32 v0, s6
888; GFX940-NEXT:    v_mov_b32_e32 v1, s7
889; GFX940-NEXT:    v_mov_b32_e32 v2, s8
890; GFX940-NEXT:    v_mov_b32_e32 v3, 0
891; GFX940-NEXT:    global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1
892; GFX940-NEXT:    s_endpgm
893;
894; GFX90a-LABEL: v6bfloat_kernel_preload_arg:
895; GFX90a:       ; %bb.1:
896; GFX90a-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
897; GFX90a-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x8
898; GFX90a-NEXT:    s_load_dwordx2 s[12:13], s[4:5], 0x18
899; GFX90a-NEXT:    s_waitcnt lgkmcnt(0)
900; GFX90a-NEXT:    s_branch .LBB22_0
901; GFX90a-NEXT:    .p2align 8
902; GFX90a-NEXT:  ; %bb.2:
903; GFX90a-NEXT:  .LBB22_0:
904; GFX90a-NEXT:    v_mov_b32_e32 v0, s10
905; GFX90a-NEXT:    v_mov_b32_e32 v1, s11
906; GFX90a-NEXT:    v_mov_b32_e32 v2, s12
907; GFX90a-NEXT:    v_mov_b32_e32 v3, 0
908; GFX90a-NEXT:    global_store_dwordx3 v3, v[0:2], s[6:7]
909; GFX90a-NEXT:    s_endpgm
910  store <6 x bfloat> %in, ptr addrspace(1) %out
911  ret void
912}
913
; Mixes a half, a <7 x bfloat> and a second pointer, all marked inreg. Per the
; checks, the <7 x bfloat> is written as a dwordx3 plus a short at offset:12.
; The two targets differ for %out2: the gfx940 checks use s[10:11] from the
; s_load_dwordx8 in the %bb.1 block, while the gfx90a checks load it in the body
; from kernarg offset 0x20 and wait on lgkmcnt before the first store through it.
914define amdgpu_kernel void @half_v7bfloat_kernel_preload_arg(ptr addrspace(1) inreg %out, half inreg %in, <7 x bfloat> inreg %in2, ptr addrspace(1) inreg %out2) #0 {
915; GFX940-LABEL: half_v7bfloat_kernel_preload_arg:
916; GFX940:       ; %bb.1:
917; GFX940-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
918; GFX940-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x8
919; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
920; GFX940-NEXT:    s_branch .LBB23_0
921; GFX940-NEXT:    .p2align 8
922; GFX940-NEXT:  ; %bb.2:
923; GFX940-NEXT:  .LBB23_0:
924; GFX940-NEXT:    v_mov_b32_e32 v3, 0
925; GFX940-NEXT:    v_mov_b32_e32 v0, s4
926; GFX940-NEXT:    global_store_short v3, v0, s[2:3] sc0 sc1
927; GFX940-NEXT:    v_mov_b32_e32 v0, s9
928; GFX940-NEXT:    global_store_short v3, v0, s[10:11] offset:12 sc0 sc1
929; GFX940-NEXT:    v_mov_b32_e32 v2, s8
930; GFX940-NEXT:    v_mov_b32_e32 v0, s6
931; GFX940-NEXT:    v_mov_b32_e32 v1, s7
932; GFX940-NEXT:    global_store_dwordx3 v3, v[0:2], s[10:11] sc0 sc1
933; GFX940-NEXT:    s_endpgm
934;
935; GFX90a-LABEL: half_v7bfloat_kernel_preload_arg:
936; GFX90a:       ; %bb.1:
937; GFX90a-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
938; GFX90a-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x8
939; GFX90a-NEXT:    s_waitcnt lgkmcnt(0)
940; GFX90a-NEXT:    s_branch .LBB23_0
941; GFX90a-NEXT:    .p2align 8
942; GFX90a-NEXT:  ; %bb.2:
943; GFX90a-NEXT:  .LBB23_0:
944; GFX90a-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x20
945; GFX90a-NEXT:    v_mov_b32_e32 v3, 0
946; GFX90a-NEXT:    v_mov_b32_e32 v0, s8
947; GFX90a-NEXT:    global_store_short v3, v0, s[6:7]
948; GFX90a-NEXT:    v_mov_b32_e32 v0, s13
949; GFX90a-NEXT:    s_waitcnt lgkmcnt(0)
950; GFX90a-NEXT:    global_store_short v3, v0, s[0:1] offset:12
951; GFX90a-NEXT:    v_mov_b32_e32 v2, s12
952; GFX90a-NEXT:    v_mov_b32_e32 v0, s10
953; GFX90a-NEXT:    v_mov_b32_e32 v1, s11
954; GFX90a-NEXT:    global_store_dwordx3 v3, v[0:2], s[0:1]
955; GFX90a-NEXT:    s_endpgm
956  store half %in, ptr addrspace(1) %out
957  store <7 x bfloat> %in2, ptr addrspace(1) %out2
958  ret void
959}
960
; Stores an i1 kernel argument marked inreg through %out. The checks expect the
; dword loaded in the %bb.1 block to be masked to its low bit (s_and_b32 with 1)
; and written with global_store_byte.
961define amdgpu_kernel void @i1_kernel_preload_arg(ptr addrspace(1) inreg %out, i1 inreg %in) #0 {
962; GFX940-LABEL: i1_kernel_preload_arg:
963; GFX940:       ; %bb.1:
964; GFX940-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
965; GFX940-NEXT:    s_load_dword s4, s[0:1], 0x8
966; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
967; GFX940-NEXT:    s_branch .LBB24_0
968; GFX940-NEXT:    .p2align 8
969; GFX940-NEXT:  ; %bb.2:
970; GFX940-NEXT:  .LBB24_0:
971; GFX940-NEXT:    s_and_b32 s0, s4, 1
972; GFX940-NEXT:    v_mov_b32_e32 v0, 0
973; GFX940-NEXT:    v_mov_b32_e32 v1, s0
974; GFX940-NEXT:    global_store_byte v0, v1, s[2:3] sc0 sc1
975; GFX940-NEXT:    s_endpgm
976;
977; GFX90a-LABEL: i1_kernel_preload_arg:
978; GFX90a:       ; %bb.1:
979; GFX90a-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
980; GFX90a-NEXT:    s_load_dword s8, s[4:5], 0x8
981; GFX90a-NEXT:    s_waitcnt lgkmcnt(0)
982; GFX90a-NEXT:    s_branch .LBB24_0
983; GFX90a-NEXT:    .p2align 8
984; GFX90a-NEXT:  ; %bb.2:
985; GFX90a-NEXT:  .LBB24_0:
986; GFX90a-NEXT:    s_and_b32 s0, s8, 1
987; GFX90a-NEXT:    v_mov_b32_e32 v0, 0
988; GFX90a-NEXT:    v_mov_b32_e32 v1, s0
989; GFX90a-NEXT:    global_store_byte v0, v1, s[6:7]
990; GFX90a-NEXT:    s_endpgm
991  store i1 %in, ptr addrspace(1) %out
992  ret void
993}
994
; Stores an fp128 kernel argument marked inreg through %out. The checks expect
; the four dwords of the value to come from the SGPRs loaded from kernarg
; offsets 0x10 through 0x1c in the %bb.1 block, written with one
; global_store_dwordx4.
995define amdgpu_kernel void @fp128_kernel_preload_arg(ptr addrspace(1) inreg %out, fp128 inreg %in) #0 {
996; GFX940-LABEL: fp128_kernel_preload_arg:
997; GFX940:       ; %bb.1:
998; GFX940-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
999; GFX940-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x8
1000; GFX940-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x18
1001; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
1002; GFX940-NEXT:    s_branch .LBB25_0
1003; GFX940-NEXT:    .p2align 8
1004; GFX940-NEXT:  ; %bb.2:
1005; GFX940-NEXT:  .LBB25_0:
1006; GFX940-NEXT:    v_mov_b32_e32 v4, 0
1007; GFX940-NEXT:    v_mov_b32_e32 v0, s6
1008; GFX940-NEXT:    v_mov_b32_e32 v1, s7
1009; GFX940-NEXT:    v_mov_b32_e32 v2, s8
1010; GFX940-NEXT:    v_mov_b32_e32 v3, s9
1011; GFX940-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3] sc0 sc1
1012; GFX940-NEXT:    s_endpgm
1013;
1014; GFX90a-LABEL: fp128_kernel_preload_arg:
1015; GFX90a:       ; %bb.1:
1016; GFX90a-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
1017; GFX90a-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x8
1018; GFX90a-NEXT:    s_load_dwordx2 s[12:13], s[4:5], 0x18
1019; GFX90a-NEXT:    s_waitcnt lgkmcnt(0)
1020; GFX90a-NEXT:    s_branch .LBB25_0
1021; GFX90a-NEXT:    .p2align 8
1022; GFX90a-NEXT:  ; %bb.2:
1023; GFX90a-NEXT:  .LBB25_0:
1024; GFX90a-NEXT:    v_mov_b32_e32 v4, 0
1025; GFX90a-NEXT:    v_mov_b32_e32 v0, s10
1026; GFX90a-NEXT:    v_mov_b32_e32 v1, s11
1027; GFX90a-NEXT:    v_mov_b32_e32 v2, s12
1028; GFX90a-NEXT:    v_mov_b32_e32 v3, s13
1029; GFX90a-NEXT:    global_store_dwordx4 v4, v[0:3], s[6:7]
1030; GFX90a-NEXT:    s_endpgm
1031  store fp128 %in, ptr addrspace(1) %out
1032  ret void
1033}
1034
; Stores a <7 x i8> kernel argument marked inreg through %out. The checks expect
; the value in an SGPR pair loaded at kernarg offset 0x8 in the %bb.1 block. The
; low dword is rebuilt with a scalar shift/bfe/or sequence and written with
; global_store_dword; the high dword supplies a short at offset:4 and a
; d16_hi byte at offset:6.
1035define amdgpu_kernel void @v7i8_kernel_preload_arg(ptr addrspace(1) inreg %out, <7 x i8> inreg %in) #0 {
1036; GFX940-LABEL: v7i8_kernel_preload_arg:
1037; GFX940:       ; %bb.1:
1038; GFX940-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
1039; GFX940-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x8
1040; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
1041; GFX940-NEXT:    s_branch .LBB26_0
1042; GFX940-NEXT:    .p2align 8
1043; GFX940-NEXT:  ; %bb.2:
1044; GFX940-NEXT:  .LBB26_0:
1045; GFX940-NEXT:    s_lshr_b32 s1, s4, 24
1046; GFX940-NEXT:    s_and_b32 s0, s4, 0xffff
1047; GFX940-NEXT:    s_lshl_b32 s1, s1, 8
1048; GFX940-NEXT:    s_bfe_u32 s4, s4, 0x80010
1049; GFX940-NEXT:    s_or_b32 s1, s4, s1
1050; GFX940-NEXT:    s_lshl_b32 s1, s1, 16
1051; GFX940-NEXT:    s_or_b32 s0, s0, s1
1052; GFX940-NEXT:    v_mov_b32_e32 v0, 0
1053; GFX940-NEXT:    v_mov_b32_e32 v1, s5
1054; GFX940-NEXT:    global_store_byte_d16_hi v0, v1, s[2:3] offset:6 sc0 sc1
1055; GFX940-NEXT:    global_store_short v0, v1, s[2:3] offset:4 sc0 sc1
1056; GFX940-NEXT:    v_mov_b32_e32 v1, s0
1057; GFX940-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
1058; GFX940-NEXT:    s_endpgm
1059;
1060; GFX90a-LABEL: v7i8_kernel_preload_arg:
1061; GFX90a:       ; %bb.1:
1062; GFX90a-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
1063; GFX90a-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x8
1064; GFX90a-NEXT:    s_waitcnt lgkmcnt(0)
1065; GFX90a-NEXT:    s_branch .LBB26_0
1066; GFX90a-NEXT:    .p2align 8
1067; GFX90a-NEXT:  ; %bb.2:
1068; GFX90a-NEXT:  .LBB26_0:
1069; GFX90a-NEXT:    s_lshr_b32 s1, s8, 24
1070; GFX90a-NEXT:    s_lshl_b32 s1, s1, 8
1071; GFX90a-NEXT:    s_bfe_u32 s2, s8, 0x80010
1072; GFX90a-NEXT:    s_or_b32 s1, s2, s1
1073; GFX90a-NEXT:    s_and_b32 s0, s8, 0xffff
1074; GFX90a-NEXT:    s_lshl_b32 s1, s1, 16
1075; GFX90a-NEXT:    s_or_b32 s0, s0, s1
1076; GFX90a-NEXT:    v_mov_b32_e32 v0, 0
1077; GFX90a-NEXT:    v_mov_b32_e32 v1, s9
1078; GFX90a-NEXT:    global_store_byte_d16_hi v0, v1, s[6:7] offset:6
1079; GFX90a-NEXT:    global_store_short v0, v1, s[6:7] offset:4
1080; GFX90a-NEXT:    v_mov_b32_e32 v1, s0
1081; GFX90a-NEXT:    global_store_dword v0, v1, s[6:7]
1082; GFX90a-NEXT:    s_endpgm
1083  store <7 x i8> %in, ptr addrspace(1) %out
1084  ret void
1085}
1086
; Stores a <7 x half> kernel argument marked inreg through %out. The checks
; expect the stored dwords to come from the SGPRs loaded from kernarg offsets
; 0x10 through 0x1c in the %bb.1 block, written as a global_store_dwordx3 plus
; a global_store_short at offset:12.
1087define amdgpu_kernel void @v7half_kernel_preload_arg(ptr addrspace(1) inreg %out, <7 x half> inreg %in) #0 {
1088; GFX940-LABEL: v7half_kernel_preload_arg:
1089; GFX940:       ; %bb.1:
1090; GFX940-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
1091; GFX940-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x8
1092; GFX940-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x18
1093; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
1094; GFX940-NEXT:    s_branch .LBB27_0
1095; GFX940-NEXT:    .p2align 8
1096; GFX940-NEXT:  ; %bb.2:
1097; GFX940-NEXT:  .LBB27_0:
1098; GFX940-NEXT:    v_mov_b32_e32 v3, 0
1099; GFX940-NEXT:    v_mov_b32_e32 v0, s9
1100; GFX940-NEXT:    global_store_short v3, v0, s[2:3] offset:12 sc0 sc1
1101; GFX940-NEXT:    v_mov_b32_e32 v2, s8
1102; GFX940-NEXT:    v_mov_b32_e32 v0, s6
1103; GFX940-NEXT:    v_mov_b32_e32 v1, s7
1104; GFX940-NEXT:    global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1
1105; GFX940-NEXT:    s_endpgm
1106;
1107; GFX90a-LABEL: v7half_kernel_preload_arg:
1108; GFX90a:       ; %bb.1:
1109; GFX90a-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
1110; GFX90a-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x8
1111; GFX90a-NEXT:    s_load_dwordx2 s[12:13], s[4:5], 0x18
1112; GFX90a-NEXT:    s_waitcnt lgkmcnt(0)
1113; GFX90a-NEXT:    s_branch .LBB27_0
1114; GFX90a-NEXT:    .p2align 8
1115; GFX90a-NEXT:  ; %bb.2:
1116; GFX90a-NEXT:  .LBB27_0:
1117; GFX90a-NEXT:    v_mov_b32_e32 v3, 0
1118; GFX90a-NEXT:    v_mov_b32_e32 v0, s13
1119; GFX90a-NEXT:    global_store_short v3, v0, s[6:7] offset:12
1120; GFX90a-NEXT:    v_mov_b32_e32 v2, s12
1121; GFX90a-NEXT:    v_mov_b32_e32 v0, s10
1122; GFX90a-NEXT:    v_mov_b32_e32 v1, s11
1123; GFX90a-NEXT:    global_store_dwordx3 v3, v[0:2], s[6:7]
1124; GFX90a-NEXT:    s_endpgm
1125  store <7 x half> %in, ptr addrspace(1) %out
1126  ret void
1127}
1128
; Two scalars of different width between two pointers, all marked inreg. The
; checks expect every argument to come from the loads in the %bb.1 block on
; both targets: %in is written with global_store_short through %out and %in2
; with global_store_dword through %out2.
1129define amdgpu_kernel void @i16_i32_kernel_preload_arg(ptr addrspace(1) inreg %out, i16 inreg %in, i32 inreg %in2, ptr addrspace(1) inreg %out2) #0 {
1130; GFX940-LABEL: i16_i32_kernel_preload_arg:
1131; GFX940:       ; %bb.1:
1132; GFX940-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
1133; GFX940-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x8
1134; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
1135; GFX940-NEXT:    s_branch .LBB28_0
1136; GFX940-NEXT:    .p2align 8
1137; GFX940-NEXT:  ; %bb.2:
1138; GFX940-NEXT:  .LBB28_0:
1139; GFX940-NEXT:    v_mov_b32_e32 v0, 0
1140; GFX940-NEXT:    v_mov_b32_e32 v1, s4
1141; GFX940-NEXT:    global_store_short v0, v1, s[2:3] sc0 sc1
1142; GFX940-NEXT:    v_mov_b32_e32 v1, s5
1143; GFX940-NEXT:    global_store_dword v0, v1, s[6:7] sc0 sc1
1144; GFX940-NEXT:    s_endpgm
1145;
1146; GFX90a-LABEL: i16_i32_kernel_preload_arg:
1147; GFX90a:       ; %bb.1:
1148; GFX90a-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
1149; GFX90a-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x8
1150; GFX90a-NEXT:    s_waitcnt lgkmcnt(0)
1151; GFX90a-NEXT:    s_branch .LBB28_0
1152; GFX90a-NEXT:    .p2align 8
1153; GFX90a-NEXT:  ; %bb.2:
1154; GFX90a-NEXT:  .LBB28_0:
1155; GFX90a-NEXT:    v_mov_b32_e32 v0, 0
1156; GFX90a-NEXT:    v_mov_b32_e32 v1, s8
1157; GFX90a-NEXT:    global_store_short v0, v1, s[6:7]
1158; GFX90a-NEXT:    v_mov_b32_e32 v1, s9
1159; GFX90a-NEXT:    global_store_dword v0, v1, s[10:11]
1160; GFX90a-NEXT:    s_endpgm
1161  store i16 %in, ptr addrspace(1) %out
1162  store i32 %in2, ptr addrspace(1) %out2
1163  ret void
1164}
1165
; An i16 followed by a <3 x i32> and a second pointer, all marked inreg. The
; checks expect a global_store_short for %in and a global_store_dwordx3 for
; %in2. As the checks stand, gfx940 takes %out2 from s[10:11] of the
; s_load_dwordx8 in the %bb.1 block, whereas gfx90a loads it in the body from
; kernarg offset 0x20 and waits on lgkmcnt before storing through it.
1166define amdgpu_kernel void @i16_v3i32_kernel_preload_arg(ptr addrspace(1) inreg %out, i16 inreg %in, <3 x i32> inreg %in2, ptr addrspace(1) inreg %out2) #0 {
1167; GFX940-LABEL: i16_v3i32_kernel_preload_arg:
1168; GFX940:       ; %bb.1:
1169; GFX940-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
1170; GFX940-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x8
1171; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
1172; GFX940-NEXT:    s_branch .LBB29_0
1173; GFX940-NEXT:    .p2align 8
1174; GFX940-NEXT:  ; %bb.2:
1175; GFX940-NEXT:  .LBB29_0:
1176; GFX940-NEXT:    v_mov_b32_e32 v3, 0
1177; GFX940-NEXT:    v_mov_b32_e32 v4, s4
1178; GFX940-NEXT:    v_mov_b32_e32 v0, s6
1179; GFX940-NEXT:    v_mov_b32_e32 v1, s7
1180; GFX940-NEXT:    v_mov_b32_e32 v2, s8
1181; GFX940-NEXT:    global_store_short v3, v4, s[2:3] sc0 sc1
1182; GFX940-NEXT:    global_store_dwordx3 v3, v[0:2], s[10:11] sc0 sc1
1183; GFX940-NEXT:    s_endpgm
1184;
1185; GFX90a-LABEL: i16_v3i32_kernel_preload_arg:
1186; GFX90a:       ; %bb.1:
1187; GFX90a-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
1188; GFX90a-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x8
1189; GFX90a-NEXT:    s_waitcnt lgkmcnt(0)
1190; GFX90a-NEXT:    s_branch .LBB29_0
1191; GFX90a-NEXT:    .p2align 8
1192; GFX90a-NEXT:  ; %bb.2:
1193; GFX90a-NEXT:  .LBB29_0:
1194; GFX90a-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x20
1195; GFX90a-NEXT:    v_mov_b32_e32 v3, 0
1196; GFX90a-NEXT:    v_mov_b32_e32 v4, s8
1197; GFX90a-NEXT:    v_mov_b32_e32 v0, s10
1198; GFX90a-NEXT:    v_mov_b32_e32 v1, s11
1199; GFX90a-NEXT:    v_mov_b32_e32 v2, s12
1200; GFX90a-NEXT:    global_store_short v3, v4, s[6:7]
1201; GFX90a-NEXT:    s_waitcnt lgkmcnt(0)
1202; GFX90a-NEXT:    global_store_dwordx3 v3, v[0:2], s[0:1]
1203; GFX90a-NEXT:    s_endpgm
1204  store i16 %in, ptr addrspace(1) %out
1205  store <3 x i32> %in2, ptr addrspace(1) %out2
1206  ret void
1207}
1208
; Two i16 arguments marked inreg between two pointers. The checks expect both
; values to come from the same 32-bit register: %in is written with
; global_store_short and %in2 with global_store_short_d16_hi from the same
; VGPR, with no extra shift or mask.
1209define amdgpu_kernel void @i16_i16_kernel_preload_arg(ptr addrspace(1) inreg %out, i16 inreg %in, i16 inreg %in2, ptr addrspace(1) inreg %out2) #0 {
1210; GFX940-LABEL: i16_i16_kernel_preload_arg:
1211; GFX940:       ; %bb.1:
1212; GFX940-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
1213; GFX940-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x8
1214; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
1215; GFX940-NEXT:    s_branch .LBB30_0
1216; GFX940-NEXT:    .p2align 8
1217; GFX940-NEXT:  ; %bb.2:
1218; GFX940-NEXT:  .LBB30_0:
1219; GFX940-NEXT:    v_mov_b32_e32 v0, 0
1220; GFX940-NEXT:    v_mov_b32_e32 v1, s4
1221; GFX940-NEXT:    global_store_short v0, v1, s[2:3] sc0 sc1
1222; GFX940-NEXT:    global_store_short_d16_hi v0, v1, s[6:7] sc0 sc1
1223; GFX940-NEXT:    s_endpgm
1224;
1225; GFX90a-LABEL: i16_i16_kernel_preload_arg:
1226; GFX90a:       ; %bb.1:
1227; GFX90a-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
1228; GFX90a-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x8
1229; GFX90a-NEXT:    s_waitcnt lgkmcnt(0)
1230; GFX90a-NEXT:    s_branch .LBB30_0
1231; GFX90a-NEXT:    .p2align 8
1232; GFX90a-NEXT:  ; %bb.2:
1233; GFX90a-NEXT:  .LBB30_0:
1234; GFX90a-NEXT:    v_mov_b32_e32 v0, 0
1235; GFX90a-NEXT:    v_mov_b32_e32 v1, s8
1236; GFX90a-NEXT:    global_store_short v0, v1, s[6:7]
1237; GFX90a-NEXT:    global_store_short_d16_hi v0, v1, s[10:11]
1238; GFX90a-NEXT:    s_endpgm
1239  store i16 %in, ptr addrspace(1) %out
1240  store i16 %in2, ptr addrspace(1) %out2
1241  ret void
1242}
1243
; An i16 and a <2 x i8> marked inreg between two pointers. The checks expect
; both values to come from the same SGPR: %in is stored directly with
; global_store_short, while %in2 is assembled from the upper half of that
; register (s_lshr_b32 by 24, s_lshl_b32 by 8, s_bfe_u32 0x80010, s_or_b32)
; before a second global_store_short through %out2.
1244define amdgpu_kernel void @i16_v2i8_kernel_preload_arg(ptr addrspace(1) inreg %out, i16 inreg %in, <2 x i8> inreg %in2, ptr addrspace(1) inreg %out2) #0 {
1245; GFX940-LABEL: i16_v2i8_kernel_preload_arg:
1246; GFX940:       ; %bb.1:
1247; GFX940-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
1248; GFX940-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x8
1249; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
1250; GFX940-NEXT:    s_branch .LBB31_0
1251; GFX940-NEXT:    .p2align 8
1252; GFX940-NEXT:  ; %bb.2:
1253; GFX940-NEXT:  .LBB31_0:
1254; GFX940-NEXT:    s_lshr_b32 s0, s4, 24
1255; GFX940-NEXT:    s_lshl_b32 s0, s0, 8
1256; GFX940-NEXT:    s_bfe_u32 s1, s4, 0x80010
1257; GFX940-NEXT:    s_or_b32 s0, s1, s0
1258; GFX940-NEXT:    v_mov_b32_e32 v0, 0
1259; GFX940-NEXT:    v_mov_b32_e32 v1, s4
1260; GFX940-NEXT:    global_store_short v0, v1, s[2:3] sc0 sc1
1261; GFX940-NEXT:    v_mov_b32_e32 v1, s0
1262; GFX940-NEXT:    global_store_short v0, v1, s[6:7] sc0 sc1
1263; GFX940-NEXT:    s_endpgm
1264;
1265; GFX90a-LABEL: i16_v2i8_kernel_preload_arg:
1266; GFX90a:       ; %bb.1:
1267; GFX90a-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
1268; GFX90a-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x8
1269; GFX90a-NEXT:    s_waitcnt lgkmcnt(0)
1270; GFX90a-NEXT:    s_branch .LBB31_0
1271; GFX90a-NEXT:    .p2align 8
1272; GFX90a-NEXT:  ; %bb.2:
1273; GFX90a-NEXT:  .LBB31_0:
1274; GFX90a-NEXT:    s_lshr_b32 s0, s8, 24
1275; GFX90a-NEXT:    s_lshl_b32 s0, s0, 8
1276; GFX90a-NEXT:    s_bfe_u32 s1, s8, 0x80010
1277; GFX90a-NEXT:    s_or_b32 s0, s1, s0
1278; GFX90a-NEXT:    v_mov_b32_e32 v0, 0
1279; GFX90a-NEXT:    v_mov_b32_e32 v1, s8
1280; GFX90a-NEXT:    global_store_short v0, v1, s[6:7]
1281; GFX90a-NEXT:    v_mov_b32_e32 v1, s0
1282; GFX90a-NEXT:    global_store_short v0, v1, s[10:11]
1283; GFX90a-NEXT:    s_endpgm
1284  store i16 %in, ptr addrspace(1) %out
1285  store <2 x i8> %in2, ptr addrspace(1) %out2
1286  ret void
1287}
1288
1289; The second argument is not expected to be preloaded with the current behavior.
1290
; %arg0 and %arg1 are marked inreg but %out, which sits between them, is not.
; The checks expect only %arg0 to be loaded in the %bb.1 block; %arg1 (kernarg
; offset 0x10) and %out (offset 0x8) are loaded in the body, followed by an
; s_waitcnt before the add and the store.
1291define amdgpu_kernel void @i32_ptr1_i32_staggered_preload_arg(i32 inreg %arg0, ptr addrspace(1) %out, i32 inreg %arg1) #0 {
1292; GFX940-LABEL: i32_ptr1_i32_staggered_preload_arg:
1293; GFX940:       ; %bb.1:
1294; GFX940-NEXT:    s_load_dword s2, s[0:1], 0x0
1295; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
1296; GFX940-NEXT:    s_branch .LBB32_0
1297; GFX940-NEXT:    .p2align 8
1298; GFX940-NEXT:  ; %bb.2:
1299; GFX940-NEXT:  .LBB32_0:
1300; GFX940-NEXT:    s_load_dword s3, s[0:1], 0x10
1301; GFX940-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x8
1302; GFX940-NEXT:    v_mov_b32_e32 v0, 0
1303; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
1304; GFX940-NEXT:    s_add_i32 s0, s2, s3
1305; GFX940-NEXT:    v_mov_b32_e32 v1, s0
1306; GFX940-NEXT:    global_store_dword v0, v1, s[4:5] sc0 sc1
1307; GFX940-NEXT:    s_endpgm
1308;
1309; GFX90a-LABEL: i32_ptr1_i32_staggered_preload_arg:
1310; GFX90a:       ; %bb.1:
1311; GFX90a-NEXT:    s_load_dword s6, s[4:5], 0x0
1312; GFX90a-NEXT:    s_waitcnt lgkmcnt(0)
1313; GFX90a-NEXT:    s_branch .LBB32_0
1314; GFX90a-NEXT:    .p2align 8
1315; GFX90a-NEXT:  ; %bb.2:
1316; GFX90a-NEXT:  .LBB32_0:
1317; GFX90a-NEXT:    s_load_dword s2, s[4:5], 0x10
1318; GFX90a-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
1319; GFX90a-NEXT:    v_mov_b32_e32 v0, 0
1320; GFX90a-NEXT:    s_waitcnt lgkmcnt(0)
1321; GFX90a-NEXT:    s_add_i32 s2, s6, s2
1322; GFX90a-NEXT:    v_mov_b32_e32 v1, s2
1323; GFX90a-NEXT:    global_store_dword v0, v1, s[0:1]
1324; GFX90a-NEXT:    s_endpgm
1325  %add = add i32 %arg0, %arg1
1326  store i32 %add, ptr addrspace(1) %out
1327  ret void
1328}
1329
; Adds a trailing i32 marked inreg that the IR body never references. The
; checks expect the %bb.1 block to load a dwordx2 at kernarg offset 0x8, of
; which only the first register is used: it is masked with 0xff for the zext
; and written with global_store_dword.
1330define amdgpu_kernel void @ptr1_i8_trailing_unused(ptr addrspace(1) inreg %out, i8 inreg %arg0, i32 inreg %unused) #0 {
1331; GFX940-LABEL: ptr1_i8_trailing_unused:
1332; GFX940:       ; %bb.1:
1333; GFX940-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
1334; GFX940-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x8
1335; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
1336; GFX940-NEXT:    s_branch .LBB33_0
1337; GFX940-NEXT:    .p2align 8
1338; GFX940-NEXT:  ; %bb.2:
1339; GFX940-NEXT:  .LBB33_0:
1340; GFX940-NEXT:    s_and_b32 s0, s4, 0xff
1341; GFX940-NEXT:    v_mov_b32_e32 v0, 0
1342; GFX940-NEXT:    v_mov_b32_e32 v1, s0
1343; GFX940-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
1344; GFX940-NEXT:    s_endpgm
1345;
1346; GFX90a-LABEL: ptr1_i8_trailing_unused:
1347; GFX90a:       ; %bb.1:
1348; GFX90a-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
1349; GFX90a-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x8
1350; GFX90a-NEXT:    s_waitcnt lgkmcnt(0)
1351; GFX90a-NEXT:    s_branch .LBB33_0
1352; GFX90a-NEXT:    .p2align 8
1353; GFX90a-NEXT:  ; %bb.2:
1354; GFX90a-NEXT:  .LBB33_0:
1355; GFX90a-NEXT:    s_and_b32 s0, s8, 0xff
1356; GFX90a-NEXT:    v_mov_b32_e32 v0, 0
1357; GFX90a-NEXT:    v_mov_b32_e32 v1, s0
1358; GFX90a-NEXT:    global_store_dword v0, v1, s[6:7]
1359; GFX90a-NEXT:    s_endpgm
1360  %ext = zext i8 %arg0 to i32
1361  store i32 %ext, ptr addrspace(1) %out
1362  ret void
1363}
1364
1365attributes #0 = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
1366