xref: /llvm-project/llvm/test/CodeGen/AMDGPU/kernel-args.ll (revision 6548b6354d1d990e1c98736f5e7c3de876bedc8e)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -mtriple=amdgcn -verify-machineinstrs | FileCheck -check-prefixes=SI %s
3; RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -check-prefixes=VI %s
4; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx900 -verify-machineinstrs | FileCheck -check-prefixes=GFX9 %s
5; RUN: llc < %s -mtriple=r600 -mcpu=redwood -verify-machineinstrs | FileCheck -check-prefixes=EGCM,EG %s
6; RUN: llc < %s -mtriple=r600 -mcpu=cayman -verify-machineinstrs | FileCheck -check-prefixes=EGCM,CM %s
7
; Kernel argument test for a plain i8 parameter (no zeroext/signext attribute).
; The body zero-extends %in to i32 and stores it to %out. The amdgcn checks
; below expect the loaded argument dword to be masked with 0xff before the
; store; the r600 checks expect the argument to be fetched with VTX_READ_8.
8define amdgpu_kernel void @i8_arg(ptr addrspace(1) nocapture %out, i8 %in) nounwind {
9; SI-LABEL: i8_arg:
10; SI:       ; %bb.0:
11; SI-NEXT:    s_load_dword s2, s[4:5], 0xb
12; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
13; SI-NEXT:    s_mov_b32 s3, 0xf000
14; SI-NEXT:    s_waitcnt lgkmcnt(0)
15; SI-NEXT:    s_and_b32 s4, s2, 0xff
16; SI-NEXT:    s_mov_b32 s2, -1
17; SI-NEXT:    v_mov_b32_e32 v0, s4
18; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
19; SI-NEXT:    s_endpgm
20;
21; VI-LABEL: i8_arg:
22; VI:       ; %bb.0:
23; VI-NEXT:    s_load_dword s2, s[4:5], 0x2c
24; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
25; VI-NEXT:    s_waitcnt lgkmcnt(0)
26; VI-NEXT:    s_and_b32 s2, s2, 0xff
27; VI-NEXT:    v_mov_b32_e32 v0, s0
28; VI-NEXT:    v_mov_b32_e32 v1, s1
29; VI-NEXT:    v_mov_b32_e32 v2, s2
30; VI-NEXT:    flat_store_dword v[0:1], v2
31; VI-NEXT:    s_endpgm
32;
33; GFX9-LABEL: i8_arg:
34; GFX9:       ; %bb.0:
35; GFX9-NEXT:    s_load_dword s2, s[8:9], 0x8
36; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
37; GFX9-NEXT:    v_mov_b32_e32 v0, 0
38; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
39; GFX9-NEXT:    s_and_b32 s2, s2, 0xff
40; GFX9-NEXT:    v_mov_b32_e32 v1, s2
41; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
42; GFX9-NEXT:    s_endpgm
43;
44; EG-LABEL: i8_arg:
45; EG:       ; %bb.0:
46; EG-NEXT:    ALU 0, @8, KC0[], KC1[]
47; EG-NEXT:    TEX 0 @6
48; EG-NEXT:    ALU 1, @9, KC0[CB0:0-32], KC1[]
49; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
50; EG-NEXT:    CF_END
51; EG-NEXT:    PAD
52; EG-NEXT:    Fetch clause starting at 6:
53; EG-NEXT:     VTX_READ_8 T0.X, T0.X, 40, #3
54; EG-NEXT:    ALU clause starting at 8:
55; EG-NEXT:     MOV * T0.X, 0.0,
56; EG-NEXT:    ALU clause starting at 9:
57; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
58; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
59;
60; CM-LABEL: i8_arg:
61; CM:       ; %bb.0:
62; CM-NEXT:    ALU 0, @8, KC0[], KC1[]
63; CM-NEXT:    TEX 0 @6
64; CM-NEXT:    ALU 1, @9, KC0[CB0:0-32], KC1[]
65; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X
66; CM-NEXT:    CF_END
67; CM-NEXT:    PAD
68; CM-NEXT:    Fetch clause starting at 6:
69; CM-NEXT:     VTX_READ_8 T0.X, T0.X, 40, #3
70; CM-NEXT:    ALU clause starting at 8:
71; CM-NEXT:     MOV * T0.X, 0.0,
72; CM-NEXT:    ALU clause starting at 9:
73; CM-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
74; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
75  %ext = zext i8 %in to i32
76  store i32 %ext, ptr addrspace(1) %out, align 4
77  ret void
78}
79
; Kernel argument test for an i8 parameter marked zeroext. The body
; zero-extends %in to i32 and stores it to %out; the amdgcn checks expect the
; loaded argument dword to be masked with 0xff.
; NOTE(review): the r600 (redwood/cayman) checks expect BFE_INT here, exactly
; as in @i8_sext_arg - surprising for a zeroext argument; confirm this is the
; intended codegen before relying on these assertions.
80define amdgpu_kernel void @i8_zext_arg(ptr addrspace(1) nocapture %out, i8 zeroext %in) nounwind {
81; SI-LABEL: i8_zext_arg:
82; SI:       ; %bb.0:
83; SI-NEXT:    s_load_dword s2, s[4:5], 0xb
84; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
85; SI-NEXT:    s_mov_b32 s3, 0xf000
86; SI-NEXT:    s_waitcnt lgkmcnt(0)
87; SI-NEXT:    s_and_b32 s4, s2, 0xff
88; SI-NEXT:    s_mov_b32 s2, -1
89; SI-NEXT:    v_mov_b32_e32 v0, s4
90; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
91; SI-NEXT:    s_endpgm
92;
93; VI-LABEL: i8_zext_arg:
94; VI:       ; %bb.0:
95; VI-NEXT:    s_load_dword s2, s[4:5], 0x2c
96; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
97; VI-NEXT:    s_waitcnt lgkmcnt(0)
98; VI-NEXT:    s_and_b32 s2, s2, 0xff
99; VI-NEXT:    v_mov_b32_e32 v0, s0
100; VI-NEXT:    v_mov_b32_e32 v1, s1
101; VI-NEXT:    v_mov_b32_e32 v2, s2
102; VI-NEXT:    flat_store_dword v[0:1], v2
103; VI-NEXT:    s_endpgm
104;
105; GFX9-LABEL: i8_zext_arg:
106; GFX9:       ; %bb.0:
107; GFX9-NEXT:    s_load_dword s2, s[8:9], 0x8
108; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
109; GFX9-NEXT:    v_mov_b32_e32 v0, 0
110; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
111; GFX9-NEXT:    s_and_b32 s2, s2, 0xff
112; GFX9-NEXT:    v_mov_b32_e32 v1, s2
113; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
114; GFX9-NEXT:    s_endpgm
115;
116; EG-LABEL: i8_zext_arg:
117; EG:       ; %bb.0:
118; EG-NEXT:    ALU 0, @8, KC0[], KC1[]
119; EG-NEXT:    TEX 0 @6
120; EG-NEXT:    ALU 2, @9, KC0[CB0:0-32], KC1[]
121; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
122; EG-NEXT:    CF_END
123; EG-NEXT:    PAD
124; EG-NEXT:    Fetch clause starting at 6:
125; EG-NEXT:     VTX_READ_8 T0.X, T0.X, 40, #3
126; EG-NEXT:    ALU clause starting at 8:
127; EG-NEXT:     MOV * T0.X, 0.0,
128; EG-NEXT:    ALU clause starting at 9:
129; EG-NEXT:     BFE_INT T0.X, T0.X, 0.0, literal.x,
130; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.y,
131; EG-NEXT:    8(1.121039e-44), 2(2.802597e-45)
132;
133; CM-LABEL: i8_zext_arg:
134; CM:       ; %bb.0:
135; CM-NEXT:    ALU 0, @8, KC0[], KC1[]
136; CM-NEXT:    TEX 0 @6
137; CM-NEXT:    ALU 3, @9, KC0[CB0:0-32], KC1[]
138; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X
139; CM-NEXT:    CF_END
140; CM-NEXT:    PAD
141; CM-NEXT:    Fetch clause starting at 6:
142; CM-NEXT:     VTX_READ_8 T0.X, T0.X, 40, #3
143; CM-NEXT:    ALU clause starting at 8:
144; CM-NEXT:     MOV * T0.X, 0.0,
145; CM-NEXT:    ALU clause starting at 9:
146; CM-NEXT:     BFE_INT * T0.X, T0.X, 0.0, literal.x,
147; CM-NEXT:    8(1.121039e-44), 0(0.000000e+00)
148; CM-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
149; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
150  %ext = zext i8 %in to i32
151  store i32 %ext, ptr addrspace(1) %out, align 4
152  ret void
153}
154
; Kernel argument test for an i8 parameter marked signext. The body
; sign-extends %in to i32 and stores it to %out. The amdgcn checks expect
; s_sext_i32_i8 on the loaded argument dword; the r600 checks expect a
; VTX_READ_8 fetch followed by BFE_INT with the literal 8.
155define amdgpu_kernel void @i8_sext_arg(ptr addrspace(1) nocapture %out, i8 signext %in) nounwind {
156; SI-LABEL: i8_sext_arg:
157; SI:       ; %bb.0:
158; SI-NEXT:    s_load_dword s2, s[4:5], 0xb
159; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
160; SI-NEXT:    s_mov_b32 s3, 0xf000
161; SI-NEXT:    s_waitcnt lgkmcnt(0)
162; SI-NEXT:    s_sext_i32_i8 s4, s2
163; SI-NEXT:    s_mov_b32 s2, -1
164; SI-NEXT:    v_mov_b32_e32 v0, s4
165; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
166; SI-NEXT:    s_endpgm
167;
168; VI-LABEL: i8_sext_arg:
169; VI:       ; %bb.0:
170; VI-NEXT:    s_load_dword s2, s[4:5], 0x2c
171; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
172; VI-NEXT:    s_waitcnt lgkmcnt(0)
173; VI-NEXT:    s_sext_i32_i8 s2, s2
174; VI-NEXT:    v_mov_b32_e32 v0, s0
175; VI-NEXT:    v_mov_b32_e32 v1, s1
176; VI-NEXT:    v_mov_b32_e32 v2, s2
177; VI-NEXT:    flat_store_dword v[0:1], v2
178; VI-NEXT:    s_endpgm
179;
180; GFX9-LABEL: i8_sext_arg:
181; GFX9:       ; %bb.0:
182; GFX9-NEXT:    s_load_dword s2, s[8:9], 0x8
183; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
184; GFX9-NEXT:    v_mov_b32_e32 v0, 0
185; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
186; GFX9-NEXT:    s_sext_i32_i8 s2, s2
187; GFX9-NEXT:    v_mov_b32_e32 v1, s2
188; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
189; GFX9-NEXT:    s_endpgm
190;
191; EG-LABEL: i8_sext_arg:
192; EG:       ; %bb.0:
193; EG-NEXT:    ALU 0, @8, KC0[], KC1[]
194; EG-NEXT:    TEX 0 @6
195; EG-NEXT:    ALU 2, @9, KC0[CB0:0-32], KC1[]
196; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
197; EG-NEXT:    CF_END
198; EG-NEXT:    PAD
199; EG-NEXT:    Fetch clause starting at 6:
200; EG-NEXT:     VTX_READ_8 T0.X, T0.X, 40, #3
201; EG-NEXT:    ALU clause starting at 8:
202; EG-NEXT:     MOV * T0.X, 0.0,
203; EG-NEXT:    ALU clause starting at 9:
204; EG-NEXT:     BFE_INT T0.X, T0.X, 0.0, literal.x,
205; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.y,
206; EG-NEXT:    8(1.121039e-44), 2(2.802597e-45)
207;
208; CM-LABEL: i8_sext_arg:
209; CM:       ; %bb.0:
210; CM-NEXT:    ALU 0, @8, KC0[], KC1[]
211; CM-NEXT:    TEX 0 @6
212; CM-NEXT:    ALU 3, @9, KC0[CB0:0-32], KC1[]
213; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X
214; CM-NEXT:    CF_END
215; CM-NEXT:    PAD
216; CM-NEXT:    Fetch clause starting at 6:
217; CM-NEXT:     VTX_READ_8 T0.X, T0.X, 40, #3
218; CM-NEXT:    ALU clause starting at 8:
219; CM-NEXT:     MOV * T0.X, 0.0,
220; CM-NEXT:    ALU clause starting at 9:
221; CM-NEXT:     BFE_INT * T0.X, T0.X, 0.0, literal.x,
222; CM-NEXT:    8(1.121039e-44), 0(0.000000e+00)
223; CM-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
224; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
225  %ext = sext i8 %in to i32
226  store i32 %ext, ptr addrspace(1) %out, align 4
227  ret void
228}
229
; Kernel argument test for a plain i16 parameter (no zeroext/signext
; attribute). The body zero-extends %in to i32 and stores it to %out. The
; amdgcn checks expect the loaded argument dword to be masked with 0xffff; the
; r600 checks expect the argument to be fetched with VTX_READ_16.
230define amdgpu_kernel void @i16_arg(ptr addrspace(1) nocapture %out, i16 %in) nounwind {
231; SI-LABEL: i16_arg:
232; SI:       ; %bb.0:
233; SI-NEXT:    s_load_dword s2, s[4:5], 0xb
234; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
235; SI-NEXT:    s_mov_b32 s3, 0xf000
236; SI-NEXT:    s_waitcnt lgkmcnt(0)
237; SI-NEXT:    s_and_b32 s4, s2, 0xffff
238; SI-NEXT:    s_mov_b32 s2, -1
239; SI-NEXT:    v_mov_b32_e32 v0, s4
240; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
241; SI-NEXT:    s_endpgm
242;
243; VI-LABEL: i16_arg:
244; VI:       ; %bb.0:
245; VI-NEXT:    s_load_dword s2, s[4:5], 0x2c
246; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
247; VI-NEXT:    s_waitcnt lgkmcnt(0)
248; VI-NEXT:    s_and_b32 s2, s2, 0xffff
249; VI-NEXT:    v_mov_b32_e32 v0, s0
250; VI-NEXT:    v_mov_b32_e32 v1, s1
251; VI-NEXT:    v_mov_b32_e32 v2, s2
252; VI-NEXT:    flat_store_dword v[0:1], v2
253; VI-NEXT:    s_endpgm
254;
255; GFX9-LABEL: i16_arg:
256; GFX9:       ; %bb.0:
257; GFX9-NEXT:    s_load_dword s2, s[8:9], 0x8
258; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
259; GFX9-NEXT:    v_mov_b32_e32 v0, 0
260; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
261; GFX9-NEXT:    s_and_b32 s2, s2, 0xffff
262; GFX9-NEXT:    v_mov_b32_e32 v1, s2
263; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
264; GFX9-NEXT:    s_endpgm
265;
266; EG-LABEL: i16_arg:
267; EG:       ; %bb.0:
268; EG-NEXT:    ALU 0, @8, KC0[], KC1[]
269; EG-NEXT:    TEX 0 @6
270; EG-NEXT:    ALU 1, @9, KC0[CB0:0-32], KC1[]
271; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
272; EG-NEXT:    CF_END
273; EG-NEXT:    PAD
274; EG-NEXT:    Fetch clause starting at 6:
275; EG-NEXT:     VTX_READ_16 T0.X, T0.X, 40, #3
276; EG-NEXT:    ALU clause starting at 8:
277; EG-NEXT:     MOV * T0.X, 0.0,
278; EG-NEXT:    ALU clause starting at 9:
279; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
280; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
281;
282; CM-LABEL: i16_arg:
283; CM:       ; %bb.0:
284; CM-NEXT:    ALU 0, @8, KC0[], KC1[]
285; CM-NEXT:    TEX 0 @6
286; CM-NEXT:    ALU 1, @9, KC0[CB0:0-32], KC1[]
287; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X
288; CM-NEXT:    CF_END
289; CM-NEXT:    PAD
290; CM-NEXT:    Fetch clause starting at 6:
291; CM-NEXT:     VTX_READ_16 T0.X, T0.X, 40, #3
292; CM-NEXT:    ALU clause starting at 8:
293; CM-NEXT:     MOV * T0.X, 0.0,
294; CM-NEXT:    ALU clause starting at 9:
295; CM-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
296; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
297  %ext = zext i16 %in to i32
298  store i32 %ext, ptr addrspace(1) %out, align 4
299  ret void
300}
301
; Kernel argument test for an i16 parameter marked zeroext. The body
; zero-extends %in to i32 and stores it to %out; the amdgcn checks expect the
; loaded argument dword to be masked with 0xffff.
; NOTE(review): the r600 (redwood/cayman) checks expect BFE_INT here, exactly
; as in @i16_sext_arg - surprising for a zeroext argument; confirm this is the
; intended codegen before relying on these assertions.
302define amdgpu_kernel void @i16_zext_arg(ptr addrspace(1) nocapture %out, i16 zeroext %in) nounwind {
303; SI-LABEL: i16_zext_arg:
304; SI:       ; %bb.0:
305; SI-NEXT:    s_load_dword s2, s[4:5], 0xb
306; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
307; SI-NEXT:    s_mov_b32 s3, 0xf000
308; SI-NEXT:    s_waitcnt lgkmcnt(0)
309; SI-NEXT:    s_and_b32 s4, s2, 0xffff
310; SI-NEXT:    s_mov_b32 s2, -1
311; SI-NEXT:    v_mov_b32_e32 v0, s4
312; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
313; SI-NEXT:    s_endpgm
314;
315; VI-LABEL: i16_zext_arg:
316; VI:       ; %bb.0:
317; VI-NEXT:    s_load_dword s2, s[4:5], 0x2c
318; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
319; VI-NEXT:    s_waitcnt lgkmcnt(0)
320; VI-NEXT:    s_and_b32 s2, s2, 0xffff
321; VI-NEXT:    v_mov_b32_e32 v0, s0
322; VI-NEXT:    v_mov_b32_e32 v1, s1
323; VI-NEXT:    v_mov_b32_e32 v2, s2
324; VI-NEXT:    flat_store_dword v[0:1], v2
325; VI-NEXT:    s_endpgm
326;
327; GFX9-LABEL: i16_zext_arg:
328; GFX9:       ; %bb.0:
329; GFX9-NEXT:    s_load_dword s2, s[8:9], 0x8
330; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
331; GFX9-NEXT:    v_mov_b32_e32 v0, 0
332; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
333; GFX9-NEXT:    s_and_b32 s2, s2, 0xffff
334; GFX9-NEXT:    v_mov_b32_e32 v1, s2
335; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
336; GFX9-NEXT:    s_endpgm
337;
338; EG-LABEL: i16_zext_arg:
339; EG:       ; %bb.0:
340; EG-NEXT:    ALU 0, @8, KC0[], KC1[]
341; EG-NEXT:    TEX 0 @6
342; EG-NEXT:    ALU 2, @9, KC0[CB0:0-32], KC1[]
343; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
344; EG-NEXT:    CF_END
345; EG-NEXT:    PAD
346; EG-NEXT:    Fetch clause starting at 6:
347; EG-NEXT:     VTX_READ_16 T0.X, T0.X, 40, #3
348; EG-NEXT:    ALU clause starting at 8:
349; EG-NEXT:     MOV * T0.X, 0.0,
350; EG-NEXT:    ALU clause starting at 9:
351; EG-NEXT:     BFE_INT T0.X, T0.X, 0.0, literal.x,
352; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.y,
353; EG-NEXT:    16(2.242078e-44), 2(2.802597e-45)
354;
355; CM-LABEL: i16_zext_arg:
356; CM:       ; %bb.0:
357; CM-NEXT:    ALU 0, @8, KC0[], KC1[]
358; CM-NEXT:    TEX 0 @6
359; CM-NEXT:    ALU 3, @9, KC0[CB0:0-32], KC1[]
360; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X
361; CM-NEXT:    CF_END
362; CM-NEXT:    PAD
363; CM-NEXT:    Fetch clause starting at 6:
364; CM-NEXT:     VTX_READ_16 T0.X, T0.X, 40, #3
365; CM-NEXT:    ALU clause starting at 8:
366; CM-NEXT:     MOV * T0.X, 0.0,
367; CM-NEXT:    ALU clause starting at 9:
368; CM-NEXT:     BFE_INT * T0.X, T0.X, 0.0, literal.x,
369; CM-NEXT:    16(2.242078e-44), 0(0.000000e+00)
370; CM-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
371; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
372  %ext = zext i16 %in to i32
373  store i32 %ext, ptr addrspace(1) %out, align 4
374  ret void
375}
376
; Kernel argument test for an i16 parameter marked signext. The body
; sign-extends %in to i32 and stores it to %out. The amdgcn checks expect
; s_sext_i32_i16 on the loaded argument dword; the r600 checks expect a
; VTX_READ_16 fetch followed by BFE_INT with the literal 16.
377define amdgpu_kernel void @i16_sext_arg(ptr addrspace(1) nocapture %out, i16 signext %in) nounwind {
378; SI-LABEL: i16_sext_arg:
379; SI:       ; %bb.0:
380; SI-NEXT:    s_load_dword s2, s[4:5], 0xb
381; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
382; SI-NEXT:    s_mov_b32 s3, 0xf000
383; SI-NEXT:    s_waitcnt lgkmcnt(0)
384; SI-NEXT:    s_sext_i32_i16 s4, s2
385; SI-NEXT:    s_mov_b32 s2, -1
386; SI-NEXT:    v_mov_b32_e32 v0, s4
387; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
388; SI-NEXT:    s_endpgm
389;
390; VI-LABEL: i16_sext_arg:
391; VI:       ; %bb.0:
392; VI-NEXT:    s_load_dword s2, s[4:5], 0x2c
393; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
394; VI-NEXT:    s_waitcnt lgkmcnt(0)
395; VI-NEXT:    s_sext_i32_i16 s2, s2
396; VI-NEXT:    v_mov_b32_e32 v0, s0
397; VI-NEXT:    v_mov_b32_e32 v1, s1
398; VI-NEXT:    v_mov_b32_e32 v2, s2
399; VI-NEXT:    flat_store_dword v[0:1], v2
400; VI-NEXT:    s_endpgm
401;
402; GFX9-LABEL: i16_sext_arg:
403; GFX9:       ; %bb.0:
404; GFX9-NEXT:    s_load_dword s2, s[8:9], 0x8
405; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
406; GFX9-NEXT:    v_mov_b32_e32 v0, 0
407; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
408; GFX9-NEXT:    s_sext_i32_i16 s2, s2
409; GFX9-NEXT:    v_mov_b32_e32 v1, s2
410; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
411; GFX9-NEXT:    s_endpgm
412;
413; EG-LABEL: i16_sext_arg:
414; EG:       ; %bb.0:
415; EG-NEXT:    ALU 0, @8, KC0[], KC1[]
416; EG-NEXT:    TEX 0 @6
417; EG-NEXT:    ALU 2, @9, KC0[CB0:0-32], KC1[]
418; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
419; EG-NEXT:    CF_END
420; EG-NEXT:    PAD
421; EG-NEXT:    Fetch clause starting at 6:
422; EG-NEXT:     VTX_READ_16 T0.X, T0.X, 40, #3
423; EG-NEXT:    ALU clause starting at 8:
424; EG-NEXT:     MOV * T0.X, 0.0,
425; EG-NEXT:    ALU clause starting at 9:
426; EG-NEXT:     BFE_INT T0.X, T0.X, 0.0, literal.x,
427; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.y,
428; EG-NEXT:    16(2.242078e-44), 2(2.802597e-45)
429;
430; CM-LABEL: i16_sext_arg:
431; CM:       ; %bb.0:
432; CM-NEXT:    ALU 0, @8, KC0[], KC1[]
433; CM-NEXT:    TEX 0 @6
434; CM-NEXT:    ALU 3, @9, KC0[CB0:0-32], KC1[]
435; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X
436; CM-NEXT:    CF_END
437; CM-NEXT:    PAD
438; CM-NEXT:    Fetch clause starting at 6:
439; CM-NEXT:     VTX_READ_16 T0.X, T0.X, 40, #3
440; CM-NEXT:    ALU clause starting at 8:
441; CM-NEXT:     MOV * T0.X, 0.0,
442; CM-NEXT:    ALU clause starting at 9:
443; CM-NEXT:     BFE_INT * T0.X, T0.X, 0.0, literal.x,
444; CM-NEXT:    16(2.242078e-44), 0(0.000000e+00)
445; CM-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
446; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
447  %ext = sext i16 %in to i32
448  store i32 %ext, ptr addrspace(1) %out, align 4
449  ret void
450}
451
; Kernel argument test for an i32 parameter stored unchanged to %out. The
; amdgcn checks expect the loaded scalar to be copied to a VGPR and written
; with a single dword store, with no masking or extension instruction; the
; r600 checks expect a MOV from KC0[2].Z with no fetch clause.
452define amdgpu_kernel void @i32_arg(ptr addrspace(1) nocapture %out, i32 %in) nounwind {
453; SI-LABEL: i32_arg:
454; SI:       ; %bb.0: ; %entry
455; SI-NEXT:    s_load_dword s6, s[4:5], 0xb
456; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
457; SI-NEXT:    s_mov_b32 s3, 0xf000
458; SI-NEXT:    s_mov_b32 s2, -1
459; SI-NEXT:    s_waitcnt lgkmcnt(0)
460; SI-NEXT:    v_mov_b32_e32 v0, s6
461; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
462; SI-NEXT:    s_endpgm
463;
464; VI-LABEL: i32_arg:
465; VI:       ; %bb.0: ; %entry
466; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
467; VI-NEXT:    s_load_dword s2, s[4:5], 0x2c
468; VI-NEXT:    s_waitcnt lgkmcnt(0)
469; VI-NEXT:    v_mov_b32_e32 v0, s0
470; VI-NEXT:    v_mov_b32_e32 v1, s1
471; VI-NEXT:    v_mov_b32_e32 v2, s2
472; VI-NEXT:    flat_store_dword v[0:1], v2
473; VI-NEXT:    s_endpgm
474;
475; GFX9-LABEL: i32_arg:
476; GFX9:       ; %bb.0: ; %entry
477; GFX9-NEXT:    s_load_dword s2, s[8:9], 0x8
478; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
479; GFX9-NEXT:    v_mov_b32_e32 v0, 0
480; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
481; GFX9-NEXT:    v_mov_b32_e32 v1, s2
482; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
483; GFX9-NEXT:    s_endpgm
484;
485; EG-LABEL: i32_arg:
486; EG:       ; %bb.0: ; %entry
487; EG-NEXT:    ALU 2, @4, KC0[CB0:0-32], KC1[]
488; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
489; EG-NEXT:    CF_END
490; EG-NEXT:    PAD
491; EG-NEXT:    ALU clause starting at 4:
492; EG-NEXT:     LSHR T0.X, KC0[2].Y, literal.x,
493; EG-NEXT:     MOV * T1.X, KC0[2].Z,
494; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
495;
496; CM-LABEL: i32_arg:
497; CM:       ; %bb.0: ; %entry
498; CM-NEXT:    ALU 2, @4, KC0[CB0:0-32], KC1[]
499; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T1.X, T0.X
500; CM-NEXT:    CF_END
501; CM-NEXT:    PAD
502; CM-NEXT:    ALU clause starting at 4:
503; CM-NEXT:     LSHR * T0.X, KC0[2].Y, literal.x,
504; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
505; CM-NEXT:     MOV * T1.X, KC0[2].Z,
506entry:
507  store i32 %in, ptr addrspace(1) %out, align 4
508  ret void
509}
510
; Kernel argument test for a float parameter stored unchanged to %out. The
; expected instruction sequences on every checked target are the same as the
; ones listed for @i32_arg: one dword store of the loaded argument, with no
; conversion instruction.
511define amdgpu_kernel void @f32_arg(ptr addrspace(1) nocapture %out, float %in) nounwind {
512; SI-LABEL: f32_arg:
513; SI:       ; %bb.0: ; %entry
514; SI-NEXT:    s_load_dword s6, s[4:5], 0xb
515; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
516; SI-NEXT:    s_mov_b32 s3, 0xf000
517; SI-NEXT:    s_mov_b32 s2, -1
518; SI-NEXT:    s_waitcnt lgkmcnt(0)
519; SI-NEXT:    v_mov_b32_e32 v0, s6
520; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
521; SI-NEXT:    s_endpgm
522;
523; VI-LABEL: f32_arg:
524; VI:       ; %bb.0: ; %entry
525; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
526; VI-NEXT:    s_load_dword s2, s[4:5], 0x2c
527; VI-NEXT:    s_waitcnt lgkmcnt(0)
528; VI-NEXT:    v_mov_b32_e32 v0, s0
529; VI-NEXT:    v_mov_b32_e32 v1, s1
530; VI-NEXT:    v_mov_b32_e32 v2, s2
531; VI-NEXT:    flat_store_dword v[0:1], v2
532; VI-NEXT:    s_endpgm
533;
534; GFX9-LABEL: f32_arg:
535; GFX9:       ; %bb.0: ; %entry
536; GFX9-NEXT:    s_load_dword s2, s[8:9], 0x8
537; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
538; GFX9-NEXT:    v_mov_b32_e32 v0, 0
539; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
540; GFX9-NEXT:    v_mov_b32_e32 v1, s2
541; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
542; GFX9-NEXT:    s_endpgm
543;
544; EG-LABEL: f32_arg:
545; EG:       ; %bb.0: ; %entry
546; EG-NEXT:    ALU 2, @4, KC0[CB0:0-32], KC1[]
547; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
548; EG-NEXT:    CF_END
549; EG-NEXT:    PAD
550; EG-NEXT:    ALU clause starting at 4:
551; EG-NEXT:     LSHR T0.X, KC0[2].Y, literal.x,
552; EG-NEXT:     MOV * T1.X, KC0[2].Z,
553; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
554;
555; CM-LABEL: f32_arg:
556; CM:       ; %bb.0: ; %entry
557; CM-NEXT:    ALU 2, @4, KC0[CB0:0-32], KC1[]
558; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T1.X, T0.X
559; CM-NEXT:    CF_END
560; CM-NEXT:    PAD
561; CM-NEXT:    ALU clause starting at 4:
562; CM-NEXT:     LSHR * T0.X, KC0[2].Y, literal.x,
563; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
564; CM-NEXT:     MOV * T1.X, KC0[2].Z,
565entry:
566  store float %in, ptr addrspace(1) %out, align 4
567  ret void
568}
569
; Kernel argument test for a <2 x i8> parameter stored to %out with no explicit
; alignment on the store. The amdgcn checks expect the vector to be written
; with one 16-bit store (buffer/flat/global _store_short). The r600 checks
; expect a VTX_READ_16 fetch and a MEM_RAT MSKOR store whose operands are built
; from a 65535 mask and the low two bits of the pointer in KC0[2].Y.
570define amdgpu_kernel void @v2i8_arg(ptr addrspace(1) %out, <2 x i8> %in) {
571; SI-LABEL: v2i8_arg:
572; SI:       ; %bb.0: ; %entry
573; SI-NEXT:    s_load_dword s6, s[4:5], 0xb
574; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
575; SI-NEXT:    s_mov_b32 s3, 0xf000
576; SI-NEXT:    s_mov_b32 s2, -1
577; SI-NEXT:    s_waitcnt lgkmcnt(0)
578; SI-NEXT:    v_mov_b32_e32 v0, s6
579; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
580; SI-NEXT:    s_endpgm
581;
582; VI-LABEL: v2i8_arg:
583; VI:       ; %bb.0: ; %entry
584; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
585; VI-NEXT:    s_load_dword s2, s[4:5], 0x2c
586; VI-NEXT:    s_waitcnt lgkmcnt(0)
587; VI-NEXT:    v_mov_b32_e32 v0, s0
588; VI-NEXT:    v_mov_b32_e32 v1, s1
589; VI-NEXT:    v_mov_b32_e32 v2, s2
590; VI-NEXT:    flat_store_short v[0:1], v2
591; VI-NEXT:    s_endpgm
592;
593; GFX9-LABEL: v2i8_arg:
594; GFX9:       ; %bb.0: ; %entry
595; GFX9-NEXT:    s_load_dword s2, s[8:9], 0x8
596; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
597; GFX9-NEXT:    v_mov_b32_e32 v0, 0
598; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
599; GFX9-NEXT:    v_mov_b32_e32 v1, s2
600; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
601; GFX9-NEXT:    s_endpgm
602;
603; EG-LABEL: v2i8_arg:
604; EG:       ; %bb.0: ; %entry
605; EG-NEXT:    ALU 0, @8, KC0[], KC1[]
606; EG-NEXT:    TEX 0 @6
607; EG-NEXT:    ALU 11, @9, KC0[CB0:0-32], KC1[]
608; EG-NEXT:    MEM_RAT MSKOR T0.XW, T1.X
609; EG-NEXT:    CF_END
610; EG-NEXT:    PAD
611; EG-NEXT:    Fetch clause starting at 6:
612; EG-NEXT:     VTX_READ_16 T0.X, T0.X, 40, #3
613; EG-NEXT:    ALU clause starting at 8:
614; EG-NEXT:     MOV * T0.X, 0.0,
615; EG-NEXT:    ALU clause starting at 9:
616; EG-NEXT:     AND_INT T0.W, KC0[2].Y, literal.x,
617; EG-NEXT:     AND_INT * T1.W, T0.X, literal.y,
618; EG-NEXT:    3(4.203895e-45), 65535(9.183409e-41)
619; EG-NEXT:     LSHL * T0.W, PV.W, literal.x,
620; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
621; EG-NEXT:     LSHL T0.X, T1.W, PV.W,
622; EG-NEXT:     LSHL * T0.W, literal.x, PV.W,
623; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
624; EG-NEXT:     MOV T0.Y, 0.0,
625; EG-NEXT:     MOV * T0.Z, 0.0,
626; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
627; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
628;
629; CM-LABEL: v2i8_arg:
630; CM:       ; %bb.0: ; %entry
631; CM-NEXT:    ALU 0, @8, KC0[], KC1[]
632; CM-NEXT:    TEX 0 @6
633; CM-NEXT:    ALU 11, @9, KC0[CB0:0-32], KC1[]
634; CM-NEXT:    MEM_RAT MSKOR T0.XW, T1.X
635; CM-NEXT:    CF_END
636; CM-NEXT:    PAD
637; CM-NEXT:    Fetch clause starting at 6:
638; CM-NEXT:     VTX_READ_16 T0.X, T0.X, 40, #3
639; CM-NEXT:    ALU clause starting at 8:
640; CM-NEXT:     MOV * T0.X, 0.0,
641; CM-NEXT:    ALU clause starting at 9:
642; CM-NEXT:     AND_INT * T0.W, KC0[2].Y, literal.x,
643; CM-NEXT:    3(4.203895e-45), 0(0.000000e+00)
644; CM-NEXT:     AND_INT T0.Z, T0.X, literal.x,
645; CM-NEXT:     LSHL * T0.W, PV.W, literal.y,
646; CM-NEXT:    65535(9.183409e-41), 3(4.203895e-45)
647; CM-NEXT:     LSHL T0.X, PV.Z, PV.W,
648; CM-NEXT:     LSHL * T0.W, literal.x, PV.W,
649; CM-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
650; CM-NEXT:     MOV T0.Y, 0.0,
651; CM-NEXT:     MOV * T0.Z, 0.0,
652; CM-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
653; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
654entry:
655  store <2 x i8> %in, ptr addrspace(1) %out
656  ret void
657}
658
; Kernel argument test for a <2 x i16> parameter stored to %out with no
; explicit alignment on the store. All checked targets are expected to write
; the whole vector with a single dword store; the r600 checks read it from
; KC0[2].Z with a MOV and need no fetch clause.
659define amdgpu_kernel void @v2i16_arg(ptr addrspace(1) %out, <2 x i16> %in) {
660; SI-LABEL: v2i16_arg:
661; SI:       ; %bb.0: ; %entry
662; SI-NEXT:    s_load_dword s6, s[4:5], 0xb
663; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
664; SI-NEXT:    s_mov_b32 s3, 0xf000
665; SI-NEXT:    s_mov_b32 s2, -1
666; SI-NEXT:    s_waitcnt lgkmcnt(0)
667; SI-NEXT:    v_mov_b32_e32 v0, s6
668; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
669; SI-NEXT:    s_endpgm
670;
671; VI-LABEL: v2i16_arg:
672; VI:       ; %bb.0: ; %entry
673; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
674; VI-NEXT:    s_load_dword s2, s[4:5], 0x2c
675; VI-NEXT:    s_waitcnt lgkmcnt(0)
676; VI-NEXT:    v_mov_b32_e32 v0, s0
677; VI-NEXT:    v_mov_b32_e32 v1, s1
678; VI-NEXT:    v_mov_b32_e32 v2, s2
679; VI-NEXT:    flat_store_dword v[0:1], v2
680; VI-NEXT:    s_endpgm
681;
682; GFX9-LABEL: v2i16_arg:
683; GFX9:       ; %bb.0: ; %entry
684; GFX9-NEXT:    s_load_dword s2, s[8:9], 0x8
685; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
686; GFX9-NEXT:    v_mov_b32_e32 v0, 0
687; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
688; GFX9-NEXT:    v_mov_b32_e32 v1, s2
689; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
690; GFX9-NEXT:    s_endpgm
691;
692; EG-LABEL: v2i16_arg:
693; EG:       ; %bb.0: ; %entry
694; EG-NEXT:    ALU 2, @4, KC0[CB0:0-32], KC1[]
695; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
696; EG-NEXT:    CF_END
697; EG-NEXT:    PAD
698; EG-NEXT:    ALU clause starting at 4:
699; EG-NEXT:     MOV T0.X, KC0[2].Z,
700; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
701; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
702;
703; CM-LABEL: v2i16_arg:
704; CM:       ; %bb.0: ; %entry
705; CM-NEXT:    ALU 2, @4, KC0[CB0:0-32], KC1[]
706; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X
707; CM-NEXT:    CF_END
708; CM-NEXT:    PAD
709; CM-NEXT:    ALU clause starting at 4:
710; CM-NEXT:     MOV * T0.X, KC0[2].Z,
711; CM-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
712; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
713entry:
714  store <2 x i16> %in, ptr addrspace(1) %out
715  ret void
716}
717
; Kernel argument test for a <2 x i32> parameter stored to %out with align 4.
; The amdgcn checks expect one s_load_dwordx4 that brings in both the pointer
; and the vector, followed by a single dwordx2 store. The r600 checks read the
; two elements from KC0[2].W and KC0[3].X.
718define amdgpu_kernel void @v2i32_arg(ptr addrspace(1) nocapture %out, <2 x i32> %in) nounwind {
719; SI-LABEL: v2i32_arg:
720; SI:       ; %bb.0: ; %entry
721; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
722; SI-NEXT:    s_mov_b32 s7, 0xf000
723; SI-NEXT:    s_mov_b32 s6, -1
724; SI-NEXT:    s_waitcnt lgkmcnt(0)
725; SI-NEXT:    s_mov_b32 s4, s0
726; SI-NEXT:    s_mov_b32 s5, s1
727; SI-NEXT:    v_mov_b32_e32 v0, s2
728; SI-NEXT:    v_mov_b32_e32 v1, s3
729; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
730; SI-NEXT:    s_endpgm
731;
732; VI-LABEL: v2i32_arg:
733; VI:       ; %bb.0: ; %entry
734; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
735; VI-NEXT:    s_waitcnt lgkmcnt(0)
736; VI-NEXT:    v_mov_b32_e32 v0, s0
737; VI-NEXT:    v_mov_b32_e32 v2, s2
738; VI-NEXT:    v_mov_b32_e32 v1, s1
739; VI-NEXT:    v_mov_b32_e32 v3, s3
740; VI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
741; VI-NEXT:    s_endpgm
742;
743; GFX9-LABEL: v2i32_arg:
744; GFX9:       ; %bb.0: ; %entry
745; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
746; GFX9-NEXT:    v_mov_b32_e32 v2, 0
747; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
748; GFX9-NEXT:    v_mov_b32_e32 v0, s2
749; GFX9-NEXT:    v_mov_b32_e32 v1, s3
750; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
751; GFX9-NEXT:    s_endpgm
752;
753; EG-LABEL: v2i32_arg:
754; EG:       ; %bb.0: ; %entry
755; EG-NEXT:    ALU 3, @4, KC0[CB0:0-32], KC1[]
756; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
757; EG-NEXT:    CF_END
758; EG-NEXT:    PAD
759; EG-NEXT:    ALU clause starting at 4:
760; EG-NEXT:     MOV * T0.Y, KC0[3].X,
761; EG-NEXT:     MOV T0.X, KC0[2].W,
762; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
763; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
764;
765; CM-LABEL: v2i32_arg:
766; CM:       ; %bb.0: ; %entry
767; CM-NEXT:    ALU 3, @4, KC0[CB0:0-32], KC1[]
768; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T0, T1.X
769; CM-NEXT:    CF_END
770; CM-NEXT:    PAD
771; CM-NEXT:    ALU clause starting at 4:
772; CM-NEXT:     MOV * T0.Y, KC0[3].X,
773; CM-NEXT:     MOV * T0.X, KC0[2].W,
774; CM-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
775; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
776entry:
777  store <2 x i32> %in, ptr addrspace(1) %out, align 4
778  ret void
779}
780
; Kernel argument test for a <2 x float> parameter stored to %out with align 4.
; The expected instruction sequences on every checked target are the same as
; the ones listed for @v2i32_arg: one s_load_dwordx4 and one dwordx2 store on
; amdgcn, and two MOVs from KC0[2].W / KC0[3].X on r600.
781define amdgpu_kernel void @v2f32_arg(ptr addrspace(1) nocapture %out, <2 x float> %in) nounwind {
782; SI-LABEL: v2f32_arg:
783; SI:       ; %bb.0: ; %entry
784; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
785; SI-NEXT:    s_mov_b32 s7, 0xf000
786; SI-NEXT:    s_mov_b32 s6, -1
787; SI-NEXT:    s_waitcnt lgkmcnt(0)
788; SI-NEXT:    s_mov_b32 s4, s0
789; SI-NEXT:    s_mov_b32 s5, s1
790; SI-NEXT:    v_mov_b32_e32 v0, s2
791; SI-NEXT:    v_mov_b32_e32 v1, s3
792; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
793; SI-NEXT:    s_endpgm
794;
795; VI-LABEL: v2f32_arg:
796; VI:       ; %bb.0: ; %entry
797; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
798; VI-NEXT:    s_waitcnt lgkmcnt(0)
799; VI-NEXT:    v_mov_b32_e32 v0, s0
800; VI-NEXT:    v_mov_b32_e32 v2, s2
801; VI-NEXT:    v_mov_b32_e32 v1, s1
802; VI-NEXT:    v_mov_b32_e32 v3, s3
803; VI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
804; VI-NEXT:    s_endpgm
805;
806; GFX9-LABEL: v2f32_arg:
807; GFX9:       ; %bb.0: ; %entry
808; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
809; GFX9-NEXT:    v_mov_b32_e32 v2, 0
810; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
811; GFX9-NEXT:    v_mov_b32_e32 v0, s2
812; GFX9-NEXT:    v_mov_b32_e32 v1, s3
813; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
814; GFX9-NEXT:    s_endpgm
815;
816; EG-LABEL: v2f32_arg:
817; EG:       ; %bb.0: ; %entry
818; EG-NEXT:    ALU 3, @4, KC0[CB0:0-32], KC1[]
819; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
820; EG-NEXT:    CF_END
821; EG-NEXT:    PAD
822; EG-NEXT:    ALU clause starting at 4:
823; EG-NEXT:     MOV * T0.Y, KC0[3].X,
824; EG-NEXT:     MOV T0.X, KC0[2].W,
825; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
826; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
827;
828; CM-LABEL: v2f32_arg:
829; CM:       ; %bb.0: ; %entry
830; CM-NEXT:    ALU 3, @4, KC0[CB0:0-32], KC1[]
831; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T0, T1.X
832; CM-NEXT:    CF_END
833; CM-NEXT:    PAD
834; CM-NEXT:    ALU clause starting at 4:
835; CM-NEXT:     MOV * T0.Y, KC0[3].X,
836; CM-NEXT:     MOV * T0.X, KC0[2].W,
837; CM-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
838; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
839entry:
840  store <2 x float> %in, ptr addrspace(1) %out, align 4
841  ret void
842}
843
; Kernel-argument lowering test for a <3 x i8> vector passed by value.
; The kernel receives a global (addrspace(1)) pointer %out and the vector %in
; and only stores %in to %out, so the checked output shows how the argument is
; fetched and how the 3-byte store is emitted for each run configuration.
; In the SI, VI and GFX9 checks the value is written as a 16-bit store plus a
; byte store at offset 2; the EG and CM checks use two MEM_RAT MSKOR writes.
; The assertions are autogenerated (see the note on the first line of the file)
; and should be regenerated with utils/update_llc_test_checks.py, not hand-edited.
844define amdgpu_kernel void @v3i8_arg(ptr addrspace(1) nocapture %out, <3 x i8> %in) nounwind {
845; SI-LABEL: v3i8_arg:
846; SI:       ; %bb.0: ; %entry
847; SI-NEXT:    s_load_dword s6, s[4:5], 0xb
848; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
849; SI-NEXT:    s_mov_b32 s3, 0xf000
850; SI-NEXT:    s_waitcnt lgkmcnt(0)
851; SI-NEXT:    s_lshr_b32 s4, s6, 16
852; SI-NEXT:    s_mov_b32 s2, -1
853; SI-NEXT:    v_mov_b32_e32 v0, s6
854; SI-NEXT:    v_mov_b32_e32 v1, s4
855; SI-NEXT:    buffer_store_byte v1, off, s[0:3], 0 offset:2
856; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
857; SI-NEXT:    s_endpgm
858;
859; VI-LABEL: v3i8_arg:
860; VI:       ; %bb.0: ; %entry
861; VI-NEXT:    s_load_dword s2, s[4:5], 0x2c
862; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
863; VI-NEXT:    s_waitcnt lgkmcnt(0)
864; VI-NEXT:    s_lshr_b32 s3, s2, 16
865; VI-NEXT:    v_mov_b32_e32 v0, s0
866; VI-NEXT:    v_mov_b32_e32 v1, s1
867; VI-NEXT:    s_add_u32 s0, s0, 2
868; VI-NEXT:    s_addc_u32 s1, s1, 0
869; VI-NEXT:    v_mov_b32_e32 v3, s1
870; VI-NEXT:    v_mov_b32_e32 v5, s3
871; VI-NEXT:    v_mov_b32_e32 v2, s0
872; VI-NEXT:    v_mov_b32_e32 v4, s2
873; VI-NEXT:    flat_store_byte v[2:3], v5
874; VI-NEXT:    flat_store_short v[0:1], v4
875; VI-NEXT:    s_endpgm
876;
877; GFX9-LABEL: v3i8_arg:
878; GFX9:       ; %bb.0: ; %entry
879; GFX9-NEXT:    s_load_dword s2, s[8:9], 0x8
880; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
881; GFX9-NEXT:    v_mov_b32_e32 v0, 0
882; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
883; GFX9-NEXT:    v_mov_b32_e32 v1, s2
884; GFX9-NEXT:    global_store_byte_d16_hi v0, v1, s[0:1] offset:2
885; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
886; GFX9-NEXT:    s_endpgm
887;
888; EG-LABEL: v3i8_arg:
889; EG:       ; %bb.0: ; %entry
890; EG-NEXT:    ALU 0, @12, KC0[], KC1[]
891; EG-NEXT:    TEX 2 @6
892; EG-NEXT:    ALU 28, @13, KC0[CB0:0-32], KC1[]
893; EG-NEXT:    MEM_RAT MSKOR T4.XW, T7.X
894; EG-NEXT:    MEM_RAT MSKOR T5.XW, T6.X
895; EG-NEXT:    CF_END
896; EG-NEXT:    Fetch clause starting at 6:
897; EG-NEXT:     VTX_READ_8 T5.X, T4.X, 41, #3
898; EG-NEXT:     VTX_READ_8 T6.X, T4.X, 42, #3
899; EG-NEXT:     VTX_READ_8 T4.X, T4.X, 40, #3
900; EG-NEXT:    ALU clause starting at 12:
901; EG-NEXT:     MOV * T4.X, 0.0,
902; EG-NEXT:    ALU clause starting at 13:
903; EG-NEXT:     LSHL T0.W, T5.X, literal.x,
904; EG-NEXT:     AND_INT * T1.W, T4.X, literal.y,
905; EG-NEXT:    8(1.121039e-44), 255(3.573311e-43)
906; EG-NEXT:     AND_INT T2.W, KC0[2].Y, literal.x,
907; EG-NEXT:     OR_INT * T0.W, PV.W, PS,
908; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
909; EG-NEXT:     AND_INT T0.W, PS, literal.x,
910; EG-NEXT:     LSHL * T1.W, PV.W, literal.y,
911; EG-NEXT:    65535(9.183409e-41), 3(4.203895e-45)
912; EG-NEXT:     LSHL T4.X, PV.W, PS,
913; EG-NEXT:     LSHL * T4.W, literal.x, PS,
914; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
915; EG-NEXT:     MOV T4.Y, 0.0,
916; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
917; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
918; EG-NEXT:     AND_INT T1.W, PV.W, literal.x,
919; EG-NEXT:     AND_INT * T2.W, T6.X, literal.y,
920; EG-NEXT:    3(4.203895e-45), 255(3.573311e-43)
921; EG-NEXT:     LSHL * T1.W, PV.W, literal.x,
922; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
923; EG-NEXT:     LSHL T5.X, T2.W, PV.W,
924; EG-NEXT:     LSHL * T5.W, literal.x, PV.W,
925; EG-NEXT:    255(3.573311e-43), 0(0.000000e+00)
926; EG-NEXT:     MOV T5.Y, 0.0,
927; EG-NEXT:     MOV T4.Z, 0.0,
928; EG-NEXT:     MOV * T5.Z, 0.0,
929; EG-NEXT:     LSHR T6.X, T0.W, literal.x,
930; EG-NEXT:     LSHR * T7.X, KC0[2].Y, literal.x,
931; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
932;
933; CM-LABEL: v3i8_arg:
934; CM:       ; %bb.0: ; %entry
935; CM-NEXT:    ALU 0, @12, KC0[], KC1[]
936; CM-NEXT:    TEX 2 @6
937; CM-NEXT:    ALU 29, @13, KC0[CB0:0-32], KC1[]
938; CM-NEXT:    MEM_RAT MSKOR T4.XW, T7.X
939; CM-NEXT:    MEM_RAT MSKOR T5.XW, T6.X
940; CM-NEXT:    CF_END
941; CM-NEXT:    Fetch clause starting at 6:
942; CM-NEXT:     VTX_READ_8 T5.X, T4.X, 41, #3
943; CM-NEXT:     VTX_READ_8 T6.X, T4.X, 42, #3
944; CM-NEXT:     VTX_READ_8 T4.X, T4.X, 40, #3
945; CM-NEXT:    ALU clause starting at 12:
946; CM-NEXT:     MOV * T4.X, 0.0,
947; CM-NEXT:    ALU clause starting at 13:
948; CM-NEXT:     LSHL T0.Z, T5.X, literal.x,
949; CM-NEXT:     AND_INT * T0.W, T4.X, literal.y, BS:VEC_120/SCL_212
950; CM-NEXT:    8(1.121039e-44), 255(3.573311e-43)
951; CM-NEXT:     AND_INT T1.Z, KC0[2].Y, literal.x,
952; CM-NEXT:     OR_INT * T0.W, PV.Z, PV.W,
953; CM-NEXT:    3(4.203895e-45), 0(0.000000e+00)
954; CM-NEXT:     AND_INT T0.Z, PV.W, literal.x,
955; CM-NEXT:     LSHL * T0.W, PV.Z, literal.y,
956; CM-NEXT:    65535(9.183409e-41), 3(4.203895e-45)
957; CM-NEXT:     LSHL T4.X, PV.Z, PV.W,
958; CM-NEXT:     LSHL * T4.W, literal.x, PV.W,
959; CM-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
960; CM-NEXT:     MOV T4.Y, 0.0,
961; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
962; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
963; CM-NEXT:     AND_INT * T1.W, PV.W, literal.x,
964; CM-NEXT:    3(4.203895e-45), 0(0.000000e+00)
965; CM-NEXT:     AND_INT T0.Z, T6.X, literal.x,
966; CM-NEXT:     LSHL * T1.W, PV.W, literal.y,
967; CM-NEXT:    255(3.573311e-43), 3(4.203895e-45)
968; CM-NEXT:     LSHL T5.X, PV.Z, PV.W,
969; CM-NEXT:     LSHL * T5.W, literal.x, PV.W,
970; CM-NEXT:    255(3.573311e-43), 0(0.000000e+00)
971; CM-NEXT:     MOV T5.Y, 0.0,
972; CM-NEXT:     MOV * T4.Z, 0.0,
973; CM-NEXT:     MOV * T5.Z, 0.0,
974; CM-NEXT:     LSHR * T6.X, T0.W, literal.x,
975; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
976; CM-NEXT:     LSHR * T7.X, KC0[2].Y, literal.x,
977; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
978entry:
; Sole operation of the kernel - write the by-value vector argument to %out
; with an explicit 4-byte alignment.
979  store <3 x i8> %in, ptr addrspace(1) %out, align 4
980  ret void
981}
982
; Kernel-argument lowering test for a <3 x i16> vector passed by value.
; The kernel receives a global (addrspace(1)) pointer %out and the vector %in
; and only stores %in to %out with 4-byte alignment.
; In the SI, VI and GFX9 checks one s_load_dwordx4 fetches both arguments and
; the value is written as a dword store plus a 16-bit store at offset 4; the
; EG and CM checks pair one MEM_RAT MSKOR write with one cacheless store.
; The assertions are autogenerated (see the note on the first line of the file)
; and should be regenerated with utils/update_llc_test_checks.py, not hand-edited.
983define amdgpu_kernel void @v3i16_arg(ptr addrspace(1) nocapture %out, <3 x i16> %in) nounwind {
984; SI-LABEL: v3i16_arg:
985; SI:       ; %bb.0: ; %entry
986; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
987; SI-NEXT:    s_mov_b32 s7, 0xf000
988; SI-NEXT:    s_mov_b32 s6, -1
989; SI-NEXT:    s_waitcnt lgkmcnt(0)
990; SI-NEXT:    s_mov_b32 s4, s0
991; SI-NEXT:    s_mov_b32 s5, s1
992; SI-NEXT:    v_mov_b32_e32 v0, s3
993; SI-NEXT:    buffer_store_short v0, off, s[4:7], 0 offset:4
994; SI-NEXT:    s_waitcnt expcnt(0)
995; SI-NEXT:    v_mov_b32_e32 v0, s2
996; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
997; SI-NEXT:    s_endpgm
998;
999; VI-LABEL: v3i16_arg:
1000; VI:       ; %bb.0: ; %entry
1001; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1002; VI-NEXT:    s_waitcnt lgkmcnt(0)
1003; VI-NEXT:    s_add_u32 s4, s0, 4
1004; VI-NEXT:    s_addc_u32 s5, s1, 0
1005; VI-NEXT:    v_mov_b32_e32 v2, s4
1006; VI-NEXT:    v_mov_b32_e32 v4, s3
1007; VI-NEXT:    v_mov_b32_e32 v0, s0
1008; VI-NEXT:    v_mov_b32_e32 v3, s5
1009; VI-NEXT:    v_mov_b32_e32 v1, s1
1010; VI-NEXT:    v_mov_b32_e32 v5, s2
1011; VI-NEXT:    flat_store_short v[2:3], v4
1012; VI-NEXT:    flat_store_dword v[0:1], v5
1013; VI-NEXT:    s_endpgm
1014;
1015; GFX9-LABEL: v3i16_arg:
1016; GFX9:       ; %bb.0: ; %entry
1017; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
1018; GFX9-NEXT:    v_mov_b32_e32 v0, 0
1019; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1020; GFX9-NEXT:    v_mov_b32_e32 v1, s3
1021; GFX9-NEXT:    v_mov_b32_e32 v2, s2
1022; GFX9-NEXT:    global_store_short v0, v1, s[0:1] offset:4
1023; GFX9-NEXT:    global_store_dword v0, v2, s[0:1]
1024; GFX9-NEXT:    s_endpgm
1025;
1026; EG-LABEL: v3i16_arg:
1027; EG:       ; %bb.0: ; %entry
1028; EG-NEXT:    ALU 0, @12, KC0[], KC1[]
1029; EG-NEXT:    TEX 2 @6
1030; EG-NEXT:    ALU 19, @13, KC0[CB0:0-32], KC1[]
1031; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T6.X, T7.X, 0
1032; EG-NEXT:    MEM_RAT MSKOR T5.XW, T8.X
1033; EG-NEXT:    CF_END
1034; EG-NEXT:    Fetch clause starting at 6:
1035; EG-NEXT:     VTX_READ_16 T6.X, T5.X, 44, #3
1036; EG-NEXT:     VTX_READ_16 T7.X, T5.X, 46, #3
1037; EG-NEXT:     VTX_READ_16 T5.X, T5.X, 48, #3
1038; EG-NEXT:    ALU clause starting at 12:
1039; EG-NEXT:     MOV * T5.X, 0.0,
1040; EG-NEXT:    ALU clause starting at 13:
1041; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
1042; EG-NEXT:    4(5.605194e-45), 0(0.000000e+00)
1043; EG-NEXT:     AND_INT T1.W, PV.W, literal.x,
1044; EG-NEXT:     AND_INT * T2.W, T5.X, literal.y,
1045; EG-NEXT:    3(4.203895e-45), 65535(9.183409e-41)
1046; EG-NEXT:     LSHL * T1.W, PV.W, literal.x,
1047; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
1048; EG-NEXT:     LSHL T5.X, T2.W, PV.W,
1049; EG-NEXT:     LSHL * T5.W, literal.x, PV.W,
1050; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
1051; EG-NEXT:     MOV T5.Y, 0.0,
1052; EG-NEXT:     MOV * T5.Z, 0.0,
1053; EG-NEXT:     LSHR T8.X, T0.W, literal.x,
1054; EG-NEXT:     LSHL T0.W, T7.X, literal.y,
1055; EG-NEXT:     AND_INT * T1.W, T6.X, literal.z,
1056; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
1057; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
1058; EG-NEXT:     OR_INT T6.X, PV.W, PS,
1059; EG-NEXT:     LSHR * T7.X, KC0[2].Y, literal.x,
1060; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1061;
1062; CM-LABEL: v3i16_arg:
1063; CM:       ; %bb.0: ; %entry
1064; CM-NEXT:    ALU 0, @12, KC0[], KC1[]
1065; CM-NEXT:    TEX 2 @6
1066; CM-NEXT:    ALU 19, @13, KC0[CB0:0-32], KC1[]
1067; CM-NEXT:    MEM_RAT MSKOR T5.XW, T8.X
1068; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T6.X, T7.X
1069; CM-NEXT:    CF_END
1070; CM-NEXT:    Fetch clause starting at 6:
1071; CM-NEXT:     VTX_READ_16 T6.X, T5.X, 44, #3
1072; CM-NEXT:     VTX_READ_16 T7.X, T5.X, 46, #3
1073; CM-NEXT:     VTX_READ_16 T5.X, T5.X, 48, #3
1074; CM-NEXT:    ALU clause starting at 12:
1075; CM-NEXT:     MOV * T5.X, 0.0,
1076; CM-NEXT:    ALU clause starting at 13:
1077; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
1078; CM-NEXT:    4(5.605194e-45), 0(0.000000e+00)
1079; CM-NEXT:     AND_INT * T1.W, PV.W, literal.x,
1080; CM-NEXT:    3(4.203895e-45), 0(0.000000e+00)
1081; CM-NEXT:     AND_INT T0.Z, T5.X, literal.x,
1082; CM-NEXT:     LSHL * T1.W, PV.W, literal.y,
1083; CM-NEXT:    65535(9.183409e-41), 3(4.203895e-45)
1084; CM-NEXT:     LSHL T5.X, PV.Z, PV.W,
1085; CM-NEXT:     LSHL * T5.W, literal.x, PV.W,
1086; CM-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
1087; CM-NEXT:     MOV T5.Y, 0.0,
1088; CM-NEXT:     MOV * T5.Z, 0.0,
1089; CM-NEXT:     LSHL T0.Z, T7.X, literal.x,
1090; CM-NEXT:     AND_INT * T1.W, T6.X, literal.y, BS:VEC_120/SCL_212
1091; CM-NEXT:    16(2.242078e-44), 65535(9.183409e-41)
1092; CM-NEXT:     OR_INT * T6.X, PV.Z, PV.W,
1093; CM-NEXT:     LSHR * T7.X, KC0[2].Y, literal.x,
1094; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1095; CM-NEXT:     LSHR * T8.X, T0.W, literal.x,
1096; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1097entry:
; Sole operation of the kernel - write the by-value vector argument to %out
; with an explicit 4-byte alignment.
1098  store <3 x i16> %in, ptr addrspace(1) %out, align 4
1099  ret void
1100}
1101
; Kernel-argument lowering test for a <3 x i32> vector passed by value.
; The kernel receives a global (addrspace(1)) pointer %out and the vector %in
; and only stores %in to %out with 4-byte alignment.
; The SI checks split the write into a dword store at offset 8 plus a dwordx2
; store, while the VI and GFX9 checks expect a single dwordx3 store; the EG and
; CM checks use two MEM_RAT_CACHELESS stores.
; The assertions are autogenerated (see the note on the first line of the file)
; and should be regenerated with utils/update_llc_test_checks.py, not hand-edited.
1102define amdgpu_kernel void @v3i32_arg(ptr addrspace(1) nocapture %out, <3 x i32> %in) nounwind {
1103; SI-LABEL: v3i32_arg:
1104; SI:       ; %bb.0: ; %entry
1105; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0xd
1106; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x9
1107; SI-NEXT:    s_mov_b32 s7, 0xf000
1108; SI-NEXT:    s_mov_b32 s6, -1
1109; SI-NEXT:    s_waitcnt lgkmcnt(0)
1110; SI-NEXT:    v_mov_b32_e32 v0, s2
1111; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0 offset:8
1112; SI-NEXT:    s_waitcnt expcnt(0)
1113; SI-NEXT:    v_mov_b32_e32 v0, s0
1114; SI-NEXT:    v_mov_b32_e32 v1, s1
1115; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1116; SI-NEXT:    s_endpgm
1117;
1118; VI-LABEL: v3i32_arg:
1119; VI:       ; %bb.0: ; %entry
1120; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x34
1121; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x24
1122; VI-NEXT:    s_waitcnt lgkmcnt(0)
1123; VI-NEXT:    v_mov_b32_e32 v0, s0
1124; VI-NEXT:    v_mov_b32_e32 v3, s4
1125; VI-NEXT:    v_mov_b32_e32 v1, s1
1126; VI-NEXT:    v_mov_b32_e32 v2, s2
1127; VI-NEXT:    v_mov_b32_e32 v4, s5
1128; VI-NEXT:    flat_store_dwordx3 v[3:4], v[0:2]
1129; VI-NEXT:    s_endpgm
1130;
1131; GFX9-LABEL: v3i32_arg:
1132; GFX9:       ; %bb.0: ; %entry
1133; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x10
1134; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
1135; GFX9-NEXT:    v_mov_b32_e32 v3, 0
1136; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1137; GFX9-NEXT:    v_mov_b32_e32 v0, s0
1138; GFX9-NEXT:    v_mov_b32_e32 v1, s1
1139; GFX9-NEXT:    v_mov_b32_e32 v2, s2
1140; GFX9-NEXT:    global_store_dwordx3 v3, v[0:2], s[4:5]
1141; GFX9-NEXT:    s_endpgm
1142;
1143; EG-LABEL: v3i32_arg:
1144; EG:       ; %bb.0: ; %entry
1145; EG-NEXT:    ALU 8, @4, KC0[CB0:0-32], KC1[]
1146; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T3.X, T2.X, 0
1147; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
1148; EG-NEXT:    CF_END
1149; EG-NEXT:    ALU clause starting at 4:
1150; EG-NEXT:     MOV * T0.Y, KC0[3].Z,
1151; EG-NEXT:     MOV T0.X, KC0[3].Y,
1152; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
1153; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1154; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
1155; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
1156; EG-NEXT:     LSHR T2.X, PV.W, literal.x,
1157; EG-NEXT:     MOV * T3.X, KC0[3].W,
1158; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1159;
1160; CM-LABEL: v3i32_arg:
1161; CM:       ; %bb.0: ; %entry
1162; CM-NEXT:    ALU 8, @4, KC0[CB0:0-32], KC1[]
1163; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T2, T3.X
1164; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T1.X, T0.X
1165; CM-NEXT:    CF_END
1166; CM-NEXT:    ALU clause starting at 4:
1167; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
1168; CM-NEXT:    8(1.121039e-44), 0(0.000000e+00)
1169; CM-NEXT:     LSHR * T0.X, PV.W, literal.x,
1170; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1171; CM-NEXT:     MOV T1.X, KC0[3].W,
1172; CM-NEXT:     MOV * T2.Y, KC0[3].Z,
1173; CM-NEXT:     MOV * T2.X, KC0[3].Y,
1174; CM-NEXT:     LSHR * T3.X, KC0[2].Y, literal.x,
1175; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1176entry:
; Sole operation of the kernel - write the by-value vector argument to %out
; with an explicit 4-byte alignment.
1177  store <3 x i32> %in, ptr addrspace(1) %out, align 4
1178  ret void
1179}
1180
; Kernel-argument lowering test for a <3 x float> vector passed by value.
; The kernel receives a global (addrspace(1)) pointer %out and the vector %in
; and only stores %in to %out with 4-byte alignment.
; The SI checks split the write into a dword store at offset 8 plus a dwordx2
; store, while the VI and GFX9 checks expect a single dwordx3 store; the EG and
; CM checks use two MEM_RAT_CACHELESS stores.
; The assertions are autogenerated (see the note on the first line of the file)
; and should be regenerated with utils/update_llc_test_checks.py, not hand-edited.
1181define amdgpu_kernel void @v3f32_arg(ptr addrspace(1) nocapture %out, <3 x float> %in) nounwind {
1182; SI-LABEL: v3f32_arg:
1183; SI:       ; %bb.0: ; %entry
1184; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0xd
1185; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x9
1186; SI-NEXT:    s_mov_b32 s7, 0xf000
1187; SI-NEXT:    s_mov_b32 s6, -1
1188; SI-NEXT:    s_waitcnt lgkmcnt(0)
1189; SI-NEXT:    v_mov_b32_e32 v0, s2
1190; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0 offset:8
1191; SI-NEXT:    s_waitcnt expcnt(0)
1192; SI-NEXT:    v_mov_b32_e32 v0, s0
1193; SI-NEXT:    v_mov_b32_e32 v1, s1
1194; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1195; SI-NEXT:    s_endpgm
1196;
1197; VI-LABEL: v3f32_arg:
1198; VI:       ; %bb.0: ; %entry
1199; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x34
1200; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x24
1201; VI-NEXT:    s_waitcnt lgkmcnt(0)
1202; VI-NEXT:    v_mov_b32_e32 v0, s0
1203; VI-NEXT:    v_mov_b32_e32 v3, s4
1204; VI-NEXT:    v_mov_b32_e32 v1, s1
1205; VI-NEXT:    v_mov_b32_e32 v2, s2
1206; VI-NEXT:    v_mov_b32_e32 v4, s5
1207; VI-NEXT:    flat_store_dwordx3 v[3:4], v[0:2]
1208; VI-NEXT:    s_endpgm
1209;
1210; GFX9-LABEL: v3f32_arg:
1211; GFX9:       ; %bb.0: ; %entry
1212; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x10
1213; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
1214; GFX9-NEXT:    v_mov_b32_e32 v3, 0
1215; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1216; GFX9-NEXT:    v_mov_b32_e32 v0, s0
1217; GFX9-NEXT:    v_mov_b32_e32 v1, s1
1218; GFX9-NEXT:    v_mov_b32_e32 v2, s2
1219; GFX9-NEXT:    global_store_dwordx3 v3, v[0:2], s[4:5]
1220; GFX9-NEXT:    s_endpgm
1221;
1222; EG-LABEL: v3f32_arg:
1223; EG:       ; %bb.0: ; %entry
1224; EG-NEXT:    ALU 8, @4, KC0[CB0:0-32], KC1[]
1225; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T3.X, T2.X, 0
1226; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
1227; EG-NEXT:    CF_END
1228; EG-NEXT:    ALU clause starting at 4:
1229; EG-NEXT:     MOV * T0.Y, KC0[3].Z,
1230; EG-NEXT:     MOV T0.X, KC0[3].Y,
1231; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
1232; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1233; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
1234; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
1235; EG-NEXT:     LSHR T2.X, PV.W, literal.x,
1236; EG-NEXT:     MOV * T3.X, KC0[3].W,
1237; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1238;
1239; CM-LABEL: v3f32_arg:
1240; CM:       ; %bb.0: ; %entry
1241; CM-NEXT:    ALU 8, @4, KC0[CB0:0-32], KC1[]
1242; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T2, T3.X
1243; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T1.X, T0.X
1244; CM-NEXT:    CF_END
1245; CM-NEXT:    ALU clause starting at 4:
1246; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
1247; CM-NEXT:    8(1.121039e-44), 0(0.000000e+00)
1248; CM-NEXT:     LSHR * T0.X, PV.W, literal.x,
1249; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1250; CM-NEXT:     MOV T1.X, KC0[3].W,
1251; CM-NEXT:     MOV * T2.Y, KC0[3].Z,
1252; CM-NEXT:     MOV * T2.X, KC0[3].Y,
1253; CM-NEXT:     LSHR * T3.X, KC0[2].Y, literal.x,
1254; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1255entry:
; Sole operation of the kernel - write the by-value vector argument to %out
; with an explicit 4-byte alignment.
1256  store <3 x float> %in, ptr addrspace(1) %out, align 4
1257  ret void
1258}
1259
; Kernel-argument lowering test for a <4 x i8> vector passed by value.
; The kernel receives a global (addrspace(1)) pointer %out and the vector %in
; and only stores %in to %out. Unlike most kernels in this file, the signature
; carries no nocapture or nounwind attributes and the store has no explicit align.
; Every checked configuration handles the vector as one 32-bit value, for
; example a single s_load_dword followed by a single dword store on SI, VI and GFX9.
; The assertions are autogenerated (see the note on the first line of the file)
; and should be regenerated with utils/update_llc_test_checks.py, not hand-edited.
1260define amdgpu_kernel void @v4i8_arg(ptr addrspace(1) %out, <4 x i8> %in) {
1261; SI-LABEL: v4i8_arg:
1262; SI:       ; %bb.0: ; %entry
1263; SI-NEXT:    s_load_dword s6, s[4:5], 0xb
1264; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
1265; SI-NEXT:    s_mov_b32 s3, 0xf000
1266; SI-NEXT:    s_mov_b32 s2, -1
1267; SI-NEXT:    s_waitcnt lgkmcnt(0)
1268; SI-NEXT:    v_mov_b32_e32 v0, s6
1269; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1270; SI-NEXT:    s_endpgm
1271;
1272; VI-LABEL: v4i8_arg:
1273; VI:       ; %bb.0: ; %entry
1274; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
1275; VI-NEXT:    s_load_dword s2, s[4:5], 0x2c
1276; VI-NEXT:    s_waitcnt lgkmcnt(0)
1277; VI-NEXT:    v_mov_b32_e32 v0, s0
1278; VI-NEXT:    v_mov_b32_e32 v1, s1
1279; VI-NEXT:    v_mov_b32_e32 v2, s2
1280; VI-NEXT:    flat_store_dword v[0:1], v2
1281; VI-NEXT:    s_endpgm
1282;
1283; GFX9-LABEL: v4i8_arg:
1284; GFX9:       ; %bb.0: ; %entry
1285; GFX9-NEXT:    s_load_dword s2, s[8:9], 0x8
1286; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
1287; GFX9-NEXT:    v_mov_b32_e32 v0, 0
1288; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1289; GFX9-NEXT:    v_mov_b32_e32 v1, s2
1290; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
1291; GFX9-NEXT:    s_endpgm
1292;
1293; EG-LABEL: v4i8_arg:
1294; EG:       ; %bb.0: ; %entry
1295; EG-NEXT:    ALU 2, @4, KC0[CB0:0-32], KC1[]
1296; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
1297; EG-NEXT:    CF_END
1298; EG-NEXT:    PAD
1299; EG-NEXT:    ALU clause starting at 4:
1300; EG-NEXT:     MOV T0.X, KC0[2].Z,
1301; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
1302; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1303;
1304; CM-LABEL: v4i8_arg:
1305; CM:       ; %bb.0: ; %entry
1306; CM-NEXT:    ALU 2, @4, KC0[CB0:0-32], KC1[]
1307; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X
1308; CM-NEXT:    CF_END
1309; CM-NEXT:    PAD
1310; CM-NEXT:    ALU clause starting at 4:
1311; CM-NEXT:     MOV * T0.X, KC0[2].Z,
1312; CM-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
1313; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1314entry:
; Sole operation of the kernel - write the by-value vector argument to %out.
; No align is given on this store.
1315  store <4 x i8> %in, ptr addrspace(1) %out
1316  ret void
1317}
1318
; Kernel-argument lowering test for a <4 x i16> vector passed by value.
; The kernel receives a global (addrspace(1)) pointer %out and the vector %in
; and only stores %in to %out. Unlike most kernels in this file, the signature
; carries no nocapture or nounwind attributes and the store has no explicit align.
; In the SI, VI and GFX9 checks one s_load_dwordx4 fetches both arguments and
; the vector is written with a single dwordx2 store; the EG and CM checks use
; one MEM_RAT_CACHELESS store.
; The assertions are autogenerated (see the note on the first line of the file)
; and should be regenerated with utils/update_llc_test_checks.py, not hand-edited.
1319define amdgpu_kernel void @v4i16_arg(ptr addrspace(1) %out, <4 x i16> %in) {
1320; SI-LABEL: v4i16_arg:
1321; SI:       ; %bb.0: ; %entry
1322; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
1323; SI-NEXT:    s_mov_b32 s7, 0xf000
1324; SI-NEXT:    s_mov_b32 s6, -1
1325; SI-NEXT:    s_waitcnt lgkmcnt(0)
1326; SI-NEXT:    s_mov_b32 s4, s0
1327; SI-NEXT:    s_mov_b32 s5, s1
1328; SI-NEXT:    v_mov_b32_e32 v0, s2
1329; SI-NEXT:    v_mov_b32_e32 v1, s3
1330; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1331; SI-NEXT:    s_endpgm
1332;
1333; VI-LABEL: v4i16_arg:
1334; VI:       ; %bb.0: ; %entry
1335; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1336; VI-NEXT:    s_waitcnt lgkmcnt(0)
1337; VI-NEXT:    v_mov_b32_e32 v0, s0
1338; VI-NEXT:    v_mov_b32_e32 v2, s2
1339; VI-NEXT:    v_mov_b32_e32 v1, s1
1340; VI-NEXT:    v_mov_b32_e32 v3, s3
1341; VI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
1342; VI-NEXT:    s_endpgm
1343;
1344; GFX9-LABEL: v4i16_arg:
1345; GFX9:       ; %bb.0: ; %entry
1346; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
1347; GFX9-NEXT:    v_mov_b32_e32 v2, 0
1348; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1349; GFX9-NEXT:    v_mov_b32_e32 v0, s2
1350; GFX9-NEXT:    v_mov_b32_e32 v1, s3
1351; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
1352; GFX9-NEXT:    s_endpgm
1353;
1354; EG-LABEL: v4i16_arg:
1355; EG:       ; %bb.0: ; %entry
1356; EG-NEXT:    ALU 3, @4, KC0[CB0:0-32], KC1[]
1357; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
1358; EG-NEXT:    CF_END
1359; EG-NEXT:    PAD
1360; EG-NEXT:    ALU clause starting at 4:
1361; EG-NEXT:     MOV * T0.Y, KC0[3].X,
1362; EG-NEXT:     MOV T0.X, KC0[2].W,
1363; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
1364; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1365;
1366; CM-LABEL: v4i16_arg:
1367; CM:       ; %bb.0: ; %entry
1368; CM-NEXT:    ALU 3, @4, KC0[CB0:0-32], KC1[]
1369; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T0, T1.X
1370; CM-NEXT:    CF_END
1371; CM-NEXT:    PAD
1372; CM-NEXT:    ALU clause starting at 4:
1373; CM-NEXT:     MOV * T0.Y, KC0[3].X,
1374; CM-NEXT:     MOV * T0.X, KC0[2].W,
1375; CM-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
1376; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1377entry:
; Sole operation of the kernel - write the by-value vector argument to %out.
; No align is given on this store.
1378  store <4 x i16> %in, ptr addrspace(1) %out
1379  ret void
1380}
1381
; Kernel-argument lowering test for a <4 x i32> vector passed by value.
; The kernel receives a global (addrspace(1)) pointer %out and the vector %in
; and only stores %in to %out with 4-byte alignment.
; In the SI, VI and GFX9 checks the vector is fetched with one s_load_dwordx4
; and written with a single dwordx4 store; the EG and CM checks use one
; MEM_RAT_CACHELESS store.
; The assertions are autogenerated (see the note on the first line of the file)
; and should be regenerated with utils/update_llc_test_checks.py, not hand-edited.
1382define amdgpu_kernel void @v4i32_arg(ptr addrspace(1) nocapture %out, <4 x i32> %in) nounwind {
1383; SI-LABEL: v4i32_arg:
1384; SI:       ; %bb.0: ; %entry
1385; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0xd
1386; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x9
1387; SI-NEXT:    s_mov_b32 s7, 0xf000
1388; SI-NEXT:    s_mov_b32 s6, -1
1389; SI-NEXT:    s_waitcnt lgkmcnt(0)
1390; SI-NEXT:    v_mov_b32_e32 v0, s0
1391; SI-NEXT:    v_mov_b32_e32 v1, s1
1392; SI-NEXT:    v_mov_b32_e32 v2, s2
1393; SI-NEXT:    v_mov_b32_e32 v3, s3
1394; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
1395; SI-NEXT:    s_endpgm
1396;
1397; VI-LABEL: v4i32_arg:
1398; VI:       ; %bb.0: ; %entry
1399; VI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
1400; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x34
1401; VI-NEXT:    s_waitcnt lgkmcnt(0)
1402; VI-NEXT:    v_mov_b32_e32 v4, s6
1403; VI-NEXT:    v_mov_b32_e32 v0, s0
1404; VI-NEXT:    v_mov_b32_e32 v5, s7
1405; VI-NEXT:    v_mov_b32_e32 v1, s1
1406; VI-NEXT:    v_mov_b32_e32 v2, s2
1407; VI-NEXT:    v_mov_b32_e32 v3, s3
1408; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
1409; VI-NEXT:    s_endpgm
1410;
1411; GFX9-LABEL: v4i32_arg:
1412; GFX9:       ; %bb.0: ; %entry
1413; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x10
1414; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
1415; GFX9-NEXT:    v_mov_b32_e32 v4, 0
1416; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1417; GFX9-NEXT:    v_mov_b32_e32 v0, s0
1418; GFX9-NEXT:    v_mov_b32_e32 v1, s1
1419; GFX9-NEXT:    v_mov_b32_e32 v2, s2
1420; GFX9-NEXT:    v_mov_b32_e32 v3, s3
1421; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[4:5]
1422; GFX9-NEXT:    s_endpgm
1423;
1424; EG-LABEL: v4i32_arg:
1425; EG:       ; %bb.0: ; %entry
1426; EG-NEXT:    ALU 5, @4, KC0[CB0:0-32], KC1[]
1427; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
1428; EG-NEXT:    CF_END
1429; EG-NEXT:    PAD
1430; EG-NEXT:    ALU clause starting at 4:
1431; EG-NEXT:     MOV * T0.W, KC0[4].X,
1432; EG-NEXT:     MOV * T0.Z, KC0[3].W,
1433; EG-NEXT:     MOV * T0.Y, KC0[3].Z,
1434; EG-NEXT:     MOV T0.X, KC0[3].Y,
1435; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
1436; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1437;
1438; CM-LABEL: v4i32_arg:
1439; CM:       ; %bb.0: ; %entry
1440; CM-NEXT:    ALU 5, @4, KC0[CB0:0-32], KC1[]
1441; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T0, T1.X
1442; CM-NEXT:    CF_END
1443; CM-NEXT:    PAD
1444; CM-NEXT:    ALU clause starting at 4:
1445; CM-NEXT:     MOV * T0.W, KC0[4].X,
1446; CM-NEXT:     MOV * T0.Z, KC0[3].W,
1447; CM-NEXT:     MOV * T0.Y, KC0[3].Z,
1448; CM-NEXT:     MOV * T0.X, KC0[3].Y,
1449; CM-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
1450; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1451entry:
; Sole operation of the kernel - write the by-value vector argument to %out
; with an explicit 4-byte alignment.
1452  store <4 x i32> %in, ptr addrspace(1) %out, align 4
1453  ret void
1454}
1455
; Kernel-argument lowering test for a <4 x float> vector passed by value.
; The kernel receives a global (addrspace(1)) pointer %out and the vector %in
; and only stores %in to %out with 4-byte alignment.
; In the SI, VI and GFX9 checks the vector is fetched with one s_load_dwordx4
; and written with a single dwordx4 store; the EG and CM checks use one
; MEM_RAT_CACHELESS store.
; The assertions are autogenerated (see the note on the first line of the file)
; and should be regenerated with utils/update_llc_test_checks.py, not hand-edited.
1456define amdgpu_kernel void @v4f32_arg(ptr addrspace(1) nocapture %out, <4 x float> %in) nounwind {
1457; SI-LABEL: v4f32_arg:
1458; SI:       ; %bb.0: ; %entry
1459; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0xd
1460; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x9
1461; SI-NEXT:    s_mov_b32 s7, 0xf000
1462; SI-NEXT:    s_mov_b32 s6, -1
1463; SI-NEXT:    s_waitcnt lgkmcnt(0)
1464; SI-NEXT:    v_mov_b32_e32 v0, s0
1465; SI-NEXT:    v_mov_b32_e32 v1, s1
1466; SI-NEXT:    v_mov_b32_e32 v2, s2
1467; SI-NEXT:    v_mov_b32_e32 v3, s3
1468; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
1469; SI-NEXT:    s_endpgm
1470;
1471; VI-LABEL: v4f32_arg:
1472; VI:       ; %bb.0: ; %entry
1473; VI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
1474; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x34
1475; VI-NEXT:    s_waitcnt lgkmcnt(0)
1476; VI-NEXT:    v_mov_b32_e32 v4, s6
1477; VI-NEXT:    v_mov_b32_e32 v0, s0
1478; VI-NEXT:    v_mov_b32_e32 v5, s7
1479; VI-NEXT:    v_mov_b32_e32 v1, s1
1480; VI-NEXT:    v_mov_b32_e32 v2, s2
1481; VI-NEXT:    v_mov_b32_e32 v3, s3
1482; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
1483; VI-NEXT:    s_endpgm
1484;
1485; GFX9-LABEL: v4f32_arg:
1486; GFX9:       ; %bb.0: ; %entry
1487; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x10
1488; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
1489; GFX9-NEXT:    v_mov_b32_e32 v4, 0
1490; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1491; GFX9-NEXT:    v_mov_b32_e32 v0, s0
1492; GFX9-NEXT:    v_mov_b32_e32 v1, s1
1493; GFX9-NEXT:    v_mov_b32_e32 v2, s2
1494; GFX9-NEXT:    v_mov_b32_e32 v3, s3
1495; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[4:5]
1496; GFX9-NEXT:    s_endpgm
1497;
1498; EG-LABEL: v4f32_arg:
1499; EG:       ; %bb.0: ; %entry
1500; EG-NEXT:    ALU 5, @4, KC0[CB0:0-32], KC1[]
1501; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
1502; EG-NEXT:    CF_END
1503; EG-NEXT:    PAD
1504; EG-NEXT:    ALU clause starting at 4:
1505; EG-NEXT:     MOV * T0.W, KC0[4].X,
1506; EG-NEXT:     MOV * T0.Z, KC0[3].W,
1507; EG-NEXT:     MOV * T0.Y, KC0[3].Z,
1508; EG-NEXT:     MOV T0.X, KC0[3].Y,
1509; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
1510; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1511;
1512; CM-LABEL: v4f32_arg:
1513; CM:       ; %bb.0: ; %entry
1514; CM-NEXT:    ALU 5, @4, KC0[CB0:0-32], KC1[]
1515; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T0, T1.X
1516; CM-NEXT:    CF_END
1517; CM-NEXT:    PAD
1518; CM-NEXT:    ALU clause starting at 4:
1519; CM-NEXT:     MOV * T0.W, KC0[4].X,
1520; CM-NEXT:     MOV * T0.Z, KC0[3].W,
1521; CM-NEXT:     MOV * T0.Y, KC0[3].Z,
1522; CM-NEXT:     MOV * T0.X, KC0[3].Y,
1523; CM-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
1524; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1525entry:
; Sole operation of the kernel - write the by-value vector argument to %out
; with an explicit 4-byte alignment.
1526  store <4 x float> %in, ptr addrspace(1) %out, align 4
1527  ret void
1528}
1529
; Kernel-argument lowering test for a <5 x i8> vector passed by value.
; The kernel receives a global (addrspace(1)) pointer %out and the vector %in
; and only stores %in to %out with 4-byte alignment.
; In the SI, VI and GFX9 checks one s_load_dwordx4 fetches both arguments and
; the value is written as a dword store plus a byte store at offset 4; the EG
; and CM checks read the five bytes with separate VTX_READ_8 fetches and pair
; one MEM_RAT MSKOR write with one cacheless store.
; The assertions are autogenerated (see the note on the first line of the file)
; and should be regenerated with utils/update_llc_test_checks.py, not hand-edited.
1530define amdgpu_kernel void @v5i8_arg(ptr addrspace(1) nocapture %out, <5 x i8> %in) nounwind {
1531; SI-LABEL: v5i8_arg:
1532; SI:       ; %bb.0: ; %entry
1533; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
1534; SI-NEXT:    s_mov_b32 s7, 0xf000
1535; SI-NEXT:    s_mov_b32 s6, -1
1536; SI-NEXT:    s_waitcnt lgkmcnt(0)
1537; SI-NEXT:    s_mov_b32 s4, s0
1538; SI-NEXT:    s_mov_b32 s5, s1
1539; SI-NEXT:    v_mov_b32_e32 v0, s3
1540; SI-NEXT:    buffer_store_byte v0, off, s[4:7], 0 offset:4
1541; SI-NEXT:    s_waitcnt expcnt(0)
1542; SI-NEXT:    v_mov_b32_e32 v0, s2
1543; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1544; SI-NEXT:    s_endpgm
1545;
1546; VI-LABEL: v5i8_arg:
1547; VI:       ; %bb.0: ; %entry
1548; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1549; VI-NEXT:    s_waitcnt lgkmcnt(0)
1550; VI-NEXT:    s_add_u32 s4, s0, 4
1551; VI-NEXT:    s_addc_u32 s5, s1, 0
1552; VI-NEXT:    v_mov_b32_e32 v2, s4
1553; VI-NEXT:    v_mov_b32_e32 v4, s3
1554; VI-NEXT:    v_mov_b32_e32 v0, s0
1555; VI-NEXT:    v_mov_b32_e32 v3, s5
1556; VI-NEXT:    v_mov_b32_e32 v1, s1
1557; VI-NEXT:    v_mov_b32_e32 v5, s2
1558; VI-NEXT:    flat_store_byte v[2:3], v4
1559; VI-NEXT:    flat_store_dword v[0:1], v5
1560; VI-NEXT:    s_endpgm
1561;
1562; GFX9-LABEL: v5i8_arg:
1563; GFX9:       ; %bb.0: ; %entry
1564; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
1565; GFX9-NEXT:    v_mov_b32_e32 v0, 0
1566; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1567; GFX9-NEXT:    v_mov_b32_e32 v1, s3
1568; GFX9-NEXT:    v_mov_b32_e32 v2, s2
1569; GFX9-NEXT:    global_store_byte v0, v1, s[0:1] offset:4
1570; GFX9-NEXT:    global_store_dword v0, v2, s[0:1]
1571; GFX9-NEXT:    s_endpgm
1572;
1573; EG-LABEL: v5i8_arg:
1574; EG:       ; %bb.0: ; %entry
1575; EG-NEXT:    ALU 0, @16, KC0[], KC1[]
1576; EG-NEXT:    TEX 4 @6
1577; EG-NEXT:    ALU 28, @17, KC0[CB0:0-32], KC1[]
1578; EG-NEXT:    MEM_RAT MSKOR T5.XW, T8.X
1579; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T6.X, T7.X, 1
1580; EG-NEXT:    CF_END
1581; EG-NEXT:    Fetch clause starting at 6:
1582; EG-NEXT:     VTX_READ_8 T6.X, T5.X, 44, #3
1583; EG-NEXT:     VTX_READ_8 T7.X, T5.X, 47, #3
1584; EG-NEXT:     VTX_READ_8 T8.X, T5.X, 45, #3
1585; EG-NEXT:     VTX_READ_8 T9.X, T5.X, 46, #3
1586; EG-NEXT:     VTX_READ_8 T5.X, T5.X, 48, #3
1587; EG-NEXT:    ALU clause starting at 16:
1588; EG-NEXT:     MOV * T5.X, 0.0,
1589; EG-NEXT:    ALU clause starting at 17:
1590; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
1591; EG-NEXT:    4(5.605194e-45), 0(0.000000e+00)
1592; EG-NEXT:     AND_INT T1.W, PV.W, literal.x,
1593; EG-NEXT:     AND_INT * T2.W, T5.X, literal.y,
1594; EG-NEXT:    3(4.203895e-45), 255(3.573311e-43)
1595; EG-NEXT:     LSHL * T1.W, PV.W, literal.x,
1596; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
1597; EG-NEXT:     LSHL T5.X, T2.W, PV.W,
1598; EG-NEXT:     LSHL * T5.W, literal.x, PV.W,
1599; EG-NEXT:    255(3.573311e-43), 0(0.000000e+00)
1600; EG-NEXT:     MOV T5.Y, 0.0,
1601; EG-NEXT:     MOV T5.Z, 0.0,
1602; EG-NEXT:     AND_INT T1.W, T9.X, literal.x,
1603; EG-NEXT:     AND_INT * T0.Z, T8.X, literal.x,
1604; EG-NEXT:    255(3.573311e-43), 0(0.000000e+00)
1605; EG-NEXT:     LSHL T1.W, PV.W, literal.x,
1606; EG-NEXT:     LSHL * T2.W, T7.X, literal.y,
1607; EG-NEXT:    16(2.242078e-44), 24(3.363116e-44)
1608; EG-NEXT:     OR_INT T1.W, PS, PV.W,
1609; EG-NEXT:     LSHL * T2.W, T0.Z, literal.x,
1610; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
1611; EG-NEXT:     OR_INT T1.W, PV.W, PS,
1612; EG-NEXT:     AND_INT * T2.W, T6.X, literal.x,
1613; EG-NEXT:    255(3.573311e-43), 0(0.000000e+00)
1614; EG-NEXT:     OR_INT T6.X, PV.W, PS,
1615; EG-NEXT:     LSHR * T7.X, KC0[2].Y, literal.x,
1616; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1617; EG-NEXT:     LSHR * T8.X, T0.W, literal.x,
1618; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1619;
1620; CM-LABEL: v5i8_arg:
1621; CM:       ; %bb.0: ; %entry
1622; CM-NEXT:    ALU 0, @16, KC0[], KC1[]
1623; CM-NEXT:    TEX 4 @6
1624; CM-NEXT:    ALU 28, @17, KC0[CB0:0-32], KC1[]
1625; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T6.X, T8.X
1626; CM-NEXT:    MEM_RAT MSKOR T5.XW, T7.X
1627; CM-NEXT:    CF_END
1628; CM-NEXT:    Fetch clause starting at 6:
1629; CM-NEXT:     VTX_READ_8 T6.X, T5.X, 44, #3
1630; CM-NEXT:     VTX_READ_8 T7.X, T5.X, 47, #3
1631; CM-NEXT:     VTX_READ_8 T8.X, T5.X, 45, #3
1632; CM-NEXT:     VTX_READ_8 T9.X, T5.X, 46, #3
1633; CM-NEXT:     VTX_READ_8 T5.X, T5.X, 48, #3
1634; CM-NEXT:    ALU clause starting at 16:
1635; CM-NEXT:     MOV * T5.X, 0.0,
1636; CM-NEXT:    ALU clause starting at 17:
1637; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
1638; CM-NEXT:    4(5.605194e-45), 0(0.000000e+00)
1639; CM-NEXT:     AND_INT * T1.W, PV.W, literal.x,
1640; CM-NEXT:    3(4.203895e-45), 0(0.000000e+00)
1641; CM-NEXT:     AND_INT T0.Z, T5.X, literal.x,
1642; CM-NEXT:     LSHL * T1.W, PV.W, literal.y,
1643; CM-NEXT:    255(3.573311e-43), 3(4.203895e-45)
1644; CM-NEXT:     LSHL T5.X, PV.Z, PV.W,
1645; CM-NEXT:     LSHL * T5.W, literal.x, PV.W,
1646; CM-NEXT:    255(3.573311e-43), 0(0.000000e+00)
1647; CM-NEXT:     MOV T5.Y, 0.0,
1648; CM-NEXT:     MOV T5.Z, 0.0,
1649; CM-NEXT:     AND_INT * T1.W, T9.X, literal.x,
1650; CM-NEXT:    255(3.573311e-43), 0(0.000000e+00)
1651; CM-NEXT:     AND_INT T0.Y, T8.X, literal.x,
1652; CM-NEXT:     LSHL T0.Z, PV.W, literal.y,
1653; CM-NEXT:     LSHL * T1.W, T7.X, literal.z, BS:VEC_120/SCL_212
1654; CM-NEXT:    255(3.573311e-43), 16(2.242078e-44)
1655; CM-NEXT:    24(3.363116e-44), 0(0.000000e+00)
1656; CM-NEXT:     OR_INT T0.Z, PV.W, PV.Z,
1657; CM-NEXT:     LSHL * T1.W, PV.Y, literal.x,
1658; CM-NEXT:    8(1.121039e-44), 0(0.000000e+00)
1659; CM-NEXT:     LSHR T7.X, T0.W, literal.x,
1660; CM-NEXT:     OR_INT T0.Z, PV.Z, PV.W,
1661; CM-NEXT:     AND_INT * T0.W, T6.X, literal.y,
1662; CM-NEXT:    2(2.802597e-45), 255(3.573311e-43)
1663; CM-NEXT:     OR_INT * T6.X, PV.Z, PV.W,
1664; CM-NEXT:     LSHR * T8.X, KC0[2].Y, literal.x,
1665; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1666entry:
; Sole operation of the kernel - write the by-value vector argument to %out
; with an explicit 4-byte alignment.
1667  store <5 x i8> %in, ptr addrspace(1) %out, align 4
1668  ret void
1669}
1670
; Kernel-argument lowering test for a <5 x i16> argument: the kernel stores
; %in to the global pointer %out with 4-byte alignment.
; Per the autogenerated checks below, the SI, VI and GFX9 outputs write the
; vector as one dwordx2 store plus one 16-bit store at byte offset 8, while
; the EG and CM outputs use five VTX_READ_16 fetches and five MEM_RAT MSKOR
; stores.
; The check lines are produced by utils/update_llc_test_checks.py (see the
; NOTE on the first line of the file); regenerate them rather than editing
; them by hand.
1671define amdgpu_kernel void @v5i16_arg(ptr addrspace(1) nocapture %out, <5 x i16> %in) nounwind {
1672; SI-LABEL: v5i16_arg:
1673; SI:       ; %bb.0: ; %entry
1674; SI-NEXT:    s_load_dword s6, s[4:5], 0xf
1675; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
1676; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
1677; SI-NEXT:    s_mov_b32 s3, 0xf000
1678; SI-NEXT:    s_mov_b32 s2, -1
1679; SI-NEXT:    s_waitcnt lgkmcnt(0)
1680; SI-NEXT:    v_mov_b32_e32 v0, s6
1681; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0 offset:8
1682; SI-NEXT:    s_waitcnt expcnt(0)
1683; SI-NEXT:    v_mov_b32_e32 v0, s4
1684; SI-NEXT:    v_mov_b32_e32 v1, s5
1685; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1686; SI-NEXT:    s_endpgm
1687;
1688; VI-LABEL: v5i16_arg:
1689; VI:       ; %bb.0: ; %entry
1690; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
1691; VI-NEXT:    s_load_dword s6, s[4:5], 0x3c
1692; VI-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x34
1693; VI-NEXT:    s_waitcnt lgkmcnt(0)
1694; VI-NEXT:    s_add_u32 s4, s0, 8
1695; VI-NEXT:    s_addc_u32 s5, s1, 0
1696; VI-NEXT:    v_mov_b32_e32 v2, s4
1697; VI-NEXT:    v_mov_b32_e32 v4, s6
1698; VI-NEXT:    v_mov_b32_e32 v3, s5
1699; VI-NEXT:    v_mov_b32_e32 v0, s0
1700; VI-NEXT:    flat_store_short v[2:3], v4
1701; VI-NEXT:    v_mov_b32_e32 v2, s2
1702; VI-NEXT:    v_mov_b32_e32 v1, s1
1703; VI-NEXT:    v_mov_b32_e32 v3, s3
1704; VI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
1705; VI-NEXT:    s_endpgm
1706;
1707; GFX9-LABEL: v5i16_arg:
1708; GFX9:       ; %bb.0: ; %entry
1709; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x10
1710; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
1711; GFX9-NEXT:    v_mov_b32_e32 v2, 0
1712; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1713; GFX9-NEXT:    v_mov_b32_e32 v3, s2
1714; GFX9-NEXT:    v_mov_b32_e32 v0, s0
1715; GFX9-NEXT:    v_mov_b32_e32 v1, s1
1716; GFX9-NEXT:    global_store_short v2, v3, s[4:5] offset:8
1717; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
1718; GFX9-NEXT:    s_endpgm
1719;
1720; EG-LABEL: v5i16_arg:
1721; EG:       ; %bb.0: ; %entry
1722; EG-NEXT:    ALU 0, @20, KC0[], KC1[]
1723; EG-NEXT:    TEX 4 @10
1724; EG-NEXT:    ALU 65, @21, KC0[CB0:0-32], KC1[]
1725; EG-NEXT:    MEM_RAT MSKOR T5.XW, T9.X
1726; EG-NEXT:    MEM_RAT MSKOR T4.XW, T7.X
1727; EG-NEXT:    MEM_RAT MSKOR T3.XW, T2.X
1728; EG-NEXT:    MEM_RAT MSKOR T6.XW, T1.X
1729; EG-NEXT:    MEM_RAT MSKOR T8.XW, T0.X
1730; EG-NEXT:    CF_END
1731; EG-NEXT:    PAD
1732; EG-NEXT:    Fetch clause starting at 10:
1733; EG-NEXT:     VTX_READ_16 T1.X, T0.X, 58, #3
1734; EG-NEXT:     VTX_READ_16 T2.X, T0.X, 56, #3
1735; EG-NEXT:     VTX_READ_16 T3.X, T0.X, 54, #3
1736; EG-NEXT:     VTX_READ_16 T4.X, T0.X, 52, #3
1737; EG-NEXT:     VTX_READ_16 T0.X, T0.X, 60, #3
1738; EG-NEXT:    ALU clause starting at 20:
1739; EG-NEXT:     MOV * T0.X, 0.0,
1740; EG-NEXT:    ALU clause starting at 21:
1741; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
1742; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
1743; EG-NEXT:     AND_INT T1.W, PV.W, literal.x,
1744; EG-NEXT:     AND_INT * T2.W, T0.X, literal.y,
1745; EG-NEXT:    3(4.203895e-45), 65535(9.183409e-41)
1746; EG-NEXT:     LSHL * T1.W, PV.W, literal.x,
1747; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
1748; EG-NEXT:     LSHL T5.X, T2.W, PV.W,
1749; EG-NEXT:     LSHL * T5.W, literal.x, PV.W,
1750; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
1751; EG-NEXT:     MOV T5.Y, 0.0,
1752; EG-NEXT:     AND_INT T1.W, KC0[2].Y, literal.x,
1753; EG-NEXT:     AND_INT * T2.W, T4.X, literal.y,
1754; EG-NEXT:    3(4.203895e-45), 65535(9.183409e-41)
1755; EG-NEXT:     LSHL * T1.W, PV.W, literal.x,
1756; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
1757; EG-NEXT:     LSHL T4.X, T2.W, PV.W,
1758; EG-NEXT:     LSHL * T4.W, literal.x, PV.W,
1759; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
1760; EG-NEXT:     MOV T4.Y, 0.0,
1761; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.x,
1762; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1763; EG-NEXT:     AND_INT T2.W, PV.W, literal.x,
1764; EG-NEXT:     AND_INT * T3.W, T3.X, literal.y,
1765; EG-NEXT:    3(4.203895e-45), 65535(9.183409e-41)
1766; EG-NEXT:     LSHL * T2.W, PV.W, literal.x,
1767; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
1768; EG-NEXT:     LSHL T3.X, T3.W, PV.W,
1769; EG-NEXT:     LSHL * T3.W, literal.x, PV.W,
1770; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
1771; EG-NEXT:     MOV T3.Y, 0.0,
1772; EG-NEXT:     ADD_INT * T2.W, KC0[2].Y, literal.x,
1773; EG-NEXT:    4(5.605194e-45), 0(0.000000e+00)
1774; EG-NEXT:     AND_INT T6.W, PV.W, literal.x,
1775; EG-NEXT:     AND_INT * T7.W, T2.X, literal.y,
1776; EG-NEXT:    3(4.203895e-45), 65535(9.183409e-41)
1777; EG-NEXT:     LSHL * T6.W, PV.W, literal.x,
1778; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
1779; EG-NEXT:     LSHL T6.X, T7.W, PV.W,
1780; EG-NEXT:     LSHL * T6.W, literal.x, PV.W,
1781; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
1782; EG-NEXT:     MOV T6.Y, 0.0,
1783; EG-NEXT:     ADD_INT * T7.W, KC0[2].Y, literal.x,
1784; EG-NEXT:    6(8.407791e-45), 0(0.000000e+00)
1785; EG-NEXT:     AND_INT T8.W, PV.W, literal.x,
1786; EG-NEXT:     AND_INT * T9.W, T1.X, literal.y,
1787; EG-NEXT:    3(4.203895e-45), 65535(9.183409e-41)
1788; EG-NEXT:     LSHL * T8.W, PV.W, literal.x,
1789; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
1790; EG-NEXT:     LSHL T8.X, T9.W, PV.W,
1791; EG-NEXT:     LSHL * T8.W, literal.x, PV.W,
1792; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
1793; EG-NEXT:     MOV T8.Y, 0.0,
1794; EG-NEXT:     MOV T5.Z, 0.0,
1795; EG-NEXT:     MOV * T4.Z, 0.0,
1796; EG-NEXT:     MOV T3.Z, 0.0,
1797; EG-NEXT:     MOV * T6.Z, 0.0,
1798; EG-NEXT:     MOV * T8.Z, 0.0,
1799; EG-NEXT:     LSHR T0.X, T7.W, literal.x,
1800; EG-NEXT:     LSHR * T1.X, T2.W, literal.x,
1801; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1802; EG-NEXT:     LSHR T2.X, T1.W, literal.x,
1803; EG-NEXT:     LSHR * T7.X, KC0[2].Y, literal.x,
1804; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1805; EG-NEXT:     LSHR * T9.X, T0.W, literal.x,
1806; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1807;
1808; CM-LABEL: v5i16_arg:
1809; CM:       ; %bb.0: ; %entry
1810; CM-NEXT:    ALU 0, @20, KC0[], KC1[]
1811; CM-NEXT:    TEX 4 @10
1812; CM-NEXT:    ALU 67, @21, KC0[CB0:0-32], KC1[]
1813; CM-NEXT:    MEM_RAT MSKOR T5.XW, T9.X
1814; CM-NEXT:    MEM_RAT MSKOR T4.XW, T7.X
1815; CM-NEXT:    MEM_RAT MSKOR T3.XW, T2.X
1816; CM-NEXT:    MEM_RAT MSKOR T6.XW, T1.X
1817; CM-NEXT:    MEM_RAT MSKOR T8.XW, T0.X
1818; CM-NEXT:    CF_END
1819; CM-NEXT:    PAD
1820; CM-NEXT:    Fetch clause starting at 10:
1821; CM-NEXT:     VTX_READ_16 T1.X, T0.X, 58, #3
1822; CM-NEXT:     VTX_READ_16 T2.X, T0.X, 56, #3
1823; CM-NEXT:     VTX_READ_16 T3.X, T0.X, 54, #3
1824; CM-NEXT:     VTX_READ_16 T4.X, T0.X, 52, #3
1825; CM-NEXT:     VTX_READ_16 T0.X, T0.X, 60, #3
1826; CM-NEXT:    ALU clause starting at 20:
1827; CM-NEXT:     MOV * T0.X, 0.0,
1828; CM-NEXT:    ALU clause starting at 21:
1829; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
1830; CM-NEXT:    8(1.121039e-44), 0(0.000000e+00)
1831; CM-NEXT:     AND_INT * T1.W, PV.W, literal.x,
1832; CM-NEXT:    3(4.203895e-45), 0(0.000000e+00)
1833; CM-NEXT:     AND_INT T0.Z, T0.X, literal.x,
1834; CM-NEXT:     LSHL * T1.W, PV.W, literal.y,
1835; CM-NEXT:    65535(9.183409e-41), 3(4.203895e-45)
1836; CM-NEXT:     LSHL T5.X, PV.Z, PV.W,
1837; CM-NEXT:     LSHL * T5.W, literal.x, PV.W,
1838; CM-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
1839; CM-NEXT:     MOV T5.Y, 0.0,
1840; CM-NEXT:     AND_INT * T1.W, KC0[2].Y, literal.x,
1841; CM-NEXT:    3(4.203895e-45), 0(0.000000e+00)
1842; CM-NEXT:     AND_INT T0.Z, T4.X, literal.x,
1843; CM-NEXT:     LSHL * T1.W, PV.W, literal.y,
1844; CM-NEXT:    65535(9.183409e-41), 3(4.203895e-45)
1845; CM-NEXT:     LSHL T4.X, PV.Z, PV.W,
1846; CM-NEXT:     LSHL * T4.W, literal.x, PV.W,
1847; CM-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
1848; CM-NEXT:     MOV T4.Y, 0.0,
1849; CM-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.x,
1850; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1851; CM-NEXT:     AND_INT * T2.W, PV.W, literal.x,
1852; CM-NEXT:    3(4.203895e-45), 0(0.000000e+00)
1853; CM-NEXT:     AND_INT T0.Z, T3.X, literal.x,
1854; CM-NEXT:     LSHL * T2.W, PV.W, literal.y,
1855; CM-NEXT:    65535(9.183409e-41), 3(4.203895e-45)
1856; CM-NEXT:     LSHL T3.X, PV.Z, PV.W,
1857; CM-NEXT:     LSHL * T3.W, literal.x, PV.W,
1858; CM-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
1859; CM-NEXT:     MOV T3.Y, 0.0,
1860; CM-NEXT:     ADD_INT * T2.W, KC0[2].Y, literal.x,
1861; CM-NEXT:    4(5.605194e-45), 0(0.000000e+00)
1862; CM-NEXT:     AND_INT * T6.W, PV.W, literal.x,
1863; CM-NEXT:    3(4.203895e-45), 0(0.000000e+00)
1864; CM-NEXT:     AND_INT T0.Z, T2.X, literal.x,
1865; CM-NEXT:     LSHL * T6.W, PV.W, literal.y,
1866; CM-NEXT:    65535(9.183409e-41), 3(4.203895e-45)
1867; CM-NEXT:     LSHL T6.X, PV.Z, PV.W,
1868; CM-NEXT:     LSHL * T6.W, literal.x, PV.W,
1869; CM-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
1870; CM-NEXT:     MOV T6.Y, 0.0,
1871; CM-NEXT:     ADD_INT * T7.W, KC0[2].Y, literal.x,
1872; CM-NEXT:    6(8.407791e-45), 0(0.000000e+00)
1873; CM-NEXT:     AND_INT * T8.W, PV.W, literal.x,
1874; CM-NEXT:    3(4.203895e-45), 0(0.000000e+00)
1875; CM-NEXT:     AND_INT T0.Z, T1.X, literal.x,
1876; CM-NEXT:     LSHL * T8.W, PV.W, literal.y,
1877; CM-NEXT:    65535(9.183409e-41), 3(4.203895e-45)
1878; CM-NEXT:     LSHL T8.X, PV.Z, PV.W,
1879; CM-NEXT:     LSHL * T8.W, literal.x, PV.W,
1880; CM-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
1881; CM-NEXT:     MOV T8.Y, 0.0,
1882; CM-NEXT:     MOV * T5.Z, 0.0,
1883; CM-NEXT:     MOV * T4.Z, 0.0,
1884; CM-NEXT:     MOV * T3.Z, 0.0,
1885; CM-NEXT:     MOV * T6.Z, 0.0,
1886; CM-NEXT:     MOV * T8.Z, 0.0,
1887; CM-NEXT:     LSHR * T0.X, T7.W, literal.x,
1888; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1889; CM-NEXT:     LSHR * T1.X, T2.W, literal.x,
1890; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1891; CM-NEXT:     LSHR * T2.X, T1.W, literal.x,
1892; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1893; CM-NEXT:     LSHR * T7.X, KC0[2].Y, literal.x,
1894; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1895; CM-NEXT:     LSHR * T9.X, T0.W, literal.x,
1896; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1897entry:
; The only operation in the kernel: write the incoming vector argument to %out.
1898  store <5 x i16> %in, ptr addrspace(1) %out, align 4
1899  ret void
1900}
1901
; Kernel-argument lowering test for a <5 x i32> argument: the kernel stores
; %in to the global pointer %out with 4-byte alignment.
; Per the autogenerated checks below, the SI, VI and GFX9 outputs write the
; vector as one dwordx4 store plus one dword store at byte offset 16. The EG
; and CM outputs contain no fetch clause; they move the values out of KC0[...]
; operands and issue two MEM_RAT_CACHELESS stores.
; The check lines are produced by utils/update_llc_test_checks.py (see the
; NOTE on the first line of the file); regenerate them rather than editing
; them by hand.
1902define amdgpu_kernel void @v5i32_arg(ptr addrspace(1) nocapture %out, <5 x i32> %in) nounwind {
1903; SI-LABEL: v5i32_arg:
1904; SI:       ; %bb.0: ; %entry
1905; SI-NEXT:    s_load_dword s8, s[4:5], 0x15
1906; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
1907; SI-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x11
1908; SI-NEXT:    s_mov_b32 s3, 0xf000
1909; SI-NEXT:    s_mov_b32 s2, -1
1910; SI-NEXT:    s_waitcnt lgkmcnt(0)
1911; SI-NEXT:    v_mov_b32_e32 v0, s8
1912; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:16
1913; SI-NEXT:    s_waitcnt expcnt(0)
1914; SI-NEXT:    v_mov_b32_e32 v0, s4
1915; SI-NEXT:    v_mov_b32_e32 v1, s5
1916; SI-NEXT:    v_mov_b32_e32 v2, s6
1917; SI-NEXT:    v_mov_b32_e32 v3, s7
1918; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
1919; SI-NEXT:    s_endpgm
1920;
1921; VI-LABEL: v5i32_arg:
1922; VI:       ; %bb.0: ; %entry
1923; VI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
1924; VI-NEXT:    s_load_dword s8, s[4:5], 0x54
1925; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x44
1926; VI-NEXT:    s_waitcnt lgkmcnt(0)
1927; VI-NEXT:    s_add_u32 s4, s6, 16
1928; VI-NEXT:    s_addc_u32 s5, s7, 0
1929; VI-NEXT:    v_mov_b32_e32 v0, s4
1930; VI-NEXT:    v_mov_b32_e32 v2, s8
1931; VI-NEXT:    v_mov_b32_e32 v1, s5
1932; VI-NEXT:    v_mov_b32_e32 v4, s6
1933; VI-NEXT:    flat_store_dword v[0:1], v2
1934; VI-NEXT:    v_mov_b32_e32 v0, s0
1935; VI-NEXT:    v_mov_b32_e32 v5, s7
1936; VI-NEXT:    v_mov_b32_e32 v1, s1
1937; VI-NEXT:    v_mov_b32_e32 v2, s2
1938; VI-NEXT:    v_mov_b32_e32 v3, s3
1939; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
1940; VI-NEXT:    s_endpgm
1941;
1942; GFX9-LABEL: v5i32_arg:
1943; GFX9:       ; %bb.0: ; %entry
1944; GFX9-NEXT:    s_load_dword s6, s[8:9], 0x30
1945; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x20
1946; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
1947; GFX9-NEXT:    v_mov_b32_e32 v4, 0
1948; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1949; GFX9-NEXT:    v_mov_b32_e32 v5, s6
1950; GFX9-NEXT:    v_mov_b32_e32 v0, s0
1951; GFX9-NEXT:    v_mov_b32_e32 v1, s1
1952; GFX9-NEXT:    v_mov_b32_e32 v2, s2
1953; GFX9-NEXT:    v_mov_b32_e32 v3, s3
1954; GFX9-NEXT:    global_store_dword v4, v5, s[4:5] offset:16
1955; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[4:5]
1956; GFX9-NEXT:    s_endpgm
1957;
1958; EG-LABEL: v5i32_arg:
1959; EG:       ; %bb.0: ; %entry
1960; EG-NEXT:    ALU 10, @4, KC0[CB0:0-32], KC1[]
1961; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T3.X, T2.X, 0
1962; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
1963; EG-NEXT:    CF_END
1964; EG-NEXT:    ALU clause starting at 4:
1965; EG-NEXT:     MOV * T0.W, KC0[5].X,
1966; EG-NEXT:     MOV * T0.Z, KC0[4].W,
1967; EG-NEXT:     MOV * T0.Y, KC0[4].Z,
1968; EG-NEXT:     MOV T0.X, KC0[4].Y,
1969; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
1970; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1971; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.x,
1972; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
1973; EG-NEXT:     LSHR T2.X, PV.W, literal.x,
1974; EG-NEXT:     MOV * T3.X, KC0[5].Y,
1975; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1976;
1977; CM-LABEL: v5i32_arg:
1978; CM:       ; %bb.0: ; %entry
1979; CM-NEXT:    ALU 10, @4, KC0[CB0:0-32], KC1[]
1980; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T0, T3.X
1981; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T2.X, T1.X
1982; CM-NEXT:    CF_END
1983; CM-NEXT:    ALU clause starting at 4:
1984; CM-NEXT:     ADD_INT T0.Z, KC0[2].Y, literal.x,
1985; CM-NEXT:     MOV * T0.W, KC0[5].X,
1986; CM-NEXT:    16(2.242078e-44), 0(0.000000e+00)
1987; CM-NEXT:     LSHR T1.X, PV.Z, literal.x,
1988; CM-NEXT:     MOV * T0.Z, KC0[4].W,
1989; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1990; CM-NEXT:     MOV T2.X, KC0[5].Y,
1991; CM-NEXT:     MOV * T0.Y, KC0[4].Z,
1992; CM-NEXT:     MOV * T0.X, KC0[4].Y,
1993; CM-NEXT:     LSHR * T3.X, KC0[2].Y, literal.x,
1994; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1995entry:
; The only operation in the kernel: write the incoming vector argument to %out.
1996  store <5 x i32> %in, ptr addrspace(1) %out, align 4
1997  ret void
1998}
1999
; Kernel-argument lowering test for a <5 x float> argument: the kernel stores
; %in to the global pointer %out with 4-byte alignment.
; Per the autogenerated checks below, the SI, VI and GFX9 outputs write the
; vector as one dwordx4 store plus one dword store at byte offset 16. In the
; GFX9 output the dwordx4 store is issued first, followed by s_nop 0 and then
; the store of the fifth element. The EG and CM outputs contain no fetch
; clause; they move the values out of KC0[...] operands and issue two
; MEM_RAT_CACHELESS stores.
; The check lines are produced by utils/update_llc_test_checks.py (see the
; NOTE on the first line of the file); regenerate them rather than editing
; them by hand.
2000define amdgpu_kernel void @v5f32_arg(ptr addrspace(1) nocapture %out, <5 x float> %in) nounwind {
2001; SI-LABEL: v5f32_arg:
2002; SI:       ; %bb.0: ; %entry
2003; SI-NEXT:    s_load_dword s8, s[4:5], 0x15
2004; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
2005; SI-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x11
2006; SI-NEXT:    s_mov_b32 s3, 0xf000
2007; SI-NEXT:    s_mov_b32 s2, -1
2008; SI-NEXT:    s_waitcnt lgkmcnt(0)
2009; SI-NEXT:    v_mov_b32_e32 v0, s8
2010; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:16
2011; SI-NEXT:    s_waitcnt expcnt(0)
2012; SI-NEXT:    v_mov_b32_e32 v0, s4
2013; SI-NEXT:    v_mov_b32_e32 v1, s5
2014; SI-NEXT:    v_mov_b32_e32 v2, s6
2015; SI-NEXT:    v_mov_b32_e32 v3, s7
2016; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
2017; SI-NEXT:    s_endpgm
2018;
2019; VI-LABEL: v5f32_arg:
2020; VI:       ; %bb.0: ; %entry
2021; VI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
2022; VI-NEXT:    s_load_dword s8, s[4:5], 0x54
2023; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x44
2024; VI-NEXT:    s_waitcnt lgkmcnt(0)
2025; VI-NEXT:    s_add_u32 s4, s6, 16
2026; VI-NEXT:    s_addc_u32 s5, s7, 0
2027; VI-NEXT:    v_mov_b32_e32 v1, s4
2028; VI-NEXT:    v_mov_b32_e32 v3, s8
2029; VI-NEXT:    v_mov_b32_e32 v2, s5
2030; VI-NEXT:    v_mov_b32_e32 v4, s6
2031; VI-NEXT:    v_mov_b32_e32 v0, s0
2032; VI-NEXT:    flat_store_dword v[1:2], v3
2033; VI-NEXT:    v_mov_b32_e32 v1, s1
2034; VI-NEXT:    v_mov_b32_e32 v2, s2
2035; VI-NEXT:    v_mov_b32_e32 v3, s3
2036; VI-NEXT:    v_mov_b32_e32 v5, s7
2037; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
2038; VI-NEXT:    s_endpgm
2039;
2040; GFX9-LABEL: v5f32_arg:
2041; GFX9:       ; %bb.0: ; %entry
2042; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x20
2043; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
2044; GFX9-NEXT:    s_load_dword s6, s[8:9], 0x30
2045; GFX9-NEXT:    v_mov_b32_e32 v4, 0
2046; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2047; GFX9-NEXT:    v_mov_b32_e32 v0, s0
2048; GFX9-NEXT:    v_mov_b32_e32 v1, s1
2049; GFX9-NEXT:    v_mov_b32_e32 v2, s2
2050; GFX9-NEXT:    v_mov_b32_e32 v3, s3
2051; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[4:5]
2052; GFX9-NEXT:    s_nop 0
2053; GFX9-NEXT:    v_mov_b32_e32 v0, s6
2054; GFX9-NEXT:    global_store_dword v4, v0, s[4:5] offset:16
2055; GFX9-NEXT:    s_endpgm
2056;
2057; EG-LABEL: v5f32_arg:
2058; EG:       ; %bb.0: ; %entry
2059; EG-NEXT:    ALU 10, @4, KC0[CB0:0-32], KC1[]
2060; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T3.X, T2.X, 0
2061; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
2062; EG-NEXT:    CF_END
2063; EG-NEXT:    ALU clause starting at 4:
2064; EG-NEXT:     MOV * T0.W, KC0[5].X,
2065; EG-NEXT:     MOV * T0.Z, KC0[4].W,
2066; EG-NEXT:     MOV * T0.Y, KC0[4].Z,
2067; EG-NEXT:     MOV T0.X, KC0[4].Y,
2068; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
2069; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
2070; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.x,
2071; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
2072; EG-NEXT:     LSHR T2.X, PV.W, literal.x,
2073; EG-NEXT:     MOV * T3.X, KC0[5].Y,
2074; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
2075;
2076; CM-LABEL: v5f32_arg:
2077; CM:       ; %bb.0: ; %entry
2078; CM-NEXT:    ALU 10, @4, KC0[CB0:0-32], KC1[]
2079; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T0, T3.X
2080; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T2.X, T1.X
2081; CM-NEXT:    CF_END
2082; CM-NEXT:    ALU clause starting at 4:
2083; CM-NEXT:     ADD_INT T0.Z, KC0[2].Y, literal.x,
2084; CM-NEXT:     MOV * T0.W, KC0[5].X,
2085; CM-NEXT:    16(2.242078e-44), 0(0.000000e+00)
2086; CM-NEXT:     LSHR T1.X, PV.Z, literal.x,
2087; CM-NEXT:     MOV * T0.Z, KC0[4].W,
2088; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
2089; CM-NEXT:     MOV T2.X, KC0[5].Y,
2090; CM-NEXT:     MOV * T0.Y, KC0[4].Z,
2091; CM-NEXT:     MOV * T0.X, KC0[4].Y,
2092; CM-NEXT:     LSHR * T3.X, KC0[2].Y, literal.x,
2093; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
2094entry:
; The only operation in the kernel: write the incoming vector argument to %out.
2095  store <5 x float> %in, ptr addrspace(1) %out, align 4
2096  ret void
2097}
2098
; Kernel-argument lowering test for a <5 x i64> argument: the kernel stores
; %in to the global pointer %out with 8-byte alignment.
; Per the autogenerated checks below, the SI, VI and GFX9 outputs write the
; vector as two dwordx4 stores (byte offsets 0 and 16) plus one dwordx2 store
; at byte offset 32. The EG and CM outputs contain no fetch clause; they move
; the values out of KC0[...] operands and issue three MEM_RAT_CACHELESS stores.
; The check lines are produced by utils/update_llc_test_checks.py (see the
; NOTE on the first line of the file); regenerate them rather than editing
; them by hand.
2099define amdgpu_kernel void @v5i64_arg(ptr addrspace(1) nocapture %out, <5 x i64> %in) nounwind {
2100; SI-LABEL: v5i64_arg:
2101; SI:       ; %bb.0: ; %entry
2102; SI-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x19
2103; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
2104; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x21
2105; SI-NEXT:    s_mov_b32 s3, 0xf000
2106; SI-NEXT:    s_mov_b32 s2, -1
2107; SI-NEXT:    s_waitcnt lgkmcnt(0)
2108; SI-NEXT:    v_mov_b32_e32 v0, s12
2109; SI-NEXT:    v_mov_b32_e32 v1, s13
2110; SI-NEXT:    v_mov_b32_e32 v2, s14
2111; SI-NEXT:    v_mov_b32_e32 v3, s15
2112; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
2113; SI-NEXT:    s_waitcnt expcnt(0)
2114; SI-NEXT:    v_mov_b32_e32 v0, s8
2115; SI-NEXT:    v_mov_b32_e32 v1, s9
2116; SI-NEXT:    v_mov_b32_e32 v2, s10
2117; SI-NEXT:    v_mov_b32_e32 v3, s11
2118; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
2119; SI-NEXT:    s_waitcnt expcnt(0)
2120; SI-NEXT:    v_mov_b32_e32 v0, s4
2121; SI-NEXT:    v_mov_b32_e32 v1, s5
2122; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0 offset:32
2123; SI-NEXT:    s_endpgm
2124;
2125; VI-LABEL: v5i64_arg:
2126; VI:       ; %bb.0: ; %entry
2127; VI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x24
2128; VI-NEXT:    s_load_dwordx2 s[10:11], s[4:5], 0x84
2129; VI-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x64
2130; VI-NEXT:    s_waitcnt lgkmcnt(0)
2131; VI-NEXT:    s_add_u32 s12, s8, 32
2132; VI-NEXT:    v_mov_b32_e32 v1, s10
2133; VI-NEXT:    s_addc_u32 s13, s9, 0
2134; VI-NEXT:    v_mov_b32_e32 v3, s12
2135; VI-NEXT:    v_mov_b32_e32 v2, s11
2136; VI-NEXT:    v_mov_b32_e32 v0, s4
2137; VI-NEXT:    v_mov_b32_e32 v4, s13
2138; VI-NEXT:    s_add_u32 s4, s8, 16
2139; VI-NEXT:    flat_store_dwordx2 v[3:4], v[1:2]
2140; VI-NEXT:    v_mov_b32_e32 v1, s5
2141; VI-NEXT:    s_addc_u32 s5, s9, 0
2142; VI-NEXT:    v_mov_b32_e32 v4, s4
2143; VI-NEXT:    v_mov_b32_e32 v2, s6
2144; VI-NEXT:    v_mov_b32_e32 v3, s7
2145; VI-NEXT:    v_mov_b32_e32 v5, s5
2146; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
2147; VI-NEXT:    v_mov_b32_e32 v4, s8
2148; VI-NEXT:    v_mov_b32_e32 v0, s0
2149; VI-NEXT:    v_mov_b32_e32 v1, s1
2150; VI-NEXT:    v_mov_b32_e32 v2, s2
2151; VI-NEXT:    v_mov_b32_e32 v3, s3
2152; VI-NEXT:    v_mov_b32_e32 v5, s9
2153; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
2154; VI-NEXT:    s_endpgm
2155;
2156; GFX9-LABEL: v5i64_arg:
2157; GFX9:       ; %bb.0: ; %entry
2158; GFX9-NEXT:    s_load_dwordx2 s[10:11], s[8:9], 0x60
2159; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[8:9], 0x40
2160; GFX9-NEXT:    s_load_dwordx2 s[12:13], s[8:9], 0x0
2161; GFX9-NEXT:    v_mov_b32_e32 v4, 0
2162; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2163; GFX9-NEXT:    v_mov_b32_e32 v1, s10
2164; GFX9-NEXT:    v_mov_b32_e32 v2, s11
2165; GFX9-NEXT:    v_mov_b32_e32 v0, s4
2166; GFX9-NEXT:    global_store_dwordx2 v4, v[1:2], s[12:13] offset:32
2167; GFX9-NEXT:    v_mov_b32_e32 v1, s5
2168; GFX9-NEXT:    v_mov_b32_e32 v2, s6
2169; GFX9-NEXT:    v_mov_b32_e32 v3, s7
2170; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[12:13] offset:16
2171; GFX9-NEXT:    s_nop 0
2172; GFX9-NEXT:    v_mov_b32_e32 v0, s0
2173; GFX9-NEXT:    v_mov_b32_e32 v1, s1
2174; GFX9-NEXT:    v_mov_b32_e32 v2, s2
2175; GFX9-NEXT:    v_mov_b32_e32 v3, s3
2176; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[12:13]
2177; GFX9-NEXT:    s_endpgm
2178;
2179; EG-LABEL: v5i64_arg:
2180; EG:       ; %bb.0: ; %entry
2181; EG-NEXT:    ALU 18, @6, KC0[CB0:0-32], KC1[]
2182; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T5.XY, T4.X, 0
2183; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T3.X, 0
2184; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T2.X, 1
2185; EG-NEXT:    CF_END
2186; EG-NEXT:    PAD
2187; EG-NEXT:    ALU clause starting at 6:
2188; EG-NEXT:     MOV * T0.W, KC0[7].X,
2189; EG-NEXT:     MOV * T0.Z, KC0[6].W,
2190; EG-NEXT:     MOV T0.Y, KC0[6].Z,
2191; EG-NEXT:     MOV * T1.W, KC0[8].X,
2192; EG-NEXT:     MOV T0.X, KC0[6].Y,
2193; EG-NEXT:     MOV * T1.Z, KC0[7].W,
2194; EG-NEXT:     LSHR T2.X, KC0[2].Y, literal.x,
2195; EG-NEXT:     MOV * T1.Y, KC0[7].Z,
2196; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
2197; EG-NEXT:     MOV T1.X, KC0[7].Y,
2198; EG-NEXT:     ADD_INT * T2.W, KC0[2].Y, literal.x,
2199; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
2200; EG-NEXT:     LSHR T3.X, PV.W, literal.x,
2201; EG-NEXT:     ADD_INT * T2.W, KC0[2].Y, literal.y,
2202; EG-NEXT:    2(2.802597e-45), 32(4.484155e-44)
2203; EG-NEXT:     LSHR T4.X, PV.W, literal.x,
2204; EG-NEXT:     MOV T5.Y, KC0[8].Z,
2205; EG-NEXT:     MOV * T5.X, KC0[8].Y,
2206; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
2207;
2208; CM-LABEL: v5i64_arg:
2209; CM:       ; %bb.0: ; %entry
2210; CM-NEXT:    ALU 18, @6, KC0[CB0:0-32], KC1[]
2211; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T2, T5.X
2212; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T0, T4.X
2213; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T1, T3.X
2214; CM-NEXT:    CF_END
2215; CM-NEXT:    PAD
2216; CM-NEXT:    ALU clause starting at 6:
2217; CM-NEXT:     MOV * T0.W, KC0[8].X,
2218; CM-NEXT:     MOV T1.Y, KC0[8].Z,
2219; CM-NEXT:     MOV * T0.Z, KC0[7].W,
2220; CM-NEXT:     MOV T1.X, KC0[8].Y,
2221; CM-NEXT:     MOV * T0.Y, KC0[7].Z,
2222; CM-NEXT:     MOV T0.X, KC0[7].Y,
2223; CM-NEXT:     ADD_INT T1.Z, KC0[2].Y, literal.x,
2224; CM-NEXT:     MOV * T2.W, KC0[7].X,
2225; CM-NEXT:    32(4.484155e-44), 0(0.000000e+00)
2226; CM-NEXT:     LSHR T3.X, PV.Z, literal.x,
2227; CM-NEXT:     MOV T2.Z, KC0[6].W,
2228; CM-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.y,
2229; CM-NEXT:    2(2.802597e-45), 16(2.242078e-44)
2230; CM-NEXT:     LSHR T4.X, PV.W, literal.x,
2231; CM-NEXT:     MOV * T2.Y, KC0[6].Z,
2232; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
2233; CM-NEXT:     MOV * T2.X, KC0[6].Y,
2234; CM-NEXT:     LSHR * T5.X, KC0[2].Y, literal.x,
2235; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
2236entry:
; The only operation in the kernel: write the incoming vector argument to %out.
2237  store <5 x i64> %in, ptr addrspace(1) %out, align 8
2238  ret void
2239}
2240
; Kernel-argument lowering test for a <5 x double> argument: the kernel stores
; %in to the global pointer %out with 8-byte alignment.
; Per the autogenerated checks below, the SI, VI and GFX9 outputs write the
; vector as two dwordx4 stores (byte offsets 0 and 16) plus one dwordx2 store
; at byte offset 32. The EG and CM outputs contain no fetch clause; they move
; the values out of KC0[...] operands and issue three MEM_RAT_CACHELESS stores.
; The check lines are produced by utils/update_llc_test_checks.py (see the
; NOTE on the first line of the file); regenerate them rather than editing
; them by hand.
2241define amdgpu_kernel void @v5f64_arg(ptr addrspace(1) nocapture %out, <5 x double> %in) nounwind {
2242; SI-LABEL: v5f64_arg:
2243; SI:       ; %bb.0: ; %entry
2244; SI-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x19
2245; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
2246; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x21
2247; SI-NEXT:    s_mov_b32 s3, 0xf000
2248; SI-NEXT:    s_mov_b32 s2, -1
2249; SI-NEXT:    s_waitcnt lgkmcnt(0)
2250; SI-NEXT:    v_mov_b32_e32 v0, s12
2251; SI-NEXT:    v_mov_b32_e32 v1, s13
2252; SI-NEXT:    v_mov_b32_e32 v2, s14
2253; SI-NEXT:    v_mov_b32_e32 v3, s15
2254; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
2255; SI-NEXT:    s_waitcnt expcnt(0)
2256; SI-NEXT:    v_mov_b32_e32 v0, s8
2257; SI-NEXT:    v_mov_b32_e32 v1, s9
2258; SI-NEXT:    v_mov_b32_e32 v2, s10
2259; SI-NEXT:    v_mov_b32_e32 v3, s11
2260; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
2261; SI-NEXT:    s_waitcnt expcnt(0)
2262; SI-NEXT:    v_mov_b32_e32 v0, s4
2263; SI-NEXT:    v_mov_b32_e32 v1, s5
2264; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0 offset:32
2265; SI-NEXT:    s_endpgm
2266;
2267; VI-LABEL: v5f64_arg:
2268; VI:       ; %bb.0: ; %entry
2269; VI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x24
2270; VI-NEXT:    s_load_dwordx2 s[10:11], s[4:5], 0x84
2271; VI-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x64
2272; VI-NEXT:    s_waitcnt lgkmcnt(0)
2273; VI-NEXT:    s_add_u32 s12, s8, 32
2274; VI-NEXT:    v_mov_b32_e32 v1, s10
2275; VI-NEXT:    s_addc_u32 s13, s9, 0
2276; VI-NEXT:    v_mov_b32_e32 v3, s12
2277; VI-NEXT:    v_mov_b32_e32 v2, s11
2278; VI-NEXT:    v_mov_b32_e32 v0, s4
2279; VI-NEXT:    v_mov_b32_e32 v4, s13
2280; VI-NEXT:    s_add_u32 s4, s8, 16
2281; VI-NEXT:    flat_store_dwordx2 v[3:4], v[1:2]
2282; VI-NEXT:    v_mov_b32_e32 v1, s5
2283; VI-NEXT:    s_addc_u32 s5, s9, 0
2284; VI-NEXT:    v_mov_b32_e32 v4, s4
2285; VI-NEXT:    v_mov_b32_e32 v2, s6
2286; VI-NEXT:    v_mov_b32_e32 v3, s7
2287; VI-NEXT:    v_mov_b32_e32 v5, s5
2288; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
2289; VI-NEXT:    v_mov_b32_e32 v4, s8
2290; VI-NEXT:    v_mov_b32_e32 v0, s0
2291; VI-NEXT:    v_mov_b32_e32 v1, s1
2292; VI-NEXT:    v_mov_b32_e32 v2, s2
2293; VI-NEXT:    v_mov_b32_e32 v3, s3
2294; VI-NEXT:    v_mov_b32_e32 v5, s9
2295; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
2296; VI-NEXT:    s_endpgm
2297;
2298; GFX9-LABEL: v5f64_arg:
2299; GFX9:       ; %bb.0: ; %entry
2300; GFX9-NEXT:    s_load_dwordx2 s[10:11], s[8:9], 0x60
2301; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[8:9], 0x40
2302; GFX9-NEXT:    s_load_dwordx2 s[12:13], s[8:9], 0x0
2303; GFX9-NEXT:    v_mov_b32_e32 v4, 0
2304; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2305; GFX9-NEXT:    v_mov_b32_e32 v1, s10
2306; GFX9-NEXT:    v_mov_b32_e32 v2, s11
2307; GFX9-NEXT:    v_mov_b32_e32 v0, s4
2308; GFX9-NEXT:    global_store_dwordx2 v4, v[1:2], s[12:13] offset:32
2309; GFX9-NEXT:    v_mov_b32_e32 v1, s5
2310; GFX9-NEXT:    v_mov_b32_e32 v2, s6
2311; GFX9-NEXT:    v_mov_b32_e32 v3, s7
2312; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[12:13] offset:16
2313; GFX9-NEXT:    s_nop 0
2314; GFX9-NEXT:    v_mov_b32_e32 v0, s0
2315; GFX9-NEXT:    v_mov_b32_e32 v1, s1
2316; GFX9-NEXT:    v_mov_b32_e32 v2, s2
2317; GFX9-NEXT:    v_mov_b32_e32 v3, s3
2318; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[12:13]
2319; GFX9-NEXT:    s_endpgm
2320;
2321; EG-LABEL: v5f64_arg:
2322; EG:       ; %bb.0: ; %entry
2323; EG-NEXT:    ALU 18, @6, KC0[CB0:0-32], KC1[]
2324; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T5.XY, T4.X, 0
2325; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T3.X, 0
2326; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T2.X, 1
2327; EG-NEXT:    CF_END
2328; EG-NEXT:    PAD
2329; EG-NEXT:    ALU clause starting at 6:
2330; EG-NEXT:     MOV * T0.W, KC0[7].X,
2331; EG-NEXT:     MOV * T0.Z, KC0[6].W,
2332; EG-NEXT:     MOV T0.Y, KC0[6].Z,
2333; EG-NEXT:     MOV * T1.W, KC0[8].X,
2334; EG-NEXT:     MOV T0.X, KC0[6].Y,
2335; EG-NEXT:     MOV * T1.Z, KC0[7].W,
2336; EG-NEXT:     LSHR T2.X, KC0[2].Y, literal.x,
2337; EG-NEXT:     MOV * T1.Y, KC0[7].Z,
2338; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
2339; EG-NEXT:     MOV T1.X, KC0[7].Y,
2340; EG-NEXT:     ADD_INT * T2.W, KC0[2].Y, literal.x,
2341; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
2342; EG-NEXT:     LSHR T3.X, PV.W, literal.x,
2343; EG-NEXT:     ADD_INT * T2.W, KC0[2].Y, literal.y,
2344; EG-NEXT:    2(2.802597e-45), 32(4.484155e-44)
2345; EG-NEXT:     LSHR T4.X, PV.W, literal.x,
2346; EG-NEXT:     MOV T5.Y, KC0[8].Z,
2347; EG-NEXT:     MOV * T5.X, KC0[8].Y,
2348; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
2349;
2350; CM-LABEL: v5f64_arg:
2351; CM:       ; %bb.0: ; %entry
2352; CM-NEXT:    ALU 18, @6, KC0[CB0:0-32], KC1[]
2353; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T2, T5.X
2354; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T0, T4.X
2355; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T1, T3.X
2356; CM-NEXT:    CF_END
2357; CM-NEXT:    PAD
2358; CM-NEXT:    ALU clause starting at 6:
2359; CM-NEXT:     MOV * T0.W, KC0[8].X,
2360; CM-NEXT:     MOV T1.Y, KC0[8].Z,
2361; CM-NEXT:     MOV * T0.Z, KC0[7].W,
2362; CM-NEXT:     MOV T1.X, KC0[8].Y,
2363; CM-NEXT:     MOV * T0.Y, KC0[7].Z,
2364; CM-NEXT:     MOV T0.X, KC0[7].Y,
2365; CM-NEXT:     ADD_INT T1.Z, KC0[2].Y, literal.x,
2366; CM-NEXT:     MOV * T2.W, KC0[7].X,
2367; CM-NEXT:    32(4.484155e-44), 0(0.000000e+00)
2368; CM-NEXT:     LSHR T3.X, PV.Z, literal.x,
2369; CM-NEXT:     MOV T2.Z, KC0[6].W,
2370; CM-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.y,
2371; CM-NEXT:    2(2.802597e-45), 16(2.242078e-44)
2372; CM-NEXT:     LSHR T4.X, PV.W, literal.x,
2373; CM-NEXT:     MOV * T2.Y, KC0[6].Z,
2374; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
2375; CM-NEXT:     MOV * T2.X, KC0[6].Y,
2376; CM-NEXT:     LSHR * T5.X, KC0[2].Y, literal.x,
2377; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
2378entry:
; The only operation in the kernel: write the incoming vector argument to %out.
2379  store <5 x double> %in, ptr addrspace(1) %out, align 8
2380  ret void
2381}
2382
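2383; FIXME: Lots of unpack and re-pack junk on VI. (This note looks stale; the VI checks below are now a plain s_load_dwordx4 plus flat_store_dwordx2, while the EG checks still fetch the argument with separate VTX_READ_8 clauses. Confirm and update or drop.)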
2384define amdgpu_kernel void @v8i8_arg(ptr addrspace(1) %out, <8 x i8> %in) {
2385; SI-LABEL: v8i8_arg:
2386; SI:       ; %bb.0: ; %entry
2387; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
2388; SI-NEXT:    s_mov_b32 s7, 0xf000
2389; SI-NEXT:    s_mov_b32 s6, -1
2390; SI-NEXT:    s_waitcnt lgkmcnt(0)
2391; SI-NEXT:    s_mov_b32 s4, s0
2392; SI-NEXT:    s_mov_b32 s5, s1
2393; SI-NEXT:    v_mov_b32_e32 v0, s2
2394; SI-NEXT:    v_mov_b32_e32 v1, s3
2395; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
2396; SI-NEXT:    s_endpgm
2397;
2398; VI-LABEL: v8i8_arg:
2399; VI:       ; %bb.0: ; %entry
2400; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2401; VI-NEXT:    s_waitcnt lgkmcnt(0)
2402; VI-NEXT:    v_mov_b32_e32 v0, s0
2403; VI-NEXT:    v_mov_b32_e32 v2, s2
2404; VI-NEXT:    v_mov_b32_e32 v1, s1
2405; VI-NEXT:    v_mov_b32_e32 v3, s3
2406; VI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
2407; VI-NEXT:    s_endpgm
2408;
2409; GFX9-LABEL: v8i8_arg:
2410; GFX9:       ; %bb.0: ; %entry
2411; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
2412; GFX9-NEXT:    v_mov_b32_e32 v2, 0
2413; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2414; GFX9-NEXT:    v_mov_b32_e32 v0, s2
2415; GFX9-NEXT:    v_mov_b32_e32 v1, s3
2416; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
2417; GFX9-NEXT:    s_endpgm
2418;
2419; EG-LABEL: v8i8_arg:
2420; EG:       ; %bb.0: ; %entry
2421; EG-NEXT:    ALU 1, @36, KC0[], KC1[]
2422; EG-NEXT:    TEX 0 @20
2423; EG-NEXT:    ALU 5, @38, KC0[], KC1[]
2424; EG-NEXT:    TEX 0 @22
2425; EG-NEXT:    ALU 5, @44, KC0[], KC1[]
2426; EG-NEXT:    TEX 0 @24
2427; EG-NEXT:    ALU 7, @50, KC0[], KC1[]
2428; EG-NEXT:    TEX 0 @26
2429; EG-NEXT:    ALU 7, @58, KC0[], KC1[]
2430; EG-NEXT:    TEX 0 @28
2431; EG-NEXT:    ALU 7, @66, KC0[], KC1[]
2432; EG-NEXT:    TEX 0 @30
2433; EG-NEXT:    ALU 7, @74, KC0[], KC1[]
2434; EG-NEXT:    TEX 0 @32
2435; EG-NEXT:    ALU 5, @82, KC0[], KC1[]
2436; EG-NEXT:    TEX 0 @34
2437; EG-NEXT:    ALU 5, @88, KC0[CB0:0-32], KC1[]
2438; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T5.XY, T6.X, 1
2439; EG-NEXT:    CF_END
2440; EG-NEXT:    PAD
2441; EG-NEXT:    Fetch clause starting at 20:
2442; EG-NEXT:     VTX_READ_8 T6.X, T5.X, 51, #3
2443; EG-NEXT:    Fetch clause starting at 22:
2444; EG-NEXT:     VTX_READ_8 T6.X, T5.X, 47, #3
2445; EG-NEXT:    Fetch clause starting at 24:
2446; EG-NEXT:     VTX_READ_8 T6.X, T5.X, 50, #3
2447; EG-NEXT:    Fetch clause starting at 26:
2448; EG-NEXT:     VTX_READ_8 T6.X, T5.X, 46, #3
2449; EG-NEXT:    Fetch clause starting at 28:
2450; EG-NEXT:     VTX_READ_8 T6.X, T5.X, 49, #3
2451; EG-NEXT:    Fetch clause starting at 30:
2452; EG-NEXT:     VTX_READ_8 T6.X, T5.X, 45, #3
2453; EG-NEXT:    Fetch clause starting at 32:
2454; EG-NEXT:     VTX_READ_8 T6.X, T5.X, 48, #3
2455; EG-NEXT:    Fetch clause starting at 34:
2456; EG-NEXT:     VTX_READ_8 T5.X, T5.X, 44, #3
2457; EG-NEXT:    ALU clause starting at 36:
2458; EG-NEXT:     MOV * T0.Y, T2.X,
2459; EG-NEXT:     MOV * T5.X, 0.0,
2460; EG-NEXT:    ALU clause starting at 38:
2461; EG-NEXT:     LSHL T0.W, T6.X, literal.x,
2462; EG-NEXT:     AND_INT * T1.W, T0.Y, literal.y,
2463; EG-NEXT:    24(3.363116e-44), 16777215(2.350989e-38)
2464; EG-NEXT:     OR_INT * T0.W, PS, PV.W,
2465; EG-NEXT:     MOV T2.X, PV.W,
2466; EG-NEXT:     MOV * T0.Y, T3.X,
2467; EG-NEXT:    ALU clause starting at 44:
2468; EG-NEXT:     AND_INT T0.W, T0.Y, literal.x,
2469; EG-NEXT:     LSHL * T1.W, T6.X, literal.y,
2470; EG-NEXT:    16777215(2.350989e-38), 24(3.363116e-44)
2471; EG-NEXT:     OR_INT * T0.W, PV.W, PS,
2472; EG-NEXT:     MOV T3.X, PV.W,
2473; EG-NEXT:     MOV * T0.Y, T2.X,
2474; EG-NEXT:    ALU clause starting at 50:
2475; EG-NEXT:     AND_INT T0.W, T6.X, literal.x,
2476; EG-NEXT:     AND_INT * T1.W, T0.Y, literal.y,
2477; EG-NEXT:    255(3.573311e-43), -16711681(-1.714704e+38)
2478; EG-NEXT:     LSHL * T0.W, PV.W, literal.x,
2479; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
2480; EG-NEXT:     OR_INT * T0.W, T1.W, PV.W,
2481; EG-NEXT:     MOV T2.X, PV.W,
2482; EG-NEXT:     MOV * T0.Y, T3.X,
2483; EG-NEXT:    ALU clause starting at 58:
2484; EG-NEXT:     AND_INT T0.W, T6.X, literal.x,
2485; EG-NEXT:     AND_INT * T1.W, T0.Y, literal.y,
2486; EG-NEXT:    255(3.573311e-43), -16711681(-1.714704e+38)
2487; EG-NEXT:     LSHL * T0.W, PV.W, literal.x,
2488; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
2489; EG-NEXT:     OR_INT * T0.W, T1.W, PV.W,
2490; EG-NEXT:     MOV T3.X, PV.W,
2491; EG-NEXT:     MOV * T0.Y, T2.X,
2492; EG-NEXT:    ALU clause starting at 66:
2493; EG-NEXT:     AND_INT T0.W, T6.X, literal.x,
2494; EG-NEXT:     AND_INT * T1.W, T0.Y, literal.y,
2495; EG-NEXT:    255(3.573311e-43), -65281(nan)
2496; EG-NEXT:     LSHL * T0.W, PV.W, literal.x,
2497; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
2498; EG-NEXT:     OR_INT * T0.W, T1.W, PV.W,
2499; EG-NEXT:     MOV T2.X, PV.W,
2500; EG-NEXT:     MOV * T0.Y, T3.X,
2501; EG-NEXT:    ALU clause starting at 74:
2502; EG-NEXT:     AND_INT T0.W, T6.X, literal.x,
2503; EG-NEXT:     AND_INT * T1.W, T0.Y, literal.y,
2504; EG-NEXT:    255(3.573311e-43), -65281(nan)
2505; EG-NEXT:     LSHL * T0.W, PV.W, literal.x,
2506; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
2507; EG-NEXT:     OR_INT * T0.W, T1.W, PV.W,
2508; EG-NEXT:     MOV T3.X, PV.W,
2509; EG-NEXT:     MOV * T0.Y, T2.X,
2510; EG-NEXT:    ALU clause starting at 82:
2511; EG-NEXT:     AND_INT T0.W, T0.Y, literal.x,
2512; EG-NEXT:     AND_INT * T1.W, T6.X, literal.y,
2513; EG-NEXT:    -256(nan), 255(3.573311e-43)
2514; EG-NEXT:     OR_INT * T5.Y, PV.W, PS,
2515; EG-NEXT:     MOV T2.X, PV.Y,
2516; EG-NEXT:     MOV * T0.Y, T3.X,
2517; EG-NEXT:    ALU clause starting at 88:
2518; EG-NEXT:     AND_INT T0.W, T0.Y, literal.x,
2519; EG-NEXT:     AND_INT * T1.W, T5.X, literal.y,
2520; EG-NEXT:    -256(nan), 255(3.573311e-43)
2521; EG-NEXT:     OR_INT T5.X, PV.W, PS,
2522; EG-NEXT:     LSHR * T6.X, KC0[2].Y, literal.x,
2523; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
2524;
2525; CM-LABEL: v8i8_arg:
2526; CM:       ; %bb.0: ; %entry
2527; CM-NEXT:    ALU 1, @36, KC0[], KC1[]
2528; CM-NEXT:    TEX 0 @20
2529; CM-NEXT:    ALU 5, @38, KC0[], KC1[]
2530; CM-NEXT:    TEX 0 @22
2531; CM-NEXT:    ALU 5, @44, KC0[], KC1[]
2532; CM-NEXT:    TEX 0 @24
2533; CM-NEXT:    ALU 7, @50, KC0[], KC1[]
2534; CM-NEXT:    TEX 0 @26
2535; CM-NEXT:    ALU 7, @58, KC0[], KC1[]
2536; CM-NEXT:    TEX 0 @28
2537; CM-NEXT:    ALU 7, @66, KC0[], KC1[]
2538; CM-NEXT:    TEX 0 @30
2539; CM-NEXT:    ALU 7, @74, KC0[], KC1[]
2540; CM-NEXT:    TEX 0 @32
2541; CM-NEXT:    ALU 5, @82, KC0[], KC1[]
2542; CM-NEXT:    TEX 0 @34
2543; CM-NEXT:    ALU 5, @88, KC0[CB0:0-32], KC1[]
2544; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T5, T6.X
2545; CM-NEXT:    CF_END
2546; CM-NEXT:    PAD
2547; CM-NEXT:    Fetch clause starting at 20:
2548; CM-NEXT:     VTX_READ_8 T6.X, T5.X, 51, #3
2549; CM-NEXT:    Fetch clause starting at 22:
2550; CM-NEXT:     VTX_READ_8 T6.X, T5.X, 47, #3
2551; CM-NEXT:    Fetch clause starting at 24:
2552; CM-NEXT:     VTX_READ_8 T6.X, T5.X, 50, #3
2553; CM-NEXT:    Fetch clause starting at 26:
2554; CM-NEXT:     VTX_READ_8 T6.X, T5.X, 46, #3
2555; CM-NEXT:    Fetch clause starting at 28:
2556; CM-NEXT:     VTX_READ_8 T6.X, T5.X, 49, #3
2557; CM-NEXT:    Fetch clause starting at 30:
2558; CM-NEXT:     VTX_READ_8 T6.X, T5.X, 45, #3
2559; CM-NEXT:    Fetch clause starting at 32:
2560; CM-NEXT:     VTX_READ_8 T6.X, T5.X, 48, #3
2561; CM-NEXT:    Fetch clause starting at 34:
2562; CM-NEXT:     VTX_READ_8 T5.X, T5.X, 44, #3
2563; CM-NEXT:    ALU clause starting at 36:
2564; CM-NEXT:     MOV * T0.Y, T2.X,
2565; CM-NEXT:     MOV * T5.X, 0.0,
2566; CM-NEXT:    ALU clause starting at 38:
2567; CM-NEXT:     LSHL T0.Z, T6.X, literal.x,
2568; CM-NEXT:     AND_INT * T0.W, T0.Y, literal.y,
2569; CM-NEXT:    24(3.363116e-44), 16777215(2.350989e-38)
2570; CM-NEXT:     OR_INT * T0.W, PV.W, PV.Z,
2571; CM-NEXT:     MOV T2.X, PV.W,
2572; CM-NEXT:     MOV * T0.Y, T3.X,
2573; CM-NEXT:    ALU clause starting at 44:
2574; CM-NEXT:     AND_INT T0.Z, T0.Y, literal.x,
2575; CM-NEXT:     LSHL * T0.W, T6.X, literal.y,
2576; CM-NEXT:    16777215(2.350989e-38), 24(3.363116e-44)
2577; CM-NEXT:     OR_INT * T0.W, PV.Z, PV.W,
2578; CM-NEXT:     MOV T3.X, PV.W,
2579; CM-NEXT:     MOV * T0.Y, T2.X,
2580; CM-NEXT:    ALU clause starting at 50:
2581; CM-NEXT:     AND_INT * T0.W, T6.X, literal.x,
2582; CM-NEXT:    255(3.573311e-43), 0(0.000000e+00)
2583; CM-NEXT:     AND_INT T0.Z, T0.Y, literal.x,
2584; CM-NEXT:     LSHL * T0.W, PV.W, literal.y,
2585; CM-NEXT:    -16711681(-1.714704e+38), 16(2.242078e-44)
2586; CM-NEXT:     OR_INT * T0.W, PV.Z, PV.W,
2587; CM-NEXT:     MOV T2.X, PV.W,
2588; CM-NEXT:     MOV * T0.Y, T3.X,
2589; CM-NEXT:    ALU clause starting at 58:
2590; CM-NEXT:     AND_INT * T0.W, T6.X, literal.x,
2591; CM-NEXT:    255(3.573311e-43), 0(0.000000e+00)
2592; CM-NEXT:     AND_INT T0.Z, T0.Y, literal.x,
2593; CM-NEXT:     LSHL * T0.W, PV.W, literal.y,
2594; CM-NEXT:    -16711681(-1.714704e+38), 16(2.242078e-44)
2595; CM-NEXT:     OR_INT * T0.W, PV.Z, PV.W,
2596; CM-NEXT:     MOV T3.X, PV.W,
2597; CM-NEXT:     MOV * T0.Y, T2.X,
2598; CM-NEXT:    ALU clause starting at 66:
2599; CM-NEXT:     AND_INT * T0.W, T6.X, literal.x,
2600; CM-NEXT:    255(3.573311e-43), 0(0.000000e+00)
2601; CM-NEXT:     AND_INT T0.Z, T0.Y, literal.x,
2602; CM-NEXT:     LSHL * T0.W, PV.W, literal.y,
2603; CM-NEXT:    -65281(nan), 8(1.121039e-44)
2604; CM-NEXT:     OR_INT * T0.W, PV.Z, PV.W,
2605; CM-NEXT:     MOV T2.X, PV.W,
2606; CM-NEXT:     MOV * T0.Y, T3.X,
2607; CM-NEXT:    ALU clause starting at 74:
2608; CM-NEXT:     AND_INT * T0.W, T6.X, literal.x,
2609; CM-NEXT:    255(3.573311e-43), 0(0.000000e+00)
2610; CM-NEXT:     AND_INT T0.Z, T0.Y, literal.x,
2611; CM-NEXT:     LSHL * T0.W, PV.W, literal.y,
2612; CM-NEXT:    -65281(nan), 8(1.121039e-44)
2613; CM-NEXT:     OR_INT * T0.W, PV.Z, PV.W,
2614; CM-NEXT:     MOV T3.X, PV.W,
2615; CM-NEXT:     MOV * T0.Y, T2.X,
2616; CM-NEXT:    ALU clause starting at 82:
2617; CM-NEXT:     AND_INT T0.Z, T0.Y, literal.x,
2618; CM-NEXT:     AND_INT * T0.W, T6.X, literal.y,
2619; CM-NEXT:    -256(nan), 255(3.573311e-43)
2620; CM-NEXT:     OR_INT * T5.Y, PV.Z, PV.W,
2621; CM-NEXT:     MOV T2.X, PV.Y,
2622; CM-NEXT:     MOV * T0.Y, T3.X,
2623; CM-NEXT:    ALU clause starting at 88:
2624; CM-NEXT:     AND_INT T0.Z, T0.Y, literal.x,
2625; CM-NEXT:     AND_INT * T0.W, T5.X, literal.y,
2626; CM-NEXT:    -256(nan), 255(3.573311e-43)
2627; CM-NEXT:     OR_INT * T5.X, PV.Z, PV.W,
2628; CM-NEXT:     LSHR * T6.X, KC0[2].Y, literal.x,
2629; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
2630entry:
2631  store <8 x i8> %in, ptr addrspace(1) %out
2632  ret void
2633}
2634
; Kernel that takes a <8 x i16> argument and stores it unchanged to %out.
; On SI, VI and GFX9 the expected code loads the vector with s_load_dwordx4
; and the pointer with s_load_dwordx2, then issues one dwordx4 store.
; On EG and CM the expected code reads each element with its own VTX_READ_16
; and merges the halves with AND_INT/LSHL/OR_INT before a single store.
; The check lines are autogenerated (see the NOTE at the top of the file);
; regenerate them with the update script rather than editing them by hand.
2635define amdgpu_kernel void @v8i16_arg(ptr addrspace(1) %out, <8 x i16> %in) {
2636; SI-LABEL: v8i16_arg:
2637; SI:       ; %bb.0: ; %entry
2638; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0xd
2639; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x9
2640; SI-NEXT:    s_mov_b32 s7, 0xf000
2641; SI-NEXT:    s_mov_b32 s6, -1
2642; SI-NEXT:    s_waitcnt lgkmcnt(0)
2643; SI-NEXT:    v_mov_b32_e32 v0, s0
2644; SI-NEXT:    v_mov_b32_e32 v1, s1
2645; SI-NEXT:    v_mov_b32_e32 v2, s2
2646; SI-NEXT:    v_mov_b32_e32 v3, s3
2647; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
2648; SI-NEXT:    s_endpgm
2649;
2650; VI-LABEL: v8i16_arg:
2651; VI:       ; %bb.0: ; %entry
2652; VI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
2653; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x34
2654; VI-NEXT:    s_waitcnt lgkmcnt(0)
2655; VI-NEXT:    v_mov_b32_e32 v4, s6
2656; VI-NEXT:    v_mov_b32_e32 v0, s0
2657; VI-NEXT:    v_mov_b32_e32 v5, s7
2658; VI-NEXT:    v_mov_b32_e32 v1, s1
2659; VI-NEXT:    v_mov_b32_e32 v2, s2
2660; VI-NEXT:    v_mov_b32_e32 v3, s3
2661; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
2662; VI-NEXT:    s_endpgm
2663;
2664; GFX9-LABEL: v8i16_arg:
2665; GFX9:       ; %bb.0: ; %entry
2666; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x10
2667; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
2668; GFX9-NEXT:    v_mov_b32_e32 v4, 0
2669; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2670; GFX9-NEXT:    v_mov_b32_e32 v0, s0
2671; GFX9-NEXT:    v_mov_b32_e32 v1, s1
2672; GFX9-NEXT:    v_mov_b32_e32 v2, s2
2673; GFX9-NEXT:    v_mov_b32_e32 v3, s3
2674; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[4:5]
2675; GFX9-NEXT:    s_endpgm
2676;
2677; EG-LABEL: v8i16_arg:
2678; EG:       ; %bb.0: ; %entry
2679; EG-NEXT:    ALU 1, @36, KC0[], KC1[]
2680; EG-NEXT:    TEX 0 @20
2681; EG-NEXT:    ALU 5, @38, KC0[], KC1[]
2682; EG-NEXT:    TEX 0 @22
2683; EG-NEXT:    ALU 5, @44, KC0[], KC1[]
2684; EG-NEXT:    TEX 0 @24
2685; EG-NEXT:    ALU 5, @50, KC0[], KC1[]
2686; EG-NEXT:    TEX 0 @26
2687; EG-NEXT:    ALU 5, @56, KC0[], KC1[]
2688; EG-NEXT:    TEX 0 @28
2689; EG-NEXT:    ALU 5, @62, KC0[], KC1[]
2690; EG-NEXT:    TEX 0 @30
2691; EG-NEXT:    ALU 5, @68, KC0[], KC1[]
2692; EG-NEXT:    TEX 0 @32
2693; EG-NEXT:    ALU 5, @74, KC0[], KC1[]
2694; EG-NEXT:    TEX 0 @34
2695; EG-NEXT:    ALU 8, @80, KC0[CB0:0-32], KC1[]
2696; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T7.XYZW, T8.X, 1
2697; EG-NEXT:    CF_END
2698; EG-NEXT:    PAD
2699; EG-NEXT:    Fetch clause starting at 20:
2700; EG-NEXT:     VTX_READ_16 T8.X, T7.X, 66, #3
2701; EG-NEXT:    Fetch clause starting at 22:
2702; EG-NEXT:     VTX_READ_16 T8.X, T7.X, 58, #3
2703; EG-NEXT:    Fetch clause starting at 24:
2704; EG-NEXT:     VTX_READ_16 T8.X, T7.X, 64, #3
2705; EG-NEXT:    Fetch clause starting at 26:
2706; EG-NEXT:     VTX_READ_16 T8.X, T7.X, 56, #3
2707; EG-NEXT:    Fetch clause starting at 28:
2708; EG-NEXT:     VTX_READ_16 T8.X, T7.X, 62, #3
2709; EG-NEXT:    Fetch clause starting at 30:
2710; EG-NEXT:     VTX_READ_16 T8.X, T7.X, 54, #3
2711; EG-NEXT:    Fetch clause starting at 32:
2712; EG-NEXT:     VTX_READ_16 T8.X, T7.X, 60, #3
2713; EG-NEXT:    Fetch clause starting at 34:
2714; EG-NEXT:     VTX_READ_16 T7.X, T7.X, 52, #3
2715; EG-NEXT:    ALU clause starting at 36:
2716; EG-NEXT:     MOV * T0.Y, T3.X,
2717; EG-NEXT:     MOV * T7.X, 0.0,
2718; EG-NEXT:    ALU clause starting at 38:
2719; EG-NEXT:     LSHL T0.W, T8.X, literal.x,
2720; EG-NEXT:     AND_INT * T1.W, T0.Y, literal.y,
2721; EG-NEXT:    16(2.242078e-44), 65535(9.183409e-41)
2722; EG-NEXT:     OR_INT * T0.W, PS, PV.W,
2723; EG-NEXT:     MOV T3.X, PV.W,
2724; EG-NEXT:     MOV * T0.Y, T5.X,
2725; EG-NEXT:    ALU clause starting at 44:
2726; EG-NEXT:     AND_INT T0.W, T0.Y, literal.x,
2727; EG-NEXT:     LSHL * T1.W, T8.X, literal.y,
2728; EG-NEXT:    65535(9.183409e-41), 16(2.242078e-44)
2729; EG-NEXT:     OR_INT * T0.W, PV.W, PS,
2730; EG-NEXT:     MOV T5.X, PV.W,
2731; EG-NEXT:     MOV * T0.Y, T3.X,
2732; EG-NEXT:    ALU clause starting at 50:
2733; EG-NEXT:     AND_INT T0.W, T0.Y, literal.x,
2734; EG-NEXT:     AND_INT * T1.W, T8.X, literal.y,
2735; EG-NEXT:    -65536(nan), 65535(9.183409e-41)
2736; EG-NEXT:     OR_INT * T0.W, PV.W, PS,
2737; EG-NEXT:     MOV T3.X, PV.W,
2738; EG-NEXT:     MOV * T0.Y, T5.X,
2739; EG-NEXT:    ALU clause starting at 56:
2740; EG-NEXT:     AND_INT T0.W, T0.Y, literal.x,
2741; EG-NEXT:     AND_INT * T1.W, T8.X, literal.y,
2742; EG-NEXT:    -65536(nan), 65535(9.183409e-41)
2743; EG-NEXT:     OR_INT * T0.W, PV.W, PS,
2744; EG-NEXT:     MOV T5.X, PV.W,
2745; EG-NEXT:     MOV * T0.Y, T2.X,
2746; EG-NEXT:    ALU clause starting at 62:
2747; EG-NEXT:     AND_INT T0.W, T0.Y, literal.x,
2748; EG-NEXT:     LSHL * T1.W, T8.X, literal.y,
2749; EG-NEXT:    65535(9.183409e-41), 16(2.242078e-44)
2750; EG-NEXT:     OR_INT * T0.W, PV.W, PS,
2751; EG-NEXT:     MOV T2.X, PV.W,
2752; EG-NEXT:     MOV * T0.Y, T4.X,
2753; EG-NEXT:    ALU clause starting at 68:
2754; EG-NEXT:     AND_INT T0.W, T0.Y, literal.x,
2755; EG-NEXT:     LSHL * T1.W, T8.X, literal.y,
2756; EG-NEXT:    65535(9.183409e-41), 16(2.242078e-44)
2757; EG-NEXT:     OR_INT * T0.W, PV.W, PS,
2758; EG-NEXT:     MOV T4.X, PV.W,
2759; EG-NEXT:     MOV * T0.Y, T2.X,
2760; EG-NEXT:    ALU clause starting at 74:
2761; EG-NEXT:     AND_INT T0.W, T0.Y, literal.x,
2762; EG-NEXT:     AND_INT * T1.W, T8.X, literal.y,
2763; EG-NEXT:    -65536(nan), 65535(9.183409e-41)
2764; EG-NEXT:     OR_INT * T7.Z, PV.W, PS,
2765; EG-NEXT:     MOV T2.X, PV.Z,
2766; EG-NEXT:     MOV * T0.Y, T4.X,
2767; EG-NEXT:    ALU clause starting at 80:
2768; EG-NEXT:     LSHR T8.X, KC0[2].Y, literal.x,
2769; EG-NEXT:     AND_INT T0.W, T0.Y, literal.y,
2770; EG-NEXT:     AND_INT * T1.W, T7.X, literal.z,
2771; EG-NEXT:    2(2.802597e-45), -65536(nan)
2772; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
2773; EG-NEXT:     OR_INT * T7.X, PV.W, PS,
2774; EG-NEXT:     MOV T4.X, PV.X,
2775; EG-NEXT:     MOV * T7.W, T3.X,
2776; EG-NEXT:     MOV * T7.Y, T5.X,
2777;
2778; CM-LABEL: v8i16_arg:
2779; CM:       ; %bb.0: ; %entry
2780; CM-NEXT:    ALU 1, @36, KC0[], KC1[]
2781; CM-NEXT:    TEX 0 @20
2782; CM-NEXT:    ALU 5, @38, KC0[], KC1[]
2783; CM-NEXT:    TEX 0 @22
2784; CM-NEXT:    ALU 5, @44, KC0[], KC1[]
2785; CM-NEXT:    TEX 0 @24
2786; CM-NEXT:    ALU 5, @50, KC0[], KC1[]
2787; CM-NEXT:    TEX 0 @26
2788; CM-NEXT:    ALU 5, @56, KC0[], KC1[]
2789; CM-NEXT:    TEX 0 @28
2790; CM-NEXT:    ALU 5, @62, KC0[], KC1[]
2791; CM-NEXT:    TEX 0 @30
2792; CM-NEXT:    ALU 5, @68, KC0[], KC1[]
2793; CM-NEXT:    TEX 0 @32
2794; CM-NEXT:    ALU 5, @74, KC0[], KC1[]
2795; CM-NEXT:    TEX 0 @34
2796; CM-NEXT:    ALU 8, @80, KC0[CB0:0-32], KC1[]
2797; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T7, T8.X
2798; CM-NEXT:    CF_END
2799; CM-NEXT:    PAD
2800; CM-NEXT:    Fetch clause starting at 20:
2801; CM-NEXT:     VTX_READ_16 T8.X, T7.X, 66, #3
2802; CM-NEXT:    Fetch clause starting at 22:
2803; CM-NEXT:     VTX_READ_16 T8.X, T7.X, 58, #3
2804; CM-NEXT:    Fetch clause starting at 24:
2805; CM-NEXT:     VTX_READ_16 T8.X, T7.X, 64, #3
2806; CM-NEXT:    Fetch clause starting at 26:
2807; CM-NEXT:     VTX_READ_16 T8.X, T7.X, 56, #3
2808; CM-NEXT:    Fetch clause starting at 28:
2809; CM-NEXT:     VTX_READ_16 T8.X, T7.X, 62, #3
2810; CM-NEXT:    Fetch clause starting at 30:
2811; CM-NEXT:     VTX_READ_16 T8.X, T7.X, 54, #3
2812; CM-NEXT:    Fetch clause starting at 32:
2813; CM-NEXT:     VTX_READ_16 T8.X, T7.X, 60, #3
2814; CM-NEXT:    Fetch clause starting at 34:
2815; CM-NEXT:     VTX_READ_16 T7.X, T7.X, 52, #3
2816; CM-NEXT:    ALU clause starting at 36:
2817; CM-NEXT:     MOV * T0.Y, T3.X,
2818; CM-NEXT:     MOV * T7.X, 0.0,
2819; CM-NEXT:    ALU clause starting at 38:
2820; CM-NEXT:     LSHL T0.Z, T8.X, literal.x,
2821; CM-NEXT:     AND_INT * T0.W, T0.Y, literal.y,
2822; CM-NEXT:    16(2.242078e-44), 65535(9.183409e-41)
2823; CM-NEXT:     OR_INT * T0.W, PV.W, PV.Z,
2824; CM-NEXT:     MOV T3.X, PV.W,
2825; CM-NEXT:     MOV * T0.Y, T5.X,
2826; CM-NEXT:    ALU clause starting at 44:
2827; CM-NEXT:     AND_INT T0.Z, T0.Y, literal.x,
2828; CM-NEXT:     LSHL * T0.W, T8.X, literal.y,
2829; CM-NEXT:    65535(9.183409e-41), 16(2.242078e-44)
2830; CM-NEXT:     OR_INT * T0.W, PV.Z, PV.W,
2831; CM-NEXT:     MOV T5.X, PV.W,
2832; CM-NEXT:     MOV * T0.Y, T3.X,
2833; CM-NEXT:    ALU clause starting at 50:
2834; CM-NEXT:     AND_INT T0.Z, T0.Y, literal.x,
2835; CM-NEXT:     AND_INT * T0.W, T8.X, literal.y,
2836; CM-NEXT:    -65536(nan), 65535(9.183409e-41)
2837; CM-NEXT:     OR_INT * T0.W, PV.Z, PV.W,
2838; CM-NEXT:     MOV T3.X, PV.W,
2839; CM-NEXT:     MOV * T0.Y, T5.X,
2840; CM-NEXT:    ALU clause starting at 56:
2841; CM-NEXT:     AND_INT T0.Z, T0.Y, literal.x,
2842; CM-NEXT:     AND_INT * T0.W, T8.X, literal.y,
2843; CM-NEXT:    -65536(nan), 65535(9.183409e-41)
2844; CM-NEXT:     OR_INT * T0.W, PV.Z, PV.W,
2845; CM-NEXT:     MOV T5.X, PV.W,
2846; CM-NEXT:     MOV * T0.Y, T2.X,
2847; CM-NEXT:    ALU clause starting at 62:
2848; CM-NEXT:     AND_INT T0.Z, T0.Y, literal.x,
2849; CM-NEXT:     LSHL * T0.W, T8.X, literal.y,
2850; CM-NEXT:    65535(9.183409e-41), 16(2.242078e-44)
2851; CM-NEXT:     OR_INT * T0.W, PV.Z, PV.W,
2852; CM-NEXT:     MOV T2.X, PV.W,
2853; CM-NEXT:     MOV * T0.Y, T4.X,
2854; CM-NEXT:    ALU clause starting at 68:
2855; CM-NEXT:     AND_INT T0.Z, T0.Y, literal.x,
2856; CM-NEXT:     LSHL * T0.W, T8.X, literal.y,
2857; CM-NEXT:    65535(9.183409e-41), 16(2.242078e-44)
2858; CM-NEXT:     OR_INT * T0.W, PV.Z, PV.W,
2859; CM-NEXT:     MOV T4.X, PV.W,
2860; CM-NEXT:     MOV * T0.Y, T2.X,
2861; CM-NEXT:    ALU clause starting at 74:
2862; CM-NEXT:     AND_INT T0.Z, T0.Y, literal.x,
2863; CM-NEXT:     AND_INT * T0.W, T8.X, literal.y,
2864; CM-NEXT:    -65536(nan), 65535(9.183409e-41)
2865; CM-NEXT:     OR_INT * T7.Z, PV.Z, PV.W,
2866; CM-NEXT:     MOV T2.X, PV.Z,
2867; CM-NEXT:     MOV * T0.Y, T4.X,
2868; CM-NEXT:    ALU clause starting at 80:
2869; CM-NEXT:     LSHR T8.X, KC0[2].Y, literal.x,
2870; CM-NEXT:     AND_INT T0.Z, T0.Y, literal.y,
2871; CM-NEXT:     AND_INT * T0.W, T7.X, literal.z,
2872; CM-NEXT:    2(2.802597e-45), -65536(nan)
2873; CM-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
2874; CM-NEXT:     OR_INT * T7.X, PV.Z, PV.W,
2875; CM-NEXT:     MOV T4.X, PV.X,
2876; CM-NEXT:     MOV * T7.W, T3.X,
2877; CM-NEXT:     MOV * T7.Y, T5.X,
2878entry:
2879  store <8 x i16> %in, ptr addrspace(1) %out
2880  ret void
2881}
2882
; Kernel that takes a <8 x i32> argument and stores it unchanged to %out
; with an explicit 4-byte alignment on the store.
; On SI, VI and GFX9 the expected code loads the vector with s_load_dwordx8
; and the pointer with s_load_dwordx2, then issues two dwordx4 stores, one at
; %out and one 16 bytes past it.
; On EG and CM the expected code copies the elements out of KC0 operands with
; MOV and issues two stores; no fetch clauses are expected.
; The check lines are autogenerated (see the NOTE at the top of the file);
; regenerate them with the update script rather than editing them by hand.
2883define amdgpu_kernel void @v8i32_arg(ptr addrspace(1) nocapture %out, <8 x i32> %in) nounwind {
2884; SI-LABEL: v8i32_arg:
2885; SI:       ; %bb.0: ; %entry
2886; SI-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x11
2887; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
2888; SI-NEXT:    s_mov_b32 s3, 0xf000
2889; SI-NEXT:    s_mov_b32 s2, -1
2890; SI-NEXT:    s_waitcnt lgkmcnt(0)
2891; SI-NEXT:    v_mov_b32_e32 v0, s12
2892; SI-NEXT:    v_mov_b32_e32 v1, s13
2893; SI-NEXT:    v_mov_b32_e32 v2, s14
2894; SI-NEXT:    v_mov_b32_e32 v3, s15
2895; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
2896; SI-NEXT:    s_waitcnt expcnt(0)
2897; SI-NEXT:    v_mov_b32_e32 v0, s8
2898; SI-NEXT:    v_mov_b32_e32 v1, s9
2899; SI-NEXT:    v_mov_b32_e32 v2, s10
2900; SI-NEXT:    v_mov_b32_e32 v3, s11
2901; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
2902; SI-NEXT:    s_endpgm
2903;
2904; VI-LABEL: v8i32_arg:
2905; VI:       ; %bb.0: ; %entry
2906; VI-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x44
2907; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
2908; VI-NEXT:    s_waitcnt lgkmcnt(0)
2909; VI-NEXT:    v_mov_b32_e32 v0, s12
2910; VI-NEXT:    s_add_u32 s2, s0, 16
2911; VI-NEXT:    s_addc_u32 s3, s1, 0
2912; VI-NEXT:    v_mov_b32_e32 v5, s3
2913; VI-NEXT:    v_mov_b32_e32 v1, s13
2914; VI-NEXT:    v_mov_b32_e32 v2, s14
2915; VI-NEXT:    v_mov_b32_e32 v3, s15
2916; VI-NEXT:    v_mov_b32_e32 v4, s2
2917; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
2918; VI-NEXT:    v_mov_b32_e32 v5, s1
2919; VI-NEXT:    v_mov_b32_e32 v0, s8
2920; VI-NEXT:    v_mov_b32_e32 v1, s9
2921; VI-NEXT:    v_mov_b32_e32 v2, s10
2922; VI-NEXT:    v_mov_b32_e32 v3, s11
2923; VI-NEXT:    v_mov_b32_e32 v4, s0
2924; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
2925; VI-NEXT:    s_endpgm
2926;
2927; GFX9-LABEL: v8i32_arg:
2928; GFX9:       ; %bb.0: ; %entry
2929; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[8:9], 0x20
2930; GFX9-NEXT:    v_mov_b32_e32 v4, 0
2931; GFX9-NEXT:    s_load_dwordx2 s[8:9], s[8:9], 0x0
2932; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2933; GFX9-NEXT:    v_mov_b32_e32 v0, s4
2934; GFX9-NEXT:    v_mov_b32_e32 v1, s5
2935; GFX9-NEXT:    v_mov_b32_e32 v2, s6
2936; GFX9-NEXT:    v_mov_b32_e32 v3, s7
2937; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[8:9] offset:16
2938; GFX9-NEXT:    s_nop 0
2939; GFX9-NEXT:    v_mov_b32_e32 v0, s0
2940; GFX9-NEXT:    v_mov_b32_e32 v1, s1
2941; GFX9-NEXT:    v_mov_b32_e32 v2, s2
2942; GFX9-NEXT:    v_mov_b32_e32 v3, s3
2943; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[8:9]
2944; GFX9-NEXT:    s_endpgm
2945;
2946; EG-LABEL: v8i32_arg:
2947; EG:       ; %bb.0: ; %entry
2948; EG-NEXT:    ALU 13, @4, KC0[CB0:0-32], KC1[]
2949; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T3.X, 0
2950; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T2.X, 1
2951; EG-NEXT:    CF_END
2952; EG-NEXT:    ALU clause starting at 4:
2953; EG-NEXT:     MOV * T0.W, KC0[5].X,
2954; EG-NEXT:     MOV * T0.Z, KC0[4].W,
2955; EG-NEXT:     MOV T0.Y, KC0[4].Z,
2956; EG-NEXT:     MOV * T1.W, KC0[6].X,
2957; EG-NEXT:     MOV T0.X, KC0[4].Y,
2958; EG-NEXT:     MOV * T1.Z, KC0[5].W,
2959; EG-NEXT:     LSHR T2.X, KC0[2].Y, literal.x,
2960; EG-NEXT:     MOV * T1.Y, KC0[5].Z,
2961; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
2962; EG-NEXT:     MOV T1.X, KC0[5].Y,
2963; EG-NEXT:     ADD_INT * T2.W, KC0[2].Y, literal.x,
2964; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
2965; EG-NEXT:     LSHR * T3.X, PV.W, literal.x,
2966; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
2967;
2968; CM-LABEL: v8i32_arg:
2969; CM:       ; %bb.0: ; %entry
2970; CM-NEXT:    ALU 13, @4, KC0[CB0:0-32], KC1[]
2971; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T1, T3.X
2972; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T0, T2.X
2973; CM-NEXT:    CF_END
2974; CM-NEXT:    ALU clause starting at 4:
2975; CM-NEXT:     MOV * T0.W, KC0[6].X,
2976; CM-NEXT:     MOV * T0.Z, KC0[5].W,
2977; CM-NEXT:     MOV * T0.Y, KC0[5].Z,
2978; CM-NEXT:     MOV T0.X, KC0[5].Y,
2979; CM-NEXT:     MOV * T1.W, KC0[5].X,
2980; CM-NEXT:     MOV T1.Z, KC0[4].W,
2981; CM-NEXT:     ADD_INT * T2.W, KC0[2].Y, literal.x,
2982; CM-NEXT:    16(2.242078e-44), 0(0.000000e+00)
2983; CM-NEXT:     LSHR T2.X, PV.W, literal.x,
2984; CM-NEXT:     MOV * T1.Y, KC0[4].Z,
2985; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
2986; CM-NEXT:     MOV * T1.X, KC0[4].Y,
2987; CM-NEXT:     LSHR * T3.X, KC0[2].Y, literal.x,
2988; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
2989entry:
2990  store <8 x i32> %in, ptr addrspace(1) %out, align 4
2991  ret void
2992}
2993
; Kernel that takes a <8 x float> argument and stores it unchanged to %out
; with an explicit 4-byte alignment on the store.
; On SI, VI and GFX9 the expected code loads the vector with s_load_dwordx8
; and the pointer with s_load_dwordx2, then issues two dwordx4 stores, one at
; %out and one 16 bytes past it.
; On EG and CM the expected code copies the elements out of KC0 operands with
; MOV and issues two stores; no fetch clauses are expected.
; The check lines are autogenerated (see the NOTE at the top of the file);
; regenerate them with the update script rather than editing them by hand.
2994define amdgpu_kernel void @v8f32_arg(ptr addrspace(1) nocapture %out, <8 x float> %in) nounwind {
2995; SI-LABEL: v8f32_arg:
2996; SI:       ; %bb.0: ; %entry
2997; SI-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x11
2998; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
2999; SI-NEXT:    s_mov_b32 s3, 0xf000
3000; SI-NEXT:    s_mov_b32 s2, -1
3001; SI-NEXT:    s_waitcnt lgkmcnt(0)
3002; SI-NEXT:    v_mov_b32_e32 v0, s12
3003; SI-NEXT:    v_mov_b32_e32 v1, s13
3004; SI-NEXT:    v_mov_b32_e32 v2, s14
3005; SI-NEXT:    v_mov_b32_e32 v3, s15
3006; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
3007; SI-NEXT:    s_waitcnt expcnt(0)
3008; SI-NEXT:    v_mov_b32_e32 v0, s8
3009; SI-NEXT:    v_mov_b32_e32 v1, s9
3010; SI-NEXT:    v_mov_b32_e32 v2, s10
3011; SI-NEXT:    v_mov_b32_e32 v3, s11
3012; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
3013; SI-NEXT:    s_endpgm
3014;
3015; VI-LABEL: v8f32_arg:
3016; VI:       ; %bb.0: ; %entry
3017; VI-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x44
3018; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
3019; VI-NEXT:    s_waitcnt lgkmcnt(0)
3020; VI-NEXT:    v_mov_b32_e32 v0, s12
3021; VI-NEXT:    s_add_u32 s2, s0, 16
3022; VI-NEXT:    s_addc_u32 s3, s1, 0
3023; VI-NEXT:    v_mov_b32_e32 v5, s3
3024; VI-NEXT:    v_mov_b32_e32 v1, s13
3025; VI-NEXT:    v_mov_b32_e32 v2, s14
3026; VI-NEXT:    v_mov_b32_e32 v3, s15
3027; VI-NEXT:    v_mov_b32_e32 v4, s2
3028; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
3029; VI-NEXT:    v_mov_b32_e32 v5, s1
3030; VI-NEXT:    v_mov_b32_e32 v0, s8
3031; VI-NEXT:    v_mov_b32_e32 v1, s9
3032; VI-NEXT:    v_mov_b32_e32 v2, s10
3033; VI-NEXT:    v_mov_b32_e32 v3, s11
3034; VI-NEXT:    v_mov_b32_e32 v4, s0
3035; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
3036; VI-NEXT:    s_endpgm
3037;
3038; GFX9-LABEL: v8f32_arg:
3039; GFX9:       ; %bb.0: ; %entry
3040; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[8:9], 0x20
3041; GFX9-NEXT:    v_mov_b32_e32 v4, 0
3042; GFX9-NEXT:    s_load_dwordx2 s[8:9], s[8:9], 0x0
3043; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3044; GFX9-NEXT:    v_mov_b32_e32 v0, s4
3045; GFX9-NEXT:    v_mov_b32_e32 v1, s5
3046; GFX9-NEXT:    v_mov_b32_e32 v2, s6
3047; GFX9-NEXT:    v_mov_b32_e32 v3, s7
3048; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[8:9] offset:16
3049; GFX9-NEXT:    s_nop 0
3050; GFX9-NEXT:    v_mov_b32_e32 v0, s0
3051; GFX9-NEXT:    v_mov_b32_e32 v1, s1
3052; GFX9-NEXT:    v_mov_b32_e32 v2, s2
3053; GFX9-NEXT:    v_mov_b32_e32 v3, s3
3054; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[8:9]
3055; GFX9-NEXT:    s_endpgm
3056;
3057; EG-LABEL: v8f32_arg:
3058; EG:       ; %bb.0: ; %entry
3059; EG-NEXT:    ALU 13, @4, KC0[CB0:0-32], KC1[]
3060; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T3.X, 0
3061; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T2.X, 1
3062; EG-NEXT:    CF_END
3063; EG-NEXT:    ALU clause starting at 4:
3064; EG-NEXT:     MOV * T0.W, KC0[5].X,
3065; EG-NEXT:     MOV * T0.Z, KC0[4].W,
3066; EG-NEXT:     MOV T0.Y, KC0[4].Z,
3067; EG-NEXT:     MOV * T1.W, KC0[6].X,
3068; EG-NEXT:     MOV T0.X, KC0[4].Y,
3069; EG-NEXT:     MOV * T1.Z, KC0[5].W,
3070; EG-NEXT:     LSHR T2.X, KC0[2].Y, literal.x,
3071; EG-NEXT:     MOV * T1.Y, KC0[5].Z,
3072; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
3073; EG-NEXT:     MOV T1.X, KC0[5].Y,
3074; EG-NEXT:     ADD_INT * T2.W, KC0[2].Y, literal.x,
3075; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
3076; EG-NEXT:     LSHR * T3.X, PV.W, literal.x,
3077; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
3078;
3079; CM-LABEL: v8f32_arg:
3080; CM:       ; %bb.0: ; %entry
3081; CM-NEXT:    ALU 13, @4, KC0[CB0:0-32], KC1[]
3082; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T1, T3.X
3083; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T0, T2.X
3084; CM-NEXT:    CF_END
3085; CM-NEXT:    ALU clause starting at 4:
3086; CM-NEXT:     MOV * T0.W, KC0[6].X,
3087; CM-NEXT:     MOV * T0.Z, KC0[5].W,
3088; CM-NEXT:     MOV * T0.Y, KC0[5].Z,
3089; CM-NEXT:     MOV T0.X, KC0[5].Y,
3090; CM-NEXT:     MOV * T1.W, KC0[5].X,
3091; CM-NEXT:     MOV T1.Z, KC0[4].W,
3092; CM-NEXT:     ADD_INT * T2.W, KC0[2].Y, literal.x,
3093; CM-NEXT:    16(2.242078e-44), 0(0.000000e+00)
3094; CM-NEXT:     LSHR T2.X, PV.W, literal.x,
3095; CM-NEXT:     MOV * T1.Y, KC0[4].Z,
3096; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
3097; CM-NEXT:     MOV * T1.X, KC0[4].Y,
3098; CM-NEXT:     LSHR * T3.X, KC0[2].Y, literal.x,
3099; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
3100entry:
3101  store <8 x float> %in, ptr addrspace(1) %out, align 4
3102  ret void
3103}
3104
3105; FIXME: Pack/repack on VI
3106define amdgpu_kernel void @v16i8_arg(ptr addrspace(1) %out, <16 x i8> %in) {
3107; SI-LABEL: v16i8_arg:
3108; SI:       ; %bb.0: ; %entry
3109; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0xd
3110; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x9
3111; SI-NEXT:    s_mov_b32 s7, 0xf000
3112; SI-NEXT:    s_mov_b32 s6, -1
3113; SI-NEXT:    s_waitcnt lgkmcnt(0)
3114; SI-NEXT:    v_mov_b32_e32 v0, s0
3115; SI-NEXT:    v_mov_b32_e32 v1, s1
3116; SI-NEXT:    v_mov_b32_e32 v2, s2
3117; SI-NEXT:    v_mov_b32_e32 v3, s3
3118; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
3119; SI-NEXT:    s_endpgm
3120;
3121; VI-LABEL: v16i8_arg:
3122; VI:       ; %bb.0: ; %entry
3123; VI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
3124; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x34
3125; VI-NEXT:    s_waitcnt lgkmcnt(0)
3126; VI-NEXT:    v_mov_b32_e32 v4, s6
3127; VI-NEXT:    v_mov_b32_e32 v0, s0
3128; VI-NEXT:    v_mov_b32_e32 v5, s7
3129; VI-NEXT:    v_mov_b32_e32 v1, s1
3130; VI-NEXT:    v_mov_b32_e32 v2, s2
3131; VI-NEXT:    v_mov_b32_e32 v3, s3
3132; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
3133; VI-NEXT:    s_endpgm
3134;
3135; GFX9-LABEL: v16i8_arg:
3136; GFX9:       ; %bb.0: ; %entry
3137; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x10
3138; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
3139; GFX9-NEXT:    v_mov_b32_e32 v4, 0
3140; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3141; GFX9-NEXT:    v_mov_b32_e32 v0, s0
3142; GFX9-NEXT:    v_mov_b32_e32 v1, s1
3143; GFX9-NEXT:    v_mov_b32_e32 v2, s2
3144; GFX9-NEXT:    v_mov_b32_e32 v3, s3
3145; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[4:5]
3146; GFX9-NEXT:    s_endpgm
3147;
3148; EG-LABEL: v16i8_arg:
3149; EG:       ; %bb.0: ; %entry
3150; EG-NEXT:    ALU 1, @68, KC0[], KC1[]
3151; EG-NEXT:    TEX 0 @36
3152; EG-NEXT:    ALU 5, @70, KC0[], KC1[]
3153; EG-NEXT:    TEX 0 @38
3154; EG-NEXT:    ALU 5, @76, KC0[], KC1[]
3155; EG-NEXT:    TEX 0 @40
3156; EG-NEXT:    ALU 5, @82, KC0[], KC1[]
3157; EG-NEXT:    TEX 0 @42
3158; EG-NEXT:    ALU 5, @88, KC0[], KC1[]
3159; EG-NEXT:    TEX 0 @44
3160; EG-NEXT:    ALU 7, @94, KC0[], KC1[]
3161; EG-NEXT:    TEX 0 @46
3162; EG-NEXT:    ALU 7, @102, KC0[], KC1[]
3163; EG-NEXT:    TEX 0 @48
3164; EG-NEXT:    ALU 7, @110, KC0[], KC1[]
3165; EG-NEXT:    TEX 0 @50
3166; EG-NEXT:    ALU 7, @118, KC0[], KC1[]
3167; EG-NEXT:    TEX 0 @52
3168; EG-NEXT:    ALU 7, @126, KC0[], KC1[]
3169; EG-NEXT:    TEX 0 @54
3170; EG-NEXT:    ALU 7, @134, KC0[], KC1[]
3171; EG-NEXT:    TEX 0 @56
3172; EG-NEXT:    ALU 7, @142, KC0[], KC1[]
3173; EG-NEXT:    TEX 0 @58
3174; EG-NEXT:    ALU 7, @150, KC0[], KC1[]
3175; EG-NEXT:    TEX 0 @60
3176; EG-NEXT:    ALU 5, @158, KC0[], KC1[]
3177; EG-NEXT:    TEX 0 @62
3178; EG-NEXT:    ALU 5, @164, KC0[], KC1[]
3179; EG-NEXT:    TEX 0 @64
3180; EG-NEXT:    ALU 5, @170, KC0[], KC1[]
3181; EG-NEXT:    TEX 0 @66
3182; EG-NEXT:    ALU 5, @176, KC0[CB0:0-32], KC1[]
3183; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T7.XYZW, T8.X, 1
3184; EG-NEXT:    CF_END
3185; EG-NEXT:    PAD
3186; EG-NEXT:    Fetch clause starting at 36:
3187; EG-NEXT:     VTX_READ_8 T8.X, T7.X, 67, #3
3188; EG-NEXT:    Fetch clause starting at 38:
3189; EG-NEXT:     VTX_READ_8 T8.X, T7.X, 63, #3
3190; EG-NEXT:    Fetch clause starting at 40:
3191; EG-NEXT:     VTX_READ_8 T8.X, T7.X, 59, #3
3192; EG-NEXT:    Fetch clause starting at 42:
3193; EG-NEXT:     VTX_READ_8 T8.X, T7.X, 55, #3
3194; EG-NEXT:    Fetch clause starting at 44:
3195; EG-NEXT:     VTX_READ_8 T8.X, T7.X, 66, #3
3196; EG-NEXT:    Fetch clause starting at 46:
3197; EG-NEXT:     VTX_READ_8 T8.X, T7.X, 62, #3
3198; EG-NEXT:    Fetch clause starting at 48:
3199; EG-NEXT:     VTX_READ_8 T8.X, T7.X, 58, #3
3200; EG-NEXT:    Fetch clause starting at 50:
3201; EG-NEXT:     VTX_READ_8 T8.X, T7.X, 54, #3
3202; EG-NEXT:    Fetch clause starting at 52:
3203; EG-NEXT:     VTX_READ_8 T8.X, T7.X, 65, #3
3204; EG-NEXT:    Fetch clause starting at 54:
3205; EG-NEXT:     VTX_READ_8 T8.X, T7.X, 61, #3
3206; EG-NEXT:    Fetch clause starting at 56:
3207; EG-NEXT:     VTX_READ_8 T8.X, T7.X, 57, #3
3208; EG-NEXT:    Fetch clause starting at 58:
3209; EG-NEXT:     VTX_READ_8 T8.X, T7.X, 53, #3
3210; EG-NEXT:    Fetch clause starting at 60:
3211; EG-NEXT:     VTX_READ_8 T8.X, T7.X, 64, #3
3212; EG-NEXT:    Fetch clause starting at 62:
3213; EG-NEXT:     VTX_READ_8 T8.X, T7.X, 60, #3
3214; EG-NEXT:    Fetch clause starting at 64:
3215; EG-NEXT:     VTX_READ_8 T8.X, T7.X, 56, #3
3216; EG-NEXT:    Fetch clause starting at 66:
3217; EG-NEXT:     VTX_READ_8 T7.X, T7.X, 52, #3
3218; EG-NEXT:    ALU clause starting at 68:
3219; EG-NEXT:     MOV * T0.Y, T2.X,
3220; EG-NEXT:     MOV * T7.X, 0.0,
3221; EG-NEXT:    ALU clause starting at 70:
3222; EG-NEXT:     LSHL T0.W, T8.X, literal.x,
3223; EG-NEXT:     AND_INT * T1.W, T0.Y, literal.y,
3224; EG-NEXT:    24(3.363116e-44), 16777215(2.350989e-38)
3225; EG-NEXT:     OR_INT * T0.W, PS, PV.W,
3226; EG-NEXT:     MOV T2.X, PV.W,
3227; EG-NEXT:     MOV * T0.Y, T3.X,
3228; EG-NEXT:    ALU clause starting at 76:
3229; EG-NEXT:     AND_INT T0.W, T0.Y, literal.x,
3230; EG-NEXT:     LSHL * T1.W, T8.X, literal.y,
3231; EG-NEXT:    16777215(2.350989e-38), 24(3.363116e-44)
3232; EG-NEXT:     OR_INT * T0.W, PV.W, PS,
3233; EG-NEXT:     MOV T3.X, PV.W,
3234; EG-NEXT:     MOV * T0.Y, T4.X,
3235; EG-NEXT:    ALU clause starting at 82:
3236; EG-NEXT:     AND_INT T0.W, T0.Y, literal.x,
3237; EG-NEXT:     LSHL * T1.W, T8.X, literal.y,
3238; EG-NEXT:    16777215(2.350989e-38), 24(3.363116e-44)
3239; EG-NEXT:     OR_INT * T0.W, PV.W, PS,
3240; EG-NEXT:     MOV T4.X, PV.W,
3241; EG-NEXT:     MOV * T0.Y, T5.X,
3242; EG-NEXT:    ALU clause starting at 88:
3243; EG-NEXT:     AND_INT T0.W, T0.Y, literal.x,
3244; EG-NEXT:     LSHL * T1.W, T8.X, literal.y,
3245; EG-NEXT:    16777215(2.350989e-38), 24(3.363116e-44)
3246; EG-NEXT:     OR_INT * T0.W, PV.W, PS,
3247; EG-NEXT:     MOV T5.X, PV.W,
3248; EG-NEXT:     MOV * T0.Y, T2.X,
3249; EG-NEXT:    ALU clause starting at 94:
3250; EG-NEXT:     AND_INT T0.W, T8.X, literal.x,
3251; EG-NEXT:     AND_INT * T1.W, T0.Y, literal.y,
3252; EG-NEXT:    255(3.573311e-43), -16711681(-1.714704e+38)
3253; EG-NEXT:     LSHL * T0.W, PV.W, literal.x,
3254; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
3255; EG-NEXT:     OR_INT * T0.W, T1.W, PV.W,
3256; EG-NEXT:     MOV T2.X, PV.W,
3257; EG-NEXT:     MOV * T0.Y, T3.X,
3258; EG-NEXT:    ALU clause starting at 102:
3259; EG-NEXT:     AND_INT T0.W, T8.X, literal.x,
3260; EG-NEXT:     AND_INT * T1.W, T0.Y, literal.y,
3261; EG-NEXT:    255(3.573311e-43), -16711681(-1.714704e+38)
3262; EG-NEXT:     LSHL * T0.W, PV.W, literal.x,
3263; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
3264; EG-NEXT:     OR_INT * T0.W, T1.W, PV.W,
3265; EG-NEXT:     MOV T3.X, PV.W,
3266; EG-NEXT:     MOV * T0.Y, T4.X,
3267; EG-NEXT:    ALU clause starting at 110:
3268; EG-NEXT:     AND_INT T0.W, T8.X, literal.x,
3269; EG-NEXT:     AND_INT * T1.W, T0.Y, literal.y,
3270; EG-NEXT:    255(3.573311e-43), -16711681(-1.714704e+38)
3271; EG-NEXT:     LSHL * T0.W, PV.W, literal.x,
3272; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
3273; EG-NEXT:     OR_INT * T0.W, T1.W, PV.W,
3274; EG-NEXT:     MOV T4.X, PV.W,
3275; EG-NEXT:     MOV * T0.Y, T5.X,
3276; EG-NEXT:    ALU clause starting at 118:
3277; EG-NEXT:     AND_INT T0.W, T8.X, literal.x,
3278; EG-NEXT:     AND_INT * T1.W, T0.Y, literal.y,
3279; EG-NEXT:    255(3.573311e-43), -16711681(-1.714704e+38)
3280; EG-NEXT:     LSHL * T0.W, PV.W, literal.x,
3281; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
3282; EG-NEXT:     OR_INT * T0.W, T1.W, PV.W,
3283; EG-NEXT:     MOV T5.X, PV.W,
3284; EG-NEXT:     MOV * T0.Y, T2.X,
3285; EG-NEXT:    ALU clause starting at 126:
3286; EG-NEXT:     AND_INT T0.W, T8.X, literal.x,
3287; EG-NEXT:     AND_INT * T1.W, T0.Y, literal.y,
3288; EG-NEXT:    255(3.573311e-43), -65281(nan)
3289; EG-NEXT:     LSHL * T0.W, PV.W, literal.x,
3290; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
3291; EG-NEXT:     OR_INT * T0.W, T1.W, PV.W,
3292; EG-NEXT:     MOV T2.X, PV.W,
3293; EG-NEXT:     MOV * T0.Y, T3.X,
3294; EG-NEXT:    ALU clause starting at 134:
3295; EG-NEXT:     AND_INT T0.W, T8.X, literal.x,
3296; EG-NEXT:     AND_INT * T1.W, T0.Y, literal.y,
3297; EG-NEXT:    255(3.573311e-43), -65281(nan)
3298; EG-NEXT:     LSHL * T0.W, PV.W, literal.x,
3299; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
3300; EG-NEXT:     OR_INT * T0.W, T1.W, PV.W,
3301; EG-NEXT:     MOV T3.X, PV.W,
3302; EG-NEXT:     MOV * T0.Y, T4.X,
3303; EG-NEXT:    ALU clause starting at 142:
3304; EG-NEXT:     AND_INT T0.W, T8.X, literal.x,
3305; EG-NEXT:     AND_INT * T1.W, T0.Y, literal.y,
3306; EG-NEXT:    255(3.573311e-43), -65281(nan)
3307; EG-NEXT:     LSHL * T0.W, PV.W, literal.x,
3308; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
3309; EG-NEXT:     OR_INT * T0.W, T1.W, PV.W,
3310; EG-NEXT:     MOV T4.X, PV.W,
3311; EG-NEXT:     MOV * T0.Y, T5.X,
3312; EG-NEXT:    ALU clause starting at 150:
3313; EG-NEXT:     AND_INT T0.W, T8.X, literal.x,
3314; EG-NEXT:     AND_INT * T1.W, T0.Y, literal.y,
3315; EG-NEXT:    255(3.573311e-43), -65281(nan)
3316; EG-NEXT:     LSHL * T0.W, PV.W, literal.x,
3317; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
3318; EG-NEXT:     OR_INT * T0.W, T1.W, PV.W,
3319; EG-NEXT:     MOV T5.X, PV.W,
3320; EG-NEXT:     MOV * T0.Y, T2.X,
3321; EG-NEXT:    ALU clause starting at 158:
3322; EG-NEXT:     AND_INT T0.W, T0.Y, literal.x,
3323; EG-NEXT:     AND_INT * T1.W, T8.X, literal.y,
3324; EG-NEXT:    -256(nan), 255(3.573311e-43)
3325; EG-NEXT:     OR_INT * T7.W, PV.W, PS,
3326; EG-NEXT:     MOV T2.X, PV.W,
3327; EG-NEXT:     MOV * T0.Y, T3.X,
3328; EG-NEXT:    ALU clause starting at 164:
3329; EG-NEXT:     AND_INT T0.W, T0.Y, literal.x,
3330; EG-NEXT:     AND_INT * T1.W, T8.X, literal.y,
3331; EG-NEXT:    -256(nan), 255(3.573311e-43)
3332; EG-NEXT:     OR_INT * T7.Z, PV.W, PS,
3333; EG-NEXT:     MOV T3.X, PV.Z,
3334; EG-NEXT:     MOV * T0.Y, T4.X,
3335; EG-NEXT:    ALU clause starting at 170:
3336; EG-NEXT:     AND_INT T0.W, T0.Y, literal.x,
3337; EG-NEXT:     AND_INT * T1.W, T8.X, literal.y,
3338; EG-NEXT:    -256(nan), 255(3.573311e-43)
3339; EG-NEXT:     OR_INT * T7.Y, PV.W, PS,
3340; EG-NEXT:     MOV T4.X, PV.Y,
3341; EG-NEXT:     MOV * T0.Y, T5.X,
3342; EG-NEXT:    ALU clause starting at 176:
3343; EG-NEXT:     AND_INT T0.W, T0.Y, literal.x,
3344; EG-NEXT:     AND_INT * T1.W, T7.X, literal.y,
3345; EG-NEXT:    -256(nan), 255(3.573311e-43)
3346; EG-NEXT:     OR_INT T7.X, PV.W, PS,
3347; EG-NEXT:     LSHR * T8.X, KC0[2].Y, literal.x,
3348; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
3349;
3350; CM-LABEL: v16i8_arg:
3351; CM:       ; %bb.0: ; %entry
3352; CM-NEXT:    ALU 1, @68, KC0[], KC1[]
3353; CM-NEXT:    TEX 0 @36
3354; CM-NEXT:    ALU 5, @70, KC0[], KC1[]
3355; CM-NEXT:    TEX 0 @38
3356; CM-NEXT:    ALU 5, @76, KC0[], KC1[]
3357; CM-NEXT:    TEX 0 @40
3358; CM-NEXT:    ALU 5, @82, KC0[], KC1[]
3359; CM-NEXT:    TEX 0 @42
3360; CM-NEXT:    ALU 5, @88, KC0[], KC1[]
3361; CM-NEXT:    TEX 0 @44
3362; CM-NEXT:    ALU 7, @94, KC0[], KC1[]
3363; CM-NEXT:    TEX 0 @46
3364; CM-NEXT:    ALU 7, @102, KC0[], KC1[]
3365; CM-NEXT:    TEX 0 @48
3366; CM-NEXT:    ALU 7, @110, KC0[], KC1[]
3367; CM-NEXT:    TEX 0 @50
3368; CM-NEXT:    ALU 7, @118, KC0[], KC1[]
3369; CM-NEXT:    TEX 0 @52
3370; CM-NEXT:    ALU 7, @126, KC0[], KC1[]
3371; CM-NEXT:    TEX 0 @54
3372; CM-NEXT:    ALU 7, @134, KC0[], KC1[]
3373; CM-NEXT:    TEX 0 @56
3374; CM-NEXT:    ALU 7, @142, KC0[], KC1[]
3375; CM-NEXT:    TEX 0 @58
3376; CM-NEXT:    ALU 7, @150, KC0[], KC1[]
3377; CM-NEXT:    TEX 0 @60
3378; CM-NEXT:    ALU 5, @158, KC0[], KC1[]
3379; CM-NEXT:    TEX 0 @62
3380; CM-NEXT:    ALU 5, @164, KC0[], KC1[]
3381; CM-NEXT:    TEX 0 @64
3382; CM-NEXT:    ALU 5, @170, KC0[], KC1[]
3383; CM-NEXT:    TEX 0 @66
3384; CM-NEXT:    ALU 5, @176, KC0[CB0:0-32], KC1[]
3385; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T7, T8.X
3386; CM-NEXT:    CF_END
3387; CM-NEXT:    PAD
3388; CM-NEXT:    Fetch clause starting at 36:
3389; CM-NEXT:     VTX_READ_8 T8.X, T7.X, 67, #3
3390; CM-NEXT:    Fetch clause starting at 38:
3391; CM-NEXT:     VTX_READ_8 T8.X, T7.X, 63, #3
3392; CM-NEXT:    Fetch clause starting at 40:
3393; CM-NEXT:     VTX_READ_8 T8.X, T7.X, 59, #3
3394; CM-NEXT:    Fetch clause starting at 42:
3395; CM-NEXT:     VTX_READ_8 T8.X, T7.X, 55, #3
3396; CM-NEXT:    Fetch clause starting at 44:
3397; CM-NEXT:     VTX_READ_8 T8.X, T7.X, 66, #3
3398; CM-NEXT:    Fetch clause starting at 46:
3399; CM-NEXT:     VTX_READ_8 T8.X, T7.X, 62, #3
3400; CM-NEXT:    Fetch clause starting at 48:
3401; CM-NEXT:     VTX_READ_8 T8.X, T7.X, 58, #3
3402; CM-NEXT:    Fetch clause starting at 50:
3403; CM-NEXT:     VTX_READ_8 T8.X, T7.X, 54, #3
3404; CM-NEXT:    Fetch clause starting at 52:
3405; CM-NEXT:     VTX_READ_8 T8.X, T7.X, 65, #3
3406; CM-NEXT:    Fetch clause starting at 54:
3407; CM-NEXT:     VTX_READ_8 T8.X, T7.X, 61, #3
3408; CM-NEXT:    Fetch clause starting at 56:
3409; CM-NEXT:     VTX_READ_8 T8.X, T7.X, 57, #3
3410; CM-NEXT:    Fetch clause starting at 58:
3411; CM-NEXT:     VTX_READ_8 T8.X, T7.X, 53, #3
3412; CM-NEXT:    Fetch clause starting at 60:
3413; CM-NEXT:     VTX_READ_8 T8.X, T7.X, 64, #3
3414; CM-NEXT:    Fetch clause starting at 62:
3415; CM-NEXT:     VTX_READ_8 T8.X, T7.X, 60, #3
3416; CM-NEXT:    Fetch clause starting at 64:
3417; CM-NEXT:     VTX_READ_8 T8.X, T7.X, 56, #3
3418; CM-NEXT:    Fetch clause starting at 66:
3419; CM-NEXT:     VTX_READ_8 T7.X, T7.X, 52, #3
3420; CM-NEXT:    ALU clause starting at 68:
3421; CM-NEXT:     MOV * T0.Y, T2.X,
3422; CM-NEXT:     MOV * T7.X, 0.0,
3423; CM-NEXT:    ALU clause starting at 70:
3424; CM-NEXT:     LSHL T0.Z, T8.X, literal.x,
3425; CM-NEXT:     AND_INT * T0.W, T0.Y, literal.y,
3426; CM-NEXT:    24(3.363116e-44), 16777215(2.350989e-38)
3427; CM-NEXT:     OR_INT * T0.W, PV.W, PV.Z,
3428; CM-NEXT:     MOV T2.X, PV.W,
3429; CM-NEXT:     MOV * T0.Y, T3.X,
3430; CM-NEXT:    ALU clause starting at 76:
3431; CM-NEXT:     AND_INT T0.Z, T0.Y, literal.x,
3432; CM-NEXT:     LSHL * T0.W, T8.X, literal.y,
3433; CM-NEXT:    16777215(2.350989e-38), 24(3.363116e-44)
3434; CM-NEXT:     OR_INT * T0.W, PV.Z, PV.W,
3435; CM-NEXT:     MOV T3.X, PV.W,
3436; CM-NEXT:     MOV * T0.Y, T4.X,
3437; CM-NEXT:    ALU clause starting at 82:
3438; CM-NEXT:     AND_INT T0.Z, T0.Y, literal.x,
3439; CM-NEXT:     LSHL * T0.W, T8.X, literal.y,
3440; CM-NEXT:    16777215(2.350989e-38), 24(3.363116e-44)
3441; CM-NEXT:     OR_INT * T0.W, PV.Z, PV.W,
3442; CM-NEXT:     MOV T4.X, PV.W,
3443; CM-NEXT:     MOV * T0.Y, T5.X,
3444; CM-NEXT:    ALU clause starting at 88:
3445; CM-NEXT:     AND_INT T0.Z, T0.Y, literal.x,
3446; CM-NEXT:     LSHL * T0.W, T8.X, literal.y,
3447; CM-NEXT:    16777215(2.350989e-38), 24(3.363116e-44)
3448; CM-NEXT:     OR_INT * T0.W, PV.Z, PV.W,
3449; CM-NEXT:     MOV T5.X, PV.W,
3450; CM-NEXT:     MOV * T0.Y, T2.X,
3451; CM-NEXT:    ALU clause starting at 94:
3452; CM-NEXT:     AND_INT * T0.W, T8.X, literal.x,
3453; CM-NEXT:    255(3.573311e-43), 0(0.000000e+00)
3454; CM-NEXT:     AND_INT T0.Z, T0.Y, literal.x,
3455; CM-NEXT:     LSHL * T0.W, PV.W, literal.y,
3456; CM-NEXT:    -16711681(-1.714704e+38), 16(2.242078e-44)
3457; CM-NEXT:     OR_INT * T0.W, PV.Z, PV.W,
3458; CM-NEXT:     MOV T2.X, PV.W,
3459; CM-NEXT:     MOV * T0.Y, T3.X,
3460; CM-NEXT:    ALU clause starting at 102:
3461; CM-NEXT:     AND_INT * T0.W, T8.X, literal.x,
3462; CM-NEXT:    255(3.573311e-43), 0(0.000000e+00)
3463; CM-NEXT:     AND_INT T0.Z, T0.Y, literal.x,
3464; CM-NEXT:     LSHL * T0.W, PV.W, literal.y,
3465; CM-NEXT:    -16711681(-1.714704e+38), 16(2.242078e-44)
3466; CM-NEXT:     OR_INT * T0.W, PV.Z, PV.W,
3467; CM-NEXT:     MOV T3.X, PV.W,
3468; CM-NEXT:     MOV * T0.Y, T4.X,
3469; CM-NEXT:    ALU clause starting at 110:
3470; CM-NEXT:     AND_INT * T0.W, T8.X, literal.x,
3471; CM-NEXT:    255(3.573311e-43), 0(0.000000e+00)
3472; CM-NEXT:     AND_INT T0.Z, T0.Y, literal.x,
3473; CM-NEXT:     LSHL * T0.W, PV.W, literal.y,
3474; CM-NEXT:    -16711681(-1.714704e+38), 16(2.242078e-44)
3475; CM-NEXT:     OR_INT * T0.W, PV.Z, PV.W,
3476; CM-NEXT:     MOV T4.X, PV.W,
3477; CM-NEXT:     MOV * T0.Y, T5.X,
3478; CM-NEXT:    ALU clause starting at 118:
3479; CM-NEXT:     AND_INT * T0.W, T8.X, literal.x,
3480; CM-NEXT:    255(3.573311e-43), 0(0.000000e+00)
3481; CM-NEXT:     AND_INT T0.Z, T0.Y, literal.x,
3482; CM-NEXT:     LSHL * T0.W, PV.W, literal.y,
3483; CM-NEXT:    -16711681(-1.714704e+38), 16(2.242078e-44)
3484; CM-NEXT:     OR_INT * T0.W, PV.Z, PV.W,
3485; CM-NEXT:     MOV T5.X, PV.W,
3486; CM-NEXT:     MOV * T0.Y, T2.X,
3487; CM-NEXT:    ALU clause starting at 126:
3488; CM-NEXT:     AND_INT * T0.W, T8.X, literal.x,
3489; CM-NEXT:    255(3.573311e-43), 0(0.000000e+00)
3490; CM-NEXT:     AND_INT T0.Z, T0.Y, literal.x,
3491; CM-NEXT:     LSHL * T0.W, PV.W, literal.y,
3492; CM-NEXT:    -65281(nan), 8(1.121039e-44)
3493; CM-NEXT:     OR_INT * T0.W, PV.Z, PV.W,
3494; CM-NEXT:     MOV T2.X, PV.W,
3495; CM-NEXT:     MOV * T0.Y, T3.X,
3496; CM-NEXT:    ALU clause starting at 134:
3497; CM-NEXT:     AND_INT * T0.W, T8.X, literal.x,
3498; CM-NEXT:    255(3.573311e-43), 0(0.000000e+00)
3499; CM-NEXT:     AND_INT T0.Z, T0.Y, literal.x,
3500; CM-NEXT:     LSHL * T0.W, PV.W, literal.y,
3501; CM-NEXT:    -65281(nan), 8(1.121039e-44)
3502; CM-NEXT:     OR_INT * T0.W, PV.Z, PV.W,
3503; CM-NEXT:     MOV T3.X, PV.W,
3504; CM-NEXT:     MOV * T0.Y, T4.X,
3505; CM-NEXT:    ALU clause starting at 142:
3506; CM-NEXT:     AND_INT * T0.W, T8.X, literal.x,
3507; CM-NEXT:    255(3.573311e-43), 0(0.000000e+00)
3508; CM-NEXT:     AND_INT T0.Z, T0.Y, literal.x,
3509; CM-NEXT:     LSHL * T0.W, PV.W, literal.y,
3510; CM-NEXT:    -65281(nan), 8(1.121039e-44)
3511; CM-NEXT:     OR_INT * T0.W, PV.Z, PV.W,
3512; CM-NEXT:     MOV T4.X, PV.W,
3513; CM-NEXT:     MOV * T0.Y, T5.X,
3514; CM-NEXT:    ALU clause starting at 150:
3515; CM-NEXT:     AND_INT * T0.W, T8.X, literal.x,
3516; CM-NEXT:    255(3.573311e-43), 0(0.000000e+00)
3517; CM-NEXT:     AND_INT T0.Z, T0.Y, literal.x,
3518; CM-NEXT:     LSHL * T0.W, PV.W, literal.y,
3519; CM-NEXT:    -65281(nan), 8(1.121039e-44)
3520; CM-NEXT:     OR_INT * T0.W, PV.Z, PV.W,
3521; CM-NEXT:     MOV T5.X, PV.W,
3522; CM-NEXT:     MOV * T0.Y, T2.X,
3523; CM-NEXT:    ALU clause starting at 158:
3524; CM-NEXT:     AND_INT T0.Z, T0.Y, literal.x,
3525; CM-NEXT:     AND_INT * T0.W, T8.X, literal.y,
3526; CM-NEXT:    -256(nan), 255(3.573311e-43)
3527; CM-NEXT:     OR_INT * T7.W, PV.Z, PV.W,
3528; CM-NEXT:     MOV T2.X, PV.W,
3529; CM-NEXT:     MOV * T0.Y, T3.X,
3530; CM-NEXT:    ALU clause starting at 164:
3531; CM-NEXT:     AND_INT T0.Z, T0.Y, literal.x,
3532; CM-NEXT:     AND_INT * T0.W, T8.X, literal.y,
3533; CM-NEXT:    -256(nan), 255(3.573311e-43)
3534; CM-NEXT:     OR_INT * T7.Z, PV.Z, PV.W,
3535; CM-NEXT:     MOV T3.X, PV.Z,
3536; CM-NEXT:     MOV * T0.Y, T4.X,
3537; CM-NEXT:    ALU clause starting at 170:
3538; CM-NEXT:     AND_INT T0.Z, T0.Y, literal.x,
3539; CM-NEXT:     AND_INT * T0.W, T8.X, literal.y,
3540; CM-NEXT:    -256(nan), 255(3.573311e-43)
3541; CM-NEXT:     OR_INT * T7.Y, PV.Z, PV.W,
3542; CM-NEXT:     MOV T4.X, PV.Y,
3543; CM-NEXT:     MOV * T0.Y, T5.X,
3544; CM-NEXT:    ALU clause starting at 176:
3545; CM-NEXT:     AND_INT T0.Z, T0.Y, literal.x,
3546; CM-NEXT:     AND_INT * T0.W, T7.X, literal.y,
3547; CM-NEXT:    -256(nan), 255(3.573311e-43)
3548; CM-NEXT:     OR_INT * T7.X, PV.Z, PV.W,
3549; CM-NEXT:     LSHR * T8.X, KC0[2].Y, literal.x,
3550; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
3551entry:
3552  store <16 x i8> %in, ptr addrspace(1) %out
3553  ret void
3554}
3555
3556define amdgpu_kernel void @v16i16_arg(ptr addrspace(1) %out, <16 x i16> %in) {
3557; SI-LABEL: v16i16_arg:
3558; SI:       ; %bb.0: ; %entry
3559; SI-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x11
3560; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
3561; SI-NEXT:    s_mov_b32 s3, 0xf000
3562; SI-NEXT:    s_mov_b32 s2, -1
3563; SI-NEXT:    s_waitcnt lgkmcnt(0)
3564; SI-NEXT:    v_mov_b32_e32 v0, s12
3565; SI-NEXT:    v_mov_b32_e32 v1, s13
3566; SI-NEXT:    v_mov_b32_e32 v2, s14
3567; SI-NEXT:    v_mov_b32_e32 v3, s15
3568; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
3569; SI-NEXT:    s_waitcnt expcnt(0)
3570; SI-NEXT:    v_mov_b32_e32 v0, s8
3571; SI-NEXT:    v_mov_b32_e32 v1, s9
3572; SI-NEXT:    v_mov_b32_e32 v2, s10
3573; SI-NEXT:    v_mov_b32_e32 v3, s11
3574; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
3575; SI-NEXT:    s_endpgm
3576;
3577; VI-LABEL: v16i16_arg:
3578; VI:       ; %bb.0: ; %entry
3579; VI-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x44
3580; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
3581; VI-NEXT:    s_waitcnt lgkmcnt(0)
3582; VI-NEXT:    v_mov_b32_e32 v0, s12
3583; VI-NEXT:    s_add_u32 s2, s0, 16
3584; VI-NEXT:    s_addc_u32 s3, s1, 0
3585; VI-NEXT:    v_mov_b32_e32 v5, s3
3586; VI-NEXT:    v_mov_b32_e32 v1, s13
3587; VI-NEXT:    v_mov_b32_e32 v2, s14
3588; VI-NEXT:    v_mov_b32_e32 v3, s15
3589; VI-NEXT:    v_mov_b32_e32 v4, s2
3590; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
3591; VI-NEXT:    v_mov_b32_e32 v5, s1
3592; VI-NEXT:    v_mov_b32_e32 v0, s8
3593; VI-NEXT:    v_mov_b32_e32 v1, s9
3594; VI-NEXT:    v_mov_b32_e32 v2, s10
3595; VI-NEXT:    v_mov_b32_e32 v3, s11
3596; VI-NEXT:    v_mov_b32_e32 v4, s0
3597; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
3598; VI-NEXT:    s_endpgm
3599;
3600; GFX9-LABEL: v16i16_arg:
3601; GFX9:       ; %bb.0: ; %entry
3602; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[8:9], 0x20
3603; GFX9-NEXT:    v_mov_b32_e32 v4, 0
3604; GFX9-NEXT:    s_load_dwordx2 s[8:9], s[8:9], 0x0
3605; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3606; GFX9-NEXT:    v_mov_b32_e32 v0, s4
3607; GFX9-NEXT:    v_mov_b32_e32 v1, s5
3608; GFX9-NEXT:    v_mov_b32_e32 v2, s6
3609; GFX9-NEXT:    v_mov_b32_e32 v3, s7
3610; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[8:9] offset:16
3611; GFX9-NEXT:    s_nop 0
3612; GFX9-NEXT:    v_mov_b32_e32 v0, s0
3613; GFX9-NEXT:    v_mov_b32_e32 v1, s1
3614; GFX9-NEXT:    v_mov_b32_e32 v2, s2
3615; GFX9-NEXT:    v_mov_b32_e32 v3, s3
3616; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[8:9]
3617; GFX9-NEXT:    s_endpgm
3618;
3619; EG-LABEL: v16i16_arg:
3620; EG:       ; %bb.0: ; %entry
3621; EG-NEXT:    ALU 1, @68, KC0[], KC1[]
3622; EG-NEXT:    TEX 0 @36
3623; EG-NEXT:    ALU 5, @70, KC0[], KC1[]
3624; EG-NEXT:    TEX 0 @38
3625; EG-NEXT:    ALU 5, @76, KC0[], KC1[]
3626; EG-NEXT:    TEX 0 @40
3627; EG-NEXT:    ALU 5, @82, KC0[], KC1[]
3628; EG-NEXT:    TEX 0 @42
3629; EG-NEXT:    ALU 5, @88, KC0[], KC1[]
3630; EG-NEXT:    TEX 0 @44
3631; EG-NEXT:    ALU 5, @94, KC0[], KC1[]
3632; EG-NEXT:    TEX 0 @46
3633; EG-NEXT:    ALU 5, @100, KC0[], KC1[]
3634; EG-NEXT:    TEX 0 @48
3635; EG-NEXT:    ALU 5, @106, KC0[], KC1[]
3636; EG-NEXT:    TEX 0 @50
3637; EG-NEXT:    ALU 5, @112, KC0[], KC1[]
3638; EG-NEXT:    TEX 0 @52
3639; EG-NEXT:    ALU 5, @118, KC0[], KC1[]
3640; EG-NEXT:    TEX 0 @54
3641; EG-NEXT:    ALU 5, @124, KC0[], KC1[]
3642; EG-NEXT:    TEX 0 @56
3643; EG-NEXT:    ALU 5, @130, KC0[], KC1[]
3644; EG-NEXT:    TEX 0 @58
3645; EG-NEXT:    ALU 5, @136, KC0[], KC1[]
3646; EG-NEXT:    TEX 0 @60
3647; EG-NEXT:    ALU 5, @142, KC0[], KC1[]
3648; EG-NEXT:    TEX 0 @62
3649; EG-NEXT:    ALU 5, @148, KC0[], KC1[]
3650; EG-NEXT:    TEX 0 @64
3651; EG-NEXT:    ALU 5, @154, KC0[], KC1[]
3652; EG-NEXT:    TEX 0 @66
3653; EG-NEXT:    ALU 13, @160, KC0[CB0:0-32], KC1[]
3654; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T12.XYZW, T14.X, 0
3655; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T11.XYZW, T13.X, 1
3656; EG-NEXT:    CF_END
3657; EG-NEXT:    Fetch clause starting at 36:
3658; EG-NEXT:     VTX_READ_16 T12.X, T11.X, 98, #3
3659; EG-NEXT:    Fetch clause starting at 38:
3660; EG-NEXT:     VTX_READ_16 T12.X, T11.X, 90, #3
3661; EG-NEXT:    Fetch clause starting at 40:
3662; EG-NEXT:     VTX_READ_16 T12.X, T11.X, 82, #3
3663; EG-NEXT:    Fetch clause starting at 42:
3664; EG-NEXT:     VTX_READ_16 T12.X, T11.X, 74, #3
3665; EG-NEXT:    Fetch clause starting at 44:
3666; EG-NEXT:     VTX_READ_16 T12.X, T11.X, 96, #3
3667; EG-NEXT:    Fetch clause starting at 46:
3668; EG-NEXT:     VTX_READ_16 T12.X, T11.X, 88, #3
3669; EG-NEXT:    Fetch clause starting at 48:
3670; EG-NEXT:     VTX_READ_16 T12.X, T11.X, 80, #3
3671; EG-NEXT:    Fetch clause starting at 50:
3672; EG-NEXT:     VTX_READ_16 T12.X, T11.X, 72, #3
3673; EG-NEXT:    Fetch clause starting at 52:
3674; EG-NEXT:     VTX_READ_16 T12.X, T11.X, 94, #3
3675; EG-NEXT:    Fetch clause starting at 54:
3676; EG-NEXT:     VTX_READ_16 T12.X, T11.X, 86, #3
3677; EG-NEXT:    Fetch clause starting at 56:
3678; EG-NEXT:     VTX_READ_16 T12.X, T11.X, 78, #3
3679; EG-NEXT:    Fetch clause starting at 58:
3680; EG-NEXT:     VTX_READ_16 T12.X, T11.X, 70, #3
3681; EG-NEXT:    Fetch clause starting at 60:
3682; EG-NEXT:     VTX_READ_16 T12.X, T11.X, 92, #3
3683; EG-NEXT:    Fetch clause starting at 62:
3684; EG-NEXT:     VTX_READ_16 T12.X, T11.X, 84, #3
3685; EG-NEXT:    Fetch clause starting at 64:
3686; EG-NEXT:     VTX_READ_16 T13.X, T11.X, 76, #3
3687; EG-NEXT:    Fetch clause starting at 66:
3688; EG-NEXT:     VTX_READ_16 T11.X, T11.X, 68, #3
3689; EG-NEXT:    ALU clause starting at 68:
3690; EG-NEXT:     MOV * T0.Y, T3.X,
3691; EG-NEXT:     MOV * T11.X, 0.0,
3692; EG-NEXT:    ALU clause starting at 70:
3693; EG-NEXT:     LSHL T0.W, T12.X, literal.x,
3694; EG-NEXT:     AND_INT * T1.W, T0.Y, literal.y,
3695; EG-NEXT:    16(2.242078e-44), 65535(9.183409e-41)
3696; EG-NEXT:     OR_INT * T0.W, PS, PV.W,
3697; EG-NEXT:     MOV T3.X, PV.W,
3698; EG-NEXT:     MOV * T0.Y, T5.X,
3699; EG-NEXT:    ALU clause starting at 76:
3700; EG-NEXT:     AND_INT T0.W, T0.Y, literal.x,
3701; EG-NEXT:     LSHL * T1.W, T12.X, literal.y,
3702; EG-NEXT:    65535(9.183409e-41), 16(2.242078e-44)
3703; EG-NEXT:     OR_INT * T0.W, PV.W, PS,
3704; EG-NEXT:     MOV T5.X, PV.W,
3705; EG-NEXT:     MOV * T0.Y, T7.X,
3706; EG-NEXT:    ALU clause starting at 82:
3707; EG-NEXT:     AND_INT T0.W, T0.Y, literal.x,
3708; EG-NEXT:     LSHL * T1.W, T12.X, literal.y,
3709; EG-NEXT:    65535(9.183409e-41), 16(2.242078e-44)
3710; EG-NEXT:     OR_INT * T0.W, PV.W, PS,
3711; EG-NEXT:     MOV T7.X, PV.W,
3712; EG-NEXT:     MOV * T0.Y, T9.X,
3713; EG-NEXT:    ALU clause starting at 88:
3714; EG-NEXT:     AND_INT T0.W, T0.Y, literal.x,
3715; EG-NEXT:     LSHL * T1.W, T12.X, literal.y,
3716; EG-NEXT:    65535(9.183409e-41), 16(2.242078e-44)
3717; EG-NEXT:     OR_INT * T0.W, PV.W, PS,
3718; EG-NEXT:     MOV T9.X, PV.W,
3719; EG-NEXT:     MOV * T0.Y, T3.X,
3720; EG-NEXT:    ALU clause starting at 94:
3721; EG-NEXT:     AND_INT T0.W, T0.Y, literal.x,
3722; EG-NEXT:     AND_INT * T1.W, T12.X, literal.y,
3723; EG-NEXT:    -65536(nan), 65535(9.183409e-41)
3724; EG-NEXT:     OR_INT * T0.W, PV.W, PS,
3725; EG-NEXT:     MOV T3.X, PV.W,
3726; EG-NEXT:     MOV * T0.Y, T5.X,
3727; EG-NEXT:    ALU clause starting at 100:
3728; EG-NEXT:     AND_INT T0.W, T0.Y, literal.x,
3729; EG-NEXT:     AND_INT * T1.W, T12.X, literal.y,
3730; EG-NEXT:    -65536(nan), 65535(9.183409e-41)
3731; EG-NEXT:     OR_INT * T0.W, PV.W, PS,
3732; EG-NEXT:     MOV T5.X, PV.W,
3733; EG-NEXT:     MOV * T0.Y, T7.X,
3734; EG-NEXT:    ALU clause starting at 106:
3735; EG-NEXT:     AND_INT T0.W, T0.Y, literal.x,
3736; EG-NEXT:     AND_INT * T1.W, T12.X, literal.y,
3737; EG-NEXT:    -65536(nan), 65535(9.183409e-41)
3738; EG-NEXT:     OR_INT * T0.W, PV.W, PS,
3739; EG-NEXT:     MOV T7.X, PV.W,
3740; EG-NEXT:     MOV * T0.Y, T9.X,
3741; EG-NEXT:    ALU clause starting at 112:
3742; EG-NEXT:     AND_INT T0.W, T0.Y, literal.x,
3743; EG-NEXT:     AND_INT * T1.W, T12.X, literal.y,
3744; EG-NEXT:    -65536(nan), 65535(9.183409e-41)
3745; EG-NEXT:     OR_INT * T0.W, PV.W, PS,
3746; EG-NEXT:     MOV T9.X, PV.W,
3747; EG-NEXT:     MOV * T0.Y, T2.X,
3748; EG-NEXT:    ALU clause starting at 118:
3749; EG-NEXT:     AND_INT T0.W, T0.Y, literal.x,
3750; EG-NEXT:     LSHL * T1.W, T12.X, literal.y,
3751; EG-NEXT:    65535(9.183409e-41), 16(2.242078e-44)
3752; EG-NEXT:     OR_INT * T0.W, PV.W, PS,
3753; EG-NEXT:     MOV T2.X, PV.W,
3754; EG-NEXT:     MOV * T0.Y, T4.X,
3755; EG-NEXT:    ALU clause starting at 124:
3756; EG-NEXT:     AND_INT T0.W, T0.Y, literal.x,
3757; EG-NEXT:     LSHL * T1.W, T12.X, literal.y,
3758; EG-NEXT:    65535(9.183409e-41), 16(2.242078e-44)
3759; EG-NEXT:     OR_INT * T0.W, PV.W, PS,
3760; EG-NEXT:     MOV T4.X, PV.W,
3761; EG-NEXT:     MOV * T0.Y, T6.X,
3762; EG-NEXT:    ALU clause starting at 130:
3763; EG-NEXT:     AND_INT T0.W, T0.Y, literal.x,
3764; EG-NEXT:     LSHL * T1.W, T12.X, literal.y,
3765; EG-NEXT:    65535(9.183409e-41), 16(2.242078e-44)
3766; EG-NEXT:     OR_INT * T0.W, PV.W, PS,
3767; EG-NEXT:     MOV T6.X, PV.W,
3768; EG-NEXT:     MOV * T0.Y, T8.X,
3769; EG-NEXT:    ALU clause starting at 136:
3770; EG-NEXT:     AND_INT T0.W, T0.Y, literal.x,
3771; EG-NEXT:     LSHL * T1.W, T12.X, literal.y,
3772; EG-NEXT:    65535(9.183409e-41), 16(2.242078e-44)
3773; EG-NEXT:     OR_INT * T0.W, PV.W, PS,
3774; EG-NEXT:     MOV T8.X, PV.W,
3775; EG-NEXT:     MOV * T0.Y, T2.X,
3776; EG-NEXT:    ALU clause starting at 142:
3777; EG-NEXT:     AND_INT T0.W, T0.Y, literal.x,
3778; EG-NEXT:     AND_INT * T1.W, T12.X, literal.y,
3779; EG-NEXT:    -65536(nan), 65535(9.183409e-41)
3780; EG-NEXT:     OR_INT * T12.Z, PV.W, PS,
3781; EG-NEXT:     MOV T2.X, PV.Z,
3782; EG-NEXT:     MOV * T0.Y, T4.X,
3783; EG-NEXT:    ALU clause starting at 148:
3784; EG-NEXT:     AND_INT T0.W, T0.Y, literal.x,
3785; EG-NEXT:     AND_INT * T1.W, T12.X, literal.y,
3786; EG-NEXT:    -65536(nan), 65535(9.183409e-41)
3787; EG-NEXT:     OR_INT * T12.X, PV.W, PS,
3788; EG-NEXT:     MOV T4.X, PV.X,
3789; EG-NEXT:     MOV * T0.Y, T6.X,
3790; EG-NEXT:    ALU clause starting at 154:
3791; EG-NEXT:     AND_INT T0.W, T0.Y, literal.x,
3792; EG-NEXT:     AND_INT * T1.W, T13.X, literal.y,
3793; EG-NEXT:    -65536(nan), 65535(9.183409e-41)
3794; EG-NEXT:     OR_INT * T11.Z, PV.W, PS,
3795; EG-NEXT:     MOV T6.X, PV.Z,
3796; EG-NEXT:     MOV * T0.Y, T8.X,
3797; EG-NEXT:    ALU clause starting at 160:
3798; EG-NEXT:     LSHR T13.X, KC0[2].Y, literal.x,
3799; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
3800; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
3801; EG-NEXT:     LSHR T14.X, PV.W, literal.x,
3802; EG-NEXT:     AND_INT T0.W, T0.Y, literal.y,
3803; EG-NEXT:     AND_INT * T1.W, T11.X, literal.z,
3804; EG-NEXT:    2(2.802597e-45), -65536(nan)
3805; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
3806; EG-NEXT:     OR_INT * T11.X, PV.W, PS,
3807; EG-NEXT:     MOV T8.X, PV.X,
3808; EG-NEXT:     MOV * T12.W, T3.X,
3809; EG-NEXT:     MOV T12.Y, T5.X,
3810; EG-NEXT:     MOV T11.W, T7.X, BS:VEC_120/SCL_212
3811; EG-NEXT:     MOV * T11.Y, T9.X,
3812;
3813; CM-LABEL: v16i16_arg:
3814; CM:       ; %bb.0: ; %entry
3815; CM-NEXT:    ALU 1, @68, KC0[], KC1[]
3816; CM-NEXT:    TEX 0 @36
3817; CM-NEXT:    ALU 5, @70, KC0[], KC1[]
3818; CM-NEXT:    TEX 0 @38
3819; CM-NEXT:    ALU 5, @76, KC0[], KC1[]
3820; CM-NEXT:    TEX 0 @40
3821; CM-NEXT:    ALU 5, @82, KC0[], KC1[]
3822; CM-NEXT:    TEX 0 @42
3823; CM-NEXT:    ALU 5, @88, KC0[], KC1[]
3824; CM-NEXT:    TEX 0 @44
3825; CM-NEXT:    ALU 5, @94, KC0[], KC1[]
3826; CM-NEXT:    TEX 0 @46
3827; CM-NEXT:    ALU 5, @100, KC0[], KC1[]
3828; CM-NEXT:    TEX 0 @48
3829; CM-NEXT:    ALU 5, @106, KC0[], KC1[]
3830; CM-NEXT:    TEX 0 @50
3831; CM-NEXT:    ALU 5, @112, KC0[], KC1[]
3832; CM-NEXT:    TEX 0 @52
3833; CM-NEXT:    ALU 5, @118, KC0[], KC1[]
3834; CM-NEXT:    TEX 0 @54
3835; CM-NEXT:    ALU 5, @124, KC0[], KC1[]
3836; CM-NEXT:    TEX 0 @56
3837; CM-NEXT:    ALU 5, @130, KC0[], KC1[]
3838; CM-NEXT:    TEX 0 @58
3839; CM-NEXT:    ALU 5, @136, KC0[], KC1[]
3840; CM-NEXT:    TEX 0 @60
3841; CM-NEXT:    ALU 5, @142, KC0[], KC1[]
3842; CM-NEXT:    TEX 0 @62
3843; CM-NEXT:    ALU 5, @148, KC0[], KC1[]
3844; CM-NEXT:    TEX 0 @64
3845; CM-NEXT:    ALU 5, @154, KC0[], KC1[]
3846; CM-NEXT:    TEX 0 @66
3847; CM-NEXT:    ALU 14, @160, KC0[CB0:0-32], KC1[]
3848; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T11, T14.X
3849; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T12, T13.X
3850; CM-NEXT:    CF_END
3851; CM-NEXT:    Fetch clause starting at 36:
3852; CM-NEXT:     VTX_READ_16 T12.X, T11.X, 98, #3
3853; CM-NEXT:    Fetch clause starting at 38:
3854; CM-NEXT:     VTX_READ_16 T12.X, T11.X, 90, #3
3855; CM-NEXT:    Fetch clause starting at 40:
3856; CM-NEXT:     VTX_READ_16 T12.X, T11.X, 82, #3
3857; CM-NEXT:    Fetch clause starting at 42:
3858; CM-NEXT:     VTX_READ_16 T12.X, T11.X, 74, #3
3859; CM-NEXT:    Fetch clause starting at 44:
3860; CM-NEXT:     VTX_READ_16 T12.X, T11.X, 96, #3
3861; CM-NEXT:    Fetch clause starting at 46:
3862; CM-NEXT:     VTX_READ_16 T12.X, T11.X, 88, #3
3863; CM-NEXT:    Fetch clause starting at 48:
3864; CM-NEXT:     VTX_READ_16 T12.X, T11.X, 80, #3
3865; CM-NEXT:    Fetch clause starting at 50:
3866; CM-NEXT:     VTX_READ_16 T12.X, T11.X, 72, #3
3867; CM-NEXT:    Fetch clause starting at 52:
3868; CM-NEXT:     VTX_READ_16 T12.X, T11.X, 94, #3
3869; CM-NEXT:    Fetch clause starting at 54:
3870; CM-NEXT:     VTX_READ_16 T12.X, T11.X, 86, #3
3871; CM-NEXT:    Fetch clause starting at 56:
3872; CM-NEXT:     VTX_READ_16 T12.X, T11.X, 78, #3
3873; CM-NEXT:    Fetch clause starting at 58:
3874; CM-NEXT:     VTX_READ_16 T12.X, T11.X, 70, #3
3875; CM-NEXT:    Fetch clause starting at 60:
3876; CM-NEXT:     VTX_READ_16 T12.X, T11.X, 92, #3
3877; CM-NEXT:    Fetch clause starting at 62:
3878; CM-NEXT:     VTX_READ_16 T12.X, T11.X, 84, #3
3879; CM-NEXT:    Fetch clause starting at 64:
3880; CM-NEXT:     VTX_READ_16 T13.X, T11.X, 76, #3
3881; CM-NEXT:    Fetch clause starting at 66:
3882; CM-NEXT:     VTX_READ_16 T11.X, T11.X, 68, #3
3883; CM-NEXT:    ALU clause starting at 68:
3884; CM-NEXT:     MOV * T0.Y, T3.X,
3885; CM-NEXT:     MOV * T11.X, 0.0,
3886; CM-NEXT:    ALU clause starting at 70:
3887; CM-NEXT:     LSHL T0.Z, T12.X, literal.x,
3888; CM-NEXT:     AND_INT * T0.W, T0.Y, literal.y,
3889; CM-NEXT:    16(2.242078e-44), 65535(9.183409e-41)
3890; CM-NEXT:     OR_INT * T0.W, PV.W, PV.Z,
3891; CM-NEXT:     MOV T3.X, PV.W,
3892; CM-NEXT:     MOV * T0.Y, T5.X,
3893; CM-NEXT:    ALU clause starting at 76:
3894; CM-NEXT:     AND_INT T0.Z, T0.Y, literal.x,
3895; CM-NEXT:     LSHL * T0.W, T12.X, literal.y,
3896; CM-NEXT:    65535(9.183409e-41), 16(2.242078e-44)
3897; CM-NEXT:     OR_INT * T0.W, PV.Z, PV.W,
3898; CM-NEXT:     MOV T5.X, PV.W,
3899; CM-NEXT:     MOV * T0.Y, T7.X,
3900; CM-NEXT:    ALU clause starting at 82:
3901; CM-NEXT:     AND_INT T0.Z, T0.Y, literal.x,
3902; CM-NEXT:     LSHL * T0.W, T12.X, literal.y,
3903; CM-NEXT:    65535(9.183409e-41), 16(2.242078e-44)
3904; CM-NEXT:     OR_INT * T0.W, PV.Z, PV.W,
3905; CM-NEXT:     MOV T7.X, PV.W,
3906; CM-NEXT:     MOV * T0.Y, T9.X,
3907; CM-NEXT:    ALU clause starting at 88:
3908; CM-NEXT:     AND_INT T0.Z, T0.Y, literal.x,
3909; CM-NEXT:     LSHL * T0.W, T12.X, literal.y,
3910; CM-NEXT:    65535(9.183409e-41), 16(2.242078e-44)
3911; CM-NEXT:     OR_INT * T0.W, PV.Z, PV.W,
3912; CM-NEXT:     MOV T9.X, PV.W,
3913; CM-NEXT:     MOV * T0.Y, T3.X,
3914; CM-NEXT:    ALU clause starting at 94:
3915; CM-NEXT:     AND_INT T0.Z, T0.Y, literal.x,
3916; CM-NEXT:     AND_INT * T0.W, T12.X, literal.y,
3917; CM-NEXT:    -65536(nan), 65535(9.183409e-41)
3918; CM-NEXT:     OR_INT * T0.W, PV.Z, PV.W,
3919; CM-NEXT:     MOV T3.X, PV.W,
3920; CM-NEXT:     MOV * T0.Y, T5.X,
3921; CM-NEXT:    ALU clause starting at 100:
3922; CM-NEXT:     AND_INT T0.Z, T0.Y, literal.x,
3923; CM-NEXT:     AND_INT * T0.W, T12.X, literal.y,
3924; CM-NEXT:    -65536(nan), 65535(9.183409e-41)
3925; CM-NEXT:     OR_INT * T0.W, PV.Z, PV.W,
3926; CM-NEXT:     MOV T5.X, PV.W,
3927; CM-NEXT:     MOV * T0.Y, T7.X,
3928; CM-NEXT:    ALU clause starting at 106:
3929; CM-NEXT:     AND_INT T0.Z, T0.Y, literal.x,
3930; CM-NEXT:     AND_INT * T0.W, T12.X, literal.y,
3931; CM-NEXT:    -65536(nan), 65535(9.183409e-41)
3932; CM-NEXT:     OR_INT * T0.W, PV.Z, PV.W,
3933; CM-NEXT:     MOV T7.X, PV.W,
3934; CM-NEXT:     MOV * T0.Y, T9.X,
3935; CM-NEXT:    ALU clause starting at 112:
3936; CM-NEXT:     AND_INT T0.Z, T0.Y, literal.x,
3937; CM-NEXT:     AND_INT * T0.W, T12.X, literal.y,
3938; CM-NEXT:    -65536(nan), 65535(9.183409e-41)
3939; CM-NEXT:     OR_INT * T0.W, PV.Z, PV.W,
3940; CM-NEXT:     MOV T9.X, PV.W,
3941; CM-NEXT:     MOV * T0.Y, T2.X,
3942; CM-NEXT:    ALU clause starting at 118:
3943; CM-NEXT:     AND_INT T0.Z, T0.Y, literal.x,
3944; CM-NEXT:     LSHL * T0.W, T12.X, literal.y,
3945; CM-NEXT:    65535(9.183409e-41), 16(2.242078e-44)
3946; CM-NEXT:     OR_INT * T0.W, PV.Z, PV.W,
3947; CM-NEXT:     MOV T2.X, PV.W,
3948; CM-NEXT:     MOV * T0.Y, T4.X,
3949; CM-NEXT:    ALU clause starting at 124:
3950; CM-NEXT:     AND_INT T0.Z, T0.Y, literal.x,
3951; CM-NEXT:     LSHL * T0.W, T12.X, literal.y,
3952; CM-NEXT:    65535(9.183409e-41), 16(2.242078e-44)
3953; CM-NEXT:     OR_INT * T0.W, PV.Z, PV.W,
3954; CM-NEXT:     MOV T4.X, PV.W,
3955; CM-NEXT:     MOV * T0.Y, T6.X,
3956; CM-NEXT:    ALU clause starting at 130:
3957; CM-NEXT:     AND_INT T0.Z, T0.Y, literal.x,
3958; CM-NEXT:     LSHL * T0.W, T12.X, literal.y,
3959; CM-NEXT:    65535(9.183409e-41), 16(2.242078e-44)
3960; CM-NEXT:     OR_INT * T0.W, PV.Z, PV.W,
3961; CM-NEXT:     MOV T6.X, PV.W,
3962; CM-NEXT:     MOV * T0.Y, T8.X,
3963; CM-NEXT:    ALU clause starting at 136:
3964; CM-NEXT:     AND_INT T0.Z, T0.Y, literal.x,
3965; CM-NEXT:     LSHL * T0.W, T12.X, literal.y,
3966; CM-NEXT:    65535(9.183409e-41), 16(2.242078e-44)
3967; CM-NEXT:     OR_INT * T0.W, PV.Z, PV.W,
3968; CM-NEXT:     MOV T8.X, PV.W,
3969; CM-NEXT:     MOV * T0.Y, T2.X,
3970; CM-NEXT:    ALU clause starting at 142:
3971; CM-NEXT:     AND_INT T0.Z, T0.Y, literal.x,
3972; CM-NEXT:     AND_INT * T0.W, T12.X, literal.y,
3973; CM-NEXT:    -65536(nan), 65535(9.183409e-41)
3974; CM-NEXT:     OR_INT * T12.Z, PV.Z, PV.W,
3975; CM-NEXT:     MOV T2.X, PV.Z,
3976; CM-NEXT:     MOV * T0.Y, T4.X,
3977; CM-NEXT:    ALU clause starting at 148:
3978; CM-NEXT:     AND_INT T0.Z, T0.Y, literal.x,
3979; CM-NEXT:     AND_INT * T0.W, T12.X, literal.y,
3980; CM-NEXT:    -65536(nan), 65535(9.183409e-41)
3981; CM-NEXT:     OR_INT * T12.X, PV.Z, PV.W,
3982; CM-NEXT:     MOV T4.X, PV.X,
3983; CM-NEXT:     MOV * T0.Y, T6.X,
3984; CM-NEXT:    ALU clause starting at 154:
3985; CM-NEXT:     AND_INT T0.Z, T0.Y, literal.x,
3986; CM-NEXT:     AND_INT * T0.W, T13.X, literal.y,
3987; CM-NEXT:    -65536(nan), 65535(9.183409e-41)
3988; CM-NEXT:     OR_INT * T11.Z, PV.Z, PV.W,
3989; CM-NEXT:     MOV T6.X, PV.Z,
3990; CM-NEXT:     MOV * T0.Y, T8.X,
3991; CM-NEXT:    ALU clause starting at 160:
3992; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
3993; CM-NEXT:    16(2.242078e-44), 0(0.000000e+00)
3994; CM-NEXT:     LSHR * T13.X, PV.W, literal.x,
3995; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
3996; CM-NEXT:     LSHR T14.X, KC0[2].Y, literal.x,
3997; CM-NEXT:     AND_INT T0.Z, T0.Y, literal.y,
3998; CM-NEXT:     AND_INT * T0.W, T11.X, literal.z,
3999; CM-NEXT:    2(2.802597e-45), -65536(nan)
4000; CM-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
4001; CM-NEXT:     OR_INT * T11.X, PV.Z, PV.W,
4002; CM-NEXT:     MOV T8.X, PV.X,
4003; CM-NEXT:     MOV * T12.W, T3.X,
4004; CM-NEXT:     MOV T12.Y, T5.X,
4005; CM-NEXT:     MOV * T11.W, T7.X, BS:VEC_120/SCL_212
4006; CM-NEXT:     MOV * T11.Y, T9.X,
4007entry:
4008  store <16 x i16> %in, ptr addrspace(1) %out
4009  ret void
4010}
4011
; Kernel-argument lowering test for a <16 x i32> argument: %in is stored to
; %out with an explicit 4-byte alignment. Per the expected output below, the
; amdgcn targets fetch the whole vector with one s_load_dwordx16 and write it
; back as four dwordx4 stores, while the r600 targets read the elements from
; KC0 constant-buffer operands (KC0[6].Y through KC0[10].X).
; The assertions are autogenerated (see the note at the top of the file);
; regenerate them with utils/update_llc_test_checks.py instead of hand-editing.
4012define amdgpu_kernel void @v16i32_arg(ptr addrspace(1) nocapture %out, <16 x i32> %in) nounwind {
4013; SI-LABEL: v16i32_arg:
4014; SI:       ; %bb.0: ; %entry
4015; SI-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0x19
4016; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
4017; SI-NEXT:    s_mov_b32 s3, 0xf000
4018; SI-NEXT:    s_mov_b32 s2, -1
4019; SI-NEXT:    s_waitcnt lgkmcnt(0)
4020; SI-NEXT:    v_mov_b32_e32 v0, s20
4021; SI-NEXT:    v_mov_b32_e32 v1, s21
4022; SI-NEXT:    v_mov_b32_e32 v2, s22
4023; SI-NEXT:    v_mov_b32_e32 v3, s23
4024; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
4025; SI-NEXT:    s_waitcnt expcnt(0)
4026; SI-NEXT:    v_mov_b32_e32 v0, s16
4027; SI-NEXT:    v_mov_b32_e32 v1, s17
4028; SI-NEXT:    v_mov_b32_e32 v2, s18
4029; SI-NEXT:    v_mov_b32_e32 v3, s19
4030; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32
4031; SI-NEXT:    s_waitcnt expcnt(0)
4032; SI-NEXT:    v_mov_b32_e32 v0, s12
4033; SI-NEXT:    v_mov_b32_e32 v1, s13
4034; SI-NEXT:    v_mov_b32_e32 v2, s14
4035; SI-NEXT:    v_mov_b32_e32 v3, s15
4036; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
4037; SI-NEXT:    s_waitcnt expcnt(0)
4038; SI-NEXT:    v_mov_b32_e32 v0, s8
4039; SI-NEXT:    v_mov_b32_e32 v1, s9
4040; SI-NEXT:    v_mov_b32_e32 v2, s10
4041; SI-NEXT:    v_mov_b32_e32 v3, s11
4042; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
4043; SI-NEXT:    s_endpgm
4044;
4045; VI-LABEL: v16i32_arg:
4046; VI:       ; %bb.0: ; %entry
4047; VI-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0x64
4048; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
4049; VI-NEXT:    s_waitcnt lgkmcnt(0)
4050; VI-NEXT:    v_mov_b32_e32 v0, s20
4051; VI-NEXT:    s_add_u32 s2, s0, 48
4052; VI-NEXT:    s_addc_u32 s3, s1, 0
4053; VI-NEXT:    v_mov_b32_e32 v5, s3
4054; VI-NEXT:    v_mov_b32_e32 v4, s2
4055; VI-NEXT:    s_add_u32 s2, s0, 32
4056; VI-NEXT:    v_mov_b32_e32 v1, s21
4057; VI-NEXT:    v_mov_b32_e32 v2, s22
4058; VI-NEXT:    v_mov_b32_e32 v3, s23
4059; VI-NEXT:    s_addc_u32 s3, s1, 0
4060; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
4061; VI-NEXT:    v_mov_b32_e32 v5, s3
4062; VI-NEXT:    v_mov_b32_e32 v4, s2
4063; VI-NEXT:    s_add_u32 s2, s0, 16
4064; VI-NEXT:    v_mov_b32_e32 v0, s16
4065; VI-NEXT:    v_mov_b32_e32 v1, s17
4066; VI-NEXT:    v_mov_b32_e32 v2, s18
4067; VI-NEXT:    v_mov_b32_e32 v3, s19
4068; VI-NEXT:    s_addc_u32 s3, s1, 0
4069; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
4070; VI-NEXT:    v_mov_b32_e32 v5, s3
4071; VI-NEXT:    v_mov_b32_e32 v0, s12
4072; VI-NEXT:    v_mov_b32_e32 v1, s13
4073; VI-NEXT:    v_mov_b32_e32 v2, s14
4074; VI-NEXT:    v_mov_b32_e32 v3, s15
4075; VI-NEXT:    v_mov_b32_e32 v4, s2
4076; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
4077; VI-NEXT:    v_mov_b32_e32 v5, s1
4078; VI-NEXT:    v_mov_b32_e32 v0, s8
4079; VI-NEXT:    v_mov_b32_e32 v1, s9
4080; VI-NEXT:    v_mov_b32_e32 v2, s10
4081; VI-NEXT:    v_mov_b32_e32 v3, s11
4082; VI-NEXT:    v_mov_b32_e32 v4, s0
4083; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
4084; VI-NEXT:    s_endpgm
4085;
4086; GFX9-LABEL: v16i32_arg:
4087; GFX9:       ; %bb.0: ; %entry
4088; GFX9-NEXT:    s_load_dwordx16 s[12:27], s[8:9], 0x40
4089; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
4090; GFX9-NEXT:    v_mov_b32_e32 v4, 0
4091; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4092; GFX9-NEXT:    v_mov_b32_e32 v0, s24
4093; GFX9-NEXT:    v_mov_b32_e32 v1, s25
4094; GFX9-NEXT:    v_mov_b32_e32 v2, s26
4095; GFX9-NEXT:    v_mov_b32_e32 v3, s27
4096; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1] offset:48
4097; GFX9-NEXT:    s_nop 0
4098; GFX9-NEXT:    v_mov_b32_e32 v0, s20
4099; GFX9-NEXT:    v_mov_b32_e32 v1, s21
4100; GFX9-NEXT:    v_mov_b32_e32 v2, s22
4101; GFX9-NEXT:    v_mov_b32_e32 v3, s23
4102; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1] offset:32
4103; GFX9-NEXT:    s_nop 0
4104; GFX9-NEXT:    v_mov_b32_e32 v0, s16
4105; GFX9-NEXT:    v_mov_b32_e32 v1, s17
4106; GFX9-NEXT:    v_mov_b32_e32 v2, s18
4107; GFX9-NEXT:    v_mov_b32_e32 v3, s19
4108; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1] offset:16
4109; GFX9-NEXT:    s_nop 0
4110; GFX9-NEXT:    v_mov_b32_e32 v0, s12
4111; GFX9-NEXT:    v_mov_b32_e32 v1, s13
4112; GFX9-NEXT:    v_mov_b32_e32 v2, s14
4113; GFX9-NEXT:    v_mov_b32_e32 v3, s15
4114; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
4115; GFX9-NEXT:    s_endpgm
4116;
4117; EG-LABEL: v16i32_arg:
4118; EG:       ; %bb.0: ; %entry
4119; EG-NEXT:    ALU 29, @6, KC0[CB0:0-32], KC1[]
4120; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T5.XYZW, T7.X, 0
4121; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T3.XYZW, T6.X, 0
4122; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T4.X, 0
4123; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T2.X, 1
4124; EG-NEXT:    CF_END
4125; EG-NEXT:    ALU clause starting at 6:
4126; EG-NEXT:     MOV * T0.W, KC0[7].X,
4127; EG-NEXT:     MOV * T0.Z, KC0[6].W,
4128; EG-NEXT:     MOV T0.Y, KC0[6].Z,
4129; EG-NEXT:     MOV * T1.W, KC0[8].X,
4130; EG-NEXT:     MOV T0.X, KC0[6].Y,
4131; EG-NEXT:     MOV * T1.Z, KC0[7].W,
4132; EG-NEXT:     LSHR T2.X, KC0[2].Y, literal.x,
4133; EG-NEXT:     MOV * T1.Y, KC0[7].Z,
4134; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
4135; EG-NEXT:     MOV * T3.W, KC0[9].X,
4136; EG-NEXT:     MOV T1.X, KC0[7].Y,
4137; EG-NEXT:     MOV * T3.Z, KC0[8].W,
4138; EG-NEXT:     ADD_INT * T2.W, KC0[2].Y, literal.x,
4139; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
4140; EG-NEXT:     LSHR T4.X, PV.W, literal.x,
4141; EG-NEXT:     MOV T3.Y, KC0[8].Z,
4142; EG-NEXT:     MOV * T5.W, KC0[10].X,
4143; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
4144; EG-NEXT:     MOV T3.X, KC0[8].Y,
4145; EG-NEXT:     MOV * T5.Z, KC0[9].W,
4146; EG-NEXT:     ADD_INT * T2.W, KC0[2].Y, literal.x,
4147; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
4148; EG-NEXT:     LSHR T6.X, PV.W, literal.x,
4149; EG-NEXT:     MOV T5.Y, KC0[9].Z,
4150; EG-NEXT:     MOV * T5.X, KC0[9].Y,
4151; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
4152; EG-NEXT:     ADD_INT * T2.W, KC0[2].Y, literal.x,
4153; EG-NEXT:    48(6.726233e-44), 0(0.000000e+00)
4154; EG-NEXT:     LSHR * T7.X, PV.W, literal.x,
4155; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
4156;
4157; CM-LABEL: v16i32_arg:
4158; CM:       ; %bb.0: ; %entry
4159; CM-NEXT:    ALU 28, @6, KC0[CB0:0-32], KC1[]
4160; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T4, T7.X
4161; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T1, T6.X
4162; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T2, T5.X
4163; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T0, T3.X
4164; CM-NEXT:    CF_END
4165; CM-NEXT:    ALU clause starting at 6:
4166; CM-NEXT:     MOV * T0.W, KC0[10].X,
4167; CM-NEXT:     MOV * T0.Z, KC0[9].W,
4168; CM-NEXT:     MOV * T0.Y, KC0[9].Z,
4169; CM-NEXT:     MOV T0.X, KC0[9].Y,
4170; CM-NEXT:     ADD_INT T1.Z, KC0[2].Y, literal.x,
4171; CM-NEXT:     MOV * T2.W, KC0[9].X,
4172; CM-NEXT:    48(6.726233e-44), 0(0.000000e+00)
4173; CM-NEXT:     MOV T2.Z, KC0[8].W,
4174; CM-NEXT:     MOV * T1.W, KC0[8].X,
4175; CM-NEXT:     LSHR T3.X, T1.Z, literal.x,
4176; CM-NEXT:     MOV T2.Y, KC0[8].Z,
4177; CM-NEXT:     MOV * T1.Z, KC0[7].W,
4178; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
4179; CM-NEXT:     MOV T2.X, KC0[8].Y,
4180; CM-NEXT:     MOV * T1.Y, KC0[7].Z,
4181; CM-NEXT:     MOV T1.X, KC0[7].Y,
4182; CM-NEXT:     ADD_INT T3.Z, KC0[2].Y, literal.x,
4183; CM-NEXT:     MOV * T4.W, KC0[7].X,
4184; CM-NEXT:    32(4.484155e-44), 0(0.000000e+00)
4185; CM-NEXT:     LSHR T5.X, PV.Z, literal.x,
4186; CM-NEXT:     MOV T4.Z, KC0[6].W,
4187; CM-NEXT:     ADD_INT * T3.W, KC0[2].Y, literal.y,
4188; CM-NEXT:    2(2.802597e-45), 16(2.242078e-44)
4189; CM-NEXT:     LSHR T6.X, PV.W, literal.x,
4190; CM-NEXT:     MOV * T4.Y, KC0[6].Z,
4191; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
4192; CM-NEXT:     MOV * T4.X, KC0[6].Y,
4193; CM-NEXT:     LSHR * T7.X, KC0[2].Y, literal.x,
4194; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
4195entry:
4196  store <16 x i32> %in, ptr addrspace(1) %out, align 4
4197  ret void
4198}
4199
; Kernel-argument lowering test for a <16 x float> argument: %in is stored to
; %out with an explicit 4-byte alignment. Per the expected output below, the
; amdgcn targets fetch the whole vector with one s_load_dwordx16 and write it
; back as four dwordx4 stores, while the r600 targets read the elements from
; KC0 constant-buffer operands (KC0[6].Y through KC0[10].X).
; The assertions are autogenerated (see the note at the top of the file);
; regenerate them with utils/update_llc_test_checks.py instead of hand-editing.
4200define amdgpu_kernel void @v16f32_arg(ptr addrspace(1) nocapture %out, <16 x float> %in) nounwind {
4201; SI-LABEL: v16f32_arg:
4202; SI:       ; %bb.0: ; %entry
4203; SI-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0x19
4204; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
4205; SI-NEXT:    s_mov_b32 s3, 0xf000
4206; SI-NEXT:    s_mov_b32 s2, -1
4207; SI-NEXT:    s_waitcnt lgkmcnt(0)
4208; SI-NEXT:    v_mov_b32_e32 v0, s20
4209; SI-NEXT:    v_mov_b32_e32 v1, s21
4210; SI-NEXT:    v_mov_b32_e32 v2, s22
4211; SI-NEXT:    v_mov_b32_e32 v3, s23
4212; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
4213; SI-NEXT:    s_waitcnt expcnt(0)
4214; SI-NEXT:    v_mov_b32_e32 v0, s16
4215; SI-NEXT:    v_mov_b32_e32 v1, s17
4216; SI-NEXT:    v_mov_b32_e32 v2, s18
4217; SI-NEXT:    v_mov_b32_e32 v3, s19
4218; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32
4219; SI-NEXT:    s_waitcnt expcnt(0)
4220; SI-NEXT:    v_mov_b32_e32 v0, s12
4221; SI-NEXT:    v_mov_b32_e32 v1, s13
4222; SI-NEXT:    v_mov_b32_e32 v2, s14
4223; SI-NEXT:    v_mov_b32_e32 v3, s15
4224; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
4225; SI-NEXT:    s_waitcnt expcnt(0)
4226; SI-NEXT:    v_mov_b32_e32 v0, s8
4227; SI-NEXT:    v_mov_b32_e32 v1, s9
4228; SI-NEXT:    v_mov_b32_e32 v2, s10
4229; SI-NEXT:    v_mov_b32_e32 v3, s11
4230; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
4231; SI-NEXT:    s_endpgm
4232;
4233; VI-LABEL: v16f32_arg:
4234; VI:       ; %bb.0: ; %entry
4235; VI-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0x64
4236; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
4237; VI-NEXT:    s_waitcnt lgkmcnt(0)
4238; VI-NEXT:    v_mov_b32_e32 v0, s20
4239; VI-NEXT:    s_add_u32 s2, s0, 48
4240; VI-NEXT:    s_addc_u32 s3, s1, 0
4241; VI-NEXT:    v_mov_b32_e32 v5, s3
4242; VI-NEXT:    v_mov_b32_e32 v4, s2
4243; VI-NEXT:    s_add_u32 s2, s0, 32
4244; VI-NEXT:    v_mov_b32_e32 v1, s21
4245; VI-NEXT:    v_mov_b32_e32 v2, s22
4246; VI-NEXT:    v_mov_b32_e32 v3, s23
4247; VI-NEXT:    s_addc_u32 s3, s1, 0
4248; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
4249; VI-NEXT:    v_mov_b32_e32 v5, s3
4250; VI-NEXT:    v_mov_b32_e32 v4, s2
4251; VI-NEXT:    s_add_u32 s2, s0, 16
4252; VI-NEXT:    v_mov_b32_e32 v0, s16
4253; VI-NEXT:    v_mov_b32_e32 v1, s17
4254; VI-NEXT:    v_mov_b32_e32 v2, s18
4255; VI-NEXT:    v_mov_b32_e32 v3, s19
4256; VI-NEXT:    s_addc_u32 s3, s1, 0
4257; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
4258; VI-NEXT:    v_mov_b32_e32 v5, s3
4259; VI-NEXT:    v_mov_b32_e32 v0, s12
4260; VI-NEXT:    v_mov_b32_e32 v1, s13
4261; VI-NEXT:    v_mov_b32_e32 v2, s14
4262; VI-NEXT:    v_mov_b32_e32 v3, s15
4263; VI-NEXT:    v_mov_b32_e32 v4, s2
4264; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
4265; VI-NEXT:    v_mov_b32_e32 v5, s1
4266; VI-NEXT:    v_mov_b32_e32 v0, s8
4267; VI-NEXT:    v_mov_b32_e32 v1, s9
4268; VI-NEXT:    v_mov_b32_e32 v2, s10
4269; VI-NEXT:    v_mov_b32_e32 v3, s11
4270; VI-NEXT:    v_mov_b32_e32 v4, s0
4271; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
4272; VI-NEXT:    s_endpgm
4273;
4274; GFX9-LABEL: v16f32_arg:
4275; GFX9:       ; %bb.0: ; %entry
4276; GFX9-NEXT:    s_load_dwordx16 s[12:27], s[8:9], 0x40
4277; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
4278; GFX9-NEXT:    v_mov_b32_e32 v4, 0
4279; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4280; GFX9-NEXT:    v_mov_b32_e32 v0, s24
4281; GFX9-NEXT:    v_mov_b32_e32 v1, s25
4282; GFX9-NEXT:    v_mov_b32_e32 v2, s26
4283; GFX9-NEXT:    v_mov_b32_e32 v3, s27
4284; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1] offset:48
4285; GFX9-NEXT:    s_nop 0
4286; GFX9-NEXT:    v_mov_b32_e32 v0, s20
4287; GFX9-NEXT:    v_mov_b32_e32 v1, s21
4288; GFX9-NEXT:    v_mov_b32_e32 v2, s22
4289; GFX9-NEXT:    v_mov_b32_e32 v3, s23
4290; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1] offset:32
4291; GFX9-NEXT:    s_nop 0
4292; GFX9-NEXT:    v_mov_b32_e32 v0, s16
4293; GFX9-NEXT:    v_mov_b32_e32 v1, s17
4294; GFX9-NEXT:    v_mov_b32_e32 v2, s18
4295; GFX9-NEXT:    v_mov_b32_e32 v3, s19
4296; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1] offset:16
4297; GFX9-NEXT:    s_nop 0
4298; GFX9-NEXT:    v_mov_b32_e32 v0, s12
4299; GFX9-NEXT:    v_mov_b32_e32 v1, s13
4300; GFX9-NEXT:    v_mov_b32_e32 v2, s14
4301; GFX9-NEXT:    v_mov_b32_e32 v3, s15
4302; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
4303; GFX9-NEXT:    s_endpgm
4304;
4305; EG-LABEL: v16f32_arg:
4306; EG:       ; %bb.0: ; %entry
4307; EG-NEXT:    ALU 29, @6, KC0[CB0:0-32], KC1[]
4308; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T5.XYZW, T7.X, 0
4309; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T3.XYZW, T6.X, 0
4310; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T4.X, 0
4311; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T2.X, 1
4312; EG-NEXT:    CF_END
4313; EG-NEXT:    ALU clause starting at 6:
4314; EG-NEXT:     MOV * T0.W, KC0[7].X,
4315; EG-NEXT:     MOV * T0.Z, KC0[6].W,
4316; EG-NEXT:     MOV T0.Y, KC0[6].Z,
4317; EG-NEXT:     MOV * T1.W, KC0[8].X,
4318; EG-NEXT:     MOV T0.X, KC0[6].Y,
4319; EG-NEXT:     MOV * T1.Z, KC0[7].W,
4320; EG-NEXT:     LSHR T2.X, KC0[2].Y, literal.x,
4321; EG-NEXT:     MOV * T1.Y, KC0[7].Z,
4322; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
4323; EG-NEXT:     MOV * T3.W, KC0[9].X,
4324; EG-NEXT:     MOV T1.X, KC0[7].Y,
4325; EG-NEXT:     MOV * T3.Z, KC0[8].W,
4326; EG-NEXT:     ADD_INT * T2.W, KC0[2].Y, literal.x,
4327; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
4328; EG-NEXT:     LSHR T4.X, PV.W, literal.x,
4329; EG-NEXT:     MOV T3.Y, KC0[8].Z,
4330; EG-NEXT:     MOV * T5.W, KC0[10].X,
4331; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
4332; EG-NEXT:     MOV T3.X, KC0[8].Y,
4333; EG-NEXT:     MOV * T5.Z, KC0[9].W,
4334; EG-NEXT:     ADD_INT * T2.W, KC0[2].Y, literal.x,
4335; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
4336; EG-NEXT:     LSHR T6.X, PV.W, literal.x,
4337; EG-NEXT:     MOV T5.Y, KC0[9].Z,
4338; EG-NEXT:     MOV * T5.X, KC0[9].Y,
4339; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
4340; EG-NEXT:     ADD_INT * T2.W, KC0[2].Y, literal.x,
4341; EG-NEXT:    48(6.726233e-44), 0(0.000000e+00)
4342; EG-NEXT:     LSHR * T7.X, PV.W, literal.x,
4343; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
4344;
4345; CM-LABEL: v16f32_arg:
4346; CM:       ; %bb.0: ; %entry
4347; CM-NEXT:    ALU 28, @6, KC0[CB0:0-32], KC1[]
4348; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T4, T7.X
4349; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T1, T6.X
4350; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T2, T5.X
4351; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T0, T3.X
4352; CM-NEXT:    CF_END
4353; CM-NEXT:    ALU clause starting at 6:
4354; CM-NEXT:     MOV * T0.W, KC0[10].X,
4355; CM-NEXT:     MOV * T0.Z, KC0[9].W,
4356; CM-NEXT:     MOV * T0.Y, KC0[9].Z,
4357; CM-NEXT:     MOV T0.X, KC0[9].Y,
4358; CM-NEXT:     ADD_INT T1.Z, KC0[2].Y, literal.x,
4359; CM-NEXT:     MOV * T2.W, KC0[9].X,
4360; CM-NEXT:    48(6.726233e-44), 0(0.000000e+00)
4361; CM-NEXT:     MOV T2.Z, KC0[8].W,
4362; CM-NEXT:     MOV * T1.W, KC0[8].X,
4363; CM-NEXT:     LSHR T3.X, T1.Z, literal.x,
4364; CM-NEXT:     MOV T2.Y, KC0[8].Z,
4365; CM-NEXT:     MOV * T1.Z, KC0[7].W,
4366; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
4367; CM-NEXT:     MOV T2.X, KC0[8].Y,
4368; CM-NEXT:     MOV * T1.Y, KC0[7].Z,
4369; CM-NEXT:     MOV T1.X, KC0[7].Y,
4370; CM-NEXT:     ADD_INT T3.Z, KC0[2].Y, literal.x,
4371; CM-NEXT:     MOV * T4.W, KC0[7].X,
4372; CM-NEXT:    32(4.484155e-44), 0(0.000000e+00)
4373; CM-NEXT:     LSHR T5.X, PV.Z, literal.x,
4374; CM-NEXT:     MOV T4.Z, KC0[6].W,
4375; CM-NEXT:     ADD_INT * T3.W, KC0[2].Y, literal.y,
4376; CM-NEXT:    2(2.802597e-45), 16(2.242078e-44)
4377; CM-NEXT:     LSHR T6.X, PV.W, literal.x,
4378; CM-NEXT:     MOV * T4.Y, KC0[6].Z,
4379; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
4380; CM-NEXT:     MOV * T4.X, KC0[6].Y,
4381; CM-NEXT:     LSHR * T7.X, KC0[2].Y, literal.x,
4382; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
4383entry:
4384  store <16 x float> %in, ptr addrspace(1) %out, align 4
4385  ret void
4386}
4387
; Kernel-argument lowering test for a scalar i64: %a is stored to %out with
; 8-byte alignment. In the expected amdgcn output a single s_load_dwordx4
; supplies both the %out pointer (s[0:1]) and the 64-bit value (s[2:3]), which
; is then written with one dwordx2 store; the r600 output takes the value from
; KC0[2].W and KC0[3].X.
; The assertions are autogenerated (see the note at the top of the file);
; regenerate them with utils/update_llc_test_checks.py instead of hand-editing.
4388define amdgpu_kernel void @kernel_arg_i64(ptr addrspace(1) %out, i64 %a) nounwind {
4389; SI-LABEL: kernel_arg_i64:
4390; SI:       ; %bb.0:
4391; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
4392; SI-NEXT:    s_mov_b32 s7, 0xf000
4393; SI-NEXT:    s_mov_b32 s6, -1
4394; SI-NEXT:    s_waitcnt lgkmcnt(0)
4395; SI-NEXT:    s_mov_b32 s4, s0
4396; SI-NEXT:    s_mov_b32 s5, s1
4397; SI-NEXT:    v_mov_b32_e32 v0, s2
4398; SI-NEXT:    v_mov_b32_e32 v1, s3
4399; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
4400; SI-NEXT:    s_endpgm
4401;
4402; VI-LABEL: kernel_arg_i64:
4403; VI:       ; %bb.0:
4404; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
4405; VI-NEXT:    s_waitcnt lgkmcnt(0)
4406; VI-NEXT:    v_mov_b32_e32 v0, s0
4407; VI-NEXT:    v_mov_b32_e32 v1, s1
4408; VI-NEXT:    v_mov_b32_e32 v2, s2
4409; VI-NEXT:    v_mov_b32_e32 v3, s3
4410; VI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
4411; VI-NEXT:    s_endpgm
4412;
4413; GFX9-LABEL: kernel_arg_i64:
4414; GFX9:       ; %bb.0:
4415; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
4416; GFX9-NEXT:    v_mov_b32_e32 v2, 0
4417; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4418; GFX9-NEXT:    v_mov_b32_e32 v0, s2
4419; GFX9-NEXT:    v_mov_b32_e32 v1, s3
4420; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
4421; GFX9-NEXT:    s_endpgm
4422;
4423; EG-LABEL: kernel_arg_i64:
4424; EG:       ; %bb.0:
4425; EG-NEXT:    ALU 3, @4, KC0[CB0:0-32], KC1[]
4426; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
4427; EG-NEXT:    CF_END
4428; EG-NEXT:    PAD
4429; EG-NEXT:    ALU clause starting at 4:
4430; EG-NEXT:     MOV * T0.Y, KC0[3].X,
4431; EG-NEXT:     MOV T0.X, KC0[2].W,
4432; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
4433; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
4434;
4435; CM-LABEL: kernel_arg_i64:
4436; CM:       ; %bb.0:
4437; CM-NEXT:    ALU 3, @4, KC0[CB0:0-32], KC1[]
4438; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T0, T1.X
4439; CM-NEXT:    CF_END
4440; CM-NEXT:    PAD
4441; CM-NEXT:    ALU clause starting at 4:
4442; CM-NEXT:     MOV * T0.Y, KC0[3].X,
4443; CM-NEXT:     MOV * T0.X, KC0[2].W,
4444; CM-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
4445; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
4446  store i64 %a, ptr addrspace(1) %out, align 8
4447  ret void
4448}
4449
; Kernel-argument lowering test for a scalar double: %in is stored to %out
; (no explicit alignment on the store). In the expected amdgcn output a single
; s_load_dwordx4 supplies both the %out pointer (s[0:1]) and the 64-bit value
; (s[2:3]), which is then written with one dwordx2 store; the r600 output
; takes the value from KC0[2].W and KC0[3].X.
; The assertions are autogenerated (see the note at the top of the file);
; regenerate them with utils/update_llc_test_checks.py instead of hand-editing.
4450define amdgpu_kernel void @f64_kernel_arg(ptr addrspace(1) %out, double  %in) {
4451; SI-LABEL: f64_kernel_arg:
4452; SI:       ; %bb.0: ; %entry
4453; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
4454; SI-NEXT:    s_mov_b32 s7, 0xf000
4455; SI-NEXT:    s_mov_b32 s6, -1
4456; SI-NEXT:    s_waitcnt lgkmcnt(0)
4457; SI-NEXT:    s_mov_b32 s4, s0
4458; SI-NEXT:    s_mov_b32 s5, s1
4459; SI-NEXT:    v_mov_b32_e32 v0, s2
4460; SI-NEXT:    v_mov_b32_e32 v1, s3
4461; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
4462; SI-NEXT:    s_endpgm
4463;
4464; VI-LABEL: f64_kernel_arg:
4465; VI:       ; %bb.0: ; %entry
4466; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
4467; VI-NEXT:    s_waitcnt lgkmcnt(0)
4468; VI-NEXT:    v_mov_b32_e32 v0, s0
4469; VI-NEXT:    v_mov_b32_e32 v1, s1
4470; VI-NEXT:    v_mov_b32_e32 v2, s2
4471; VI-NEXT:    v_mov_b32_e32 v3, s3
4472; VI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
4473; VI-NEXT:    s_endpgm
4474;
4475; GFX9-LABEL: f64_kernel_arg:
4476; GFX9:       ; %bb.0: ; %entry
4477; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
4478; GFX9-NEXT:    v_mov_b32_e32 v2, 0
4479; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4480; GFX9-NEXT:    v_mov_b32_e32 v0, s2
4481; GFX9-NEXT:    v_mov_b32_e32 v1, s3
4482; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
4483; GFX9-NEXT:    s_endpgm
4484;
4485; EG-LABEL: f64_kernel_arg:
4486; EG:       ; %bb.0: ; %entry
4487; EG-NEXT:    ALU 3, @4, KC0[CB0:0-32], KC1[]
4488; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
4489; EG-NEXT:    CF_END
4490; EG-NEXT:    PAD
4491; EG-NEXT:    ALU clause starting at 4:
4492; EG-NEXT:     MOV * T0.Y, KC0[3].X,
4493; EG-NEXT:     MOV T0.X, KC0[2].W,
4494; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
4495; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
4496;
4497; CM-LABEL: f64_kernel_arg:
4498; CM:       ; %bb.0: ; %entry
4499; CM-NEXT:    ALU 3, @4, KC0[CB0:0-32], KC1[]
4500; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T0, T1.X
4501; CM-NEXT:    CF_END
4502; CM-NEXT:    PAD
4503; CM-NEXT:    ALU clause starting at 4:
4504; CM-NEXT:     MOV * T0.Y, KC0[3].X,
4505; CM-NEXT:     MOV * T0.X, KC0[2].W,
4506; CM-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
4507; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
4508entry:
4509  store double %in, ptr addrspace(1) %out
4510  ret void
4511}
4512
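; The kernel_arg_v1i64 test below is disabled: its check prefixes are
; X-prefixed (XFUNC, XGCN), which none of this file's RUN lines enable, and
; its IR is commented out.
4513; XFUNC-LABEL: {{^}}kernel_arg_v1i64: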
4514; XGCN: s_load_dwordx2
4515; XGCN: s_load_dwordx2
4516; XGCN: buffer_store_dwordx2
4517; define amdgpu_kernel void @kernel_arg_v1i64(ptr addrspace(1) %out, <1 x i64> %a) nounwind {
4518;   store <1 x i64> %a, ptr addrspace(1) %out, align 8
4519;   ret void
4520; }
4521
; Kernel-argument lowering test for a non-power-of-two integer (i65): %in is
; stored to %out with 4-byte alignment. In the expected amdgcn output the low
; 64 bits arrive through the same s_load_dwordx4 as the pointer and are stored
; as a dwordx2, while the dword holding bit 64 is loaded separately, masked
; with 1 and stored as a single byte at offset 8. The r600 output writes that
; byte with a MEM_RAT MSKOR store built from a shifted 255 mask.
; The assertions are autogenerated (see the note at the top of the file);
; regenerate them with utils/update_llc_test_checks.py instead of hand-editing.
4522define amdgpu_kernel void @i65_arg(ptr addrspace(1) nocapture %out, i65 %in) nounwind {
4523; SI-LABEL: i65_arg:
4524; SI:       ; %bb.0: ; %entry
4525; SI-NEXT:    s_load_dword s6, s[4:5], 0xd
4526; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
4527; SI-NEXT:    s_mov_b32 s7, 0xf000
4528; SI-NEXT:    s_waitcnt lgkmcnt(0)
4529; SI-NEXT:    s_and_b32 s8, s6, 1
4530; SI-NEXT:    s_mov_b32 s6, -1
4531; SI-NEXT:    s_mov_b32 s4, s0
4532; SI-NEXT:    s_mov_b32 s5, s1
4533; SI-NEXT:    v_mov_b32_e32 v0, s2
4534; SI-NEXT:    v_mov_b32_e32 v1, s3
4535; SI-NEXT:    v_mov_b32_e32 v2, s8
4536; SI-NEXT:    buffer_store_byte v2, off, s[4:7], 0 offset:8
4537; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
4538; SI-NEXT:    s_endpgm
4539;
4540; VI-LABEL: i65_arg:
4541; VI:       ; %bb.0: ; %entry
4542; VI-NEXT:    s_load_dword s6, s[4:5], 0x34
4543; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
4544; VI-NEXT:    s_waitcnt lgkmcnt(0)
4545; VI-NEXT:    s_and_b32 s4, s6, 1
4546; VI-NEXT:    v_mov_b32_e32 v0, s0
4547; VI-NEXT:    v_mov_b32_e32 v1, s1
4548; VI-NEXT:    s_add_u32 s0, s0, 8
4549; VI-NEXT:    s_addc_u32 s1, s1, 0
4550; VI-NEXT:    v_mov_b32_e32 v5, s1
4551; VI-NEXT:    v_mov_b32_e32 v2, s2
4552; VI-NEXT:    v_mov_b32_e32 v6, s4
4553; VI-NEXT:    v_mov_b32_e32 v4, s0
4554; VI-NEXT:    v_mov_b32_e32 v3, s3
4555; VI-NEXT:    flat_store_byte v[4:5], v6
4556; VI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
4557; VI-NEXT:    s_endpgm
4558;
4559; GFX9-LABEL: i65_arg:
4560; GFX9:       ; %bb.0: ; %entry
4561; GFX9-NEXT:    s_load_dword s4, s[8:9], 0x10
4562; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
4563; GFX9-NEXT:    v_mov_b32_e32 v2, 0
4564; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4565; GFX9-NEXT:    s_and_b32 s4, s4, 1
4566; GFX9-NEXT:    v_mov_b32_e32 v0, s2
4567; GFX9-NEXT:    v_mov_b32_e32 v3, s4
4568; GFX9-NEXT:    v_mov_b32_e32 v1, s3
4569; GFX9-NEXT:    global_store_byte v2, v3, s[0:1] offset:8
4570; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
4571; GFX9-NEXT:    s_endpgm
4572;
4573; EG-LABEL: i65_arg:
4574; EG:       ; %bb.0: ; %entry
4575; EG-NEXT:    ALU 20, @6, KC0[CB0:0-32], KC1[]
4576; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T5.X, T4.X, 0
4577; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T3.X, T2.X, 0
4578; EG-NEXT:    MEM_RAT MSKOR T1.XW, T0.X
4579; EG-NEXT:    CF_END
4580; EG-NEXT:    PAD
4581; EG-NEXT:    ALU clause starting at 6:
4582; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
4583; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
4584; EG-NEXT:     AND_INT * T1.W, PV.W, literal.x,
4585; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
4586; EG-NEXT:     LSHL T1.W, PV.W, literal.x,
4587; EG-NEXT:     AND_INT * T2.W, KC0[3].Y, 1,
4588; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
4589; EG-NEXT:     LSHL T1.X, PS, PV.W,
4590; EG-NEXT:     LSHL * T1.W, literal.x, PV.W,
4591; EG-NEXT:    255(3.573311e-43), 0(0.000000e+00)
4592; EG-NEXT:     MOV T1.Y, 0.0,
4593; EG-NEXT:     MOV * T1.Z, 0.0,
4594; EG-NEXT:     LSHR T0.X, T0.W, literal.x,
4595; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
4596; EG-NEXT:    2(2.802597e-45), 4(5.605194e-45)
4597; EG-NEXT:     LSHR T2.X, PV.W, literal.x,
4598; EG-NEXT:     MOV * T3.X, KC0[3].X,
4599; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
4600; EG-NEXT:     LSHR T4.X, KC0[2].Y, literal.x,
4601; EG-NEXT:     MOV * T5.X, KC0[2].W,
4602; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
4603;
4604; CM-LABEL: i65_arg:
4605; CM:       ; %bb.0: ; %entry
4606; CM-NEXT:    ALU 21, @6, KC0[CB0:0-32], KC1[]
4607; CM-NEXT:    MEM_RAT MSKOR T1.XW, T5.X
4608; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T4.X, T3.X
4609; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T2.X, T0.X
4610; CM-NEXT:    CF_END
4611; CM-NEXT:    PAD
4612; CM-NEXT:    ALU clause starting at 6:
4613; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
4614; CM-NEXT:    8(1.121039e-44), 0(0.000000e+00)
4615; CM-NEXT:     AND_INT * T1.W, PV.W, literal.x,
4616; CM-NEXT:    3(4.203895e-45), 0(0.000000e+00)
4617; CM-NEXT:     LSHL T0.Z, PV.W, literal.x,
4618; CM-NEXT:     AND_INT * T1.W, KC0[3].Y, 1,
4619; CM-NEXT:    3(4.203895e-45), 0(0.000000e+00)
4620; CM-NEXT:     LSHL T1.X, PV.W, PV.Z,
4621; CM-NEXT:     LSHL * T1.W, literal.x, PV.Z,
4622; CM-NEXT:    255(3.573311e-43), 0(0.000000e+00)
4623; CM-NEXT:     MOV T1.Y, 0.0,
4624; CM-NEXT:     MOV * T1.Z, 0.0,
4625; CM-NEXT:     LSHR * T0.X, KC0[2].Y, literal.x,
4626; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
4627; CM-NEXT:     MOV T2.X, KC0[2].W,
4628; CM-NEXT:     ADD_INT * T2.W, KC0[2].Y, literal.x,
4629; CM-NEXT:    4(5.605194e-45), 0(0.000000e+00)
4630; CM-NEXT:     LSHR * T3.X, PV.W, literal.x,
4631; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
4632; CM-NEXT:     MOV * T4.X, KC0[3].X,
4633; CM-NEXT:     LSHR * T5.X, T0.W, literal.x,
4634; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
4635entry:
4636  store i65 %in, ptr addrspace(1) %out, align 4
4637  ret void
4638}
4639
; Kernel taking a single i1 argument and storing it unchanged to %out (align 1).
; The amdgcn expectations below load the argument dword, mask it with 1, and
; emit a byte store; the r600 expectations fetch it with VTX_READ_8 and write
; it through a MEM_RAT MSKOR.
4640define amdgpu_kernel void @i1_arg(ptr addrspace(1) %out, i1 %x) nounwind {
4641; SI-LABEL: i1_arg:
4642; SI:       ; %bb.0:
4643; SI-NEXT:    s_load_dword s2, s[4:5], 0xb
4644; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
4645; SI-NEXT:    s_mov_b32 s3, 0xf000
4646; SI-NEXT:    s_waitcnt lgkmcnt(0)
4647; SI-NEXT:    s_and_b32 s4, s2, 1
4648; SI-NEXT:    s_mov_b32 s2, -1
4649; SI-NEXT:    v_mov_b32_e32 v0, s4
4650; SI-NEXT:    buffer_store_byte v0, off, s[0:3], 0
4651; SI-NEXT:    s_endpgm
4652;
4653; VI-LABEL: i1_arg:
4654; VI:       ; %bb.0:
4655; VI-NEXT:    s_load_dword s2, s[4:5], 0x2c
4656; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
4657; VI-NEXT:    s_waitcnt lgkmcnt(0)
4658; VI-NEXT:    s_and_b32 s2, s2, 1
4659; VI-NEXT:    v_mov_b32_e32 v0, s0
4660; VI-NEXT:    v_mov_b32_e32 v1, s1
4661; VI-NEXT:    v_mov_b32_e32 v2, s2
4662; VI-NEXT:    flat_store_byte v[0:1], v2
4663; VI-NEXT:    s_endpgm
4664;
4665; GFX9-LABEL: i1_arg:
4666; GFX9:       ; %bb.0:
4667; GFX9-NEXT:    s_load_dword s2, s[8:9], 0x8
4668; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
4669; GFX9-NEXT:    v_mov_b32_e32 v0, 0
4670; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4671; GFX9-NEXT:    s_and_b32 s2, s2, 1
4672; GFX9-NEXT:    v_mov_b32_e32 v1, s2
4673; GFX9-NEXT:    global_store_byte v0, v1, s[0:1]
4674; GFX9-NEXT:    s_endpgm
4675;
4676; EG-LABEL: i1_arg:
4677; EG:       ; %bb.0:
4678; EG-NEXT:    ALU 0, @8, KC0[], KC1[]
4679; EG-NEXT:    TEX 0 @6
4680; EG-NEXT:    ALU 11, @9, KC0[CB0:0-32], KC1[]
4681; EG-NEXT:    MEM_RAT MSKOR T0.XW, T1.X
4682; EG-NEXT:    CF_END
4683; EG-NEXT:    PAD
4684; EG-NEXT:    Fetch clause starting at 6:
4685; EG-NEXT:     VTX_READ_8 T0.X, T0.X, 40, #3
4686; EG-NEXT:    ALU clause starting at 8:
4687; EG-NEXT:     MOV * T0.X, 0.0,
4688; EG-NEXT:    ALU clause starting at 9:
4689; EG-NEXT:     AND_INT T0.W, KC0[2].Y, literal.x,
4690; EG-NEXT:     AND_INT * T1.W, T0.X, 1,
4691; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
4692; EG-NEXT:     LSHL * T0.W, PV.W, literal.x,
4693; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
4694; EG-NEXT:     LSHL T0.X, T1.W, PV.W,
4695; EG-NEXT:     LSHL * T0.W, literal.x, PV.W,
4696; EG-NEXT:    255(3.573311e-43), 0(0.000000e+00)
4697; EG-NEXT:     MOV T0.Y, 0.0,
4698; EG-NEXT:     MOV * T0.Z, 0.0,
4699; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
4700; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
4701;
4702; CM-LABEL: i1_arg:
4703; CM:       ; %bb.0:
4704; CM-NEXT:    ALU 0, @8, KC0[], KC1[]
4705; CM-NEXT:    TEX 0 @6
4706; CM-NEXT:    ALU 11, @9, KC0[CB0:0-32], KC1[]
4707; CM-NEXT:    MEM_RAT MSKOR T0.XW, T1.X
4708; CM-NEXT:    CF_END
4709; CM-NEXT:    PAD
4710; CM-NEXT:    Fetch clause starting at 6:
4711; CM-NEXT:     VTX_READ_8 T0.X, T0.X, 40, #3
4712; CM-NEXT:    ALU clause starting at 8:
4713; CM-NEXT:     MOV * T0.X, 0.0,
4714; CM-NEXT:    ALU clause starting at 9:
4715; CM-NEXT:     AND_INT * T0.W, KC0[2].Y, literal.x,
4716; CM-NEXT:    3(4.203895e-45), 0(0.000000e+00)
4717; CM-NEXT:     AND_INT T0.Z, T0.X, 1,
4718; CM-NEXT:     LSHL * T0.W, PV.W, literal.x,
4719; CM-NEXT:    3(4.203895e-45), 0(0.000000e+00)
4720; CM-NEXT:     LSHL T0.X, PV.Z, PV.W,
4721; CM-NEXT:     LSHL * T0.W, literal.x, PV.W,
4722; CM-NEXT:    255(3.573311e-43), 0(0.000000e+00)
4723; CM-NEXT:     MOV T0.Y, 0.0,
4724; CM-NEXT:     MOV * T0.Z, 0.0,
4725; CM-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
4726; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
4727  store i1 %x, ptr addrspace(1) %out, align 1
4728  ret void
4729}
4730
; Kernel that zero-extends its i1 argument to i32 and stores the result to %out
; (align 4). The amdgcn expectations mask the loaded argument dword with 1 and
; emit a dword store; the r600 expectations fetch the argument with VTX_READ_8
; and store the fetched register directly.
4731define amdgpu_kernel void @i1_arg_zext_i32(ptr addrspace(1) %out, i1 %x) nounwind {
4732; SI-LABEL: i1_arg_zext_i32:
4733; SI:       ; %bb.0:
4734; SI-NEXT:    s_load_dword s2, s[4:5], 0xb
4735; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
4736; SI-NEXT:    s_mov_b32 s3, 0xf000
4737; SI-NEXT:    s_waitcnt lgkmcnt(0)
4738; SI-NEXT:    s_and_b32 s4, s2, 1
4739; SI-NEXT:    s_mov_b32 s2, -1
4740; SI-NEXT:    v_mov_b32_e32 v0, s4
4741; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
4742; SI-NEXT:    s_endpgm
4743;
4744; VI-LABEL: i1_arg_zext_i32:
4745; VI:       ; %bb.0:
4746; VI-NEXT:    s_load_dword s2, s[4:5], 0x2c
4747; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
4748; VI-NEXT:    s_waitcnt lgkmcnt(0)
4749; VI-NEXT:    s_and_b32 s2, s2, 1
4750; VI-NEXT:    v_mov_b32_e32 v0, s0
4751; VI-NEXT:    v_mov_b32_e32 v1, s1
4752; VI-NEXT:    v_mov_b32_e32 v2, s2
4753; VI-NEXT:    flat_store_dword v[0:1], v2
4754; VI-NEXT:    s_endpgm
4755;
4756; GFX9-LABEL: i1_arg_zext_i32:
4757; GFX9:       ; %bb.0:
4758; GFX9-NEXT:    s_load_dword s2, s[8:9], 0x8
4759; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
4760; GFX9-NEXT:    v_mov_b32_e32 v0, 0
4761; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4762; GFX9-NEXT:    s_and_b32 s2, s2, 1
4763; GFX9-NEXT:    v_mov_b32_e32 v1, s2
4764; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
4765; GFX9-NEXT:    s_endpgm
4766;
4767; EG-LABEL: i1_arg_zext_i32:
4768; EG:       ; %bb.0:
4769; EG-NEXT:    ALU 0, @8, KC0[], KC1[]
4770; EG-NEXT:    TEX 0 @6
4771; EG-NEXT:    ALU 1, @9, KC0[CB0:0-32], KC1[]
4772; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
4773; EG-NEXT:    CF_END
4774; EG-NEXT:    PAD
4775; EG-NEXT:    Fetch clause starting at 6:
4776; EG-NEXT:     VTX_READ_8 T0.X, T0.X, 40, #3
4777; EG-NEXT:    ALU clause starting at 8:
4778; EG-NEXT:     MOV * T0.X, 0.0,
4779; EG-NEXT:    ALU clause starting at 9:
4780; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
4781; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
4782;
4783; CM-LABEL: i1_arg_zext_i32:
4784; CM:       ; %bb.0:
4785; CM-NEXT:    ALU 0, @8, KC0[], KC1[]
4786; CM-NEXT:    TEX 0 @6
4787; CM-NEXT:    ALU 1, @9, KC0[CB0:0-32], KC1[]
4788; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X
4789; CM-NEXT:    CF_END
4790; CM-NEXT:    PAD
4791; CM-NEXT:    Fetch clause starting at 6:
4792; CM-NEXT:     VTX_READ_8 T0.X, T0.X, 40, #3
4793; CM-NEXT:    ALU clause starting at 8:
4794; CM-NEXT:     MOV * T0.X, 0.0,
4795; CM-NEXT:    ALU clause starting at 9:
4796; CM-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
4797; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
4798  %ext = zext i1 %x to i32
4799  store i32 %ext, ptr addrspace(1) %out, align 4
4800  ret void
4801}
4802
; Kernel that zero-extends its i1 argument to i64 and stores the result to %out
; (align 8). The amdgcn expectations mask the loaded argument dword with 1 for
; the low half, materialize 0 for the high half, and emit a dwordx2 store; the
; r600 expectations move 0.0 into the second channel (T0.Y) before the store.
4803define amdgpu_kernel void @i1_arg_zext_i64(ptr addrspace(1) %out, i1 %x) nounwind {
4804; SI-LABEL: i1_arg_zext_i64:
4805; SI:       ; %bb.0:
4806; SI-NEXT:    s_load_dword s6, s[4:5], 0xb
4807; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
4808; SI-NEXT:    s_mov_b32 s3, 0xf000
4809; SI-NEXT:    s_mov_b32 s2, -1
4810; SI-NEXT:    s_waitcnt lgkmcnt(0)
4811; SI-NEXT:    s_and_b32 s4, s6, 1
4812; SI-NEXT:    v_mov_b32_e32 v1, 0
4813; SI-NEXT:    v_mov_b32_e32 v0, s4
4814; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
4815; SI-NEXT:    s_endpgm
4816;
4817; VI-LABEL: i1_arg_zext_i64:
4818; VI:       ; %bb.0:
4819; VI-NEXT:    s_load_dword s2, s[4:5], 0x2c
4820; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
4821; VI-NEXT:    v_mov_b32_e32 v1, 0
4822; VI-NEXT:    s_waitcnt lgkmcnt(0)
4823; VI-NEXT:    s_and_b32 s2, s2, 1
4824; VI-NEXT:    v_mov_b32_e32 v3, s1
4825; VI-NEXT:    v_mov_b32_e32 v0, s2
4826; VI-NEXT:    v_mov_b32_e32 v2, s0
4827; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
4828; VI-NEXT:    s_endpgm
4829;
4830; GFX9-LABEL: i1_arg_zext_i64:
4831; GFX9:       ; %bb.0:
4832; GFX9-NEXT:    s_load_dword s2, s[8:9], 0x8
4833; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
4834; GFX9-NEXT:    v_mov_b32_e32 v1, 0
4835; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4836; GFX9-NEXT:    s_and_b32 s2, s2, 1
4837; GFX9-NEXT:    v_mov_b32_e32 v0, s2
4838; GFX9-NEXT:    global_store_dwordx2 v1, v[0:1], s[0:1]
4839; GFX9-NEXT:    s_endpgm
4840;
4841; EG-LABEL: i1_arg_zext_i64:
4842; EG:       ; %bb.0:
4843; EG-NEXT:    ALU 0, @8, KC0[], KC1[]
4844; EG-NEXT:    TEX 0 @6
4845; EG-NEXT:    ALU 2, @9, KC0[CB0:0-32], KC1[]
4846; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
4847; EG-NEXT:    CF_END
4848; EG-NEXT:    PAD
4849; EG-NEXT:    Fetch clause starting at 6:
4850; EG-NEXT:     VTX_READ_8 T0.X, T0.X, 40, #3
4851; EG-NEXT:    ALU clause starting at 8:
4852; EG-NEXT:     MOV * T0.X, 0.0,
4853; EG-NEXT:    ALU clause starting at 9:
4854; EG-NEXT:     MOV * T0.Y, 0.0,
4855; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
4856; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
4857;
4858; CM-LABEL: i1_arg_zext_i64:
4859; CM:       ; %bb.0:
4860; CM-NEXT:    ALU 0, @8, KC0[], KC1[]
4861; CM-NEXT:    TEX 0 @6
4862; CM-NEXT:    ALU 2, @9, KC0[CB0:0-32], KC1[]
4863; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T0, T1.X
4864; CM-NEXT:    CF_END
4865; CM-NEXT:    PAD
4866; CM-NEXT:    Fetch clause starting at 6:
4867; CM-NEXT:     VTX_READ_8 T0.X, T0.X, 40, #3
4868; CM-NEXT:    ALU clause starting at 8:
4869; CM-NEXT:     MOV * T0.X, 0.0,
4870; CM-NEXT:    ALU clause starting at 9:
4871; CM-NEXT:     MOV * T0.Y, 0.0,
4872; CM-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
4873; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
4874  %ext = zext i1 %x to i64
4875  store i64 %ext, ptr addrspace(1) %out, align 8
4876  ret void
4877}
4878
; Kernel that sign-extends its i1 argument to i32 and stores the result to %out
; (align 4). The amdgcn expectations apply s_bfe_i32 with operand 0x10000 to the
; loaded argument dword before a dword store; the r600 expectations apply
; BFE_INT to the value fetched with VTX_READ_8.
4879define amdgpu_kernel void @i1_arg_sext_i32(ptr addrspace(1) %out, i1 %x) nounwind {
4880; SI-LABEL: i1_arg_sext_i32:
4881; SI:       ; %bb.0:
4882; SI-NEXT:    s_load_dword s2, s[4:5], 0xb
4883; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
4884; SI-NEXT:    s_mov_b32 s3, 0xf000
4885; SI-NEXT:    s_waitcnt lgkmcnt(0)
4886; SI-NEXT:    s_bfe_i32 s4, s2, 0x10000
4887; SI-NEXT:    s_mov_b32 s2, -1
4888; SI-NEXT:    v_mov_b32_e32 v0, s4
4889; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
4890; SI-NEXT:    s_endpgm
4891;
4892; VI-LABEL: i1_arg_sext_i32:
4893; VI:       ; %bb.0:
4894; VI-NEXT:    s_load_dword s2, s[4:5], 0x2c
4895; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
4896; VI-NEXT:    s_waitcnt lgkmcnt(0)
4897; VI-NEXT:    s_bfe_i32 s2, s2, 0x10000
4898; VI-NEXT:    v_mov_b32_e32 v0, s0
4899; VI-NEXT:    v_mov_b32_e32 v1, s1
4900; VI-NEXT:    v_mov_b32_e32 v2, s2
4901; VI-NEXT:    flat_store_dword v[0:1], v2
4902; VI-NEXT:    s_endpgm
4903;
4904; GFX9-LABEL: i1_arg_sext_i32:
4905; GFX9:       ; %bb.0:
4906; GFX9-NEXT:    s_load_dword s2, s[8:9], 0x8
4907; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
4908; GFX9-NEXT:    v_mov_b32_e32 v0, 0
4909; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4910; GFX9-NEXT:    s_bfe_i32 s2, s2, 0x10000
4911; GFX9-NEXT:    v_mov_b32_e32 v1, s2
4912; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
4913; GFX9-NEXT:    s_endpgm
4914;
4915; EG-LABEL: i1_arg_sext_i32:
4916; EG:       ; %bb.0:
4917; EG-NEXT:    ALU 0, @8, KC0[], KC1[]
4918; EG-NEXT:    TEX 0 @6
4919; EG-NEXT:    ALU 2, @9, KC0[CB0:0-32], KC1[]
4920; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
4921; EG-NEXT:    CF_END
4922; EG-NEXT:    PAD
4923; EG-NEXT:    Fetch clause starting at 6:
4924; EG-NEXT:     VTX_READ_8 T0.X, T0.X, 40, #3
4925; EG-NEXT:    ALU clause starting at 8:
4926; EG-NEXT:     MOV * T0.X, 0.0,
4927; EG-NEXT:    ALU clause starting at 9:
4928; EG-NEXT:     BFE_INT T0.X, T0.X, 0.0, 1,
4929; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
4930; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
4931;
4932; CM-LABEL: i1_arg_sext_i32:
4933; CM:       ; %bb.0:
4934; CM-NEXT:    ALU 0, @8, KC0[], KC1[]
4935; CM-NEXT:    TEX 0 @6
4936; CM-NEXT:    ALU 2, @9, KC0[CB0:0-32], KC1[]
4937; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X
4938; CM-NEXT:    CF_END
4939; CM-NEXT:    PAD
4940; CM-NEXT:    Fetch clause starting at 6:
4941; CM-NEXT:     VTX_READ_8 T0.X, T0.X, 40, #3
4942; CM-NEXT:    ALU clause starting at 8:
4943; CM-NEXT:     MOV * T0.X, 0.0,
4944; CM-NEXT:    ALU clause starting at 9:
4945; CM-NEXT:     BFE_INT * T0.X, T0.X, 0.0, 1,
4946; CM-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
4947; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
4948  %ext = sext i1 %x to i32
4949  store i32 %ext, ptr addrspace(1) %out, align 4
4950  ret void
4951}
4952
; Kernel that sign-extends its i1 argument to i64 and stores the result to %out
; (align 8). The amdgcn expectations apply s_bfe_i64 with operand 0x10000 and
; emit a dwordx2 store; the r600 expectations apply BFE_INT to the fetched value
; and copy that result into the second channel (T0.Y) before the store.
4953define amdgpu_kernel void @i1_arg_sext_i64(ptr addrspace(1) %out, i1 %x) nounwind {
4954; SI-LABEL: i1_arg_sext_i64:
4955; SI:       ; %bb.0:
4956; SI-NEXT:    s_load_dword s2, s[4:5], 0xb
4957; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
4958; SI-NEXT:    s_mov_b32 s3, 0xf000
4959; SI-NEXT:    s_waitcnt lgkmcnt(0)
4960; SI-NEXT:    s_bfe_i64 s[4:5], s[2:3], 0x10000
4961; SI-NEXT:    s_mov_b32 s2, -1
4962; SI-NEXT:    v_mov_b32_e32 v0, s4
4963; SI-NEXT:    v_mov_b32_e32 v1, s5
4964; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
4965; SI-NEXT:    s_endpgm
4966;
4967; VI-LABEL: i1_arg_sext_i64:
4968; VI:       ; %bb.0:
4969; VI-NEXT:    s_load_dword s0, s[4:5], 0x2c
4970; VI-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
4971; VI-NEXT:    s_waitcnt lgkmcnt(0)
4972; VI-NEXT:    s_bfe_i64 s[0:1], s[0:1], 0x10000
4973; VI-NEXT:    v_mov_b32_e32 v0, s2
4974; VI-NEXT:    v_mov_b32_e32 v3, s1
4975; VI-NEXT:    v_mov_b32_e32 v1, s3
4976; VI-NEXT:    v_mov_b32_e32 v2, s0
4977; VI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
4978; VI-NEXT:    s_endpgm
4979;
4980; GFX9-LABEL: i1_arg_sext_i64:
4981; GFX9:       ; %bb.0:
4982; GFX9-NEXT:    s_load_dword s0, s[8:9], 0x8
4983; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[8:9], 0x0
4984; GFX9-NEXT:    v_mov_b32_e32 v2, 0
4985; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4986; GFX9-NEXT:    s_bfe_i64 s[0:1], s[0:1], 0x10000
4987; GFX9-NEXT:    v_mov_b32_e32 v0, s0
4988; GFX9-NEXT:    v_mov_b32_e32 v1, s1
4989; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
4990; GFX9-NEXT:    s_endpgm
4991;
4992; EG-LABEL: i1_arg_sext_i64:
4993; EG:       ; %bb.0:
4994; EG-NEXT:    ALU 0, @8, KC0[], KC1[]
4995; EG-NEXT:    TEX 0 @6
4996; EG-NEXT:    ALU 3, @9, KC0[CB0:0-32], KC1[]
4997; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
4998; EG-NEXT:    CF_END
4999; EG-NEXT:    PAD
5000; EG-NEXT:    Fetch clause starting at 6:
5001; EG-NEXT:     VTX_READ_8 T0.X, T0.X, 40, #3
5002; EG-NEXT:    ALU clause starting at 8:
5003; EG-NEXT:     MOV * T0.X, 0.0,
5004; EG-NEXT:    ALU clause starting at 9:
5005; EG-NEXT:     BFE_INT T0.X, T0.X, 0.0, 1,
5006; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
5007; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
5008; EG-NEXT:     MOV * T0.Y, PV.X,
5009;
5010; CM-LABEL: i1_arg_sext_i64:
5011; CM:       ; %bb.0:
5012; CM-NEXT:    ALU 0, @8, KC0[], KC1[]
5013; CM-NEXT:    TEX 0 @6
5014; CM-NEXT:    ALU 3, @9, KC0[CB0:0-32], KC1[]
5015; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T0, T1.X
5016; CM-NEXT:    CF_END
5017; CM-NEXT:    PAD
5018; CM-NEXT:    Fetch clause starting at 6:
5019; CM-NEXT:     VTX_READ_8 T0.X, T0.X, 40, #3
5020; CM-NEXT:    ALU clause starting at 8:
5021; CM-NEXT:     MOV * T0.X, 0.0,
5022; CM-NEXT:    ALU clause starting at 9:
5023; CM-NEXT:     BFE_INT * T0.X, T0.X, 0.0, 1,
5024; CM-NEXT:     LSHR T1.X, KC0[2].Y, literal.x,
5025; CM-NEXT:     MOV * T0.Y, PV.X,
5026; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
5027  %ext = sext i1 %x to i64
5028  store i64 %ext, ptr addrspace(1) %out, align 8
5029  ret void
5030}
5031
; Kernel whose only argument is an empty struct; the body just returns.
; Every run expects nothing but the end-of-program sequence (s_endpgm for the
; amdgcn runs, CF_END followed by PAD for the r600 runs).
5032define amdgpu_kernel void @empty_struct_arg({} %in) nounwind {
5033; SI-LABEL: empty_struct_arg:
5034; SI:       ; %bb.0:
5035; SI-NEXT:    s_endpgm
5036;
5037; VI-LABEL: empty_struct_arg:
5038; VI:       ; %bb.0:
5039; VI-NEXT:    s_endpgm
5040;
5041; GFX9-LABEL: empty_struct_arg:
5042; GFX9:       ; %bb.0:
5043; GFX9-NEXT:    s_endpgm
5044;
5045; EGCM-LABEL: empty_struct_arg:
5046; EGCM:       ; %bb.0:
5047; EGCM-NEXT:    CF_END
5048; EGCM-NEXT:    PAD
5049  ret void
5050}
5051
5052; The correct load offsets for these:
5053; load 4 from 0,
5054; load 8 from 8
5055; load 4 from 24
5056; load 8 from 32
5057
5058; With the SelectionDAG argument lowering, the alignments for the
5059; struct members are not properly considered, making these wrong.
5060
5061; FIXME: Total argument size is computed wrong
; Kernel taking two {i32, i64} structs separated by an unnamed i8. Each of the
; four struct fields is extracted and written with a volatile store to the null
; addrspace(1) pointer. The amdgcn expectations show one scalar load per field;
; in the gfx900 expectations those loads are at 0x0, 0x8, 0x18 and 0x20.
5062define amdgpu_kernel void @struct_argument_alignment({i32, i64} %arg0, i8, {i32, i64} %arg1) {
5063; SI-LABEL: struct_argument_alignment:
5064; SI:       ; %bb.0:
5065; SI-NEXT:    s_load_dword s8, s[4:5], 0x9
5066; SI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xb
5067; SI-NEXT:    s_load_dword s9, s[4:5], 0xf
5068; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x11
5069; SI-NEXT:    s_mov_b32 s0, 0
5070; SI-NEXT:    s_mov_b32 s3, 0xf000
5071; SI-NEXT:    s_mov_b32 s2, -1
5072; SI-NEXT:    s_mov_b32 s1, s0
5073; SI-NEXT:    s_waitcnt lgkmcnt(0)
5074; SI-NEXT:    v_mov_b32_e32 v0, s8
5075; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
5076; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
5077; SI-NEXT:    v_mov_b32_e32 v0, s6
5078; SI-NEXT:    v_mov_b32_e32 v1, s7
5079; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
5080; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
5081; SI-NEXT:    v_mov_b32_e32 v0, s9
5082; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
5083; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
5084; SI-NEXT:    v_mov_b32_e32 v0, s4
5085; SI-NEXT:    v_mov_b32_e32 v1, s5
5086; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
5087; SI-NEXT:    s_waitcnt vmcnt(0)
5088; SI-NEXT:    s_endpgm
5089;
5090; VI-LABEL: struct_argument_alignment:
5091; VI:       ; %bb.0:
5092; VI-NEXT:    s_load_dword s6, s[4:5], 0x24
5093; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2c
5094; VI-NEXT:    s_load_dword s7, s[4:5], 0x3c
5095; VI-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x44
5096; VI-NEXT:    v_mov_b32_e32 v0, 0
5097; VI-NEXT:    v_mov_b32_e32 v1, 0
5098; VI-NEXT:    s_waitcnt lgkmcnt(0)
5099; VI-NEXT:    v_mov_b32_e32 v2, s6
5100; VI-NEXT:    flat_store_dword v[0:1], v2
5101; VI-NEXT:    s_waitcnt vmcnt(0)
5102; VI-NEXT:    v_mov_b32_e32 v3, s1
5103; VI-NEXT:    v_mov_b32_e32 v2, s0
5104; VI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
5105; VI-NEXT:    s_waitcnt vmcnt(0)
5106; VI-NEXT:    v_mov_b32_e32 v2, s7
5107; VI-NEXT:    flat_store_dword v[0:1], v2
5108; VI-NEXT:    s_waitcnt vmcnt(0)
5109; VI-NEXT:    v_mov_b32_e32 v2, s2
5110; VI-NEXT:    v_mov_b32_e32 v3, s3
5111; VI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
5112; VI-NEXT:    s_waitcnt vmcnt(0)
5113; VI-NEXT:    s_endpgm
5114;
5115; GFX9-LABEL: struct_argument_alignment:
5116; GFX9:       ; %bb.0:
5117; GFX9-NEXT:    s_load_dword s4, s[8:9], 0x0
5118; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x8
5119; GFX9-NEXT:    s_load_dword s5, s[8:9], 0x18
5120; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[8:9], 0x20
5121; GFX9-NEXT:    v_mov_b32_e32 v0, 0
5122; GFX9-NEXT:    v_mov_b32_e32 v1, 0
5123; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
5124; GFX9-NEXT:    v_mov_b32_e32 v2, s4
5125; GFX9-NEXT:    global_store_dword v[0:1], v2, off
5126; GFX9-NEXT:    s_waitcnt vmcnt(0)
5127; GFX9-NEXT:    v_mov_b32_e32 v3, s1
5128; GFX9-NEXT:    v_mov_b32_e32 v2, s0
5129; GFX9-NEXT:    global_store_dwordx2 v[0:1], v[2:3], off
5130; GFX9-NEXT:    s_waitcnt vmcnt(0)
5131; GFX9-NEXT:    v_mov_b32_e32 v2, s5
5132; GFX9-NEXT:    global_store_dword v[0:1], v2, off
5133; GFX9-NEXT:    s_waitcnt vmcnt(0)
5134; GFX9-NEXT:    v_mov_b32_e32 v2, s2
5135; GFX9-NEXT:    v_mov_b32_e32 v3, s3
5136; GFX9-NEXT:    global_store_dwordx2 v[0:1], v[2:3], off
5137; GFX9-NEXT:    s_waitcnt vmcnt(0)
5138; GFX9-NEXT:    s_endpgm
5139;
5140; EG-LABEL: struct_argument_alignment:
5141; EG:       ; %bb.0:
5142; EG-NEXT:    ALU 9, @8, KC0[CB0:0-32], KC1[]
5143; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T7.X, T6.X, 0
5144; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T5.X, T4.X, 0
5145; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T3.X, T6.X, 0
5146; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T2.X, T6.X, 0
5147; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.X, T4.X, 0
5148; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T6.X, 1
5149; EG-NEXT:    CF_END
5150; EG-NEXT:    ALU clause starting at 8:
5151; EG-NEXT:     MOV T0.X, KC0[4].Y,
5152; EG-NEXT:     MOV * T1.X, KC0[4].Z,
5153; EG-NEXT:     MOV T2.X, KC0[3].W,
5154; EG-NEXT:     MOV * T3.X, KC0[2].W,
5155; EG-NEXT:     MOV T4.X, literal.x,
5156; EG-NEXT:     MOV * T5.X, KC0[3].X,
5157; EG-NEXT:    1(1.401298e-45), 0(0.000000e+00)
5158; EG-NEXT:     MOV T6.X, literal.x,
5159; EG-NEXT:     MOV * T7.X, KC0[2].Y,
5160; EG-NEXT:    0(0.000000e+00), 0(0.000000e+00)
5161;
5162; CM-LABEL: struct_argument_alignment:
5163; CM:       ; %bb.0:
5164; CM-NEXT:    ALU 9, @8, KC0[CB0:0-32], KC1[]
5165; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T7.X, T6.X
5166; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T5.X, T4.X
5167; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T3.X, T6.X
5168; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T2.X, T6.X
5169; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T1.X, T4.X
5170; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T0.X, T6.X
5171; CM-NEXT:    CF_END
5172; CM-NEXT:    ALU clause starting at 8:
5173; CM-NEXT:     MOV * T0.X, KC0[4].Y,
5174; CM-NEXT:     MOV * T1.X, KC0[4].Z,
5175; CM-NEXT:     MOV * T2.X, KC0[3].W,
5176; CM-NEXT:     MOV * T3.X, KC0[2].W,
5177; CM-NEXT:     MOV * T4.X, literal.x,
5178; CM-NEXT:    1(1.401298e-45), 0(0.000000e+00)
5179; CM-NEXT:     MOV * T5.X, KC0[3].X,
5180; CM-NEXT:     MOV * T6.X, literal.x,
5181; CM-NEXT:    0(0.000000e+00), 0(0.000000e+00)
5182; CM-NEXT:     MOV * T7.X, KC0[2].Y,
5183  %val0 = extractvalue {i32, i64} %arg0, 0
5184  %val1 = extractvalue {i32, i64} %arg0, 1
5185  %val2 = extractvalue {i32, i64} %arg1, 0
5186  %val3 = extractvalue {i32, i64} %arg1, 1
5187  store volatile i32 %val0, ptr addrspace(1) null
5188  store volatile i64 %val1, ptr addrspace(1) null
5189  store volatile i32 %val2, ptr addrspace(1) null
5190  store volatile i64 %val3, ptr addrspace(1) null
5191  ret void
5192}
5193
5194; No padding between the i8 and the next struct, but round up at the end to a
5195; 4-byte multiple.
; Packed-struct variant: two <{i32, i64}> arguments separated by an unnamed i8,
; with every field extracted and written by a volatile store to the null
; addrspace(1) pointer. In the gfx900 expectations the first struct is read with
; scalar loads at 0x0 and 0x4, while the second struct is read with global loads
; at offsets 13 and 17.
5196define amdgpu_kernel void @packed_struct_argument_alignment(<{i32, i64}> %arg0, i8, <{i32, i64}> %arg1) {
5197; SI-LABEL: packed_struct_argument_alignment:
5198; SI:       ; %bb.0:
5199; SI-NEXT:    s_mov_b32 s7, 0xf000
5200; SI-NEXT:    s_mov_b32 s6, -1
5201; SI-NEXT:    s_load_dword s2, s[4:5], 0x9
5202; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xa
5203; SI-NEXT:    buffer_load_ubyte v4, off, s[4:7], 0 offset:49
5204; SI-NEXT:    buffer_load_ubyte v5, off, s[4:7], 0 offset:50
5205; SI-NEXT:    buffer_load_ubyte v6, off, s[4:7], 0 offset:51
5206; SI-NEXT:    buffer_load_ubyte v7, off, s[4:7], 0 offset:52
5207; SI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[4:7], 0 offset:53
5208; SI-NEXT:    s_mov_b32 s4, 0
5209; SI-NEXT:    s_mov_b32 s5, s4
5210; SI-NEXT:    s_waitcnt lgkmcnt(0)
5211; SI-NEXT:    v_mov_b32_e32 v2, s2
5212; SI-NEXT:    buffer_store_dword v2, off, s[4:7], 0
5213; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
5214; SI-NEXT:    v_mov_b32_e32 v3, s1
5215; SI-NEXT:    v_mov_b32_e32 v2, s0
5216; SI-NEXT:    buffer_store_dwordx2 v[2:3], off, s[4:7], 0
5217; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
5218; SI-NEXT:    v_lshlrev_b32_e32 v2, 8, v5
5219; SI-NEXT:    v_lshlrev_b32_e32 v3, 8, v7
5220; SI-NEXT:    v_or_b32_e32 v2, v2, v4
5221; SI-NEXT:    v_or_b32_e32 v3, v3, v6
5222; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
5223; SI-NEXT:    v_or_b32_e32 v2, v3, v2
5224; SI-NEXT:    buffer_store_dword v2, off, s[4:7], 0
5225; SI-NEXT:    s_waitcnt vmcnt(0)
5226; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
5227; SI-NEXT:    s_waitcnt vmcnt(0)
5228; SI-NEXT:    s_endpgm
5229;
5230; VI-LABEL: packed_struct_argument_alignment:
5231; VI:       ; %bb.0:
5232; VI-NEXT:    s_add_u32 s0, s4, 49
5233; VI-NEXT:    s_addc_u32 s1, s5, 0
5234; VI-NEXT:    s_add_u32 s2, s4, 50
5235; VI-NEXT:    s_addc_u32 s3, s5, 0
5236; VI-NEXT:    v_mov_b32_e32 v3, s1
5237; VI-NEXT:    v_mov_b32_e32 v2, s0
5238; VI-NEXT:    s_add_u32 s0, s0, 3
5239; VI-NEXT:    s_addc_u32 s1, s1, 0
5240; VI-NEXT:    v_mov_b32_e32 v5, s1
5241; VI-NEXT:    v_mov_b32_e32 v4, s0
5242; VI-NEXT:    s_add_u32 s0, s4, 51
5243; VI-NEXT:    s_addc_u32 s1, s5, 0
5244; VI-NEXT:    v_mov_b32_e32 v0, s2
5245; VI-NEXT:    v_mov_b32_e32 v7, s1
5246; VI-NEXT:    v_mov_b32_e32 v1, s3
5247; VI-NEXT:    v_mov_b32_e32 v6, s0
5248; VI-NEXT:    flat_load_ubyte v8, v[0:1]
5249; VI-NEXT:    flat_load_ubyte v9, v[2:3]
5250; VI-NEXT:    flat_load_ubyte v10, v[4:5]
5251; VI-NEXT:    flat_load_ubyte v6, v[6:7]
5252; VI-NEXT:    s_add_u32 s0, s4, 53
5253; VI-NEXT:    s_addc_u32 s1, s5, 0
5254; VI-NEXT:    v_mov_b32_e32 v0, s0
5255; VI-NEXT:    v_mov_b32_e32 v1, s1
5256; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
5257; VI-NEXT:    s_load_dword s2, s[4:5], 0x24
5258; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x28
5259; VI-NEXT:    v_mov_b32_e32 v2, 0
5260; VI-NEXT:    v_mov_b32_e32 v3, 0
5261; VI-NEXT:    s_waitcnt lgkmcnt(0)
5262; VI-NEXT:    v_mov_b32_e32 v7, s2
5263; VI-NEXT:    v_mov_b32_e32 v5, s1
5264; VI-NEXT:    v_mov_b32_e32 v4, s0
5265; VI-NEXT:    flat_store_dword v[2:3], v7
5266; VI-NEXT:    s_waitcnt vmcnt(0)
5267; VI-NEXT:    flat_store_dwordx2 v[2:3], v[4:5]
5268; VI-NEXT:    s_waitcnt vmcnt(0)
5269; VI-NEXT:    v_lshlrev_b32_e32 v4, 8, v8
5270; VI-NEXT:    v_or_b32_e32 v4, v4, v9
5271; VI-NEXT:    v_lshlrev_b32_e32 v5, 8, v10
5272; VI-NEXT:    v_or_b32_sdwa v5, v5, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
5273; VI-NEXT:    v_or_b32_e32 v4, v5, v4
5274; VI-NEXT:    flat_store_dword v[2:3], v4
5275; VI-NEXT:    s_waitcnt vmcnt(0)
5276; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
5277; VI-NEXT:    s_waitcnt vmcnt(0)
5278; VI-NEXT:    s_endpgm
5279;
5280; GFX9-LABEL: packed_struct_argument_alignment:
5281; GFX9:       ; %bb.0:
5282; GFX9-NEXT:    v_mov_b32_e32 v2, 0
5283; GFX9-NEXT:    global_load_dword v6, v2, s[8:9] offset:13
5284; GFX9-NEXT:    global_load_dwordx2 v[0:1], v2, s[8:9] offset:17
5285; GFX9-NEXT:    s_load_dword s2, s[8:9], 0x0
5286; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x4
5287; GFX9-NEXT:    v_mov_b32_e32 v2, 0
5288; GFX9-NEXT:    v_mov_b32_e32 v3, 0
5289; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
5290; GFX9-NEXT:    v_mov_b32_e32 v7, s2
5291; GFX9-NEXT:    v_mov_b32_e32 v5, s1
5292; GFX9-NEXT:    v_mov_b32_e32 v4, s0
5293; GFX9-NEXT:    global_store_dword v[2:3], v7, off
5294; GFX9-NEXT:    s_waitcnt vmcnt(0)
5295; GFX9-NEXT:    global_store_dwordx2 v[2:3], v[4:5], off
5296; GFX9-NEXT:    s_waitcnt vmcnt(0)
5297; GFX9-NEXT:    global_store_dword v[2:3], v6, off
5298; GFX9-NEXT:    s_waitcnt vmcnt(0)
5299; GFX9-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
5300; GFX9-NEXT:    s_waitcnt vmcnt(0)
5301; GFX9-NEXT:    s_endpgm
5302;
5303; EG-LABEL: packed_struct_argument_alignment:
5304; EG:       ; %bb.0:
5305; EG-NEXT:    ALU 6, @18, KC0[CB0:0-32], KC1[]
5306; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T4.X, T3.X, 0
5307; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T2.X, T1.X, 0
5308; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T3.X, 0
5309; EG-NEXT:    ALU 2, @25, KC0[], KC1[]
5310; EG-NEXT:    TEX 0 @12
5311; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T3.X, 0
5312; EG-NEXT:    TEX 0 @14
5313; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T2.X, T1.X, 0
5314; EG-NEXT:    TEX 0 @16
5315; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T4.X, T3.X, 1
5316; EG-NEXT:    CF_END
5317; EG-NEXT:    Fetch clause starting at 12:
5318; EG-NEXT:     VTX_READ_32 T0.X, T0.X, 49, #3
5319; EG-NEXT:    Fetch clause starting at 14:
5320; EG-NEXT:     VTX_READ_32 T2.X, T2.X, 57, #3
5321; EG-NEXT:    Fetch clause starting at 16:
5322; EG-NEXT:     VTX_READ_32 T4.X, T4.X, 53, #3
5323; EG-NEXT:    ALU clause starting at 18:
5324; EG-NEXT:     MOV T0.X, KC0[2].Z,
5325; EG-NEXT:     MOV * T1.X, literal.x,
5326; EG-NEXT:    1(1.401298e-45), 0(0.000000e+00)
5327; EG-NEXT:     MOV T2.X, KC0[2].W,
5328; EG-NEXT:     MOV * T3.X, literal.x,
5329; EG-NEXT:    0(0.000000e+00), 0(0.000000e+00)
5330; EG-NEXT:     MOV * T4.X, KC0[2].Y,
5331; EG-NEXT:    ALU clause starting at 25:
5332; EG-NEXT:     MOV T0.X, 0.0,
5333; EG-NEXT:     MOV * T2.X, 0.0,
5334; EG-NEXT:     MOV * T4.X, 0.0,
5335;
5336; CM-LABEL: packed_struct_argument_alignment:
5337; CM:       ; %bb.0:
5338; CM-NEXT:    ALU 6, @18, KC0[CB0:0-32], KC1[]
5339; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T4.X, T3.X
5340; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T2.X, T1.X
5341; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T0.X, T3.X
5342; CM-NEXT:    ALU 2, @25, KC0[], KC1[]
5343; CM-NEXT:    TEX 0 @12
5344; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T0.X, T3.X
5345; CM-NEXT:    TEX 0 @14
5346; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T2.X, T1.X
5347; CM-NEXT:    TEX 0 @16
5348; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T4.X, T3.X
5349; CM-NEXT:    CF_END
5350; CM-NEXT:    Fetch clause starting at 12:
5351; CM-NEXT:     VTX_READ_32 T0.X, T0.X, 49, #3
5352; CM-NEXT:    Fetch clause starting at 14:
5353; CM-NEXT:     VTX_READ_32 T2.X, T2.X, 57, #3
5354; CM-NEXT:    Fetch clause starting at 16:
5355; CM-NEXT:     VTX_READ_32 T4.X, T4.X, 53, #3
5356; CM-NEXT:    ALU clause starting at 18:
5357; CM-NEXT:     MOV * T0.X, KC0[2].Z,
5358; CM-NEXT:     MOV * T1.X, literal.x,
5359; CM-NEXT:    1(1.401298e-45), 0(0.000000e+00)
5360; CM-NEXT:     MOV * T2.X, KC0[2].W,
5361; CM-NEXT:     MOV * T3.X, literal.x,
5362; CM-NEXT:    0(0.000000e+00), 0(0.000000e+00)
5363; CM-NEXT:     MOV * T4.X, KC0[2].Y,
5364; CM-NEXT:    ALU clause starting at 25:
5365; CM-NEXT:     MOV * T0.X, 0.0,
5366; CM-NEXT:     MOV * T2.X, 0.0,
5367; CM-NEXT:     MOV * T4.X, 0.0,
5368  %val0 = extractvalue <{i32, i64}> %arg0, 0
5369  %val1 = extractvalue <{i32, i64}> %arg0, 1
5370  %val2 = extractvalue <{i32, i64}> %arg1, 0
5371  %val3 = extractvalue <{i32, i64}> %arg1, 1
5372  store volatile i32 %val0, ptr addrspace(1) null
5373  store volatile i64 %val1, ptr addrspace(1) null
5374  store volatile i32 %val2, ptr addrspace(1) null
5375  store volatile i64 %val3, ptr addrspace(1) null
5376  ret void
5377}
5378
5379define amdgpu_kernel void @struct_argument_alignment_after({i32, i64} %arg0, i8, {i32, i64} %arg2, i8, <4 x i32> %arg4) {
5380; SI-LABEL: struct_argument_alignment_after:
5381; SI:       ; %bb.0:
5382; SI-NEXT:    s_load_dword s12, s[4:5], 0x9
5383; SI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xb
5384; SI-NEXT:    s_load_dword s13, s[4:5], 0xf
5385; SI-NEXT:    s_load_dwordx2 s[10:11], s[4:5], 0x11
5386; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x15
5387; SI-NEXT:    s_mov_b32 s4, 0
5388; SI-NEXT:    s_mov_b32 s7, 0xf000
5389; SI-NEXT:    s_mov_b32 s6, -1
5390; SI-NEXT:    s_mov_b32 s5, s4
5391; SI-NEXT:    s_waitcnt lgkmcnt(0)
5392; SI-NEXT:    v_mov_b32_e32 v0, s12
5393; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
5394; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
5395; SI-NEXT:    v_mov_b32_e32 v0, s8
5396; SI-NEXT:    v_mov_b32_e32 v1, s9
5397; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
5398; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
5399; SI-NEXT:    v_mov_b32_e32 v0, s13
5400; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
5401; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
5402; SI-NEXT:    v_mov_b32_e32 v0, s10
5403; SI-NEXT:    v_mov_b32_e32 v1, s11
5404; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
5405; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
5406; SI-NEXT:    v_mov_b32_e32 v0, s0
5407; SI-NEXT:    v_mov_b32_e32 v1, s1
5408; SI-NEXT:    v_mov_b32_e32 v2, s2
5409; SI-NEXT:    v_mov_b32_e32 v3, s3
5410; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
5411; SI-NEXT:    s_waitcnt vmcnt(0)
5412; SI-NEXT:    s_endpgm
5413;
5414; VI-LABEL: struct_argument_alignment_after:
5415; VI:       ; %bb.0:
5416; VI-NEXT:    s_load_dword s10, s[4:5], 0x24
5417; VI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x2c
5418; VI-NEXT:    s_load_dword s11, s[4:5], 0x3c
5419; VI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x44
5420; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x54
5421; VI-NEXT:    v_mov_b32_e32 v4, 0
5422; VI-NEXT:    v_mov_b32_e32 v5, 0
5423; VI-NEXT:    s_waitcnt lgkmcnt(0)
5424; VI-NEXT:    v_mov_b32_e32 v0, s10
5425; VI-NEXT:    flat_store_dword v[4:5], v0
5426; VI-NEXT:    s_waitcnt vmcnt(0)
5427; VI-NEXT:    v_mov_b32_e32 v0, s6
5428; VI-NEXT:    v_mov_b32_e32 v1, s7
5429; VI-NEXT:    flat_store_dwordx2 v[4:5], v[0:1]
5430; VI-NEXT:    s_waitcnt vmcnt(0)
5431; VI-NEXT:    v_mov_b32_e32 v0, s11
5432; VI-NEXT:    flat_store_dword v[4:5], v0
5433; VI-NEXT:    s_waitcnt vmcnt(0)
5434; VI-NEXT:    v_mov_b32_e32 v0, s8
5435; VI-NEXT:    v_mov_b32_e32 v1, s9
5436; VI-NEXT:    flat_store_dwordx2 v[4:5], v[0:1]
5437; VI-NEXT:    s_waitcnt vmcnt(0)
5438; VI-NEXT:    v_mov_b32_e32 v0, s0
5439; VI-NEXT:    v_mov_b32_e32 v1, s1
5440; VI-NEXT:    v_mov_b32_e32 v2, s2
5441; VI-NEXT:    v_mov_b32_e32 v3, s3
5442; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
5443; VI-NEXT:    s_waitcnt vmcnt(0)
5444; VI-NEXT:    s_endpgm
5445;
5446; GFX9-LABEL: struct_argument_alignment_after:
5447; GFX9:       ; %bb.0:
5448; GFX9-NEXT:    s_load_dword s10, s[8:9], 0x0
5449; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
5450; GFX9-NEXT:    s_load_dword s11, s[8:9], 0x18
5451; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x20
5452; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x30
5453; GFX9-NEXT:    v_mov_b32_e32 v4, 0
5454; GFX9-NEXT:    v_mov_b32_e32 v5, 0
5455; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
5456; GFX9-NEXT:    v_mov_b32_e32 v0, s10
5457; GFX9-NEXT:    global_store_dword v[4:5], v0, off
5458; GFX9-NEXT:    s_waitcnt vmcnt(0)
5459; GFX9-NEXT:    v_mov_b32_e32 v0, s4
5460; GFX9-NEXT:    v_mov_b32_e32 v1, s5
5461; GFX9-NEXT:    global_store_dwordx2 v[4:5], v[0:1], off
5462; GFX9-NEXT:    s_waitcnt vmcnt(0)
5463; GFX9-NEXT:    v_mov_b32_e32 v0, s11
5464; GFX9-NEXT:    global_store_dword v[4:5], v0, off
5465; GFX9-NEXT:    s_waitcnt vmcnt(0)
5466; GFX9-NEXT:    v_mov_b32_e32 v0, s6
5467; GFX9-NEXT:    v_mov_b32_e32 v1, s7
5468; GFX9-NEXT:    global_store_dwordx2 v[4:5], v[0:1], off
5469; GFX9-NEXT:    s_waitcnt vmcnt(0)
5470; GFX9-NEXT:    v_mov_b32_e32 v0, s0
5471; GFX9-NEXT:    v_mov_b32_e32 v1, s1
5472; GFX9-NEXT:    v_mov_b32_e32 v2, s2
5473; GFX9-NEXT:    v_mov_b32_e32 v3, s3
5474; GFX9-NEXT:    global_store_dwordx4 v[4:5], v[0:3], off
5475; GFX9-NEXT:    s_waitcnt vmcnt(0)
5476; GFX9-NEXT:    s_endpgm
5477;
5478; EG-LABEL: struct_argument_alignment_after:
5479; EG:       ; %bb.0:
5480; EG-NEXT:    ALU 13, @10, KC0[CB0:0-32], KC1[]
5481; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T8.X, T7.X, 0
5482; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T6.X, T5.X, 0
5483; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T4.X, T7.X, 0
5484; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T3.X, T7.X, 0
5485; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T2.X, T5.X, 0
5486; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.X, T7.X, 0
5487; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T7.X, 1
5488; EG-NEXT:    CF_END
5489; EG-NEXT:    PAD
5490; EG-NEXT:    ALU clause starting at 10:
5491; EG-NEXT:     MOV * T0.W, KC0[6].X,
5492; EG-NEXT:     MOV * T0.Z, KC0[5].W,
5493; EG-NEXT:     MOV * T0.Y, KC0[5].Z,
5494; EG-NEXT:     MOV T0.X, KC0[5].Y,
5495; EG-NEXT:     MOV * T1.X, KC0[4].Y,
5496; EG-NEXT:     MOV T2.X, KC0[4].Z,
5497; EG-NEXT:     MOV * T3.X, KC0[3].W,
5498; EG-NEXT:     MOV T4.X, KC0[2].W,
5499; EG-NEXT:     MOV * T5.X, literal.x,
5500; EG-NEXT:    1(1.401298e-45), 0(0.000000e+00)
5501; EG-NEXT:     MOV T6.X, KC0[3].X,
5502; EG-NEXT:     MOV * T7.X, literal.x,
5503; EG-NEXT:    0(0.000000e+00), 0(0.000000e+00)
5504; EG-NEXT:     MOV * T8.X, KC0[2].Y,
5505;
5506; CM-LABEL: struct_argument_alignment_after:
5507; CM:       ; %bb.0:
5508; CM-NEXT:    ALU 13, @10, KC0[CB0:0-32], KC1[]
5509; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T8.X, T7.X
5510; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T6.X, T5.X
5511; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T4.X, T7.X
5512; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T3.X, T7.X
5513; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T2.X, T5.X
5514; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T1.X, T7.X
5515; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T0, T7.X
5516; CM-NEXT:    CF_END
5517; CM-NEXT:    PAD
5518; CM-NEXT:    ALU clause starting at 10:
5519; CM-NEXT:     MOV * T0.W, KC0[6].X,
5520; CM-NEXT:     MOV * T0.Z, KC0[5].W,
5521; CM-NEXT:     MOV * T0.Y, KC0[5].Z,
5522; CM-NEXT:     MOV * T0.X, KC0[5].Y,
5523; CM-NEXT:     MOV * T1.X, KC0[4].Y,
5524; CM-NEXT:     MOV * T2.X, KC0[4].Z,
5525; CM-NEXT:     MOV * T3.X, KC0[3].W,
5526; CM-NEXT:     MOV * T4.X, KC0[2].W,
5527; CM-NEXT:     MOV * T5.X, literal.x,
5528; CM-NEXT:    1(1.401298e-45), 0(0.000000e+00)
5529; CM-NEXT:     MOV * T6.X, KC0[3].X,
5530; CM-NEXT:     MOV * T7.X, literal.x,
5531; CM-NEXT:    0(0.000000e+00), 0(0.000000e+00)
5532; CM-NEXT:     MOV * T8.X, KC0[2].Y,
5533  %val0 = extractvalue {i32, i64} %arg0, 0
5534  %val1 = extractvalue {i32, i64} %arg0, 1
5535  %val2 = extractvalue {i32, i64} %arg2, 0
5536  %val3 = extractvalue {i32, i64} %arg2, 1
5537  store volatile i32 %val0, ptr addrspace(1) null
5538  store volatile i64 %val1, ptr addrspace(1) null
5539  store volatile i32 %val2, ptr addrspace(1) null
5540  store volatile i64 %val3, ptr addrspace(1) null
5541  store volatile <4 x i32> %arg4, ptr addrspace(1) null
5542  ret void
5543}
5544
; Kernel taking an i16 followed by a [3 x i32] array argument. Both values are
; written with volatile stores through an undef addrspace(1) pointer.
; Per the expected output below, the amdgcn runs fetch both arguments with one
; s_load_dwordx4 (s0 feeds the short store, s3/s2/s1 feed the dword stores, in
; that order). The r600 runs read the i16 with VTX_READ_16 at offset 36 and
; take the array elements from KC0[2].Z, KC0[2].W and KC0[3].X.
5545define amdgpu_kernel void @array_3xi32(i16 %arg0, [3 x i32] %arg1) {
5546; SI-LABEL: array_3xi32:
5547; SI:       ; %bb.0:
5548; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
5549; SI-NEXT:    s_mov_b32 s7, 0xf000
5550; SI-NEXT:    s_mov_b32 s6, -1
5551; SI-NEXT:    s_waitcnt lgkmcnt(0)
5552; SI-NEXT:    v_mov_b32_e32 v0, s0
5553; SI-NEXT:    buffer_store_short v0, off, s[4:7], 0
5554; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
5555; SI-NEXT:    v_mov_b32_e32 v0, s3
5556; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
5557; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
5558; SI-NEXT:    v_mov_b32_e32 v0, s2
5559; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
5560; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
5561; SI-NEXT:    v_mov_b32_e32 v0, s1
5562; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
5563; SI-NEXT:    s_waitcnt vmcnt(0)
5564; SI-NEXT:    s_endpgm
5565;
5566; VI-LABEL: array_3xi32:
5567; VI:       ; %bb.0:
5568; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
5569; VI-NEXT:    s_waitcnt lgkmcnt(0)
5570; VI-NEXT:    v_mov_b32_e32 v0, s0
5571; VI-NEXT:    v_mov_b32_e32 v1, s3
5572; VI-NEXT:    v_mov_b32_e32 v2, s2
5573; VI-NEXT:    flat_store_short v[0:1], v0
5574; VI-NEXT:    s_waitcnt vmcnt(0)
5575; VI-NEXT:    flat_store_dword v[0:1], v1
5576; VI-NEXT:    s_waitcnt vmcnt(0)
5577; VI-NEXT:    flat_store_dword v[0:1], v2
5578; VI-NEXT:    s_waitcnt vmcnt(0)
5579; VI-NEXT:    v_mov_b32_e32 v0, s1
5580; VI-NEXT:    flat_store_dword v[0:1], v0
5581; VI-NEXT:    s_waitcnt vmcnt(0)
5582; VI-NEXT:    s_endpgm
5583;
5584; GFX9-LABEL: array_3xi32:
5585; GFX9:       ; %bb.0:
5586; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
5587; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
5588; GFX9-NEXT:    v_mov_b32_e32 v0, s0
5589; GFX9-NEXT:    v_mov_b32_e32 v1, s3
5590; GFX9-NEXT:    v_mov_b32_e32 v2, s2
5591; GFX9-NEXT:    global_store_short v[0:1], v0, off
5592; GFX9-NEXT:    s_waitcnt vmcnt(0)
5593; GFX9-NEXT:    global_store_dword v[0:1], v1, off
5594; GFX9-NEXT:    s_waitcnt vmcnt(0)
5595; GFX9-NEXT:    global_store_dword v[0:1], v2, off
5596; GFX9-NEXT:    s_waitcnt vmcnt(0)
5597; GFX9-NEXT:    v_mov_b32_e32 v0, s1
5598; GFX9-NEXT:    global_store_dword v[0:1], v0, off
5599; GFX9-NEXT:    s_waitcnt vmcnt(0)
5600; GFX9-NEXT:    s_endpgm
5601;
5602; EG-LABEL: array_3xi32:
5603; EG:       ; %bb.0:
5604; EG-NEXT:    ALU 0, @10, KC0[], KC1[]
5605; EG-NEXT:    TEX 0 @8
5606; EG-NEXT:    ALU 9, @11, KC0[CB0:0-32], KC1[]
5607; EG-NEXT:    MEM_RAT MSKOR T0.XW, T4.X
5608; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T3.X, T4.X, 0
5609; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T2.X, T4.X, 0
5610; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.X, T4.X, 1
5611; EG-NEXT:    CF_END
5612; EG-NEXT:    Fetch clause starting at 8:
5613; EG-NEXT:     VTX_READ_16 T0.X, T0.X, 36, #3
5614; EG-NEXT:    ALU clause starting at 10:
5615; EG-NEXT:     MOV * T0.X, 0.0,
5616; EG-NEXT:    ALU clause starting at 11:
5617; EG-NEXT:     AND_INT T0.X, T0.X, literal.x,
5618; EG-NEXT:     MOV * T0.W, literal.x,
5619; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
5620; EG-NEXT:     MOV T0.Y, 0.0,
5621; EG-NEXT:     MOV * T0.Z, 0.0,
5622; EG-NEXT:     MOV T1.X, KC0[2].Z,
5623; EG-NEXT:     MOV * T2.X, KC0[2].W,
5624; EG-NEXT:     MOV T3.X, KC0[3].X,
5625; EG-NEXT:     MOV * T4.X, literal.x,
5626; EG-NEXT:    0(0.000000e+00), 0(0.000000e+00)
5627;
5628; CM-LABEL: array_3xi32:
5629; CM:       ; %bb.0:
5630; CM-NEXT:    ALU 0, @10, KC0[], KC1[]
5631; CM-NEXT:    TEX 0 @8
5632; CM-NEXT:    ALU 9, @11, KC0[CB0:0-32], KC1[]
5633; CM-NEXT:    MEM_RAT MSKOR T0.XW, T4.X
5634; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T3.X, T4.X
5635; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T2.X, T4.X
5636; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T1.X, T4.X
5637; CM-NEXT:    CF_END
5638; CM-NEXT:    Fetch clause starting at 8:
5639; CM-NEXT:     VTX_READ_16 T0.X, T0.X, 36, #3
5640; CM-NEXT:    ALU clause starting at 10:
5641; CM-NEXT:     MOV * T0.X, 0.0,
5642; CM-NEXT:    ALU clause starting at 11:
5643; CM-NEXT:     AND_INT T0.X, T0.X, literal.x,
5644; CM-NEXT:     MOV * T0.W, literal.x,
5645; CM-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
5646; CM-NEXT:     MOV T0.Y, 0.0,
5647; CM-NEXT:     MOV * T0.Z, 0.0,
5648; CM-NEXT:     MOV * T1.X, KC0[2].Z,
5649; CM-NEXT:     MOV * T2.X, KC0[2].W,
5650; CM-NEXT:     MOV * T3.X, KC0[3].X,
5651; CM-NEXT:     MOV * T4.X, literal.x,
5652; CM-NEXT:    0(0.000000e+00), 0(0.000000e+00)
5653  store volatile i16 %arg0, ptr addrspace(1) undef
5654  store volatile [3 x i32] %arg1, ptr addrspace(1) undef
5655  ret void
5656}
5657
5658; FIXME: Why not all scalar loads?
; Kernel taking an i8 followed by a [3 x i16] array argument; both are written
; with volatile stores through an undef addrspace(1) pointer.
; Per the expected output below, the amdgcn runs fetch %arg0 with s_load_dword
; but read the three i16 elements with separate ushort vector-memory loads
; (byte offsets 38/40/42 on the default and tonga runs, 2/4/6 on gfx900).
; NOTE(review): presumably this is the behaviour the FIXME above is
; questioning -- confirm.
5659define amdgpu_kernel void @array_3xi16(i8 %arg0, [3 x i16] %arg1) {
5660; SI-LABEL: array_3xi16:
5661; SI:       ; %bb.0:
5662; SI-NEXT:    s_load_dword s0, s[4:5], 0x9
5663; SI-NEXT:    s_mov_b32 s7, 0xf000
5664; SI-NEXT:    s_mov_b32 s6, -1
5665; SI-NEXT:    buffer_load_ushort v0, off, s[4:7], 0 offset:42
5666; SI-NEXT:    buffer_load_ushort v1, off, s[4:7], 0 offset:40
5667; SI-NEXT:    buffer_load_ushort v2, off, s[4:7], 0 offset:38
5668; SI-NEXT:    s_waitcnt lgkmcnt(0)
5669; SI-NEXT:    v_mov_b32_e32 v3, s0
5670; SI-NEXT:    buffer_store_byte v3, off, s[4:7], 0
5671; SI-NEXT:    s_waitcnt vmcnt(0)
5672; SI-NEXT:    buffer_store_short v0, off, s[4:7], 0
5673; SI-NEXT:    s_waitcnt vmcnt(0)
5674; SI-NEXT:    buffer_store_short v1, off, s[4:7], 0
5675; SI-NEXT:    s_waitcnt vmcnt(0)
5676; SI-NEXT:    buffer_store_short v2, off, s[4:7], 0
5677; SI-NEXT:    s_waitcnt vmcnt(0)
5678; SI-NEXT:    s_endpgm
5679;
5680; VI-LABEL: array_3xi16:
5681; VI:       ; %bb.0:
5682; VI-NEXT:    s_add_u32 s0, s4, 38
5683; VI-NEXT:    s_addc_u32 s1, s5, 0
5684; VI-NEXT:    s_add_u32 s2, s0, 2
5685; VI-NEXT:    s_addc_u32 s3, s1, 0
5686; VI-NEXT:    v_mov_b32_e32 v0, s0
5687; VI-NEXT:    v_mov_b32_e32 v1, s1
5688; VI-NEXT:    s_add_u32 s0, s4, 42
5689; VI-NEXT:    s_addc_u32 s1, s5, 0
5690; VI-NEXT:    v_mov_b32_e32 v3, s1
5691; VI-NEXT:    v_mov_b32_e32 v2, s0
5692; VI-NEXT:    flat_load_ushort v4, v[0:1]
5693; VI-NEXT:    flat_load_ushort v2, v[2:3]
5694; VI-NEXT:    v_mov_b32_e32 v0, s2
5695; VI-NEXT:    v_mov_b32_e32 v1, s3
5696; VI-NEXT:    flat_load_ushort v0, v[0:1]
5697; VI-NEXT:    s_load_dword s0, s[4:5], 0x24
5698; VI-NEXT:    s_waitcnt lgkmcnt(0)
5699; VI-NEXT:    v_mov_b32_e32 v1, s0
5700; VI-NEXT:    s_waitcnt vmcnt(0)
5701; VI-NEXT:    flat_store_byte v[0:1], v1
5702; VI-NEXT:    s_waitcnt vmcnt(0)
5703; VI-NEXT:    flat_store_short v[0:1], v2
5704; VI-NEXT:    s_waitcnt vmcnt(0)
5705; VI-NEXT:    flat_store_short v[0:1], v4
5706; VI-NEXT:    s_waitcnt vmcnt(0)
5707; VI-NEXT:    flat_store_short v[0:1], v0
5708; VI-NEXT:    s_waitcnt vmcnt(0)
5709; VI-NEXT:    s_endpgm
5710;
5711; GFX9-LABEL: array_3xi16:
5712; GFX9:       ; %bb.0:
5713; GFX9-NEXT:    v_mov_b32_e32 v0, 0
5714; GFX9-NEXT:    global_load_ushort v1, v0, s[8:9] offset:6
5715; GFX9-NEXT:    global_load_ushort v2, v0, s[8:9] offset:4
5716; GFX9-NEXT:    global_load_ushort v3, v0, s[8:9] offset:2
5717; GFX9-NEXT:    s_load_dword s0, s[8:9], 0x0
5718; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
5719; GFX9-NEXT:    v_mov_b32_e32 v0, s0
5720; GFX9-NEXT:    s_waitcnt vmcnt(2)
5721; GFX9-NEXT:    global_store_byte v[0:1], v0, off
5722; GFX9-NEXT:    s_waitcnt vmcnt(0)
5723; GFX9-NEXT:    global_store_short v[0:1], v1, off
5724; GFX9-NEXT:    s_waitcnt vmcnt(0)
5725; GFX9-NEXT:    global_store_short v[0:1], v2, off
5726; GFX9-NEXT:    s_waitcnt vmcnt(0)
5727; GFX9-NEXT:    global_store_short v[0:1], v3, off
5728; GFX9-NEXT:    s_waitcnt vmcnt(0)
5729; GFX9-NEXT:    s_endpgm
5730;
5731; EG-LABEL: array_3xi16:
5732; EG:       ; %bb.0:
5733; EG-NEXT:    ALU 0, @20, KC0[], KC1[]
5734; EG-NEXT:    TEX 1 @12
5735; EG-NEXT:    ALU 11, @21, KC0[], KC1[]
5736; EG-NEXT:    MEM_RAT MSKOR T1.XW, T3.X
5737; EG-NEXT:    MEM_RAT MSKOR T2.XW, T3.X
5738; EG-NEXT:    TEX 0 @16
5739; EG-NEXT:    ALU 3, @33, KC0[], KC1[]
5740; EG-NEXT:    MEM_RAT MSKOR T2.XW, T3.X
5741; EG-NEXT:    TEX 0 @18
5742; EG-NEXT:    ALU 3, @37, KC0[], KC1[]
5743; EG-NEXT:    MEM_RAT MSKOR T2.XW, T3.X
5744; EG-NEXT:    CF_END
5745; EG-NEXT:    Fetch clause starting at 12:
5746; EG-NEXT:     VTX_READ_8 T1.X, T0.X, 36, #3
5747; EG-NEXT:     VTX_READ_16 T2.X, T0.X, 42, #3
5748; EG-NEXT:    Fetch clause starting at 16:
5749; EG-NEXT:     VTX_READ_16 T1.X, T0.X, 40, #3
5750; EG-NEXT:    Fetch clause starting at 18:
5751; EG-NEXT:     VTX_READ_16 T0.X, T0.X, 38, #3
5752; EG-NEXT:    ALU clause starting at 20:
5753; EG-NEXT:     MOV * T0.X, 0.0,
5754; EG-NEXT:    ALU clause starting at 21:
5755; EG-NEXT:     AND_INT T1.X, T1.X, literal.x,
5756; EG-NEXT:     MOV * T1.W, literal.x,
5757; EG-NEXT:    255(3.573311e-43), 0(0.000000e+00)
5758; EG-NEXT:     MOV * T1.Y, 0.0,
5759; EG-NEXT:     AND_INT T2.X, T2.X, literal.x,
5760; EG-NEXT:     MOV * T2.W, literal.x,
5761; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
5762; EG-NEXT:     MOV T2.Y, 0.0,
5763; EG-NEXT:     MOV T1.Z, 0.0,
5764; EG-NEXT:     MOV * T2.Z, 0.0,
5765; EG-NEXT:     MOV * T3.X, literal.x,
5766; EG-NEXT:    0(0.000000e+00), 0(0.000000e+00)
5767; EG-NEXT:    ALU clause starting at 33:
5768; EG-NEXT:     AND_INT T2.X, T1.X, literal.x,
5769; EG-NEXT:     MOV T2.Y, 0.0,
5770; EG-NEXT:     MOV * T2.Z, 0.0,
5771; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
5772; EG-NEXT:    ALU clause starting at 37:
5773; EG-NEXT:     AND_INT T2.X, T0.X, literal.x,
5774; EG-NEXT:     MOV T2.Y, 0.0,
5775; EG-NEXT:     MOV * T2.Z, 0.0,
5776; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
5777;
5778; CM-LABEL: array_3xi16:
5779; CM:       ; %bb.0:
5780; CM-NEXT:    ALU 0, @20, KC0[], KC1[]
5781; CM-NEXT:    TEX 1 @12
5782; CM-NEXT:    ALU 11, @21, KC0[], KC1[]
5783; CM-NEXT:    MEM_RAT MSKOR T1.XW, T3.X
5784; CM-NEXT:    MEM_RAT MSKOR T2.XW, T3.X
5785; CM-NEXT:    TEX 0 @16
5786; CM-NEXT:    ALU 3, @33, KC0[], KC1[]
5787; CM-NEXT:    MEM_RAT MSKOR T2.XW, T3.X
5788; CM-NEXT:    TEX 0 @18
5789; CM-NEXT:    ALU 3, @37, KC0[], KC1[]
5790; CM-NEXT:    MEM_RAT MSKOR T2.XW, T3.X
5791; CM-NEXT:    CF_END
5792; CM-NEXT:    Fetch clause starting at 12:
5793; CM-NEXT:     VTX_READ_8 T1.X, T0.X, 36, #3
5794; CM-NEXT:     VTX_READ_16 T2.X, T0.X, 42, #3
5795; CM-NEXT:    Fetch clause starting at 16:
5796; CM-NEXT:     VTX_READ_16 T1.X, T0.X, 40, #3
5797; CM-NEXT:    Fetch clause starting at 18:
5798; CM-NEXT:     VTX_READ_16 T0.X, T0.X, 38, #3
5799; CM-NEXT:    ALU clause starting at 20:
5800; CM-NEXT:     MOV * T0.X, 0.0,
5801; CM-NEXT:    ALU clause starting at 21:
5802; CM-NEXT:     AND_INT T1.X, T1.X, literal.x,
5803; CM-NEXT:     MOV * T1.W, literal.x,
5804; CM-NEXT:    255(3.573311e-43), 0(0.000000e+00)
5805; CM-NEXT:     MOV * T1.Y, 0.0,
5806; CM-NEXT:     AND_INT T2.X, T2.X, literal.x,
5807; CM-NEXT:     MOV * T2.W, literal.x,
5808; CM-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
5809; CM-NEXT:     MOV T2.Y, 0.0,
5810; CM-NEXT:     MOV * T1.Z, 0.0,
5811; CM-NEXT:     MOV * T2.Z, 0.0,
5812; CM-NEXT:     MOV * T3.X, literal.x,
5813; CM-NEXT:    0(0.000000e+00), 0(0.000000e+00)
5814; CM-NEXT:    ALU clause starting at 33:
5815; CM-NEXT:     AND_INT T2.X, T1.X, literal.x,
5816; CM-NEXT:     MOV T2.Y, 0.0,
5817; CM-NEXT:     MOV * T2.Z, 0.0,
5818; CM-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
5819; CM-NEXT:    ALU clause starting at 37:
5820; CM-NEXT:     AND_INT T2.X, T0.X, literal.x,
5821; CM-NEXT:     MOV T2.Y, 0.0,
5822; CM-NEXT:     MOV * T2.Z, 0.0,
5823; CM-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
5824  store volatile i8 %arg0, ptr addrspace(1) undef
5825  store volatile [3 x i16] %arg1, ptr addrspace(1) undef
5826  ret void
5827}
5828
; Kernel with an unnamed leading i8 followed by a [1 x i8] array; element 0 of
; the array is extracted and written with a volatile store through an undef
; addrspace(1) pointer.
; Per the expected output below, the element is read with a single
; unsigned-byte load at byte offset 37 (default amdgcn, tonga and r600 runs) or
; 1 (gfx900 run).
; NOTE(review): the test name suggests this guards against the argument's byte
; offset being rounded down to an aligned boundary -- confirm.
5829define amdgpu_kernel void @small_array_round_down_offset(i8, [1 x i8] %arg) {
5830; SI-LABEL: small_array_round_down_offset:
5831; SI:       ; %bb.0:
5832; SI-NEXT:    s_mov_b32 s7, 0xf000
5833; SI-NEXT:    s_mov_b32 s6, -1
5834; SI-NEXT:    buffer_load_ubyte v0, off, s[4:7], 0 offset:37
5835; SI-NEXT:    s_waitcnt vmcnt(0)
5836; SI-NEXT:    buffer_store_byte v0, off, s[4:7], 0
5837; SI-NEXT:    s_waitcnt vmcnt(0)
5838; SI-NEXT:    s_endpgm
5839;
5840; VI-LABEL: small_array_round_down_offset:
5841; VI:       ; %bb.0:
5842; VI-NEXT:    s_add_u32 s0, s4, 37
5843; VI-NEXT:    s_addc_u32 s1, s5, 0
5844; VI-NEXT:    v_mov_b32_e32 v0, s0
5845; VI-NEXT:    v_mov_b32_e32 v1, s1
5846; VI-NEXT:    flat_load_ubyte v0, v[0:1]
5847; VI-NEXT:    s_waitcnt vmcnt(0)
5848; VI-NEXT:    flat_store_byte v[0:1], v0
5849; VI-NEXT:    s_waitcnt vmcnt(0)
5850; VI-NEXT:    s_endpgm
5851;
5852; GFX9-LABEL: small_array_round_down_offset:
5853; GFX9:       ; %bb.0:
5854; GFX9-NEXT:    v_mov_b32_e32 v0, 0
5855; GFX9-NEXT:    global_load_ubyte v0, v0, s[8:9] offset:1
5856; GFX9-NEXT:    s_waitcnt vmcnt(0)
5857; GFX9-NEXT:    global_store_byte v[0:1], v0, off
5858; GFX9-NEXT:    s_waitcnt vmcnt(0)
5859; GFX9-NEXT:    s_endpgm
5860;
5861; EGCM-LABEL: small_array_round_down_offset:
5862; EGCM:       ; %bb.0:
5863; EGCM-NEXT:    ALU 0, @8, KC0[], KC1[]
5864; EGCM-NEXT:    TEX 0 @6
5865; EGCM-NEXT:    ALU 6, @9, KC0[], KC1[]
5866; EGCM-NEXT:    MEM_RAT MSKOR T0.XW, T1.X
5867; EGCM-NEXT:    CF_END
5868; EGCM-NEXT:    PAD
5869; EGCM-NEXT:    Fetch clause starting at 6:
5870; EGCM-NEXT:     VTX_READ_8 T0.X, T0.X, 37, #3
5871; EGCM-NEXT:    ALU clause starting at 8:
5872; EGCM-NEXT:     MOV * T0.X, 0.0,
5873; EGCM-NEXT:    ALU clause starting at 9:
5874; EGCM-NEXT:     AND_INT T0.X, T0.X, literal.x,
5875; EGCM-NEXT:     MOV * T0.W, literal.x,
5876; EGCM-NEXT:    255(3.573311e-43), 0(0.000000e+00)
5877; EGCM-NEXT:     MOV T0.Y, 0.0,
5878; EGCM-NEXT:     MOV * T0.Z, 0.0,
5879; EGCM-NEXT:     MOV * T1.X, literal.x,
5880; EGCM-NEXT:    0(0.000000e+00), 0(0.000000e+00)
5881  %val = extractvalue [1 x i8] %arg, 0
5882  store volatile i8 %val, ptr addrspace(1) undef
5883  ret void
5884}
5885
; Kernel whose second parameter is a byref(i32) pointer in addrspace(4) with
; align(256), followed by a plain i32. The i32 behind %in.byref is loaded and
; written to %out with a volatile store, then %after.offset is written to the
; same address.
; Per the expected output below, the amdgcn runs fetch the byref value and
; %after.offset together with one s_load_dwordx2 at offset 0x49 (default
; amdgcn), 0x124 (tonga) or 0x100 (gfx900). The r600 runs reference KC0[18].Y
; and KC0[18].Z.
5886define amdgpu_kernel void @byref_align_constant_i32_arg(ptr addrspace(1) nocapture %out, ptr addrspace(4) byref(i32) align(256) %in.byref, i32 %after.offset) {
5887; SI-LABEL: byref_align_constant_i32_arg:
5888; SI:       ; %bb.0:
5889; SI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x49
5890; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
5891; SI-NEXT:    s_mov_b32 s3, 0xf000
5892; SI-NEXT:    s_mov_b32 s2, -1
5893; SI-NEXT:    s_waitcnt lgkmcnt(0)
5894; SI-NEXT:    v_mov_b32_e32 v0, s6
5895; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
5896; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
5897; SI-NEXT:    v_mov_b32_e32 v0, s7
5898; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
5899; SI-NEXT:    s_waitcnt vmcnt(0)
5900; SI-NEXT:    s_endpgm
5901;
5902; VI-LABEL: byref_align_constant_i32_arg:
5903; VI:       ; %bb.0:
5904; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
5905; VI-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x124
5906; VI-NEXT:    s_waitcnt lgkmcnt(0)
5907; VI-NEXT:    v_mov_b32_e32 v0, s0
5908; VI-NEXT:    v_mov_b32_e32 v1, s1
5909; VI-NEXT:    v_mov_b32_e32 v2, s2
5910; VI-NEXT:    v_mov_b32_e32 v3, s3
5911; VI-NEXT:    flat_store_dword v[0:1], v2
5912; VI-NEXT:    s_waitcnt vmcnt(0)
5913; VI-NEXT:    flat_store_dword v[0:1], v3
5914; VI-NEXT:    s_waitcnt vmcnt(0)
5915; VI-NEXT:    s_endpgm
5916;
5917; GFX9-LABEL: byref_align_constant_i32_arg:
5918; GFX9:       ; %bb.0:
5919; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x100
5920; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[8:9], 0x0
5921; GFX9-NEXT:    v_mov_b32_e32 v0, 0
5922; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
5923; GFX9-NEXT:    v_mov_b32_e32 v1, s0
5924; GFX9-NEXT:    v_mov_b32_e32 v2, s1
5925; GFX9-NEXT:    global_store_dword v0, v1, s[2:3]
5926; GFX9-NEXT:    s_waitcnt vmcnt(0)
5927; GFX9-NEXT:    global_store_dword v0, v2, s[2:3]
5928; GFX9-NEXT:    s_waitcnt vmcnt(0)
5929; GFX9-NEXT:    s_endpgm
5930;
5931; EG-LABEL: byref_align_constant_i32_arg:
5932; EG:       ; %bb.0:
5933; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
5934; EG-NEXT:    TEX 0 @6
5935; EG-NEXT:    ALU 2, @9, KC0[CB0:0-32], KC1[]
5936; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T2.X, 0
5937; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.X, T2.X, 1
5938; EG-NEXT:    CF_END
5939; EG-NEXT:    Fetch clause starting at 6:
5940; EG-NEXT:     VTX_READ_32 T0.X, T0.X, 0, #1
5941; EG-NEXT:    ALU clause starting at 8:
5942; EG-NEXT:     MOV * T0.X, KC0[18].Y,
5943; EG-NEXT:    ALU clause starting at 9:
5944; EG-NEXT:     MOV T1.X, KC0[18].Z,
5945; EG-NEXT:     LSHR * T2.X, KC0[2].Y, literal.x,
5946; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
5947;
5948; CM-LABEL: byref_align_constant_i32_arg:
5949; CM:       ; %bb.0:
5950; CM-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
5951; CM-NEXT:    TEX 0 @6
5952; CM-NEXT:    ALU 2, @9, KC0[CB0:0-32], KC1[]
5953; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T0.X, T2.X
5954; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T1.X, T2.X
5955; CM-NEXT:    CF_END
5956; CM-NEXT:    Fetch clause starting at 6:
5957; CM-NEXT:     VTX_READ_32 T0.X, T0.X, 0, #1
5958; CM-NEXT:    ALU clause starting at 8:
5959; CM-NEXT:     MOV * T0.X, KC0[18].Y,
5960; CM-NEXT:    ALU clause starting at 9:
5961; CM-NEXT:     MOV * T1.X, KC0[18].Z,
5962; CM-NEXT:     LSHR * T2.X, KC0[2].Y, literal.x,
5963; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
5964  %in = load i32, ptr addrspace(4) %in.byref
5965  store volatile i32 %in, ptr addrspace(1) %out, align 4
5966  store volatile i32 %after.offset, ptr addrspace(1) %out, align 4
5967  ret void
5968}
5969
; Kernel with %out, an unnamed i8, a byref(<16 x i32>) pointer in addrspace(4)
; with align(64), and a trailing i32. The <16 x i32> behind %in.byref is loaded
; and written to %out with a volatile store, then %after.offset is written to
; %out as well.
; Per the expected output below, the amdgcn runs fetch the vector with one
; s_load_dwordx16 (offset 0x19 default amdgcn, 0x64 tonga, 0x40 gfx900) and
; %after.offset with an s_load_dword (0x29, 0xa4 and 0x80 respectively). They
; then emit four dwordx4 stores at %out+48, +32, +16 and +0, followed by the
; dword store of %after.offset.
5970define amdgpu_kernel void @byref_natural_align_constant_v16i32_arg(ptr addrspace(1) nocapture %out, i8, ptr addrspace(4) byref(<16 x i32>) align(64) %in.byref, i32 %after.offset) {
5971; SI-LABEL: byref_natural_align_constant_v16i32_arg:
5972; SI:       ; %bb.0:
5973; SI-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0x19
5974; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
5975; SI-NEXT:    s_load_dword s4, s[4:5], 0x29
5976; SI-NEXT:    s_mov_b32 s3, 0xf000
5977; SI-NEXT:    s_mov_b32 s2, -1
5978; SI-NEXT:    s_waitcnt lgkmcnt(0)
5979; SI-NEXT:    v_mov_b32_e32 v0, s20
5980; SI-NEXT:    v_mov_b32_e32 v1, s21
5981; SI-NEXT:    v_mov_b32_e32 v2, s22
5982; SI-NEXT:    v_mov_b32_e32 v3, s23
5983; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
5984; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
5985; SI-NEXT:    v_mov_b32_e32 v0, s16
5986; SI-NEXT:    v_mov_b32_e32 v1, s17
5987; SI-NEXT:    v_mov_b32_e32 v2, s18
5988; SI-NEXT:    v_mov_b32_e32 v3, s19
5989; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32
5990; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
5991; SI-NEXT:    v_mov_b32_e32 v0, s12
5992; SI-NEXT:    v_mov_b32_e32 v1, s13
5993; SI-NEXT:    v_mov_b32_e32 v2, s14
5994; SI-NEXT:    v_mov_b32_e32 v3, s15
5995; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
5996; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
5997; SI-NEXT:    v_mov_b32_e32 v0, s8
5998; SI-NEXT:    v_mov_b32_e32 v1, s9
5999; SI-NEXT:    v_mov_b32_e32 v2, s10
6000; SI-NEXT:    v_mov_b32_e32 v3, s11
6001; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
6002; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
6003; SI-NEXT:    v_mov_b32_e32 v0, s4
6004; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
6005; SI-NEXT:    s_waitcnt vmcnt(0)
6006; SI-NEXT:    s_endpgm
6007;
6008; VI-LABEL: byref_natural_align_constant_v16i32_arg:
6009; VI:       ; %bb.0:
6010; VI-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0x64
6011; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
6012; VI-NEXT:    s_load_dword s4, s[4:5], 0xa4
6013; VI-NEXT:    s_waitcnt lgkmcnt(0)
6014; VI-NEXT:    v_mov_b32_e32 v0, s20
6015; VI-NEXT:    s_add_u32 s2, s0, 48
6016; VI-NEXT:    s_addc_u32 s3, s1, 0
6017; VI-NEXT:    v_mov_b32_e32 v5, s3
6018; VI-NEXT:    v_mov_b32_e32 v4, s2
6019; VI-NEXT:    s_add_u32 s2, s0, 32
6020; VI-NEXT:    v_mov_b32_e32 v1, s21
6021; VI-NEXT:    v_mov_b32_e32 v2, s22
6022; VI-NEXT:    v_mov_b32_e32 v3, s23
6023; VI-NEXT:    s_addc_u32 s3, s1, 0
6024; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
6025; VI-NEXT:    s_waitcnt vmcnt(0)
6026; VI-NEXT:    v_mov_b32_e32 v5, s3
6027; VI-NEXT:    v_mov_b32_e32 v4, s2
6028; VI-NEXT:    s_add_u32 s2, s0, 16
6029; VI-NEXT:    v_mov_b32_e32 v0, s16
6030; VI-NEXT:    v_mov_b32_e32 v1, s17
6031; VI-NEXT:    v_mov_b32_e32 v2, s18
6032; VI-NEXT:    v_mov_b32_e32 v3, s19
6033; VI-NEXT:    s_addc_u32 s3, s1, 0
6034; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
6035; VI-NEXT:    s_waitcnt vmcnt(0)
6036; VI-NEXT:    v_mov_b32_e32 v5, s3
6037; VI-NEXT:    v_mov_b32_e32 v0, s12
6038; VI-NEXT:    v_mov_b32_e32 v1, s13
6039; VI-NEXT:    v_mov_b32_e32 v2, s14
6040; VI-NEXT:    v_mov_b32_e32 v3, s15
6041; VI-NEXT:    v_mov_b32_e32 v4, s2
6042; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
6043; VI-NEXT:    s_waitcnt vmcnt(0)
6044; VI-NEXT:    v_mov_b32_e32 v5, s1
6045; VI-NEXT:    v_mov_b32_e32 v0, s8
6046; VI-NEXT:    v_mov_b32_e32 v1, s9
6047; VI-NEXT:    v_mov_b32_e32 v2, s10
6048; VI-NEXT:    v_mov_b32_e32 v3, s11
6049; VI-NEXT:    v_mov_b32_e32 v4, s0
6050; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
6051; VI-NEXT:    s_waitcnt vmcnt(0)
6052; VI-NEXT:    v_mov_b32_e32 v0, s4
6053; VI-NEXT:    flat_store_dword v[4:5], v0
6054; VI-NEXT:    s_waitcnt vmcnt(0)
6055; VI-NEXT:    s_endpgm
6056;
6057; GFX9-LABEL: byref_natural_align_constant_v16i32_arg:
6058; GFX9:       ; %bb.0:
6059; GFX9-NEXT:    s_load_dwordx16 s[12:27], s[8:9], 0x40
6060; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
6061; GFX9-NEXT:    s_load_dword s2, s[8:9], 0x80
6062; GFX9-NEXT:    v_mov_b32_e32 v4, 0
6063; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
6064; GFX9-NEXT:    v_mov_b32_e32 v0, s24
6065; GFX9-NEXT:    v_mov_b32_e32 v1, s25
6066; GFX9-NEXT:    v_mov_b32_e32 v2, s26
6067; GFX9-NEXT:    v_mov_b32_e32 v3, s27
6068; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1] offset:48
6069; GFX9-NEXT:    s_waitcnt vmcnt(0)
6070; GFX9-NEXT:    v_mov_b32_e32 v0, s20
6071; GFX9-NEXT:    v_mov_b32_e32 v1, s21
6072; GFX9-NEXT:    v_mov_b32_e32 v2, s22
6073; GFX9-NEXT:    v_mov_b32_e32 v3, s23
6074; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1] offset:32
6075; GFX9-NEXT:    s_waitcnt vmcnt(0)
6076; GFX9-NEXT:    v_mov_b32_e32 v0, s16
6077; GFX9-NEXT:    v_mov_b32_e32 v1, s17
6078; GFX9-NEXT:    v_mov_b32_e32 v2, s18
6079; GFX9-NEXT:    v_mov_b32_e32 v3, s19
6080; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1] offset:16
6081; GFX9-NEXT:    s_waitcnt vmcnt(0)
6082; GFX9-NEXT:    v_mov_b32_e32 v0, s12
6083; GFX9-NEXT:    v_mov_b32_e32 v1, s13
6084; GFX9-NEXT:    v_mov_b32_e32 v2, s14
6085; GFX9-NEXT:    v_mov_b32_e32 v3, s15
6086; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
6087; GFX9-NEXT:    s_waitcnt vmcnt(0)
6088; GFX9-NEXT:    v_mov_b32_e32 v0, s2
6089; GFX9-NEXT:    global_store_dword v4, v0, s[0:1]
6090; GFX9-NEXT:    s_waitcnt vmcnt(0)
6091; GFX9-NEXT:    s_endpgm
6092;
6093; EG-LABEL: byref_natural_align_constant_v16i32_arg:
6094; EG:       ; %bb.0:
6095; EG-NEXT:    ALU 0, @24, KC0[CB0:0-32], KC1[]
6096; EG-NEXT:    TEX 0 @16
6097; EG-NEXT:    ALU 3, @25, KC0[CB0:0-32], KC1[]
6098; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T2.X, 0
6099; EG-NEXT:    ALU 3, @29, KC0[CB0:0-32], KC1[]
6100; EG-NEXT:    TEX 0 @18
6101; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T2.XYZW, T1.X, 0
6102; EG-NEXT:    ALU 3, @33, KC0[CB0:0-32], KC1[]
6103; EG-NEXT:    TEX 0 @20
6104; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T2.XYZW, T1.X, 0
6105; EG-NEXT:    ALU 2, @37, KC0[CB0:0-32], KC1[]
6106; EG-NEXT:    TEX 0 @22
6107; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T2.X, 0
6108; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.X, T2.X, 1
6109; EG-NEXT:    CF_END
6110; EG-NEXT:    PAD
6111; EG-NEXT:    Fetch clause starting at 16:
6112; EG-NEXT:     VTX_READ_128 T1.XYZW, T0.X, 48, #1
6113; EG-NEXT:    Fetch clause starting at 18:
6114; EG-NEXT:     VTX_READ_128 T2.XYZW, T0.X, 32, #1
6115; EG-NEXT:    Fetch clause starting at 20:
6116; EG-NEXT:     VTX_READ_128 T2.XYZW, T0.X, 16, #1
6117; EG-NEXT:    Fetch clause starting at 22:
6118; EG-NEXT:     VTX_READ_128 T0.XYZW, T0.X, 0, #1
6119; EG-NEXT:    ALU clause starting at 24:
6120; EG-NEXT:     MOV * T0.X, KC0[6].Y,
6121; EG-NEXT:    ALU clause starting at 25:
6122; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
6123; EG-NEXT:    48(6.726233e-44), 0(0.000000e+00)
6124; EG-NEXT:     LSHR * T2.X, PV.W, literal.x,
6125; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
6126; EG-NEXT:    ALU clause starting at 29:
6127; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
6128; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
6129; EG-NEXT:     LSHR * T1.X, PV.W, literal.x,
6130; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
6131; EG-NEXT:    ALU clause starting at 33:
6132; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
6133; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
6134; EG-NEXT:     LSHR * T1.X, PV.W, literal.x,
6135; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
6136; EG-NEXT:    ALU clause starting at 37:
6137; EG-NEXT:     MOV T1.X, KC0[10].Y,
6138; EG-NEXT:     LSHR * T2.X, KC0[2].Y, literal.x,
6139; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
6140;
6141; CM-LABEL: byref_natural_align_constant_v16i32_arg:
6142; CM:       ; %bb.0:
6143; CM-NEXT:    ALU 0, @24, KC0[CB0:0-32], KC1[]
6144; CM-NEXT:    TEX 0 @16
6145; CM-NEXT:    ALU 3, @25, KC0[CB0:0-32], KC1[]
6146; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T1, T2.X
6147; CM-NEXT:    ALU 3, @29, KC0[CB0:0-32], KC1[]
6148; CM-NEXT:    TEX 0 @18
6149; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T2, T1.X
6150; CM-NEXT:    ALU 3, @33, KC0[CB0:0-32], KC1[]
6151; CM-NEXT:    TEX 0 @20
6152; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T2, T1.X
6153; CM-NEXT:    ALU 2, @37, KC0[CB0:0-32], KC1[]
6154; CM-NEXT:    TEX 0 @22
6155; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T0, T2.X
6156; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T1.X, T2.X
6157; CM-NEXT:    CF_END
6158; CM-NEXT:    PAD
6159; CM-NEXT:    Fetch clause starting at 16:
6160; CM-NEXT:     VTX_READ_128 T1.XYZW, T0.X, 48, #1
6161; CM-NEXT:    Fetch clause starting at 18:
6162; CM-NEXT:     VTX_READ_128 T2.XYZW, T0.X, 32, #1
6163; CM-NEXT:    Fetch clause starting at 20:
6164; CM-NEXT:     VTX_READ_128 T2.XYZW, T0.X, 16, #1
6165; CM-NEXT:    Fetch clause starting at 22:
6166; CM-NEXT:     VTX_READ_128 T0.XYZW, T0.X, 0, #1
6167; CM-NEXT:    ALU clause starting at 24:
6168; CM-NEXT:     MOV * T0.X, KC0[6].Y,
6169; CM-NEXT:    ALU clause starting at 25:
6170; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
6171; CM-NEXT:    48(6.726233e-44), 0(0.000000e+00)
6172; CM-NEXT:     LSHR * T2.X, PV.W, literal.x,
6173; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
6174; CM-NEXT:    ALU clause starting at 29:
6175; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
6176; CM-NEXT:    32(4.484155e-44), 0(0.000000e+00)
6177; CM-NEXT:     LSHR * T1.X, PV.W, literal.x,
6178; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
6179; CM-NEXT:    ALU clause starting at 33:
6180; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
6181; CM-NEXT:    16(2.242078e-44), 0(0.000000e+00)
6182; CM-NEXT:     LSHR * T1.X, PV.W, literal.x,
6183; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
6184; CM-NEXT:    ALU clause starting at 37:
6185; CM-NEXT:     MOV * T1.X, KC0[10].Y,
6186; CM-NEXT:     LSHR * T2.X, KC0[2].Y, literal.x,
6187; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
6188  %in = load <16 x i32>, ptr addrspace(4) %in.byref
6189  store volatile <16 x i32> %in, ptr addrspace(1) %out, align 4
6190  store volatile i32 %after.offset, ptr addrspace(1) %out, align 4
6191  ret void
6192}
6193