xref: /llvm-project/llvm/test/CodeGen/AMDGPU/calling-conventions.ll (revision 6206f5444fc0732e6495703c75a67f1f90f5b418)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
2; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI %s
3; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s
4; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX11 %s
5
6; Make sure we don't crash or assert on spir_kernel calling convention.
7
; Smoke test for the spir_kernel calling convention: the kernel stores i32 0
; through its addrspace(1) pointer argument %out. The SI, VI and GFX11 check
; blocks below pin the llc output expected from each of the three llc
; invocations at the top of the file.
8define spir_kernel void @kernel(ptr addrspace(1) %out) {
9; SI-LABEL: kernel:
10; SI:       ; %bb.0: ; %entry
11; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
12; SI-NEXT:    s_mov_b32 s3, 0xf000
13; SI-NEXT:    s_mov_b32 s2, -1
14; SI-NEXT:    v_mov_b32_e32 v0, 0
15; SI-NEXT:    s_waitcnt lgkmcnt(0)
16; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
17; SI-NEXT:    s_endpgm
18;
19; VI-LABEL: kernel:
20; VI:       ; %bb.0: ; %entry
21; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
22; VI-NEXT:    v_mov_b32_e32 v2, 0
23; VI-NEXT:    s_waitcnt lgkmcnt(0)
24; VI-NEXT:    v_mov_b32_e32 v0, s0
25; VI-NEXT:    v_mov_b32_e32 v1, s1
26; VI-NEXT:    flat_store_dword v[0:1], v2
27; VI-NEXT:    s_endpgm
28;
29; GFX11-LABEL: kernel:
30; GFX11:       ; %bb.0: ; %entry
31; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
32; GFX11-NEXT:    v_mov_b32_e32 v0, 0
33; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
34; GFX11-NEXT:    global_store_b32 v0, v0, s[0:1]
35; GFX11-NEXT:    s_endpgm
; IR under test: one store followed by a void return. The block name "entry"
; also appears in the expected "%bb.0" comment lines above.
36entry:
37  store i32 0, ptr addrspace(1) %out
38  ret void
39}
40
41; FIXME: This is treated like a kernel
42; XGCN-LABEL: {{^}}func:
43; XGCN: s_endpgm
44; define spir_func void @func(ptr addrspace(1) %out) {
45; entry:
46;   store i32 0, ptr addrspace(1) %out
47;   ret void
48; }
49
; amdgpu_ps function that takes a half in %arg0 and returns %arg0 + 1.0.
; The SI checks expect a v_cvt_f16_f32 / v_cvt_f32_f16 pair followed by an f32
; add, while the VI and GFX11 checks expect a single v_add_f16.
50define amdgpu_ps half @ps_ret_cc_f16(half %arg0) {
51; SI-LABEL: ps_ret_cc_f16:
52; SI:       ; %bb.0:
53; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
54; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
55; SI-NEXT:    v_add_f32_e32 v0, 1.0, v0
56; SI-NEXT:    ; return to shader part epilog
57;
58; VI-LABEL: ps_ret_cc_f16:
59; VI:       ; %bb.0:
60; VI-NEXT:    v_add_f16_e32 v0, 1.0, v0
61; VI-NEXT:    ; return to shader part epilog
62;
63; GFX11-LABEL: ps_ret_cc_f16:
64; GFX11:       ; %bb.0:
65; GFX11-NEXT:    v_add_f16_e32 v0, 1.0, v0
66; GFX11-NEXT:    ; return to shader part epilog
; IR under test: half add of the argument and the constant 1.0.
67  %add = fadd half %arg0, 1.0
68  ret half %add
69}
70
; amdgpu_ps function whose half argument is marked inreg; returns %arg0 + 1.0.
; In every check block the operand is read from s0 and the result is produced
; in v0.
71define amdgpu_ps half @ps_ret_cc_inreg_f16(half inreg %arg0) {
72; SI-LABEL: ps_ret_cc_inreg_f16:
73; SI:       ; %bb.0:
74; SI-NEXT:    v_cvt_f16_f32_e32 v0, s0
75; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
76; SI-NEXT:    v_add_f32_e32 v0, 1.0, v0
77; SI-NEXT:    ; return to shader part epilog
78;
79; VI-LABEL: ps_ret_cc_inreg_f16:
80; VI:       ; %bb.0:
81; VI-NEXT:    v_add_f16_e64 v0, s0, 1.0
82; VI-NEXT:    ; return to shader part epilog
83;
84; GFX11-LABEL: ps_ret_cc_inreg_f16:
85; GFX11:       ; %bb.0:
86; GFX11-NEXT:    v_add_f16_e64 v0, s0, 1.0
87; GFX11-NEXT:    ; return to shader part epilog
; IR under test: half add of the inreg argument and the constant 1.0.
88  %add = fadd half %arg0, 1.0
89  ret half %add
90}
91
; fastcc function returning %arg0 + 4.0. All three llc invocations share the
; GCN check block: a full s_waitcnt, one f32 add on v0, and a return through
; s_setpc_b64 s[30:31]. This function is the callee of @call_fastcc.
92define fastcc float @fastcc(float %arg0) #0 {
93; GCN-LABEL: fastcc:
94; GCN:       ; %bb.0:
95; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
96; GCN-NEXT:    v_add_f32_e32 v0, 4.0, v0
97; GCN-NEXT:    s_setpc_b64 s[30:31]
98  %add = fadd float %arg0, 4.0
99  ret float %add
100}
101
; coldcc function returning %arg0 + 4.0. The shared GCN check block expects the
; same three instructions as for @fastcc: a full s_waitcnt, one f32 add on v0,
; and a return through s_setpc_b64 s[30:31]. This function is the callee of
; @call_coldcc.
102define coldcc float @coldcc(float %arg0) #0 {
103; GCN-LABEL: coldcc:
104; GCN:       ; %bb.0:
105; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
106; GCN-NEXT:    v_add_f32_e32 v0, 4.0, v0
107; GCN-NEXT:    s_setpc_b64 s[30:31]
108 %add = fadd float %arg0, 4.0
109 ret float %add
110}
111
; amdgpu_kernel that calls @coldcc with the constant 1.0 and stores the
; returned float through an undef addrspace(1) pointer.
; In each check block the callee address is formed from s_getpc_b64 plus the
; coldcc@gotpcrel32 lo/hi offsets, loaded into s[16:17], and called with
; s_swappc_b64; the argument 1.0 is placed in v0 before the call and v0 is
; stored afterwards.
112define amdgpu_kernel void @call_coldcc() #0 {
113; SI-LABEL: call_coldcc:
114; SI:       ; %bb.0:
115; SI-NEXT:    s_mov_b32 s32, 0
116; SI-NEXT:    s_mov_b32 s20, SCRATCH_RSRC_DWORD0
117; SI-NEXT:    s_mov_b32 s21, SCRATCH_RSRC_DWORD1
118; SI-NEXT:    s_mov_b32 s22, -1
119; SI-NEXT:    s_mov_b32 s23, 0xe8f000
120; SI-NEXT:    s_add_u32 s20, s20, s11
121; SI-NEXT:    s_addc_u32 s21, s21, 0
122; SI-NEXT:    s_mov_b32 s14, s10
123; SI-NEXT:    s_mov_b32 s13, s9
124; SI-NEXT:    s_mov_b32 s12, s8
125; SI-NEXT:    s_mov_b64 s[10:11], s[6:7]
126; SI-NEXT:    s_add_u32 s8, s4, 36
127; SI-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
128; SI-NEXT:    s_addc_u32 s9, s5, 0
129; SI-NEXT:    s_getpc_b64 s[4:5]
130; SI-NEXT:    s_add_u32 s4, s4, coldcc@gotpcrel32@lo+4
131; SI-NEXT:    s_addc_u32 s5, s5, coldcc@gotpcrel32@hi+12
132; SI-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
133; SI-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
134; SI-NEXT:    v_or_b32_e32 v0, v0, v1
135; SI-NEXT:    v_or_b32_e32 v31, v0, v2
136; SI-NEXT:    v_mov_b32_e32 v0, 1.0
137; SI-NEXT:    s_mov_b64 s[4:5], s[0:1]
138; SI-NEXT:    s_mov_b64 s[6:7], s[2:3]
139; SI-NEXT:    s_mov_b64 s[0:1], s[20:21]
140; SI-NEXT:    s_mov_b64 s[2:3], s[22:23]
141; SI-NEXT:    s_waitcnt lgkmcnt(0)
142; SI-NEXT:    s_swappc_b64 s[30:31], s[16:17]
143; SI-NEXT:    s_mov_b32 s3, 0xf000
144; SI-NEXT:    s_mov_b32 s2, -1
145; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
146; SI-NEXT:    s_endpgm
147;
148; VI-LABEL: call_coldcc:
149; VI:       ; %bb.0:
150; VI-NEXT:    s_mov_b32 s88, SCRATCH_RSRC_DWORD0
151; VI-NEXT:    s_mov_b32 s89, SCRATCH_RSRC_DWORD1
152; VI-NEXT:    s_mov_b32 s90, -1
153; VI-NEXT:    s_mov_b32 s91, 0xe80000
154; VI-NEXT:    s_add_u32 s88, s88, s11
155; VI-NEXT:    s_addc_u32 s89, s89, 0
156; VI-NEXT:    s_mov_b32 s12, s8
157; VI-NEXT:    s_add_u32 s8, s4, 36
158; VI-NEXT:    s_mov_b32 s13, s9
159; VI-NEXT:    s_addc_u32 s9, s5, 0
160; VI-NEXT:    s_getpc_b64 s[4:5]
161; VI-NEXT:    s_add_u32 s4, s4, coldcc@gotpcrel32@lo+4
162; VI-NEXT:    s_addc_u32 s5, s5, coldcc@gotpcrel32@hi+12
163; VI-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
164; VI-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
165; VI-NEXT:    s_mov_b32 s14, s10
166; VI-NEXT:    s_mov_b64 s[10:11], s[6:7]
167; VI-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
168; VI-NEXT:    v_or_b32_e32 v0, v0, v1
169; VI-NEXT:    s_mov_b64 s[4:5], s[0:1]
170; VI-NEXT:    s_mov_b64 s[6:7], s[2:3]
171; VI-NEXT:    s_mov_b64 s[0:1], s[88:89]
172; VI-NEXT:    v_or_b32_e32 v31, v0, v2
173; VI-NEXT:    s_mov_b64 s[2:3], s[90:91]
174; VI-NEXT:    v_mov_b32_e32 v0, 1.0
175; VI-NEXT:    s_mov_b32 s32, 0
176; VI-NEXT:    s_waitcnt lgkmcnt(0)
177; VI-NEXT:    s_swappc_b64 s[30:31], s[16:17]
178; VI-NEXT:    flat_store_dword v[0:1], v0
179; VI-NEXT:    s_endpgm
180;
181; GFX11-LABEL: call_coldcc:
182; GFX11:       ; %bb.0:
183; GFX11-NEXT:    s_add_u32 s8, s4, 36
184; GFX11-NEXT:    s_addc_u32 s9, s5, 0
185; GFX11-NEXT:    s_getpc_b64 s[4:5]
186; GFX11-NEXT:    s_add_u32 s4, s4, coldcc@gotpcrel32@lo+4
187; GFX11-NEXT:    s_addc_u32 s5, s5, coldcc@gotpcrel32@hi+12
188; GFX11-NEXT:    v_dual_mov_b32 v31, v0 :: v_dual_mov_b32 v0, 1.0
189; GFX11-NEXT:    s_load_b64 s[16:17], s[4:5], 0x0
190; GFX11-NEXT:    s_mov_b32 s12, s13
191; GFX11-NEXT:    s_mov_b64 s[10:11], s[6:7]
192; GFX11-NEXT:    s_mov_b64 s[4:5], s[0:1]
193; GFX11-NEXT:    s_mov_b64 s[6:7], s[2:3]
194; GFX11-NEXT:    s_mov_b32 s13, s14
195; GFX11-NEXT:    s_mov_b32 s14, s15
196; GFX11-NEXT:    s_mov_b32 s32, 0
197; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
198; GFX11-NEXT:    s_swappc_b64 s[30:31], s[16:17]
199; GFX11-NEXT:    global_store_b32 v[0:1], v0, off
200; GFX11-NEXT:    s_endpgm
; IR under test: call the coldcc callee, then store its result. The store
; address is undef, so the checks do not pin any address computation.
201  %val = call float @coldcc(float 1.0)
202  store float %val, ptr addrspace(1) undef
203  ret void
204}
205
; amdgpu_kernel that calls @fastcc with the constant 1.0 and stores the
; returned float through an undef addrspace(1) pointer.
; In each check block the callee address is formed from s_getpc_b64 plus the
; fastcc@gotpcrel32 lo/hi offsets, loaded into s[16:17], and called with
; s_swappc_b64; the argument 1.0 is placed in v0 before the call and v0 is
; stored afterwards.
206define amdgpu_kernel void @call_fastcc() #0 {
207; SI-LABEL: call_fastcc:
208; SI:       ; %bb.0:
209; SI-NEXT:    s_mov_b32 s32, 0
210; SI-NEXT:    s_mov_b32 s20, SCRATCH_RSRC_DWORD0
211; SI-NEXT:    s_mov_b32 s21, SCRATCH_RSRC_DWORD1
212; SI-NEXT:    s_mov_b32 s22, -1
213; SI-NEXT:    s_mov_b32 s23, 0xe8f000
214; SI-NEXT:    s_add_u32 s20, s20, s11
215; SI-NEXT:    s_addc_u32 s21, s21, 0
216; SI-NEXT:    s_mov_b32 s14, s10
217; SI-NEXT:    s_mov_b32 s13, s9
218; SI-NEXT:    s_mov_b32 s12, s8
219; SI-NEXT:    s_mov_b64 s[10:11], s[6:7]
220; SI-NEXT:    s_add_u32 s8, s4, 36
221; SI-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
222; SI-NEXT:    s_addc_u32 s9, s5, 0
223; SI-NEXT:    s_getpc_b64 s[4:5]
224; SI-NEXT:    s_add_u32 s4, s4, fastcc@gotpcrel32@lo+4
225; SI-NEXT:    s_addc_u32 s5, s5, fastcc@gotpcrel32@hi+12
226; SI-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
227; SI-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
228; SI-NEXT:    v_or_b32_e32 v0, v0, v1
229; SI-NEXT:    v_or_b32_e32 v31, v0, v2
230; SI-NEXT:    v_mov_b32_e32 v0, 1.0
231; SI-NEXT:    s_mov_b64 s[4:5], s[0:1]
232; SI-NEXT:    s_mov_b64 s[6:7], s[2:3]
233; SI-NEXT:    s_mov_b64 s[0:1], s[20:21]
234; SI-NEXT:    s_mov_b64 s[2:3], s[22:23]
235; SI-NEXT:    s_waitcnt lgkmcnt(0)
236; SI-NEXT:    s_swappc_b64 s[30:31], s[16:17]
237; SI-NEXT:    s_mov_b32 s3, 0xf000
238; SI-NEXT:    s_mov_b32 s2, -1
239; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
240; SI-NEXT:    s_endpgm
241;
242; VI-LABEL: call_fastcc:
243; VI:       ; %bb.0:
244; VI-NEXT:    s_mov_b32 s88, SCRATCH_RSRC_DWORD0
245; VI-NEXT:    s_mov_b32 s89, SCRATCH_RSRC_DWORD1
246; VI-NEXT:    s_mov_b32 s90, -1
247; VI-NEXT:    s_mov_b32 s91, 0xe80000
248; VI-NEXT:    s_add_u32 s88, s88, s11
249; VI-NEXT:    s_addc_u32 s89, s89, 0
250; VI-NEXT:    s_mov_b32 s12, s8
251; VI-NEXT:    s_add_u32 s8, s4, 36
252; VI-NEXT:    s_mov_b32 s13, s9
253; VI-NEXT:    s_addc_u32 s9, s5, 0
254; VI-NEXT:    s_getpc_b64 s[4:5]
255; VI-NEXT:    s_add_u32 s4, s4, fastcc@gotpcrel32@lo+4
256; VI-NEXT:    s_addc_u32 s5, s5, fastcc@gotpcrel32@hi+12
257; VI-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
258; VI-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
259; VI-NEXT:    s_mov_b32 s14, s10
260; VI-NEXT:    s_mov_b64 s[10:11], s[6:7]
261; VI-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
262; VI-NEXT:    v_or_b32_e32 v0, v0, v1
263; VI-NEXT:    s_mov_b64 s[4:5], s[0:1]
264; VI-NEXT:    s_mov_b64 s[6:7], s[2:3]
265; VI-NEXT:    s_mov_b64 s[0:1], s[88:89]
266; VI-NEXT:    v_or_b32_e32 v31, v0, v2
267; VI-NEXT:    s_mov_b64 s[2:3], s[90:91]
268; VI-NEXT:    v_mov_b32_e32 v0, 1.0
269; VI-NEXT:    s_mov_b32 s32, 0
270; VI-NEXT:    s_waitcnt lgkmcnt(0)
271; VI-NEXT:    s_swappc_b64 s[30:31], s[16:17]
272; VI-NEXT:    flat_store_dword v[0:1], v0
273; VI-NEXT:    s_endpgm
274;
275; GFX11-LABEL: call_fastcc:
276; GFX11:       ; %bb.0:
277; GFX11-NEXT:    s_add_u32 s8, s4, 36
278; GFX11-NEXT:    s_addc_u32 s9, s5, 0
279; GFX11-NEXT:    s_getpc_b64 s[4:5]
280; GFX11-NEXT:    s_add_u32 s4, s4, fastcc@gotpcrel32@lo+4
281; GFX11-NEXT:    s_addc_u32 s5, s5, fastcc@gotpcrel32@hi+12
282; GFX11-NEXT:    v_dual_mov_b32 v31, v0 :: v_dual_mov_b32 v0, 1.0
283; GFX11-NEXT:    s_load_b64 s[16:17], s[4:5], 0x0
284; GFX11-NEXT:    s_mov_b32 s12, s13
285; GFX11-NEXT:    s_mov_b64 s[10:11], s[6:7]
286; GFX11-NEXT:    s_mov_b64 s[4:5], s[0:1]
287; GFX11-NEXT:    s_mov_b64 s[6:7], s[2:3]
288; GFX11-NEXT:    s_mov_b32 s13, s14
289; GFX11-NEXT:    s_mov_b32 s14, s15
290; GFX11-NEXT:    s_mov_b32 s32, 0
291; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
292; GFX11-NEXT:    s_swappc_b64 s[30:31], s[16:17]
293; GFX11-NEXT:    global_store_b32 v[0:1], v0, off
294; GFX11-NEXT:    s_endpgm
; IR under test: call the fastcc callee, then store its result. The store
; address is undef, so the checks do not pin any address computation.
295  %val = call float @fastcc(float 1.0)
296  store float %val, ptr addrspace(1) undef
297  ret void
298}
299
300; Mesa compute shader: check for 47176 (COMPUTE_PGM_RSRC1) in .AMDGPU.config
; amdgpu_cs function that takes a half in %arg0 and returns %arg0 + 1.0.
; The assertions below only pin the instruction sequence for each target.
301define amdgpu_cs half @cs_mesa(half %arg0) {
302; SI-LABEL: cs_mesa:
303; SI:       ; %bb.0:
304; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
305; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
306; SI-NEXT:    v_add_f32_e32 v0, 1.0, v0
307; SI-NEXT:    ; return to shader part epilog
308;
309; VI-LABEL: cs_mesa:
310; VI:       ; %bb.0:
311; VI-NEXT:    v_add_f16_e32 v0, 1.0, v0
312; VI-NEXT:    ; return to shader part epilog
313;
314; GFX11-LABEL: cs_mesa:
315; GFX11:       ; %bb.0:
316; GFX11-NEXT:    v_add_f16_e32 v0, 1.0, v0
317; GFX11-NEXT:    ; return to shader part epilog
; IR under test: half add of the argument and the constant 1.0.
318  %add = fadd half %arg0, 1.0
319  ret half %add
320}
321
322; Mesa pixel shader: check for 45096 (SPI_SHADER_PGM_RSRC1_PS) in .AMDGPU.config
; amdgpu_ps function that takes a half in %arg0 and returns %arg0 + 1.0.
; The assertions below only pin the instruction sequence for each target.
323define amdgpu_ps half @ps_mesa_f16(half %arg0) {
324; SI-LABEL: ps_mesa_f16:
325; SI:       ; %bb.0:
326; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
327; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
328; SI-NEXT:    v_add_f32_e32 v0, 1.0, v0
329; SI-NEXT:    ; return to shader part epilog
330;
331; VI-LABEL: ps_mesa_f16:
332; VI:       ; %bb.0:
333; VI-NEXT:    v_add_f16_e32 v0, 1.0, v0
334; VI-NEXT:    ; return to shader part epilog
335;
336; GFX11-LABEL: ps_mesa_f16:
337; GFX11:       ; %bb.0:
338; GFX11-NEXT:    v_add_f16_e32 v0, 1.0, v0
339; GFX11-NEXT:    ; return to shader part epilog
; IR under test: half add of the argument and the constant 1.0.
340  %add = fadd half %arg0, 1.0
341  ret half %add
342}
343
344; Mesa vertex shader: check for 45352 (SPI_SHADER_PGM_RSRC1_VS) in .AMDGPU.config
; amdgpu_vs function that takes a half in %arg0 and returns %arg0 + 1.0.
; The assertions below only pin the instruction sequence for each target.
345define amdgpu_vs half @vs_mesa(half %arg0) {
346; SI-LABEL: vs_mesa:
347; SI:       ; %bb.0:
348; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
349; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
350; SI-NEXT:    v_add_f32_e32 v0, 1.0, v0
351; SI-NEXT:    ; return to shader part epilog
352;
353; VI-LABEL: vs_mesa:
354; VI:       ; %bb.0:
355; VI-NEXT:    v_add_f16_e32 v0, 1.0, v0
356; VI-NEXT:    ; return to shader part epilog
357;
358; GFX11-LABEL: vs_mesa:
359; GFX11:       ; %bb.0:
360; GFX11-NEXT:    v_add_f16_e32 v0, 1.0, v0
361; GFX11-NEXT:    ; return to shader part epilog
; IR under test: half add of the argument and the constant 1.0.
362  %add = fadd half %arg0, 1.0
363  ret half %add
364}
365
366; Mesa geometry shader: check for 45608 (SPI_SHADER_PGM_RSRC1_GS) in .AMDGPU.config
; amdgpu_gs function that takes a half in %arg0 and returns %arg0 + 1.0.
; The assertions below only pin the instruction sequence for each target.
367define amdgpu_gs half @gs_mesa(half %arg0) {
368; SI-LABEL: gs_mesa:
369; SI:       ; %bb.0:
370; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
371; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
372; SI-NEXT:    v_add_f32_e32 v0, 1.0, v0
373; SI-NEXT:    ; return to shader part epilog
374;
375; VI-LABEL: gs_mesa:
376; VI:       ; %bb.0:
377; VI-NEXT:    v_add_f16_e32 v0, 1.0, v0
378; VI-NEXT:    ; return to shader part epilog
379;
380; GFX11-LABEL: gs_mesa:
381; GFX11:       ; %bb.0:
382; GFX11-NEXT:    v_add_f16_e32 v0, 1.0, v0
383; GFX11-NEXT:    ; return to shader part epilog
; IR under test: half add of the argument and the constant 1.0.
384  %add = fadd half %arg0, 1.0
385  ret half %add
386}
387
388; Mesa hull shader: check for 46120 (SPI_SHADER_PGM_RSRC1_HS) in .AMDGPU.config
; amdgpu_hs function that takes a half in %arg0 and returns %arg0 + 1.0.
; The assertions below only pin the instruction sequence for each target.
389define amdgpu_hs half @hs_mesa(half %arg0) {
390; SI-LABEL: hs_mesa:
391; SI:       ; %bb.0:
392; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
393; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
394; SI-NEXT:    v_add_f32_e32 v0, 1.0, v0
395; SI-NEXT:    ; return to shader part epilog
396;
397; VI-LABEL: hs_mesa:
398; VI:       ; %bb.0:
399; VI-NEXT:    v_add_f16_e32 v0, 1.0, v0
400; VI-NEXT:    ; return to shader part epilog
401;
402; GFX11-LABEL: hs_mesa:
403; GFX11:       ; %bb.0:
404; GFX11-NEXT:    v_add_f16_e32 v0, 1.0, v0
405; GFX11-NEXT:    ; return to shader part epilog
; IR under test: half add of the argument and the constant 1.0.
406  %add = fadd half %arg0, 1.0
407  ret half %add
408}
409
410; FIXME: Inconsistent ABI between targets
411
; amdgpu_ps function that adds 1.0 to both lanes of a <2 x half> argument and
; returns the vector.
; The SI checks work on v0 and v1 separately with f32 adds; the VI checks
; combine an SDWA add on the high word with a plain f16 add via v_or_b32; the
; GFX11 checks use a single v_pk_add_f16 on v0.
412define amdgpu_ps <2 x half> @ps_mesa_v2f16(<2 x half> %arg0) {
413; SI-LABEL: ps_mesa_v2f16:
414; SI:       ; %bb.0:
415; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
416; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
417; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
418; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
419; SI-NEXT:    v_add_f32_e32 v0, 1.0, v0
420; SI-NEXT:    v_add_f32_e32 v1, 1.0, v1
421; SI-NEXT:    ; return to shader part epilog
422;
423; VI-LABEL: ps_mesa_v2f16:
424; VI:       ; %bb.0:
425; VI-NEXT:    v_mov_b32_e32 v1, 0x3c00
426; VI-NEXT:    v_add_f16_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
427; VI-NEXT:    v_add_f16_e32 v0, 1.0, v0
428; VI-NEXT:    v_or_b32_e32 v0, v0, v1
429; VI-NEXT:    ; return to shader part epilog
430;
431; GFX11-LABEL: ps_mesa_v2f16:
432; GFX11:       ; %bb.0:
433; GFX11-NEXT:    v_pk_add_f16 v0, v0, 1.0 op_sel_hi:[1,0]
434; GFX11-NEXT:    ; return to shader part epilog
; IR under test: lane-wise half add with a splat of 1.0.
435  %add = fadd <2 x half> %arg0, <half 1.0, half 1.0>
436  ret <2 x half> %add
437}
438
; amdgpu_ps function that adds 1.0 to both lanes of an inreg <2 x half>
; argument and returns the vector.
; The SI checks read the two lanes from s0 and s1; the VI and GFX11 checks read
; only s0 (VI shifts it right by 16 to reach the high lane).
439define amdgpu_ps <2 x half> @ps_mesa_inreg_v2f16(<2 x half> inreg %arg0) {
440; SI-LABEL: ps_mesa_inreg_v2f16:
441; SI:       ; %bb.0:
442; SI-NEXT:    v_cvt_f16_f32_e32 v0, s1
443; SI-NEXT:    v_cvt_f16_f32_e32 v1, s0
444; SI-NEXT:    v_cvt_f32_f16_e32 v2, v0
445; SI-NEXT:    v_cvt_f32_f16_e32 v0, v1
446; SI-NEXT:    v_add_f32_e32 v0, 1.0, v0
447; SI-NEXT:    v_add_f32_e32 v1, 1.0, v2
448; SI-NEXT:    ; return to shader part epilog
449;
450; VI-LABEL: ps_mesa_inreg_v2f16:
451; VI:       ; %bb.0:
452; VI-NEXT:    s_lshr_b32 s1, s0, 16
453; VI-NEXT:    v_mov_b32_e32 v0, s1
454; VI-NEXT:    v_mov_b32_e32 v1, 0x3c00
455; VI-NEXT:    v_add_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
456; VI-NEXT:    v_add_f16_e64 v1, s0, 1.0
457; VI-NEXT:    v_or_b32_e32 v0, v1, v0
458; VI-NEXT:    ; return to shader part epilog
459;
460; GFX11-LABEL: ps_mesa_inreg_v2f16:
461; GFX11:       ; %bb.0:
462; GFX11-NEXT:    v_pk_add_f16 v0, s0, 1.0 op_sel_hi:[1,0]
463; GFX11-NEXT:    ; return to shader part epilog
; IR under test: lane-wise half add with a splat of 1.0.
464  %add = fadd <2 x half> %arg0, <half 1.0, half 1.0>
465  ret <2 x half> %add
466}
467
; amdgpu_ps function that adds 1 to both lanes of a <2 x i16> argument and
; stores the result through an undef addrspace(1) pointer.
; The SI checks take the lanes from v0 and v1 and pack them into one dword
; before the store; the VI and GFX11 checks read both lanes from v0.
468define amdgpu_ps void @ps_mesa_v2i16(<2 x i16> %arg0) {
469; SI-LABEL: ps_mesa_v2i16:
470; SI:       ; %bb.0:
471; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
472; SI-NEXT:    v_add_i32_e32 v0, vcc, 1, v0
473; SI-NEXT:    s_mov_b32 s3, 0xf000
474; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
475; SI-NEXT:    v_or_b32_e32 v0, v1, v0
476; SI-NEXT:    v_add_i32_e32 v0, vcc, 0x10000, v0
477; SI-NEXT:    s_mov_b32 s2, -1
478; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
479; SI-NEXT:    s_endpgm
480;
481; VI-LABEL: ps_mesa_v2i16:
482; VI:       ; %bb.0:
483; VI-NEXT:    v_mov_b32_e32 v2, 1
484; VI-NEXT:    v_add_u16_e32 v1, 1, v0
485; VI-NEXT:    v_add_u16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
486; VI-NEXT:    v_or_b32_e32 v0, v1, v0
487; VI-NEXT:    flat_store_dword v[0:1], v0
488; VI-NEXT:    s_endpgm
489;
490; GFX11-LABEL: ps_mesa_v2i16:
491; GFX11:       ; %bb.0:
492; GFX11-NEXT:    v_pk_add_u16 v0, v0, 1 op_sel_hi:[1,0]
493; GFX11-NEXT:    global_store_b32 v[0:1], v0, off
494; GFX11-NEXT:    s_endpgm
; IR under test: lane-wise i16 add with a splat of 1, then a store to an undef
; address.
495  %add = add <2 x i16> %arg0, <i16 1, i16 1>
496  store <2 x i16> %add, ptr addrspace(1) undef
497  ret void
498}
499
; amdgpu_ps function that adds 1 to both lanes of an inreg <2 x i16> argument
; and stores the result through an undef addrspace(1) pointer.
; The SI checks read the lanes from s0 and s1; the VI checks read only s0 and
; do the arithmetic with scalar instructions; the GFX11 checks use a single
; v_pk_add_u16 with s0 as the source.
500define amdgpu_ps void @ps_mesa_inreg_v2i16(<2 x i16> inreg %arg0) {
501; SI-LABEL: ps_mesa_inreg_v2i16:
502; SI:       ; %bb.0:
503; SI-NEXT:    s_lshl_b32 s1, s1, 16
504; SI-NEXT:    s_add_i32 s0, s0, 1
505; SI-NEXT:    s_mov_b32 s3, 0xf000
506; SI-NEXT:    s_and_b32 s0, s0, 0xffff
507; SI-NEXT:    s_or_b32 s0, s1, s0
508; SI-NEXT:    s_add_i32 s0, s0, 0x10000
509; SI-NEXT:    s_mov_b32 s2, -1
510; SI-NEXT:    v_mov_b32_e32 v0, s0
511; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
512; SI-NEXT:    s_endpgm
513;
514; VI-LABEL: ps_mesa_inreg_v2i16:
515; VI:       ; %bb.0:
516; VI-NEXT:    s_and_b32 s1, s0, 0xffff0000
517; VI-NEXT:    s_add_i32 s0, s0, 1
518; VI-NEXT:    s_and_b32 s0, s0, 0xffff
519; VI-NEXT:    s_or_b32 s0, s1, s0
520; VI-NEXT:    s_add_i32 s0, s0, 0x10000
521; VI-NEXT:    v_mov_b32_e32 v0, s0
522; VI-NEXT:    flat_store_dword v[0:1], v0
523; VI-NEXT:    s_endpgm
524;
525; GFX11-LABEL: ps_mesa_inreg_v2i16:
526; GFX11:       ; %bb.0:
527; GFX11-NEXT:    v_pk_add_u16 v0, s0, 1 op_sel_hi:[1,0]
528; GFX11-NEXT:    global_store_b32 v[0:1], v0, off
529; GFX11-NEXT:    s_endpgm
; IR under test: lane-wise i16 add with a splat of 1, then a store to an undef
; address.
530  %add = add <2 x i16> %arg0, <i16 1, i16 1>
531  store <2 x i16> %add, ptr addrspace(1) undef
532  ret void
533}
534
535; FIXME: Different ABI for VI+
536
; amdgpu_ps function that adds 1.0 to every lane of a <4 x half> argument and
; returns the vector.
; The SI checks use four registers v0-v3 with f32 adds; the VI and GFX11 checks
; use only v0 and v1 (SDWA plus v_or_b32 on VI, v_pk_add_f16 on GFX11).
537define amdgpu_ps <4 x half> @ps_mesa_v4f16(<4 x half> %arg0) {
538; SI-LABEL: ps_mesa_v4f16:
539; SI:       ; %bb.0:
540; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
541; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
542; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
543; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
544; SI-NEXT:    v_cvt_f32_f16_e32 v3, v3
545; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
546; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
547; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
548; SI-NEXT:    v_add_f32_e32 v0, 1.0, v0
549; SI-NEXT:    v_add_f32_e32 v1, 1.0, v1
550; SI-NEXT:    v_add_f32_e32 v2, 1.0, v2
551; SI-NEXT:    v_add_f32_e32 v3, 1.0, v3
552; SI-NEXT:    ; return to shader part epilog
553;
554; VI-LABEL: ps_mesa_v4f16:
555; VI:       ; %bb.0:
556; VI-NEXT:    v_mov_b32_e32 v3, 0x3c00
557; VI-NEXT:    v_add_f16_e32 v2, 1.0, v1
558; VI-NEXT:    v_add_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
559; VI-NEXT:    v_add_f16_e32 v4, 1.0, v0
560; VI-NEXT:    v_add_f16_sdwa v0, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
561; VI-NEXT:    v_or_b32_e32 v0, v4, v0
562; VI-NEXT:    v_or_b32_e32 v1, v2, v1
563; VI-NEXT:    ; return to shader part epilog
564;
565; GFX11-LABEL: ps_mesa_v4f16:
566; GFX11:       ; %bb.0:
567; GFX11-NEXT:    v_pk_add_f16 v0, v0, 1.0 op_sel_hi:[1,0]
568; GFX11-NEXT:    v_pk_add_f16 v1, v1, 1.0 op_sel_hi:[1,0]
569; GFX11-NEXT:    ; return to shader part epilog
; IR under test: lane-wise half add with a splat of 1.0.
570  %add = fadd <4 x half> %arg0, <half 1.0, half 1.0, half 1.0, half 1.0>
571  ret <4 x half> %add
572}
573
; amdgpu_ps function that adds 1.0 to every lane of an inreg <4 x half>
; argument and returns the vector.
; The SI checks read the lanes from s0-s3 and produce results in v0-v3; the VI
; and GFX11 checks read only s0 and s1 and produce results in v0 and v1.
574define amdgpu_ps <4 x half> @ps_mesa_inreg_v4f16(<4 x half> inreg %arg0) {
575; SI-LABEL: ps_mesa_inreg_v4f16:
576; SI:       ; %bb.0:
577; SI-NEXT:    v_cvt_f16_f32_e32 v0, s3
578; SI-NEXT:    v_cvt_f16_f32_e32 v1, s2
579; SI-NEXT:    v_cvt_f16_f32_e32 v2, s1
580; SI-NEXT:    v_cvt_f16_f32_e32 v3, s0
581; SI-NEXT:    v_cvt_f32_f16_e32 v4, v0
582; SI-NEXT:    v_cvt_f32_f16_e32 v5, v1
583; SI-NEXT:    v_cvt_f32_f16_e32 v1, v2
584; SI-NEXT:    v_cvt_f32_f16_e32 v0, v3
585; SI-NEXT:    v_add_f32_e32 v0, 1.0, v0
586; SI-NEXT:    v_add_f32_e32 v1, 1.0, v1
587; SI-NEXT:    v_add_f32_e32 v2, 1.0, v5
588; SI-NEXT:    v_add_f32_e32 v3, 1.0, v4
589; SI-NEXT:    ; return to shader part epilog
590;
591; VI-LABEL: ps_mesa_inreg_v4f16:
592; VI:       ; %bb.0:
593; VI-NEXT:    v_add_f16_e64 v1, s1, 1.0
594; VI-NEXT:    s_lshr_b32 s1, s1, 16
595; VI-NEXT:    v_mov_b32_e32 v0, s1
596; VI-NEXT:    v_mov_b32_e32 v2, 0x3c00
597; VI-NEXT:    v_add_f16_sdwa v3, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
598; VI-NEXT:    v_add_f16_e64 v0, s0, 1.0
599; VI-NEXT:    s_lshr_b32 s0, s0, 16
600; VI-NEXT:    v_mov_b32_e32 v4, s0
601; VI-NEXT:    v_add_f16_sdwa v2, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
602; VI-NEXT:    v_or_b32_e32 v0, v0, v2
603; VI-NEXT:    v_or_b32_e32 v1, v1, v3
604; VI-NEXT:    ; return to shader part epilog
605;
606; GFX11-LABEL: ps_mesa_inreg_v4f16:
607; GFX11:       ; %bb.0:
608; GFX11-NEXT:    v_pk_add_f16 v0, s0, 1.0 op_sel_hi:[1,0]
609; GFX11-NEXT:    v_pk_add_f16 v1, s1, 1.0 op_sel_hi:[1,0]
610; GFX11-NEXT:    ; return to shader part epilog
; IR under test: lane-wise half add with a splat of 1.0.
611  %add = fadd <4 x half> %arg0, <half 1.0, half 1.0, half 1.0, half 1.0>
612  ret <4 x half> %add
613}
614
; amdgpu_ps function that adds <1, 2, 3> to an inreg <3 x i32> argument and
; stores the result through an undef addrspace(1) pointer.
; All check blocks do the adds with s_add_i32 on s0-s2. The SI checks split the
; store into a dword and a dwordx2 buffer store; the VI checks use one
; flat_store_dwordx3 and the GFX11 checks one global_store_b96.
615define amdgpu_ps void @ps_mesa_inreg_v3i32(<3 x i32> inreg %arg0) {
616; SI-LABEL: ps_mesa_inreg_v3i32:
617; SI:       ; %bb.0:
618; SI-NEXT:    s_add_i32 s1, s1, 2
619; SI-NEXT:    s_add_i32 s0, s0, 1
620; SI-NEXT:    s_add_i32 s4, s2, 3
621; SI-NEXT:    s_mov_b32 s3, 0xf000
622; SI-NEXT:    s_mov_b32 s2, -1
623; SI-NEXT:    v_mov_b32_e32 v0, s4
624; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
625; SI-NEXT:    s_waitcnt expcnt(0)
626; SI-NEXT:    v_mov_b32_e32 v0, s0
627; SI-NEXT:    v_mov_b32_e32 v1, s1
628; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
629; SI-NEXT:    s_endpgm
630;
631; VI-LABEL: ps_mesa_inreg_v3i32:
632; VI:       ; %bb.0:
633; VI-NEXT:    s_add_i32 s2, s2, 3
634; VI-NEXT:    s_add_i32 s1, s1, 2
635; VI-NEXT:    s_add_i32 s0, s0, 1
636; VI-NEXT:    v_mov_b32_e32 v0, s0
637; VI-NEXT:    v_mov_b32_e32 v1, s1
638; VI-NEXT:    v_mov_b32_e32 v2, s2
639; VI-NEXT:    flat_store_dwordx3 v[0:1], v[0:2]
640; VI-NEXT:    s_endpgm
641;
642; GFX11-LABEL: ps_mesa_inreg_v3i32:
643; GFX11:       ; %bb.0:
644; GFX11-NEXT:    s_add_i32 s2, s2, 3
645; GFX11-NEXT:    s_add_i32 s0, s0, 1
646; GFX11-NEXT:    s_add_i32 s1, s1, 2
647; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
648; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
649; GFX11-NEXT:    v_mov_b32_e32 v2, s2
650; GFX11-NEXT:    global_store_b96 v[0:1], v[0:2], off
651; GFX11-NEXT:    s_endpgm
; IR under test: lane-wise i32 add with distinct per-lane constants, then a
; store to an undef address.
652  %add = add <3 x i32> %arg0, <i32 1, i32 2, i32 3>
653  store <3 x i32> %add, ptr addrspace(1) undef
654  ret void
655}
656
; amdgpu_ps function that adds <1.0, 2.0, 4.0> to an inreg <3 x float> argument
; and stores the result through an undef addrspace(1) pointer.
; All check blocks use v_add_f32_e64 with s0-s2 as sources. The SI checks split
; the store into a dword and a dwordx2 buffer store; the VI and GFX11 checks
; use a single three-dword store.
657define amdgpu_ps void @ps_mesa_inreg_v3f32(<3 x float> inreg %arg0) {
658; SI-LABEL: ps_mesa_inreg_v3f32:
659; SI:       ; %bb.0:
660; SI-NEXT:    v_add_f32_e64 v1, s1, 2.0
661; SI-NEXT:    v_add_f32_e64 v0, s0, 1.0
662; SI-NEXT:    v_add_f32_e64 v2, s2, 4.0
663; SI-NEXT:    s_mov_b32 s3, 0xf000
664; SI-NEXT:    s_mov_b32 s2, -1
665; SI-NEXT:    buffer_store_dword v2, off, s[0:3], 0
666; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
667; SI-NEXT:    s_endpgm
668;
669; VI-LABEL: ps_mesa_inreg_v3f32:
670; VI:       ; %bb.0:
671; VI-NEXT:    v_add_f32_e64 v2, s2, 4.0
672; VI-NEXT:    v_add_f32_e64 v1, s1, 2.0
673; VI-NEXT:    v_add_f32_e64 v0, s0, 1.0
674; VI-NEXT:    flat_store_dwordx3 v[0:1], v[0:2]
675; VI-NEXT:    s_endpgm
676;
677; GFX11-LABEL: ps_mesa_inreg_v3f32:
678; GFX11:       ; %bb.0:
679; GFX11-NEXT:    v_add_f32_e64 v2, s2, 4.0
680; GFX11-NEXT:    v_add_f32_e64 v1, s1, 2.0
681; GFX11-NEXT:    v_add_f32_e64 v0, s0, 1.0
682; GFX11-NEXT:    global_store_b96 v[0:1], v[0:2], off
683; GFX11-NEXT:    s_endpgm
; IR under test: lane-wise float add with distinct per-lane constants, then a
; store to an undef address.
684  %add = fadd <3 x float> %arg0, <float 1.0, float 2.0, float 4.0>
685  store <3 x float> %add, ptr addrspace(1) undef
686  ret void
687}
688
; amdgpu_ps function that adds <1, 2, 3, 4, 5> to an inreg <5 x i32> argument
; and stores the result through an undef addrspace(1) pointer.
; All check blocks do the adds with s_add_i32 on s0-s4 and emit two stores: one
; dword for the fifth lane and one four-dword store for the first four lanes.
689define amdgpu_ps void @ps_mesa_inreg_v5i32(<5 x i32> inreg %arg0) {
690; SI-LABEL: ps_mesa_inreg_v5i32:
691; SI:       ; %bb.0:
692; SI-NEXT:    s_add_i32 s5, s3, 4
693; SI-NEXT:    s_add_i32 s6, s2, 3
694; SI-NEXT:    s_add_i32 s1, s1, 2
695; SI-NEXT:    s_add_i32 s0, s0, 1
696; SI-NEXT:    s_add_i32 s4, s4, 5
697; SI-NEXT:    s_mov_b32 s3, 0xf000
698; SI-NEXT:    s_mov_b32 s2, -1
699; SI-NEXT:    v_mov_b32_e32 v0, s4
700; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
701; SI-NEXT:    s_waitcnt expcnt(0)
702; SI-NEXT:    v_mov_b32_e32 v0, s0
703; SI-NEXT:    v_mov_b32_e32 v1, s1
704; SI-NEXT:    v_mov_b32_e32 v2, s6
705; SI-NEXT:    v_mov_b32_e32 v3, s5
706; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
707; SI-NEXT:    s_endpgm
708;
709; VI-LABEL: ps_mesa_inreg_v5i32:
710; VI:       ; %bb.0:
711; VI-NEXT:    s_add_i32 s4, s4, 5
712; VI-NEXT:    s_add_i32 s3, s3, 4
713; VI-NEXT:    s_add_i32 s2, s2, 3
714; VI-NEXT:    s_add_i32 s1, s1, 2
715; VI-NEXT:    s_add_i32 s0, s0, 1
716; VI-NEXT:    v_mov_b32_e32 v0, s4
717; VI-NEXT:    flat_store_dword v[0:1], v0
718; VI-NEXT:    v_mov_b32_e32 v0, s0
719; VI-NEXT:    v_mov_b32_e32 v1, s1
720; VI-NEXT:    v_mov_b32_e32 v2, s2
721; VI-NEXT:    v_mov_b32_e32 v3, s3
722; VI-NEXT:    flat_store_dwordx4 v[0:1], v[0:3]
723; VI-NEXT:    s_endpgm
724;
725; GFX11-LABEL: ps_mesa_inreg_v5i32:
726; GFX11:       ; %bb.0:
727; GFX11-NEXT:    s_add_i32 s3, s3, 4
728; GFX11-NEXT:    s_add_i32 s2, s2, 3
729; GFX11-NEXT:    s_add_i32 s1, s1, 2
730; GFX11-NEXT:    s_add_i32 s4, s4, 5
731; GFX11-NEXT:    s_add_i32 s0, s0, 1
732; GFX11-NEXT:    v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v1, s1
733; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3
734; GFX11-NEXT:    v_mov_b32_e32 v2, s2
735; GFX11-NEXT:    s_clause 0x1
736; GFX11-NEXT:    global_store_b32 v[0:1], v4, off
737; GFX11-NEXT:    global_store_b128 v[0:1], v[0:3], off
738; GFX11-NEXT:    s_endpgm
; IR under test: lane-wise i32 add with distinct per-lane constants, then a
; store to an undef address.
739  %add = add <5 x i32> %arg0, <i32 1, i32 2, i32 3, i32 4, i32 5>
740  store <5 x i32> %add, ptr addrspace(1) undef
741  ret void
742}
743
; amdgpu_ps function that adds <1.0, 2.0, 4.0, -1.0, 0.5> to an inreg
; <5 x float> argument and stores the result through an undef addrspace(1)
; pointer.
; All check blocks use v_add_f32_e64 with s0-s4 as sources and emit two stores:
; one dword for the fifth lane (v4) and one four-dword store for v[0:3].
744define amdgpu_ps void @ps_mesa_inreg_v5f32(<5 x float> inreg %arg0) {
745; SI-LABEL: ps_mesa_inreg_v5f32:
746; SI:       ; %bb.0:
747; SI-NEXT:    v_add_f32_e64 v3, s3, -1.0
748; SI-NEXT:    v_add_f32_e64 v2, s2, 4.0
749; SI-NEXT:    v_add_f32_e64 v1, s1, 2.0
750; SI-NEXT:    v_add_f32_e64 v0, s0, 1.0
751; SI-NEXT:    v_add_f32_e64 v4, s4, 0.5
752; SI-NEXT:    s_mov_b32 s3, 0xf000
753; SI-NEXT:    s_mov_b32 s2, -1
754; SI-NEXT:    buffer_store_dword v4, off, s[0:3], 0
755; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
756; SI-NEXT:    s_endpgm
757;
758; VI-LABEL: ps_mesa_inreg_v5f32:
759; VI:       ; %bb.0:
760; VI-NEXT:    v_add_f32_e64 v3, s3, -1.0
761; VI-NEXT:    v_add_f32_e64 v2, s2, 4.0
762; VI-NEXT:    v_add_f32_e64 v1, s1, 2.0
763; VI-NEXT:    v_add_f32_e64 v0, s0, 1.0
764; VI-NEXT:    v_add_f32_e64 v4, s4, 0.5
765; VI-NEXT:    flat_store_dword v[0:1], v4
766; VI-NEXT:    flat_store_dwordx4 v[0:1], v[0:3]
767; VI-NEXT:    s_endpgm
768;
769; GFX11-LABEL: ps_mesa_inreg_v5f32:
770; GFX11:       ; %bb.0:
771; GFX11-NEXT:    v_add_f32_e64 v3, s3, -1.0
772; GFX11-NEXT:    v_add_f32_e64 v2, s2, 4.0
773; GFX11-NEXT:    v_add_f32_e64 v1, s1, 2.0
774; GFX11-NEXT:    v_add_f32_e64 v4, s4, 0.5
775; GFX11-NEXT:    v_add_f32_e64 v0, s0, 1.0
776; GFX11-NEXT:    s_clause 0x1
777; GFX11-NEXT:    global_store_b32 v[0:1], v4, off
778; GFX11-NEXT:    global_store_b128 v[0:1], v[0:3], off
779; GFX11-NEXT:    s_endpgm
; IR under test: lane-wise float add with distinct per-lane constants, then a
; store to an undef address.
780  %add = fadd <5 x float> %arg0, <float 1.0, float 2.0, float 4.0, float -1.0, float 0.5>
781  store <5 x float> %add, ptr addrspace(1) undef
782  ret void
783}
784
; amdgpu_ps function that adds <1, 2, 3> to a <3 x i32> argument (not inreg)
; and stores the result through an undef addrspace(1) pointer.
; All check blocks do the adds in place on v0-v2. The SI checks split the store
; into a dword and a dwordx2 buffer store; the VI and GFX11 checks use a single
; three-dword store.
785define amdgpu_ps void @ps_mesa_v3i32(<3 x i32> %arg0) {
786; SI-LABEL: ps_mesa_v3i32:
787; SI:       ; %bb.0:
788; SI-NEXT:    v_add_i32_e32 v1, vcc, 2, v1
789; SI-NEXT:    v_add_i32_e32 v0, vcc, 1, v0
790; SI-NEXT:    v_add_i32_e32 v2, vcc, 3, v2
791; SI-NEXT:    s_mov_b32 s3, 0xf000
792; SI-NEXT:    s_mov_b32 s2, -1
793; SI-NEXT:    buffer_store_dword v2, off, s[0:3], 0
794; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
795; SI-NEXT:    s_endpgm
796;
797; VI-LABEL: ps_mesa_v3i32:
798; VI:       ; %bb.0:
799; VI-NEXT:    v_add_u32_e32 v2, vcc, 3, v2
800; VI-NEXT:    v_add_u32_e32 v1, vcc, 2, v1
801; VI-NEXT:    v_add_u32_e32 v0, vcc, 1, v0
802; VI-NEXT:    flat_store_dwordx3 v[0:1], v[0:2]
803; VI-NEXT:    s_endpgm
804;
805; GFX11-LABEL: ps_mesa_v3i32:
806; GFX11:       ; %bb.0:
807; GFX11-NEXT:    v_add_nc_u32_e32 v2, 3, v2
808; GFX11-NEXT:    v_add_nc_u32_e32 v1, 2, v1
809; GFX11-NEXT:    v_add_nc_u32_e32 v0, 1, v0
810; GFX11-NEXT:    global_store_b96 v[0:1], v[0:2], off
811; GFX11-NEXT:    s_endpgm
; IR under test: lane-wise i32 add with distinct per-lane constants, then a
; store to an undef address.
812  %add = add <3 x i32> %arg0, <i32 1, i32 2, i32 3>
813  store <3 x i32> %add, ptr addrspace(1) undef
814  ret void
815}
816
; amdgpu_ps function that adds <1.0, 2.0, 4.0> to a <3 x float> argument (not
; inreg) and stores the result through an undef addrspace(1) pointer.
; All check blocks do the adds in place on v0-v2; the GFX11 checks pair two of
; them in one v_dual_add_f32. The SI checks split the store into a dword and a
; dwordx2 buffer store.
817define amdgpu_ps void @ps_mesa_v3f32(<3 x float> %arg0) {
818; SI-LABEL: ps_mesa_v3f32:
819; SI:       ; %bb.0:
820; SI-NEXT:    v_add_f32_e32 v1, 2.0, v1
821; SI-NEXT:    v_add_f32_e32 v0, 1.0, v0
822; SI-NEXT:    v_add_f32_e32 v2, 4.0, v2
823; SI-NEXT:    s_mov_b32 s3, 0xf000
824; SI-NEXT:    s_mov_b32 s2, -1
825; SI-NEXT:    buffer_store_dword v2, off, s[0:3], 0
826; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
827; SI-NEXT:    s_endpgm
828;
829; VI-LABEL: ps_mesa_v3f32:
830; VI:       ; %bb.0:
831; VI-NEXT:    v_add_f32_e32 v2, 4.0, v2
832; VI-NEXT:    v_add_f32_e32 v1, 2.0, v1
833; VI-NEXT:    v_add_f32_e32 v0, 1.0, v0
834; VI-NEXT:    flat_store_dwordx3 v[0:1], v[0:2]
835; VI-NEXT:    s_endpgm
836;
837; GFX11-LABEL: ps_mesa_v3f32:
838; GFX11:       ; %bb.0:
839; GFX11-NEXT:    v_dual_add_f32 v2, 4.0, v2 :: v_dual_add_f32 v1, 2.0, v1
840; GFX11-NEXT:    v_add_f32_e32 v0, 1.0, v0
841; GFX11-NEXT:    global_store_b96 v[0:1], v[0:2], off
842; GFX11-NEXT:    s_endpgm
; IR under test: lane-wise float add with distinct per-lane constants, then a
; store to an undef address.
843  %add = fadd <3 x float> %arg0, <float 1.0, float 2.0, float 4.0>
844  store <3 x float> %add, ptr addrspace(1) undef
845  ret void
846}
847
; amdgpu_ps function that adds <1, 2, 3, 4, 5> to a <5 x i32> argument (not
; inreg) and stores the result through an undef addrspace(1) pointer.
; All check blocks do the adds in place on v0-v4 and emit two stores: one dword
; for the fifth lane (v4) and one four-dword store for v[0:3].
848define amdgpu_ps void @ps_mesa_v5i32(<5 x i32> %arg0) {
849; SI-LABEL: ps_mesa_v5i32:
850; SI:       ; %bb.0:
851; SI-NEXT:    v_add_i32_e32 v3, vcc, 4, v3
852; SI-NEXT:    v_add_i32_e32 v2, vcc, 3, v2
853; SI-NEXT:    v_add_i32_e32 v1, vcc, 2, v1
854; SI-NEXT:    v_add_i32_e32 v0, vcc, 1, v0
855; SI-NEXT:    v_add_i32_e32 v4, vcc, 5, v4
856; SI-NEXT:    s_mov_b32 s3, 0xf000
857; SI-NEXT:    s_mov_b32 s2, -1
858; SI-NEXT:    buffer_store_dword v4, off, s[0:3], 0
859; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
860; SI-NEXT:    s_endpgm
861;
862; VI-LABEL: ps_mesa_v5i32:
863; VI:       ; %bb.0:
864; VI-NEXT:    v_add_u32_e32 v3, vcc, 4, v3
865; VI-NEXT:    v_add_u32_e32 v2, vcc, 3, v2
866; VI-NEXT:    v_add_u32_e32 v1, vcc, 2, v1
867; VI-NEXT:    v_add_u32_e32 v0, vcc, 1, v0
868; VI-NEXT:    v_add_u32_e32 v4, vcc, 5, v4
869; VI-NEXT:    flat_store_dword v[0:1], v4
870; VI-NEXT:    flat_store_dwordx4 v[0:1], v[0:3]
871; VI-NEXT:    s_endpgm
872;
873; GFX11-LABEL: ps_mesa_v5i32:
874; GFX11:       ; %bb.0:
875; GFX11-NEXT:    v_add_nc_u32_e32 v3, 4, v3
876; GFX11-NEXT:    v_add_nc_u32_e32 v2, 3, v2
877; GFX11-NEXT:    v_add_nc_u32_e32 v1, 2, v1
878; GFX11-NEXT:    v_add_nc_u32_e32 v4, 5, v4
879; GFX11-NEXT:    v_add_nc_u32_e32 v0, 1, v0
880; GFX11-NEXT:    s_clause 0x1
881; GFX11-NEXT:    global_store_b32 v[0:1], v4, off
882; GFX11-NEXT:    global_store_b128 v[0:1], v[0:3], off
883; GFX11-NEXT:    s_endpgm
; IR under test: lane-wise i32 add with distinct per-lane constants, then a
; store to an undef address.
884  %add = add <5 x i32> %arg0, <i32 1, i32 2, i32 3, i32 4, i32 5>
885  store <5 x i32> %add, ptr addrspace(1) undef
886  ret void
887}
888
; Floating-point counterpart of ps_mesa_v5i32: a <5 x float> argument (v0-v4 in
; the assertions below) is added to <1.0, 2.0, 4.0, -1.0, 0.5> and stored
; through an undef global pointer.
; Each constant appears directly as the first source operand of its add. As in
; the integer variant, every target emits a single-dword store of v4 plus a
; 4-dword store of v[0:3]; GFX11 pairs four of the adds into two
; v_dual_add_f32 instructions.
889define amdgpu_ps void @ps_mesa_v5f32(<5 x float> %arg0) {
890; SI-LABEL: ps_mesa_v5f32:
891; SI:       ; %bb.0:
892; SI-NEXT:    v_add_f32_e32 v3, -1.0, v3
893; SI-NEXT:    v_add_f32_e32 v2, 4.0, v2
894; SI-NEXT:    v_add_f32_e32 v1, 2.0, v1
895; SI-NEXT:    v_add_f32_e32 v0, 1.0, v0
896; SI-NEXT:    v_add_f32_e32 v4, 0.5, v4
897; SI-NEXT:    s_mov_b32 s3, 0xf000
898; SI-NEXT:    s_mov_b32 s2, -1
899; SI-NEXT:    buffer_store_dword v4, off, s[0:3], 0
900; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
901; SI-NEXT:    s_endpgm
902;
903; VI-LABEL: ps_mesa_v5f32:
904; VI:       ; %bb.0:
905; VI-NEXT:    v_add_f32_e32 v3, -1.0, v3
906; VI-NEXT:    v_add_f32_e32 v2, 4.0, v2
907; VI-NEXT:    v_add_f32_e32 v1, 2.0, v1
908; VI-NEXT:    v_add_f32_e32 v0, 1.0, v0
909; VI-NEXT:    v_add_f32_e32 v4, 0.5, v4
910; VI-NEXT:    flat_store_dword v[0:1], v4
911; VI-NEXT:    flat_store_dwordx4 v[0:1], v[0:3]
912; VI-NEXT:    s_endpgm
913;
914; GFX11-LABEL: ps_mesa_v5f32:
915; GFX11:       ; %bb.0:
916; GFX11-NEXT:    v_dual_add_f32 v3, -1.0, v3 :: v_dual_add_f32 v2, 4.0, v2
917; GFX11-NEXT:    v_dual_add_f32 v1, 2.0, v1 :: v_dual_add_f32 v4, 0.5, v4
918; GFX11-NEXT:    v_add_f32_e32 v0, 1.0, v0
919; GFX11-NEXT:    s_clause 0x1
920; GFX11-NEXT:    global_store_b32 v[0:1], v4, off
921; GFX11-NEXT:    global_store_b128 v[0:1], v[0:3], off
922; GFX11-NEXT:    s_endpgm
923  %add = fadd <5 x float> %arg0, <float 1.0, float 2.0, float 4.0, float -1.0, float 0.5>
924  store <5 x float> %add, ptr addrspace(1) undef
925  ret void
926}
927
; amdgpu_ps shader taking a plain (non-inreg) i16 argument, which the
; assertions below show in v0. The value is added to itself and stored as a
; 16-bit quantity through an undef global pointer.
; The SI assertions expect a 32-bit vector add followed by a short store; the
; VI and GFX11 assertions expect a 16-bit add (v_add_u16 / v_add_nc_u16).
928define amdgpu_ps void @ps_mesa_i16(i16 %arg0) {
929; SI-LABEL: ps_mesa_i16:
930; SI:       ; %bb.0:
931; SI-NEXT:    v_add_i32_e32 v0, vcc, v0, v0
932; SI-NEXT:    s_mov_b32 s3, 0xf000
933; SI-NEXT:    s_mov_b32 s2, -1
934; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
935; SI-NEXT:    s_endpgm
936;
937; VI-LABEL: ps_mesa_i16:
938; VI:       ; %bb.0:
939; VI-NEXT:    v_add_u16_e32 v0, v0, v0
940; VI-NEXT:    flat_store_short v[0:1], v0
941; VI-NEXT:    s_endpgm
942;
943; GFX11-LABEL: ps_mesa_i16:
944; GFX11:       ; %bb.0:
945; GFX11-NEXT:    v_add_nc_u16 v0, v0, v0
946; GFX11-NEXT:    global_store_b16 v[0:1], v0, off
947; GFX11-NEXT:    s_endpgm
948  %add = add i16 %arg0, %arg0
949  store i16 %add, ptr addrspace(1) undef
950  ret void
951}
952
; Same computation as ps_mesa_i16, but the i16 argument is marked inreg; the
; assertions below show it arriving in s0. The sum is formed with a scalar
; s_add_i32, copied to v0, and stored as a 16-bit value through an undef
; global pointer.
; The VI and GFX11 assertions additionally mask the input with 0xffff before
; the add, whereas the SI assertions add the register as-is.
953define amdgpu_ps void @ps_mesa_inreg_i16(i16 inreg %arg0) {
954; SI-LABEL: ps_mesa_inreg_i16:
955; SI:       ; %bb.0:
956; SI-NEXT:    s_add_i32 s0, s0, s0
957; SI-NEXT:    s_mov_b32 s3, 0xf000
958; SI-NEXT:    s_mov_b32 s2, -1
959; SI-NEXT:    v_mov_b32_e32 v0, s0
960; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
961; SI-NEXT:    s_endpgm
962;
963; VI-LABEL: ps_mesa_inreg_i16:
964; VI:       ; %bb.0:
965; VI-NEXT:    s_and_b32 s0, 0xffff, s0
966; VI-NEXT:    s_add_i32 s0, s0, s0
967; VI-NEXT:    v_mov_b32_e32 v0, s0
968; VI-NEXT:    flat_store_short v[0:1], v0
969; VI-NEXT:    s_endpgm
970;
971; GFX11-LABEL: ps_mesa_inreg_i16:
972; GFX11:       ; %bb.0:
973; GFX11-NEXT:    s_and_b32 s0, 0xffff, s0
974; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
975; GFX11-NEXT:    s_add_i32 s0, s0, s0
976; GFX11-NEXT:    v_mov_b32_e32 v0, s0
977; GFX11-NEXT:    global_store_b16 v[0:1], v0, off
978; GFX11-NEXT:    s_endpgm
979  %add = add i16 %arg0, %arg0
980  store i16 %add, ptr addrspace(1) undef
981  ret void
982}
983
; amdgpu_ps shader that returns the i16 constant 123 and takes no arguments.
; All three run lines share the common GCN assertions: the constant is
; materialized into s0 (0x7b == 123) and the function ends with the
; "return to shader part epilog" marker rather than s_endpgm.
984define amdgpu_ps i16 @ret_ps_mesa_i16() {
985; GCN-LABEL: ret_ps_mesa_i16:
986; GCN:       ; %bb.0:
987; GCN-NEXT:    s_movk_i32 s0, 0x7b
988; GCN-NEXT:    ; return to shader part epilog
989  ret i16 123
990}
991
; amdgpu_kernel taking a single i8 argument. In the assertions below the
; argument is fetched with a scalar dword load through s[4:5] (offset 0x9 on
; SI, 0x24 on VI and GFX11), doubled with a 32-bit s_add_i32, copied to v0 and
; written with a byte store through an undef global pointer.
; NOTE(review): the differing load offsets presumably denote the same location
; in different encoding units - confirm against the ISA documentation.
992define amdgpu_kernel void @amd_kernel_i8(i8 %arg0) {
993; SI-LABEL: amd_kernel_i8:
994; SI:       ; %bb.0: ; %entry
995; SI-NEXT:    s_load_dword s0, s[4:5], 0x9
996; SI-NEXT:    s_mov_b32 s3, 0xf000
997; SI-NEXT:    s_waitcnt lgkmcnt(0)
998; SI-NEXT:    s_add_i32 s0, s0, s0
999; SI-NEXT:    s_mov_b32 s2, -1
1000; SI-NEXT:    v_mov_b32_e32 v0, s0
1001; SI-NEXT:    buffer_store_byte v0, off, s[0:3], 0
1002; SI-NEXT:    s_endpgm
1003;
1004; VI-LABEL: amd_kernel_i8:
1005; VI:       ; %bb.0: ; %entry
1006; VI-NEXT:    s_load_dword s0, s[4:5], 0x24
1007; VI-NEXT:    s_waitcnt lgkmcnt(0)
1008; VI-NEXT:    s_add_i32 s0, s0, s0
1009; VI-NEXT:    v_mov_b32_e32 v0, s0
1010; VI-NEXT:    flat_store_byte v[0:1], v0
1011; VI-NEXT:    s_endpgm
1012;
1013; GFX11-LABEL: amd_kernel_i8:
1014; GFX11:       ; %bb.0: ; %entry
1015; GFX11-NEXT:    s_load_b32 s0, s[4:5], 0x24
1016; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1017; GFX11-NEXT:    s_add_i32 s0, s0, s0
1018; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
1019; GFX11-NEXT:    v_mov_b32_e32 v0, s0
1020; GFX11-NEXT:    global_store_b8 v[0:1], v0, off
1021; GFX11-NEXT:    s_endpgm
1022entry:
1023  %add = add i8 %arg0, %arg0
1024  store i8 %add, ptr addrspace(1) undef
1025  ret void
1026}
1027
; amdgpu_kernel taking a <2 x i8> argument, adding it to itself and storing
; the result to the null global pointer (unlike the scalar i8 test, the store
; address is a concrete zero, so the assertions materialize it explicitly).
; In the assertions below both bytes come from one scalar dword load. The
; second byte is pulled out with s_bfe_u32 ... 0x80008 (presumably an 8-bit
; field at bit offset 8 - confirm against the ISA documentation), each byte is
; doubled separately, the low result is masked to 0xff, and the two are
; recombined with a shift by 8 and an OR before a single 16-bit store.
1028define amdgpu_kernel void @amd_kernel_v2i8(<2 x i8> %arg0) {
1029; SI-LABEL: amd_kernel_v2i8:
1030; SI:       ; %bb.0: ; %entry
1031; SI-NEXT:    s_load_dword s1, s[4:5], 0x9
1032; SI-NEXT:    s_mov_b32 s0, 0
1033; SI-NEXT:    s_mov_b32 s3, 0xf000
1034; SI-NEXT:    s_waitcnt lgkmcnt(0)
1035; SI-NEXT:    s_bfe_u32 s2, s1, 0x80008
1036; SI-NEXT:    s_add_i32 s1, s1, s1
1037; SI-NEXT:    s_and_b32 s1, s1, 0xff
1038; SI-NEXT:    s_add_i32 s2, s2, s2
1039; SI-NEXT:    s_lshl_b32 s2, s2, 8
1040; SI-NEXT:    s_or_b32 s4, s1, s2
1041; SI-NEXT:    s_mov_b32 s2, -1
1042; SI-NEXT:    s_mov_b32 s1, s0
1043; SI-NEXT:    v_mov_b32_e32 v0, s4
1044; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
1045; SI-NEXT:    s_endpgm
1046;
1047; VI-LABEL: amd_kernel_v2i8:
1048; VI:       ; %bb.0: ; %entry
1049; VI-NEXT:    s_load_dword s0, s[4:5], 0x24
1050; VI-NEXT:    v_mov_b32_e32 v0, 0
1051; VI-NEXT:    v_mov_b32_e32 v1, 0
1052; VI-NEXT:    s_waitcnt lgkmcnt(0)
1053; VI-NEXT:    s_bfe_u32 s1, s0, 0x80008
1054; VI-NEXT:    s_add_i32 s0, s0, s0
1055; VI-NEXT:    s_add_i32 s1, s1, s1
1056; VI-NEXT:    s_and_b32 s0, s0, 0xff
1057; VI-NEXT:    s_lshl_b32 s1, s1, 8
1058; VI-NEXT:    s_or_b32 s0, s0, s1
1059; VI-NEXT:    v_mov_b32_e32 v2, s0
1060; VI-NEXT:    flat_store_short v[0:1], v2
1061; VI-NEXT:    s_endpgm
1062;
1063; GFX11-LABEL: amd_kernel_v2i8:
1064; GFX11:       ; %bb.0: ; %entry
1065; GFX11-NEXT:    s_load_b32 s0, s[4:5], 0x24
1066; GFX11-NEXT:    v_mov_b32_e32 v0, 0
1067; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1068; GFX11-NEXT:    s_bfe_u32 s1, s0, 0x80008
1069; GFX11-NEXT:    s_add_i32 s0, s0, s0
1070; GFX11-NEXT:    s_add_i32 s1, s1, s1
1071; GFX11-NEXT:    s_and_b32 s0, s0, 0xff
1072; GFX11-NEXT:    s_lshl_b32 s1, s1, 8
1073; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
1074; GFX11-NEXT:    s_or_b32 s0, s0, s1
1075; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0
1076; GFX11-NEXT:    global_store_b16 v[0:1], v2, off
1077; GFX11-NEXT:    s_endpgm
1078entry:
1079  %add = add <2 x i8> %arg0, %arg0
1080  store <2 x i8> %add, ptr addrspace(1) null
1081  ret void
1082}
1083
; amdgpu_kernel taking a <4 x i8> argument, adding it to itself and storing
; the result to the null global pointer.
; In the assertions below the four bytes arrive in one scalar dword. They are
; separated with shifts by 16 and 24 plus a bit-field extract for byte 1,
; doubled individually, and repacked into one dword that is written with a
; single 32-bit store.
; The repacking differs by target: the SI and VI assertions shift byte 3 left
; by 24 and byte 2 by 16 directly, while the GFX11 assertions first build the
; upper half-word (byte 3 shifted by 8, OR byte 2) and then shift that by 16.
1084define amdgpu_kernel void @amd_kernel_v4i8(<4 x i8> %arg0) {
1085; SI-LABEL: amd_kernel_v4i8:
1086; SI:       ; %bb.0: ; %entry
1087; SI-NEXT:    s_load_dword s1, s[4:5], 0x9
1088; SI-NEXT:    s_mov_b32 s0, 0
1089; SI-NEXT:    s_mov_b32 s3, 0xf000
1090; SI-NEXT:    s_waitcnt lgkmcnt(0)
1091; SI-NEXT:    s_lshr_b32 s2, s1, 16
1092; SI-NEXT:    s_lshr_b32 s4, s1, 24
1093; SI-NEXT:    s_bfe_u32 s5, s1, 0x80008
1094; SI-NEXT:    s_add_i32 s1, s1, s1
1095; SI-NEXT:    s_add_i32 s4, s4, s4
1096; SI-NEXT:    s_add_i32 s2, s2, s2
1097; SI-NEXT:    s_and_b32 s1, s1, 0xff
1098; SI-NEXT:    s_add_i32 s5, s5, s5
1099; SI-NEXT:    s_lshl_b32 s4, s4, 24
1100; SI-NEXT:    s_and_b32 s2, s2, 0xff
1101; SI-NEXT:    s_lshl_b32 s5, s5, 8
1102; SI-NEXT:    s_lshl_b32 s2, s2, 16
1103; SI-NEXT:    s_or_b32 s1, s1, s5
1104; SI-NEXT:    s_or_b32 s2, s4, s2
1105; SI-NEXT:    s_and_b32 s1, s1, 0xffff
1106; SI-NEXT:    s_or_b32 s4, s1, s2
1107; SI-NEXT:    s_mov_b32 s2, -1
1108; SI-NEXT:    s_mov_b32 s1, s0
1109; SI-NEXT:    v_mov_b32_e32 v0, s4
1110; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1111; SI-NEXT:    s_endpgm
1112;
1113; VI-LABEL: amd_kernel_v4i8:
1114; VI:       ; %bb.0: ; %entry
1115; VI-NEXT:    s_load_dword s0, s[4:5], 0x24
1116; VI-NEXT:    v_mov_b32_e32 v0, 0
1117; VI-NEXT:    v_mov_b32_e32 v1, 0
1118; VI-NEXT:    s_waitcnt lgkmcnt(0)
1119; VI-NEXT:    s_lshr_b32 s2, s0, 16
1120; VI-NEXT:    s_lshr_b32 s1, s0, 24
1121; VI-NEXT:    s_add_i32 s2, s2, s2
1122; VI-NEXT:    s_bfe_u32 s3, s0, 0x80008
1123; VI-NEXT:    s_add_i32 s1, s1, s1
1124; VI-NEXT:    s_and_b32 s2, s2, 0xff
1125; VI-NEXT:    s_add_i32 s3, s3, s3
1126; VI-NEXT:    s_add_i32 s0, s0, s0
1127; VI-NEXT:    s_lshl_b32 s1, s1, 24
1128; VI-NEXT:    s_lshl_b32 s2, s2, 16
1129; VI-NEXT:    s_or_b32 s1, s1, s2
1130; VI-NEXT:    s_and_b32 s0, s0, 0xff
1131; VI-NEXT:    s_lshl_b32 s2, s3, 8
1132; VI-NEXT:    s_or_b32 s0, s0, s2
1133; VI-NEXT:    s_and_b32 s0, s0, 0xffff
1134; VI-NEXT:    s_or_b32 s0, s0, s1
1135; VI-NEXT:    v_mov_b32_e32 v2, s0
1136; VI-NEXT:    flat_store_dword v[0:1], v2
1137; VI-NEXT:    s_endpgm
1138;
1139; GFX11-LABEL: amd_kernel_v4i8:
1140; GFX11:       ; %bb.0: ; %entry
1141; GFX11-NEXT:    s_load_b32 s0, s[4:5], 0x24
1142; GFX11-NEXT:    v_mov_b32_e32 v0, 0
1143; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1144; GFX11-NEXT:    s_lshr_b32 s1, s0, 16
1145; GFX11-NEXT:    s_lshr_b32 s2, s0, 24
1146; GFX11-NEXT:    s_add_i32 s3, s0, s0
1147; GFX11-NEXT:    s_bfe_u32 s0, s0, 0x80008
1148; GFX11-NEXT:    s_add_i32 s2, s2, s2
1149; GFX11-NEXT:    s_add_i32 s0, s0, s0
1150; GFX11-NEXT:    s_add_i32 s1, s1, s1
1151; GFX11-NEXT:    s_and_b32 s3, s3, 0xff
1152; GFX11-NEXT:    s_lshl_b32 s0, s0, 8
1153; GFX11-NEXT:    s_lshl_b32 s2, s2, 8
1154; GFX11-NEXT:    s_and_b32 s1, s1, 0xff
1155; GFX11-NEXT:    s_or_b32 s0, s3, s0
1156; GFX11-NEXT:    s_or_b32 s1, s1, s2
1157; GFX11-NEXT:    s_and_b32 s0, s0, 0xffff
1158; GFX11-NEXT:    s_lshl_b32 s1, s1, 16
1159; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
1160; GFX11-NEXT:    s_or_b32 s0, s0, s1
1161; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0
1162; GFX11-NEXT:    global_store_b32 v[0:1], v2, off
1163; GFX11-NEXT:    s_endpgm
1164entry:
1165  %add = add <4 x i8> %arg0, %arg0
1166  store <4 x i8> %add, ptr addrspace(1) null
1167  ret void
1168}
1169
; amdgpu_kernel taking a <3 x i8> argument, adding it to itself and storing
; the result to the null global pointer.
; The odd-sized vector is written with two stores in the assertions below: a
; byte store of the third element at address 2 and a 16-bit store of the first
; two (packed) elements at address 0.
; The SI assertions start with the buffer base at 2 and reset s0 to 0 between
; the stores, waiting on expcnt(0) before v0 is reused. The VI assertions
; rewrite v[0:1] from {2, 0} to {0, 0}; the GFX11 assertions keep two address
; pairs (v[0:1] and v[2:3]) and issue both stores under s_clause 0x1.
1170define amdgpu_kernel void @amd_kernel_v3i8(<3 x i8> %arg0) {
1171; SI-LABEL: amd_kernel_v3i8:
1172; SI:       ; %bb.0: ; %entry
1173; SI-NEXT:    s_load_dword s4, s[4:5], 0x9
1174; SI-NEXT:    s_mov_b32 s1, 0
1175; SI-NEXT:    s_mov_b32 s0, 2
1176; SI-NEXT:    s_mov_b32 s3, 0xf000
1177; SI-NEXT:    s_mov_b32 s2, -1
1178; SI-NEXT:    s_waitcnt lgkmcnt(0)
1179; SI-NEXT:    s_lshr_b32 s5, s4, 16
1180; SI-NEXT:    s_bfe_u32 s6, s4, 0x80008
1181; SI-NEXT:    s_add_i32 s4, s4, s4
1182; SI-NEXT:    s_and_b32 s4, s4, 0xff
1183; SI-NEXT:    s_add_i32 s6, s6, s6
1184; SI-NEXT:    s_add_i32 s5, s5, s5
1185; SI-NEXT:    s_lshl_b32 s6, s6, 8
1186; SI-NEXT:    v_mov_b32_e32 v0, s5
1187; SI-NEXT:    s_or_b32 s4, s4, s6
1188; SI-NEXT:    buffer_store_byte v0, off, s[0:3], 0
1189; SI-NEXT:    s_mov_b32 s0, s1
1190; SI-NEXT:    s_waitcnt expcnt(0)
1191; SI-NEXT:    v_mov_b32_e32 v0, s4
1192; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
1193; SI-NEXT:    s_endpgm
1194;
1195; VI-LABEL: amd_kernel_v3i8:
1196; VI:       ; %bb.0: ; %entry
1197; VI-NEXT:    s_load_dword s0, s[4:5], 0x24
1198; VI-NEXT:    v_mov_b32_e32 v0, 2
1199; VI-NEXT:    v_mov_b32_e32 v1, 0
1200; VI-NEXT:    s_waitcnt lgkmcnt(0)
1201; VI-NEXT:    s_lshr_b32 s1, s0, 16
1202; VI-NEXT:    s_bfe_u32 s2, s0, 0x80008
1203; VI-NEXT:    s_add_i32 s0, s0, s0
1204; VI-NEXT:    s_add_i32 s1, s1, s1
1205; VI-NEXT:    s_add_i32 s2, s2, s2
1206; VI-NEXT:    s_and_b32 s0, s0, 0xff
1207; VI-NEXT:    s_lshl_b32 s2, s2, 8
1208; VI-NEXT:    v_mov_b32_e32 v2, s1
1209; VI-NEXT:    s_or_b32 s0, s0, s2
1210; VI-NEXT:    flat_store_byte v[0:1], v2
1211; VI-NEXT:    v_mov_b32_e32 v0, 0
1212; VI-NEXT:    v_mov_b32_e32 v1, 0
1213; VI-NEXT:    v_mov_b32_e32 v2, s0
1214; VI-NEXT:    flat_store_short v[0:1], v2
1215; VI-NEXT:    s_endpgm
1216;
1217; GFX11-LABEL: amd_kernel_v3i8:
1218; GFX11:       ; %bb.0: ; %entry
1219; GFX11-NEXT:    s_load_b32 s0, s[4:5], 0x24
1220; GFX11-NEXT:    v_mov_b32_e32 v0, 2
1221; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1222; GFX11-NEXT:    s_bfe_u32 s2, s0, 0x80008
1223; GFX11-NEXT:    s_lshr_b32 s1, s0, 16
1224; GFX11-NEXT:    s_add_i32 s0, s0, s0
1225; GFX11-NEXT:    s_add_i32 s2, s2, s2
1226; GFX11-NEXT:    s_and_b32 s0, s0, 0xff
1227; GFX11-NEXT:    s_lshl_b32 s2, s2, 8
1228; GFX11-NEXT:    s_add_i32 s1, s1, s1
1229; GFX11-NEXT:    s_or_b32 s0, s0, s2
1230; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v4, s1
1231; GFX11-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v5, s0
1232; GFX11-NEXT:    v_mov_b32_e32 v3, 0
1233; GFX11-NEXT:    s_clause 0x1
1234; GFX11-NEXT:    global_store_b8 v[0:1], v4, off
1235; GFX11-NEXT:    global_store_b16 v[2:3], v5, off
1236; GFX11-NEXT:    s_endpgm
1237entry:
1238  %add = add <3 x i8> %arg0, %arg0
1239  store <3 x i8> %add, ptr addrspace(1) null
1240  ret void
1241}
1242
; amdgpu_kernel taking a <5 x i8> argument, adding it to itself and storing
; the result to the null global pointer.
; In the assertions below the argument is fetched with one 64-bit scalar load.
; The first four bytes (low dword) are doubled and repacked exactly as in
; amd_kernel_v4i8 and written with a 32-bit store at address 0; the fifth
; byte (from the high dword) is doubled and written with a byte store at
; address 4.
; As in amd_kernel_v3i8, the SI assertions retarget the buffer base from 4 to
; 0 between the two stores and wait on expcnt(0) before reusing v0, and the
; GFX11 assertions issue both stores under s_clause 0x1.
1243define amdgpu_kernel void @amd_kernel_v5i8(<5 x i8> %arg0) {
1244; SI-LABEL: amd_kernel_v5i8:
1245; SI:       ; %bb.0: ; %entry
1246; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x9
1247; SI-NEXT:    s_mov_b32 s1, 0
1248; SI-NEXT:    s_mov_b32 s0, 4
1249; SI-NEXT:    s_mov_b32 s3, 0xf000
1250; SI-NEXT:    s_mov_b32 s2, -1
1251; SI-NEXT:    s_waitcnt lgkmcnt(0)
1252; SI-NEXT:    s_lshr_b32 s6, s4, 16
1253; SI-NEXT:    s_lshr_b32 s7, s4, 24
1254; SI-NEXT:    s_bfe_u32 s8, s4, 0x80008
1255; SI-NEXT:    s_add_i32 s4, s4, s4
1256; SI-NEXT:    s_add_i32 s5, s5, s5
1257; SI-NEXT:    s_add_i32 s7, s7, s7
1258; SI-NEXT:    s_add_i32 s6, s6, s6
1259; SI-NEXT:    s_and_b32 s4, s4, 0xff
1260; SI-NEXT:    s_add_i32 s8, s8, s8
1261; SI-NEXT:    v_mov_b32_e32 v0, s5
1262; SI-NEXT:    s_lshl_b32 s5, s7, 24
1263; SI-NEXT:    s_and_b32 s6, s6, 0xff
1264; SI-NEXT:    s_lshl_b32 s7, s8, 8
1265; SI-NEXT:    s_lshl_b32 s6, s6, 16
1266; SI-NEXT:    s_or_b32 s4, s4, s7
1267; SI-NEXT:    s_or_b32 s5, s5, s6
1268; SI-NEXT:    s_and_b32 s4, s4, 0xffff
1269; SI-NEXT:    s_or_b32 s4, s4, s5
1270; SI-NEXT:    buffer_store_byte v0, off, s[0:3], 0
1271; SI-NEXT:    s_mov_b32 s0, s1
1272; SI-NEXT:    s_waitcnt expcnt(0)
1273; SI-NEXT:    v_mov_b32_e32 v0, s4
1274; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1275; SI-NEXT:    s_endpgm
1276;
1277; VI-LABEL: amd_kernel_v5i8:
1278; VI:       ; %bb.0: ; %entry
1279; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
1280; VI-NEXT:    v_mov_b32_e32 v0, 4
1281; VI-NEXT:    v_mov_b32_e32 v1, 0
1282; VI-NEXT:    s_waitcnt lgkmcnt(0)
1283; VI-NEXT:    s_lshr_b32 s3, s0, 16
1284; VI-NEXT:    s_lshr_b32 s2, s0, 24
1285; VI-NEXT:    s_add_i32 s3, s3, s3
1286; VI-NEXT:    s_bfe_u32 s4, s0, 0x80008
1287; VI-NEXT:    s_add_i32 s2, s2, s2
1288; VI-NEXT:    s_and_b32 s3, s3, 0xff
1289; VI-NEXT:    s_add_i32 s4, s4, s4
1290; VI-NEXT:    s_add_i32 s0, s0, s0
1291; VI-NEXT:    s_lshl_b32 s2, s2, 24
1292; VI-NEXT:    s_lshl_b32 s3, s3, 16
1293; VI-NEXT:    s_and_b32 s1, s1, 0xff
1294; VI-NEXT:    s_or_b32 s2, s2, s3
1295; VI-NEXT:    s_and_b32 s0, s0, 0xff
1296; VI-NEXT:    s_lshl_b32 s3, s4, 8
1297; VI-NEXT:    s_add_i32 s1, s1, s1
1298; VI-NEXT:    s_or_b32 s0, s0, s3
1299; VI-NEXT:    s_and_b32 s0, s0, 0xffff
1300; VI-NEXT:    v_mov_b32_e32 v2, s1
1301; VI-NEXT:    s_or_b32 s0, s0, s2
1302; VI-NEXT:    flat_store_byte v[0:1], v2
1303; VI-NEXT:    v_mov_b32_e32 v0, 0
1304; VI-NEXT:    v_mov_b32_e32 v1, 0
1305; VI-NEXT:    v_mov_b32_e32 v2, s0
1306; VI-NEXT:    flat_store_dword v[0:1], v2
1307; VI-NEXT:    s_endpgm
1308;
1309; GFX11-LABEL: amd_kernel_v5i8:
1310; GFX11:       ; %bb.0: ; %entry
1311; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
1312; GFX11-NEXT:    v_mov_b32_e32 v0, 4
1313; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1314; GFX11-NEXT:    s_lshr_b32 s2, s0, 16
1315; GFX11-NEXT:    s_lshr_b32 s3, s0, 24
1316; GFX11-NEXT:    s_add_i32 s4, s0, s0
1317; GFX11-NEXT:    s_bfe_u32 s0, s0, 0x80008
1318; GFX11-NEXT:    s_add_i32 s3, s3, s3
1319; GFX11-NEXT:    s_add_i32 s0, s0, s0
1320; GFX11-NEXT:    s_add_i32 s2, s2, s2
1321; GFX11-NEXT:    s_and_b32 s4, s4, 0xff
1322; GFX11-NEXT:    s_lshl_b32 s0, s0, 8
1323; GFX11-NEXT:    s_lshl_b32 s3, s3, 8
1324; GFX11-NEXT:    s_and_b32 s2, s2, 0xff
1325; GFX11-NEXT:    s_or_b32 s0, s4, s0
1326; GFX11-NEXT:    s_or_b32 s2, s2, s3
1327; GFX11-NEXT:    s_and_b32 s0, s0, 0xffff
1328; GFX11-NEXT:    s_lshl_b32 s2, s2, 16
1329; GFX11-NEXT:    s_add_i32 s1, s1, s1
1330; GFX11-NEXT:    s_or_b32 s0, s0, s2
1331; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v4, s1
1332; GFX11-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v5, s0
1333; GFX11-NEXT:    v_mov_b32_e32 v3, 0
1334; GFX11-NEXT:    s_clause 0x1
1335; GFX11-NEXT:    global_store_b8 v[0:1], v4, off
1336; GFX11-NEXT:    global_store_b32 v[2:3], v5, off
1337; GFX11-NEXT:    s_endpgm
1338entry:
1339  %add = add <5 x i8> %arg0, %arg0
1340  store <5 x i8> %add, ptr addrspace(1) null
1341  ret void
1342}
1343
; amdgpu_kernel taking an <8 x i8> argument, adding it to itself and storing
; the result to the null global pointer.
; In the assertions below the argument is fetched with one 64-bit scalar load.
; Each of the two dwords goes through the same per-byte sequence as in
; amd_kernel_v4i8 (split into four bytes, double each, mask, repack), and the
; two repacked dwords are written with a single 64-bit store at address 0.
1344define amdgpu_kernel void @amd_kernel_v8i8(<8 x i8> %arg0) {
1345; SI-LABEL: amd_kernel_v8i8:
1346; SI:       ; %bb.0: ; %entry
1347; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x9
1348; SI-NEXT:    s_mov_b32 s0, 0
1349; SI-NEXT:    s_mov_b32 s3, 0xf000
1350; SI-NEXT:    s_waitcnt lgkmcnt(0)
1351; SI-NEXT:    s_lshr_b32 s1, s4, 16
1352; SI-NEXT:    s_lshr_b32 s2, s4, 24
1353; SI-NEXT:    s_lshr_b32 s6, s5, 16
1354; SI-NEXT:    s_lshr_b32 s7, s5, 24
1355; SI-NEXT:    s_bfe_u32 s8, s4, 0x80008
1356; SI-NEXT:    s_bfe_u32 s9, s5, 0x80008
1357; SI-NEXT:    s_add_i32 s5, s5, s5
1358; SI-NEXT:    s_add_i32 s4, s4, s4
1359; SI-NEXT:    s_add_i32 s7, s7, s7
1360; SI-NEXT:    s_add_i32 s6, s6, s6
1361; SI-NEXT:    s_and_b32 s5, s5, 0xff
1362; SI-NEXT:    s_add_i32 s9, s9, s9
1363; SI-NEXT:    s_add_i32 s2, s2, s2
1364; SI-NEXT:    s_add_i32 s1, s1, s1
1365; SI-NEXT:    s_and_b32 s4, s4, 0xff
1366; SI-NEXT:    s_add_i32 s8, s8, s8
1367; SI-NEXT:    s_lshl_b32 s7, s7, 24
1368; SI-NEXT:    s_and_b32 s6, s6, 0xff
1369; SI-NEXT:    s_lshl_b32 s9, s9, 8
1370; SI-NEXT:    s_lshl_b32 s2, s2, 24
1371; SI-NEXT:    s_and_b32 s1, s1, 0xff
1372; SI-NEXT:    s_lshl_b32 s8, s8, 8
1373; SI-NEXT:    s_lshl_b32 s6, s6, 16
1374; SI-NEXT:    s_or_b32 s5, s5, s9
1375; SI-NEXT:    s_lshl_b32 s1, s1, 16
1376; SI-NEXT:    s_or_b32 s4, s4, s8
1377; SI-NEXT:    s_or_b32 s6, s7, s6
1378; SI-NEXT:    s_and_b32 s5, s5, 0xffff
1379; SI-NEXT:    s_or_b32 s1, s2, s1
1380; SI-NEXT:    s_and_b32 s2, s4, 0xffff
1381; SI-NEXT:    s_or_b32 s4, s5, s6
1382; SI-NEXT:    s_or_b32 s5, s2, s1
1383; SI-NEXT:    s_mov_b32 s2, -1
1384; SI-NEXT:    s_mov_b32 s1, s0
1385; SI-NEXT:    v_mov_b32_e32 v0, s5
1386; SI-NEXT:    v_mov_b32_e32 v1, s4
1387; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1388; SI-NEXT:    s_endpgm
1389;
1390; VI-LABEL: amd_kernel_v8i8:
1391; VI:       ; %bb.0: ; %entry
1392; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
1393; VI-NEXT:    v_mov_b32_e32 v2, 0
1394; VI-NEXT:    v_mov_b32_e32 v3, 0
1395; VI-NEXT:    s_waitcnt lgkmcnt(0)
1396; VI-NEXT:    s_lshr_b32 s3, s1, 16
1397; VI-NEXT:    s_lshr_b32 s2, s1, 24
1398; VI-NEXT:    s_add_i32 s3, s3, s3
1399; VI-NEXT:    s_bfe_u32 s6, s1, 0x80008
1400; VI-NEXT:    s_add_i32 s2, s2, s2
1401; VI-NEXT:    s_and_b32 s3, s3, 0xff
1402; VI-NEXT:    s_add_i32 s6, s6, s6
1403; VI-NEXT:    s_add_i32 s1, s1, s1
1404; VI-NEXT:    s_lshl_b32 s2, s2, 24
1405; VI-NEXT:    s_lshl_b32 s3, s3, 16
1406; VI-NEXT:    s_lshr_b32 s5, s0, 16
1407; VI-NEXT:    s_or_b32 s2, s2, s3
1408; VI-NEXT:    s_and_b32 s1, s1, 0xff
1409; VI-NEXT:    s_lshl_b32 s3, s6, 8
1410; VI-NEXT:    s_lshr_b32 s4, s0, 24
1411; VI-NEXT:    s_add_i32 s5, s5, s5
1412; VI-NEXT:    s_or_b32 s1, s1, s3
1413; VI-NEXT:    s_bfe_u32 s7, s0, 0x80008
1414; VI-NEXT:    s_add_i32 s4, s4, s4
1415; VI-NEXT:    s_and_b32 s1, s1, 0xffff
1416; VI-NEXT:    s_and_b32 s3, s5, 0xff
1417; VI-NEXT:    s_add_i32 s7, s7, s7
1418; VI-NEXT:    s_add_i32 s0, s0, s0
1419; VI-NEXT:    s_or_b32 s1, s1, s2
1420; VI-NEXT:    s_lshl_b32 s2, s4, 24
1421; VI-NEXT:    s_lshl_b32 s3, s3, 16
1422; VI-NEXT:    s_or_b32 s2, s2, s3
1423; VI-NEXT:    s_and_b32 s0, s0, 0xff
1424; VI-NEXT:    s_lshl_b32 s3, s7, 8
1425; VI-NEXT:    s_or_b32 s0, s0, s3
1426; VI-NEXT:    s_and_b32 s0, s0, 0xffff
1427; VI-NEXT:    s_or_b32 s0, s0, s2
1428; VI-NEXT:    v_mov_b32_e32 v0, s0
1429; VI-NEXT:    v_mov_b32_e32 v1, s1
1430; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
1431; VI-NEXT:    s_endpgm
1432;
1433; GFX11-LABEL: amd_kernel_v8i8:
1434; GFX11:       ; %bb.0: ; %entry
1435; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
1436; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1437; GFX11-NEXT:    s_lshr_b32 s2, s0, 16
1438; GFX11-NEXT:    s_lshr_b32 s3, s0, 24
1439; GFX11-NEXT:    s_lshr_b32 s4, s1, 16
1440; GFX11-NEXT:    s_lshr_b32 s5, s1, 24
1441; GFX11-NEXT:    s_bfe_u32 s6, s0, 0x80008
1442; GFX11-NEXT:    s_bfe_u32 s7, s1, 0x80008
1443; GFX11-NEXT:    s_add_i32 s1, s1, s1
1444; GFX11-NEXT:    s_add_i32 s0, s0, s0
1445; GFX11-NEXT:    s_add_i32 s7, s7, s7
1446; GFX11-NEXT:    s_add_i32 s5, s5, s5
1447; GFX11-NEXT:    s_add_i32 s4, s4, s4
1448; GFX11-NEXT:    s_add_i32 s6, s6, s6
1449; GFX11-NEXT:    s_add_i32 s3, s3, s3
1450; GFX11-NEXT:    s_add_i32 s2, s2, s2
1451; GFX11-NEXT:    s_and_b32 s1, s1, 0xff
1452; GFX11-NEXT:    s_and_b32 s0, s0, 0xff
1453; GFX11-NEXT:    s_lshl_b32 s7, s7, 8
1454; GFX11-NEXT:    s_lshl_b32 s5, s5, 8
1455; GFX11-NEXT:    s_and_b32 s4, s4, 0xff
1456; GFX11-NEXT:    s_lshl_b32 s6, s6, 8
1457; GFX11-NEXT:    s_lshl_b32 s3, s3, 8
1458; GFX11-NEXT:    s_and_b32 s2, s2, 0xff
1459; GFX11-NEXT:    s_or_b32 s1, s1, s7
1460; GFX11-NEXT:    s_or_b32 s4, s4, s5
1461; GFX11-NEXT:    s_or_b32 s0, s0, s6
1462; GFX11-NEXT:    s_or_b32 s2, s2, s3
1463; GFX11-NEXT:    s_and_b32 s1, s1, 0xffff
1464; GFX11-NEXT:    s_and_b32 s0, s0, 0xffff
1465; GFX11-NEXT:    s_lshl_b32 s2, s2, 16
1466; GFX11-NEXT:    s_lshl_b32 s3, s4, 16
1467; GFX11-NEXT:    s_or_b32 s0, s0, s2
1468; GFX11-NEXT:    s_or_b32 s1, s1, s3
1469; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
1470; GFX11-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1
1471; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, 0
1472; GFX11-NEXT:    global_store_b64 v[2:3], v[0:1], off
1473; GFX11-NEXT:    s_endpgm
1474entry:
1475  %add = add <8 x i8> %arg0, %arg0
1476  store <8 x i8> %add, ptr addrspace(1) null
1477  ret void
1478}
1479
; amdgpu_kernel taking a <16 x i8> argument, adding it to itself and storing
; the result to the null global pointer.
; In the assertions below the argument is fetched with one 128-bit scalar load
; into s[0:3]. Each of the four dwords is split into bytes, every byte is
; doubled, and the bytes are masked and repacked (the same per-dword pattern
; as amd_kernel_v4i8). The four repacked dwords are copied to v[0:3] and
; written with a single 128-bit store at address 0.
1480define amdgpu_kernel void @amd_kernel_v16i8(<16 x i8> %arg0) {
1481; SI-LABEL: amd_kernel_v16i8:
1482; SI:       ; %bb.0: ; %entry
1483; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
1484; SI-NEXT:    s_mov_b32 s4, 0
1485; SI-NEXT:    s_mov_b32 s7, 0xf000
1486; SI-NEXT:    s_waitcnt lgkmcnt(0)
1487; SI-NEXT:    s_lshr_b32 s5, s0, 16
1488; SI-NEXT:    s_lshr_b32 s6, s0, 24
1489; SI-NEXT:    s_lshr_b32 s8, s1, 16
1490; SI-NEXT:    s_lshr_b32 s9, s1, 24
1491; SI-NEXT:    s_lshr_b32 s10, s2, 16
1492; SI-NEXT:    s_lshr_b32 s11, s2, 24
1493; SI-NEXT:    s_lshr_b32 s12, s3, 16
1494; SI-NEXT:    s_lshr_b32 s13, s3, 24
1495; SI-NEXT:    s_bfe_u32 s14, s0, 0x80008
1496; SI-NEXT:    s_bfe_u32 s15, s1, 0x80008
1497; SI-NEXT:    s_bfe_u32 s16, s2, 0x80008
1498; SI-NEXT:    s_bfe_u32 s17, s3, 0x80008
1499; SI-NEXT:    s_add_i32 s3, s3, s3
1500; SI-NEXT:    s_add_i32 s2, s2, s2
1501; SI-NEXT:    s_add_i32 s1, s1, s1
1502; SI-NEXT:    s_add_i32 s0, s0, s0
1503; SI-NEXT:    s_add_i32 s13, s13, s13
1504; SI-NEXT:    s_add_i32 s12, s12, s12
1505; SI-NEXT:    s_and_b32 s3, s3, 0xff
1506; SI-NEXT:    s_add_i32 s17, s17, s17
1507; SI-NEXT:    s_add_i32 s11, s11, s11
1508; SI-NEXT:    s_add_i32 s10, s10, s10
1509; SI-NEXT:    s_and_b32 s2, s2, 0xff
1510; SI-NEXT:    s_add_i32 s16, s16, s16
1511; SI-NEXT:    s_add_i32 s9, s9, s9
1512; SI-NEXT:    s_add_i32 s8, s8, s8
1513; SI-NEXT:    s_and_b32 s1, s1, 0xff
1514; SI-NEXT:    s_add_i32 s15, s15, s15
1515; SI-NEXT:    s_add_i32 s6, s6, s6
1516; SI-NEXT:    s_add_i32 s5, s5, s5
1517; SI-NEXT:    s_and_b32 s0, s0, 0xff
1518; SI-NEXT:    s_add_i32 s14, s14, s14
1519; SI-NEXT:    s_lshl_b32 s13, s13, 24
1520; SI-NEXT:    s_and_b32 s12, s12, 0xff
1521; SI-NEXT:    s_lshl_b32 s17, s17, 8
1522; SI-NEXT:    s_lshl_b32 s11, s11, 24
1523; SI-NEXT:    s_and_b32 s10, s10, 0xff
1524; SI-NEXT:    s_lshl_b32 s16, s16, 8
1525; SI-NEXT:    s_lshl_b32 s9, s9, 24
1526; SI-NEXT:    s_and_b32 s8, s8, 0xff
1527; SI-NEXT:    s_lshl_b32 s15, s15, 8
1528; SI-NEXT:    s_lshl_b32 s6, s6, 24
1529; SI-NEXT:    s_and_b32 s5, s5, 0xff
1530; SI-NEXT:    s_lshl_b32 s14, s14, 8
1531; SI-NEXT:    s_lshl_b32 s12, s12, 16
1532; SI-NEXT:    s_or_b32 s3, s3, s17
1533; SI-NEXT:    s_lshl_b32 s10, s10, 16
1534; SI-NEXT:    s_or_b32 s2, s2, s16
1535; SI-NEXT:    s_lshl_b32 s8, s8, 16
1536; SI-NEXT:    s_or_b32 s1, s1, s15
1537; SI-NEXT:    s_lshl_b32 s5, s5, 16
1538; SI-NEXT:    s_or_b32 s0, s0, s14
1539; SI-NEXT:    s_or_b32 s12, s13, s12
1540; SI-NEXT:    s_and_b32 s3, s3, 0xffff
1541; SI-NEXT:    s_or_b32 s10, s11, s10
1542; SI-NEXT:    s_and_b32 s2, s2, 0xffff
1543; SI-NEXT:    s_or_b32 s8, s9, s8
1544; SI-NEXT:    s_and_b32 s1, s1, 0xffff
1545; SI-NEXT:    s_or_b32 s5, s6, s5
1546; SI-NEXT:    s_and_b32 s0, s0, 0xffff
1547; SI-NEXT:    s_or_b32 s3, s3, s12
1548; SI-NEXT:    s_or_b32 s2, s2, s10
1549; SI-NEXT:    s_or_b32 s1, s1, s8
1550; SI-NEXT:    s_or_b32 s0, s0, s5
1551; SI-NEXT:    s_mov_b32 s6, -1
1552; SI-NEXT:    s_mov_b32 s5, s4
1553; SI-NEXT:    v_mov_b32_e32 v0, s0
1554; SI-NEXT:    v_mov_b32_e32 v1, s1
1555; SI-NEXT:    v_mov_b32_e32 v2, s2
1556; SI-NEXT:    v_mov_b32_e32 v3, s3
1557; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
1558; SI-NEXT:    s_endpgm
1559;
1560; VI-LABEL: amd_kernel_v16i8:
1561; VI:       ; %bb.0: ; %entry
1562; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1563; VI-NEXT:    v_mov_b32_e32 v4, 0
1564; VI-NEXT:    v_mov_b32_e32 v5, 0
1565; VI-NEXT:    s_waitcnt lgkmcnt(0)
1566; VI-NEXT:    s_lshr_b32 s5, s3, 16
1567; VI-NEXT:    s_lshr_b32 s4, s3, 24
1568; VI-NEXT:    s_add_i32 s5, s5, s5
1569; VI-NEXT:    s_bfe_u32 s12, s3, 0x80008
1570; VI-NEXT:    s_add_i32 s4, s4, s4
1571; VI-NEXT:    s_and_b32 s5, s5, 0xff
1572; VI-NEXT:    s_add_i32 s12, s12, s12
1573; VI-NEXT:    s_add_i32 s3, s3, s3
1574; VI-NEXT:    s_lshl_b32 s4, s4, 24
1575; VI-NEXT:    s_lshl_b32 s5, s5, 16
1576; VI-NEXT:    s_lshr_b32 s7, s2, 16
1577; VI-NEXT:    s_or_b32 s4, s4, s5
1578; VI-NEXT:    s_and_b32 s3, s3, 0xff
1579; VI-NEXT:    s_lshl_b32 s5, s12, 8
1580; VI-NEXT:    s_lshr_b32 s6, s2, 24
1581; VI-NEXT:    s_add_i32 s7, s7, s7
1582; VI-NEXT:    s_or_b32 s3, s3, s5
1583; VI-NEXT:    s_bfe_u32 s13, s2, 0x80008
1584; VI-NEXT:    s_add_i32 s6, s6, s6
1585; VI-NEXT:    s_and_b32 s3, s3, 0xffff
1586; VI-NEXT:    s_and_b32 s5, s7, 0xff
1587; VI-NEXT:    s_add_i32 s13, s13, s13
1588; VI-NEXT:    s_add_i32 s2, s2, s2
1589; VI-NEXT:    s_or_b32 s3, s3, s4
1590; VI-NEXT:    s_lshl_b32 s4, s6, 24
1591; VI-NEXT:    s_lshl_b32 s5, s5, 16
1592; VI-NEXT:    s_lshr_b32 s9, s1, 16
1593; VI-NEXT:    s_or_b32 s4, s4, s5
1594; VI-NEXT:    s_and_b32 s2, s2, 0xff
1595; VI-NEXT:    s_lshl_b32 s5, s13, 8
1596; VI-NEXT:    s_lshr_b32 s8, s1, 24
1597; VI-NEXT:    s_add_i32 s9, s9, s9
1598; VI-NEXT:    s_or_b32 s2, s2, s5
1599; VI-NEXT:    s_bfe_u32 s14, s1, 0x80008
1600; VI-NEXT:    s_add_i32 s8, s8, s8
1601; VI-NEXT:    s_and_b32 s2, s2, 0xffff
1602; VI-NEXT:    s_and_b32 s5, s9, 0xff
1603; VI-NEXT:    s_add_i32 s14, s14, s14
1604; VI-NEXT:    s_add_i32 s1, s1, s1
1605; VI-NEXT:    s_or_b32 s2, s2, s4
1606; VI-NEXT:    s_lshl_b32 s4, s8, 24
1607; VI-NEXT:    s_lshl_b32 s5, s5, 16
1608; VI-NEXT:    s_lshr_b32 s11, s0, 16
1609; VI-NEXT:    s_or_b32 s4, s4, s5
1610; VI-NEXT:    s_and_b32 s1, s1, 0xff
1611; VI-NEXT:    s_lshl_b32 s5, s14, 8
1612; VI-NEXT:    s_lshr_b32 s10, s0, 24
1613; VI-NEXT:    s_add_i32 s11, s11, s11
1614; VI-NEXT:    s_or_b32 s1, s1, s5
1615; VI-NEXT:    s_bfe_u32 s15, s0, 0x80008
1616; VI-NEXT:    s_add_i32 s10, s10, s10
1617; VI-NEXT:    s_and_b32 s1, s1, 0xffff
1618; VI-NEXT:    s_and_b32 s5, s11, 0xff
1619; VI-NEXT:    s_add_i32 s15, s15, s15
1620; VI-NEXT:    s_add_i32 s0, s0, s0
1621; VI-NEXT:    s_or_b32 s1, s1, s4
1622; VI-NEXT:    s_lshl_b32 s4, s10, 24
1623; VI-NEXT:    s_lshl_b32 s5, s5, 16
1624; VI-NEXT:    s_or_b32 s4, s4, s5
1625; VI-NEXT:    s_and_b32 s0, s0, 0xff
1626; VI-NEXT:    s_lshl_b32 s5, s15, 8
1627; VI-NEXT:    s_or_b32 s0, s0, s5
1628; VI-NEXT:    s_and_b32 s0, s0, 0xffff
1629; VI-NEXT:    s_or_b32 s0, s0, s4
1630; VI-NEXT:    v_mov_b32_e32 v0, s0
1631; VI-NEXT:    v_mov_b32_e32 v1, s1
1632; VI-NEXT:    v_mov_b32_e32 v2, s2
1633; VI-NEXT:    v_mov_b32_e32 v3, s3
1634; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
1635; VI-NEXT:    s_endpgm
1636;
1637; GFX11-LABEL: amd_kernel_v16i8:
1638; GFX11:       ; %bb.0: ; %entry
1639; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
1640; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1641; GFX11-NEXT:    s_lshr_b32 s6, s1, 16
1642; GFX11-NEXT:    s_lshr_b32 s7, s1, 24
1643; GFX11-NEXT:    s_lshr_b32 s8, s2, 16
1644; GFX11-NEXT:    s_lshr_b32 s9, s2, 24
1645; GFX11-NEXT:    s_lshr_b32 s10, s3, 16
1646; GFX11-NEXT:    s_lshr_b32 s11, s3, 24
1647; GFX11-NEXT:    s_lshr_b32 s4, s0, 16
1648; GFX11-NEXT:    s_lshr_b32 s5, s0, 24
1649; GFX11-NEXT:    s_bfe_u32 s12, s0, 0x80008
1650; GFX11-NEXT:    s_bfe_u32 s13, s1, 0x80008
1651; GFX11-NEXT:    s_bfe_u32 s14, s2, 0x80008
1652; GFX11-NEXT:    s_bfe_u32 s15, s3, 0x80008
1653; GFX11-NEXT:    s_add_i32 s11, s11, s11
1654; GFX11-NEXT:    s_add_i32 s10, s10, s10
1655; GFX11-NEXT:    s_add_i32 s9, s9, s9
1656; GFX11-NEXT:    s_add_i32 s8, s8, s8
1657; GFX11-NEXT:    s_add_i32 s7, s7, s7
1658; GFX11-NEXT:    s_add_i32 s6, s6, s6
1659; GFX11-NEXT:    s_add_i32 s3, s3, s3
1660; GFX11-NEXT:    s_add_i32 s2, s2, s2
1661; GFX11-NEXT:    s_add_i32 s15, s15, s15
1662; GFX11-NEXT:    s_add_i32 s14, s14, s14
1663; GFX11-NEXT:    s_lshl_b32 s11, s11, 8
1664; GFX11-NEXT:    s_and_b32 s10, s10, 0xff
1665; GFX11-NEXT:    s_lshl_b32 s9, s9, 8
1666; GFX11-NEXT:    s_and_b32 s8, s8, 0xff
1667; GFX11-NEXT:    s_add_i32 s1, s1, s1
1668; GFX11-NEXT:    s_add_i32 s13, s13, s13
1669; GFX11-NEXT:    s_lshl_b32 s7, s7, 8
1670; GFX11-NEXT:    s_and_b32 s6, s6, 0xff
1671; GFX11-NEXT:    s_add_i32 s0, s0, s0
1672; GFX11-NEXT:    s_add_i32 s12, s12, s12
1673; GFX11-NEXT:    s_add_i32 s5, s5, s5
1674; GFX11-NEXT:    s_add_i32 s4, s4, s4
1675; GFX11-NEXT:    s_and_b32 s3, s3, 0xff
1676; GFX11-NEXT:    s_and_b32 s2, s2, 0xff
1677; GFX11-NEXT:    s_lshl_b32 s15, s15, 8
1678; GFX11-NEXT:    s_or_b32 s10, s10, s11
1679; GFX11-NEXT:    s_lshl_b32 s11, s14, 8
1680; GFX11-NEXT:    s_or_b32 s8, s8, s9
1681; GFX11-NEXT:    s_and_b32 s1, s1, 0xff
1682; GFX11-NEXT:    s_lshl_b32 s9, s13, 8
1683; GFX11-NEXT:    s_or_b32 s6, s6, s7
1684; GFX11-NEXT:    s_and_b32 s0, s0, 0xff
1685; GFX11-NEXT:    s_lshl_b32 s7, s12, 8
1686; GFX11-NEXT:    s_lshl_b32 s5, s5, 8
1687; GFX11-NEXT:    s_and_b32 s4, s4, 0xff
1688; GFX11-NEXT:    s_or_b32 s3, s3, s15
1689; GFX11-NEXT:    s_or_b32 s2, s2, s11
1690; GFX11-NEXT:    s_or_b32 s1, s1, s9
1691; GFX11-NEXT:    s_or_b32 s0, s0, s7
1692; GFX11-NEXT:    s_or_b32 s4, s4, s5
1693; GFX11-NEXT:    s_and_b32 s3, s3, 0xffff
1694; GFX11-NEXT:    s_lshl_b32 s10, s10, 16
1695; GFX11-NEXT:    s_and_b32 s2, s2, 0xffff
1696; GFX11-NEXT:    s_lshl_b32 s8, s8, 16
1697; GFX11-NEXT:    s_and_b32 s1, s1, 0xffff
1698; GFX11-NEXT:    s_and_b32 s0, s0, 0xffff
1699; GFX11-NEXT:    s_lshl_b32 s4, s4, 16
1700; GFX11-NEXT:    s_lshl_b32 s5, s6, 16
1701; GFX11-NEXT:    s_or_b32 s3, s3, s10
1702; GFX11-NEXT:    s_or_b32 s2, s2, s8
1703; GFX11-NEXT:    s_or_b32 s0, s0, s4
1704; GFX11-NEXT:    s_or_b32 s1, s1, s5
1705; GFX11-NEXT:    v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v3, s3
1706; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
1707; GFX11-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, 0
1708; GFX11-NEXT:    global_store_b128 v[4:5], v[0:3], off
1709; GFX11-NEXT:    s_endpgm
1710entry:
1711  %add = add <16 x i8> %arg0, %arg0
1712  store <16 x i8> %add, ptr addrspace(1) null
1713  ret void
1714}
1715
; <32 x i8> kernel argument: the value is added to itself and the 32-byte
; result is stored to the null global (addrspace(1)) pointer.
;
; What the checks below pin down on every tested subtarget:
;  - the whole argument arrives through one wide scalar load (dwordx8 / b256)
;    based on s[4:5];
;  - each byte is isolated with s_lshr_b32 / s_bfe_u32, doubled with
;    s_add_i32, masked where needed, and re-packed into dwords with shifts
;    and ors;
;  - the result is written with two 16-byte stores: first the dwords built
;    from s4-s7 at address 16, then the dwords built from s0-s3 at address 0.
1716define amdgpu_kernel void @amd_kernel_v32i8(<32 x i8> %arg0) {
1717; SI-LABEL: amd_kernel_v32i8:
1718; SI:       ; %bb.0: ; %entry
1719; SI-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x9
1720; SI-NEXT:    s_mov_b32 s9, 0
1721; SI-NEXT:    s_mov_b32 s8, 16
1722; SI-NEXT:    s_mov_b32 s11, 0xf000
1723; SI-NEXT:    s_mov_b32 s10, -1
1724; SI-NEXT:    s_waitcnt lgkmcnt(0)
1725; SI-NEXT:    s_lshr_b32 s12, s4, 16
1726; SI-NEXT:    s_lshr_b32 s13, s4, 24
1727; SI-NEXT:    s_lshr_b32 s14, s5, 16
1728; SI-NEXT:    s_lshr_b32 s15, s5, 24
1729; SI-NEXT:    s_lshr_b32 s16, s6, 16
1730; SI-NEXT:    s_lshr_b32 s17, s6, 24
1731; SI-NEXT:    s_lshr_b32 s18, s7, 16
1732; SI-NEXT:    s_lshr_b32 s19, s7, 24
1733; SI-NEXT:    s_lshr_b32 s20, s0, 16
1734; SI-NEXT:    s_lshr_b32 s21, s0, 24
1735; SI-NEXT:    s_lshr_b32 s22, s1, 16
1736; SI-NEXT:    s_lshr_b32 s23, s1, 24
1737; SI-NEXT:    s_lshr_b32 s24, s2, 16
1738; SI-NEXT:    s_lshr_b32 s25, s2, 24
1739; SI-NEXT:    s_lshr_b32 s26, s3, 16
1740; SI-NEXT:    s_lshr_b32 s27, s3, 24
1741; SI-NEXT:    s_bfe_u32 s28, s4, 0x80008
1742; SI-NEXT:    s_bfe_u32 s29, s5, 0x80008
1743; SI-NEXT:    s_bfe_u32 s30, s6, 0x80008
1744; SI-NEXT:    s_bfe_u32 s31, s7, 0x80008
1745; SI-NEXT:    s_bfe_u32 s33, s0, 0x80008
1746; SI-NEXT:    s_bfe_u32 s34, s1, 0x80008
1747; SI-NEXT:    s_bfe_u32 s35, s2, 0x80008
1748; SI-NEXT:    s_bfe_u32 s36, s3, 0x80008
1749; SI-NEXT:    s_add_i32 s3, s3, s3
1750; SI-NEXT:    s_add_i32 s2, s2, s2
1751; SI-NEXT:    s_add_i32 s1, s1, s1
1752; SI-NEXT:    s_add_i32 s0, s0, s0
1753; SI-NEXT:    s_add_i32 s7, s7, s7
1754; SI-NEXT:    s_add_i32 s6, s6, s6
1755; SI-NEXT:    s_add_i32 s5, s5, s5
1756; SI-NEXT:    s_add_i32 s4, s4, s4
1757; SI-NEXT:    s_add_i32 s27, s27, s27
1758; SI-NEXT:    s_add_i32 s26, s26, s26
1759; SI-NEXT:    s_and_b32 s3, s3, 0xff
1760; SI-NEXT:    s_add_i32 s36, s36, s36
1761; SI-NEXT:    s_add_i32 s25, s25, s25
1762; SI-NEXT:    s_add_i32 s24, s24, s24
1763; SI-NEXT:    s_and_b32 s2, s2, 0xff
1764; SI-NEXT:    s_add_i32 s35, s35, s35
1765; SI-NEXT:    s_add_i32 s23, s23, s23
1766; SI-NEXT:    s_add_i32 s22, s22, s22
1767; SI-NEXT:    s_and_b32 s1, s1, 0xff
1768; SI-NEXT:    s_add_i32 s34, s34, s34
1769; SI-NEXT:    s_add_i32 s21, s21, s21
1770; SI-NEXT:    s_add_i32 s20, s20, s20
1771; SI-NEXT:    s_and_b32 s0, s0, 0xff
1772; SI-NEXT:    s_add_i32 s33, s33, s33
1773; SI-NEXT:    s_add_i32 s19, s19, s19
1774; SI-NEXT:    s_add_i32 s18, s18, s18
1775; SI-NEXT:    s_and_b32 s7, s7, 0xff
1776; SI-NEXT:    s_add_i32 s31, s31, s31
1777; SI-NEXT:    s_add_i32 s17, s17, s17
1778; SI-NEXT:    s_add_i32 s16, s16, s16
1779; SI-NEXT:    s_and_b32 s6, s6, 0xff
1780; SI-NEXT:    s_add_i32 s30, s30, s30
1781; SI-NEXT:    s_add_i32 s15, s15, s15
1782; SI-NEXT:    s_add_i32 s14, s14, s14
1783; SI-NEXT:    s_and_b32 s5, s5, 0xff
1784; SI-NEXT:    s_add_i32 s29, s29, s29
1785; SI-NEXT:    s_add_i32 s13, s13, s13
1786; SI-NEXT:    s_add_i32 s12, s12, s12
1787; SI-NEXT:    s_and_b32 s4, s4, 0xff
1788; SI-NEXT:    s_add_i32 s28, s28, s28
1789; SI-NEXT:    s_lshl_b32 s27, s27, 24
1790; SI-NEXT:    s_and_b32 s26, s26, 0xff
1791; SI-NEXT:    s_lshl_b32 s36, s36, 8
1792; SI-NEXT:    s_lshl_b32 s25, s25, 24
1793; SI-NEXT:    s_and_b32 s24, s24, 0xff
1794; SI-NEXT:    s_lshl_b32 s35, s35, 8
1795; SI-NEXT:    s_lshl_b32 s23, s23, 24
1796; SI-NEXT:    s_and_b32 s22, s22, 0xff
1797; SI-NEXT:    s_lshl_b32 s34, s34, 8
1798; SI-NEXT:    s_lshl_b32 s21, s21, 24
1799; SI-NEXT:    s_and_b32 s20, s20, 0xff
1800; SI-NEXT:    s_lshl_b32 s33, s33, 8
1801; SI-NEXT:    s_lshl_b32 s19, s19, 24
1802; SI-NEXT:    s_and_b32 s18, s18, 0xff
1803; SI-NEXT:    s_lshl_b32 s31, s31, 8
1804; SI-NEXT:    s_lshl_b32 s17, s17, 24
1805; SI-NEXT:    s_and_b32 s16, s16, 0xff
1806; SI-NEXT:    s_lshl_b32 s30, s30, 8
1807; SI-NEXT:    s_lshl_b32 s15, s15, 24
1808; SI-NEXT:    s_and_b32 s14, s14, 0xff
1809; SI-NEXT:    s_lshl_b32 s29, s29, 8
1810; SI-NEXT:    s_lshl_b32 s13, s13, 24
1811; SI-NEXT:    s_and_b32 s12, s12, 0xff
1812; SI-NEXT:    s_lshl_b32 s28, s28, 8
1813; SI-NEXT:    s_lshl_b32 s26, s26, 16
1814; SI-NEXT:    s_or_b32 s3, s3, s36
1815; SI-NEXT:    s_lshl_b32 s24, s24, 16
1816; SI-NEXT:    s_or_b32 s2, s2, s35
1817; SI-NEXT:    s_lshl_b32 s22, s22, 16
1818; SI-NEXT:    s_or_b32 s1, s1, s34
1819; SI-NEXT:    s_lshl_b32 s20, s20, 16
1820; SI-NEXT:    s_or_b32 s0, s0, s33
1821; SI-NEXT:    s_lshl_b32 s18, s18, 16
1822; SI-NEXT:    s_or_b32 s7, s7, s31
1823; SI-NEXT:    s_lshl_b32 s16, s16, 16
1824; SI-NEXT:    s_or_b32 s6, s6, s30
1825; SI-NEXT:    s_lshl_b32 s14, s14, 16
1826; SI-NEXT:    s_or_b32 s5, s5, s29
1827; SI-NEXT:    s_lshl_b32 s12, s12, 16
1828; SI-NEXT:    s_or_b32 s4, s4, s28
1829; SI-NEXT:    s_or_b32 s26, s27, s26
1830; SI-NEXT:    s_and_b32 s3, s3, 0xffff
1831; SI-NEXT:    s_or_b32 s24, s25, s24
1832; SI-NEXT:    s_and_b32 s2, s2, 0xffff
1833; SI-NEXT:    s_or_b32 s22, s23, s22
1834; SI-NEXT:    s_and_b32 s1, s1, 0xffff
1835; SI-NEXT:    s_or_b32 s20, s21, s20
1836; SI-NEXT:    s_and_b32 s0, s0, 0xffff
1837; SI-NEXT:    s_or_b32 s18, s19, s18
1838; SI-NEXT:    s_and_b32 s7, s7, 0xffff
1839; SI-NEXT:    s_or_b32 s16, s17, s16
1840; SI-NEXT:    s_and_b32 s6, s6, 0xffff
1841; SI-NEXT:    s_or_b32 s14, s15, s14
1842; SI-NEXT:    s_and_b32 s5, s5, 0xffff
1843; SI-NEXT:    s_or_b32 s12, s13, s12
1844; SI-NEXT:    s_and_b32 s4, s4, 0xffff
1845; SI-NEXT:    s_or_b32 s3, s3, s26
1846; SI-NEXT:    s_or_b32 s2, s2, s24
1847; SI-NEXT:    s_or_b32 s1, s1, s22
1848; SI-NEXT:    s_or_b32 s7, s7, s18
1849; SI-NEXT:    s_or_b32 s6, s6, s16
1850; SI-NEXT:    s_or_b32 s5, s5, s14
1851; SI-NEXT:    s_or_b32 s4, s4, s12
1852; SI-NEXT:    s_or_b32 s0, s0, s20
1853; SI-NEXT:    v_mov_b32_e32 v0, s4
1854; SI-NEXT:    v_mov_b32_e32 v1, s5
1855; SI-NEXT:    v_mov_b32_e32 v2, s6
1856; SI-NEXT:    v_mov_b32_e32 v3, s7
1857; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0
1858; SI-NEXT:    s_waitcnt expcnt(0)
1859; SI-NEXT:    v_mov_b32_e32 v0, s0
1860; SI-NEXT:    v_mov_b32_e32 v1, s1
1861; SI-NEXT:    v_mov_b32_e32 v2, s2
1862; SI-NEXT:    v_mov_b32_e32 v3, s3
1863; SI-NEXT:    s_mov_b32 s8, s9
1864; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0
1865; SI-NEXT:    s_endpgm
1866;
1867; VI-LABEL: amd_kernel_v32i8:
1868; VI:       ; %bb.0: ; %entry
1869; VI-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x24
1870; VI-NEXT:    v_mov_b32_e32 v4, 16
1871; VI-NEXT:    v_mov_b32_e32 v5, 0
1872; VI-NEXT:    s_waitcnt lgkmcnt(0)
1873; VI-NEXT:    s_lshr_b32 s9, s3, 16
1874; VI-NEXT:    s_lshr_b32 s8, s3, 24
1875; VI-NEXT:    s_add_i32 s9, s9, s9
1876; VI-NEXT:    s_bfe_u32 s24, s3, 0x80008
1877; VI-NEXT:    s_add_i32 s8, s8, s8
1878; VI-NEXT:    s_and_b32 s9, s9, 0xff
1879; VI-NEXT:    s_add_i32 s24, s24, s24
1880; VI-NEXT:    s_add_i32 s3, s3, s3
1881; VI-NEXT:    s_lshl_b32 s8, s8, 24
1882; VI-NEXT:    s_lshl_b32 s9, s9, 16
1883; VI-NEXT:    s_lshr_b32 s11, s2, 16
1884; VI-NEXT:    s_or_b32 s8, s8, s9
1885; VI-NEXT:    s_and_b32 s3, s3, 0xff
1886; VI-NEXT:    s_lshl_b32 s9, s24, 8
1887; VI-NEXT:    s_lshr_b32 s10, s2, 24
1888; VI-NEXT:    s_add_i32 s11, s11, s11
1889; VI-NEXT:    s_or_b32 s3, s3, s9
1890; VI-NEXT:    s_bfe_u32 s25, s2, 0x80008
1891; VI-NEXT:    s_add_i32 s10, s10, s10
1892; VI-NEXT:    s_and_b32 s3, s3, 0xffff
1893; VI-NEXT:    s_and_b32 s9, s11, 0xff
1894; VI-NEXT:    s_add_i32 s25, s25, s25
1895; VI-NEXT:    s_add_i32 s2, s2, s2
1896; VI-NEXT:    s_or_b32 s3, s3, s8
1897; VI-NEXT:    s_lshl_b32 s8, s10, 24
1898; VI-NEXT:    s_lshl_b32 s9, s9, 16
1899; VI-NEXT:    s_lshr_b32 s13, s1, 16
1900; VI-NEXT:    s_or_b32 s8, s8, s9
1901; VI-NEXT:    s_and_b32 s2, s2, 0xff
1902; VI-NEXT:    s_lshl_b32 s9, s25, 8
1903; VI-NEXT:    s_lshr_b32 s12, s1, 24
1904; VI-NEXT:    s_add_i32 s13, s13, s13
1905; VI-NEXT:    s_or_b32 s2, s2, s9
1906; VI-NEXT:    s_bfe_u32 s26, s1, 0x80008
1907; VI-NEXT:    s_add_i32 s12, s12, s12
1908; VI-NEXT:    s_and_b32 s2, s2, 0xffff
1909; VI-NEXT:    s_and_b32 s9, s13, 0xff
1910; VI-NEXT:    s_add_i32 s26, s26, s26
1911; VI-NEXT:    s_add_i32 s1, s1, s1
1912; VI-NEXT:    s_or_b32 s2, s2, s8
1913; VI-NEXT:    s_lshl_b32 s8, s12, 24
1914; VI-NEXT:    s_lshl_b32 s9, s9, 16
1915; VI-NEXT:    s_lshr_b32 s15, s0, 16
1916; VI-NEXT:    s_or_b32 s8, s8, s9
1917; VI-NEXT:    s_and_b32 s1, s1, 0xff
1918; VI-NEXT:    s_lshl_b32 s9, s26, 8
1919; VI-NEXT:    s_lshr_b32 s14, s0, 24
1920; VI-NEXT:    s_add_i32 s15, s15, s15
1921; VI-NEXT:    s_or_b32 s1, s1, s9
1922; VI-NEXT:    s_bfe_u32 s27, s0, 0x80008
1923; VI-NEXT:    s_add_i32 s14, s14, s14
1924; VI-NEXT:    s_and_b32 s1, s1, 0xffff
1925; VI-NEXT:    s_and_b32 s9, s15, 0xff
1926; VI-NEXT:    s_add_i32 s27, s27, s27
1927; VI-NEXT:    s_add_i32 s0, s0, s0
1928; VI-NEXT:    s_or_b32 s1, s1, s8
1929; VI-NEXT:    s_lshl_b32 s8, s14, 24
1930; VI-NEXT:    s_lshl_b32 s9, s9, 16
1931; VI-NEXT:    s_lshr_b32 s17, s7, 16
1932; VI-NEXT:    s_or_b32 s8, s8, s9
1933; VI-NEXT:    s_and_b32 s0, s0, 0xff
1934; VI-NEXT:    s_lshl_b32 s9, s27, 8
1935; VI-NEXT:    s_lshr_b32 s16, s7, 24
1936; VI-NEXT:    s_add_i32 s17, s17, s17
1937; VI-NEXT:    s_or_b32 s0, s0, s9
1938; VI-NEXT:    s_bfe_u32 s28, s7, 0x80008
1939; VI-NEXT:    s_add_i32 s16, s16, s16
1940; VI-NEXT:    s_and_b32 s0, s0, 0xffff
1941; VI-NEXT:    s_and_b32 s9, s17, 0xff
1942; VI-NEXT:    s_add_i32 s28, s28, s28
1943; VI-NEXT:    s_add_i32 s7, s7, s7
1944; VI-NEXT:    s_or_b32 s0, s0, s8
1945; VI-NEXT:    s_lshl_b32 s8, s16, 24
1946; VI-NEXT:    s_lshl_b32 s9, s9, 16
1947; VI-NEXT:    s_lshr_b32 s19, s6, 16
1948; VI-NEXT:    s_or_b32 s8, s8, s9
1949; VI-NEXT:    s_and_b32 s7, s7, 0xff
1950; VI-NEXT:    s_lshl_b32 s9, s28, 8
1951; VI-NEXT:    s_lshr_b32 s18, s6, 24
1952; VI-NEXT:    s_add_i32 s19, s19, s19
1953; VI-NEXT:    s_or_b32 s7, s7, s9
1954; VI-NEXT:    s_bfe_u32 s29, s6, 0x80008
1955; VI-NEXT:    s_add_i32 s18, s18, s18
1956; VI-NEXT:    s_and_b32 s7, s7, 0xffff
1957; VI-NEXT:    s_and_b32 s9, s19, 0xff
1958; VI-NEXT:    s_add_i32 s29, s29, s29
1959; VI-NEXT:    s_add_i32 s6, s6, s6
1960; VI-NEXT:    s_or_b32 s7, s7, s8
1961; VI-NEXT:    s_lshl_b32 s8, s18, 24
1962; VI-NEXT:    s_lshl_b32 s9, s9, 16
1963; VI-NEXT:    s_lshr_b32 s21, s5, 16
1964; VI-NEXT:    s_or_b32 s8, s8, s9
1965; VI-NEXT:    s_and_b32 s6, s6, 0xff
1966; VI-NEXT:    s_lshl_b32 s9, s29, 8
1967; VI-NEXT:    s_lshr_b32 s20, s5, 24
1968; VI-NEXT:    s_add_i32 s21, s21, s21
1969; VI-NEXT:    s_or_b32 s6, s6, s9
1970; VI-NEXT:    s_bfe_u32 s30, s5, 0x80008
1971; VI-NEXT:    s_add_i32 s20, s20, s20
1972; VI-NEXT:    s_and_b32 s6, s6, 0xffff
1973; VI-NEXT:    s_and_b32 s9, s21, 0xff
1974; VI-NEXT:    s_add_i32 s30, s30, s30
1975; VI-NEXT:    s_add_i32 s5, s5, s5
1976; VI-NEXT:    s_or_b32 s6, s6, s8
1977; VI-NEXT:    s_lshl_b32 s8, s20, 24
1978; VI-NEXT:    s_lshl_b32 s9, s9, 16
1979; VI-NEXT:    s_lshr_b32 s23, s4, 16
1980; VI-NEXT:    s_or_b32 s8, s8, s9
1981; VI-NEXT:    s_and_b32 s5, s5, 0xff
1982; VI-NEXT:    s_lshl_b32 s9, s30, 8
1983; VI-NEXT:    s_lshr_b32 s22, s4, 24
1984; VI-NEXT:    s_add_i32 s23, s23, s23
1985; VI-NEXT:    s_or_b32 s5, s5, s9
1986; VI-NEXT:    s_bfe_u32 s31, s4, 0x80008
1987; VI-NEXT:    s_add_i32 s22, s22, s22
1988; VI-NEXT:    s_and_b32 s5, s5, 0xffff
1989; VI-NEXT:    s_and_b32 s9, s23, 0xff
1990; VI-NEXT:    s_add_i32 s31, s31, s31
1991; VI-NEXT:    s_add_i32 s4, s4, s4
1992; VI-NEXT:    s_or_b32 s5, s5, s8
1993; VI-NEXT:    s_lshl_b32 s8, s22, 24
1994; VI-NEXT:    s_lshl_b32 s9, s9, 16
1995; VI-NEXT:    s_or_b32 s8, s8, s9
1996; VI-NEXT:    s_and_b32 s4, s4, 0xff
1997; VI-NEXT:    s_lshl_b32 s9, s31, 8
1998; VI-NEXT:    s_or_b32 s4, s4, s9
1999; VI-NEXT:    s_and_b32 s4, s4, 0xffff
2000; VI-NEXT:    s_or_b32 s4, s4, s8
2001; VI-NEXT:    v_mov_b32_e32 v0, s4
2002; VI-NEXT:    v_mov_b32_e32 v1, s5
2003; VI-NEXT:    v_mov_b32_e32 v2, s6
2004; VI-NEXT:    v_mov_b32_e32 v3, s7
2005; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
2006; VI-NEXT:    v_mov_b32_e32 v4, 0
2007; VI-NEXT:    v_mov_b32_e32 v0, s0
2008; VI-NEXT:    v_mov_b32_e32 v1, s1
2009; VI-NEXT:    v_mov_b32_e32 v2, s2
2010; VI-NEXT:    v_mov_b32_e32 v5, 0
2011; VI-NEXT:    v_mov_b32_e32 v3, s3
2012; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
2013; VI-NEXT:    s_endpgm
2014;
2015; GFX11-LABEL: amd_kernel_v32i8:
2016; GFX11:       ; %bb.0: ; %entry
2017; GFX11-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
2018; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
2019; GFX11-NEXT:    s_lshr_b32 s16, s0, 16
2020; GFX11-NEXT:    s_lshr_b32 s17, s0, 24
2021; GFX11-NEXT:    s_lshr_b32 s20, s2, 16
2022; GFX11-NEXT:    s_lshr_b32 s21, s2, 24
2023; GFX11-NEXT:    s_lshr_b32 s14, s7, 16
2024; GFX11-NEXT:    s_lshr_b32 s15, s7, 24
2025; GFX11-NEXT:    s_bfe_u32 s27, s7, 0x80008
2026; GFX11-NEXT:    s_add_i32 s17, s17, s17
2027; GFX11-NEXT:    s_add_i32 s16, s16, s16
2028; GFX11-NEXT:    s_lshr_b32 s18, s1, 16
2029; GFX11-NEXT:    s_lshr_b32 s19, s1, 24
2030; GFX11-NEXT:    s_lshr_b32 s22, s3, 16
2031; GFX11-NEXT:    s_lshr_b32 s23, s3, 24
2032; GFX11-NEXT:    s_bfe_u32 s29, s1, 0x80008
2033; GFX11-NEXT:    s_bfe_u32 s30, s3, 0x80008
2034; GFX11-NEXT:    s_add_i32 s21, s21, s21
2035; GFX11-NEXT:    s_add_i32 s20, s20, s20
2036; GFX11-NEXT:    s_lshl_b32 s17, s17, 8
2037; GFX11-NEXT:    s_and_b32 s16, s16, 0xff
2038; GFX11-NEXT:    s_add_i32 s7, s7, s7
2039; GFX11-NEXT:    s_add_i32 s27, s27, s27
2040; GFX11-NEXT:    s_add_i32 s15, s15, s15
2041; GFX11-NEXT:    s_add_i32 s14, s14, s14
2042; GFX11-NEXT:    s_add_i32 s3, s3, s3
2043; GFX11-NEXT:    s_add_i32 s30, s30, s30
2044; GFX11-NEXT:    s_add_i32 s23, s23, s23
2045; GFX11-NEXT:    s_add_i32 s22, s22, s22
2046; GFX11-NEXT:    s_lshl_b32 s21, s21, 8
2047; GFX11-NEXT:    s_and_b32 s20, s20, 0xff
2048; GFX11-NEXT:    s_add_i32 s1, s1, s1
2049; GFX11-NEXT:    s_add_i32 s29, s29, s29
2050; GFX11-NEXT:    s_add_i32 s19, s19, s19
2051; GFX11-NEXT:    s_add_i32 s18, s18, s18
2052; GFX11-NEXT:    s_lshr_b32 s10, s5, 16
2053; GFX11-NEXT:    s_lshr_b32 s11, s5, 24
2054; GFX11-NEXT:    s_lshr_b32 s12, s6, 16
2055; GFX11-NEXT:    s_lshr_b32 s13, s6, 24
2056; GFX11-NEXT:    s_or_b32 s16, s16, s17
2057; GFX11-NEXT:    s_and_b32 s7, s7, 0xff
2058; GFX11-NEXT:    s_lshl_b32 s17, s27, 8
2059; GFX11-NEXT:    s_lshl_b32 s15, s15, 8
2060; GFX11-NEXT:    s_and_b32 s14, s14, 0xff
2061; GFX11-NEXT:    s_and_b32 s3, s3, 0xff
2062; GFX11-NEXT:    s_lshl_b32 s30, s30, 8
2063; GFX11-NEXT:    s_lshl_b32 s23, s23, 8
2064; GFX11-NEXT:    s_and_b32 s22, s22, 0xff
2065; GFX11-NEXT:    s_or_b32 s20, s20, s21
2066; GFX11-NEXT:    s_and_b32 s1, s1, 0xff
2067; GFX11-NEXT:    s_lshl_b32 s21, s29, 8
2068; GFX11-NEXT:    s_lshl_b32 s19, s19, 8
2069; GFX11-NEXT:    s_and_b32 s18, s18, 0xff
2070; GFX11-NEXT:    s_lshr_b32 s8, s4, 16
2071; GFX11-NEXT:    s_lshr_b32 s9, s4, 24
2072; GFX11-NEXT:    s_bfe_u32 s24, s4, 0x80008
2073; GFX11-NEXT:    s_bfe_u32 s25, s5, 0x80008
2074; GFX11-NEXT:    s_bfe_u32 s26, s6, 0x80008
2075; GFX11-NEXT:    s_or_b32 s7, s7, s17
2076; GFX11-NEXT:    s_or_b32 s14, s14, s15
2077; GFX11-NEXT:    s_add_i32 s13, s13, s13
2078; GFX11-NEXT:    s_add_i32 s12, s12, s12
2079; GFX11-NEXT:    s_add_i32 s11, s11, s11
2080; GFX11-NEXT:    s_add_i32 s10, s10, s10
2081; GFX11-NEXT:    s_bfe_u32 s28, s0, 0x80008
2082; GFX11-NEXT:    s_or_b32 s3, s3, s30
2083; GFX11-NEXT:    s_or_b32 s22, s22, s23
2084; GFX11-NEXT:    s_bfe_u32 s23, s2, 0x80008
2085; GFX11-NEXT:    s_or_b32 s1, s1, s21
2086; GFX11-NEXT:    s_or_b32 s18, s18, s19
2087; GFX11-NEXT:    s_and_b32 s7, s7, 0xffff
2088; GFX11-NEXT:    s_lshl_b32 s14, s14, 16
2089; GFX11-NEXT:    s_add_i32 s6, s6, s6
2090; GFX11-NEXT:    s_add_i32 s26, s26, s26
2091; GFX11-NEXT:    s_lshl_b32 s13, s13, 8
2092; GFX11-NEXT:    s_and_b32 s12, s12, 0xff
2093; GFX11-NEXT:    s_add_i32 s5, s5, s5
2094; GFX11-NEXT:    s_add_i32 s25, s25, s25
2095; GFX11-NEXT:    s_lshl_b32 s11, s11, 8
2096; GFX11-NEXT:    s_and_b32 s10, s10, 0xff
2097; GFX11-NEXT:    s_add_i32 s4, s4, s4
2098; GFX11-NEXT:    s_add_i32 s24, s24, s24
2099; GFX11-NEXT:    s_add_i32 s9, s9, s9
2100; GFX11-NEXT:    s_add_i32 s8, s8, s8
2101; GFX11-NEXT:    s_and_b32 s3, s3, 0xffff
2102; GFX11-NEXT:    s_lshl_b32 s22, s22, 16
2103; GFX11-NEXT:    s_add_i32 s2, s2, s2
2104; GFX11-NEXT:    s_add_i32 s23, s23, s23
2105; GFX11-NEXT:    s_and_b32 s1, s1, 0xffff
2106; GFX11-NEXT:    s_lshl_b32 s18, s18, 16
2107; GFX11-NEXT:    s_add_i32 s0, s0, s0
2108; GFX11-NEXT:    s_add_i32 s28, s28, s28
2109; GFX11-NEXT:    s_or_b32 s7, s7, s14
2110; GFX11-NEXT:    s_and_b32 s6, s6, 0xff
2111; GFX11-NEXT:    s_lshl_b32 s14, s26, 8
2112; GFX11-NEXT:    s_or_b32 s12, s12, s13
2113; GFX11-NEXT:    s_and_b32 s5, s5, 0xff
2114; GFX11-NEXT:    s_lshl_b32 s13, s25, 8
2115; GFX11-NEXT:    s_or_b32 s10, s10, s11
2116; GFX11-NEXT:    s_and_b32 s4, s4, 0xff
2117; GFX11-NEXT:    s_lshl_b32 s11, s24, 8
2118; GFX11-NEXT:    s_lshl_b32 s9, s9, 8
2119; GFX11-NEXT:    s_and_b32 s8, s8, 0xff
2120; GFX11-NEXT:    s_or_b32 s3, s3, s22
2121; GFX11-NEXT:    s_and_b32 s2, s2, 0xff
2122; GFX11-NEXT:    s_lshl_b32 s22, s23, 8
2123; GFX11-NEXT:    s_or_b32 s1, s1, s18
2124; GFX11-NEXT:    s_and_b32 s0, s0, 0xff
2125; GFX11-NEXT:    s_lshl_b32 s18, s28, 8
2126; GFX11-NEXT:    s_or_b32 s6, s6, s14
2127; GFX11-NEXT:    s_or_b32 s5, s5, s13
2128; GFX11-NEXT:    s_or_b32 s4, s4, s11
2129; GFX11-NEXT:    s_or_b32 s8, s8, s9
2130; GFX11-NEXT:    s_or_b32 s2, s2, s22
2131; GFX11-NEXT:    s_or_b32 s0, s0, s18
2132; GFX11-NEXT:    s_and_b32 s6, s6, 0xffff
2133; GFX11-NEXT:    s_lshl_b32 s12, s12, 16
2134; GFX11-NEXT:    s_and_b32 s5, s5, 0xffff
2135; GFX11-NEXT:    s_and_b32 s4, s4, 0xffff
2136; GFX11-NEXT:    s_lshl_b32 s8, s8, 16
2137; GFX11-NEXT:    s_lshl_b32 s9, s10, 16
2138; GFX11-NEXT:    s_and_b32 s2, s2, 0xffff
2139; GFX11-NEXT:    s_lshl_b32 s20, s20, 16
2140; GFX11-NEXT:    s_and_b32 s0, s0, 0xffff
2141; GFX11-NEXT:    s_lshl_b32 s16, s16, 16
2142; GFX11-NEXT:    s_or_b32 s6, s6, s12
2143; GFX11-NEXT:    s_or_b32 s4, s4, s8
2144; GFX11-NEXT:    s_or_b32 s5, s5, s9
2145; GFX11-NEXT:    s_or_b32 s2, s2, s20
2146; GFX11-NEXT:    s_or_b32 s0, s0, s16
2147; GFX11-NEXT:    v_dual_mov_b32 v8, 16 :: v_dual_mov_b32 v5, s1
2148; GFX11-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
2149; GFX11-NEXT:    v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
2150; GFX11-NEXT:    v_dual_mov_b32 v9, 0 :: v_dual_mov_b32 v4, s0
2151; GFX11-NEXT:    v_dual_mov_b32 v10, 0 :: v_dual_mov_b32 v7, s3
2152; GFX11-NEXT:    v_dual_mov_b32 v6, s2 :: v_dual_mov_b32 v11, 0
2153; GFX11-NEXT:    s_clause 0x1
2154; GFX11-NEXT:    global_store_b128 v[8:9], v[0:3], off
2155; GFX11-NEXT:    global_store_b128 v[10:11], v[4:7], off
2156; GFX11-NEXT:    s_endpgm
2157entry:
2158  %add = add <32 x i8> %arg0, %arg0
2159  store <32 x i8> %add, ptr addrspace(1) null
2160  ret void
2161}
2162
; i1 argument to an amdgpu_cs function, stored to an undef global pointer.
; The checks expect the incoming value in v0 to be reduced to its low bit
; (v_and_b32 1, v0) and written with a single byte store. Because the pointer
; is undef, no instruction materializes the store address (v[0:1], or s[0:1]
; of the buffer resource on the buffer_store path).
; TODO(review): consider poison instead of undef for this don't-care pointer;
; confirm the generated checks are unchanged before switching.
2163define amdgpu_cs void @amdgpu_cs_i1(i1 %arg0) {
2164; SI-LABEL: amdgpu_cs_i1:
2165; SI:       ; %bb.0:
2166; SI-NEXT:    v_and_b32_e32 v0, 1, v0
2167; SI-NEXT:    s_mov_b32 s3, 0xf000
2168; SI-NEXT:    s_mov_b32 s2, -1
2169; SI-NEXT:    buffer_store_byte v0, off, s[0:3], 0
2170; SI-NEXT:    s_endpgm
2171;
2172; VI-LABEL: amdgpu_cs_i1:
2173; VI:       ; %bb.0:
2174; VI-NEXT:    v_and_b32_e32 v0, 1, v0
2175; VI-NEXT:    flat_store_byte v[0:1], v0
2176; VI-NEXT:    s_endpgm
2177;
2178; GFX11-LABEL: amdgpu_cs_i1:
2179; GFX11:       ; %bb.0:
2180; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
2181; GFX11-NEXT:    global_store_b8 v[0:1], v0, off
2182; GFX11-NEXT:    s_endpgm
2183  store i1 %arg0, ptr addrspace(1) undef
2184  ret void
2185}
2186
; <8 x i1> argument to an amdgpu_cs function, stored to an undef global
; pointer. The checks show one element per VGPR (v0-v7): element N is masked
; to one bit or shifted left so that it lands in bit (N mod 4) of its nibble,
; the high nibble (v4-v7) is shifted left by 4 and or-ed with the low nibble
; (v0-v3, masked with 15), and the packed value is written with a single byte
; store. As the pointer is undef, the store address is never materialized.
; TODO(review): consider poison instead of undef for this don't-care pointer;
; confirm the generated checks are unchanged before switching.
2187define amdgpu_cs void @amdgpu_cs_v8i1(<8 x i1> %arg0) {
2188; SI-LABEL: amdgpu_cs_v8i1:
2189; SI:       ; %bb.0:
2190; SI-NEXT:    v_lshlrev_b32_e32 v7, 3, v7
2191; SI-NEXT:    v_and_b32_e32 v6, 1, v6
2192; SI-NEXT:    v_lshlrev_b32_e32 v5, 1, v5
2193; SI-NEXT:    v_and_b32_e32 v4, 1, v4
2194; SI-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
2195; SI-NEXT:    v_and_b32_e32 v2, 1, v2
2196; SI-NEXT:    v_lshlrev_b32_e32 v1, 1, v1
2197; SI-NEXT:    v_and_b32_e32 v0, 1, v0
2198; SI-NEXT:    s_mov_b32 s3, 0xf000
2199; SI-NEXT:    v_lshlrev_b32_e32 v6, 2, v6
2200; SI-NEXT:    v_or_b32_e32 v4, v4, v5
2201; SI-NEXT:    v_lshlrev_b32_e32 v2, 2, v2
2202; SI-NEXT:    v_or_b32_e32 v0, v0, v1
2203; SI-NEXT:    v_or_b32_e32 v1, v7, v6
2204; SI-NEXT:    v_and_b32_e32 v4, 3, v4
2205; SI-NEXT:    v_or_b32_e32 v2, v3, v2
2206; SI-NEXT:    v_and_b32_e32 v0, 3, v0
2207; SI-NEXT:    v_or_b32_e32 v1, v4, v1
2208; SI-NEXT:    v_or_b32_e32 v0, v0, v2
2209; SI-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
2210; SI-NEXT:    v_and_b32_e32 v0, 15, v0
2211; SI-NEXT:    v_or_b32_e32 v0, v0, v1
2212; SI-NEXT:    s_mov_b32 s2, -1
2213; SI-NEXT:    buffer_store_byte v0, off, s[0:3], 0
2214; SI-NEXT:    s_endpgm
2215;
2216; VI-LABEL: amdgpu_cs_v8i1:
2217; VI:       ; %bb.0:
2218; VI-NEXT:    v_and_b32_e32 v6, 1, v6
2219; VI-NEXT:    v_lshlrev_b16_e32 v5, 1, v5
2220; VI-NEXT:    v_and_b32_e32 v4, 1, v4
2221; VI-NEXT:    v_and_b32_e32 v2, 1, v2
2222; VI-NEXT:    v_lshlrev_b16_e32 v1, 1, v1
2223; VI-NEXT:    v_and_b32_e32 v0, 1, v0
2224; VI-NEXT:    v_lshlrev_b16_e32 v7, 3, v7
2225; VI-NEXT:    v_lshlrev_b16_e32 v6, 2, v6
2226; VI-NEXT:    v_or_b32_e32 v4, v4, v5
2227; VI-NEXT:    v_lshlrev_b16_e32 v3, 3, v3
2228; VI-NEXT:    v_lshlrev_b16_e32 v2, 2, v2
2229; VI-NEXT:    v_or_b32_e32 v0, v0, v1
2230; VI-NEXT:    v_or_b32_e32 v6, v7, v6
2231; VI-NEXT:    v_and_b32_e32 v4, 3, v4
2232; VI-NEXT:    v_or_b32_e32 v2, v3, v2
2233; VI-NEXT:    v_and_b32_e32 v0, 3, v0
2234; VI-NEXT:    v_or_b32_e32 v4, v4, v6
2235; VI-NEXT:    v_or_b32_e32 v0, v0, v2
2236; VI-NEXT:    v_lshlrev_b16_e32 v4, 4, v4
2237; VI-NEXT:    v_and_b32_e32 v0, 15, v0
2238; VI-NEXT:    v_or_b32_e32 v0, v0, v4
2239; VI-NEXT:    flat_store_byte v[0:1], v0
2240; VI-NEXT:    s_endpgm
2241;
2242; GFX11-LABEL: amdgpu_cs_v8i1:
2243; GFX11:       ; %bb.0:
2244; GFX11-NEXT:    v_and_b32_e32 v6, 1, v6
2245; GFX11-NEXT:    v_lshlrev_b16 v5, 1, v5
2246; GFX11-NEXT:    v_and_b32_e32 v4, 1, v4
2247; GFX11-NEXT:    v_and_b32_e32 v2, 1, v2
2248; GFX11-NEXT:    v_lshlrev_b16 v1, 1, v1
2249; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
2250; GFX11-NEXT:    v_lshlrev_b16 v7, 3, v7
2251; GFX11-NEXT:    v_lshlrev_b16 v6, 2, v6
2252; GFX11-NEXT:    v_or_b32_e32 v4, v4, v5
2253; GFX11-NEXT:    v_lshlrev_b16 v3, 3, v3
2254; GFX11-NEXT:    v_lshlrev_b16 v2, 2, v2
2255; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
2256; GFX11-NEXT:    v_or_b32_e32 v1, v7, v6
2257; GFX11-NEXT:    v_and_b32_e32 v4, 3, v4
2258; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
2259; GFX11-NEXT:    v_or_b32_e32 v2, v3, v2
2260; GFX11-NEXT:    v_and_b32_e32 v0, 3, v0
2261; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
2262; GFX11-NEXT:    v_or_b32_e32 v1, v4, v1
2263; GFX11-NEXT:    v_or_b32_e32 v0, v0, v2
2264; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
2265; GFX11-NEXT:    v_lshlrev_b16 v1, 4, v1
2266; GFX11-NEXT:    v_and_b32_e32 v0, 15, v0
2267; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
2268; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
2269; GFX11-NEXT:    global_store_b8 v[0:1], v0, off
2270; GFX11-NEXT:    s_endpgm
2271  store <8 x i1> %arg0, ptr addrspace(1) undef
2272  ret void
2273}
2274
; <16 x i1> argument to an amdgpu_cs function, stored to an undef global
; pointer. The checks show one element per VGPR (v0-v15). Each group of four
; elements is combined into a nibble (masks with 1/3/15, shifts by 1/2/3), the
; nibbles are placed at bit offsets 0, 4, 8 and 12, and the resulting 16-bit
; value is written with a single short (16-bit) store. As the pointer is undef,
; the store address is never materialized.
; TODO(review): consider poison instead of undef for this don't-care pointer;
; confirm the generated checks are unchanged before switching.
2275define amdgpu_cs void @amdgpu_cs_v16i1(<16 x i1> %arg0) {
2276; SI-LABEL: amdgpu_cs_v16i1:
2277; SI:       ; %bb.0:
2278; SI-NEXT:    v_lshlrev_b32_e32 v15, 3, v15
2279; SI-NEXT:    v_and_b32_e32 v14, 1, v14
2280; SI-NEXT:    v_lshlrev_b32_e32 v13, 1, v13
2281; SI-NEXT:    v_and_b32_e32 v12, 1, v12
2282; SI-NEXT:    v_lshlrev_b32_e32 v11, 3, v11
2283; SI-NEXT:    v_and_b32_e32 v10, 1, v10
2284; SI-NEXT:    v_lshlrev_b32_e32 v9, 1, v9
2285; SI-NEXT:    v_and_b32_e32 v8, 1, v8
2286; SI-NEXT:    v_lshlrev_b32_e32 v7, 3, v7
2287; SI-NEXT:    v_and_b32_e32 v6, 1, v6
2288; SI-NEXT:    v_lshlrev_b32_e32 v5, 1, v5
2289; SI-NEXT:    v_and_b32_e32 v4, 1, v4
2290; SI-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
2291; SI-NEXT:    v_and_b32_e32 v2, 1, v2
2292; SI-NEXT:    v_lshlrev_b32_e32 v1, 1, v1
2293; SI-NEXT:    v_and_b32_e32 v0, 1, v0
2294; SI-NEXT:    s_mov_b32 s3, 0xf000
2295; SI-NEXT:    v_lshlrev_b32_e32 v14, 2, v14
2296; SI-NEXT:    v_or_b32_e32 v12, v12, v13
2297; SI-NEXT:    v_lshlrev_b32_e32 v10, 2, v10
2298; SI-NEXT:    v_or_b32_e32 v8, v8, v9
2299; SI-NEXT:    v_lshlrev_b32_e32 v6, 2, v6
2300; SI-NEXT:    v_or_b32_e32 v4, v4, v5
2301; SI-NEXT:    v_lshlrev_b32_e32 v2, 2, v2
2302; SI-NEXT:    v_or_b32_e32 v0, v0, v1
2303; SI-NEXT:    v_or_b32_e32 v1, v15, v14
2304; SI-NEXT:    v_and_b32_e32 v5, 3, v12
2305; SI-NEXT:    v_or_b32_e32 v9, v11, v10
2306; SI-NEXT:    v_and_b32_e32 v8, 3, v8
2307; SI-NEXT:    v_or_b32_e32 v6, v7, v6
2308; SI-NEXT:    v_and_b32_e32 v4, 3, v4
2309; SI-NEXT:    v_or_b32_e32 v2, v3, v2
2310; SI-NEXT:    v_and_b32_e32 v0, 3, v0
2311; SI-NEXT:    v_or_b32_e32 v1, v5, v1
2312; SI-NEXT:    v_or_b32_e32 v3, v8, v9
2313; SI-NEXT:    v_or_b32_e32 v4, v4, v6
2314; SI-NEXT:    v_or_b32_e32 v0, v0, v2
2315; SI-NEXT:    v_lshlrev_b32_e32 v1, 12, v1
2316; SI-NEXT:    v_and_b32_e32 v2, 15, v3
2317; SI-NEXT:    v_lshlrev_b32_e32 v3, 4, v4
2318; SI-NEXT:    v_and_b32_e32 v0, 15, v0
2319; SI-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
2320; SI-NEXT:    v_or_b32_e32 v0, v0, v3
2321; SI-NEXT:    v_or_b32_e32 v1, v1, v2
2322; SI-NEXT:    v_and_b32_e32 v0, 0xff, v0
2323; SI-NEXT:    v_or_b32_e32 v0, v0, v1
2324; SI-NEXT:    s_mov_b32 s2, -1
2325; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
2326; SI-NEXT:    s_endpgm
2327;
2328; VI-LABEL: amdgpu_cs_v16i1:
2329; VI:       ; %bb.0:
2330; VI-NEXT:    v_and_b32_e32 v14, 1, v14
2331; VI-NEXT:    v_lshlrev_b16_e32 v13, 1, v13
2332; VI-NEXT:    v_and_b32_e32 v12, 1, v12
2333; VI-NEXT:    v_and_b32_e32 v10, 1, v10
2334; VI-NEXT:    v_lshlrev_b16_e32 v9, 1, v9
2335; VI-NEXT:    v_and_b32_e32 v8, 1, v8
2336; VI-NEXT:    v_and_b32_e32 v6, 1, v6
2337; VI-NEXT:    v_lshlrev_b16_e32 v5, 1, v5
2338; VI-NEXT:    v_and_b32_e32 v4, 1, v4
2339; VI-NEXT:    v_and_b32_e32 v2, 1, v2
2340; VI-NEXT:    v_lshlrev_b16_e32 v1, 1, v1
2341; VI-NEXT:    v_and_b32_e32 v0, 1, v0
2342; VI-NEXT:    v_lshlrev_b16_e32 v15, 3, v15
2343; VI-NEXT:    v_lshlrev_b16_e32 v14, 2, v14
2344; VI-NEXT:    v_or_b32_e32 v12, v12, v13
2345; VI-NEXT:    v_lshlrev_b16_e32 v11, 3, v11
2346; VI-NEXT:    v_lshlrev_b16_e32 v10, 2, v10
2347; VI-NEXT:    v_or_b32_e32 v8, v8, v9
2348; VI-NEXT:    v_lshlrev_b16_e32 v7, 3, v7
2349; VI-NEXT:    v_lshlrev_b16_e32 v6, 2, v6
2350; VI-NEXT:    v_or_b32_e32 v4, v4, v5
2351; VI-NEXT:    v_lshlrev_b16_e32 v3, 3, v3
2352; VI-NEXT:    v_lshlrev_b16_e32 v2, 2, v2
2353; VI-NEXT:    v_or_b32_e32 v0, v0, v1
2354; VI-NEXT:    v_or_b32_e32 v14, v15, v14
2355; VI-NEXT:    v_and_b32_e32 v12, 3, v12
2356; VI-NEXT:    v_or_b32_e32 v10, v11, v10
2357; VI-NEXT:    v_and_b32_e32 v8, 3, v8
2358; VI-NEXT:    v_or_b32_e32 v6, v7, v6
2359; VI-NEXT:    v_and_b32_e32 v4, 3, v4
2360; VI-NEXT:    v_or_b32_e32 v2, v3, v2
2361; VI-NEXT:    v_and_b32_e32 v0, 3, v0
2362; VI-NEXT:    v_or_b32_e32 v12, v12, v14
2363; VI-NEXT:    v_or_b32_e32 v8, v8, v10
2364; VI-NEXT:    v_mov_b32_e32 v9, 15
2365; VI-NEXT:    v_or_b32_e32 v4, v4, v6
2366; VI-NEXT:    v_or_b32_e32 v0, v0, v2
2367; VI-NEXT:    v_lshlrev_b16_e32 v12, 12, v12
2368; VI-NEXT:    v_and_b32_sdwa v8, v8, v9 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2369; VI-NEXT:    v_lshlrev_b16_e32 v4, 4, v4
2370; VI-NEXT:    v_and_b32_e32 v0, 15, v0
2371; VI-NEXT:    v_or_b32_e32 v8, v12, v8
2372; VI-NEXT:    v_or_b32_e32 v0, v0, v4
2373; VI-NEXT:    v_or_b32_sdwa v0, v0, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
2374; VI-NEXT:    flat_store_short v[0:1], v0
2375; VI-NEXT:    s_endpgm
2376;
2377; GFX11-LABEL: amdgpu_cs_v16i1:
2378; GFX11:       ; %bb.0:
2379; GFX11-NEXT:    v_and_b32_e32 v10, 1, v10
2380; GFX11-NEXT:    v_lshlrev_b16 v9, 1, v9
2381; GFX11-NEXT:    v_and_b32_e32 v8, 1, v8
2382; GFX11-NEXT:    v_and_b32_e32 v6, 1, v6
2383; GFX11-NEXT:    v_lshlrev_b16 v5, 1, v5
2384; GFX11-NEXT:    v_and_b32_e32 v4, 1, v4
2385; GFX11-NEXT:    v_and_b32_e32 v2, 1, v2
2386; GFX11-NEXT:    v_lshlrev_b16 v1, 1, v1
2387; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
2388; GFX11-NEXT:    v_and_b32_e32 v14, 1, v14
2389; GFX11-NEXT:    v_lshlrev_b16 v13, 1, v13
2390; GFX11-NEXT:    v_and_b32_e32 v12, 1, v12
2391; GFX11-NEXT:    v_lshlrev_b16 v11, 3, v11
2392; GFX11-NEXT:    v_lshlrev_b16 v10, 2, v10
2393; GFX11-NEXT:    v_or_b32_e32 v8, v8, v9
2394; GFX11-NEXT:    v_lshlrev_b16 v7, 3, v7
2395; GFX11-NEXT:    v_lshlrev_b16 v6, 2, v6
2396; GFX11-NEXT:    v_or_b32_e32 v4, v4, v5
2397; GFX11-NEXT:    v_lshlrev_b16 v3, 3, v3
2398; GFX11-NEXT:    v_lshlrev_b16 v2, 2, v2
2399; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
2400; GFX11-NEXT:    v_lshlrev_b16 v15, 3, v15
2401; GFX11-NEXT:    v_lshlrev_b16 v14, 2, v14
2402; GFX11-NEXT:    v_or_b32_e32 v12, v12, v13
2403; GFX11-NEXT:    v_or_b32_e32 v10, v11, v10
2404; GFX11-NEXT:    v_and_b32_e32 v1, 3, v8
2405; GFX11-NEXT:    v_or_b32_e32 v5, v7, v6
2406; GFX11-NEXT:    v_and_b32_e32 v4, 3, v4
2407; GFX11-NEXT:    v_or_b32_e32 v2, v3, v2
2408; GFX11-NEXT:    v_and_b32_e32 v0, 3, v0
2409; GFX11-NEXT:    v_or_b32_e32 v3, v15, v14
2410; GFX11-NEXT:    v_and_b32_e32 v6, 3, v12
2411; GFX11-NEXT:    v_or_b32_e32 v1, v1, v10
2412; GFX11-NEXT:    v_or_b32_e32 v4, v4, v5
2413; GFX11-NEXT:    v_or_b32_e32 v0, v0, v2
2414; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
2415; GFX11-NEXT:    v_or_b32_e32 v2, v6, v3
2416; GFX11-NEXT:    v_and_b32_e32 v1, 15, v1
2417; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
2418; GFX11-NEXT:    v_lshlrev_b16 v3, 4, v4
2419; GFX11-NEXT:    v_and_b32_e32 v0, 15, v0
2420; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
2421; GFX11-NEXT:    v_lshlrev_b16 v2, 12, v2
2422; GFX11-NEXT:    v_lshlrev_b16 v1, 8, v1
2423; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
2424; GFX11-NEXT:    v_or_b32_e32 v0, v0, v3
2425; GFX11-NEXT:    v_or_b32_e32 v1, v2, v1
2426; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
2427; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
2428; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
2429; GFX11-NEXT:    global_store_b16 v[0:1], v0, off
2430; GFX11-NEXT:    s_endpgm
2431  store <16 x i1> %arg0, ptr addrspace(1) undef
2432  ret void
2433}
2434
; amdgpu_cs function that takes a <32 x i1> argument (not marked inreg) and
; stores it to an undef addrspace(1) pointer.
; In the expected output for the SI, VI and GFX11 run lines the elements are
; read from v0 through v31, combined with and/shift/or instructions into one
; 32-bit value in v0, and written with a single dword-sized store
; (buffer_store_dword, flat_store_dword and global_store_b32 respectively).
; VI and GFX11 use 16-bit shifts (v_lshlrev_b16) for the partial results.
; The check lines are autogenerated (see the first line of this file);
; regenerate them with utils/update_llc_test_checks.py instead of hand-editing.
2435define amdgpu_cs void @amdgpu_cs_v32i1(<32 x i1> %arg0) {
2436; SI-LABEL: amdgpu_cs_v32i1:
2437; SI:       ; %bb.0:
2438; SI-NEXT:    v_lshlrev_b32_e32 v29, 1, v29
2439; SI-NEXT:    v_and_b32_e32 v28, 1, v28
2440; SI-NEXT:    v_lshlrev_b32_e32 v25, 1, v25
2441; SI-NEXT:    v_and_b32_e32 v24, 1, v24
2442; SI-NEXT:    v_lshlrev_b32_e32 v21, 1, v21
2443; SI-NEXT:    v_and_b32_e32 v20, 1, v20
2444; SI-NEXT:    v_lshlrev_b32_e32 v17, 1, v17
2445; SI-NEXT:    v_and_b32_e32 v16, 1, v16
2446; SI-NEXT:    v_lshlrev_b32_e32 v13, 1, v13
2447; SI-NEXT:    v_and_b32_e32 v12, 1, v12
2448; SI-NEXT:    v_lshlrev_b32_e32 v9, 1, v9
2449; SI-NEXT:    v_and_b32_e32 v8, 1, v8
2450; SI-NEXT:    v_lshlrev_b32_e32 v5, 1, v5
2451; SI-NEXT:    v_and_b32_e32 v4, 1, v4
2452; SI-NEXT:    v_lshlrev_b32_e32 v1, 1, v1
2453; SI-NEXT:    v_and_b32_e32 v0, 1, v0
2454; SI-NEXT:    v_lshlrev_b32_e32 v31, 3, v31
2455; SI-NEXT:    v_and_b32_e32 v30, 1, v30
2456; SI-NEXT:    v_lshlrev_b32_e32 v27, 3, v27
2457; SI-NEXT:    v_and_b32_e32 v26, 1, v26
2458; SI-NEXT:    v_lshlrev_b32_e32 v23, 3, v23
2459; SI-NEXT:    v_and_b32_e32 v22, 1, v22
2460; SI-NEXT:    v_lshlrev_b32_e32 v19, 3, v19
2461; SI-NEXT:    v_and_b32_e32 v18, 1, v18
2462; SI-NEXT:    v_lshlrev_b32_e32 v15, 3, v15
2463; SI-NEXT:    v_and_b32_e32 v14, 1, v14
2464; SI-NEXT:    v_lshlrev_b32_e32 v11, 3, v11
2465; SI-NEXT:    v_and_b32_e32 v10, 1, v10
2466; SI-NEXT:    v_lshlrev_b32_e32 v7, 3, v7
2467; SI-NEXT:    v_and_b32_e32 v6, 1, v6
2468; SI-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
2469; SI-NEXT:    v_and_b32_e32 v2, 1, v2
2470; SI-NEXT:    s_mov_b32 s3, 0xf000
2471; SI-NEXT:    v_or_b32_e32 v28, v28, v29
2472; SI-NEXT:    v_or_b32_e32 v24, v24, v25
2473; SI-NEXT:    v_or_b32_e32 v20, v20, v21
2474; SI-NEXT:    v_or_b32_e32 v16, v16, v17
2475; SI-NEXT:    v_or_b32_e32 v12, v12, v13
2476; SI-NEXT:    v_or_b32_e32 v8, v8, v9
2477; SI-NEXT:    v_or_b32_e32 v4, v4, v5
2478; SI-NEXT:    v_or_b32_e32 v0, v0, v1
2479; SI-NEXT:    v_lshlrev_b32_e32 v1, 2, v30
2480; SI-NEXT:    v_lshlrev_b32_e32 v5, 2, v26
2481; SI-NEXT:    v_lshlrev_b32_e32 v9, 2, v22
2482; SI-NEXT:    v_lshlrev_b32_e32 v13, 2, v18
2483; SI-NEXT:    v_lshlrev_b32_e32 v14, 2, v14
2484; SI-NEXT:    v_lshlrev_b32_e32 v10, 2, v10
2485; SI-NEXT:    v_lshlrev_b32_e32 v6, 2, v6
2486; SI-NEXT:    v_lshlrev_b32_e32 v2, 2, v2
2487; SI-NEXT:    v_or_b32_e32 v1, v31, v1
2488; SI-NEXT:    v_or_b32_e32 v5, v27, v5
2489; SI-NEXT:    v_or_b32_e32 v9, v23, v9
2490; SI-NEXT:    v_or_b32_e32 v13, v19, v13
2491; SI-NEXT:    v_and_b32_e32 v17, 3, v28
2492; SI-NEXT:    v_and_b32_e32 v18, 3, v24
2493; SI-NEXT:    v_and_b32_e32 v19, 3, v20
2494; SI-NEXT:    v_and_b32_e32 v16, 3, v16
2495; SI-NEXT:    v_or_b32_e32 v14, v15, v14
2496; SI-NEXT:    v_and_b32_e32 v12, 3, v12
2497; SI-NEXT:    v_or_b32_e32 v10, v11, v10
2498; SI-NEXT:    v_and_b32_e32 v8, 3, v8
2499; SI-NEXT:    v_or_b32_e32 v6, v7, v6
2500; SI-NEXT:    v_and_b32_e32 v4, 3, v4
2501; SI-NEXT:    v_or_b32_e32 v2, v3, v2
2502; SI-NEXT:    v_and_b32_e32 v0, 3, v0
2503; SI-NEXT:    v_or_b32_e32 v1, v17, v1
2504; SI-NEXT:    v_or_b32_e32 v3, v18, v5
2505; SI-NEXT:    v_or_b32_e32 v5, v19, v9
2506; SI-NEXT:    v_or_b32_e32 v7, v16, v13
2507; SI-NEXT:    v_or_b32_e32 v9, v12, v14
2508; SI-NEXT:    v_or_b32_e32 v8, v8, v10
2509; SI-NEXT:    v_or_b32_e32 v4, v4, v6
2510; SI-NEXT:    v_or_b32_e32 v0, v0, v2
2511; SI-NEXT:    v_lshlrev_b32_e32 v1, 12, v1
2512; SI-NEXT:    v_and_b32_e32 v2, 15, v3
2513; SI-NEXT:    v_lshlrev_b32_e32 v3, 4, v5
2514; SI-NEXT:    v_and_b32_e32 v5, 15, v7
2515; SI-NEXT:    v_lshlrev_b32_e32 v6, 12, v9
2516; SI-NEXT:    v_and_b32_e32 v7, 15, v8
2517; SI-NEXT:    v_lshlrev_b32_e32 v4, 4, v4
2518; SI-NEXT:    v_and_b32_e32 v0, 15, v0
2519; SI-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
2520; SI-NEXT:    v_or_b32_e32 v3, v5, v3
2521; SI-NEXT:    v_lshlrev_b32_e32 v5, 8, v7
2522; SI-NEXT:    v_or_b32_e32 v0, v0, v4
2523; SI-NEXT:    v_or_b32_e32 v1, v1, v2
2524; SI-NEXT:    v_and_b32_e32 v2, 0xff, v3
2525; SI-NEXT:    v_or_b32_e32 v3, v6, v5
2526; SI-NEXT:    v_and_b32_e32 v0, 0xff, v0
2527; SI-NEXT:    v_or_b32_e32 v1, v2, v1
2528; SI-NEXT:    v_or_b32_e32 v0, v0, v3
2529; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
2530; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
2531; SI-NEXT:    v_or_b32_e32 v0, v0, v1
2532; SI-NEXT:    s_mov_b32 s2, -1
2533; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2534; SI-NEXT:    s_endpgm
2535;
2536; VI-LABEL: amdgpu_cs_v32i1:
2537; VI:       ; %bb.0:
2538; VI-NEXT:    v_and_b32_e32 v6, 1, v6
2539; VI-NEXT:    v_lshlrev_b16_e32 v5, 1, v5
2540; VI-NEXT:    v_and_b32_e32 v4, 1, v4
2541; VI-NEXT:    v_and_b32_e32 v2, 1, v2
2542; VI-NEXT:    v_lshlrev_b16_e32 v1, 1, v1
2543; VI-NEXT:    v_and_b32_e32 v0, 1, v0
2544; VI-NEXT:    v_lshlrev_b16_e32 v7, 3, v7
2545; VI-NEXT:    v_lshlrev_b16_e32 v6, 2, v6
2546; VI-NEXT:    v_or_b32_e32 v4, v4, v5
2547; VI-NEXT:    v_lshlrev_b16_e32 v3, 3, v3
2548; VI-NEXT:    v_lshlrev_b16_e32 v2, 2, v2
2549; VI-NEXT:    v_or_b32_e32 v0, v0, v1
2550; VI-NEXT:    v_or_b32_e32 v6, v7, v6
2551; VI-NEXT:    v_and_b32_e32 v4, 3, v4
2552; VI-NEXT:    v_or_b32_e32 v2, v3, v2
2553; VI-NEXT:    v_and_b32_e32 v0, 3, v0
2554; VI-NEXT:    v_or_b32_e32 v4, v4, v6
2555; VI-NEXT:    v_or_b32_e32 v0, v0, v2
2556; VI-NEXT:    v_lshlrev_b16_e32 v1, 4, v4
2557; VI-NEXT:    v_and_b32_e32 v0, 15, v0
2558; VI-NEXT:    v_and_b32_e32 v2, 1, v30
2559; VI-NEXT:    v_or_b32_e32 v0, v0, v1
2560; VI-NEXT:    v_lshlrev_b16_e32 v1, 3, v31
2561; VI-NEXT:    v_lshlrev_b16_e32 v2, 2, v2
2562; VI-NEXT:    v_or_b32_e32 v1, v1, v2
2563; VI-NEXT:    v_lshlrev_b16_e32 v2, 1, v29
2564; VI-NEXT:    v_and_b32_e32 v3, 1, v28
2565; VI-NEXT:    v_or_b32_e32 v2, v3, v2
2566; VI-NEXT:    v_and_b32_e32 v2, 3, v2
2567; VI-NEXT:    v_and_b32_e32 v3, 1, v26
2568; VI-NEXT:    v_or_b32_e32 v1, v2, v1
2569; VI-NEXT:    v_lshlrev_b16_e32 v2, 3, v27
2570; VI-NEXT:    v_lshlrev_b16_e32 v3, 2, v3
2571; VI-NEXT:    v_and_b32_e32 v10, 1, v10
2572; VI-NEXT:    v_lshlrev_b16_e32 v9, 1, v9
2573; VI-NEXT:    v_and_b32_e32 v8, 1, v8
2574; VI-NEXT:    v_or_b32_e32 v2, v2, v3
2575; VI-NEXT:    v_lshlrev_b16_e32 v3, 1, v25
2576; VI-NEXT:    v_and_b32_e32 v4, 1, v24
2577; VI-NEXT:    v_lshlrev_b16_e32 v11, 3, v11
2578; VI-NEXT:    v_lshlrev_b16_e32 v10, 2, v10
2579; VI-NEXT:    v_or_b32_e32 v8, v8, v9
2580; VI-NEXT:    v_or_b32_e32 v3, v4, v3
2581; VI-NEXT:    v_or_b32_e32 v10, v11, v10
2582; VI-NEXT:    v_and_b32_e32 v8, 3, v8
2583; VI-NEXT:    v_and_b32_e32 v3, 3, v3
2584; VI-NEXT:    v_or_b32_e32 v8, v8, v10
2585; VI-NEXT:    v_mov_b32_e32 v10, 15
2586; VI-NEXT:    v_or_b32_e32 v2, v3, v2
2587; VI-NEXT:    v_lshlrev_b16_e32 v1, 12, v1
2588; VI-NEXT:    v_and_b32_sdwa v2, v2, v10 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2589; VI-NEXT:    v_and_b32_e32 v3, 1, v22
2590; VI-NEXT:    v_or_b32_e32 v1, v1, v2
2591; VI-NEXT:    v_lshlrev_b16_e32 v2, 3, v23
2592; VI-NEXT:    v_lshlrev_b16_e32 v3, 2, v3
2593; VI-NEXT:    v_or_b32_e32 v2, v2, v3
2594; VI-NEXT:    v_lshlrev_b16_e32 v3, 1, v21
2595; VI-NEXT:    v_and_b32_e32 v4, 1, v20
2596; VI-NEXT:    v_or_b32_e32 v3, v4, v3
2597; VI-NEXT:    v_and_b32_e32 v3, 3, v3
2598; VI-NEXT:    v_and_b32_e32 v4, 1, v18
2599; VI-NEXT:    v_or_b32_e32 v2, v3, v2
2600; VI-NEXT:    v_lshlrev_b16_e32 v3, 3, v19
2601; VI-NEXT:    v_lshlrev_b16_e32 v4, 2, v4
2602; VI-NEXT:    v_and_b32_e32 v14, 1, v14
2603; VI-NEXT:    v_lshlrev_b16_e32 v13, 1, v13
2604; VI-NEXT:    v_and_b32_e32 v12, 1, v12
2605; VI-NEXT:    v_or_b32_e32 v3, v3, v4
2606; VI-NEXT:    v_lshlrev_b16_e32 v4, 1, v17
2607; VI-NEXT:    v_and_b32_e32 v5, 1, v16
2608; VI-NEXT:    v_lshlrev_b16_e32 v15, 3, v15
2609; VI-NEXT:    v_lshlrev_b16_e32 v14, 2, v14
2610; VI-NEXT:    v_or_b32_e32 v12, v12, v13
2611; VI-NEXT:    v_or_b32_e32 v4, v5, v4
2612; VI-NEXT:    v_or_b32_e32 v14, v15, v14
2613; VI-NEXT:    v_and_b32_e32 v12, 3, v12
2614; VI-NEXT:    v_and_b32_e32 v4, 3, v4
2615; VI-NEXT:    v_or_b32_e32 v12, v12, v14
2616; VI-NEXT:    v_or_b32_e32 v3, v4, v3
2617; VI-NEXT:    v_lshlrev_b16_e32 v9, 12, v12
2618; VI-NEXT:    v_and_b32_sdwa v8, v8, v10 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2619; VI-NEXT:    v_lshlrev_b16_e32 v2, 4, v2
2620; VI-NEXT:    v_and_b32_e32 v3, 15, v3
2621; VI-NEXT:    v_or_b32_e32 v8, v9, v8
2622; VI-NEXT:    v_or_b32_e32 v2, v3, v2
2623; VI-NEXT:    v_or_b32_sdwa v0, v0, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
2624; VI-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
2625; VI-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2626; VI-NEXT:    flat_store_dword v[0:1], v0
2627; VI-NEXT:    s_endpgm
2628;
2629; GFX11-LABEL: amdgpu_cs_v32i1:
2630; GFX11:       ; %bb.0:
2631; GFX11-NEXT:    v_and_b32_e32 v10, 1, v10
2632; GFX11-NEXT:    v_lshlrev_b16 v9, 1, v9
2633; GFX11-NEXT:    v_and_b32_e32 v8, 1, v8
2634; GFX11-NEXT:    v_lshlrev_b16 v11, 3, v11
2635; GFX11-NEXT:    v_and_b32_e32 v6, 1, v6
2636; GFX11-NEXT:    v_lshlrev_b16 v10, 2, v10
2637; GFX11-NEXT:    v_and_b32_e32 v2, 1, v2
2638; GFX11-NEXT:    v_or_b32_e32 v8, v8, v9
2639; GFX11-NEXT:    v_lshlrev_b16 v1, 1, v1
2640; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
2641; GFX11-NEXT:    v_and_b32_e32 v14, 1, v14
2642; GFX11-NEXT:    v_lshlrev_b16 v13, 1, v13
2643; GFX11-NEXT:    v_and_b32_e32 v12, 1, v12
2644; GFX11-NEXT:    v_lshlrev_b16 v5, 1, v5
2645; GFX11-NEXT:    v_and_b32_e32 v4, 1, v4
2646; GFX11-NEXT:    v_or_b32_e32 v9, v11, v10
2647; GFX11-NEXT:    v_and_b32_e32 v8, 3, v8
2648; GFX11-NEXT:    v_lshlrev_b16 v7, 3, v7
2649; GFX11-NEXT:    v_lshlrev_b16 v6, 2, v6
2650; GFX11-NEXT:    v_lshlrev_b16 v3, 3, v3
2651; GFX11-NEXT:    v_lshlrev_b16 v2, 2, v2
2652; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
2653; GFX11-NEXT:    v_lshlrev_b16 v15, 3, v15
2654; GFX11-NEXT:    v_lshlrev_b16 v14, 2, v14
2655; GFX11-NEXT:    v_or_b32_e32 v12, v12, v13
2656; GFX11-NEXT:    v_or_b32_e32 v4, v4, v5
2657; GFX11-NEXT:    v_or_b32_e32 v5, v7, v6
2658; GFX11-NEXT:    v_or_b32_e32 v6, v8, v9
2659; GFX11-NEXT:    v_or_b32_e32 v2, v3, v2
2660; GFX11-NEXT:    v_and_b32_e32 v0, 3, v0
2661; GFX11-NEXT:    v_or_b32_e32 v13, v15, v14
2662; GFX11-NEXT:    v_and_b32_e32 v12, 3, v12
2663; GFX11-NEXT:    v_and_b32_e32 v3, 15, v6
2664; GFX11-NEXT:    v_lshlrev_b16 v6, 1, v29
2665; GFX11-NEXT:    v_or_b32_e32 v0, v0, v2
2666; GFX11-NEXT:    v_and_b32_e32 v2, 1, v26
2667; GFX11-NEXT:    v_and_b32_e32 v7, 1, v28
2668; GFX11-NEXT:    v_lshlrev_b16 v9, 1, v25
2669; GFX11-NEXT:    v_and_b32_e32 v10, 1, v24
2670; GFX11-NEXT:    v_and_b32_e32 v4, 3, v4
2671; GFX11-NEXT:    v_or_b32_e32 v1, v12, v13
2672; GFX11-NEXT:    v_lshlrev_b16 v8, 3, v27
2673; GFX11-NEXT:    v_lshlrev_b16 v2, 2, v2
2674; GFX11-NEXT:    v_or_b32_e32 v6, v7, v6
2675; GFX11-NEXT:    v_or_b32_e32 v7, v10, v9
2676; GFX11-NEXT:    v_and_b32_e32 v9, 1, v22
2677; GFX11-NEXT:    v_lshlrev_b16 v10, 1, v21
2678; GFX11-NEXT:    v_and_b32_e32 v12, 1, v20
2679; GFX11-NEXT:    v_and_b32_e32 v13, 1, v18
2680; GFX11-NEXT:    v_lshlrev_b16 v14, 1, v17
2681; GFX11-NEXT:    v_and_b32_e32 v15, 1, v16
2682; GFX11-NEXT:    v_or_b32_e32 v4, v4, v5
2683; GFX11-NEXT:    v_and_b32_e32 v5, 1, v30
2684; GFX11-NEXT:    v_or_b32_e32 v2, v8, v2
2685; GFX11-NEXT:    v_lshlrev_b16 v8, 3, v23
2686; GFX11-NEXT:    v_lshlrev_b16 v9, 2, v9
2687; GFX11-NEXT:    v_or_b32_e32 v10, v12, v10
2688; GFX11-NEXT:    v_lshlrev_b16 v12, 3, v19
2689; GFX11-NEXT:    v_lshlrev_b16 v13, 2, v13
2690; GFX11-NEXT:    v_or_b32_e32 v14, v15, v14
2691; GFX11-NEXT:    v_lshlrev_b16 v11, 3, v31
2692; GFX11-NEXT:    v_lshlrev_b16 v5, 2, v5
2693; GFX11-NEXT:    v_and_b32_e32 v7, 3, v7
2694; GFX11-NEXT:    v_or_b32_e32 v8, v8, v9
2695; GFX11-NEXT:    v_and_b32_e32 v9, 3, v10
2696; GFX11-NEXT:    v_or_b32_e32 v10, v12, v13
2697; GFX11-NEXT:    v_and_b32_e32 v12, 3, v14
2698; GFX11-NEXT:    v_or_b32_e32 v5, v11, v5
2699; GFX11-NEXT:    v_and_b32_e32 v6, 3, v6
2700; GFX11-NEXT:    v_or_b32_e32 v2, v7, v2
2701; GFX11-NEXT:    v_or_b32_e32 v7, v9, v8
2702; GFX11-NEXT:    v_or_b32_e32 v8, v12, v10
2703; GFX11-NEXT:    v_lshlrev_b16 v4, 4, v4
2704; GFX11-NEXT:    v_and_b32_e32 v0, 15, v0
2705; GFX11-NEXT:    v_or_b32_e32 v5, v6, v5
2706; GFX11-NEXT:    v_and_b32_e32 v2, 15, v2
2707; GFX11-NEXT:    v_lshlrev_b16 v6, 4, v7
2708; GFX11-NEXT:    v_and_b32_e32 v7, 15, v8
2709; GFX11-NEXT:    v_lshlrev_b16 v1, 12, v1
2710; GFX11-NEXT:    v_lshlrev_b16 v3, 8, v3
2711; GFX11-NEXT:    v_or_b32_e32 v0, v0, v4
2712; GFX11-NEXT:    v_lshlrev_b16 v4, 12, v5
2713; GFX11-NEXT:    v_lshlrev_b16 v2, 8, v2
2714; GFX11-NEXT:    v_or_b32_e32 v5, v7, v6
2715; GFX11-NEXT:    v_or_b32_e32 v1, v1, v3
2716; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
2717; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
2718; GFX11-NEXT:    v_or_b32_e32 v2, v4, v2
2719; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v5
2720; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
2721; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
2722; GFX11-NEXT:    v_or_b32_e32 v1, v3, v2
2723; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
2724; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
2725; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
2726; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
2727; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
2728; GFX11-NEXT:    global_store_b32 v[0:1], v0, off
2729; GFX11-NEXT:    s_endpgm
2730  store <32 x i1> %arg0, ptr addrspace(1) undef
2731  ret void
2732}
2733
; amdgpu_cs function that takes a single i1 argument marked inreg and stores
; it to an undef addrspace(1) pointer.
; In the expected output for all three run lines the argument is read from
; s0, masked with 1 (s_and_b32), copied into v0 and written with a byte-sized
; store (buffer_store_byte, flat_store_byte and global_store_b8 respectively).
; The check lines are autogenerated (see the first line of this file);
; regenerate them with utils/update_llc_test_checks.py instead of hand-editing.
2734define amdgpu_cs void @amdgpu_cs_inreg_i1(i1 inreg %arg0) {
2735; SI-LABEL: amdgpu_cs_inreg_i1:
2736; SI:       ; %bb.0:
2737; SI-NEXT:    s_and_b32 s0, s0, 1
2738; SI-NEXT:    s_mov_b32 s3, 0xf000
2739; SI-NEXT:    s_mov_b32 s2, -1
2740; SI-NEXT:    v_mov_b32_e32 v0, s0
2741; SI-NEXT:    buffer_store_byte v0, off, s[0:3], 0
2742; SI-NEXT:    s_endpgm
2743;
2744; VI-LABEL: amdgpu_cs_inreg_i1:
2745; VI:       ; %bb.0:
2746; VI-NEXT:    s_and_b32 s0, s0, 1
2747; VI-NEXT:    v_mov_b32_e32 v0, s0
2748; VI-NEXT:    flat_store_byte v[0:1], v0
2749; VI-NEXT:    s_endpgm
2750;
2751; GFX11-LABEL: amdgpu_cs_inreg_i1:
2752; GFX11:       ; %bb.0:
2753; GFX11-NEXT:    s_and_b32 s0, s0, 1
2754; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
2755; GFX11-NEXT:    v_mov_b32_e32 v0, s0
2756; GFX11-NEXT:    global_store_b8 v[0:1], v0, off
2757; GFX11-NEXT:    s_endpgm
2758  store i1 %arg0, ptr addrspace(1) undef
2759  ret void
2760}
2761
; amdgpu_cs function that takes a <8 x i1> argument marked inreg and stores
; it to an undef addrspace(1) pointer.
; In the expected output for all three run lines the elements are read from
; s0 through s7, combined with scalar and/shift/or instructions into one
; value in s0, copied into v0 and written with a byte-sized store
; (buffer_store_byte, flat_store_byte and global_store_b8 respectively).
; On SI the shifted s3 is placed in s8 because s3 is then overwritten with
; 0xf000 as part of s[0:3], the resource operand of the buffer store.
; The check lines are autogenerated (see the first line of this file);
; regenerate them with utils/update_llc_test_checks.py instead of hand-editing.
2762define amdgpu_cs void @amdgpu_cs_inreg_v8i1(<8 x i1> inreg %arg0) {
2763; SI-LABEL: amdgpu_cs_inreg_v8i1:
2764; SI:       ; %bb.0:
2765; SI-NEXT:    s_lshl_b32 s7, s7, 3
2766; SI-NEXT:    s_and_b32 s6, s6, 1
2767; SI-NEXT:    s_lshl_b32 s5, s5, 1
2768; SI-NEXT:    s_and_b32 s4, s4, 1
2769; SI-NEXT:    s_lshl_b32 s8, s3, 3
2770; SI-NEXT:    s_and_b32 s2, s2, 1
2771; SI-NEXT:    s_lshl_b32 s1, s1, 1
2772; SI-NEXT:    s_and_b32 s0, s0, 1
2773; SI-NEXT:    s_mov_b32 s3, 0xf000
2774; SI-NEXT:    s_lshl_b32 s6, s6, 2
2775; SI-NEXT:    s_or_b32 s4, s4, s5
2776; SI-NEXT:    s_lshl_b32 s2, s2, 2
2777; SI-NEXT:    s_or_b32 s0, s0, s1
2778; SI-NEXT:    s_or_b32 s1, s7, s6
2779; SI-NEXT:    s_and_b32 s4, s4, 3
2780; SI-NEXT:    s_or_b32 s2, s8, s2
2781; SI-NEXT:    s_and_b32 s0, s0, 3
2782; SI-NEXT:    s_or_b32 s1, s4, s1
2783; SI-NEXT:    s_or_b32 s0, s0, s2
2784; SI-NEXT:    s_lshl_b32 s1, s1, 4
2785; SI-NEXT:    s_and_b32 s0, s0, 15
2786; SI-NEXT:    s_or_b32 s0, s0, s1
2787; SI-NEXT:    s_mov_b32 s2, -1
2788; SI-NEXT:    v_mov_b32_e32 v0, s0
2789; SI-NEXT:    buffer_store_byte v0, off, s[0:3], 0
2790; SI-NEXT:    s_endpgm
2791;
2792; VI-LABEL: amdgpu_cs_inreg_v8i1:
2793; VI:       ; %bb.0:
2794; VI-NEXT:    s_and_b32 s6, s6, 1
2795; VI-NEXT:    s_lshl_b32 s5, s5, 1
2796; VI-NEXT:    s_and_b32 s4, s4, 1
2797; VI-NEXT:    s_and_b32 s2, s2, 1
2798; VI-NEXT:    s_lshl_b32 s1, s1, 1
2799; VI-NEXT:    s_and_b32 s0, s0, 1
2800; VI-NEXT:    s_lshl_b32 s7, s7, 3
2801; VI-NEXT:    s_lshl_b32 s6, s6, 2
2802; VI-NEXT:    s_or_b32 s4, s4, s5
2803; VI-NEXT:    s_lshl_b32 s3, s3, 3
2804; VI-NEXT:    s_lshl_b32 s2, s2, 2
2805; VI-NEXT:    s_or_b32 s0, s0, s1
2806; VI-NEXT:    s_or_b32 s6, s7, s6
2807; VI-NEXT:    s_and_b32 s4, s4, 3
2808; VI-NEXT:    s_or_b32 s2, s3, s2
2809; VI-NEXT:    s_and_b32 s0, s0, 3
2810; VI-NEXT:    s_or_b32 s4, s4, s6
2811; VI-NEXT:    s_or_b32 s0, s0, s2
2812; VI-NEXT:    s_lshl_b32 s4, s4, 4
2813; VI-NEXT:    s_and_b32 s0, s0, 15
2814; VI-NEXT:    s_or_b32 s0, s0, s4
2815; VI-NEXT:    v_mov_b32_e32 v0, s0
2816; VI-NEXT:    flat_store_byte v[0:1], v0
2817; VI-NEXT:    s_endpgm
2818;
2819; GFX11-LABEL: amdgpu_cs_inreg_v8i1:
2820; GFX11:       ; %bb.0:
2821; GFX11-NEXT:    s_and_b32 s6, s6, 1
2822; GFX11-NEXT:    s_lshl_b32 s5, s5, 1
2823; GFX11-NEXT:    s_and_b32 s4, s4, 1
2824; GFX11-NEXT:    s_and_b32 s2, s2, 1
2825; GFX11-NEXT:    s_lshl_b32 s1, s1, 1
2826; GFX11-NEXT:    s_and_b32 s0, s0, 1
2827; GFX11-NEXT:    s_lshl_b32 s7, s7, 3
2828; GFX11-NEXT:    s_lshl_b32 s6, s6, 2
2829; GFX11-NEXT:    s_or_b32 s4, s4, s5
2830; GFX11-NEXT:    s_lshl_b32 s3, s3, 3
2831; GFX11-NEXT:    s_lshl_b32 s2, s2, 2
2832; GFX11-NEXT:    s_or_b32 s0, s0, s1
2833; GFX11-NEXT:    s_or_b32 s5, s7, s6
2834; GFX11-NEXT:    s_and_b32 s4, s4, 3
2835; GFX11-NEXT:    s_or_b32 s1, s3, s2
2836; GFX11-NEXT:    s_and_b32 s0, s0, 3
2837; GFX11-NEXT:    s_or_b32 s2, s4, s5
2838; GFX11-NEXT:    s_or_b32 s0, s0, s1
2839; GFX11-NEXT:    s_lshl_b32 s1, s2, 4
2840; GFX11-NEXT:    s_and_b32 s0, s0, 15
2841; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
2842; GFX11-NEXT:    s_or_b32 s0, s0, s1
2843; GFX11-NEXT:    v_mov_b32_e32 v0, s0
2844; GFX11-NEXT:    global_store_b8 v[0:1], v0, off
2845; GFX11-NEXT:    s_endpgm
2846  store <8 x i1> %arg0, ptr addrspace(1) undef
2847  ret void
2848}
2849
; amdgpu_cs function that takes a <16 x i1> argument marked inreg and stores
; it to an undef addrspace(1) pointer.
; In the expected output for all three run lines the elements are read from
; s0 through s15, combined with scalar and/shift/or instructions into one
; value in s0, copied into v0 and written with a 16-bit store
; (buffer_store_short, flat_store_short and global_store_b16 respectively).
; On SI the shifted s3 is placed in s16 because s3 is then overwritten with
; 0xf000 as part of s[0:3], the resource operand of the buffer store.
; The check lines are autogenerated (see the first line of this file);
; regenerate them with utils/update_llc_test_checks.py instead of hand-editing.
2850define amdgpu_cs void @amdgpu_cs_inreg_v16i1(<16 x i1> inreg %arg0) {
2851; SI-LABEL: amdgpu_cs_inreg_v16i1:
2852; SI:       ; %bb.0:
2853; SI-NEXT:    s_lshl_b32 s15, s15, 3
2854; SI-NEXT:    s_and_b32 s14, s14, 1
2855; SI-NEXT:    s_lshl_b32 s13, s13, 1
2856; SI-NEXT:    s_and_b32 s12, s12, 1
2857; SI-NEXT:    s_lshl_b32 s11, s11, 3
2858; SI-NEXT:    s_and_b32 s10, s10, 1
2859; SI-NEXT:    s_lshl_b32 s9, s9, 1
2860; SI-NEXT:    s_and_b32 s8, s8, 1
2861; SI-NEXT:    s_lshl_b32 s7, s7, 3
2862; SI-NEXT:    s_and_b32 s6, s6, 1
2863; SI-NEXT:    s_lshl_b32 s5, s5, 1
2864; SI-NEXT:    s_and_b32 s4, s4, 1
2865; SI-NEXT:    s_lshl_b32 s16, s3, 3
2866; SI-NEXT:    s_and_b32 s2, s2, 1
2867; SI-NEXT:    s_lshl_b32 s1, s1, 1
2868; SI-NEXT:    s_and_b32 s0, s0, 1
2869; SI-NEXT:    s_mov_b32 s3, 0xf000
2870; SI-NEXT:    s_lshl_b32 s14, s14, 2
2871; SI-NEXT:    s_or_b32 s12, s12, s13
2872; SI-NEXT:    s_lshl_b32 s10, s10, 2
2873; SI-NEXT:    s_or_b32 s8, s8, s9
2874; SI-NEXT:    s_lshl_b32 s6, s6, 2
2875; SI-NEXT:    s_or_b32 s4, s4, s5
2876; SI-NEXT:    s_lshl_b32 s2, s2, 2
2877; SI-NEXT:    s_or_b32 s0, s0, s1
2878; SI-NEXT:    s_or_b32 s1, s15, s14
2879; SI-NEXT:    s_and_b32 s5, s12, 3
2880; SI-NEXT:    s_or_b32 s9, s11, s10
2881; SI-NEXT:    s_and_b32 s8, s8, 3
2882; SI-NEXT:    s_or_b32 s6, s7, s6
2883; SI-NEXT:    s_and_b32 s4, s4, 3
2884; SI-NEXT:    s_or_b32 s2, s16, s2
2885; SI-NEXT:    s_and_b32 s0, s0, 3
2886; SI-NEXT:    s_or_b32 s1, s5, s1
2887; SI-NEXT:    s_or_b32 s5, s8, s9
2888; SI-NEXT:    s_or_b32 s4, s4, s6
2889; SI-NEXT:    s_or_b32 s0, s0, s2
2890; SI-NEXT:    s_lshl_b32 s1, s1, 12
2891; SI-NEXT:    s_and_b32 s2, s5, 15
2892; SI-NEXT:    s_lshl_b32 s4, s4, 4
2893; SI-NEXT:    s_and_b32 s0, s0, 15
2894; SI-NEXT:    s_lshl_b32 s2, s2, 8
2895; SI-NEXT:    s_or_b32 s0, s0, s4
2896; SI-NEXT:    s_or_b32 s1, s1, s2
2897; SI-NEXT:    s_and_b32 s0, s0, 0xff
2898; SI-NEXT:    s_or_b32 s0, s0, s1
2899; SI-NEXT:    s_mov_b32 s2, -1
2900; SI-NEXT:    v_mov_b32_e32 v0, s0
2901; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
2902; SI-NEXT:    s_endpgm
2903;
2904; VI-LABEL: amdgpu_cs_inreg_v16i1:
2905; VI:       ; %bb.0:
2906; VI-NEXT:    s_and_b32 s10, s10, 1
2907; VI-NEXT:    s_lshl_b32 s9, s9, 1
2908; VI-NEXT:    s_and_b32 s8, s8, 1
2909; VI-NEXT:    s_and_b32 s6, s6, 1
2910; VI-NEXT:    s_lshl_b32 s5, s5, 1
2911; VI-NEXT:    s_and_b32 s4, s4, 1
2912; VI-NEXT:    s_and_b32 s2, s2, 1
2913; VI-NEXT:    s_lshl_b32 s1, s1, 1
2914; VI-NEXT:    s_and_b32 s0, s0, 1
2915; VI-NEXT:    s_and_b32 s14, s14, 1
2916; VI-NEXT:    s_lshl_b32 s13, s13, 1
2917; VI-NEXT:    s_and_b32 s12, s12, 1
2918; VI-NEXT:    s_lshl_b32 s11, s11, 3
2919; VI-NEXT:    s_lshl_b32 s10, s10, 2
2920; VI-NEXT:    s_or_b32 s8, s8, s9
2921; VI-NEXT:    s_lshl_b32 s7, s7, 3
2922; VI-NEXT:    s_lshl_b32 s6, s6, 2
2923; VI-NEXT:    s_or_b32 s4, s4, s5
2924; VI-NEXT:    s_lshl_b32 s3, s3, 3
2925; VI-NEXT:    s_lshl_b32 s2, s2, 2
2926; VI-NEXT:    s_or_b32 s0, s0, s1
2927; VI-NEXT:    s_lshl_b32 s15, s15, 3
2928; VI-NEXT:    s_lshl_b32 s14, s14, 2
2929; VI-NEXT:    s_or_b32 s12, s12, s13
2930; VI-NEXT:    s_or_b32 s10, s11, s10
2931; VI-NEXT:    s_and_b32 s8, s8, 3
2932; VI-NEXT:    s_or_b32 s6, s7, s6
2933; VI-NEXT:    s_and_b32 s4, s4, 3
2934; VI-NEXT:    s_or_b32 s2, s3, s2
2935; VI-NEXT:    s_and_b32 s0, s0, 3
2936; VI-NEXT:    s_or_b32 s14, s15, s14
2937; VI-NEXT:    s_and_b32 s12, s12, 3
2938; VI-NEXT:    s_or_b32 s8, s8, s10
2939; VI-NEXT:    s_or_b32 s4, s4, s6
2940; VI-NEXT:    s_or_b32 s0, s0, s2
2941; VI-NEXT:    s_or_b32 s12, s12, s14
2942; VI-NEXT:    s_and_b32 s8, s8, 15
2943; VI-NEXT:    s_lshl_b32 s4, s4, 4
2944; VI-NEXT:    s_and_b32 s0, s0, 15
2945; VI-NEXT:    s_lshl_b32 s12, s12, 12
2946; VI-NEXT:    s_lshl_b32 s8, s8, 8
2947; VI-NEXT:    s_or_b32 s0, s0, s4
2948; VI-NEXT:    s_or_b32 s8, s12, s8
2949; VI-NEXT:    s_and_b32 s0, s0, 0xff
2950; VI-NEXT:    s_or_b32 s0, s0, s8
2951; VI-NEXT:    v_mov_b32_e32 v0, s0
2952; VI-NEXT:    flat_store_short v[0:1], v0
2953; VI-NEXT:    s_endpgm
2954;
2955; GFX11-LABEL: amdgpu_cs_inreg_v16i1:
2956; GFX11:       ; %bb.0:
2957; GFX11-NEXT:    s_and_b32 s10, s10, 1
2958; GFX11-NEXT:    s_lshl_b32 s9, s9, 1
2959; GFX11-NEXT:    s_and_b32 s8, s8, 1
2960; GFX11-NEXT:    s_and_b32 s6, s6, 1
2961; GFX11-NEXT:    s_lshl_b32 s5, s5, 1
2962; GFX11-NEXT:    s_and_b32 s4, s4, 1
2963; GFX11-NEXT:    s_and_b32 s2, s2, 1
2964; GFX11-NEXT:    s_lshl_b32 s1, s1, 1
2965; GFX11-NEXT:    s_and_b32 s0, s0, 1
2966; GFX11-NEXT:    s_and_b32 s14, s14, 1
2967; GFX11-NEXT:    s_lshl_b32 s13, s13, 1
2968; GFX11-NEXT:    s_and_b32 s12, s12, 1
2969; GFX11-NEXT:    s_lshl_b32 s11, s11, 3
2970; GFX11-NEXT:    s_lshl_b32 s10, s10, 2
2971; GFX11-NEXT:    s_or_b32 s8, s8, s9
2972; GFX11-NEXT:    s_lshl_b32 s7, s7, 3
2973; GFX11-NEXT:    s_lshl_b32 s6, s6, 2
2974; GFX11-NEXT:    s_or_b32 s4, s4, s5
2975; GFX11-NEXT:    s_lshl_b32 s3, s3, 3
2976; GFX11-NEXT:    s_lshl_b32 s2, s2, 2
2977; GFX11-NEXT:    s_or_b32 s0, s0, s1
2978; GFX11-NEXT:    s_lshl_b32 s15, s15, 3
2979; GFX11-NEXT:    s_lshl_b32 s14, s14, 2
2980; GFX11-NEXT:    s_or_b32 s12, s12, s13
2981; GFX11-NEXT:    s_or_b32 s9, s11, s10
2982; GFX11-NEXT:    s_and_b32 s8, s8, 3
2983; GFX11-NEXT:    s_or_b32 s5, s7, s6
2984; GFX11-NEXT:    s_and_b32 s4, s4, 3
2985; GFX11-NEXT:    s_or_b32 s1, s3, s2
2986; GFX11-NEXT:    s_and_b32 s0, s0, 3
2987; GFX11-NEXT:    s_or_b32 s13, s15, s14
2988; GFX11-NEXT:    s_and_b32 s12, s12, 3
2989; GFX11-NEXT:    s_or_b32 s8, s8, s9
2990; GFX11-NEXT:    s_or_b32 s2, s4, s5
2991; GFX11-NEXT:    s_or_b32 s0, s0, s1
2992; GFX11-NEXT:    s_or_b32 s10, s12, s13
2993; GFX11-NEXT:    s_and_b32 s8, s8, 15
2994; GFX11-NEXT:    s_lshl_b32 s1, s2, 4
2995; GFX11-NEXT:    s_and_b32 s0, s0, 15
2996; GFX11-NEXT:    s_lshl_b32 s9, s10, 12
2997; GFX11-NEXT:    s_lshl_b32 s2, s8, 8
2998; GFX11-NEXT:    s_or_b32 s0, s0, s1
2999; GFX11-NEXT:    s_or_b32 s1, s9, s2
3000; GFX11-NEXT:    s_and_b32 s0, s0, 0xff
3001; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
3002; GFX11-NEXT:    s_or_b32 s0, s0, s1
3003; GFX11-NEXT:    v_mov_b32_e32 v0, s0
3004; GFX11-NEXT:    global_store_b16 v[0:1], v0, off
3005; GFX11-NEXT:    s_endpgm
3006  store <16 x i1> %arg0, ptr addrspace(1) undef
3007  ret void
3008}
3009
3010define amdgpu_cs void @amdgpu_cs_inreg_v32i1(<32 x i1> inreg %arg0) {
3011; SI-LABEL: amdgpu_cs_inreg_v32i1:
3012; SI:       ; %bb.0:
3013; SI-NEXT:    s_lshl_b32 s31, s31, 3
3014; SI-NEXT:    s_and_b32 s30, s30, 1
3015; SI-NEXT:    s_lshl_b32 s29, s29, 1
3016; SI-NEXT:    s_and_b32 s28, s28, 1
3017; SI-NEXT:    s_lshl_b32 s27, s27, 3
3018; SI-NEXT:    s_and_b32 s26, s26, 1
3019; SI-NEXT:    s_lshl_b32 s25, s25, 1
3020; SI-NEXT:    s_and_b32 s24, s24, 1
3021; SI-NEXT:    s_lshl_b32 s23, s23, 3
3022; SI-NEXT:    s_and_b32 s22, s22, 1
3023; SI-NEXT:    s_lshl_b32 s21, s21, 1
3024; SI-NEXT:    s_and_b32 s20, s20, 1
3025; SI-NEXT:    s_lshl_b32 s19, s19, 3
3026; SI-NEXT:    s_and_b32 s18, s18, 1
3027; SI-NEXT:    s_lshl_b32 s17, s17, 1
3028; SI-NEXT:    s_and_b32 s16, s16, 1
3029; SI-NEXT:    s_lshl_b32 s15, s15, 3
3030; SI-NEXT:    s_and_b32 s14, s14, 1
3031; SI-NEXT:    s_lshl_b32 s13, s13, 1
3032; SI-NEXT:    s_and_b32 s12, s12, 1
3033; SI-NEXT:    s_lshl_b32 s11, s11, 3
3034; SI-NEXT:    s_and_b32 s10, s10, 1
3035; SI-NEXT:    s_lshl_b32 s9, s9, 1
3036; SI-NEXT:    s_and_b32 s8, s8, 1
3037; SI-NEXT:    s_lshl_b32 s7, s7, 3
3038; SI-NEXT:    s_and_b32 s6, s6, 1
3039; SI-NEXT:    s_lshl_b32 s5, s5, 1
3040; SI-NEXT:    s_and_b32 s4, s4, 1
3041; SI-NEXT:    s_lshl_b32 s33, s3, 3
3042; SI-NEXT:    s_and_b32 s2, s2, 1
3043; SI-NEXT:    s_lshl_b32 s1, s1, 1
3044; SI-NEXT:    s_and_b32 s0, s0, 1
3045; SI-NEXT:    s_mov_b32 s3, 0xf000
3046; SI-NEXT:    s_lshl_b32 s30, s30, 2
3047; SI-NEXT:    s_or_b32 s28, s28, s29
3048; SI-NEXT:    s_lshl_b32 s26, s26, 2
3049; SI-NEXT:    s_or_b32 s24, s24, s25
3050; SI-NEXT:    s_lshl_b32 s22, s22, 2
3051; SI-NEXT:    s_or_b32 s20, s20, s21
3052; SI-NEXT:    s_lshl_b32 s18, s18, 2
3053; SI-NEXT:    s_or_b32 s16, s16, s17
3054; SI-NEXT:    s_lshl_b32 s14, s14, 2
3055; SI-NEXT:    s_or_b32 s12, s12, s13
3056; SI-NEXT:    s_lshl_b32 s10, s10, 2
3057; SI-NEXT:    s_or_b32 s8, s8, s9
3058; SI-NEXT:    s_lshl_b32 s6, s6, 2
3059; SI-NEXT:    s_or_b32 s4, s4, s5
3060; SI-NEXT:    s_lshl_b32 s2, s2, 2
3061; SI-NEXT:    s_or_b32 s0, s0, s1
3062; SI-NEXT:    s_or_b32 s1, s31, s30
3063; SI-NEXT:    s_and_b32 s5, s28, 3
3064; SI-NEXT:    s_or_b32 s9, s27, s26
3065; SI-NEXT:    s_and_b32 s13, s24, 3
3066; SI-NEXT:    s_or_b32 s17, s23, s22
3067; SI-NEXT:    s_and_b32 s20, s20, 3
3068; SI-NEXT:    s_or_b32 s18, s19, s18
3069; SI-NEXT:    s_and_b32 s16, s16, 3
3070; SI-NEXT:    s_or_b32 s14, s15, s14
3071; SI-NEXT:    s_and_b32 s12, s12, 3
3072; SI-NEXT:    s_or_b32 s10, s11, s10
3073; SI-NEXT:    s_and_b32 s8, s8, 3
3074; SI-NEXT:    s_or_b32 s6, s7, s6
3075; SI-NEXT:    s_and_b32 s4, s4, 3
3076; SI-NEXT:    s_or_b32 s2, s33, s2
3077; SI-NEXT:    s_and_b32 s0, s0, 3
3078; SI-NEXT:    s_or_b32 s1, s5, s1
3079; SI-NEXT:    s_or_b32 s5, s13, s9
3080; SI-NEXT:    s_or_b32 s7, s20, s17
3081; SI-NEXT:    s_or_b32 s9, s16, s18
3082; SI-NEXT:    s_or_b32 s11, s12, s14
3083; SI-NEXT:    s_or_b32 s8, s8, s10
3084; SI-NEXT:    s_or_b32 s4, s4, s6
3085; SI-NEXT:    s_or_b32 s0, s0, s2
3086; SI-NEXT:    s_lshl_b32 s1, s1, 12
3087; SI-NEXT:    s_and_b32 s2, s5, 15
3088; SI-NEXT:    s_lshl_b32 s5, s7, 4
3089; SI-NEXT:    s_and_b32 s6, s9, 15
3090; SI-NEXT:    s_lshl_b32 s7, s11, 12
3091; SI-NEXT:    s_and_b32 s8, s8, 15
3092; SI-NEXT:    s_lshl_b32 s4, s4, 4
3093; SI-NEXT:    s_and_b32 s0, s0, 15
3094; SI-NEXT:    s_lshl_b32 s2, s2, 8
3095; SI-NEXT:    s_or_b32 s5, s6, s5
3096; SI-NEXT:    s_lshl_b32 s6, s8, 8
3097; SI-NEXT:    s_or_b32 s0, s0, s4
3098; SI-NEXT:    s_or_b32 s1, s1, s2
3099; SI-NEXT:    s_and_b32 s2, s5, 0xff
3100; SI-NEXT:    s_or_b32 s4, s7, s6
3101; SI-NEXT:    s_and_b32 s0, s0, 0xff
3102; SI-NEXT:    s_or_b32 s1, s2, s1
3103; SI-NEXT:    s_or_b32 s0, s0, s4
3104; SI-NEXT:    s_lshl_b32 s1, s1, 16
3105; SI-NEXT:    s_and_b32 s0, s0, 0xffff
3106; SI-NEXT:    s_or_b32 s0, s0, s1
3107; SI-NEXT:    s_mov_b32 s2, -1
3108; SI-NEXT:    v_mov_b32_e32 v0, s0
3109; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3110; SI-NEXT:    s_endpgm
3111;
3112; VI-LABEL: amdgpu_cs_inreg_v32i1:
3113; VI:       ; %bb.0:
3114; VI-NEXT:    s_and_b32 s26, s26, 1
3115; VI-NEXT:    s_lshl_b32 s25, s25, 1
3116; VI-NEXT:    s_and_b32 s24, s24, 1
3117; VI-NEXT:    s_and_b32 s22, s22, 1
3118; VI-NEXT:    s_lshl_b32 s21, s21, 1
3119; VI-NEXT:    s_and_b32 s20, s20, 1
3120; VI-NEXT:    s_and_b32 s18, s18, 1
3121; VI-NEXT:    s_lshl_b32 s17, s17, 1
3122; VI-NEXT:    s_and_b32 s16, s16, 1
3123; VI-NEXT:    s_and_b32 s10, s10, 1
3124; VI-NEXT:    s_lshl_b32 s9, s9, 1
3125; VI-NEXT:    s_and_b32 s8, s8, 1
3126; VI-NEXT:    s_and_b32 s6, s6, 1
3127; VI-NEXT:    s_lshl_b32 s5, s5, 1
3128; VI-NEXT:    s_and_b32 s4, s4, 1
3129; VI-NEXT:    s_and_b32 s2, s2, 1
3130; VI-NEXT:    s_lshl_b32 s1, s1, 1
3131; VI-NEXT:    s_and_b32 s0, s0, 1
3132; VI-NEXT:    s_and_b32 s30, s30, 1
3133; VI-NEXT:    s_lshl_b32 s29, s29, 1
3134; VI-NEXT:    s_and_b32 s28, s28, 1
3135; VI-NEXT:    s_lshl_b32 s27, s27, 3
3136; VI-NEXT:    s_lshl_b32 s26, s26, 2
3137; VI-NEXT:    s_or_b32 s24, s24, s25
3138; VI-NEXT:    s_lshl_b32 s23, s23, 3
3139; VI-NEXT:    s_lshl_b32 s22, s22, 2
3140; VI-NEXT:    s_or_b32 s20, s20, s21
3141; VI-NEXT:    s_lshl_b32 s19, s19, 3
3142; VI-NEXT:    s_lshl_b32 s18, s18, 2
3143; VI-NEXT:    s_or_b32 s16, s16, s17
3144; VI-NEXT:    s_and_b32 s14, s14, 1
3145; VI-NEXT:    s_lshl_b32 s13, s13, 1
3146; VI-NEXT:    s_and_b32 s12, s12, 1
3147; VI-NEXT:    s_lshl_b32 s11, s11, 3
3148; VI-NEXT:    s_lshl_b32 s10, s10, 2
3149; VI-NEXT:    s_or_b32 s8, s8, s9
3150; VI-NEXT:    s_lshl_b32 s7, s7, 3
3151; VI-NEXT:    s_lshl_b32 s6, s6, 2
3152; VI-NEXT:    s_or_b32 s4, s4, s5
3153; VI-NEXT:    s_lshl_b32 s3, s3, 3
3154; VI-NEXT:    s_lshl_b32 s2, s2, 2
3155; VI-NEXT:    s_or_b32 s0, s0, s1
3156; VI-NEXT:    s_lshl_b32 s31, s31, 3
3157; VI-NEXT:    s_lshl_b32 s30, s30, 2
3158; VI-NEXT:    s_or_b32 s28, s28, s29
3159; VI-NEXT:    s_or_b32 s26, s27, s26
3160; VI-NEXT:    s_and_b32 s24, s24, 3
3161; VI-NEXT:    s_or_b32 s22, s23, s22
3162; VI-NEXT:    s_and_b32 s20, s20, 3
3163; VI-NEXT:    s_or_b32 s18, s19, s18
3164; VI-NEXT:    s_and_b32 s16, s16, 3
3165; VI-NEXT:    s_lshl_b32 s15, s15, 3
3166; VI-NEXT:    s_lshl_b32 s14, s14, 2
3167; VI-NEXT:    s_or_b32 s12, s12, s13
3168; VI-NEXT:    s_or_b32 s10, s11, s10
3169; VI-NEXT:    s_and_b32 s8, s8, 3
3170; VI-NEXT:    s_or_b32 s6, s7, s6
3171; VI-NEXT:    s_and_b32 s4, s4, 3
3172; VI-NEXT:    s_or_b32 s2, s3, s2
3173; VI-NEXT:    s_and_b32 s0, s0, 3
3174; VI-NEXT:    s_or_b32 s30, s31, s30
3175; VI-NEXT:    s_and_b32 s28, s28, 3
3176; VI-NEXT:    s_or_b32 s24, s24, s26
3177; VI-NEXT:    s_or_b32 s20, s20, s22
3178; VI-NEXT:    s_or_b32 s16, s16, s18
3179; VI-NEXT:    s_or_b32 s14, s15, s14
3180; VI-NEXT:    s_and_b32 s12, s12, 3
3181; VI-NEXT:    s_or_b32 s8, s8, s10
3182; VI-NEXT:    s_or_b32 s4, s4, s6
3183; VI-NEXT:    s_or_b32 s0, s0, s2
3184; VI-NEXT:    s_or_b32 s28, s28, s30
3185; VI-NEXT:    s_and_b32 s24, s24, 15
3186; VI-NEXT:    s_lshl_b32 s20, s20, 4
3187; VI-NEXT:    s_and_b32 s16, s16, 15
3188; VI-NEXT:    s_or_b32 s12, s12, s14
3189; VI-NEXT:    s_and_b32 s8, s8, 15
3190; VI-NEXT:    s_lshl_b32 s4, s4, 4
3191; VI-NEXT:    s_and_b32 s0, s0, 15
3192; VI-NEXT:    s_lshl_b32 s28, s28, 12
3193; VI-NEXT:    s_lshl_b32 s24, s24, 8
3194; VI-NEXT:    s_or_b32 s16, s16, s20
3195; VI-NEXT:    s_lshl_b32 s12, s12, 12
3196; VI-NEXT:    s_lshl_b32 s8, s8, 8
3197; VI-NEXT:    s_or_b32 s0, s0, s4
3198; VI-NEXT:    s_or_b32 s24, s28, s24
3199; VI-NEXT:    s_and_b32 s16, s16, 0xff
3200; VI-NEXT:    s_or_b32 s8, s12, s8
3201; VI-NEXT:    s_and_b32 s0, s0, 0xff
3202; VI-NEXT:    s_or_b32 s16, s16, s24
3203; VI-NEXT:    s_or_b32 s0, s0, s8
3204; VI-NEXT:    s_lshl_b32 s16, s16, 16
3205; VI-NEXT:    s_and_b32 s0, s0, 0xffff
3206; VI-NEXT:    s_or_b32 s0, s0, s16
3207; VI-NEXT:    v_mov_b32_e32 v0, s0
3208; VI-NEXT:    flat_store_dword v[0:1], v0
3209; VI-NEXT:    s_endpgm
3210;
3211; GFX11-LABEL: amdgpu_cs_inreg_v32i1:
3212; GFX11:       ; %bb.0:
3213; GFX11-NEXT:    s_and_b32 s10, s10, 1
3214; GFX11-NEXT:    s_lshl_b32 s9, s9, 1
3215; GFX11-NEXT:    s_and_b32 s8, s8, 1
3216; GFX11-NEXT:    s_and_b32 s14, s14, 1
3217; GFX11-NEXT:    s_lshl_b32 s13, s13, 1
3218; GFX11-NEXT:    s_and_b32 s12, s12, 1
3219; GFX11-NEXT:    s_lshl_b32 s11, s11, 3
3220; GFX11-NEXT:    s_lshl_b32 s10, s10, 2
3221; GFX11-NEXT:    s_or_b32 s8, s8, s9
3222; GFX11-NEXT:    s_and_b32 s6, s6, 1
3223; GFX11-NEXT:    s_lshl_b32 s5, s5, 1
3224; GFX11-NEXT:    s_and_b32 s4, s4, 1
3225; GFX11-NEXT:    s_and_b32 s2, s2, 1
3226; GFX11-NEXT:    s_lshl_b32 s1, s1, 1
3227; GFX11-NEXT:    s_and_b32 s0, s0, 1
3228; GFX11-NEXT:    s_lshl_b32 s15, s15, 3
3229; GFX11-NEXT:    s_lshl_b32 s14, s14, 2
3230; GFX11-NEXT:    s_or_b32 s12, s12, s13
3231; GFX11-NEXT:    s_or_b32 s9, s11, s10
3232; GFX11-NEXT:    s_and_b32 s8, s8, 3
3233; GFX11-NEXT:    s_lshl_b32 s7, s7, 3
3234; GFX11-NEXT:    s_lshl_b32 s6, s6, 2
3235; GFX11-NEXT:    s_or_b32 s4, s4, s5
3236; GFX11-NEXT:    s_lshl_b32 s3, s3, 3
3237; GFX11-NEXT:    s_lshl_b32 s2, s2, 2
3238; GFX11-NEXT:    s_or_b32 s0, s0, s1
3239; GFX11-NEXT:    s_or_b32 s13, s15, s14
3240; GFX11-NEXT:    s_and_b32 s12, s12, 3
3241; GFX11-NEXT:    s_or_b32 s8, s8, s9
3242; GFX11-NEXT:    s_or_b32 s5, s7, s6
3243; GFX11-NEXT:    s_and_b32 s4, s4, 3
3244; GFX11-NEXT:    s_or_b32 s1, s3, s2
3245; GFX11-NEXT:    s_and_b32 s0, s0, 3
3246; GFX11-NEXT:    s_or_b32 s10, s12, s13
3247; GFX11-NEXT:    s_and_b32 s8, s8, 15
3248; GFX11-NEXT:    s_or_b32 s2, s4, s5
3249; GFX11-NEXT:    s_or_b32 s0, s0, s1
3250; GFX11-NEXT:    s_lshl_b32 s9, s10, 12
3251; GFX11-NEXT:    s_lshl_b32 s1, s2, 4
3252; GFX11-NEXT:    s_and_b32 s0, s0, 15
3253; GFX11-NEXT:    s_lshl_b32 s2, s8, 8
3254; GFX11-NEXT:    s_and_b32 s3, s30, 1
3255; GFX11-NEXT:    s_lshl_b32 s4, s29, 1
3256; GFX11-NEXT:    s_and_b32 s5, s28, 1
3257; GFX11-NEXT:    s_or_b32 s0, s0, s1
3258; GFX11-NEXT:    s_or_b32 s1, s9, s2
3259; GFX11-NEXT:    s_lshl_b32 s2, s31, 3
3260; GFX11-NEXT:    s_lshl_b32 s3, s3, 2
3261; GFX11-NEXT:    s_or_b32 s4, s5, s4
3262; GFX11-NEXT:    s_and_b32 s5, s26, 1
3263; GFX11-NEXT:    s_lshl_b32 s6, s25, 1
3264; GFX11-NEXT:    s_and_b32 s7, s24, 1
3265; GFX11-NEXT:    s_or_b32 s2, s2, s3
3266; GFX11-NEXT:    s_and_b32 s3, s4, 3
3267; GFX11-NEXT:    s_lshl_b32 s4, s27, 3
3268; GFX11-NEXT:    s_lshl_b32 s5, s5, 2
3269; GFX11-NEXT:    s_or_b32 s6, s7, s6
3270; GFX11-NEXT:    s_or_b32 s4, s4, s5
3271; GFX11-NEXT:    s_and_b32 s5, s6, 3
3272; GFX11-NEXT:    s_or_b32 s2, s3, s2
3273; GFX11-NEXT:    s_or_b32 s3, s5, s4
3274; GFX11-NEXT:    s_and_b32 s5, s22, 1
3275; GFX11-NEXT:    s_lshl_b32 s6, s21, 1
3276; GFX11-NEXT:    s_and_b32 s7, s20, 1
3277; GFX11-NEXT:    s_lshl_b32 s4, s23, 3
3278; GFX11-NEXT:    s_lshl_b32 s5, s5, 2
3279; GFX11-NEXT:    s_or_b32 s6, s7, s6
3280; GFX11-NEXT:    s_and_b32 s7, s18, 1
3281; GFX11-NEXT:    s_lshl_b32 s8, s17, 1
3282; GFX11-NEXT:    s_and_b32 s9, s16, 1
3283; GFX11-NEXT:    s_or_b32 s4, s4, s5
3284; GFX11-NEXT:    s_and_b32 s5, s6, 3
3285; GFX11-NEXT:    s_lshl_b32 s6, s19, 3
3286; GFX11-NEXT:    s_lshl_b32 s7, s7, 2
3287; GFX11-NEXT:    s_or_b32 s8, s9, s8
3288; GFX11-NEXT:    s_or_b32 s6, s6, s7
3289; GFX11-NEXT:    s_and_b32 s7, s8, 3
3290; GFX11-NEXT:    s_or_b32 s4, s5, s4
3291; GFX11-NEXT:    s_or_b32 s5, s7, s6
3292; GFX11-NEXT:    s_and_b32 s3, s3, 15
3293; GFX11-NEXT:    s_lshl_b32 s4, s4, 4
3294; GFX11-NEXT:    s_and_b32 s5, s5, 15
3295; GFX11-NEXT:    s_lshl_b32 s2, s2, 12
3296; GFX11-NEXT:    s_lshl_b32 s3, s3, 8
3297; GFX11-NEXT:    s_or_b32 s4, s5, s4
3298; GFX11-NEXT:    s_and_b32 s0, s0, 0xff
3299; GFX11-NEXT:    s_or_b32 s2, s2, s3
3300; GFX11-NEXT:    s_and_b32 s3, s4, 0xff
3301; GFX11-NEXT:    s_or_b32 s0, s0, s1
3302; GFX11-NEXT:    s_or_b32 s1, s3, s2
3303; GFX11-NEXT:    s_and_b32 s0, s0, 0xffff
3304; GFX11-NEXT:    s_lshl_b32 s1, s1, 16
3305; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
3306; GFX11-NEXT:    s_or_b32 s0, s0, s1
3307; GFX11-NEXT:    v_mov_b32_e32 v0, s0
3308; GFX11-NEXT:    global_store_b32 v[0:1], v0, off
3309; GFX11-NEXT:    s_endpgm
3310  store <32 x i1> %arg0, ptr addrspace(1) undef
3311  ret void
3312}
3313
; Test: an i1 argument carrying the signext attribute is passed to an amdgpu_cs
; function and stored through an addrspace(1) pointer.
; In the expected output for SI, VI and GFX11 the value is read from v0, masked
; with 1 (v_and_b32) and then written with a byte-sized store.
; The CHECK lines are autogenerated (see the NOTE at the top of the file);
; regenerate them with utils/update_llc_test_checks.py instead of hand-editing.
3314define amdgpu_cs void @amdgpu_cs_i1_sext(i1 signext %arg0) {
3315; SI-LABEL: amdgpu_cs_i1_sext:
3316; SI:       ; %bb.0:
3317; SI-NEXT:    v_and_b32_e32 v0, 1, v0
3318; SI-NEXT:    s_mov_b32 s3, 0xf000
3319; SI-NEXT:    s_mov_b32 s2, -1
3320; SI-NEXT:    buffer_store_byte v0, off, s[0:3], 0
3321; SI-NEXT:    s_endpgm
3322;
3323; VI-LABEL: amdgpu_cs_i1_sext:
3324; VI:       ; %bb.0:
3325; VI-NEXT:    v_and_b32_e32 v0, 1, v0
3326; VI-NEXT:    flat_store_byte v[0:1], v0
3327; VI-NEXT:    s_endpgm
3328;
3329; GFX11-LABEL: amdgpu_cs_i1_sext:
3330; GFX11:       ; %bb.0:
3331; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
3332; GFX11-NEXT:    global_store_b8 v[0:1], v0, off
3333; GFX11-NEXT:    s_endpgm
3334  store i1 %arg0, ptr addrspace(1) undef
3335  ret void
3336}
3337
; Test: an i1 argument carrying the zeroext attribute is passed to an amdgpu_cs
; function and stored through an addrspace(1) pointer.
; In the expected output for SI, VI and GFX11 no masking instruction appears:
; v0 is written directly with a byte-sized store.
; The CHECK lines are autogenerated (see the NOTE at the top of the file);
; regenerate them with utils/update_llc_test_checks.py instead of hand-editing.
3338define amdgpu_cs void @amdgpu_cs_i1_zext(i1 zeroext %arg0) {
3339; SI-LABEL: amdgpu_cs_i1_zext:
3340; SI:       ; %bb.0:
3341; SI-NEXT:    s_mov_b32 s3, 0xf000
3342; SI-NEXT:    s_mov_b32 s2, -1
3343; SI-NEXT:    buffer_store_byte v0, off, s[0:3], 0
3344; SI-NEXT:    s_endpgm
3345;
3346; VI-LABEL: amdgpu_cs_i1_zext:
3347; VI:       ; %bb.0:
3348; VI-NEXT:    flat_store_byte v[0:1], v0
3349; VI-NEXT:    s_endpgm
3350;
3351; GFX11-LABEL: amdgpu_cs_i1_zext:
3352; GFX11:       ; %bb.0:
3353; GFX11-NEXT:    global_store_b8 v[0:1], v0, off
3354; GFX11-NEXT:    s_endpgm
3355  store i1 %arg0, ptr addrspace(1) undef
3356  ret void
3357}
3358
; Attribute group #0: bundles the nounwind and noinline function attributes.
3359attributes #0 = { nounwind noinline }
3360