xref: /llvm-project/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll (revision 7475f0a3454ce2b09c211779a33c41b6d34c8758)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=amdgcn -mcpu=fiji < %s | FileCheck -enable-var-scope -check-prefixes=GCN %s
3
; Insert the constant 1.0 into <4 x float> %vec at the runtime index %sel and
; store the result to %out.
; Expected code: one s_cmp_lg_u32 / v_cndmask_b32 pair per element (indices 3
; down to 0), each choosing between 1.0 and the original element, followed by
; a single flat_store_dwordx4.
4define amdgpu_kernel void @float4_inselt(ptr addrspace(1) %out, <4 x float> %vec, i32 %sel) {
5; GCN-LABEL: float4_inselt:
6; GCN:       ; %bb.0: ; %entry
7; GCN-NEXT:    s_load_dword s6, s[4:5], 0x44
8; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x34
9; GCN-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x24
10; GCN-NEXT:    s_waitcnt lgkmcnt(0)
11; GCN-NEXT:    s_cmp_lg_u32 s6, 3
12; GCN-NEXT:    v_mov_b32_e32 v0, s3
13; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
14; GCN-NEXT:    s_cmp_lg_u32 s6, 2
15; GCN-NEXT:    v_cndmask_b32_e32 v3, 1.0, v0, vcc
16; GCN-NEXT:    v_mov_b32_e32 v0, s2
17; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
18; GCN-NEXT:    s_cmp_lg_u32 s6, 1
19; GCN-NEXT:    v_cndmask_b32_e32 v2, 1.0, v0, vcc
20; GCN-NEXT:    v_mov_b32_e32 v0, s1
21; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
22; GCN-NEXT:    s_cmp_lg_u32 s6, 0
23; GCN-NEXT:    v_cndmask_b32_e32 v1, 1.0, v0, vcc
24; GCN-NEXT:    v_mov_b32_e32 v0, s0
25; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
26; GCN-NEXT:    v_mov_b32_e32 v4, s4
27; GCN-NEXT:    v_cndmask_b32_e32 v0, 1.0, v0, vcc
28; GCN-NEXT:    v_mov_b32_e32 v5, s5
29; GCN-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
30; GCN-NEXT:    s_endpgm
31entry:
32  %v = insertelement <4 x float> %vec, float 1.000000e+00, i32 %sel
33  store <4 x float> %v, ptr addrspace(1) %out
34  ret void
35}
36
; Insert the constant 1.0 into an undef <4 x float> at the runtime index %sel
; and store the result to %out.
; Expected code: no compare or select on the index at all; v0 is set to 1.0
; and copied into v1..v3, so all four stored lanes hold 1.0. The only
; kernel-argument load is a single s_load_dwordx2.
37define amdgpu_kernel void @float4_inselt_undef(ptr addrspace(1) %out, i32 %sel) {
38; GCN-LABEL: float4_inselt_undef:
39; GCN:       ; %bb.0: ; %entry
40; GCN-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
41; GCN-NEXT:    v_mov_b32_e32 v0, 1.0
42; GCN-NEXT:    v_mov_b32_e32 v1, v0
43; GCN-NEXT:    v_mov_b32_e32 v2, v0
44; GCN-NEXT:    v_mov_b32_e32 v3, v0
45; GCN-NEXT:    s_waitcnt lgkmcnt(0)
46; GCN-NEXT:    v_mov_b32_e32 v5, s1
47; GCN-NEXT:    v_mov_b32_e32 v4, s0
48; GCN-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
49; GCN-NEXT:    s_endpgm
50entry:
51  %v = insertelement <4 x float> undef, float 1.000000e+00, i32 %sel
52  store <4 x float> %v, ptr addrspace(1) %out
53  ret void
54}
55
; Insert the constant i32 1 into <4 x i32> %vec at the runtime index %sel and
; store the result to %out.
; Expected code: the selection stays on the scalar unit, with one
; s_cmp_lg_u32 / s_cselect_b32 pair per element (indices 3 down to 0); the
; results are then copied to VGPRs for a single flat_store_dwordx4.
56define amdgpu_kernel void @int4_inselt(ptr addrspace(1) %out, <4 x i32> %vec, i32 %sel) {
57; GCN-LABEL: int4_inselt:
58; GCN:       ; %bb.0: ; %entry
59; GCN-NEXT:    s_load_dword s6, s[4:5], 0x44
60; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x34
61; GCN-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x24
62; GCN-NEXT:    s_waitcnt lgkmcnt(0)
63; GCN-NEXT:    s_cmp_lg_u32 s6, 3
64; GCN-NEXT:    s_cselect_b32 s3, s3, 1
65; GCN-NEXT:    s_cmp_lg_u32 s6, 2
66; GCN-NEXT:    s_cselect_b32 s2, s2, 1
67; GCN-NEXT:    s_cmp_lg_u32 s6, 1
68; GCN-NEXT:    s_cselect_b32 s1, s1, 1
69; GCN-NEXT:    s_cmp_lg_u32 s6, 0
70; GCN-NEXT:    s_cselect_b32 s0, s0, 1
71; GCN-NEXT:    v_mov_b32_e32 v4, s4
72; GCN-NEXT:    v_mov_b32_e32 v0, s0
73; GCN-NEXT:    v_mov_b32_e32 v1, s1
74; GCN-NEXT:    v_mov_b32_e32 v2, s2
75; GCN-NEXT:    v_mov_b32_e32 v3, s3
76; GCN-NEXT:    v_mov_b32_e32 v5, s5
77; GCN-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
78; GCN-NEXT:    s_endpgm
79entry:
80  %v = insertelement <4 x i32> %vec, i32 1, i32 %sel
81  store <4 x i32> %v, ptr addrspace(1) %out
82  ret void
83}
84
; Insert the constant 1.0 into <2 x float> %vec at the runtime index %sel and
; store the result to %out.
; Expected code: two s_cmp_lg_u32 / v_cndmask_b32 pairs (indices 1 and 0)
; followed by a flat_store_dwordx2.
85define amdgpu_kernel void @float2_inselt(ptr addrspace(1) %out, <2 x float> %vec, i32 %sel) {
86; GCN-LABEL: float2_inselt:
87; GCN:       ; %bb.0: ; %entry
88; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
89; GCN-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x24
90; GCN-NEXT:    s_waitcnt lgkmcnt(0)
91; GCN-NEXT:    s_cmp_lg_u32 s2, 1
92; GCN-NEXT:    v_mov_b32_e32 v0, s1
93; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
94; GCN-NEXT:    s_cmp_lg_u32 s2, 0
95; GCN-NEXT:    v_cndmask_b32_e32 v1, 1.0, v0, vcc
96; GCN-NEXT:    v_mov_b32_e32 v0, s0
97; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
98; GCN-NEXT:    v_mov_b32_e32 v2, s4
99; GCN-NEXT:    v_cndmask_b32_e32 v0, 1.0, v0, vcc
100; GCN-NEXT:    v_mov_b32_e32 v3, s5
101; GCN-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
102; GCN-NEXT:    s_endpgm
103entry:
104  %v = insertelement <2 x float> %vec, float 1.000000e+00, i32 %sel
105  store <2 x float> %v, ptr addrspace(1) %out
106  ret void
107}
108
; Insert the constant 1.0 into <8 x float> %vec at the runtime index %sel and
; store the result to %out.
; Expected code: no per-element compares. The vector is copied into v0..v7,
; the index is moved into m0, and a single v_movreld_b32 writes 1.0 relative
; to v0. The result is written with two flat_store_dwordx4 instructions.
109define amdgpu_kernel void @float8_inselt(ptr addrspace(1) %out, <8 x float> %vec, i32 %sel) {
110; GCN-LABEL: float8_inselt:
111; GCN:       ; %bb.0: ; %entry
112; GCN-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x44
113; GCN-NEXT:    s_load_dword s2, s[4:5], 0x64
114; GCN-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
115; GCN-NEXT:    s_waitcnt lgkmcnt(0)
116; GCN-NEXT:    v_mov_b32_e32 v0, s8
117; GCN-NEXT:    s_mov_b32 m0, s2
118; GCN-NEXT:    s_add_u32 s2, s0, 16
119; GCN-NEXT:    s_addc_u32 s3, s1, 0
120; GCN-NEXT:    v_mov_b32_e32 v1, s9
121; GCN-NEXT:    v_mov_b32_e32 v2, s10
122; GCN-NEXT:    v_mov_b32_e32 v3, s11
123; GCN-NEXT:    v_mov_b32_e32 v4, s12
124; GCN-NEXT:    v_mov_b32_e32 v5, s13
125; GCN-NEXT:    v_mov_b32_e32 v6, s14
126; GCN-NEXT:    v_mov_b32_e32 v7, s15
127; GCN-NEXT:    v_mov_b32_e32 v9, s3
128; GCN-NEXT:    v_movreld_b32_e32 v0, 1.0
129; GCN-NEXT:    v_mov_b32_e32 v8, s2
130; GCN-NEXT:    flat_store_dwordx4 v[8:9], v[4:7]
131; GCN-NEXT:    s_nop 0
132; GCN-NEXT:    v_mov_b32_e32 v5, s1
133; GCN-NEXT:    v_mov_b32_e32 v4, s0
134; GCN-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
135; GCN-NEXT:    s_endpgm
136entry:
137  %v = insertelement <8 x float> %vec, float 1.000000e+00, i32 %sel
138  store <8 x float> %v, ptr addrspace(1) %out
139  ret void
140}
141
; Insert the constant 1.0 into <16 x float> %vec at the runtime index %sel and
; store the result to %out.
; Expected code: the vector is copied into v0..v15, the index is moved into
; m0, and a single v_movreld_b32 writes 1.0 relative to v0. The result is
; written with four flat_store_dwordx4 instructions (offsets 48, 32, 16, 0).
142define amdgpu_kernel void @float16_inselt(ptr addrspace(1) %out, <16 x float> %vec, i32 %sel) {
143; GCN-LABEL: float16_inselt:
144; GCN:       ; %bb.0: ; %entry
145; GCN-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0x64
146; GCN-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
147; GCN-NEXT:    s_load_dword s4, s[4:5], 0xa4
148; GCN-NEXT:    s_waitcnt lgkmcnt(0)
149; GCN-NEXT:    v_mov_b32_e32 v0, s8
150; GCN-NEXT:    s_add_u32 s2, s0, 48
151; GCN-NEXT:    s_addc_u32 s3, s1, 0
152; GCN-NEXT:    v_mov_b32_e32 v17, s3
153; GCN-NEXT:    v_mov_b32_e32 v1, s9
154; GCN-NEXT:    v_mov_b32_e32 v2, s10
155; GCN-NEXT:    v_mov_b32_e32 v3, s11
156; GCN-NEXT:    v_mov_b32_e32 v4, s12
157; GCN-NEXT:    v_mov_b32_e32 v5, s13
158; GCN-NEXT:    v_mov_b32_e32 v6, s14
159; GCN-NEXT:    v_mov_b32_e32 v7, s15
160; GCN-NEXT:    v_mov_b32_e32 v8, s16
161; GCN-NEXT:    v_mov_b32_e32 v9, s17
162; GCN-NEXT:    v_mov_b32_e32 v10, s18
163; GCN-NEXT:    v_mov_b32_e32 v11, s19
164; GCN-NEXT:    v_mov_b32_e32 v12, s20
165; GCN-NEXT:    v_mov_b32_e32 v13, s21
166; GCN-NEXT:    v_mov_b32_e32 v14, s22
167; GCN-NEXT:    v_mov_b32_e32 v15, s23
168; GCN-NEXT:    s_mov_b32 m0, s4
169; GCN-NEXT:    v_mov_b32_e32 v16, s2
170; GCN-NEXT:    s_add_u32 s2, s0, 32
171; GCN-NEXT:    v_movreld_b32_e32 v0, 1.0
172; GCN-NEXT:    s_addc_u32 s3, s1, 0
173; GCN-NEXT:    flat_store_dwordx4 v[16:17], v[12:15]
174; GCN-NEXT:    s_nop 0
175; GCN-NEXT:    v_mov_b32_e32 v13, s3
176; GCN-NEXT:    v_mov_b32_e32 v12, s2
177; GCN-NEXT:    s_add_u32 s2, s0, 16
178; GCN-NEXT:    s_addc_u32 s3, s1, 0
179; GCN-NEXT:    flat_store_dwordx4 v[12:13], v[8:11]
180; GCN-NEXT:    s_nop 0
181; GCN-NEXT:    v_mov_b32_e32 v9, s3
182; GCN-NEXT:    v_mov_b32_e32 v8, s2
183; GCN-NEXT:    flat_store_dwordx4 v[8:9], v[4:7]
184; GCN-NEXT:    s_nop 0
185; GCN-NEXT:    v_mov_b32_e32 v5, s1
186; GCN-NEXT:    v_mov_b32_e32 v4, s0
187; GCN-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
188; GCN-NEXT:    s_endpgm
189entry:
190  %v = insertelement <16 x float> %vec, float 1.000000e+00, i32 %sel
191  store <16 x float> %v, ptr addrspace(1) %out
192  ret void
193}
194
; Insert the constant 1.0 into <32 x float> %vec at the runtime index %sel and
; store the result to %out.
; Expected code: the vector is loaded with two s_load_dwordx16 and copied into
; v0..v31, the index is moved into m0, and a single v_movreld_b32 writes 1.0
; relative to v0. The result is written with eight flat_store_dwordx4
; instructions, from offset 0x70 down to 0.
195define amdgpu_kernel void @float32_inselt(ptr addrspace(1) %out, <32 x float> %vec, i32 %sel) {
196; GCN-LABEL: float32_inselt:
197; GCN:       ; %bb.0: ; %entry
198; GCN-NEXT:    s_load_dword s2, s[4:5], 0x124
199; GCN-NEXT:    s_load_dwordx16 s[36:51], s[4:5], 0xa4
200; GCN-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
201; GCN-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0xe4
202; GCN-NEXT:    s_waitcnt lgkmcnt(0)
203; GCN-NEXT:    v_mov_b32_e32 v0, s36
204; GCN-NEXT:    s_mov_b32 m0, s2
205; GCN-NEXT:    s_add_u32 s2, s0, 0x70
206; GCN-NEXT:    s_addc_u32 s3, s1, 0
207; GCN-NEXT:    v_mov_b32_e32 v33, s3
208; GCN-NEXT:    v_mov_b32_e32 v1, s37
209; GCN-NEXT:    v_mov_b32_e32 v2, s38
210; GCN-NEXT:    v_mov_b32_e32 v3, s39
211; GCN-NEXT:    v_mov_b32_e32 v4, s40
212; GCN-NEXT:    v_mov_b32_e32 v5, s41
213; GCN-NEXT:    v_mov_b32_e32 v6, s42
214; GCN-NEXT:    v_mov_b32_e32 v7, s43
215; GCN-NEXT:    v_mov_b32_e32 v8, s44
216; GCN-NEXT:    v_mov_b32_e32 v9, s45
217; GCN-NEXT:    v_mov_b32_e32 v10, s46
218; GCN-NEXT:    v_mov_b32_e32 v11, s47
219; GCN-NEXT:    v_mov_b32_e32 v12, s48
220; GCN-NEXT:    v_mov_b32_e32 v13, s49
221; GCN-NEXT:    v_mov_b32_e32 v14, s50
222; GCN-NEXT:    v_mov_b32_e32 v15, s51
223; GCN-NEXT:    v_mov_b32_e32 v16, s8
224; GCN-NEXT:    v_mov_b32_e32 v17, s9
225; GCN-NEXT:    v_mov_b32_e32 v18, s10
226; GCN-NEXT:    v_mov_b32_e32 v19, s11
227; GCN-NEXT:    v_mov_b32_e32 v20, s12
228; GCN-NEXT:    v_mov_b32_e32 v21, s13
229; GCN-NEXT:    v_mov_b32_e32 v22, s14
230; GCN-NEXT:    v_mov_b32_e32 v23, s15
231; GCN-NEXT:    v_mov_b32_e32 v24, s16
232; GCN-NEXT:    v_mov_b32_e32 v25, s17
233; GCN-NEXT:    v_mov_b32_e32 v26, s18
234; GCN-NEXT:    v_mov_b32_e32 v27, s19
235; GCN-NEXT:    v_mov_b32_e32 v28, s20
236; GCN-NEXT:    v_mov_b32_e32 v29, s21
237; GCN-NEXT:    v_mov_b32_e32 v30, s22
238; GCN-NEXT:    v_mov_b32_e32 v31, s23
239; GCN-NEXT:    v_mov_b32_e32 v32, s2
240; GCN-NEXT:    s_add_u32 s2, s0, 0x60
241; GCN-NEXT:    v_movreld_b32_e32 v0, 1.0
242; GCN-NEXT:    s_addc_u32 s3, s1, 0
243; GCN-NEXT:    flat_store_dwordx4 v[32:33], v[28:31]
244; GCN-NEXT:    s_nop 0
245; GCN-NEXT:    v_mov_b32_e32 v29, s3
246; GCN-NEXT:    v_mov_b32_e32 v28, s2
247; GCN-NEXT:    s_add_u32 s2, s0, 0x50
248; GCN-NEXT:    s_addc_u32 s3, s1, 0
249; GCN-NEXT:    flat_store_dwordx4 v[28:29], v[24:27]
250; GCN-NEXT:    s_nop 0
251; GCN-NEXT:    v_mov_b32_e32 v25, s3
252; GCN-NEXT:    v_mov_b32_e32 v24, s2
253; GCN-NEXT:    s_add_u32 s2, s0, 64
254; GCN-NEXT:    s_addc_u32 s3, s1, 0
255; GCN-NEXT:    flat_store_dwordx4 v[24:25], v[20:23]
256; GCN-NEXT:    s_nop 0
257; GCN-NEXT:    v_mov_b32_e32 v21, s3
258; GCN-NEXT:    v_mov_b32_e32 v20, s2
259; GCN-NEXT:    s_add_u32 s2, s0, 48
260; GCN-NEXT:    s_addc_u32 s3, s1, 0
261; GCN-NEXT:    flat_store_dwordx4 v[20:21], v[16:19]
262; GCN-NEXT:    s_nop 0
263; GCN-NEXT:    v_mov_b32_e32 v17, s3
264; GCN-NEXT:    v_mov_b32_e32 v16, s2
265; GCN-NEXT:    s_add_u32 s2, s0, 32
266; GCN-NEXT:    s_addc_u32 s3, s1, 0
267; GCN-NEXT:    flat_store_dwordx4 v[16:17], v[12:15]
268; GCN-NEXT:    s_nop 0
269; GCN-NEXT:    v_mov_b32_e32 v13, s3
270; GCN-NEXT:    v_mov_b32_e32 v12, s2
271; GCN-NEXT:    s_add_u32 s2, s0, 16
272; GCN-NEXT:    s_addc_u32 s3, s1, 0
273; GCN-NEXT:    flat_store_dwordx4 v[12:13], v[8:11]
274; GCN-NEXT:    s_nop 0
275; GCN-NEXT:    v_mov_b32_e32 v9, s3
276; GCN-NEXT:    v_mov_b32_e32 v8, s2
277; GCN-NEXT:    flat_store_dwordx4 v[8:9], v[4:7]
278; GCN-NEXT:    s_nop 0
279; GCN-NEXT:    v_mov_b32_e32 v5, s1
280; GCN-NEXT:    v_mov_b32_e32 v4, s0
281; GCN-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
282; GCN-NEXT:    s_endpgm
283entry:
284  %v = insertelement <32 x float> %vec, float 1.000000e+00, i32 %sel
285  store <32 x float> %v, ptr addrspace(1) %out
286  ret void
287}
288
; Insert the constant half 1.0 into <4 x half> %vec at the runtime index %sel
; and store the result to %out.
; Expected code: a scalar bit-mask sequence on the 64-bit packed vector. The
; index is shifted left by 4 (index * 16 bits), 0xffff is shifted by that
; amount into a 64-bit lane mask, s_andn2_b64 clears the selected lane, and
; the mask ANDed with the splat 0x3c003c00 (half 1.0 is 0x3c00) is ORed in.
289define amdgpu_kernel void @half4_inselt(ptr addrspace(1) %out, <4 x half> %vec, i32 %sel) {
290; GCN-LABEL: half4_inselt:
291; GCN:       ; %bb.0: ; %entry
292; GCN-NEXT:    s_load_dword s6, s[4:5], 0x34
293; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
294; GCN-NEXT:    s_mov_b32 s4, 0x3c003c00
295; GCN-NEXT:    s_mov_b32 s5, s4
296; GCN-NEXT:    s_waitcnt lgkmcnt(0)
297; GCN-NEXT:    s_lshl_b32 s6, s6, 4
298; GCN-NEXT:    s_lshl_b64 s[6:7], 0xffff, s6
299; GCN-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[6:7]
300; GCN-NEXT:    s_and_b64 s[4:5], s[6:7], s[4:5]
301; GCN-NEXT:    s_or_b64 s[2:3], s[4:5], s[2:3]
302; GCN-NEXT:    v_mov_b32_e32 v0, s0
303; GCN-NEXT:    v_mov_b32_e32 v2, s2
304; GCN-NEXT:    v_mov_b32_e32 v1, s1
305; GCN-NEXT:    v_mov_b32_e32 v3, s3
306; GCN-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
307; GCN-NEXT:    s_endpgm
308entry:
309  %v = insertelement <4 x half> %vec, half 1.000000e+00, i32 %sel
310  store <4 x half> %v, ptr addrspace(1) %out
311  ret void
312}
313
; Insert the constant half 1.0 into <2 x half> %vec at the runtime index %sel
; and store the result to %out.
; Expected code: a 32-bit scalar bit-mask sequence. The index is shifted left
; by 4 (index * 16 bits), 0xffff is shifted by that amount into a lane mask,
; s_andn2_b32 clears the selected lane, and the mask ANDed with 0x3c003c00
; (half 1.0 is 0x3c00) is ORed in. The result is stored as one dword.
314define amdgpu_kernel void @half2_inselt(ptr addrspace(1) %out, <2 x half> %vec, i32 %sel) {
315; GCN-LABEL: half2_inselt:
316; GCN:       ; %bb.0: ; %entry
317; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
318; GCN-NEXT:    s_waitcnt lgkmcnt(0)
319; GCN-NEXT:    s_lshl_b32 s3, s3, 4
320; GCN-NEXT:    s_lshl_b32 s3, 0xffff, s3
321; GCN-NEXT:    s_andn2_b32 s2, s2, s3
322; GCN-NEXT:    s_and_b32 s3, s3, 0x3c003c00
323; GCN-NEXT:    s_or_b32 s2, s3, s2
324; GCN-NEXT:    v_mov_b32_e32 v0, s0
325; GCN-NEXT:    v_mov_b32_e32 v1, s1
326; GCN-NEXT:    v_mov_b32_e32 v2, s2
327; GCN-NEXT:    flat_store_dword v[0:1], v2
328; GCN-NEXT:    s_endpgm
329entry:
330  %v = insertelement <2 x half> %vec, half 1.000000e+00, i32 %sel
331  store <2 x half> %v, ptr addrspace(1) %out
332  ret void
333}
334
; Insert the constant half 1.0 into <8 x half> %vec at the runtime index %sel
; and store the result to %out.
; Expected code: one s_cmp_lg_u32 / v_cndmask_b32 pair per element (indices 7
; down to 0), selecting between 0x3c00 (held in v0) and the original element.
; High halves are extracted with s_lshr_b32 by 16, and each pair of results is
; repacked into a dword with v_lshlrev_b32 by 16 plus v_or_b32_sdwa.
335define amdgpu_kernel void @half8_inselt(ptr addrspace(1) %out, <8 x half> %vec, i32 %sel) {
336; GCN-LABEL: half8_inselt:
337; GCN:       ; %bb.0: ; %entry
338; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x34
339; GCN-NEXT:    s_load_dword s6, s[4:5], 0x44
340; GCN-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x24
341; GCN-NEXT:    v_mov_b32_e32 v0, 0x3c00
342; GCN-NEXT:    s_waitcnt lgkmcnt(0)
343; GCN-NEXT:    s_lshr_b32 s7, s3, 16
344; GCN-NEXT:    s_cmp_lg_u32 s6, 7
345; GCN-NEXT:    v_mov_b32_e32 v1, s7
346; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
347; GCN-NEXT:    s_cmp_lg_u32 s6, 6
348; GCN-NEXT:    v_cndmask_b32_e32 v1, v0, v1, vcc
349; GCN-NEXT:    v_mov_b32_e32 v2, s3
350; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
351; GCN-NEXT:    s_lshr_b32 s3, s2, 16
352; GCN-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
353; GCN-NEXT:    v_cndmask_b32_e32 v2, v0, v2, vcc
354; GCN-NEXT:    s_cmp_lg_u32 s6, 5
355; GCN-NEXT:    v_or_b32_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
356; GCN-NEXT:    v_mov_b32_e32 v1, s3
357; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
358; GCN-NEXT:    s_cmp_lg_u32 s6, 4
359; GCN-NEXT:    v_cndmask_b32_e32 v1, v0, v1, vcc
360; GCN-NEXT:    v_mov_b32_e32 v2, s2
361; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
362; GCN-NEXT:    s_lshr_b32 s2, s1, 16
363; GCN-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
364; GCN-NEXT:    v_cndmask_b32_e32 v2, v0, v2, vcc
365; GCN-NEXT:    s_cmp_lg_u32 s6, 3
366; GCN-NEXT:    v_or_b32_sdwa v2, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
367; GCN-NEXT:    v_mov_b32_e32 v1, s2
368; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
369; GCN-NEXT:    s_cmp_lg_u32 s6, 2
370; GCN-NEXT:    v_cndmask_b32_e32 v1, v0, v1, vcc
371; GCN-NEXT:    v_mov_b32_e32 v4, s1
372; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
373; GCN-NEXT:    s_lshr_b32 s1, s0, 16
374; GCN-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
375; GCN-NEXT:    v_cndmask_b32_e32 v4, v0, v4, vcc
376; GCN-NEXT:    s_cmp_lg_u32 s6, 1
377; GCN-NEXT:    v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
378; GCN-NEXT:    v_mov_b32_e32 v4, s1
379; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
380; GCN-NEXT:    s_cmp_lg_u32 s6, 0
381; GCN-NEXT:    v_cndmask_b32_e32 v4, v0, v4, vcc
382; GCN-NEXT:    v_mov_b32_e32 v5, s0
383; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
384; GCN-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
385; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc
386; GCN-NEXT:    v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
387; GCN-NEXT:    v_mov_b32_e32 v4, s4
388; GCN-NEXT:    v_mov_b32_e32 v5, s5
389; GCN-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
390; GCN-NEXT:    s_endpgm
391entry:
392  %v = insertelement <8 x half> %vec, half 1.000000e+00, i32 %sel
393  store <8 x half> %v, ptr addrspace(1) %out
394  ret void
395}
396
; Insert the constant i16 1 into <2 x i16> %vec at the runtime index %sel and
; store the result to %out.
; Expected code: a 32-bit scalar bit-mask sequence. The index is shifted left
; by 4 (index * 16 bits), 0xffff is shifted by that amount into a lane mask,
; s_andn2_b32 clears the selected lane, and the mask ANDed with 0x10001 (the
; value 1 in both 16-bit lanes) is ORed in.
397define amdgpu_kernel void @short2_inselt(ptr addrspace(1) %out, <2 x i16> %vec, i32 %sel) {
398; GCN-LABEL: short2_inselt:
399; GCN:       ; %bb.0: ; %entry
400; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
401; GCN-NEXT:    s_waitcnt lgkmcnt(0)
402; GCN-NEXT:    s_lshl_b32 s3, s3, 4
403; GCN-NEXT:    s_lshl_b32 s3, 0xffff, s3
404; GCN-NEXT:    s_andn2_b32 s2, s2, s3
405; GCN-NEXT:    s_and_b32 s3, s3, 0x10001
406; GCN-NEXT:    s_or_b32 s2, s3, s2
407; GCN-NEXT:    v_mov_b32_e32 v0, s0
408; GCN-NEXT:    v_mov_b32_e32 v1, s1
409; GCN-NEXT:    v_mov_b32_e32 v2, s2
410; GCN-NEXT:    flat_store_dword v[0:1], v2
411; GCN-NEXT:    s_endpgm
412entry:
413  %v = insertelement <2 x i16> %vec, i16 1, i32 %sel
414  store <2 x i16> %v, ptr addrspace(1) %out
415  ret void
416}
417
; Insert the constant i16 1 into <4 x i16> %vec at the runtime index %sel and
; store the result to %out.
; Expected code: a 64-bit scalar bit-mask sequence. The index is shifted left
; by 4 (index * 16 bits), 0xffff is shifted by that amount into a 64-bit lane
; mask, s_andn2_b64 clears the selected lane, and the mask ANDed with the
; splat 0x10001 in both dwords is ORed in.
418define amdgpu_kernel void @short4_inselt(ptr addrspace(1) %out, <4 x i16> %vec, i32 %sel) {
419; GCN-LABEL: short4_inselt:
420; GCN:       ; %bb.0: ; %entry
421; GCN-NEXT:    s_load_dword s6, s[4:5], 0x34
422; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
423; GCN-NEXT:    s_mov_b32 s4, 0x10001
424; GCN-NEXT:    s_mov_b32 s5, s4
425; GCN-NEXT:    s_waitcnt lgkmcnt(0)
426; GCN-NEXT:    s_lshl_b32 s6, s6, 4
427; GCN-NEXT:    s_lshl_b64 s[6:7], 0xffff, s6
428; GCN-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[6:7]
429; GCN-NEXT:    s_and_b64 s[4:5], s[6:7], s[4:5]
430; GCN-NEXT:    s_or_b64 s[2:3], s[4:5], s[2:3]
431; GCN-NEXT:    v_mov_b32_e32 v0, s0
432; GCN-NEXT:    v_mov_b32_e32 v2, s2
433; GCN-NEXT:    v_mov_b32_e32 v1, s1
434; GCN-NEXT:    v_mov_b32_e32 v3, s3
435; GCN-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
436; GCN-NEXT:    s_endpgm
437entry:
438  %v = insertelement <4 x i16> %vec, i16 1, i32 %sel
439  store <4 x i16> %v, ptr addrspace(1) %out
440  ret void
441}
442
; Insert the constant i8 1 into <8 x i8> %vec at the runtime index %sel and
; store the result to %out.
; Expected code: a 64-bit scalar bit-mask sequence. The index is shifted left
; by 3 (index * 8 bits), 0xff is shifted by that amount into a 64-bit lane
; mask, each half of the mask is ANDed with 0x1010101 (the value 1 in every
; byte), s_andn2_b64 clears the selected byte, and the two are ORed together.
443define amdgpu_kernel void @byte8_inselt(ptr addrspace(1) %out, <8 x i8> %vec, i32 %sel) {
444; GCN-LABEL: byte8_inselt:
445; GCN:       ; %bb.0: ; %entry
446; GCN-NEXT:    s_load_dword s6, s[4:5], 0x34
447; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
448; GCN-NEXT:    s_waitcnt lgkmcnt(0)
449; GCN-NEXT:    s_lshl_b32 s4, s6, 3
450; GCN-NEXT:    s_lshl_b64 s[4:5], 0xff, s4
451; GCN-NEXT:    s_and_b32 s7, s5, 0x1010101
452; GCN-NEXT:    s_and_b32 s6, s4, 0x1010101
453; GCN-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[4:5]
454; GCN-NEXT:    s_or_b64 s[2:3], s[6:7], s[2:3]
455; GCN-NEXT:    v_mov_b32_e32 v0, s0
456; GCN-NEXT:    v_mov_b32_e32 v2, s2
457; GCN-NEXT:    v_mov_b32_e32 v1, s1
458; GCN-NEXT:    v_mov_b32_e32 v3, s3
459; GCN-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
460; GCN-NEXT:    s_endpgm
461entry:
462  %v = insertelement <8 x i8> %vec, i8 1, i32 %sel
463  store <8 x i8> %v, ptr addrspace(1) %out
464  ret void
465}
466
; Insert the constant i8 1 into <16 x i8> %vec at the runtime index %sel and
; store the result to %out.
; Expected code: no 128-bit mask. Each byte gets its own s_cmp_lg_u32 /
; s_cselect_b32 pair (indices 15 down to 0), and every dword is rebuilt from
; its four selected bytes with s_lshr/s_lshl shifts, s_and masks (0xff,
; 0xffff) and s_or. The four dwords are written with one flat_store_dwordx4.
467define amdgpu_kernel void @byte16_inselt(ptr addrspace(1) %out, <16 x i8> %vec, i32 %sel) {
468; GCN-LABEL: byte16_inselt:
469; GCN:       ; %bb.0: ; %entry
470; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x34
471; GCN-NEXT:    s_load_dword s6, s[4:5], 0x44
472; GCN-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x24
473; GCN-NEXT:    s_waitcnt lgkmcnt(0)
474; GCN-NEXT:    s_lshr_b32 s7, s3, 24
475; GCN-NEXT:    s_cmp_lg_u32 s6, 15
476; GCN-NEXT:    s_cselect_b32 s7, s7, 1
477; GCN-NEXT:    s_lshr_b32 s8, s3, 16
478; GCN-NEXT:    s_lshl_b32 s7, s7, 8
479; GCN-NEXT:    s_cmp_lg_u32 s6, 14
480; GCN-NEXT:    s_cselect_b32 s8, s8, 1
481; GCN-NEXT:    s_and_b32 s8, s8, 0xff
482; GCN-NEXT:    s_or_b32 s7, s8, s7
483; GCN-NEXT:    s_lshr_b32 s9, s3, 8
484; GCN-NEXT:    s_lshl_b32 s7, s7, 16
485; GCN-NEXT:    s_cmp_lg_u32 s6, 13
486; GCN-NEXT:    s_cselect_b32 s8, s9, 1
487; GCN-NEXT:    s_lshl_b32 s8, s8, 8
488; GCN-NEXT:    s_cmp_lg_u32 s6, 12
489; GCN-NEXT:    s_cselect_b32 s3, s3, 1
490; GCN-NEXT:    s_and_b32 s3, s3, 0xff
491; GCN-NEXT:    s_or_b32 s3, s3, s8
492; GCN-NEXT:    s_and_b32 s3, s3, 0xffff
493; GCN-NEXT:    s_or_b32 s3, s3, s7
494; GCN-NEXT:    s_lshr_b32 s7, s2, 24
495; GCN-NEXT:    s_cmp_lg_u32 s6, 11
496; GCN-NEXT:    s_cselect_b32 s7, s7, 1
497; GCN-NEXT:    s_lshl_b32 s7, s7, 8
498; GCN-NEXT:    s_lshr_b32 s8, s2, 16
499; GCN-NEXT:    s_cmp_lg_u32 s6, 10
500; GCN-NEXT:    s_cselect_b32 s8, s8, 1
501; GCN-NEXT:    s_and_b32 s8, s8, 0xff
502; GCN-NEXT:    s_or_b32 s7, s8, s7
503; GCN-NEXT:    s_lshl_b32 s7, s7, 16
504; GCN-NEXT:    s_lshr_b32 s8, s2, 8
505; GCN-NEXT:    s_cmp_lg_u32 s6, 9
506; GCN-NEXT:    s_cselect_b32 s8, s8, 1
507; GCN-NEXT:    s_lshl_b32 s8, s8, 8
508; GCN-NEXT:    s_cmp_lg_u32 s6, 8
509; GCN-NEXT:    s_cselect_b32 s2, s2, 1
510; GCN-NEXT:    s_and_b32 s2, s2, 0xff
511; GCN-NEXT:    s_or_b32 s2, s2, s8
512; GCN-NEXT:    s_and_b32 s2, s2, 0xffff
513; GCN-NEXT:    s_or_b32 s2, s2, s7
514; GCN-NEXT:    s_lshr_b32 s7, s1, 24
515; GCN-NEXT:    s_cmp_lg_u32 s6, 7
516; GCN-NEXT:    s_cselect_b32 s7, s7, 1
517; GCN-NEXT:    s_lshl_b32 s7, s7, 8
518; GCN-NEXT:    s_lshr_b32 s8, s1, 16
519; GCN-NEXT:    s_cmp_lg_u32 s6, 6
520; GCN-NEXT:    s_cselect_b32 s8, s8, 1
521; GCN-NEXT:    s_and_b32 s8, s8, 0xff
522; GCN-NEXT:    s_or_b32 s7, s8, s7
523; GCN-NEXT:    s_lshl_b32 s7, s7, 16
524; GCN-NEXT:    s_lshr_b32 s8, s1, 8
525; GCN-NEXT:    s_cmp_lg_u32 s6, 5
526; GCN-NEXT:    s_cselect_b32 s8, s8, 1
527; GCN-NEXT:    s_lshl_b32 s8, s8, 8
528; GCN-NEXT:    s_cmp_lg_u32 s6, 4
529; GCN-NEXT:    s_cselect_b32 s1, s1, 1
530; GCN-NEXT:    s_and_b32 s1, s1, 0xff
531; GCN-NEXT:    s_or_b32 s1, s1, s8
532; GCN-NEXT:    s_and_b32 s1, s1, 0xffff
533; GCN-NEXT:    s_or_b32 s1, s1, s7
534; GCN-NEXT:    s_lshr_b32 s7, s0, 24
535; GCN-NEXT:    s_cmp_lg_u32 s6, 3
536; GCN-NEXT:    s_cselect_b32 s7, s7, 1
537; GCN-NEXT:    s_lshl_b32 s7, s7, 8
538; GCN-NEXT:    s_lshr_b32 s8, s0, 16
539; GCN-NEXT:    s_cmp_lg_u32 s6, 2
540; GCN-NEXT:    s_cselect_b32 s8, s8, 1
541; GCN-NEXT:    s_and_b32 s8, s8, 0xff
542; GCN-NEXT:    s_or_b32 s7, s8, s7
543; GCN-NEXT:    s_lshl_b32 s7, s7, 16
544; GCN-NEXT:    s_lshr_b32 s8, s0, 8
545; GCN-NEXT:    s_cmp_lg_u32 s6, 1
546; GCN-NEXT:    s_cselect_b32 s8, s8, 1
547; GCN-NEXT:    s_lshl_b32 s8, s8, 8
548; GCN-NEXT:    s_cmp_lg_u32 s6, 0
549; GCN-NEXT:    s_cselect_b32 s0, s0, 1
550; GCN-NEXT:    s_and_b32 s0, s0, 0xff
551; GCN-NEXT:    s_or_b32 s0, s0, s8
552; GCN-NEXT:    s_and_b32 s0, s0, 0xffff
553; GCN-NEXT:    s_or_b32 s0, s0, s7
554; GCN-NEXT:    v_mov_b32_e32 v4, s4
555; GCN-NEXT:    v_mov_b32_e32 v0, s0
556; GCN-NEXT:    v_mov_b32_e32 v1, s1
557; GCN-NEXT:    v_mov_b32_e32 v2, s2
558; GCN-NEXT:    v_mov_b32_e32 v3, s3
559; GCN-NEXT:    v_mov_b32_e32 v5, s5
560; GCN-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
561; GCN-NEXT:    s_endpgm
562entry:
563  %v = insertelement <16 x i8> %vec, i8 1, i32 %sel
564  store <16 x i8> %v, ptr addrspace(1) %out
565  ret void
566}
567
; Insert the constant double 1.0 into <2 x double> %vec at the runtime index
; %sel and store the result to %out.
; Expected code: one s_cmp_eq_u32 per element (indices 1 and 0), each followed
; by two s_cselect_b32 that pick the high dword 0x3ff00000 and the low dword 0
; (together the bit pattern of double 1.0) or the original dwords.
568define amdgpu_kernel void @double2_inselt(ptr addrspace(1) %out, <2 x double> %vec, i32 %sel) {
569; GCN-LABEL: double2_inselt:
570; GCN:       ; %bb.0: ; %entry
571; GCN-NEXT:    s_load_dword s6, s[4:5], 0x44
572; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x34
573; GCN-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x24
574; GCN-NEXT:    s_waitcnt lgkmcnt(0)
575; GCN-NEXT:    s_cmp_eq_u32 s6, 1
576; GCN-NEXT:    s_cselect_b32 s3, 0x3ff00000, s3
577; GCN-NEXT:    s_cselect_b32 s2, 0, s2
578; GCN-NEXT:    s_cmp_eq_u32 s6, 0
579; GCN-NEXT:    s_cselect_b32 s1, 0x3ff00000, s1
580; GCN-NEXT:    s_cselect_b32 s0, 0, s0
581; GCN-NEXT:    v_mov_b32_e32 v4, s4
582; GCN-NEXT:    v_mov_b32_e32 v0, s0
583; GCN-NEXT:    v_mov_b32_e32 v1, s1
584; GCN-NEXT:    v_mov_b32_e32 v2, s2
585; GCN-NEXT:    v_mov_b32_e32 v3, s3
586; GCN-NEXT:    v_mov_b32_e32 v5, s5
587; GCN-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
588; GCN-NEXT:    s_endpgm
589entry:
590  %v = insertelement <2 x double> %vec, double 1.000000e+00, i32 %sel
591  store <2 x double> %v, ptr addrspace(1) %out
592  ret void
593}
594
; Insert the constant double 1.0 into the non-power-of-two vector
; <5 x double> %vec at the runtime index %sel and store the result to %out.
; Expected code: five s_cmp_eq_u32 compares (against 4, 1, 0, 3, 2), each
; followed by two s_cselect_b32 that pick the high dword 0x3ff00000 and the
; low dword 0 or the original dwords. The result is written with two
; flat_store_dwordx4 and one flat_store_dwordx2.
595define amdgpu_kernel void @double5_inselt(ptr addrspace(1) %out, <5 x double> %vec, i32 %sel) {
596; GCN-LABEL: double5_inselt:
597; GCN:       ; %bb.0: ; %entry
598; GCN-NEXT:    s_load_dword s12, s[4:5], 0xa4
599; GCN-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x84
600; GCN-NEXT:    s_load_dwordx2 s[10:11], s[4:5], 0x24
601; GCN-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x64
602; GCN-NEXT:    s_waitcnt lgkmcnt(0)
603; GCN-NEXT:    s_cmp_eq_u32 s12, 4
604; GCN-NEXT:    s_cselect_b32 s9, 0x3ff00000, s9
605; GCN-NEXT:    s_cselect_b32 s8, 0, s8
606; GCN-NEXT:    s_cmp_eq_u32 s12, 1
607; GCN-NEXT:    s_cselect_b32 s3, 0x3ff00000, s3
608; GCN-NEXT:    s_cselect_b32 s2, 0, s2
609; GCN-NEXT:    s_cmp_eq_u32 s12, 0
610; GCN-NEXT:    s_cselect_b32 s13, 0x3ff00000, s1
611; GCN-NEXT:    s_cselect_b32 s14, 0, s0
612; GCN-NEXT:    s_cmp_eq_u32 s12, 3
613; GCN-NEXT:    s_cselect_b32 s0, 0x3ff00000, s7
614; GCN-NEXT:    s_cselect_b32 s1, 0, s6
615; GCN-NEXT:    s_cmp_eq_u32 s12, 2
616; GCN-NEXT:    s_cselect_b32 s5, 0x3ff00000, s5
617; GCN-NEXT:    s_cselect_b32 s4, 0, s4
618; GCN-NEXT:    v_mov_b32_e32 v3, s0
619; GCN-NEXT:    s_add_u32 s0, s10, 16
620; GCN-NEXT:    v_mov_b32_e32 v2, s1
621; GCN-NEXT:    s_addc_u32 s1, s11, 0
622; GCN-NEXT:    v_mov_b32_e32 v5, s1
623; GCN-NEXT:    v_mov_b32_e32 v0, s4
624; GCN-NEXT:    v_mov_b32_e32 v1, s5
625; GCN-NEXT:    v_mov_b32_e32 v4, s0
626; GCN-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
627; GCN-NEXT:    v_mov_b32_e32 v4, s10
628; GCN-NEXT:    s_add_u32 s0, s10, 32
629; GCN-NEXT:    v_mov_b32_e32 v0, s14
630; GCN-NEXT:    v_mov_b32_e32 v1, s13
631; GCN-NEXT:    v_mov_b32_e32 v2, s2
632; GCN-NEXT:    v_mov_b32_e32 v3, s3
633; GCN-NEXT:    v_mov_b32_e32 v5, s11
634; GCN-NEXT:    s_addc_u32 s1, s11, 0
635; GCN-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
636; GCN-NEXT:    s_nop 0
637; GCN-NEXT:    v_mov_b32_e32 v3, s1
638; GCN-NEXT:    v_mov_b32_e32 v0, s8
639; GCN-NEXT:    v_mov_b32_e32 v1, s9
640; GCN-NEXT:    v_mov_b32_e32 v2, s0
641; GCN-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
642; GCN-NEXT:    s_endpgm
643entry:
644  %v = insertelement <5 x double> %vec, double 1.000000e+00, i32 %sel
645  store <5 x double> %v, ptr addrspace(1) %out
646  ret void
647}
648
; Insert the constant double 1.0 into <8 x double> %vec at the runtime index
; %sel and store the result to %out.
; Expected code: the vector is copied into v0..v15 and the index is doubled
; (s_lshl_b32 by 1) before being moved into m0. Two v_movreld_b32 writes then
; place the low dword 0 relative to v0 and the high dword 0x3ff00000 (held in
; v16) relative to v1. The result is written with four flat_store_dwordx4.
649define amdgpu_kernel void @double8_inselt(ptr addrspace(1) %out, <8 x double> %vec, i32 %sel) {
650; GCN-LABEL: double8_inselt:
651; GCN:       ; %bb.0: ; %entry
652; GCN-NEXT:    s_load_dword s2, s[4:5], 0xa4
653; GCN-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0x64
654; GCN-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
655; GCN-NEXT:    v_mov_b32_e32 v16, 0x3ff00000
656; GCN-NEXT:    s_waitcnt lgkmcnt(0)
657; GCN-NEXT:    s_lshl_b32 s2, s2, 1
658; GCN-NEXT:    v_mov_b32_e32 v0, s8
659; GCN-NEXT:    v_mov_b32_e32 v1, s9
660; GCN-NEXT:    v_mov_b32_e32 v2, s10
661; GCN-NEXT:    v_mov_b32_e32 v3, s11
662; GCN-NEXT:    v_mov_b32_e32 v4, s12
663; GCN-NEXT:    v_mov_b32_e32 v5, s13
664; GCN-NEXT:    v_mov_b32_e32 v6, s14
665; GCN-NEXT:    v_mov_b32_e32 v7, s15
666; GCN-NEXT:    v_mov_b32_e32 v8, s16
667; GCN-NEXT:    v_mov_b32_e32 v9, s17
668; GCN-NEXT:    v_mov_b32_e32 v10, s18
669; GCN-NEXT:    v_mov_b32_e32 v11, s19
670; GCN-NEXT:    v_mov_b32_e32 v12, s20
671; GCN-NEXT:    v_mov_b32_e32 v13, s21
672; GCN-NEXT:    v_mov_b32_e32 v14, s22
673; GCN-NEXT:    v_mov_b32_e32 v15, s23
674; GCN-NEXT:    s_mov_b32 m0, s2
675; GCN-NEXT:    s_add_u32 s2, s0, 48
676; GCN-NEXT:    v_movreld_b32_e32 v0, 0
677; GCN-NEXT:    s_addc_u32 s3, s1, 0
678; GCN-NEXT:    v_movreld_b32_e32 v1, v16
679; GCN-NEXT:    v_mov_b32_e32 v17, s3
680; GCN-NEXT:    v_mov_b32_e32 v16, s2
681; GCN-NEXT:    s_add_u32 s2, s0, 32
682; GCN-NEXT:    s_addc_u32 s3, s1, 0
683; GCN-NEXT:    flat_store_dwordx4 v[16:17], v[12:15]
684; GCN-NEXT:    s_nop 0
685; GCN-NEXT:    v_mov_b32_e32 v13, s3
686; GCN-NEXT:    v_mov_b32_e32 v12, s2
687; GCN-NEXT:    s_add_u32 s2, s0, 16
688; GCN-NEXT:    s_addc_u32 s3, s1, 0
689; GCN-NEXT:    flat_store_dwordx4 v[12:13], v[8:11]
690; GCN-NEXT:    s_nop 0
691; GCN-NEXT:    v_mov_b32_e32 v9, s3
692; GCN-NEXT:    v_mov_b32_e32 v8, s2
693; GCN-NEXT:    flat_store_dwordx4 v[8:9], v[4:7]
694; GCN-NEXT:    s_nop 0
695; GCN-NEXT:    v_mov_b32_e32 v5, s1
696; GCN-NEXT:    v_mov_b32_e32 v4, s0
697; GCN-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
698; GCN-NEXT:    s_endpgm
699entry:
700  %v = insertelement <8 x double> %vec, double 1.000000e+00, i32 %sel
701  store <8 x double> %v, ptr addrspace(1) %out
702  ret void
703}
704
; Insert the constant double 1.0 into the non-power-of-two vector
; <7 x double> %vec at the runtime index %sel and store the result to %out.
; Expected code: the vector is copied into v0..v13 and the index is doubled
; (s_lshl_b32 by 1) before being moved into m0. Two v_movreld_b32 writes then
; place the low dword 0 relative to v0 and the high dword 0x3ff00000 (held in
; v16) relative to v1. The result is written with three flat_store_dwordx4
; and one flat_store_dwordx2.
705define amdgpu_kernel void @double7_inselt(ptr addrspace(1) %out, <7 x double> %vec, i32 %sel) {
706; GCN-LABEL: double7_inselt:
707; GCN:       ; %bb.0: ; %entry
708; GCN-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x64
709; GCN-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
710; GCN-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x94
711; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x84
712; GCN-NEXT:    v_mov_b32_e32 v16, 0x3ff00000
713; GCN-NEXT:    s_waitcnt lgkmcnt(0)
714; GCN-NEXT:    v_mov_b32_e32 v0, s8
715; GCN-NEXT:    v_mov_b32_e32 v1, s9
716; GCN-NEXT:    v_mov_b32_e32 v2, s10
717; GCN-NEXT:    v_mov_b32_e32 v8, s0
718; GCN-NEXT:    s_load_dword s0, s[4:5], 0xa4
719; GCN-NEXT:    v_mov_b32_e32 v3, s11
720; GCN-NEXT:    v_mov_b32_e32 v4, s12
721; GCN-NEXT:    v_mov_b32_e32 v5, s13
722; GCN-NEXT:    v_mov_b32_e32 v6, s14
723; GCN-NEXT:    s_waitcnt lgkmcnt(0)
724; GCN-NEXT:    s_lshl_b32 s0, s0, 1
725; GCN-NEXT:    v_mov_b32_e32 v7, s15
726; GCN-NEXT:    v_mov_b32_e32 v9, s1
727; GCN-NEXT:    v_mov_b32_e32 v10, s2
728; GCN-NEXT:    v_mov_b32_e32 v11, s3
729; GCN-NEXT:    v_mov_b32_e32 v12, s16
730; GCN-NEXT:    v_mov_b32_e32 v13, s17
731; GCN-NEXT:    s_mov_b32 m0, s0
732; GCN-NEXT:    v_movreld_b32_e32 v0, 0
733; GCN-NEXT:    s_add_u32 s0, s6, 16
734; GCN-NEXT:    v_movreld_b32_e32 v1, v16
735; GCN-NEXT:    s_addc_u32 s1, s7, 0
736; GCN-NEXT:    v_mov_b32_e32 v15, s1
737; GCN-NEXT:    v_mov_b32_e32 v14, s0
738; GCN-NEXT:    flat_store_dwordx4 v[14:15], v[4:7]
739; GCN-NEXT:    s_add_u32 s0, s6, 48
740; GCN-NEXT:    v_mov_b32_e32 v4, s6
741; GCN-NEXT:    v_mov_b32_e32 v5, s7
742; GCN-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
743; GCN-NEXT:    s_addc_u32 s1, s7, 0
744; GCN-NEXT:    v_mov_b32_e32 v0, s0
745; GCN-NEXT:    v_mov_b32_e32 v1, s1
746; GCN-NEXT:    s_add_u32 s0, s6, 32
747; GCN-NEXT:    flat_store_dwordx2 v[0:1], v[12:13]
748; GCN-NEXT:    s_addc_u32 s1, s7, 0
749; GCN-NEXT:    v_mov_b32_e32 v0, s0
750; GCN-NEXT:    v_mov_b32_e32 v1, s1
751; GCN-NEXT:    flat_store_dwordx4 v[0:1], v[8:11]
752; GCN-NEXT:    s_endpgm
753entry:
754  %v = insertelement <7 x double> %vec, double 1.000000e+00, i32 %sel
755  store <7 x double> %v, ptr addrspace(1) %out
756  ret void
757}
758
; Dynamic-index insert of double 1.0 into a <16 x double> kernel argument.
; Expected code: all 32 dwords of %vec are copied into v0-v31, %sel is doubled
; (s_lshl_b32 by 1, as each double occupies two 32-bit registers) and moved to
; m0, and two v_movreld_b32 writes deposit the low dword (0) and the high dword
; (0x3ff00000) of 1.0. The vector is then written back with eight
; flat_store_dwordx4 at byte offsets 0x70 down to 0 from %out.
759define amdgpu_kernel void @double16_inselt(ptr addrspace(1) %out, <16 x double> %vec, i32 %sel) {
760; GCN-LABEL: double16_inselt:
761; GCN:       ; %bb.0: ; %entry
762; GCN-NEXT:    s_load_dword s0, s[4:5], 0x124
763; GCN-NEXT:    s_load_dwordx16 s[36:51], s[4:5], 0xa4
764; GCN-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0xe4
765; GCN-NEXT:    v_mov_b32_e32 v32, 0x3ff00000
766; GCN-NEXT:    s_waitcnt lgkmcnt(0)
767; GCN-NEXT:    v_mov_b32_e32 v0, s36
768; GCN-NEXT:    s_lshl_b32 s0, s0, 1
769; GCN-NEXT:    s_mov_b32 m0, s0
770; GCN-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
771; GCN-NEXT:    v_mov_b32_e32 v1, s37
772; GCN-NEXT:    v_mov_b32_e32 v2, s38
773; GCN-NEXT:    v_mov_b32_e32 v3, s39
774; GCN-NEXT:    v_mov_b32_e32 v4, s40
775; GCN-NEXT:    v_mov_b32_e32 v5, s41
776; GCN-NEXT:    v_mov_b32_e32 v6, s42
777; GCN-NEXT:    v_mov_b32_e32 v7, s43
778; GCN-NEXT:    v_mov_b32_e32 v8, s44
779; GCN-NEXT:    v_mov_b32_e32 v9, s45
780; GCN-NEXT:    v_mov_b32_e32 v10, s46
781; GCN-NEXT:    v_mov_b32_e32 v11, s47
782; GCN-NEXT:    v_mov_b32_e32 v12, s48
783; GCN-NEXT:    v_mov_b32_e32 v13, s49
784; GCN-NEXT:    v_mov_b32_e32 v14, s50
785; GCN-NEXT:    v_mov_b32_e32 v15, s51
786; GCN-NEXT:    v_mov_b32_e32 v16, s8
787; GCN-NEXT:    v_mov_b32_e32 v17, s9
788; GCN-NEXT:    v_mov_b32_e32 v18, s10
789; GCN-NEXT:    v_mov_b32_e32 v19, s11
790; GCN-NEXT:    v_mov_b32_e32 v20, s12
791; GCN-NEXT:    v_mov_b32_e32 v21, s13
792; GCN-NEXT:    v_mov_b32_e32 v22, s14
793; GCN-NEXT:    v_mov_b32_e32 v23, s15
794; GCN-NEXT:    v_mov_b32_e32 v24, s16
795; GCN-NEXT:    v_mov_b32_e32 v25, s17
796; GCN-NEXT:    v_mov_b32_e32 v26, s18
797; GCN-NEXT:    v_mov_b32_e32 v27, s19
798; GCN-NEXT:    v_mov_b32_e32 v28, s20
799; GCN-NEXT:    v_mov_b32_e32 v29, s21
800; GCN-NEXT:    v_mov_b32_e32 v30, s22
801; GCN-NEXT:    v_mov_b32_e32 v31, s23
802; GCN-NEXT:    s_waitcnt lgkmcnt(0)
803; GCN-NEXT:    s_add_u32 s2, s0, 0x70
804; GCN-NEXT:    v_movreld_b32_e32 v0, 0
805; GCN-NEXT:    s_addc_u32 s3, s1, 0
806; GCN-NEXT:    v_movreld_b32_e32 v1, v32
807; GCN-NEXT:    v_mov_b32_e32 v33, s3
808; GCN-NEXT:    v_mov_b32_e32 v32, s2
809; GCN-NEXT:    s_add_u32 s2, s0, 0x60
810; GCN-NEXT:    s_addc_u32 s3, s1, 0
811; GCN-NEXT:    flat_store_dwordx4 v[32:33], v[28:31]
812; GCN-NEXT:    s_nop 0
813; GCN-NEXT:    v_mov_b32_e32 v29, s3
814; GCN-NEXT:    v_mov_b32_e32 v28, s2
815; GCN-NEXT:    s_add_u32 s2, s0, 0x50
816; GCN-NEXT:    s_addc_u32 s3, s1, 0
817; GCN-NEXT:    flat_store_dwordx4 v[28:29], v[24:27]
818; GCN-NEXT:    s_nop 0
819; GCN-NEXT:    v_mov_b32_e32 v25, s3
820; GCN-NEXT:    v_mov_b32_e32 v24, s2
821; GCN-NEXT:    s_add_u32 s2, s0, 64
822; GCN-NEXT:    s_addc_u32 s3, s1, 0
823; GCN-NEXT:    flat_store_dwordx4 v[24:25], v[20:23]
824; GCN-NEXT:    s_nop 0
825; GCN-NEXT:    v_mov_b32_e32 v21, s3
826; GCN-NEXT:    v_mov_b32_e32 v20, s2
827; GCN-NEXT:    s_add_u32 s2, s0, 48
828; GCN-NEXT:    s_addc_u32 s3, s1, 0
829; GCN-NEXT:    flat_store_dwordx4 v[20:21], v[16:19]
830; GCN-NEXT:    s_nop 0
831; GCN-NEXT:    v_mov_b32_e32 v17, s3
832; GCN-NEXT:    v_mov_b32_e32 v16, s2
833; GCN-NEXT:    s_add_u32 s2, s0, 32
834; GCN-NEXT:    s_addc_u32 s3, s1, 0
835; GCN-NEXT:    flat_store_dwordx4 v[16:17], v[12:15]
836; GCN-NEXT:    s_nop 0
837; GCN-NEXT:    v_mov_b32_e32 v13, s3
838; GCN-NEXT:    v_mov_b32_e32 v12, s2
839; GCN-NEXT:    s_add_u32 s2, s0, 16
840; GCN-NEXT:    s_addc_u32 s3, s1, 0
841; GCN-NEXT:    flat_store_dwordx4 v[12:13], v[8:11]
842; GCN-NEXT:    s_nop 0
843; GCN-NEXT:    v_mov_b32_e32 v9, s3
844; GCN-NEXT:    v_mov_b32_e32 v8, s2
845; GCN-NEXT:    flat_store_dwordx4 v[8:9], v[4:7]
846; GCN-NEXT:    s_nop 0
847; GCN-NEXT:    v_mov_b32_e32 v5, s1
848; GCN-NEXT:    v_mov_b32_e32 v4, s0
849; GCN-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
850; GCN-NEXT:    s_endpgm
851entry:
; %sel is a kernel argument, so the insert position is only known at run time.
852  %v = insertelement <16 x double> %vec, double 1.000000e+00, i32 %sel
853  store <16 x double> %v, ptr addrspace(1) %out
854  ret void
855}
856
; Dynamic-index insert of double 1.0 into a non-power-of-two <15 x double>
; kernel argument. Expected code: the 30 dwords of %vec are loaded in
; dwordx16 + dwordx8 + dwordx4 + dwordx2 pieces and copied into v0-v29, %sel is
; doubled (s_lshl_b32 by 1) and moved to m0, and two v_movreld_b32 writes
; deposit the low dword (0) and high dword (0x3ff00000) of 1.0. The write-back
; uses seven flat_store_dwordx4 plus one flat_store_dwordx2 for the last
; element at byte offset 0x70 from %out.
857define amdgpu_kernel void @double15_inselt(ptr addrspace(1) %out, <15 x double> %vec, i32 %sel) {
858; GCN-LABEL: double15_inselt:
859; GCN:       ; %bb.0: ; %entry
860; GCN-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0xa4
861; GCN-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x114
862; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x104
863; GCN-NEXT:    s_load_dwordx8 s[24:31], s[4:5], 0xe4
864; GCN-NEXT:    v_mov_b32_e32 v32, 0x3ff00000
865; GCN-NEXT:    s_waitcnt lgkmcnt(0)
866; GCN-NEXT:    v_mov_b32_e32 v0, s8
867; GCN-NEXT:    v_mov_b32_e32 v1, s9
868; GCN-NEXT:    v_mov_b32_e32 v24, s0
869; GCN-NEXT:    s_load_dword s0, s[4:5], 0x124
870; GCN-NEXT:    v_mov_b32_e32 v25, s1
871; GCN-NEXT:    v_mov_b32_e32 v2, s10
872; GCN-NEXT:    v_mov_b32_e32 v3, s11
873; GCN-NEXT:    v_mov_b32_e32 v4, s12
874; GCN-NEXT:    s_waitcnt lgkmcnt(0)
875; GCN-NEXT:    s_lshl_b32 s0, s0, 1
876; GCN-NEXT:    s_mov_b32 m0, s0
877; GCN-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
878; GCN-NEXT:    v_mov_b32_e32 v5, s13
879; GCN-NEXT:    v_mov_b32_e32 v6, s14
880; GCN-NEXT:    v_mov_b32_e32 v7, s15
881; GCN-NEXT:    v_mov_b32_e32 v8, s16
882; GCN-NEXT:    v_mov_b32_e32 v9, s17
883; GCN-NEXT:    v_mov_b32_e32 v10, s18
884; GCN-NEXT:    v_mov_b32_e32 v11, s19
885; GCN-NEXT:    v_mov_b32_e32 v12, s20
886; GCN-NEXT:    v_mov_b32_e32 v13, s21
887; GCN-NEXT:    v_mov_b32_e32 v14, s22
888; GCN-NEXT:    v_mov_b32_e32 v15, s23
889; GCN-NEXT:    v_mov_b32_e32 v16, s24
890; GCN-NEXT:    v_mov_b32_e32 v17, s25
891; GCN-NEXT:    v_mov_b32_e32 v18, s26
892; GCN-NEXT:    v_mov_b32_e32 v19, s27
893; GCN-NEXT:    v_mov_b32_e32 v20, s28
894; GCN-NEXT:    v_mov_b32_e32 v21, s29
895; GCN-NEXT:    v_mov_b32_e32 v22, s30
896; GCN-NEXT:    v_mov_b32_e32 v23, s31
897; GCN-NEXT:    v_mov_b32_e32 v26, s2
898; GCN-NEXT:    v_mov_b32_e32 v27, s3
899; GCN-NEXT:    v_mov_b32_e32 v28, s6
900; GCN-NEXT:    v_mov_b32_e32 v29, s7
901; GCN-NEXT:    v_movreld_b32_e32 v0, 0
902; GCN-NEXT:    s_waitcnt lgkmcnt(0)
903; GCN-NEXT:    s_add_u32 s2, s0, 0x50
904; GCN-NEXT:    v_movreld_b32_e32 v1, v32
905; GCN-NEXT:    s_addc_u32 s3, s1, 0
906; GCN-NEXT:    v_mov_b32_e32 v31, s3
907; GCN-NEXT:    v_mov_b32_e32 v30, s2
908; GCN-NEXT:    s_add_u32 s2, s0, 64
909; GCN-NEXT:    s_addc_u32 s3, s1, 0
910; GCN-NEXT:    flat_store_dwordx4 v[30:31], v[20:23]
911; GCN-NEXT:    s_nop 0
912; GCN-NEXT:    v_mov_b32_e32 v21, s3
913; GCN-NEXT:    v_mov_b32_e32 v20, s2
914; GCN-NEXT:    s_add_u32 s2, s0, 48
915; GCN-NEXT:    s_addc_u32 s3, s1, 0
916; GCN-NEXT:    flat_store_dwordx4 v[20:21], v[16:19]
917; GCN-NEXT:    s_nop 0
918; GCN-NEXT:    v_mov_b32_e32 v17, s3
919; GCN-NEXT:    v_mov_b32_e32 v16, s2
920; GCN-NEXT:    s_add_u32 s2, s0, 32
921; GCN-NEXT:    s_addc_u32 s3, s1, 0
922; GCN-NEXT:    flat_store_dwordx4 v[16:17], v[12:15]
923; GCN-NEXT:    s_nop 0
924; GCN-NEXT:    v_mov_b32_e32 v13, s3
925; GCN-NEXT:    v_mov_b32_e32 v12, s2
926; GCN-NEXT:    s_add_u32 s2, s0, 16
927; GCN-NEXT:    s_addc_u32 s3, s1, 0
928; GCN-NEXT:    flat_store_dwordx4 v[12:13], v[8:11]
929; GCN-NEXT:    s_nop 0
930; GCN-NEXT:    v_mov_b32_e32 v9, s3
931; GCN-NEXT:    v_mov_b32_e32 v8, s2
932; GCN-NEXT:    flat_store_dwordx4 v[8:9], v[4:7]
933; GCN-NEXT:    s_add_u32 s2, s0, 0x70
934; GCN-NEXT:    v_mov_b32_e32 v5, s1
935; GCN-NEXT:    v_mov_b32_e32 v4, s0
936; GCN-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
937; GCN-NEXT:    s_addc_u32 s3, s1, 0
938; GCN-NEXT:    v_mov_b32_e32 v0, s2
939; GCN-NEXT:    v_mov_b32_e32 v1, s3
940; GCN-NEXT:    s_add_u32 s0, s0, 0x60
941; GCN-NEXT:    flat_store_dwordx2 v[0:1], v[28:29]
942; GCN-NEXT:    s_addc_u32 s1, s1, 0
943; GCN-NEXT:    v_mov_b32_e32 v0, s0
944; GCN-NEXT:    v_mov_b32_e32 v1, s1
945; GCN-NEXT:    flat_store_dwordx4 v[0:1], v[24:27]
946; GCN-NEXT:    s_endpgm
947entry:
; %sel is a kernel argument, so the insert position is only known at run time.
948  %v = insertelement <15 x double> %vec, double 1.000000e+00, i32 %sel
949  store <15 x double> %v, ptr addrspace(1) %out
950  ret void
951}
952
953; FIXME: Fold out the redundant OR with 0 of the insert index (currently emitted as v_or_b32_e64 v1, s3, 0)
; Dynamic-index insert of i1 true into a <4 x i1> kernel argument.
; Expected code goes through scratch memory: element 0 is stored straight from
; the packed argument and elements 1-3 are extracted with s_bfe_u32, one byte
; each at scratch offsets 0-3. A byte of 1 is then stored at the offset given
; by %sel masked with 3 (offen addressing), the four bytes are loaded back,
; masked/shifted and ORed into a 4-bit value, and that value is written to
; %out with flat_store_byte.
954define amdgpu_kernel void @bit4_inselt(ptr addrspace(1) %out, <4 x i1> %vec, i32 %sel) {
955; GCN-LABEL: bit4_inselt:
956; GCN:       ; %bb.0: ; %entry
957; GCN-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
958; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
959; GCN-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
960; GCN-NEXT:    s_mov_b32 s14, -1
961; GCN-NEXT:    s_mov_b32 s15, 0xe80000
962; GCN-NEXT:    s_add_u32 s12, s12, s11
963; GCN-NEXT:    s_addc_u32 s13, s13, 0
964; GCN-NEXT:    s_waitcnt lgkmcnt(0)
965; GCN-NEXT:    s_bfe_u32 s6, s2, 0x10003
966; GCN-NEXT:    v_mov_b32_e32 v0, s2
967; GCN-NEXT:    s_bfe_u32 s5, s2, 0x20002
968; GCN-NEXT:    buffer_store_byte v0, off, s[12:15], 0
969; GCN-NEXT:    v_mov_b32_e32 v0, s6
970; GCN-NEXT:    s_bfe_u32 s4, s2, 0x10001
971; GCN-NEXT:    buffer_store_byte v0, off, s[12:15], 0 offset:3
972; GCN-NEXT:    v_mov_b32_e32 v0, s5
973; GCN-NEXT:    s_and_b32 s3, s3, 3
974; GCN-NEXT:    buffer_store_byte v0, off, s[12:15], 0 offset:2
975; GCN-NEXT:    v_mov_b32_e32 v0, s4
976; GCN-NEXT:    v_or_b32_e64 v1, s3, 0
977; GCN-NEXT:    buffer_store_byte v0, off, s[12:15], 0 offset:1
978; GCN-NEXT:    v_mov_b32_e32 v0, 1
979; GCN-NEXT:    buffer_store_byte v0, v1, s[12:15], 0 offen
980; GCN-NEXT:    buffer_load_ubyte v0, off, s[12:15], 0
981; GCN-NEXT:    buffer_load_ubyte v1, off, s[12:15], 0 offset:1
982; GCN-NEXT:    buffer_load_ubyte v2, off, s[12:15], 0 offset:2
983; GCN-NEXT:    buffer_load_ubyte v3, off, s[12:15], 0 offset:3
984; GCN-NEXT:    s_waitcnt vmcnt(3)
985; GCN-NEXT:    v_and_b32_e32 v0, 1, v0
986; GCN-NEXT:    s_waitcnt vmcnt(2)
987; GCN-NEXT:    v_and_b32_e32 v1, 1, v1
988; GCN-NEXT:    s_waitcnt vmcnt(1)
989; GCN-NEXT:    v_and_b32_e32 v2, 1, v2
990; GCN-NEXT:    v_lshlrev_b16_e32 v1, 1, v1
991; GCN-NEXT:    v_lshlrev_b16_e32 v2, 2, v2
992; GCN-NEXT:    v_or_b32_e32 v0, v0, v1
993; GCN-NEXT:    s_waitcnt vmcnt(0)
994; GCN-NEXT:    v_lshlrev_b16_e32 v3, 3, v3
995; GCN-NEXT:    v_or_b32_e32 v0, v0, v2
996; GCN-NEXT:    v_or_b32_e32 v0, v0, v3
997; GCN-NEXT:    v_and_b32_e32 v2, 15, v0
998; GCN-NEXT:    v_mov_b32_e32 v0, s0
999; GCN-NEXT:    v_mov_b32_e32 v1, s1
1000; GCN-NEXT:    flat_store_byte v[0:1], v2
1001; GCN-NEXT:    s_endpgm
1002entry:
; %sel is a kernel argument, so the insert position is only known at run time.
1003  %v = insertelement <4 x i1> %vec, i1 1, i32 %sel
1004  store <4 x i1> %v, ptr addrspace(1) %out
1005  ret void
1006}
1007
1008define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, i32 %sel) {
1009; GCN-LABEL: bit128_inselt:
1010; GCN:       ; %bb.0: ; %entry
1011; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x34
1012; GCN-NEXT:    s_load_dword s6, s[4:5], 0x44
1013; GCN-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x24
1014; GCN-NEXT:    ; implicit-def: $vgpr6 : SGPR spill to VGPR lane
1015; GCN-NEXT:    s_waitcnt lgkmcnt(0)
1016; GCN-NEXT:    s_bfe_u32 s9, s0, 0xf0001
1017; GCN-NEXT:    s_lshr_b32 s42, s1, 16
1018; GCN-NEXT:    v_writelane_b32 v6, s4, 0
1019; GCN-NEXT:    v_writelane_b32 v6, s5, 1
1020; GCN-NEXT:    s_lshr_b32 s4, s0, 16
1021; GCN-NEXT:    v_writelane_b32 v6, s4, 2
1022; GCN-NEXT:    s_lshr_b32 s4, s0, 17
1023; GCN-NEXT:    v_writelane_b32 v6, s4, 3
1024; GCN-NEXT:    s_lshr_b32 s4, s0, 18
1025; GCN-NEXT:    v_writelane_b32 v6, s4, 4
1026; GCN-NEXT:    s_lshr_b32 s4, s0, 19
1027; GCN-NEXT:    v_writelane_b32 v6, s4, 5
1028; GCN-NEXT:    s_lshr_b32 s4, s0, 20
1029; GCN-NEXT:    v_writelane_b32 v6, s4, 6
1030; GCN-NEXT:    s_lshr_b32 s4, s0, 21
1031; GCN-NEXT:    v_writelane_b32 v6, s4, 7
1032; GCN-NEXT:    s_lshr_b32 s4, s0, 22
1033; GCN-NEXT:    v_writelane_b32 v6, s4, 8
1034; GCN-NEXT:    s_lshr_b32 s4, s0, 23
1035; GCN-NEXT:    v_writelane_b32 v6, s4, 9
1036; GCN-NEXT:    s_lshr_b32 s4, s0, 24
1037; GCN-NEXT:    v_writelane_b32 v6, s4, 10
1038; GCN-NEXT:    s_lshr_b32 s4, s0, 25
1039; GCN-NEXT:    v_writelane_b32 v6, s4, 11
1040; GCN-NEXT:    s_lshr_b32 s4, s0, 26
1041; GCN-NEXT:    v_writelane_b32 v6, s4, 12
1042; GCN-NEXT:    s_lshr_b32 s4, s0, 27
1043; GCN-NEXT:    v_writelane_b32 v6, s4, 13
1044; GCN-NEXT:    s_lshr_b32 s4, s0, 28
1045; GCN-NEXT:    v_writelane_b32 v6, s4, 14
1046; GCN-NEXT:    s_lshr_b32 s4, s0, 29
1047; GCN-NEXT:    v_writelane_b32 v6, s4, 15
1048; GCN-NEXT:    s_lshr_b32 s4, s0, 30
1049; GCN-NEXT:    v_writelane_b32 v6, s4, 16
1050; GCN-NEXT:    s_lshr_b32 s4, s0, 31
1051; GCN-NEXT:    v_writelane_b32 v6, s4, 17
1052; GCN-NEXT:    v_writelane_b32 v6, s9, 18
1053; GCN-NEXT:    s_bfe_u32 s9, s0, 0xe0002
1054; GCN-NEXT:    v_writelane_b32 v6, s9, 19
1055; GCN-NEXT:    s_bfe_u32 s9, s0, 0xd0003
1056; GCN-NEXT:    v_writelane_b32 v6, s9, 20
1057; GCN-NEXT:    s_bfe_u32 s9, s0, 0xc0004
1058; GCN-NEXT:    v_writelane_b32 v6, s9, 21
1059; GCN-NEXT:    s_bfe_u32 s9, s0, 0xb0005
1060; GCN-NEXT:    v_writelane_b32 v6, s9, 22
1061; GCN-NEXT:    s_bfe_u32 s9, s0, 0xa0006
1062; GCN-NEXT:    v_writelane_b32 v6, s9, 23
1063; GCN-NEXT:    s_bfe_u32 s9, s0, 0x90007
1064; GCN-NEXT:    v_writelane_b32 v6, s9, 24
1065; GCN-NEXT:    s_bfe_u32 s9, s0, 0x80008
1066; GCN-NEXT:    v_writelane_b32 v6, s9, 25
1067; GCN-NEXT:    s_bfe_u32 s9, s0, 0x70009
1068; GCN-NEXT:    v_writelane_b32 v6, s9, 26
1069; GCN-NEXT:    s_bfe_u32 s9, s0, 0x6000a
1070; GCN-NEXT:    v_writelane_b32 v6, s9, 27
1071; GCN-NEXT:    s_bfe_u32 s9, s0, 0x5000b
1072; GCN-NEXT:    v_writelane_b32 v6, s9, 28
1073; GCN-NEXT:    s_bfe_u32 s9, s0, 0x4000c
1074; GCN-NEXT:    v_writelane_b32 v6, s9, 29
1075; GCN-NEXT:    s_bfe_u32 s9, s0, 0x3000d
1076; GCN-NEXT:    v_writelane_b32 v6, s9, 30
1077; GCN-NEXT:    s_bfe_u32 s9, s0, 0x2000e
1078; GCN-NEXT:    v_writelane_b32 v6, s9, 31
1079; GCN-NEXT:    s_bfe_u32 s9, s0, 0x1000f
1080; GCN-NEXT:    v_writelane_b32 v6, s9, 32
1081; GCN-NEXT:    s_bfe_u32 s9, s1, 0xf0001
1082; GCN-NEXT:    s_lshr_b32 s43, s1, 17
1083; GCN-NEXT:    s_lshr_b32 s45, s1, 18
1084; GCN-NEXT:    s_lshr_b32 s47, s1, 19
1085; GCN-NEXT:    s_lshr_b32 s50, s1, 20
1086; GCN-NEXT:    s_lshr_b32 s51, s1, 21
1087; GCN-NEXT:    s_lshr_b32 s53, s1, 22
1088; GCN-NEXT:    s_lshr_b32 s55, s1, 23
1089; GCN-NEXT:    s_lshr_b32 s58, s1, 24
1090; GCN-NEXT:    s_lshr_b32 s59, s1, 25
1091; GCN-NEXT:    s_lshr_b32 s61, s1, 26
1092; GCN-NEXT:    s_lshr_b32 s63, s1, 27
1093; GCN-NEXT:    s_lshr_b32 s66, s1, 28
1094; GCN-NEXT:    s_lshr_b32 s67, s1, 29
1095; GCN-NEXT:    s_lshr_b32 s68, s1, 30
1096; GCN-NEXT:    s_lshr_b32 s69, s1, 31
1097; GCN-NEXT:    s_lshr_b32 s73, s2, 16
1098; GCN-NEXT:    s_lshr_b32 s74, s2, 17
1099; GCN-NEXT:    s_lshr_b32 s77, s2, 18
1100; GCN-NEXT:    s_lshr_b32 s78, s2, 19
1101; GCN-NEXT:    s_lshr_b32 s81, s2, 20
1102; GCN-NEXT:    s_lshr_b32 s82, s2, 21
1103; GCN-NEXT:    s_lshr_b32 s84, s2, 22
1104; GCN-NEXT:    s_lshr_b32 s86, s2, 23
1105; GCN-NEXT:    s_lshr_b32 s89, s2, 24
1106; GCN-NEXT:    s_lshr_b32 s90, s2, 25
1107; GCN-NEXT:    s_lshr_b32 s93, s2, 26
1108; GCN-NEXT:    s_lshr_b32 s94, s2, 27
1109; GCN-NEXT:    s_lshr_b32 vcc_hi, s2, 28
1110; GCN-NEXT:    s_lshr_b32 s39, s2, 29
1111; GCN-NEXT:    s_lshr_b32 s38, s2, 30
1112; GCN-NEXT:    s_lshr_b32 s37, s2, 31
1113; GCN-NEXT:    s_lshr_b32 s33, s3, 16
1114; GCN-NEXT:    s_lshr_b32 s31, s3, 17
1115; GCN-NEXT:    s_lshr_b32 s28, s3, 18
1116; GCN-NEXT:    s_lshr_b32 s27, s3, 19
1117; GCN-NEXT:    s_lshr_b32 s24, s3, 20
1118; GCN-NEXT:    s_lshr_b32 s23, s3, 21
1119; GCN-NEXT:    s_lshr_b32 s20, s3, 22
1120; GCN-NEXT:    s_lshr_b32 s19, s3, 23
1121; GCN-NEXT:    s_lshr_b32 s16, s3, 24
1122; GCN-NEXT:    s_lshr_b32 s15, s3, 25
1123; GCN-NEXT:    s_lshr_b32 s12, s3, 26
1124; GCN-NEXT:    s_lshr_b32 s11, s3, 27
1125; GCN-NEXT:    s_lshr_b32 s8, s3, 28
1126; GCN-NEXT:    s_lshr_b32 s7, s3, 29
1127; GCN-NEXT:    s_lshr_b32 s5, s3, 30
1128; GCN-NEXT:    s_lshr_b32 s4, s3, 31
1129; GCN-NEXT:    v_writelane_b32 v6, s9, 33
1130; GCN-NEXT:    s_bfe_u32 s40, s1, 0xe0002
1131; GCN-NEXT:    s_bfe_u32 s41, s1, 0xd0003
1132; GCN-NEXT:    s_bfe_u32 s44, s1, 0xc0004
1133; GCN-NEXT:    s_bfe_u32 s46, s1, 0xb0005
1134; GCN-NEXT:    s_bfe_u32 s48, s1, 0xa0006
1135; GCN-NEXT:    s_bfe_u32 s49, s1, 0x90007
1136; GCN-NEXT:    s_bfe_u32 s52, s1, 0x80008
1137; GCN-NEXT:    s_bfe_u32 s54, s1, 0x70009
1138; GCN-NEXT:    s_bfe_u32 s56, s1, 0x6000a
1139; GCN-NEXT:    s_bfe_u32 s57, s1, 0x5000b
1140; GCN-NEXT:    s_bfe_u32 s60, s1, 0x4000c
1141; GCN-NEXT:    s_bfe_u32 s62, s1, 0x3000d
1142; GCN-NEXT:    s_bfe_u32 s64, s1, 0x2000e
1143; GCN-NEXT:    s_bfe_u32 s65, s1, 0x1000f
1144; GCN-NEXT:    s_bfe_u32 s70, s2, 0xf0001
1145; GCN-NEXT:    s_bfe_u32 s71, s2, 0xe0002
1146; GCN-NEXT:    s_bfe_u32 s72, s2, 0xd0003
1147; GCN-NEXT:    s_bfe_u32 s75, s2, 0xc0004
1148; GCN-NEXT:    s_bfe_u32 s76, s2, 0xb0005
1149; GCN-NEXT:    s_bfe_u32 s79, s2, 0xa0006
1150; GCN-NEXT:    s_bfe_u32 s80, s2, 0x90007
1151; GCN-NEXT:    s_bfe_u32 s83, s2, 0x80008
1152; GCN-NEXT:    s_bfe_u32 s85, s2, 0x70009
1153; GCN-NEXT:    s_bfe_u32 s87, s2, 0x6000a
1154; GCN-NEXT:    s_bfe_u32 s88, s2, 0x5000b
1155; GCN-NEXT:    s_bfe_u32 s91, s2, 0x4000c
1156; GCN-NEXT:    s_bfe_u32 s92, s2, 0x3000d
1157; GCN-NEXT:    s_bfe_u32 s95, s2, 0x2000e
1158; GCN-NEXT:    s_bfe_u32 vcc_lo, s2, 0x1000f
1159; GCN-NEXT:    s_bfe_u32 s36, s3, 0xf0001
1160; GCN-NEXT:    s_bfe_u32 s35, s3, 0xe0002
1161; GCN-NEXT:    s_bfe_u32 s34, s3, 0xd0003
1162; GCN-NEXT:    s_bfe_u32 s30, s3, 0xc0004
1163; GCN-NEXT:    s_bfe_u32 s29, s3, 0xb0005
1164; GCN-NEXT:    s_bfe_u32 s26, s3, 0xa0006
1165; GCN-NEXT:    s_bfe_u32 s25, s3, 0x90007
1166; GCN-NEXT:    s_bfe_u32 s22, s3, 0x80008
1167; GCN-NEXT:    s_bfe_u32 s21, s3, 0x70009
1168; GCN-NEXT:    s_bfe_u32 s18, s3, 0x6000a
1169; GCN-NEXT:    s_bfe_u32 s17, s3, 0x5000b
1170; GCN-NEXT:    s_bfe_u32 s14, s3, 0x4000c
1171; GCN-NEXT:    s_bfe_u32 s13, s3, 0x3000d
1172; GCN-NEXT:    s_bfe_u32 s10, s3, 0x2000e
1173; GCN-NEXT:    s_bfe_u32 s9, s3, 0x1000f
1174; GCN-NEXT:    s_cmpk_lg_i32 s6, 0x7f
1175; GCN-NEXT:    s_cselect_b32 s4, s4, 1
1176; GCN-NEXT:    s_lshl_b32 s4, s4, 3
1177; GCN-NEXT:    s_cmpk_lg_i32 s6, 0x7e
1178; GCN-NEXT:    s_cselect_b32 s5, s5, 1
1179; GCN-NEXT:    s_and_b32 s5, s5, 1
1180; GCN-NEXT:    s_lshl_b32 s5, s5, 2
1181; GCN-NEXT:    s_or_b32 s4, s4, s5
1182; GCN-NEXT:    s_cmpk_lg_i32 s6, 0x7d
1183; GCN-NEXT:    s_cselect_b32 s5, s7, 1
1184; GCN-NEXT:    s_lshl_b32 s5, s5, 1
1185; GCN-NEXT:    s_cmpk_lg_i32 s6, 0x7c
1186; GCN-NEXT:    s_cselect_b32 s7, s8, 1
1187; GCN-NEXT:    s_and_b32 s7, s7, 1
1188; GCN-NEXT:    s_or_b32 s5, s7, s5
1189; GCN-NEXT:    s_and_b32 s5, s5, 3
1190; GCN-NEXT:    s_or_b32 s4, s5, s4
1191; GCN-NEXT:    s_lshl_b32 s4, s4, 12
1192; GCN-NEXT:    s_cmpk_lg_i32 s6, 0x7b
1193; GCN-NEXT:    s_cselect_b32 s5, s11, 1
1194; GCN-NEXT:    s_lshl_b32 s5, s5, 3
1195; GCN-NEXT:    s_cmpk_lg_i32 s6, 0x7a
1196; GCN-NEXT:    s_cselect_b32 s7, s12, 1
1197; GCN-NEXT:    s_and_b32 s7, s7, 1
1198; GCN-NEXT:    s_lshl_b32 s7, s7, 2
1199; GCN-NEXT:    s_or_b32 s5, s5, s7
1200; GCN-NEXT:    s_cmpk_lg_i32 s6, 0x79
1201; GCN-NEXT:    s_cselect_b32 s7, s15, 1
1202; GCN-NEXT:    s_lshl_b32 s7, s7, 1
1203; GCN-NEXT:    s_cmpk_lg_i32 s6, 0x78
1204; GCN-NEXT:    s_cselect_b32 s8, s16, 1
1205; GCN-NEXT:    s_and_b32 s8, s8, 1
1206; GCN-NEXT:    s_or_b32 s7, s8, s7
1207; GCN-NEXT:    s_and_b32 s7, s7, 3
1208; GCN-NEXT:    s_or_b32 s5, s7, s5
1209; GCN-NEXT:    s_and_b32 s5, s5, 15
1210; GCN-NEXT:    s_lshl_b32 s5, s5, 8
1211; GCN-NEXT:    s_or_b32 s4, s4, s5
1212; GCN-NEXT:    s_cmpk_lg_i32 s6, 0x77
1213; GCN-NEXT:    s_cselect_b32 s5, s19, 1
1214; GCN-NEXT:    s_lshl_b32 s5, s5, 3
1215; GCN-NEXT:    s_cmpk_lg_i32 s6, 0x76
1216; GCN-NEXT:    s_cselect_b32 s7, s20, 1
1217; GCN-NEXT:    s_and_b32 s7, s7, 1
1218; GCN-NEXT:    s_lshl_b32 s7, s7, 2
1219; GCN-NEXT:    s_or_b32 s5, s5, s7
1220; GCN-NEXT:    s_cmpk_lg_i32 s6, 0x75
1221; GCN-NEXT:    s_cselect_b32 s7, s23, 1
1222; GCN-NEXT:    s_lshl_b32 s7, s7, 1
1223; GCN-NEXT:    s_cmpk_lg_i32 s6, 0x74
1224; GCN-NEXT:    s_cselect_b32 s8, s24, 1
1225; GCN-NEXT:    s_and_b32 s8, s8, 1
1226; GCN-NEXT:    s_or_b32 s7, s8, s7
1227; GCN-NEXT:    s_and_b32 s7, s7, 3
1228; GCN-NEXT:    s_or_b32 s5, s7, s5
1229; GCN-NEXT:    s_lshl_b32 s5, s5, 4
1230; GCN-NEXT:    s_cmpk_lg_i32 s6, 0x73
1231; GCN-NEXT:    s_cselect_b32 s7, s27, 1
1232; GCN-NEXT:    s_lshl_b32 s7, s7, 3
1233; GCN-NEXT:    s_cmpk_lg_i32 s6, 0x72
1234; GCN-NEXT:    s_cselect_b32 s8, s28, 1
1235; GCN-NEXT:    s_and_b32 s8, s8, 1
1236; GCN-NEXT:    s_lshl_b32 s8, s8, 2
1237; GCN-NEXT:    s_or_b32 s7, s7, s8
1238; GCN-NEXT:    s_cmpk_lg_i32 s6, 0x71
1239; GCN-NEXT:    s_cselect_b32 s8, s31, 1
1240; GCN-NEXT:    s_lshl_b32 s8, s8, 1
1241; GCN-NEXT:    s_cmpk_lg_i32 s6, 0x70
1242; GCN-NEXT:    s_cselect_b32 s11, s33, 1
1243; GCN-NEXT:    s_and_b32 s11, s11, 1
1244; GCN-NEXT:    s_or_b32 s8, s11, s8
1245; GCN-NEXT:    s_and_b32 s8, s8, 3
1246; GCN-NEXT:    s_or_b32 s7, s8, s7
1247; GCN-NEXT:    s_and_b32 s7, s7, 15
1248; GCN-NEXT:    s_or_b32 s5, s7, s5
1249; GCN-NEXT:    s_and_b32 s5, s5, 0xff
1250; GCN-NEXT:    s_or_b32 s4, s5, s4
1251; GCN-NEXT:    s_lshl_b32 s4, s4, 16
1252; GCN-NEXT:    s_cmpk_lg_i32 s6, 0x6f
1253; GCN-NEXT:    s_cselect_b32 s5, s9, 1
1254; GCN-NEXT:    s_lshl_b32 s5, s5, 3
1255; GCN-NEXT:    s_cmpk_lg_i32 s6, 0x6e
1256; GCN-NEXT:    s_cselect_b32 s7, s10, 1
1257; GCN-NEXT:    s_and_b32 s7, s7, 1
1258; GCN-NEXT:    s_lshl_b32 s7, s7, 2
1259; GCN-NEXT:    s_or_b32 s5, s5, s7
1260; GCN-NEXT:    s_cmpk_lg_i32 s6, 0x6d
1261; GCN-NEXT:    s_cselect_b32 s7, s13, 1
1262; GCN-NEXT:    s_lshl_b32 s7, s7, 1
1263; GCN-NEXT:    s_cmpk_lg_i32 s6, 0x6c
1264; GCN-NEXT:    s_cselect_b32 s8, s14, 1
1265; GCN-NEXT:    s_and_b32 s8, s8, 1
1266; GCN-NEXT:    s_or_b32 s7, s8, s7
1267; GCN-NEXT:    s_and_b32 s7, s7, 3
1268; GCN-NEXT:    s_or_b32 s5, s7, s5
1269; GCN-NEXT:    s_lshl_b32 s5, s5, 12
1270; GCN-NEXT:    s_cmpk_lg_i32 s6, 0x6b
1271; GCN-NEXT:    s_cselect_b32 s7, s17, 1
1272; GCN-NEXT:    s_lshl_b32 s7, s7, 3
1273; GCN-NEXT:    s_cmpk_lg_i32 s6, 0x6a
1274; GCN-NEXT:    s_cselect_b32 s8, s18, 1
1275; GCN-NEXT:    s_and_b32 s8, s8, 1
1276; GCN-NEXT:    s_lshl_b32 s8, s8, 2
1277; GCN-NEXT:    s_or_b32 s7, s7, s8
1278; GCN-NEXT:    s_cmpk_lg_i32 s6, 0x69
1279; GCN-NEXT:    s_cselect_b32 s8, s21, 1
1280; GCN-NEXT:    s_lshl_b32 s8, s8, 1
1281; GCN-NEXT:    s_cmpk_lg_i32 s6, 0x68
1282; GCN-NEXT:    s_cselect_b32 s9, s22, 1
1283; GCN-NEXT:    s_and_b32 s9, s9, 1
1284; GCN-NEXT:    s_or_b32 s8, s9, s8
1285; GCN-NEXT:    s_and_b32 s8, s8, 3
1286; GCN-NEXT:    s_or_b32 s7, s8, s7
1287; GCN-NEXT:    s_and_b32 s7, s7, 15
1288; GCN-NEXT:    s_lshl_b32 s7, s7, 8
1289; GCN-NEXT:    s_or_b32 s5, s5, s7
1290; GCN-NEXT:    s_cmpk_lg_i32 s6, 0x67
1291; GCN-NEXT:    s_cselect_b32 s7, s25, 1
1292; GCN-NEXT:    s_lshl_b32 s7, s7, 3
1293; GCN-NEXT:    s_cmpk_lg_i32 s6, 0x66
1294; GCN-NEXT:    s_cselect_b32 s8, s26, 1
1295; GCN-NEXT:    s_and_b32 s8, s8, 1
1296; GCN-NEXT:    s_lshl_b32 s8, s8, 2
1297; GCN-NEXT:    s_or_b32 s7, s7, s8
1298; GCN-NEXT:    s_cmpk_lg_i32 s6, 0x65
1299; GCN-NEXT:    s_cselect_b32 s8, s29, 1
1300; GCN-NEXT:    s_lshl_b32 s8, s8, 1
1301; GCN-NEXT:    s_cmpk_lg_i32 s6, 0x64
1302; GCN-NEXT:    s_cselect_b32 s9, s30, 1
1303; GCN-NEXT:    s_and_b32 s9, s9, 1
1304; GCN-NEXT:    s_or_b32 s8, s9, s8
1305; GCN-NEXT:    s_and_b32 s8, s8, 3
1306; GCN-NEXT:    s_or_b32 s7, s8, s7
1307; GCN-NEXT:    s_lshl_b32 s7, s7, 4
1308; GCN-NEXT:    s_cmpk_lg_i32 s6, 0x63
1309; GCN-NEXT:    s_cselect_b32 s8, s34, 1
1310; GCN-NEXT:    s_lshl_b32 s8, s8, 3
1311; GCN-NEXT:    s_cmpk_lg_i32 s6, 0x62
1312; GCN-NEXT:    s_cselect_b32 s9, s35, 1
1313; GCN-NEXT:    s_and_b32 s9, s9, 1
1314; GCN-NEXT:    s_lshl_b32 s9, s9, 2
1315; GCN-NEXT:    s_or_b32 s8, s8, s9
1316; GCN-NEXT:    s_cmpk_lg_i32 s6, 0x60
1317; GCN-NEXT:    s_cselect_b32 s3, s3, 1
1318; GCN-NEXT:    s_and_b32 s3, s3, 1
1319; GCN-NEXT:    s_cmpk_lg_i32 s6, 0x61
1320; GCN-NEXT:    s_cselect_b32 s9, s36, 1
1321; GCN-NEXT:    s_lshl_b32 s9, s9, 1
1322; GCN-NEXT:    s_or_b32 s3, s3, s9
1323; GCN-NEXT:    s_and_b32 s3, s3, 3
1324; GCN-NEXT:    s_or_b32 s3, s3, s8
1325; GCN-NEXT:    s_and_b32 s3, s3, 15
1326; GCN-NEXT:    s_or_b32 s3, s3, s7
1327; GCN-NEXT:    s_and_b32 s3, s3, 0xff
1328; GCN-NEXT:    s_or_b32 s3, s3, s5
1329; GCN-NEXT:    s_and_b32 s3, s3, 0xffff
1330; GCN-NEXT:    s_or_b32 s3, s3, s4
1331; GCN-NEXT:    s_cmpk_lg_i32 s6, 0x5f
1332; GCN-NEXT:    s_cselect_b32 s4, s37, 1
1333; GCN-NEXT:    s_lshl_b32 s4, s4, 3
1334; GCN-NEXT:    s_cmpk_lg_i32 s6, 0x5e
1335; GCN-NEXT:    s_cselect_b32 s5, s38, 1
1336; GCN-NEXT:    s_and_b32 s5, s5, 1
1337; GCN-NEXT:    s_lshl_b32 s5, s5, 2
1338; GCN-NEXT:    s_or_b32 s4, s4, s5
1339; GCN-NEXT:    s_cmpk_lg_i32 s6, 0x5d
1340; GCN-NEXT:    s_cselect_b32 s5, s39, 1
1341; GCN-NEXT:    s_lshl_b32 s5, s5, 1
1342; GCN-NEXT:    s_cmpk_lg_i32 s6, 0x5c
1343; GCN-NEXT:    s_cselect_b32 s7, vcc_hi, 1
1344; GCN-NEXT:    s_and_b32 s7, s7, 1
1345; GCN-NEXT:    s_or_b32 s5, s7, s5
1346; GCN-NEXT:    s_and_b32 s5, s5, 3
1347; GCN-NEXT:    s_or_b32 s4, s5, s4
1348; GCN-NEXT:    s_lshl_b32 s4, s4, 12
1349; GCN-NEXT:    s_cmpk_lg_i32 s6, 0x5b
1350; GCN-NEXT:    s_cselect_b32 s5, s94, 1
1351; GCN-NEXT:    s_lshl_b32 s5, s5, 3
1352; GCN-NEXT:    s_cmpk_lg_i32 s6, 0x5a
1353; GCN-NEXT:    s_cselect_b32 s7, s93, 1
1354; GCN-NEXT:    s_and_b32 s7, s7, 1
1355; GCN-NEXT:    s_lshl_b32 s7, s7, 2
1356; GCN-NEXT:    s_or_b32 s5, s5, s7
1357; GCN-NEXT:    s_cmpk_lg_i32 s6, 0x59
1358; GCN-NEXT:    s_cselect_b32 s7, s90, 1
1359; GCN-NEXT:    s_lshl_b32 s7, s7, 1
1360; GCN-NEXT:    s_cmpk_lg_i32 s6, 0x58
1361; GCN-NEXT:    s_cselect_b32 s8, s89, 1
1362; GCN-NEXT:    s_and_b32 s8, s8, 1
1363; GCN-NEXT:    s_or_b32 s7, s8, s7
1364; GCN-NEXT:    s_and_b32 s7, s7, 3
1365; GCN-NEXT:    s_or_b32 s5, s7, s5
1366; GCN-NEXT:    s_and_b32 s5, s5, 15
1367; GCN-NEXT:    s_lshl_b32 s5, s5, 8
1368; GCN-NEXT:    s_or_b32 s4, s4, s5
1369; GCN-NEXT:    s_cmpk_lg_i32 s6, 0x57
1370; GCN-NEXT:    s_cselect_b32 s5, s86, 1
1371; GCN-NEXT:    s_lshl_b32 s5, s5, 3
1372; GCN-NEXT:    s_cmpk_lg_i32 s6, 0x56
1373; GCN-NEXT:    s_cselect_b32 s7, s84, 1
1374; GCN-NEXT:    s_and_b32 s7, s7, 1
1375; GCN-NEXT:    s_lshl_b32 s7, s7, 2
1376; GCN-NEXT:    s_or_b32 s5, s5, s7
1377; GCN-NEXT:    s_cmpk_lg_i32 s6, 0x55
1378; GCN-NEXT:    s_cselect_b32 s7, s82, 1
1379; GCN-NEXT:    s_lshl_b32 s7, s7, 1
1380; GCN-NEXT:    s_cmpk_lg_i32 s6, 0x54
1381; GCN-NEXT:    s_cselect_b32 s8, s81, 1
1382; GCN-NEXT:    s_and_b32 s8, s8, 1
1383; GCN-NEXT:    s_or_b32 s7, s8, s7
1384; GCN-NEXT:    s_and_b32 s7, s7, 3
1385; GCN-NEXT:    s_or_b32 s5, s7, s5
1386; GCN-NEXT:    s_lshl_b32 s5, s5, 4
1387; GCN-NEXT:    s_cmpk_lg_i32 s6, 0x53
1388; GCN-NEXT:    s_cselect_b32 s7, s78, 1
1389; GCN-NEXT:    s_lshl_b32 s7, s7, 3
1390; GCN-NEXT:    s_cmpk_lg_i32 s6, 0x52
1391; GCN-NEXT:    s_cselect_b32 s8, s77, 1
1392; GCN-NEXT:    s_and_b32 s8, s8, 1
1393; GCN-NEXT:    s_lshl_b32 s8, s8, 2
1394; GCN-NEXT:    s_or_b32 s7, s7, s8
1395; GCN-NEXT:    s_cmpk_lg_i32 s6, 0x51
1396; GCN-NEXT:    s_cselect_b32 s8, s74, 1
1397; GCN-NEXT:    s_lshl_b32 s8, s8, 1
1398; GCN-NEXT:    s_cmpk_lg_i32 s6, 0x50
1399; GCN-NEXT:    s_cselect_b32 s9, s73, 1
1400; GCN-NEXT:    s_and_b32 s9, s9, 1
1401; GCN-NEXT:    s_or_b32 s8, s9, s8
1402; GCN-NEXT:    s_and_b32 s8, s8, 3
1403; GCN-NEXT:    s_or_b32 s7, s8, s7
1404; GCN-NEXT:    s_and_b32 s7, s7, 15
1405; GCN-NEXT:    s_or_b32 s5, s7, s5
1406; GCN-NEXT:    s_and_b32 s5, s5, 0xff
1407; GCN-NEXT:    s_or_b32 s4, s5, s4
1408; GCN-NEXT:    s_lshl_b32 s4, s4, 16
1409; GCN-NEXT:    s_cmpk_lg_i32 s6, 0x4f
1410; GCN-NEXT:    s_cselect_b32 s5, vcc_lo, 1
1411; GCN-NEXT:    s_lshl_b32 s5, s5, 3
1412; GCN-NEXT:    s_cmpk_lg_i32 s6, 0x4e
1413; GCN-NEXT:    s_cselect_b32 s7, s95, 1
1414; GCN-NEXT:    s_and_b32 s7, s7, 1
1415; GCN-NEXT:    s_lshl_b32 s7, s7, 2
1416; GCN-NEXT:    s_or_b32 s5, s5, s7
1417; GCN-NEXT:    s_cmpk_lg_i32 s6, 0x4d
1418; GCN-NEXT:    s_cselect_b32 s7, s92, 1
1419; GCN-NEXT:    s_lshl_b32 s7, s7, 1
1420; GCN-NEXT:    s_cmpk_lg_i32 s6, 0x4c
1421; GCN-NEXT:    s_cselect_b32 s8, s91, 1
1422; GCN-NEXT:    s_and_b32 s8, s8, 1
1423; GCN-NEXT:    s_or_b32 s7, s8, s7
1424; GCN-NEXT:    s_and_b32 s7, s7, 3
1425; GCN-NEXT:    s_or_b32 s5, s7, s5
1426; GCN-NEXT:    s_lshl_b32 s5, s5, 12
1427; GCN-NEXT:    s_cmpk_lg_i32 s6, 0x4b
1428; GCN-NEXT:    s_cselect_b32 s7, s88, 1
1429; GCN-NEXT:    s_lshl_b32 s7, s7, 3
1430; GCN-NEXT:    s_cmpk_lg_i32 s6, 0x4a
1431; GCN-NEXT:    s_cselect_b32 s8, s87, 1
1432; GCN-NEXT:    s_and_b32 s8, s8, 1
1433; GCN-NEXT:    s_lshl_b32 s8, s8, 2
1434; GCN-NEXT:    s_or_b32 s7, s7, s8
1435; GCN-NEXT:    s_cmpk_lg_i32 s6, 0x49
1436; GCN-NEXT:    s_cselect_b32 s8, s85, 1
1437; GCN-NEXT:    s_lshl_b32 s8, s8, 1
1438; GCN-NEXT:    s_cmpk_lg_i32 s6, 0x48
1439; GCN-NEXT:    s_cselect_b32 s9, s83, 1
1440; GCN-NEXT:    s_and_b32 s9, s9, 1
1441; GCN-NEXT:    s_or_b32 s8, s9, s8
1442; GCN-NEXT:    s_and_b32 s8, s8, 3
1443; GCN-NEXT:    s_or_b32 s7, s8, s7
1444; GCN-NEXT:    s_and_b32 s7, s7, 15
1445; GCN-NEXT:    s_lshl_b32 s7, s7, 8
1446; GCN-NEXT:    s_or_b32 s5, s5, s7
1447; GCN-NEXT:    s_cmpk_lg_i32 s6, 0x47
1448; GCN-NEXT:    s_cselect_b32 s7, s80, 1
1449; GCN-NEXT:    s_lshl_b32 s7, s7, 3
1450; GCN-NEXT:    s_cmpk_lg_i32 s6, 0x46
1451; GCN-NEXT:    s_cselect_b32 s8, s79, 1
1452; GCN-NEXT:    s_and_b32 s8, s8, 1
1453; GCN-NEXT:    s_lshl_b32 s8, s8, 2
1454; GCN-NEXT:    s_or_b32 s7, s7, s8
1455; GCN-NEXT:    s_cmpk_lg_i32 s6, 0x45
1456; GCN-NEXT:    s_cselect_b32 s8, s76, 1
1457; GCN-NEXT:    s_lshl_b32 s8, s8, 1
1458; GCN-NEXT:    s_cmpk_lg_i32 s6, 0x44
1459; GCN-NEXT:    s_cselect_b32 s9, s75, 1
1460; GCN-NEXT:    s_and_b32 s9, s9, 1
1461; GCN-NEXT:    s_or_b32 s8, s9, s8
1462; GCN-NEXT:    s_and_b32 s8, s8, 3
1463; GCN-NEXT:    s_or_b32 s7, s8, s7
1464; GCN-NEXT:    s_lshl_b32 s7, s7, 4
1465; GCN-NEXT:    s_cmpk_lg_i32 s6, 0x43
1466; GCN-NEXT:    s_cselect_b32 s8, s72, 1
1467; GCN-NEXT:    s_lshl_b32 s8, s8, 3
1468; GCN-NEXT:    s_cmpk_lg_i32 s6, 0x42
1469; GCN-NEXT:    s_cselect_b32 s9, s71, 1
1470; GCN-NEXT:    s_and_b32 s9, s9, 1
1471; GCN-NEXT:    s_lshl_b32 s9, s9, 2
1472; GCN-NEXT:    s_or_b32 s8, s8, s9
1473; GCN-NEXT:    s_cmp_lg_u32 s6, 64
1474; GCN-NEXT:    s_cselect_b32 s2, s2, 1
1475; GCN-NEXT:    s_and_b32 s2, s2, 1
1476; GCN-NEXT:    s_cmpk_lg_i32 s6, 0x41
1477; GCN-NEXT:    s_cselect_b32 s9, s70, 1
1478; GCN-NEXT:    s_lshl_b32 s9, s9, 1
1479; GCN-NEXT:    s_or_b32 s2, s2, s9
1480; GCN-NEXT:    s_and_b32 s2, s2, 3
1481; GCN-NEXT:    s_or_b32 s2, s2, s8
1482; GCN-NEXT:    s_and_b32 s2, s2, 15
1483; GCN-NEXT:    s_or_b32 s2, s2, s7
1484; GCN-NEXT:    s_and_b32 s2, s2, 0xff
1485; GCN-NEXT:    s_or_b32 s2, s2, s5
1486; GCN-NEXT:    s_and_b32 s2, s2, 0xffff
1487; GCN-NEXT:    s_or_b32 s2, s2, s4
1488; GCN-NEXT:    s_cmp_lg_u32 s6, 63
1489; GCN-NEXT:    s_cselect_b32 s4, s69, 1
1490; GCN-NEXT:    s_lshl_b32 s4, s4, 3
1491; GCN-NEXT:    s_cmp_lg_u32 s6, 62
1492; GCN-NEXT:    s_cselect_b32 s5, s68, 1
1493; GCN-NEXT:    s_and_b32 s5, s5, 1
1494; GCN-NEXT:    s_lshl_b32 s5, s5, 2
1495; GCN-NEXT:    s_or_b32 s4, s4, s5
1496; GCN-NEXT:    s_cmp_lg_u32 s6, 61
1497; GCN-NEXT:    s_cselect_b32 s5, s67, 1
1498; GCN-NEXT:    s_lshl_b32 s5, s5, 1
1499; GCN-NEXT:    s_cmp_lg_u32 s6, 60
1500; GCN-NEXT:    s_cselect_b32 s7, s66, 1
1501; GCN-NEXT:    s_and_b32 s7, s7, 1
1502; GCN-NEXT:    s_or_b32 s5, s7, s5
1503; GCN-NEXT:    s_and_b32 s5, s5, 3
1504; GCN-NEXT:    s_or_b32 s4, s5, s4
1505; GCN-NEXT:    s_lshl_b32 s4, s4, 12
1506; GCN-NEXT:    s_cmp_lg_u32 s6, 59
1507; GCN-NEXT:    s_cselect_b32 s5, s63, 1
1508; GCN-NEXT:    s_lshl_b32 s5, s5, 3
1509; GCN-NEXT:    s_cmp_lg_u32 s6, 58
1510; GCN-NEXT:    s_cselect_b32 s7, s61, 1
1511; GCN-NEXT:    s_and_b32 s7, s7, 1
1512; GCN-NEXT:    s_lshl_b32 s7, s7, 2
1513; GCN-NEXT:    s_or_b32 s5, s5, s7
1514; GCN-NEXT:    s_cmp_lg_u32 s6, 57
1515; GCN-NEXT:    s_cselect_b32 s7, s59, 1
1516; GCN-NEXT:    s_lshl_b32 s7, s7, 1
1517; GCN-NEXT:    s_cmp_lg_u32 s6, 56
1518; GCN-NEXT:    s_cselect_b32 s8, s58, 1
1519; GCN-NEXT:    s_and_b32 s8, s8, 1
1520; GCN-NEXT:    s_or_b32 s7, s8, s7
1521; GCN-NEXT:    s_and_b32 s7, s7, 3
1522; GCN-NEXT:    s_or_b32 s5, s7, s5
1523; GCN-NEXT:    s_and_b32 s5, s5, 15
1524; GCN-NEXT:    s_lshl_b32 s5, s5, 8
1525; GCN-NEXT:    s_or_b32 s4, s4, s5
1526; GCN-NEXT:    s_cmp_lg_u32 s6, 55
1527; GCN-NEXT:    s_cselect_b32 s5, s55, 1
1528; GCN-NEXT:    s_lshl_b32 s5, s5, 3
1529; GCN-NEXT:    s_cmp_lg_u32 s6, 54
1530; GCN-NEXT:    s_cselect_b32 s7, s53, 1
1531; GCN-NEXT:    s_and_b32 s7, s7, 1
1532; GCN-NEXT:    s_lshl_b32 s7, s7, 2
1533; GCN-NEXT:    s_or_b32 s5, s5, s7
1534; GCN-NEXT:    s_cmp_lg_u32 s6, 53
1535; GCN-NEXT:    s_cselect_b32 s7, s51, 1
1536; GCN-NEXT:    s_lshl_b32 s7, s7, 1
1537; GCN-NEXT:    s_cmp_lg_u32 s6, 52
1538; GCN-NEXT:    s_cselect_b32 s8, s50, 1
1539; GCN-NEXT:    s_and_b32 s8, s8, 1
1540; GCN-NEXT:    s_or_b32 s7, s8, s7
1541; GCN-NEXT:    s_and_b32 s7, s7, 3
1542; GCN-NEXT:    s_or_b32 s5, s7, s5
1543; GCN-NEXT:    s_lshl_b32 s5, s5, 4
1544; GCN-NEXT:    s_cmp_lg_u32 s6, 51
1545; GCN-NEXT:    s_cselect_b32 s7, s47, 1
1546; GCN-NEXT:    s_lshl_b32 s7, s7, 3
1547; GCN-NEXT:    s_cmp_lg_u32 s6, 50
1548; GCN-NEXT:    s_cselect_b32 s8, s45, 1
1549; GCN-NEXT:    s_and_b32 s8, s8, 1
1550; GCN-NEXT:    s_lshl_b32 s8, s8, 2
1551; GCN-NEXT:    s_or_b32 s7, s7, s8
1552; GCN-NEXT:    s_cmp_lg_u32 s6, 49
1553; GCN-NEXT:    s_cselect_b32 s8, s43, 1
1554; GCN-NEXT:    s_lshl_b32 s8, s8, 1
1555; GCN-NEXT:    s_cmp_lg_u32 s6, 48
1556; GCN-NEXT:    s_cselect_b32 s9, s42, 1
1557; GCN-NEXT:    s_and_b32 s9, s9, 1
1558; GCN-NEXT:    s_or_b32 s8, s9, s8
1559; GCN-NEXT:    s_and_b32 s8, s8, 3
1560; GCN-NEXT:    s_or_b32 s7, s8, s7
1561; GCN-NEXT:    s_and_b32 s7, s7, 15
1562; GCN-NEXT:    s_or_b32 s5, s7, s5
1563; GCN-NEXT:    s_and_b32 s5, s5, 0xff
1564; GCN-NEXT:    s_or_b32 s4, s5, s4
1565; GCN-NEXT:    s_lshl_b32 s4, s4, 16
1566; GCN-NEXT:    s_cmp_lg_u32 s6, 47
1567; GCN-NEXT:    s_cselect_b32 s5, s65, 1
1568; GCN-NEXT:    s_lshl_b32 s5, s5, 3
1569; GCN-NEXT:    s_cmp_lg_u32 s6, 46
1570; GCN-NEXT:    s_cselect_b32 s7, s64, 1
1571; GCN-NEXT:    s_and_b32 s7, s7, 1
1572; GCN-NEXT:    s_lshl_b32 s7, s7, 2
1573; GCN-NEXT:    s_or_b32 s5, s5, s7
1574; GCN-NEXT:    s_cmp_lg_u32 s6, 45
1575; GCN-NEXT:    s_cselect_b32 s7, s62, 1
1576; GCN-NEXT:    s_lshl_b32 s7, s7, 1
1577; GCN-NEXT:    s_cmp_lg_u32 s6, 44
1578; GCN-NEXT:    s_cselect_b32 s8, s60, 1
1579; GCN-NEXT:    s_and_b32 s8, s8, 1
1580; GCN-NEXT:    s_or_b32 s7, s8, s7
1581; GCN-NEXT:    s_and_b32 s7, s7, 3
1582; GCN-NEXT:    s_or_b32 s5, s7, s5
1583; GCN-NEXT:    s_lshl_b32 s5, s5, 12
1584; GCN-NEXT:    s_cmp_lg_u32 s6, 43
1585; GCN-NEXT:    s_cselect_b32 s7, s57, 1
1586; GCN-NEXT:    s_lshl_b32 s7, s7, 3
1587; GCN-NEXT:    s_cmp_lg_u32 s6, 42
1588; GCN-NEXT:    s_cselect_b32 s8, s56, 1
1589; GCN-NEXT:    s_and_b32 s8, s8, 1
1590; GCN-NEXT:    s_lshl_b32 s8, s8, 2
1591; GCN-NEXT:    s_or_b32 s7, s7, s8
1592; GCN-NEXT:    s_cmp_lg_u32 s6, 41
1593; GCN-NEXT:    s_cselect_b32 s8, s54, 1
1594; GCN-NEXT:    s_lshl_b32 s8, s8, 1
1595; GCN-NEXT:    s_cmp_lg_u32 s6, 40
1596; GCN-NEXT:    s_cselect_b32 s9, s52, 1
1597; GCN-NEXT:    s_and_b32 s9, s9, 1
1598; GCN-NEXT:    s_or_b32 s8, s9, s8
1599; GCN-NEXT:    s_and_b32 s8, s8, 3
1600; GCN-NEXT:    s_or_b32 s7, s8, s7
1601; GCN-NEXT:    s_and_b32 s7, s7, 15
1602; GCN-NEXT:    s_lshl_b32 s7, s7, 8
1603; GCN-NEXT:    s_or_b32 s5, s5, s7
1604; GCN-NEXT:    s_cmp_lg_u32 s6, 39
1605; GCN-NEXT:    s_cselect_b32 s7, s49, 1
1606; GCN-NEXT:    s_lshl_b32 s7, s7, 3
1607; GCN-NEXT:    s_cmp_lg_u32 s6, 38
1608; GCN-NEXT:    s_cselect_b32 s8, s48, 1
1609; GCN-NEXT:    s_and_b32 s8, s8, 1
1610; GCN-NEXT:    s_lshl_b32 s8, s8, 2
1611; GCN-NEXT:    s_or_b32 s7, s7, s8
1612; GCN-NEXT:    s_cmp_lg_u32 s6, 37
1613; GCN-NEXT:    s_cselect_b32 s8, s46, 1
1614; GCN-NEXT:    s_lshl_b32 s8, s8, 1
1615; GCN-NEXT:    s_cmp_lg_u32 s6, 36
1616; GCN-NEXT:    s_cselect_b32 s9, s44, 1
1617; GCN-NEXT:    s_and_b32 s9, s9, 1
1618; GCN-NEXT:    s_or_b32 s8, s9, s8
1619; GCN-NEXT:    s_and_b32 s8, s8, 3
1620; GCN-NEXT:    s_or_b32 s7, s8, s7
1621; GCN-NEXT:    s_lshl_b32 s7, s7, 4
1622; GCN-NEXT:    s_cmp_lg_u32 s6, 35
1623; GCN-NEXT:    s_cselect_b32 s8, s41, 1
1624; GCN-NEXT:    s_lshl_b32 s8, s8, 3
1625; GCN-NEXT:    s_cmp_lg_u32 s6, 34
1626; GCN-NEXT:    s_cselect_b32 s9, s40, 1
1627; GCN-NEXT:    s_and_b32 s9, s9, 1
1628; GCN-NEXT:    s_lshl_b32 s9, s9, 2
1629; GCN-NEXT:    s_or_b32 s8, s8, s9
1630; GCN-NEXT:    s_cmp_lg_u32 s6, 32
1631; GCN-NEXT:    s_cselect_b32 s1, s1, 1
1632; GCN-NEXT:    s_and_b32 s1, s1, 1
1633; GCN-NEXT:    s_cmp_lg_u32 s6, 33
1634; GCN-NEXT:    v_readlane_b32 s9, v6, 33
1635; GCN-NEXT:    s_cselect_b32 s9, s9, 1
1636; GCN-NEXT:    s_lshl_b32 s9, s9, 1
1637; GCN-NEXT:    s_or_b32 s1, s1, s9
1638; GCN-NEXT:    s_and_b32 s1, s1, 3
1639; GCN-NEXT:    s_or_b32 s1, s1, s8
1640; GCN-NEXT:    s_and_b32 s1, s1, 15
1641; GCN-NEXT:    s_or_b32 s1, s1, s7
1642; GCN-NEXT:    s_and_b32 s1, s1, 0xff
1643; GCN-NEXT:    s_or_b32 s1, s1, s5
1644; GCN-NEXT:    s_and_b32 s1, s1, 0xffff
1645; GCN-NEXT:    s_or_b32 s1, s1, s4
1646; GCN-NEXT:    s_cmp_lg_u32 s6, 31
1647; GCN-NEXT:    v_readlane_b32 s4, v6, 17
1648; GCN-NEXT:    s_cselect_b32 s4, s4, 1
1649; GCN-NEXT:    s_lshl_b32 s4, s4, 3
1650; GCN-NEXT:    s_cmp_lg_u32 s6, 30
1651; GCN-NEXT:    v_readlane_b32 s5, v6, 16
1652; GCN-NEXT:    s_cselect_b32 s5, s5, 1
1653; GCN-NEXT:    s_and_b32 s5, s5, 1
1654; GCN-NEXT:    s_lshl_b32 s5, s5, 2
1655; GCN-NEXT:    s_or_b32 s4, s4, s5
1656; GCN-NEXT:    s_cmp_lg_u32 s6, 29
1657; GCN-NEXT:    v_readlane_b32 s5, v6, 15
1658; GCN-NEXT:    s_cselect_b32 s5, s5, 1
1659; GCN-NEXT:    s_lshl_b32 s5, s5, 1
1660; GCN-NEXT:    s_cmp_lg_u32 s6, 28
1661; GCN-NEXT:    v_readlane_b32 s7, v6, 14
1662; GCN-NEXT:    s_cselect_b32 s7, s7, 1
1663; GCN-NEXT:    s_and_b32 s7, s7, 1
1664; GCN-NEXT:    s_or_b32 s5, s7, s5
1665; GCN-NEXT:    s_and_b32 s5, s5, 3
1666; GCN-NEXT:    s_or_b32 s4, s5, s4
1667; GCN-NEXT:    s_lshl_b32 s4, s4, 12
1668; GCN-NEXT:    s_cmp_lg_u32 s6, 27
1669; GCN-NEXT:    v_readlane_b32 s5, v6, 13
1670; GCN-NEXT:    s_cselect_b32 s5, s5, 1
1671; GCN-NEXT:    s_lshl_b32 s5, s5, 3
1672; GCN-NEXT:    s_cmp_lg_u32 s6, 26
1673; GCN-NEXT:    v_readlane_b32 s7, v6, 12
1674; GCN-NEXT:    s_cselect_b32 s7, s7, 1
1675; GCN-NEXT:    s_and_b32 s7, s7, 1
1676; GCN-NEXT:    s_lshl_b32 s7, s7, 2
1677; GCN-NEXT:    s_or_b32 s5, s5, s7
1678; GCN-NEXT:    s_cmp_lg_u32 s6, 25
1679; GCN-NEXT:    v_readlane_b32 s7, v6, 11
1680; GCN-NEXT:    s_cselect_b32 s7, s7, 1
1681; GCN-NEXT:    s_lshl_b32 s7, s7, 1
1682; GCN-NEXT:    s_cmp_lg_u32 s6, 24
1683; GCN-NEXT:    v_readlane_b32 s8, v6, 10
1684; GCN-NEXT:    s_cselect_b32 s8, s8, 1
1685; GCN-NEXT:    s_and_b32 s8, s8, 1
1686; GCN-NEXT:    s_or_b32 s7, s8, s7
1687; GCN-NEXT:    s_and_b32 s7, s7, 3
1688; GCN-NEXT:    s_or_b32 s5, s7, s5
1689; GCN-NEXT:    s_and_b32 s5, s5, 15
1690; GCN-NEXT:    s_lshl_b32 s5, s5, 8
1691; GCN-NEXT:    s_or_b32 s4, s4, s5
1692; GCN-NEXT:    s_cmp_lg_u32 s6, 23
1693; GCN-NEXT:    v_readlane_b32 s5, v6, 9
1694; GCN-NEXT:    s_cselect_b32 s5, s5, 1
1695; GCN-NEXT:    s_lshl_b32 s5, s5, 3
1696; GCN-NEXT:    s_cmp_lg_u32 s6, 22
1697; GCN-NEXT:    v_readlane_b32 s7, v6, 8
1698; GCN-NEXT:    s_cselect_b32 s7, s7, 1
1699; GCN-NEXT:    s_and_b32 s7, s7, 1
1700; GCN-NEXT:    s_lshl_b32 s7, s7, 2
1701; GCN-NEXT:    s_or_b32 s5, s5, s7
1702; GCN-NEXT:    s_cmp_lg_u32 s6, 21
1703; GCN-NEXT:    v_readlane_b32 s7, v6, 7
1704; GCN-NEXT:    s_cselect_b32 s7, s7, 1
1705; GCN-NEXT:    s_lshl_b32 s7, s7, 1
1706; GCN-NEXT:    s_cmp_lg_u32 s6, 20
1707; GCN-NEXT:    v_readlane_b32 s8, v6, 6
1708; GCN-NEXT:    s_cselect_b32 s8, s8, 1
1709; GCN-NEXT:    s_and_b32 s8, s8, 1
1710; GCN-NEXT:    s_or_b32 s7, s8, s7
1711; GCN-NEXT:    s_and_b32 s7, s7, 3
1712; GCN-NEXT:    s_or_b32 s5, s7, s5
1713; GCN-NEXT:    s_lshl_b32 s5, s5, 4
1714; GCN-NEXT:    s_cmp_lg_u32 s6, 19
1715; GCN-NEXT:    v_readlane_b32 s7, v6, 5
1716; GCN-NEXT:    s_cselect_b32 s7, s7, 1
1717; GCN-NEXT:    s_lshl_b32 s7, s7, 3
1718; GCN-NEXT:    s_cmp_lg_u32 s6, 18
1719; GCN-NEXT:    v_readlane_b32 s8, v6, 4
1720; GCN-NEXT:    s_cselect_b32 s8, s8, 1
1721; GCN-NEXT:    s_and_b32 s8, s8, 1
1722; GCN-NEXT:    s_lshl_b32 s8, s8, 2
1723; GCN-NEXT:    s_or_b32 s7, s7, s8
1724; GCN-NEXT:    s_cmp_lg_u32 s6, 17
1725; GCN-NEXT:    v_readlane_b32 s8, v6, 3
1726; GCN-NEXT:    s_cselect_b32 s8, s8, 1
1727; GCN-NEXT:    s_lshl_b32 s8, s8, 1
1728; GCN-NEXT:    s_cmp_lg_u32 s6, 16
1729; GCN-NEXT:    v_readlane_b32 s9, v6, 2
1730; GCN-NEXT:    s_cselect_b32 s9, s9, 1
1731; GCN-NEXT:    s_and_b32 s9, s9, 1
1732; GCN-NEXT:    s_or_b32 s8, s9, s8
1733; GCN-NEXT:    s_and_b32 s8, s8, 3
1734; GCN-NEXT:    s_or_b32 s7, s8, s7
1735; GCN-NEXT:    s_and_b32 s7, s7, 15
1736; GCN-NEXT:    s_or_b32 s5, s7, s5
1737; GCN-NEXT:    s_and_b32 s5, s5, 0xff
1738; GCN-NEXT:    s_or_b32 s4, s5, s4
1739; GCN-NEXT:    s_lshl_b32 s4, s4, 16
1740; GCN-NEXT:    s_cmp_lg_u32 s6, 15
1741; GCN-NEXT:    v_readlane_b32 s5, v6, 32
1742; GCN-NEXT:    s_cselect_b32 s5, s5, 1
1743; GCN-NEXT:    s_lshl_b32 s5, s5, 3
1744; GCN-NEXT:    s_cmp_lg_u32 s6, 14
1745; GCN-NEXT:    v_readlane_b32 s7, v6, 31
1746; GCN-NEXT:    s_cselect_b32 s7, s7, 1
1747; GCN-NEXT:    s_and_b32 s7, s7, 1
1748; GCN-NEXT:    s_lshl_b32 s7, s7, 2
1749; GCN-NEXT:    s_or_b32 s5, s5, s7
1750; GCN-NEXT:    s_cmp_lg_u32 s6, 13
1751; GCN-NEXT:    v_readlane_b32 s7, v6, 30
1752; GCN-NEXT:    s_cselect_b32 s7, s7, 1
1753; GCN-NEXT:    s_lshl_b32 s7, s7, 1
1754; GCN-NEXT:    s_cmp_lg_u32 s6, 12
1755; GCN-NEXT:    v_readlane_b32 s8, v6, 29
1756; GCN-NEXT:    s_cselect_b32 s8, s8, 1
1757; GCN-NEXT:    s_and_b32 s8, s8, 1
1758; GCN-NEXT:    s_or_b32 s7, s8, s7
1759; GCN-NEXT:    s_and_b32 s7, s7, 3
1760; GCN-NEXT:    s_or_b32 s5, s7, s5
1761; GCN-NEXT:    s_lshl_b32 s5, s5, 12
1762; GCN-NEXT:    s_cmp_lg_u32 s6, 11
1763; GCN-NEXT:    v_readlane_b32 s7, v6, 28
1764; GCN-NEXT:    s_cselect_b32 s7, s7, 1
1765; GCN-NEXT:    s_lshl_b32 s7, s7, 3
1766; GCN-NEXT:    s_cmp_lg_u32 s6, 10
1767; GCN-NEXT:    v_readlane_b32 s8, v6, 27
1768; GCN-NEXT:    s_cselect_b32 s8, s8, 1
1769; GCN-NEXT:    s_and_b32 s8, s8, 1
1770; GCN-NEXT:    s_lshl_b32 s8, s8, 2
1771; GCN-NEXT:    s_or_b32 s7, s7, s8
1772; GCN-NEXT:    s_cmp_lg_u32 s6, 9
1773; GCN-NEXT:    v_readlane_b32 s8, v6, 26
1774; GCN-NEXT:    s_cselect_b32 s8, s8, 1
1775; GCN-NEXT:    s_lshl_b32 s8, s8, 1
1776; GCN-NEXT:    s_cmp_lg_u32 s6, 8
1777; GCN-NEXT:    v_readlane_b32 s9, v6, 25
1778; GCN-NEXT:    s_cselect_b32 s9, s9, 1
1779; GCN-NEXT:    s_and_b32 s9, s9, 1
1780; GCN-NEXT:    s_or_b32 s8, s9, s8
1781; GCN-NEXT:    s_and_b32 s8, s8, 3
1782; GCN-NEXT:    s_or_b32 s7, s8, s7
1783; GCN-NEXT:    s_and_b32 s7, s7, 15
1784; GCN-NEXT:    s_lshl_b32 s7, s7, 8
1785; GCN-NEXT:    s_or_b32 s5, s5, s7
1786; GCN-NEXT:    s_cmp_lg_u32 s6, 7
1787; GCN-NEXT:    v_readlane_b32 s7, v6, 24
1788; GCN-NEXT:    s_cselect_b32 s7, s7, 1
1789; GCN-NEXT:    s_lshl_b32 s7, s7, 3
1790; GCN-NEXT:    s_cmp_lg_u32 s6, 6
1791; GCN-NEXT:    v_readlane_b32 s8, v6, 23
1792; GCN-NEXT:    s_cselect_b32 s8, s8, 1
1793; GCN-NEXT:    s_and_b32 s8, s8, 1
1794; GCN-NEXT:    s_lshl_b32 s8, s8, 2
1795; GCN-NEXT:    s_or_b32 s7, s7, s8
1796; GCN-NEXT:    s_cmp_lg_u32 s6, 5
1797; GCN-NEXT:    v_readlane_b32 s8, v6, 22
1798; GCN-NEXT:    s_cselect_b32 s8, s8, 1
1799; GCN-NEXT:    s_lshl_b32 s8, s8, 1
1800; GCN-NEXT:    s_cmp_lg_u32 s6, 4
1801; GCN-NEXT:    v_readlane_b32 s9, v6, 21
1802; GCN-NEXT:    s_cselect_b32 s9, s9, 1
1803; GCN-NEXT:    s_and_b32 s9, s9, 1
1804; GCN-NEXT:    s_or_b32 s8, s9, s8
1805; GCN-NEXT:    s_and_b32 s8, s8, 3
1806; GCN-NEXT:    s_or_b32 s7, s8, s7
1807; GCN-NEXT:    s_lshl_b32 s7, s7, 4
1808; GCN-NEXT:    s_cmp_lg_u32 s6, 3
1809; GCN-NEXT:    v_readlane_b32 s8, v6, 20
1810; GCN-NEXT:    s_cselect_b32 s8, s8, 1
1811; GCN-NEXT:    s_lshl_b32 s8, s8, 3
1812; GCN-NEXT:    s_cmp_lg_u32 s6, 2
1813; GCN-NEXT:    v_readlane_b32 s9, v6, 19
1814; GCN-NEXT:    s_cselect_b32 s9, s9, 1
1815; GCN-NEXT:    s_and_b32 s9, s9, 1
1816; GCN-NEXT:    s_lshl_b32 s9, s9, 2
1817; GCN-NEXT:    s_or_b32 s8, s8, s9
1818; GCN-NEXT:    s_cmp_lg_u32 s6, 0
1819; GCN-NEXT:    s_cselect_b32 s0, s0, 1
1820; GCN-NEXT:    s_and_b32 s0, s0, 1
1821; GCN-NEXT:    s_cmp_lg_u32 s6, 1
1822; GCN-NEXT:    v_readlane_b32 s6, v6, 18
1823; GCN-NEXT:    s_cselect_b32 s6, s6, 1
1824; GCN-NEXT:    s_lshl_b32 s6, s6, 1
1825; GCN-NEXT:    s_or_b32 s0, s0, s6
1826; GCN-NEXT:    s_and_b32 s0, s0, 3
1827; GCN-NEXT:    s_or_b32 s0, s0, s8
1828; GCN-NEXT:    s_and_b32 s0, s0, 15
1829; GCN-NEXT:    s_or_b32 s0, s0, s7
1830; GCN-NEXT:    s_and_b32 s0, s0, 0xff
1831; GCN-NEXT:    s_or_b32 s0, s0, s5
1832; GCN-NEXT:    s_and_b32 s0, s0, 0xffff
1833; GCN-NEXT:    s_or_b32 s0, s0, s4
1834; GCN-NEXT:    v_mov_b32_e32 v0, s0
1835; GCN-NEXT:    v_mov_b32_e32 v1, s1
1836; GCN-NEXT:    v_readlane_b32 s0, v6, 0
1837; GCN-NEXT:    v_readlane_b32 s1, v6, 1
1838; GCN-NEXT:    v_mov_b32_e32 v5, s1
1839; GCN-NEXT:    v_mov_b32_e32 v2, s2
1840; GCN-NEXT:    v_mov_b32_e32 v3, s3
1841; GCN-NEXT:    v_mov_b32_e32 v4, s0
1842; GCN-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
1843; GCN-NEXT:    s_endpgm
1844entry:
1845  %v = insertelement <128 x i1> %vec, i1 1, i32 %sel
1846  store <128 x i1> %v, ptr addrspace(1) %out
1847  ret void
1848}
1849
; float32_inselt_vec: insert the constant 1.0 into a <32 x float> at the
; run-time index %sel and return the updated vector. The function uses the
; amdgpu_ps calling convention and is compiled for -mcpu=fiji (see the RUN
; line at the top of this file).
;
; Shape of the expected code, as pinned by the GCN lines below:
;   * one v_cmp_ne_u32 per element index 0..31, each comparing the index
;     against v32; the results are kept in vcc and in SGPR pairs;
;   * one v_cndmask_b32 per element, each choosing between the inline
;     constant 1.0 and the incoming register v<i> under the matching mask.
; No indexed register write appears in the checked sequence.
; NOTE(review): v32 presumably carries %sel and v0..v31 the elements of
; %vec - confirm against the amdgpu_ps argument assignment.
;
; The GCN lines are autogenerated (see the NOTE on the first line of this
; file); regenerate them with utils/update_llc_test_checks.py instead of
; editing them by hand.
1850define amdgpu_ps <32 x float> @float32_inselt_vec(<32 x float> %vec, i32 %sel) {
1851; GCN-LABEL: float32_inselt_vec:
1852; GCN:       ; %bb.0: ; %entry
1853; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 1, v32
1854; GCN-NEXT:    v_cmp_ne_u32_e64 s[0:1], 2, v32
1855; GCN-NEXT:    v_cmp_ne_u32_e64 s[2:3], 3, v32
1856; GCN-NEXT:    v_cmp_ne_u32_e64 s[4:5], 4, v32
1857; GCN-NEXT:    v_cmp_ne_u32_e64 s[6:7], 5, v32
1858; GCN-NEXT:    v_cmp_ne_u32_e64 s[8:9], 6, v32
1859; GCN-NEXT:    v_cmp_ne_u32_e64 s[10:11], 7, v32
1860; GCN-NEXT:    v_cmp_ne_u32_e64 s[12:13], 8, v32
1861; GCN-NEXT:    v_cmp_ne_u32_e64 s[14:15], 9, v32
1862; GCN-NEXT:    v_cmp_ne_u32_e64 s[16:17], 10, v32
1863; GCN-NEXT:    v_cmp_ne_u32_e64 s[18:19], 11, v32
1864; GCN-NEXT:    v_cmp_ne_u32_e64 s[20:21], 12, v32
1865; GCN-NEXT:    v_cmp_ne_u32_e64 s[22:23], 13, v32
1866; GCN-NEXT:    v_cmp_ne_u32_e64 s[24:25], 14, v32
1867; GCN-NEXT:    v_cmp_ne_u32_e64 s[26:27], 15, v32
1868; GCN-NEXT:    v_cmp_ne_u32_e64 s[28:29], 16, v32
1869; GCN-NEXT:    v_cmp_ne_u32_e64 s[30:31], 17, v32
1870; GCN-NEXT:    v_cmp_ne_u32_e64 s[34:35], 18, v32
1871; GCN-NEXT:    v_cmp_ne_u32_e64 s[36:37], 19, v32
1872; GCN-NEXT:    v_cmp_ne_u32_e64 s[38:39], 20, v32
1873; GCN-NEXT:    v_cmp_ne_u32_e64 s[40:41], 21, v32
1874; GCN-NEXT:    v_cmp_ne_u32_e64 s[42:43], 22, v32
1875; GCN-NEXT:    v_cmp_ne_u32_e64 s[44:45], 23, v32
1876; GCN-NEXT:    v_cmp_ne_u32_e64 s[46:47], 24, v32
1877; GCN-NEXT:    v_cmp_ne_u32_e64 s[48:49], 25, v32
1878; GCN-NEXT:    v_cmp_ne_u32_e64 s[50:51], 26, v32
1879; GCN-NEXT:    v_cmp_ne_u32_e64 s[52:53], 27, v32
1880; GCN-NEXT:    v_cmp_ne_u32_e64 s[54:55], 28, v32
1881; GCN-NEXT:    v_cmp_ne_u32_e64 s[56:57], 29, v32
1882; GCN-NEXT:    v_cmp_ne_u32_e64 s[58:59], 30, v32
1883; GCN-NEXT:    v_cmp_ne_u32_e64 s[60:61], 31, v32
1884; GCN-NEXT:    v_cmp_ne_u32_e64 s[62:63], 0, v32
1885; GCN-NEXT:    v_cndmask_b32_e64 v0, 1.0, v0, s[62:63]
1886; GCN-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
1887; GCN-NEXT:    v_cndmask_b32_e64 v2, 1.0, v2, s[0:1]
1888; GCN-NEXT:    v_cndmask_b32_e64 v3, 1.0, v3, s[2:3]
1889; GCN-NEXT:    v_cndmask_b32_e64 v4, 1.0, v4, s[4:5]
1890; GCN-NEXT:    v_cndmask_b32_e64 v5, 1.0, v5, s[6:7]
1891; GCN-NEXT:    v_cndmask_b32_e64 v6, 1.0, v6, s[8:9]
1892; GCN-NEXT:    v_cndmask_b32_e64 v7, 1.0, v7, s[10:11]
1893; GCN-NEXT:    v_cndmask_b32_e64 v8, 1.0, v8, s[12:13]
1894; GCN-NEXT:    v_cndmask_b32_e64 v9, 1.0, v9, s[14:15]
1895; GCN-NEXT:    v_cndmask_b32_e64 v10, 1.0, v10, s[16:17]
1896; GCN-NEXT:    v_cndmask_b32_e64 v11, 1.0, v11, s[18:19]
1897; GCN-NEXT:    v_cndmask_b32_e64 v12, 1.0, v12, s[20:21]
1898; GCN-NEXT:    v_cndmask_b32_e64 v13, 1.0, v13, s[22:23]
1899; GCN-NEXT:    v_cndmask_b32_e64 v14, 1.0, v14, s[24:25]
1900; GCN-NEXT:    v_cndmask_b32_e64 v15, 1.0, v15, s[26:27]
1901; GCN-NEXT:    v_cndmask_b32_e64 v16, 1.0, v16, s[28:29]
1902; GCN-NEXT:    v_cndmask_b32_e64 v17, 1.0, v17, s[30:31]
1903; GCN-NEXT:    v_cndmask_b32_e64 v18, 1.0, v18, s[34:35]
1904; GCN-NEXT:    v_cndmask_b32_e64 v19, 1.0, v19, s[36:37]
1905; GCN-NEXT:    v_cndmask_b32_e64 v20, 1.0, v20, s[38:39]
1906; GCN-NEXT:    v_cndmask_b32_e64 v21, 1.0, v21, s[40:41]
1907; GCN-NEXT:    v_cndmask_b32_e64 v22, 1.0, v22, s[42:43]
1908; GCN-NEXT:    v_cndmask_b32_e64 v23, 1.0, v23, s[44:45]
1909; GCN-NEXT:    v_cndmask_b32_e64 v24, 1.0, v24, s[46:47]
1910; GCN-NEXT:    v_cndmask_b32_e64 v25, 1.0, v25, s[48:49]
1911; GCN-NEXT:    v_cndmask_b32_e64 v26, 1.0, v26, s[50:51]
1912; GCN-NEXT:    v_cndmask_b32_e64 v27, 1.0, v27, s[52:53]
1913; GCN-NEXT:    v_cndmask_b32_e64 v28, 1.0, v28, s[54:55]
1914; GCN-NEXT:    v_cndmask_b32_e64 v29, 1.0, v29, s[56:57]
1915; GCN-NEXT:    v_cndmask_b32_e64 v30, 1.0, v30, s[58:59]
1916; GCN-NEXT:    v_cndmask_b32_e64 v31, 1.0, v31, s[60:61]
1917; GCN-NEXT:    ; return to shader part epilog
1918entry:
; %sel is a function argument, so the insert position is not a constant.
1919  %v = insertelement <32 x float> %vec, float 1.000000e+00, i32 %sel
1920  ret <32 x float> %v
1921}
1922
; double8_inselt_vec: insert the constant 1.0 into an <8 x double> at the
; run-time index %sel and return the updated vector. The define carries no
; amdgpu_* calling-convention keyword, and the checked code ends with
; s_setpc_b64 s[30:31].
;
; Shape of the expected code, as pinned by the GCN lines below:
;   * v17 is loaded once with 0x3ff00000; together with a low dword of 0
;     this is the IEEE-754 binary64 encoding of 1.0 (0x3ff0000000000000);
;   * for each element index 0..7, a v_cmp_eq_u32 of the index against v16
;     sets vcc, then two v_cndmask_b32 update that element's register pair:
;     the even register selects between its old value and 0, the odd
;     register between its old value and v17.
; NOTE(review): v16 presumably carries %sel and v[2i:2i+1] holds element i
; with the low dword in the even register - confirm against the calling
; convention.
;
; The GCN lines are autogenerated (see the NOTE on the first line of this
; file); regenerate them with utils/update_llc_test_checks.py instead of
; editing them by hand.
1923define <8 x double> @double8_inselt_vec(<8 x double> %vec, i32 %sel) {
1924; GCN-LABEL: double8_inselt_vec:
1925; GCN:       ; %bb.0: ; %entry
1926; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1927; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v16
1928; GCN-NEXT:    v_mov_b32_e32 v17, 0x3ff00000
1929; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
1930; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v17, vcc
1931; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v16
1932; GCN-NEXT:    v_cndmask_b32_e64 v2, v2, 0, vcc
1933; GCN-NEXT:    v_cndmask_b32_e32 v3, v3, v17, vcc
1934; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 2, v16
1935; GCN-NEXT:    v_cndmask_b32_e64 v4, v4, 0, vcc
1936; GCN-NEXT:    v_cndmask_b32_e32 v5, v5, v17, vcc
1937; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v16
1938; GCN-NEXT:    v_cndmask_b32_e64 v6, v6, 0, vcc
1939; GCN-NEXT:    v_cndmask_b32_e32 v7, v7, v17, vcc
1940; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 4, v16
1941; GCN-NEXT:    v_cndmask_b32_e64 v8, v8, 0, vcc
1942; GCN-NEXT:    v_cndmask_b32_e32 v9, v9, v17, vcc
1943; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 5, v16
1944; GCN-NEXT:    v_cndmask_b32_e64 v10, v10, 0, vcc
1945; GCN-NEXT:    v_cndmask_b32_e32 v11, v11, v17, vcc
1946; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 6, v16
1947; GCN-NEXT:    v_cndmask_b32_e64 v12, v12, 0, vcc
1948; GCN-NEXT:    v_cndmask_b32_e32 v13, v13, v17, vcc
1949; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 7, v16
1950; GCN-NEXT:    v_cndmask_b32_e64 v14, v14, 0, vcc
1951; GCN-NEXT:    v_cndmask_b32_e32 v15, v15, v17, vcc
1952; GCN-NEXT:    s_setpc_b64 s[30:31]
1953entry:
; %sel is a function argument, so the insert position is not a constant.
1954  %v = insertelement <8 x double> %vec, double 1.000000e+00, i32 %sel
1955  ret <8 x double> %v
1956}
1957