xref: /llvm-project/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll (revision 5a3299a684d7d8c40f48d732e5b80a8bd29aa882)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri -mattr=-flat-for-global,+max-private-element-size-16 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI %s
3; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=tonga -mattr=-flat-for-global -mattr=+max-private-element-size-16 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s
4
5; FIXME: Broken on evergreen
6; FIXME: For some reason the 8- and 16-element vectors are being stored as
7; individual elements instead of with 128-bit stores.
8
; Insert the constant 5.0 at index 0 of a <2 x float> kernel argument and
; store the result. The expected code writes the immediate 0x40a00000 straight
; into v0, copies the untouched element from s3, and emits one dwordx2 store.
9define amdgpu_kernel void @insertelement_v2f32_0(ptr addrspace(1) %out, <2 x float> %a) nounwind {
10; SI-LABEL: insertelement_v2f32_0:
11; SI:       ; %bb.0:
12; SI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
13; SI-NEXT:    s_mov_b32 s7, 0x100f000
14; SI-NEXT:    s_mov_b32 s6, -1
15; SI-NEXT:    v_mov_b32_e32 v0, 0x40a00000
16; SI-NEXT:    s_waitcnt lgkmcnt(0)
17; SI-NEXT:    s_mov_b32 s4, s0
18; SI-NEXT:    s_mov_b32 s5, s1
19; SI-NEXT:    v_mov_b32_e32 v1, s3
20; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
21; SI-NEXT:    s_endpgm
22;
23; VI-LABEL: insertelement_v2f32_0:
24; VI:       ; %bb.0:
25; VI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
26; VI-NEXT:    s_mov_b32 s7, 0x1100f000
27; VI-NEXT:    s_mov_b32 s6, -1
28; VI-NEXT:    v_mov_b32_e32 v0, 0x40a00000
29; VI-NEXT:    s_waitcnt lgkmcnt(0)
30; VI-NEXT:    s_mov_b32 s4, s0
31; VI-NEXT:    s_mov_b32 s5, s1
32; VI-NEXT:    v_mov_b32_e32 v1, s3
33; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
34; VI-NEXT:    s_endpgm
35  %vecins = insertelement <2 x float> %a, float 5.000000e+00, i32 0
36  store <2 x float> %vecins, ptr addrspace(1) %out, align 16
37  ret void
38}
39
; Insert the constant 5.0 at index 1 of a <2 x float> kernel argument and
; store the result. The expected code writes the immediate 0x40a00000 into v1,
; copies the untouched element from s2 into v0, and emits one dwordx2 store.
40define amdgpu_kernel void @insertelement_v2f32_1(ptr addrspace(1) %out, <2 x float> %a) nounwind {
41; SI-LABEL: insertelement_v2f32_1:
42; SI:       ; %bb.0:
43; SI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
44; SI-NEXT:    s_mov_b32 s7, 0x100f000
45; SI-NEXT:    s_mov_b32 s6, -1
46; SI-NEXT:    v_mov_b32_e32 v1, 0x40a00000
47; SI-NEXT:    s_waitcnt lgkmcnt(0)
48; SI-NEXT:    s_mov_b32 s4, s0
49; SI-NEXT:    s_mov_b32 s5, s1
50; SI-NEXT:    v_mov_b32_e32 v0, s2
51; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
52; SI-NEXT:    s_endpgm
53;
54; VI-LABEL: insertelement_v2f32_1:
55; VI:       ; %bb.0:
56; VI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
57; VI-NEXT:    s_mov_b32 s7, 0x1100f000
58; VI-NEXT:    s_mov_b32 s6, -1
59; VI-NEXT:    v_mov_b32_e32 v1, 0x40a00000
60; VI-NEXT:    s_waitcnt lgkmcnt(0)
61; VI-NEXT:    s_mov_b32 s4, s0
62; VI-NEXT:    s_mov_b32 s5, s1
63; VI-NEXT:    v_mov_b32_e32 v0, s2
64; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
65; VI-NEXT:    s_endpgm
66  %vecins = insertelement <2 x float> %a, float 5.000000e+00, i32 1
67  store <2 x float> %vecins, ptr addrspace(1) %out, align 16
68  ret void
69}
70
; Insert the constant 999 (0x3e7) at index 0 of a <2 x i32> kernel argument
; and store the result. The expected code writes the immediate into v0, copies
; the untouched element from s3 into v1, and emits one dwordx2 store.
71define amdgpu_kernel void @insertelement_v2i32_0(ptr addrspace(1) %out, <2 x i32> %a) nounwind {
72; SI-LABEL: insertelement_v2i32_0:
73; SI:       ; %bb.0:
74; SI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
75; SI-NEXT:    s_mov_b32 s7, 0x100f000
76; SI-NEXT:    s_mov_b32 s6, -1
77; SI-NEXT:    v_mov_b32_e32 v0, 0x3e7
78; SI-NEXT:    s_waitcnt lgkmcnt(0)
79; SI-NEXT:    s_mov_b32 s4, s0
80; SI-NEXT:    s_mov_b32 s5, s1
81; SI-NEXT:    v_mov_b32_e32 v1, s3
82; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
83; SI-NEXT:    s_endpgm
84;
85; VI-LABEL: insertelement_v2i32_0:
86; VI:       ; %bb.0:
87; VI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
88; VI-NEXT:    s_mov_b32 s7, 0x1100f000
89; VI-NEXT:    s_mov_b32 s6, -1
90; VI-NEXT:    v_mov_b32_e32 v0, 0x3e7
91; VI-NEXT:    s_waitcnt lgkmcnt(0)
92; VI-NEXT:    s_mov_b32 s4, s0
93; VI-NEXT:    s_mov_b32 s5, s1
94; VI-NEXT:    v_mov_b32_e32 v1, s3
95; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
96; VI-NEXT:    s_endpgm
97  %vecins = insertelement <2 x i32> %a, i32 999, i32 0
98  store <2 x i32> %vecins, ptr addrspace(1) %out, align 16
99  ret void
100}
101
; Insert the constant 999 (0x3e7) at index 1 of a <2 x i32> kernel argument
; and store the result. The expected code writes the immediate into v1, copies
; the untouched element from s2 into v0, and emits one dwordx2 store.
102define amdgpu_kernel void @insertelement_v2i32_1(ptr addrspace(1) %out, <2 x i32> %a) nounwind {
103; SI-LABEL: insertelement_v2i32_1:
104; SI:       ; %bb.0:
105; SI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
106; SI-NEXT:    s_mov_b32 s7, 0x100f000
107; SI-NEXT:    s_mov_b32 s6, -1
108; SI-NEXT:    v_mov_b32_e32 v1, 0x3e7
109; SI-NEXT:    s_waitcnt lgkmcnt(0)
110; SI-NEXT:    s_mov_b32 s4, s0
111; SI-NEXT:    s_mov_b32 s5, s1
112; SI-NEXT:    v_mov_b32_e32 v0, s2
113; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
114; SI-NEXT:    s_endpgm
115;
116; VI-LABEL: insertelement_v2i32_1:
117; VI:       ; %bb.0:
118; VI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
119; VI-NEXT:    s_mov_b32 s7, 0x1100f000
120; VI-NEXT:    s_mov_b32 s6, -1
121; VI-NEXT:    v_mov_b32_e32 v1, 0x3e7
122; VI-NEXT:    s_waitcnt lgkmcnt(0)
123; VI-NEXT:    s_mov_b32 s4, s0
124; VI-NEXT:    s_mov_b32 s5, s1
125; VI-NEXT:    v_mov_b32_e32 v0, s2
126; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
127; VI-NEXT:    s_endpgm
128  %vecins = insertelement <2 x i32> %a, i32 999, i32 1
129  store <2 x i32> %vecins, ptr addrspace(1) %out, align 16
130  ret void
131}
132
133; FIXME: Why is the constant moved into the intermediate register and
134; not just directly into the vector component?
; Insert the constant 5.0 at index 0 of a <4 x float> kernel argument and
; store the result. The expected code overwrites s0 with 0x40a00000 after the
; argument load, then copies s0-s3 into v0-v3 for a single dwordx4 store.
135define amdgpu_kernel void @insertelement_v4f32_0(ptr addrspace(1) %out, <4 x float> %a) nounwind {
136; SI-LABEL: insertelement_v4f32_0:
137; SI:       ; %bb.0:
138; SI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x4
139; SI-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
140; SI-NEXT:    s_waitcnt lgkmcnt(0)
141; SI-NEXT:    s_mov_b32 s0, 0x40a00000
142; SI-NEXT:    s_mov_b32 s7, 0x100f000
143; SI-NEXT:    s_mov_b32 s6, -1
144; SI-NEXT:    v_mov_b32_e32 v0, s0
145; SI-NEXT:    v_mov_b32_e32 v1, s1
146; SI-NEXT:    v_mov_b32_e32 v2, s2
147; SI-NEXT:    v_mov_b32_e32 v3, s3
148; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
149; SI-NEXT:    s_endpgm
150;
151; VI-LABEL: insertelement_v4f32_0:
152; VI:       ; %bb.0:
153; VI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x10
154; VI-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
155; VI-NEXT:    s_waitcnt lgkmcnt(0)
156; VI-NEXT:    s_mov_b32 s0, 0x40a00000
157; VI-NEXT:    s_mov_b32 s7, 0x1100f000
158; VI-NEXT:    s_mov_b32 s6, -1
159; VI-NEXT:    v_mov_b32_e32 v0, s0
160; VI-NEXT:    v_mov_b32_e32 v1, s1
161; VI-NEXT:    v_mov_b32_e32 v2, s2
162; VI-NEXT:    v_mov_b32_e32 v3, s3
163; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
164; VI-NEXT:    s_endpgm
165  %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 0
166  store <4 x float> %vecins, ptr addrspace(1) %out, align 16
167  ret void
168}
169
; Insert the constant 5.0 at index 1 of a <4 x float> kernel argument and
; store the result. The expected code overwrites s1 with 0x40a00000 after the
; argument load, then copies s0-s3 into v0-v3 for a single dwordx4 store.
170define amdgpu_kernel void @insertelement_v4f32_1(ptr addrspace(1) %out, <4 x float> %a) nounwind {
171; SI-LABEL: insertelement_v4f32_1:
172; SI:       ; %bb.0:
173; SI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x4
174; SI-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
175; SI-NEXT:    s_waitcnt lgkmcnt(0)
176; SI-NEXT:    s_mov_b32 s1, 0x40a00000
177; SI-NEXT:    s_mov_b32 s7, 0x100f000
178; SI-NEXT:    s_mov_b32 s6, -1
179; SI-NEXT:    v_mov_b32_e32 v0, s0
180; SI-NEXT:    v_mov_b32_e32 v1, s1
181; SI-NEXT:    v_mov_b32_e32 v2, s2
182; SI-NEXT:    v_mov_b32_e32 v3, s3
183; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
184; SI-NEXT:    s_endpgm
185;
186; VI-LABEL: insertelement_v4f32_1:
187; VI:       ; %bb.0:
188; VI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x10
189; VI-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
190; VI-NEXT:    s_waitcnt lgkmcnt(0)
191; VI-NEXT:    s_mov_b32 s1, 0x40a00000
192; VI-NEXT:    s_mov_b32 s7, 0x1100f000
193; VI-NEXT:    s_mov_b32 s6, -1
194; VI-NEXT:    v_mov_b32_e32 v0, s0
195; VI-NEXT:    v_mov_b32_e32 v1, s1
196; VI-NEXT:    v_mov_b32_e32 v2, s2
197; VI-NEXT:    v_mov_b32_e32 v3, s3
198; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
199; VI-NEXT:    s_endpgm
200  %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 1
201  store <4 x float> %vecins, ptr addrspace(1) %out, align 16
202  ret void
203}
204
; Insert the constant 5.0 at index 2 of a <4 x float> kernel argument and
; store the result. The expected code overwrites s2 with 0x40a00000 after the
; argument load, then copies s0-s3 into v0-v3 for a single dwordx4 store.
205define amdgpu_kernel void @insertelement_v4f32_2(ptr addrspace(1) %out, <4 x float> %a) nounwind {
206; SI-LABEL: insertelement_v4f32_2:
207; SI:       ; %bb.0:
208; SI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x4
209; SI-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
210; SI-NEXT:    s_waitcnt lgkmcnt(0)
211; SI-NEXT:    s_mov_b32 s2, 0x40a00000
212; SI-NEXT:    s_mov_b32 s7, 0x100f000
213; SI-NEXT:    s_mov_b32 s6, -1
214; SI-NEXT:    v_mov_b32_e32 v0, s0
215; SI-NEXT:    v_mov_b32_e32 v1, s1
216; SI-NEXT:    v_mov_b32_e32 v2, s2
217; SI-NEXT:    v_mov_b32_e32 v3, s3
218; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
219; SI-NEXT:    s_endpgm
220;
221; VI-LABEL: insertelement_v4f32_2:
222; VI:       ; %bb.0:
223; VI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x10
224; VI-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
225; VI-NEXT:    s_waitcnt lgkmcnt(0)
226; VI-NEXT:    s_mov_b32 s2, 0x40a00000
227; VI-NEXT:    s_mov_b32 s7, 0x1100f000
228; VI-NEXT:    s_mov_b32 s6, -1
229; VI-NEXT:    v_mov_b32_e32 v0, s0
230; VI-NEXT:    v_mov_b32_e32 v1, s1
231; VI-NEXT:    v_mov_b32_e32 v2, s2
232; VI-NEXT:    v_mov_b32_e32 v3, s3
233; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
234; VI-NEXT:    s_endpgm
235  %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 2
236  store <4 x float> %vecins, ptr addrspace(1) %out, align 16
237  ret void
238}
239
; Insert the constant 5.0 at index 3 of a <4 x float> kernel argument and
; store the result. The expected code overwrites s3 with 0x40a00000 after the
; argument load, then copies s0-s3 into v0-v3 for a single dwordx4 store.
240define amdgpu_kernel void @insertelement_v4f32_3(ptr addrspace(1) %out, <4 x float> %a) nounwind {
241; SI-LABEL: insertelement_v4f32_3:
242; SI:       ; %bb.0:
243; SI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x4
244; SI-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
245; SI-NEXT:    s_waitcnt lgkmcnt(0)
246; SI-NEXT:    s_mov_b32 s3, 0x40a00000
247; SI-NEXT:    s_mov_b32 s7, 0x100f000
248; SI-NEXT:    s_mov_b32 s6, -1
249; SI-NEXT:    v_mov_b32_e32 v0, s0
250; SI-NEXT:    v_mov_b32_e32 v1, s1
251; SI-NEXT:    v_mov_b32_e32 v2, s2
252; SI-NEXT:    v_mov_b32_e32 v3, s3
253; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
254; SI-NEXT:    s_endpgm
255;
256; VI-LABEL: insertelement_v4f32_3:
257; VI:       ; %bb.0:
258; VI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x10
259; VI-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
260; VI-NEXT:    s_waitcnt lgkmcnt(0)
261; VI-NEXT:    s_mov_b32 s3, 0x40a00000
262; VI-NEXT:    s_mov_b32 s7, 0x1100f000
263; VI-NEXT:    s_mov_b32 s6, -1
264; VI-NEXT:    v_mov_b32_e32 v0, s0
265; VI-NEXT:    v_mov_b32_e32 v1, s1
266; VI-NEXT:    v_mov_b32_e32 v2, s2
267; VI-NEXT:    v_mov_b32_e32 v3, s3
268; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
269; VI-NEXT:    s_endpgm
270  %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 3
271  store <4 x float> %vecins, ptr addrspace(1) %out, align 16
272  ret void
273}
274
; Insert the constant 999 (0x3e7) at index 0 of a <4 x i32> kernel argument
; and store the result. The expected code overwrites s0 with s_movk_i32 after
; the argument load, then copies s0-s3 into v0-v3 for a single dwordx4 store.
275define amdgpu_kernel void @insertelement_v4i32_0(ptr addrspace(1) %out, <4 x i32> %a) nounwind {
276; SI-LABEL: insertelement_v4i32_0:
277; SI:       ; %bb.0:
278; SI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x4
279; SI-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
280; SI-NEXT:    s_waitcnt lgkmcnt(0)
281; SI-NEXT:    s_movk_i32 s0, 0x3e7
282; SI-NEXT:    s_mov_b32 s7, 0x100f000
283; SI-NEXT:    s_mov_b32 s6, -1
284; SI-NEXT:    v_mov_b32_e32 v0, s0
285; SI-NEXT:    v_mov_b32_e32 v1, s1
286; SI-NEXT:    v_mov_b32_e32 v2, s2
287; SI-NEXT:    v_mov_b32_e32 v3, s3
288; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
289; SI-NEXT:    s_endpgm
290;
291; VI-LABEL: insertelement_v4i32_0:
292; VI:       ; %bb.0:
293; VI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x10
294; VI-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
295; VI-NEXT:    s_waitcnt lgkmcnt(0)
296; VI-NEXT:    s_movk_i32 s0, 0x3e7
297; VI-NEXT:    s_mov_b32 s7, 0x1100f000
298; VI-NEXT:    s_mov_b32 s6, -1
299; VI-NEXT:    v_mov_b32_e32 v0, s0
300; VI-NEXT:    v_mov_b32_e32 v1, s1
301; VI-NEXT:    v_mov_b32_e32 v2, s2
302; VI-NEXT:    v_mov_b32_e32 v3, s3
303; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
304; VI-NEXT:    s_endpgm
305  %vecins = insertelement <4 x i32> %a, i32 999, i32 0
306  store <4 x i32> %vecins, ptr addrspace(1) %out, align 16
307  ret void
308}
309
; Insert the constant 5.0 at index 1 of a <3 x float> kernel argument and
; store the result. The expected code writes the immediate 0x40a00000 into v1,
; copies elements 0 and 2 from s0 and s2, and emits one dwordx3 store.
310define amdgpu_kernel void @insertelement_v3f32_1(ptr addrspace(1) %out, <3 x float> %a) nounwind {
311; SI-LABEL: insertelement_v3f32_1:
312; SI:       ; %bb.0:
313; SI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x4
314; SI-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
315; SI-NEXT:    s_mov_b32 s7, 0x100f000
316; SI-NEXT:    s_mov_b32 s6, -1
317; SI-NEXT:    v_mov_b32_e32 v1, 0x40a00000
318; SI-NEXT:    s_waitcnt lgkmcnt(0)
319; SI-NEXT:    v_mov_b32_e32 v0, s0
320; SI-NEXT:    v_mov_b32_e32 v2, s2
321; SI-NEXT:    buffer_store_dwordx3 v[0:2], off, s[4:7], 0
322; SI-NEXT:    s_endpgm
323;
324; VI-LABEL: insertelement_v3f32_1:
325; VI:       ; %bb.0:
326; VI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x10
327; VI-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
328; VI-NEXT:    s_mov_b32 s7, 0x1100f000
329; VI-NEXT:    s_mov_b32 s6, -1
330; VI-NEXT:    v_mov_b32_e32 v1, 0x40a00000
331; VI-NEXT:    s_waitcnt lgkmcnt(0)
332; VI-NEXT:    v_mov_b32_e32 v0, s0
333; VI-NEXT:    v_mov_b32_e32 v2, s2
334; VI-NEXT:    buffer_store_dwordx3 v[0:2], off, s[4:7], 0
335; VI-NEXT:    s_endpgm
336  %vecins = insertelement <3 x float> %a, float 5.000000e+00, i32 1
337  store <3 x float> %vecins, ptr addrspace(1) %out, align 16
338  ret void
339}
340
; Insert the constant 5.0 at index 2 of a <3 x float> kernel argument and
; store the result. The expected code writes the immediate 0x40a00000 into v2,
; copies elements 0 and 1 from s0 and s1, and emits one dwordx3 store.
341define amdgpu_kernel void @insertelement_v3f32_2(ptr addrspace(1) %out, <3 x float> %a) nounwind {
342; SI-LABEL: insertelement_v3f32_2:
343; SI:       ; %bb.0:
344; SI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x4
345; SI-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
346; SI-NEXT:    s_mov_b32 s7, 0x100f000
347; SI-NEXT:    s_mov_b32 s6, -1
348; SI-NEXT:    v_mov_b32_e32 v2, 0x40a00000
349; SI-NEXT:    s_waitcnt lgkmcnt(0)
350; SI-NEXT:    v_mov_b32_e32 v0, s0
351; SI-NEXT:    v_mov_b32_e32 v1, s1
352; SI-NEXT:    buffer_store_dwordx3 v[0:2], off, s[4:7], 0
353; SI-NEXT:    s_endpgm
354;
355; VI-LABEL: insertelement_v3f32_2:
356; VI:       ; %bb.0:
357; VI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x10
358; VI-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
359; VI-NEXT:    s_mov_b32 s7, 0x1100f000
360; VI-NEXT:    s_mov_b32 s6, -1
361; VI-NEXT:    v_mov_b32_e32 v2, 0x40a00000
362; VI-NEXT:    s_waitcnt lgkmcnt(0)
363; VI-NEXT:    v_mov_b32_e32 v0, s0
364; VI-NEXT:    v_mov_b32_e32 v1, s1
365; VI-NEXT:    buffer_store_dwordx3 v[0:2], off, s[4:7], 0
366; VI-NEXT:    s_endpgm
367  %vecins = insertelement <3 x float> %a, float 5.000000e+00, i32 2
368  store <3 x float> %vecins, ptr addrspace(1) %out, align 16
369  ret void
370}
371
; Insert at index 3 of a <3 x float>, which is one past the last valid element
; (valid indices are 0-2). The checks expect the whole kernel to reduce to a
; bare s_endpgm on both targets, with no load or store emitted.
372define amdgpu_kernel void @insertelement_v3f32_3(ptr addrspace(1) %out, <3 x float> %a) nounwind {
373; GCN-LABEL: insertelement_v3f32_3:
374; GCN:       ; %bb.0:
375; GCN-NEXT:    s_endpgm
376  %vecins = insertelement <3 x float> %a, float 5.000000e+00, i32 3
377  store <3 x float> %vecins, ptr addrspace(1) %out, align 16
378  ret void
379}
380
; Insert 0 at index 0 of a <4 x i32> loaded through an undef addrspace(4)
; pointer, then pass the result as the <4 x i32> operand of image.gather4.lz.
; The expected code overwrites s4 of the loaded s[4:7] with s_mov_b32 and
; feeds s[4:7] to the image instruction without going through VGPRs.
381define <4 x float> @insertelement_to_sgpr() nounwind {
382; GCN-LABEL: insertelement_to_sgpr:
383; GCN:       ; %bb.0:
384; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
385; GCN-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x0
386; GCN-NEXT:    s_waitcnt lgkmcnt(0)
387; GCN-NEXT:    s_mov_b32 s4, 0
388; GCN-NEXT:    image_gather4_lz v[0:3], v[0:1], s[4:11], s[4:7] dmask:0x1
389; GCN-NEXT:    s_waitcnt vmcnt(0)
390; GCN-NEXT:    s_setpc_b64 s[30:31]
391  %tmp = load <4 x i32>, ptr addrspace(4) undef
392  %tmp1 = insertelement <4 x i32> %tmp, i32 0, i32 0
393  %tmp2 = call <4 x float> @llvm.amdgcn.image.gather4.lz.2d.v4f32.f32(i32 1, float undef, float undef, <8 x i32> undef, <4 x i32> %tmp1, i1 0, i32 0, i32 0)
394  ret <4 x float> %tmp2
395}
396
; Insert 5.0, -5.0 and 17.0 at indices 0, 2 and 7 of a <9 x float> loaded
; through an undef addrspace(4) pointer, and return the vector. The expected
; code materializes the three constants as immediates in v0, v2 and v7 and
; copies the remaining elements from the loaded SGPRs.
397define <9 x float> @insertelement_to_v9f32_undef() nounwind {
398; GCN-LABEL: insertelement_to_v9f32_undef:
399; GCN:       ; %bb.0:
400; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
401; GCN-NEXT:    s_load_dwordx8 s[4:11], s[4:5], 0x0
402; GCN-NEXT:    v_mov_b32_e32 v0, 0x40a00000
403; GCN-NEXT:    v_mov_b32_e32 v2, 0xc0a00000
404; GCN-NEXT:    v_mov_b32_e32 v7, 0x41880000
405; GCN-NEXT:    s_waitcnt lgkmcnt(0)
406; GCN-NEXT:    s_load_dword s4, s[4:5], 0x0
407; GCN-NEXT:    v_mov_b32_e32 v1, s5
408; GCN-NEXT:    v_mov_b32_e32 v3, s7
409; GCN-NEXT:    v_mov_b32_e32 v4, s8
410; GCN-NEXT:    v_mov_b32_e32 v5, s9
411; GCN-NEXT:    v_mov_b32_e32 v6, s10
412; GCN-NEXT:    s_waitcnt lgkmcnt(0)
413; GCN-NEXT:    v_mov_b32_e32 v8, s4
414; GCN-NEXT:    s_setpc_b64 s[30:31]
415  %tmp = load <9 x float>, ptr addrspace(4) undef
416  %tmp1 = insertelement <9 x float> %tmp, float 5.000, i32 0
417  %tmp2 = insertelement <9 x float> %tmp1, float -5.000, i32 2
418  %tmp3 = insertelement <9 x float> %tmp2, float 17.000, i32 7
419  ret <9 x float> %tmp3
420}
421
; Insert 2.0 at index 0 of a <10 x float> loaded through an undef addrspace(4)
; pointer, and return the vector. The expected code puts the constant 2.0
; directly in v0, copies elements 1-7 from s5-s11, and takes elements 8-9
; from a second s_load_dwordx2 into s[12:13].
422define <10 x float> @insertelement_to_v10f32_undef() nounwind {
423; GCN-LABEL: insertelement_to_v10f32_undef:
424; GCN:       ; %bb.0:
425; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
426; GCN-NEXT:    s_load_dwordx8 s[4:11], s[4:5], 0x0
427; GCN-NEXT:    v_mov_b32_e32 v0, 2.0
428; GCN-NEXT:    s_waitcnt lgkmcnt(0)
429; GCN-NEXT:    s_load_dwordx2 s[12:13], s[4:5], 0x0
430; GCN-NEXT:    v_mov_b32_e32 v1, s5
431; GCN-NEXT:    v_mov_b32_e32 v2, s6
432; GCN-NEXT:    v_mov_b32_e32 v3, s7
433; GCN-NEXT:    v_mov_b32_e32 v4, s8
434; GCN-NEXT:    v_mov_b32_e32 v5, s9
435; GCN-NEXT:    v_mov_b32_e32 v6, s10
436; GCN-NEXT:    v_mov_b32_e32 v7, s11
437; GCN-NEXT:    s_waitcnt lgkmcnt(0)
438; GCN-NEXT:    v_mov_b32_e32 v8, s12
439; GCN-NEXT:    v_mov_b32_e32 v9, s13
440; GCN-NEXT:    s_setpc_b64 s[30:31]
441  %tmp = load <10 x float>, ptr addrspace(4) undef
442  %tmp1 = insertelement <10 x float> %tmp, float 2.0, i32 0
443  ret <10 x float> %tmp1
444}
445
; Insert 1.0 at index 0 of an <11 x float> loaded through an undef
; addrspace(4) pointer, and return the vector. The expected code puts the
; constant 1.0 directly in v0, copies elements 1-7 from s5-s11, and takes
; elements 8-10 from a second s_load_dwordx4 into s[12:15].
446define <11 x float> @insertelement_to_v11f32_undef() nounwind {
447; GCN-LABEL: insertelement_to_v11f32_undef:
448; GCN:       ; %bb.0:
449; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
450; GCN-NEXT:    s_load_dwordx8 s[4:11], s[4:5], 0x0
451; GCN-NEXT:    v_mov_b32_e32 v0, 1.0
452; GCN-NEXT:    s_waitcnt lgkmcnt(0)
453; GCN-NEXT:    s_load_dwordx4 s[12:15], s[4:5], 0x0
454; GCN-NEXT:    v_mov_b32_e32 v1, s5
455; GCN-NEXT:    v_mov_b32_e32 v2, s6
456; GCN-NEXT:    v_mov_b32_e32 v3, s7
457; GCN-NEXT:    v_mov_b32_e32 v4, s8
458; GCN-NEXT:    v_mov_b32_e32 v5, s9
459; GCN-NEXT:    v_mov_b32_e32 v6, s10
460; GCN-NEXT:    v_mov_b32_e32 v7, s11
461; GCN-NEXT:    s_waitcnt lgkmcnt(0)
462; GCN-NEXT:    v_mov_b32_e32 v8, s12
463; GCN-NEXT:    v_mov_b32_e32 v9, s13
464; GCN-NEXT:    v_mov_b32_e32 v10, s14
465; GCN-NEXT:    s_setpc_b64 s[30:31]
466  %tmp = load <11 x float>, ptr addrspace(4) undef
467  %tmp1 = insertelement <11 x float> %tmp, float 1.000, i32 0
468  ret <11 x float> %tmp1
469}
470
; Insert 4.0 at index 0 of a <12 x float> loaded through an undef addrspace(4)
; pointer, and return the vector. The expected code puts the constant 4.0
; directly in v0, copies elements 1-7 from s5-s11, and takes elements 8-11
; from a second s_load_dwordx4 into s[12:15].
471define <12 x float> @insertelement_to_v12f32_undef() nounwind {
472; GCN-LABEL: insertelement_to_v12f32_undef:
473; GCN:       ; %bb.0:
474; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
475; GCN-NEXT:    s_load_dwordx8 s[4:11], s[4:5], 0x0
476; GCN-NEXT:    v_mov_b32_e32 v0, 4.0
477; GCN-NEXT:    s_waitcnt lgkmcnt(0)
478; GCN-NEXT:    s_load_dwordx4 s[12:15], s[4:5], 0x0
479; GCN-NEXT:    v_mov_b32_e32 v1, s5
480; GCN-NEXT:    v_mov_b32_e32 v2, s6
481; GCN-NEXT:    v_mov_b32_e32 v3, s7
482; GCN-NEXT:    v_mov_b32_e32 v4, s8
483; GCN-NEXT:    v_mov_b32_e32 v5, s9
484; GCN-NEXT:    v_mov_b32_e32 v6, s10
485; GCN-NEXT:    v_mov_b32_e32 v7, s11
486; GCN-NEXT:    s_waitcnt lgkmcnt(0)
487; GCN-NEXT:    v_mov_b32_e32 v8, s12
488; GCN-NEXT:    v_mov_b32_e32 v9, s13
489; GCN-NEXT:    v_mov_b32_e32 v10, s14
490; GCN-NEXT:    v_mov_b32_e32 v11, s15
491; GCN-NEXT:    s_setpc_b64 s[30:31]
492  %tmp = load <12 x float>, ptr addrspace(4) undef
493  %tmp1 = insertelement <12 x float> %tmp, float 4.0, i32 0
494  ret <12 x float> %tmp1
495}
496
; Insert the constant 5.0 into a <2 x float> kernel argument at the runtime
; index %b and store the result. The expected code compares the index against
; each element number (s_cmp_lg_u32 with 1 and 0) and uses v_cndmask_b32 to
; pick either the constant or the original element for each lane of the store.
497define amdgpu_kernel void @dynamic_insertelement_v2f32(ptr addrspace(1) %out, <2 x float> %a, i32 %b) nounwind {
498; SI-LABEL: dynamic_insertelement_v2f32:
499; SI:       ; %bb.0:
500; SI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x2
501; SI-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
502; SI-NEXT:    v_mov_b32_e32 v0, 0x40a00000
503; SI-NEXT:    s_mov_b32 s7, 0x100f000
504; SI-NEXT:    s_mov_b32 s6, -1
505; SI-NEXT:    s_waitcnt lgkmcnt(0)
506; SI-NEXT:    s_cmp_lg_u32 s2, 1
507; SI-NEXT:    v_mov_b32_e32 v1, s1
508; SI-NEXT:    s_cselect_b64 vcc, -1, 0
509; SI-NEXT:    s_cmp_lg_u32 s2, 0
510; SI-NEXT:    v_cndmask_b32_e32 v1, v0, v1, vcc
511; SI-NEXT:    v_mov_b32_e32 v2, s0
512; SI-NEXT:    s_cselect_b64 vcc, -1, 0
513; SI-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
514; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
515; SI-NEXT:    s_endpgm
516;
517; VI-LABEL: dynamic_insertelement_v2f32:
518; VI:       ; %bb.0:
519; VI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x8
520; VI-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
521; VI-NEXT:    v_mov_b32_e32 v0, 0x40a00000
522; VI-NEXT:    s_mov_b32 s7, 0x1100f000
523; VI-NEXT:    s_mov_b32 s6, -1
524; VI-NEXT:    s_waitcnt lgkmcnt(0)
525; VI-NEXT:    s_cmp_lg_u32 s2, 1
526; VI-NEXT:    v_mov_b32_e32 v1, s1
527; VI-NEXT:    s_cselect_b64 vcc, -1, 0
528; VI-NEXT:    s_cmp_lg_u32 s2, 0
529; VI-NEXT:    v_cndmask_b32_e32 v1, v0, v1, vcc
530; VI-NEXT:    v_mov_b32_e32 v2, s0
531; VI-NEXT:    s_cselect_b64 vcc, -1, 0
532; VI-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
533; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
534; VI-NEXT:    s_endpgm
535  %vecins = insertelement <2 x float> %a, float 5.000000e+00, i32 %b
536  store <2 x float> %vecins, ptr addrspace(1) %out, align 8
537  ret void
538}
539
; Insert the constant 5.0 into a <3 x float> kernel argument at the runtime
; index %b and store the result. The expected code compares the index against
; 2, 1 and 0 in turn and uses v_cndmask_b32 to pick either the constant or the
; original element for each lane, followed by one dwordx3 store.
540define amdgpu_kernel void @dynamic_insertelement_v3f32(ptr addrspace(1) %out, <3 x float> %a, i32 %b) nounwind {
541; SI-LABEL: dynamic_insertelement_v3f32:
542; SI:       ; %bb.0:
543; SI-NEXT:    s_load_dword s10, s[8:9], 0x8
544; SI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
545; SI-NEXT:    s_load_dwordx4 s[4:7], s[8:9], 0x4
546; SI-NEXT:    v_mov_b32_e32 v0, 0x40a00000
547; SI-NEXT:    s_mov_b32 s3, 0x100f000
548; SI-NEXT:    s_waitcnt lgkmcnt(0)
549; SI-NEXT:    s_cmp_lg_u32 s10, 2
550; SI-NEXT:    s_cselect_b64 vcc, -1, 0
551; SI-NEXT:    v_mov_b32_e32 v1, s6
552; SI-NEXT:    s_cmp_lg_u32 s10, 1
553; SI-NEXT:    v_cndmask_b32_e32 v2, v0, v1, vcc
554; SI-NEXT:    v_mov_b32_e32 v1, s5
555; SI-NEXT:    s_cselect_b64 vcc, -1, 0
556; SI-NEXT:    s_cmp_lg_u32 s10, 0
557; SI-NEXT:    v_cndmask_b32_e32 v1, v0, v1, vcc
558; SI-NEXT:    v_mov_b32_e32 v3, s4
559; SI-NEXT:    s_cselect_b64 vcc, -1, 0
560; SI-NEXT:    s_mov_b32 s2, -1
561; SI-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
562; SI-NEXT:    buffer_store_dwordx3 v[0:2], off, s[0:3], 0
563; SI-NEXT:    s_endpgm
564;
565; VI-LABEL: dynamic_insertelement_v3f32:
566; VI:       ; %bb.0:
567; VI-NEXT:    s_load_dword s10, s[8:9], 0x20
568; VI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
569; VI-NEXT:    s_load_dwordx4 s[4:7], s[8:9], 0x10
570; VI-NEXT:    v_mov_b32_e32 v0, 0x40a00000
571; VI-NEXT:    s_mov_b32 s3, 0x1100f000
572; VI-NEXT:    s_waitcnt lgkmcnt(0)
573; VI-NEXT:    s_cmp_lg_u32 s10, 2
574; VI-NEXT:    s_cselect_b64 vcc, -1, 0
575; VI-NEXT:    v_mov_b32_e32 v1, s6
576; VI-NEXT:    s_cmp_lg_u32 s10, 1
577; VI-NEXT:    v_cndmask_b32_e32 v2, v0, v1, vcc
578; VI-NEXT:    v_mov_b32_e32 v1, s5
579; VI-NEXT:    s_cselect_b64 vcc, -1, 0
580; VI-NEXT:    s_cmp_lg_u32 s10, 0
581; VI-NEXT:    v_cndmask_b32_e32 v1, v0, v1, vcc
582; VI-NEXT:    v_mov_b32_e32 v3, s4
583; VI-NEXT:    s_cselect_b64 vcc, -1, 0
584; VI-NEXT:    s_mov_b32 s2, -1
585; VI-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
586; VI-NEXT:    buffer_store_dwordx3 v[0:2], off, s[0:3], 0
587; VI-NEXT:    s_endpgm
588  %vecins = insertelement <3 x float> %a, float 5.000000e+00, i32 %b
589  store <3 x float> %vecins, ptr addrspace(1) %out, align 16
590  ret void
591}
592
; Insert the constant 5.0 into a <4 x float> kernel argument at the runtime
; index %b and store the result. The expected code compares the index against
; 3, 2, 1 and 0 in turn and uses v_cndmask_b32 to pick either the constant or
; the original element for each lane, followed by one dwordx4 store.
593define amdgpu_kernel void @dynamic_insertelement_v4f32(ptr addrspace(1) %out, <4 x float> %a, i32 %b) nounwind {
594; SI-LABEL: dynamic_insertelement_v4f32:
595; SI:       ; %bb.0:
596; SI-NEXT:    s_load_dword s10, s[8:9], 0x8
597; SI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
598; SI-NEXT:    s_load_dwordx4 s[4:7], s[8:9], 0x4
599; SI-NEXT:    v_mov_b32_e32 v0, 0x40a00000
600; SI-NEXT:    s_mov_b32 s3, 0x100f000
601; SI-NEXT:    s_waitcnt lgkmcnt(0)
602; SI-NEXT:    s_cmp_lg_u32 s10, 3
603; SI-NEXT:    s_cselect_b64 vcc, -1, 0
604; SI-NEXT:    v_mov_b32_e32 v1, s7
605; SI-NEXT:    s_cmp_lg_u32 s10, 2
606; SI-NEXT:    v_cndmask_b32_e32 v3, v0, v1, vcc
607; SI-NEXT:    v_mov_b32_e32 v1, s6
608; SI-NEXT:    s_cselect_b64 vcc, -1, 0
609; SI-NEXT:    s_cmp_lg_u32 s10, 1
610; SI-NEXT:    v_cndmask_b32_e32 v2, v0, v1, vcc
611; SI-NEXT:    v_mov_b32_e32 v1, s5
612; SI-NEXT:    s_cselect_b64 vcc, -1, 0
613; SI-NEXT:    s_cmp_lg_u32 s10, 0
614; SI-NEXT:    v_cndmask_b32_e32 v1, v0, v1, vcc
615; SI-NEXT:    v_mov_b32_e32 v4, s4
616; SI-NEXT:    s_cselect_b64 vcc, -1, 0
617; SI-NEXT:    s_mov_b32 s2, -1
618; SI-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
619; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
620; SI-NEXT:    s_endpgm
621;
622; VI-LABEL: dynamic_insertelement_v4f32:
623; VI:       ; %bb.0:
624; VI-NEXT:    s_load_dword s10, s[8:9], 0x20
625; VI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
626; VI-NEXT:    s_load_dwordx4 s[4:7], s[8:9], 0x10
627; VI-NEXT:    v_mov_b32_e32 v0, 0x40a00000
628; VI-NEXT:    s_mov_b32 s3, 0x1100f000
629; VI-NEXT:    s_waitcnt lgkmcnt(0)
630; VI-NEXT:    s_cmp_lg_u32 s10, 3
631; VI-NEXT:    s_cselect_b64 vcc, -1, 0
632; VI-NEXT:    v_mov_b32_e32 v1, s7
633; VI-NEXT:    s_cmp_lg_u32 s10, 2
634; VI-NEXT:    v_cndmask_b32_e32 v3, v0, v1, vcc
635; VI-NEXT:    v_mov_b32_e32 v1, s6
636; VI-NEXT:    s_cselect_b64 vcc, -1, 0
637; VI-NEXT:    s_cmp_lg_u32 s10, 1
638; VI-NEXT:    v_cndmask_b32_e32 v2, v0, v1, vcc
639; VI-NEXT:    v_mov_b32_e32 v1, s5
640; VI-NEXT:    s_cselect_b64 vcc, -1, 0
641; VI-NEXT:    s_cmp_lg_u32 s10, 0
642; VI-NEXT:    v_cndmask_b32_e32 v1, v0, v1, vcc
643; VI-NEXT:    v_mov_b32_e32 v4, s4
644; VI-NEXT:    s_cselect_b64 vcc, -1, 0
645; VI-NEXT:    s_mov_b32 s2, -1
646; VI-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
647; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
648; VI-NEXT:    s_endpgm
649  %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 %b
650  store <4 x float> %vecins, ptr addrspace(1) %out, align 16
651  ret void
652}
653
; Insert the constant 5.0 into an <8 x float> kernel argument at the runtime
; index %b and store the result. The expected code copies all eight elements
; into v0-v7, moves the index into m0, and writes the constant with
; v_movreld_b32; the result is stored with two dwordx4 stores.
654define amdgpu_kernel void @dynamic_insertelement_v8f32(ptr addrspace(1) %out, <8 x float> %a, i32 %b) nounwind {
655; SI-LABEL: dynamic_insertelement_v8f32:
656; SI:       ; %bb.0:
657; SI-NEXT:    s_load_dwordx8 s[0:7], s[8:9], 0x8
658; SI-NEXT:    s_load_dwordx2 s[12:13], s[8:9], 0x0
659; SI-NEXT:    s_load_dword s8, s[8:9], 0x10
660; SI-NEXT:    v_mov_b32_e32 v8, 0x40a00000
661; SI-NEXT:    s_mov_b32 s15, 0x100f000
662; SI-NEXT:    s_mov_b32 s14, -1
663; SI-NEXT:    s_waitcnt lgkmcnt(0)
664; SI-NEXT:    v_mov_b32_e32 v0, s0
665; SI-NEXT:    v_mov_b32_e32 v1, s1
666; SI-NEXT:    v_mov_b32_e32 v2, s2
667; SI-NEXT:    v_mov_b32_e32 v3, s3
668; SI-NEXT:    v_mov_b32_e32 v4, s4
669; SI-NEXT:    v_mov_b32_e32 v5, s5
670; SI-NEXT:    v_mov_b32_e32 v6, s6
671; SI-NEXT:    v_mov_b32_e32 v7, s7
672; SI-NEXT:    s_mov_b32 m0, s8
673; SI-NEXT:    v_movreld_b32_e32 v0, v8
674; SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[12:15], 0 offset:16
675; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[12:15], 0
676; SI-NEXT:    s_endpgm
677;
678; VI-LABEL: dynamic_insertelement_v8f32:
679; VI:       ; %bb.0:
680; VI-NEXT:    s_load_dwordx8 s[12:19], s[8:9], 0x20
681; VI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
682; VI-NEXT:    s_load_dword s4, s[8:9], 0x40
683; VI-NEXT:    v_mov_b32_e32 v8, 0x40a00000
684; VI-NEXT:    s_mov_b32 s3, 0x1100f000
685; VI-NEXT:    s_mov_b32 s2, -1
686; VI-NEXT:    s_waitcnt lgkmcnt(0)
687; VI-NEXT:    v_mov_b32_e32 v0, s12
688; VI-NEXT:    v_mov_b32_e32 v1, s13
689; VI-NEXT:    v_mov_b32_e32 v2, s14
690; VI-NEXT:    v_mov_b32_e32 v3, s15
691; VI-NEXT:    v_mov_b32_e32 v4, s16
692; VI-NEXT:    v_mov_b32_e32 v5, s17
693; VI-NEXT:    v_mov_b32_e32 v6, s18
694; VI-NEXT:    v_mov_b32_e32 v7, s19
695; VI-NEXT:    s_mov_b32 m0, s4
696; VI-NEXT:    v_movreld_b32_e32 v0, v8
697; VI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
698; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
699; VI-NEXT:    s_endpgm
700  %vecins = insertelement <8 x float> %a, float 5.000000e+00, i32 %b
701  store <8 x float> %vecins, ptr addrspace(1) %out, align 32
702  ret void
703}
704
; Insert the constant 5.0 into a <9 x float> kernel argument at the runtime
; index %b and store the result. The expected code copies all nine elements
; into v0-v8, moves the index into m0, and writes the constant with
; v_movreld_b32; the result is stored as two dwordx4 stores plus one dword
; store at offset 32.
705define amdgpu_kernel void @dynamic_insertelement_v9f32(ptr addrspace(1) %out, <9 x float> %a, i32 %b) nounwind {
706; SI-LABEL: dynamic_insertelement_v9f32:
707; SI:       ; %bb.0:
708; SI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
709; SI-NEXT:    s_load_dwordx8 s[12:19], s[8:9], 0x10
710; SI-NEXT:    s_load_dword s4, s[8:9], 0x18
711; SI-NEXT:    s_load_dword s5, s[8:9], 0x20
712; SI-NEXT:    v_mov_b32_e32 v9, 0x40a00000
713; SI-NEXT:    s_mov_b32 s3, 0x100f000
714; SI-NEXT:    s_waitcnt lgkmcnt(0)
715; SI-NEXT:    v_mov_b32_e32 v0, s12
716; SI-NEXT:    v_mov_b32_e32 v1, s13
717; SI-NEXT:    v_mov_b32_e32 v2, s14
718; SI-NEXT:    v_mov_b32_e32 v3, s15
719; SI-NEXT:    v_mov_b32_e32 v4, s16
720; SI-NEXT:    v_mov_b32_e32 v5, s17
721; SI-NEXT:    v_mov_b32_e32 v6, s18
722; SI-NEXT:    v_mov_b32_e32 v7, s19
723; SI-NEXT:    v_mov_b32_e32 v8, s4
724; SI-NEXT:    s_mov_b32 m0, s5
725; SI-NEXT:    s_mov_b32 s2, -1
726; SI-NEXT:    v_movreld_b32_e32 v0, v9
727; SI-NEXT:    buffer_store_dword v8, off, s[0:3], 0 offset:32
728; SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
729; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
730; SI-NEXT:    s_endpgm
731;
732; VI-LABEL: dynamic_insertelement_v9f32:
733; VI:       ; %bb.0:
734; VI-NEXT:    s_load_dwordx8 s[12:19], s[8:9], 0x40
735; VI-NEXT:    s_load_dword s4, s[8:9], 0x60
736; VI-NEXT:    s_load_dword s5, s[8:9], 0x80
737; VI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
738; VI-NEXT:    v_mov_b32_e32 v9, 0x40a00000
739; VI-NEXT:    s_waitcnt lgkmcnt(0)
740; VI-NEXT:    v_mov_b32_e32 v0, s12
741; VI-NEXT:    v_mov_b32_e32 v1, s13
742; VI-NEXT:    v_mov_b32_e32 v2, s14
743; VI-NEXT:    v_mov_b32_e32 v3, s15
744; VI-NEXT:    v_mov_b32_e32 v4, s16
745; VI-NEXT:    v_mov_b32_e32 v5, s17
746; VI-NEXT:    v_mov_b32_e32 v6, s18
747; VI-NEXT:    v_mov_b32_e32 v7, s19
748; VI-NEXT:    v_mov_b32_e32 v8, s4
749; VI-NEXT:    s_mov_b32 m0, s5
750; VI-NEXT:    s_mov_b32 s3, 0x1100f000
751; VI-NEXT:    s_mov_b32 s2, -1
752; VI-NEXT:    v_movreld_b32_e32 v0, v9
753; VI-NEXT:    buffer_store_dword v8, off, s[0:3], 0 offset:32
754; VI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
755; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
756; VI-NEXT:    s_endpgm
757  %vecins = insertelement <9 x float> %a, float 5.000000e+00, i32 %b
758  store <9 x float> %vecins, ptr addrspace(1) %out, align 32
759  ret void
760}
761
; Insert the constant 5.0 into a <10 x float> kernel argument at the runtime
; index %b and store the result. The expected code copies all ten elements
; into v0-v9, moves the index into m0, and writes the constant with
; v_movreld_b32; the result is stored as two dwordx4 stores plus one dwordx2
; store at offset 32.
762define amdgpu_kernel void @dynamic_insertelement_v10f32(ptr addrspace(1) %out, <10 x float> %a, i32 %b) nounwind {
763; SI-LABEL: dynamic_insertelement_v10f32:
764; SI:       ; %bb.0:
765; SI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
766; SI-NEXT:    s_load_dwordx8 s[12:19], s[8:9], 0x10
767; SI-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x18
768; SI-NEXT:    s_load_dword s6, s[8:9], 0x20
769; SI-NEXT:    v_mov_b32_e32 v10, 0x40a00000
770; SI-NEXT:    s_mov_b32 s3, 0x100f000
771; SI-NEXT:    s_waitcnt lgkmcnt(0)
772; SI-NEXT:    v_mov_b32_e32 v0, s12
773; SI-NEXT:    v_mov_b32_e32 v1, s13
774; SI-NEXT:    v_mov_b32_e32 v2, s14
775; SI-NEXT:    v_mov_b32_e32 v3, s15
776; SI-NEXT:    v_mov_b32_e32 v4, s16
777; SI-NEXT:    v_mov_b32_e32 v5, s17
778; SI-NEXT:    v_mov_b32_e32 v6, s18
779; SI-NEXT:    v_mov_b32_e32 v7, s19
780; SI-NEXT:    v_mov_b32_e32 v8, s4
781; SI-NEXT:    v_mov_b32_e32 v9, s5
782; SI-NEXT:    s_mov_b32 m0, s6
783; SI-NEXT:    s_mov_b32 s2, -1
784; SI-NEXT:    v_movreld_b32_e32 v0, v10
785; SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
786; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
787; SI-NEXT:    buffer_store_dwordx2 v[8:9], off, s[0:3], 0 offset:32
788; SI-NEXT:    s_endpgm
789;
790; VI-LABEL: dynamic_insertelement_v10f32:
791; VI:       ; %bb.0:
792; VI-NEXT:    s_load_dwordx8 s[12:19], s[8:9], 0x40
793; VI-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x60
794; VI-NEXT:    s_load_dword s6, s[8:9], 0x80
795; VI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
796; VI-NEXT:    v_mov_b32_e32 v10, 0x40a00000
797; VI-NEXT:    s_waitcnt lgkmcnt(0)
798; VI-NEXT:    v_mov_b32_e32 v0, s12
799; VI-NEXT:    v_mov_b32_e32 v1, s13
800; VI-NEXT:    v_mov_b32_e32 v2, s14
801; VI-NEXT:    v_mov_b32_e32 v3, s15
802; VI-NEXT:    v_mov_b32_e32 v4, s16
803; VI-NEXT:    v_mov_b32_e32 v5, s17
804; VI-NEXT:    v_mov_b32_e32 v6, s18
805; VI-NEXT:    v_mov_b32_e32 v7, s19
806; VI-NEXT:    v_mov_b32_e32 v8, s4
807; VI-NEXT:    v_mov_b32_e32 v9, s5
808; VI-NEXT:    s_mov_b32 m0, s6
809; VI-NEXT:    s_mov_b32 s3, 0x1100f000
810; VI-NEXT:    s_mov_b32 s2, -1
811; VI-NEXT:    v_movreld_b32_e32 v0, v10
812; VI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
813; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
814; VI-NEXT:    buffer_store_dwordx2 v[8:9], off, s[0:3], 0 offset:32
815; VI-NEXT:    s_endpgm
816  %vecins = insertelement <10 x float> %a, float 5.000000e+00, i32 %b
817  store <10 x float> %vecins, ptr addrspace(1) %out, align 32
818  ret void
819}
820
; Dynamic-index insert into a <11 x float> kernel argument: the element of %a
; selected by %b is replaced with 5.0 and the whole vector is stored to %out.
; In both expectations the tail of the vector is fetched with a dwordx4 load
; into s[4:7], and s7 is then overwritten by a separate dword load; that is
; why two s_waitcnt lgkmcnt(0) instructions appear, one before the reload of
; s7 and one before s7 is copied into m0.
; The insert itself is a single v_movreld_b32 from v11 (0x40a00000, the
; single-precision bit pattern of 5.0); the result is stored as two dwordx4
; stores and one dwordx3 store.
821define amdgpu_kernel void @dynamic_insertelement_v11f32(ptr addrspace(1) %out, <11 x float> %a, i32 %b) nounwind {
822; SI-LABEL: dynamic_insertelement_v11f32:
823; SI:       ; %bb.0:
824; SI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
825; SI-NEXT:    s_load_dwordx8 s[12:19], s[8:9], 0x10
826; SI-NEXT:    s_load_dwordx4 s[4:7], s[8:9], 0x18
827; SI-NEXT:    s_waitcnt lgkmcnt(0)
828; SI-NEXT:    s_load_dword s7, s[8:9], 0x20
829; SI-NEXT:    v_mov_b32_e32 v11, 0x40a00000
830; SI-NEXT:    s_mov_b32 s3, 0x100f000
831; SI-NEXT:    v_mov_b32_e32 v0, s12
832; SI-NEXT:    v_mov_b32_e32 v1, s13
833; SI-NEXT:    v_mov_b32_e32 v2, s14
834; SI-NEXT:    v_mov_b32_e32 v3, s15
835; SI-NEXT:    v_mov_b32_e32 v4, s16
836; SI-NEXT:    v_mov_b32_e32 v5, s17
837; SI-NEXT:    v_mov_b32_e32 v6, s18
838; SI-NEXT:    v_mov_b32_e32 v7, s19
839; SI-NEXT:    v_mov_b32_e32 v8, s4
840; SI-NEXT:    v_mov_b32_e32 v9, s5
841; SI-NEXT:    v_mov_b32_e32 v10, s6
842; SI-NEXT:    s_waitcnt lgkmcnt(0)
843; SI-NEXT:    s_mov_b32 m0, s7
844; SI-NEXT:    s_mov_b32 s2, -1
845; SI-NEXT:    v_movreld_b32_e32 v0, v11
846; SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
847; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
848; SI-NEXT:    buffer_store_dwordx3 v[8:10], off, s[0:3], 0 offset:32
849; SI-NEXT:    s_endpgm
850;
851; VI-LABEL: dynamic_insertelement_v11f32:
852; VI:       ; %bb.0:
853; VI-NEXT:    s_load_dwordx4 s[4:7], s[8:9], 0x60
854; VI-NEXT:    s_load_dwordx8 s[12:19], s[8:9], 0x40
855; VI-NEXT:    s_waitcnt lgkmcnt(0)
856; VI-NEXT:    s_load_dword s7, s[8:9], 0x80
857; VI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
858; VI-NEXT:    v_mov_b32_e32 v11, 0x40a00000
859; VI-NEXT:    v_mov_b32_e32 v8, s4
860; VI-NEXT:    v_mov_b32_e32 v0, s12
861; VI-NEXT:    v_mov_b32_e32 v1, s13
862; VI-NEXT:    v_mov_b32_e32 v2, s14
863; VI-NEXT:    v_mov_b32_e32 v3, s15
864; VI-NEXT:    v_mov_b32_e32 v4, s16
865; VI-NEXT:    v_mov_b32_e32 v5, s17
866; VI-NEXT:    v_mov_b32_e32 v6, s18
867; VI-NEXT:    v_mov_b32_e32 v7, s19
868; VI-NEXT:    v_mov_b32_e32 v9, s5
869; VI-NEXT:    v_mov_b32_e32 v10, s6
870; VI-NEXT:    s_waitcnt lgkmcnt(0)
871; VI-NEXT:    s_mov_b32 m0, s7
872; VI-NEXT:    s_mov_b32 s3, 0x1100f000
873; VI-NEXT:    s_mov_b32 s2, -1
874; VI-NEXT:    v_movreld_b32_e32 v0, v11
875; VI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
876; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
877; VI-NEXT:    buffer_store_dwordx3 v[8:10], off, s[0:3], 0 offset:32
878; VI-NEXT:    s_endpgm
879  %vecins = insertelement <11 x float> %a, float 5.000000e+00, i32 %b
880  store <11 x float> %vecins, ptr addrspace(1) %out, align 32
881  ret void
882}
883
; Dynamic-index insert into a <12 x float> kernel argument: the element of %a
; selected by %b is replaced with 5.0 and the whole vector is stored to %out.
; Both expectations load the vector with one dwordx8 and one dwordx4 scalar
; load, copy it into v0-v11, move a loaded scalar into m0 and issue a single
; v_movreld_b32 from v12 (0x40a00000, the single-precision bit pattern of 5.0).
; The result is written back with three dwordx4 stores.
884define amdgpu_kernel void @dynamic_insertelement_v12f32(ptr addrspace(1) %out, <12 x float> %a, i32 %b) nounwind {
885; SI-LABEL: dynamic_insertelement_v12f32:
886; SI:       ; %bb.0:
887; SI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
888; SI-NEXT:    s_load_dwordx8 s[12:19], s[8:9], 0x10
889; SI-NEXT:    s_load_dwordx4 s[4:7], s[8:9], 0x18
890; SI-NEXT:    s_load_dword s8, s[8:9], 0x20
891; SI-NEXT:    v_mov_b32_e32 v12, 0x40a00000
892; SI-NEXT:    s_mov_b32 s3, 0x100f000
893; SI-NEXT:    s_waitcnt lgkmcnt(0)
894; SI-NEXT:    v_mov_b32_e32 v0, s12
895; SI-NEXT:    v_mov_b32_e32 v1, s13
896; SI-NEXT:    v_mov_b32_e32 v2, s14
897; SI-NEXT:    v_mov_b32_e32 v3, s15
898; SI-NEXT:    v_mov_b32_e32 v4, s16
899; SI-NEXT:    v_mov_b32_e32 v5, s17
900; SI-NEXT:    v_mov_b32_e32 v6, s18
901; SI-NEXT:    v_mov_b32_e32 v7, s19
902; SI-NEXT:    v_mov_b32_e32 v8, s4
903; SI-NEXT:    v_mov_b32_e32 v9, s5
904; SI-NEXT:    v_mov_b32_e32 v10, s6
905; SI-NEXT:    v_mov_b32_e32 v11, s7
906; SI-NEXT:    s_mov_b32 m0, s8
907; SI-NEXT:    s_mov_b32 s2, -1
908; SI-NEXT:    v_movreld_b32_e32 v0, v12
909; SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
910; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
911; SI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32
912; SI-NEXT:    s_endpgm
913;
914; VI-LABEL: dynamic_insertelement_v12f32:
915; VI:       ; %bb.0:
916; VI-NEXT:    s_load_dwordx8 s[12:19], s[8:9], 0x40
917; VI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
918; VI-NEXT:    s_load_dwordx4 s[4:7], s[8:9], 0x60
919; VI-NEXT:    s_load_dword s8, s[8:9], 0x80
920; VI-NEXT:    v_mov_b32_e32 v12, 0x40a00000
921; VI-NEXT:    s_waitcnt lgkmcnt(0)
922; VI-NEXT:    v_mov_b32_e32 v0, s12
923; VI-NEXT:    v_mov_b32_e32 v1, s13
924; VI-NEXT:    v_mov_b32_e32 v2, s14
925; VI-NEXT:    v_mov_b32_e32 v3, s15
926; VI-NEXT:    v_mov_b32_e32 v4, s16
927; VI-NEXT:    v_mov_b32_e32 v5, s17
928; VI-NEXT:    v_mov_b32_e32 v6, s18
929; VI-NEXT:    v_mov_b32_e32 v7, s19
930; VI-NEXT:    v_mov_b32_e32 v8, s4
931; VI-NEXT:    v_mov_b32_e32 v9, s5
932; VI-NEXT:    v_mov_b32_e32 v10, s6
933; VI-NEXT:    v_mov_b32_e32 v11, s7
934; VI-NEXT:    s_mov_b32 m0, s8
935; VI-NEXT:    s_mov_b32 s3, 0x1100f000
936; VI-NEXT:    s_mov_b32 s2, -1
937; VI-NEXT:    v_movreld_b32_e32 v0, v12
938; VI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
939; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
940; VI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32
941; VI-NEXT:    s_endpgm
942  %vecins = insertelement <12 x float> %a, float 5.000000e+00, i32 %b
943  store <12 x float> %vecins, ptr addrspace(1) %out, align 32
944  ret void
945}
946
; Dynamic-index insert into a <16 x float> kernel argument: the element of %a
; selected by %b is replaced with 5.0 and the whole vector is stored to %out
; (the store is declared align 64).
; Both expectations fetch the vector with a single s_load_dwordx16, copy it
; into v0-v15, move a loaded scalar into m0 and issue a single v_movreld_b32
; from v16 (0x40a00000, the single-precision bit pattern of 5.0).
; The result is written back with four dwordx4 stores.
947define amdgpu_kernel void @dynamic_insertelement_v16f32(ptr addrspace(1) %out, <16 x float> %a, i32 %b) nounwind {
948; SI-LABEL: dynamic_insertelement_v16f32:
949; SI:       ; %bb.0:
950; SI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
951; SI-NEXT:    s_load_dwordx16 s[12:27], s[8:9], 0x10
952; SI-NEXT:    s_load_dword s4, s[8:9], 0x20
953; SI-NEXT:    v_mov_b32_e32 v16, 0x40a00000
954; SI-NEXT:    s_mov_b32 s3, 0x100f000
955; SI-NEXT:    s_mov_b32 s2, -1
956; SI-NEXT:    s_waitcnt lgkmcnt(0)
957; SI-NEXT:    v_mov_b32_e32 v0, s12
958; SI-NEXT:    v_mov_b32_e32 v1, s13
959; SI-NEXT:    v_mov_b32_e32 v2, s14
960; SI-NEXT:    v_mov_b32_e32 v3, s15
961; SI-NEXT:    v_mov_b32_e32 v4, s16
962; SI-NEXT:    v_mov_b32_e32 v5, s17
963; SI-NEXT:    v_mov_b32_e32 v6, s18
964; SI-NEXT:    v_mov_b32_e32 v7, s19
965; SI-NEXT:    v_mov_b32_e32 v8, s20
966; SI-NEXT:    v_mov_b32_e32 v9, s21
967; SI-NEXT:    v_mov_b32_e32 v10, s22
968; SI-NEXT:    v_mov_b32_e32 v11, s23
969; SI-NEXT:    v_mov_b32_e32 v12, s24
970; SI-NEXT:    v_mov_b32_e32 v13, s25
971; SI-NEXT:    v_mov_b32_e32 v14, s26
972; SI-NEXT:    v_mov_b32_e32 v15, s27
973; SI-NEXT:    s_mov_b32 m0, s4
974; SI-NEXT:    v_movreld_b32_e32 v0, v16
975; SI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48
976; SI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32
977; SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
978; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
979; SI-NEXT:    s_endpgm
980;
981; VI-LABEL: dynamic_insertelement_v16f32:
982; VI:       ; %bb.0:
983; VI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
984; VI-NEXT:    s_load_dwordx16 s[12:27], s[8:9], 0x40
985; VI-NEXT:    s_load_dword s4, s[8:9], 0x80
986; VI-NEXT:    v_mov_b32_e32 v16, 0x40a00000
987; VI-NEXT:    s_mov_b32 s3, 0x1100f000
988; VI-NEXT:    s_mov_b32 s2, -1
989; VI-NEXT:    s_waitcnt lgkmcnt(0)
990; VI-NEXT:    v_mov_b32_e32 v0, s12
991; VI-NEXT:    v_mov_b32_e32 v1, s13
992; VI-NEXT:    v_mov_b32_e32 v2, s14
993; VI-NEXT:    v_mov_b32_e32 v3, s15
994; VI-NEXT:    v_mov_b32_e32 v4, s16
995; VI-NEXT:    v_mov_b32_e32 v5, s17
996; VI-NEXT:    v_mov_b32_e32 v6, s18
997; VI-NEXT:    v_mov_b32_e32 v7, s19
998; VI-NEXT:    v_mov_b32_e32 v8, s20
999; VI-NEXT:    v_mov_b32_e32 v9, s21
1000; VI-NEXT:    v_mov_b32_e32 v10, s22
1001; VI-NEXT:    v_mov_b32_e32 v11, s23
1002; VI-NEXT:    v_mov_b32_e32 v12, s24
1003; VI-NEXT:    v_mov_b32_e32 v13, s25
1004; VI-NEXT:    v_mov_b32_e32 v14, s26
1005; VI-NEXT:    v_mov_b32_e32 v15, s27
1006; VI-NEXT:    s_mov_b32 m0, s4
1007; VI-NEXT:    v_movreld_b32_e32 v0, v16
1008; VI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48
1009; VI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32
1010; VI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
1011; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
1012; VI-NEXT:    s_endpgm
1013  %vecins = insertelement <16 x float> %a, float 5.000000e+00, i32 %b
1014  store <16 x float> %vecins, ptr addrspace(1) %out, align 64
1015  ret void
1016}
1017
; Dynamic-index insert into a <2 x i32> kernel argument: the element of %a
; selected by %b is replaced with the constant 5 and the vector is stored to
; %out.
; Neither expectation uses m0 or v_movreld here. Instead each element gets an
; s_cmp_lg_u32 against its own index followed by an s_cselect_b32 that keeps
; the original element or picks the inline constant 5, all in scalar
; registers; the two results are then copied to v0/v1 for one dwordx2 store.
1018define amdgpu_kernel void @dynamic_insertelement_v2i32(ptr addrspace(1) %out, <2 x i32> %a, i32 %b) nounwind {
1019; SI-LABEL: dynamic_insertelement_v2i32:
1020; SI:       ; %bb.0:
1021; SI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x2
1022; SI-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
1023; SI-NEXT:    s_mov_b32 s7, 0x100f000
1024; SI-NEXT:    s_mov_b32 s6, -1
1025; SI-NEXT:    s_waitcnt lgkmcnt(0)
1026; SI-NEXT:    s_cmp_lg_u32 s2, 1
1027; SI-NEXT:    s_cselect_b32 s1, s1, 5
1028; SI-NEXT:    s_cmp_lg_u32 s2, 0
1029; SI-NEXT:    s_cselect_b32 s0, s0, 5
1030; SI-NEXT:    v_mov_b32_e32 v0, s0
1031; SI-NEXT:    v_mov_b32_e32 v1, s1
1032; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1033; SI-NEXT:    s_endpgm
1034;
1035; VI-LABEL: dynamic_insertelement_v2i32:
1036; VI:       ; %bb.0:
1037; VI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x8
1038; VI-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
1039; VI-NEXT:    s_mov_b32 s7, 0x1100f000
1040; VI-NEXT:    s_mov_b32 s6, -1
1041; VI-NEXT:    s_waitcnt lgkmcnt(0)
1042; VI-NEXT:    s_cmp_lg_u32 s2, 1
1043; VI-NEXT:    s_cselect_b32 s1, s1, 5
1044; VI-NEXT:    s_cmp_lg_u32 s2, 0
1045; VI-NEXT:    s_cselect_b32 s0, s0, 5
1046; VI-NEXT:    v_mov_b32_e32 v0, s0
1047; VI-NEXT:    v_mov_b32_e32 v1, s1
1048; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1049; VI-NEXT:    s_endpgm
1050  %vecins = insertelement <2 x i32> %a, i32 5, i32 %b
1051  store <2 x i32> %vecins, ptr addrspace(1) %out, align 8
1052  ret void
1053}
1054
; Dynamic-index insert into a <3 x i32> kernel argument: the element of %a
; selected by %b is replaced with the constant 5 and the vector is stored to
; %out.
; Both expectations use three scalar compare/select pairs (s_cmp_lg_u32
; against indices 2, 1 and 0, each followed by an s_cselect_b32 between the
; original element and the inline constant 5) rather than m0-relative moves,
; then copy the results to v0-v2 for a single dwordx3 store.
1055define amdgpu_kernel void @dynamic_insertelement_v3i32(ptr addrspace(1) %out, <3 x i32> %a, i32 %b) nounwind {
1056; SI-LABEL: dynamic_insertelement_v3i32:
1057; SI:       ; %bb.0:
1058; SI-NEXT:    s_load_dword s10, s[8:9], 0x8
1059; SI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x4
1060; SI-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
1061; SI-NEXT:    s_mov_b32 s7, 0x100f000
1062; SI-NEXT:    s_mov_b32 s6, -1
1063; SI-NEXT:    s_waitcnt lgkmcnt(0)
1064; SI-NEXT:    s_cmp_lg_u32 s10, 2
1065; SI-NEXT:    s_cselect_b32 s2, s2, 5
1066; SI-NEXT:    s_cmp_lg_u32 s10, 1
1067; SI-NEXT:    s_cselect_b32 s1, s1, 5
1068; SI-NEXT:    s_cmp_lg_u32 s10, 0
1069; SI-NEXT:    s_cselect_b32 s0, s0, 5
1070; SI-NEXT:    v_mov_b32_e32 v0, s0
1071; SI-NEXT:    v_mov_b32_e32 v1, s1
1072; SI-NEXT:    v_mov_b32_e32 v2, s2
1073; SI-NEXT:    buffer_store_dwordx3 v[0:2], off, s[4:7], 0
1074; SI-NEXT:    s_endpgm
1075;
1076; VI-LABEL: dynamic_insertelement_v3i32:
1077; VI:       ; %bb.0:
1078; VI-NEXT:    s_load_dword s10, s[8:9], 0x20
1079; VI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x10
1080; VI-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
1081; VI-NEXT:    s_mov_b32 s7, 0x1100f000
1082; VI-NEXT:    s_mov_b32 s6, -1
1083; VI-NEXT:    s_waitcnt lgkmcnt(0)
1084; VI-NEXT:    s_cmp_lg_u32 s10, 2
1085; VI-NEXT:    s_cselect_b32 s2, s2, 5
1086; VI-NEXT:    s_cmp_lg_u32 s10, 1
1087; VI-NEXT:    s_cselect_b32 s1, s1, 5
1088; VI-NEXT:    s_cmp_lg_u32 s10, 0
1089; VI-NEXT:    s_cselect_b32 s0, s0, 5
1090; VI-NEXT:    v_mov_b32_e32 v0, s0
1091; VI-NEXT:    v_mov_b32_e32 v1, s1
1092; VI-NEXT:    v_mov_b32_e32 v2, s2
1093; VI-NEXT:    buffer_store_dwordx3 v[0:2], off, s[4:7], 0
1094; VI-NEXT:    s_endpgm
1095  %vecins = insertelement <3 x i32> %a, i32 5, i32 %b
1096  store <3 x i32> %vecins, ptr addrspace(1) %out, align 16
1097  ret void
1098}
1099
; Dynamic-index insert into a <4 x i32> kernel argument where the inserted
; value is itself a kernel argument (%val) instead of a constant. An unnamed
; [8 x i32] argument sits between %b and %val in the signature.
; Both expectations use four scalar compare/select pairs (s_cmp_eq_u32
; against indices 3, 2, 1 and 0, each followed by an s_cselect_b32 between
; s11 and the original element), then copy the results to v0-v3 for a single
; dwordx4 store. No m0-relative move is expected.
1100define amdgpu_kernel void @dynamic_insertelement_v4i32(ptr addrspace(1) %out, <4 x i32> %a, i32 %b, [8 x i32], i32 %val) nounwind {
1101; SI-LABEL: dynamic_insertelement_v4i32:
1102; SI:       ; %bb.0:
1103; SI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x4
1104; SI-NEXT:    s_load_dword s10, s[8:9], 0x8
1105; SI-NEXT:    s_load_dword s11, s[8:9], 0x11
1106; SI-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
1107; SI-NEXT:    s_mov_b32 s7, 0x100f000
1108; SI-NEXT:    s_mov_b32 s6, -1
1109; SI-NEXT:    s_waitcnt lgkmcnt(0)
1110; SI-NEXT:    s_cmp_eq_u32 s10, 3
1111; SI-NEXT:    s_cselect_b32 s3, s11, s3
1112; SI-NEXT:    s_cmp_eq_u32 s10, 2
1113; SI-NEXT:    s_cselect_b32 s2, s11, s2
1114; SI-NEXT:    s_cmp_eq_u32 s10, 1
1115; SI-NEXT:    s_cselect_b32 s1, s11, s1
1116; SI-NEXT:    s_cmp_eq_u32 s10, 0
1117; SI-NEXT:    s_cselect_b32 s0, s11, s0
1118; SI-NEXT:    v_mov_b32_e32 v0, s0
1119; SI-NEXT:    v_mov_b32_e32 v1, s1
1120; SI-NEXT:    v_mov_b32_e32 v2, s2
1121; SI-NEXT:    v_mov_b32_e32 v3, s3
1122; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
1123; SI-NEXT:    s_endpgm
1124;
1125; VI-LABEL: dynamic_insertelement_v4i32:
1126; VI:       ; %bb.0:
1127; VI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x10
1128; VI-NEXT:    s_load_dword s10, s[8:9], 0x20
1129; VI-NEXT:    s_load_dword s11, s[8:9], 0x44
1130; VI-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
1131; VI-NEXT:    s_mov_b32 s7, 0x1100f000
1132; VI-NEXT:    s_mov_b32 s6, -1
1133; VI-NEXT:    s_waitcnt lgkmcnt(0)
1134; VI-NEXT:    s_cmp_eq_u32 s10, 3
1135; VI-NEXT:    s_cselect_b32 s3, s11, s3
1136; VI-NEXT:    s_cmp_eq_u32 s10, 2
1137; VI-NEXT:    s_cselect_b32 s2, s11, s2
1138; VI-NEXT:    s_cmp_eq_u32 s10, 1
1139; VI-NEXT:    s_cselect_b32 s1, s11, s1
1140; VI-NEXT:    s_cmp_eq_u32 s10, 0
1141; VI-NEXT:    s_cselect_b32 s0, s11, s0
1142; VI-NEXT:    v_mov_b32_e32 v0, s0
1143; VI-NEXT:    v_mov_b32_e32 v1, s1
1144; VI-NEXT:    v_mov_b32_e32 v2, s2
1145; VI-NEXT:    v_mov_b32_e32 v3, s3
1146; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
1147; VI-NEXT:    s_endpgm
1148  %vecins = insertelement <4 x i32> %a, i32 %val, i32 %b
1149  store <4 x i32> %vecins, ptr addrspace(1) %out, align 16
1150  ret void
1151}
1152
; Dynamic-index insert into a <8 x i32> kernel argument: the element of %a
; selected by %b is replaced with the constant 5 and the vector is stored to
; %out.
; Both expectations copy the vector into v0-v7, move a loaded scalar into m0
; and issue a single v_movreld_b32 whose source is the inline constant 5 (no
; extra VGPR is set up for the value). The result is written back with two
; dwordx4 stores.
; The two targets differ in register assignment: the SI lines load the vector
; into s[0:7] and store through s[12:15], while the VI lines load it into
; s[12:19] and store through s[0:3].
1153define amdgpu_kernel void @dynamic_insertelement_v8i32(ptr addrspace(1) %out, <8 x i32> %a, i32 %b) nounwind {
1154; SI-LABEL: dynamic_insertelement_v8i32:
1155; SI:       ; %bb.0:
1156; SI-NEXT:    s_load_dwordx8 s[0:7], s[8:9], 0x8
1157; SI-NEXT:    s_load_dwordx2 s[12:13], s[8:9], 0x0
1158; SI-NEXT:    s_load_dword s8, s[8:9], 0x10
1159; SI-NEXT:    s_mov_b32 s15, 0x100f000
1160; SI-NEXT:    s_mov_b32 s14, -1
1161; SI-NEXT:    s_waitcnt lgkmcnt(0)
1162; SI-NEXT:    v_mov_b32_e32 v0, s0
1163; SI-NEXT:    v_mov_b32_e32 v1, s1
1164; SI-NEXT:    v_mov_b32_e32 v2, s2
1165; SI-NEXT:    v_mov_b32_e32 v3, s3
1166; SI-NEXT:    v_mov_b32_e32 v4, s4
1167; SI-NEXT:    v_mov_b32_e32 v5, s5
1168; SI-NEXT:    v_mov_b32_e32 v6, s6
1169; SI-NEXT:    v_mov_b32_e32 v7, s7
1170; SI-NEXT:    s_mov_b32 m0, s8
1171; SI-NEXT:    v_movreld_b32_e32 v0, 5
1172; SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[12:15], 0 offset:16
1173; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[12:15], 0
1174; SI-NEXT:    s_endpgm
1175;
1176; VI-LABEL: dynamic_insertelement_v8i32:
1177; VI:       ; %bb.0:
1178; VI-NEXT:    s_load_dwordx8 s[12:19], s[8:9], 0x20
1179; VI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
1180; VI-NEXT:    s_load_dword s4, s[8:9], 0x40
1181; VI-NEXT:    s_mov_b32 s3, 0x1100f000
1182; VI-NEXT:    s_mov_b32 s2, -1
1183; VI-NEXT:    s_waitcnt lgkmcnt(0)
1184; VI-NEXT:    v_mov_b32_e32 v0, s12
1185; VI-NEXT:    v_mov_b32_e32 v1, s13
1186; VI-NEXT:    v_mov_b32_e32 v2, s14
1187; VI-NEXT:    v_mov_b32_e32 v3, s15
1188; VI-NEXT:    v_mov_b32_e32 v4, s16
1189; VI-NEXT:    v_mov_b32_e32 v5, s17
1190; VI-NEXT:    v_mov_b32_e32 v6, s18
1191; VI-NEXT:    v_mov_b32_e32 v7, s19
1192; VI-NEXT:    s_mov_b32 m0, s4
1193; VI-NEXT:    v_movreld_b32_e32 v0, 5
1194; VI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
1195; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
1196; VI-NEXT:    s_endpgm
1197  %vecins = insertelement <8 x i32> %a, i32 5, i32 %b
1198  store <8 x i32> %vecins, ptr addrspace(1) %out, align 32
1199  ret void
1200}
1201
; Dynamic-index insert into a <9 x i32> kernel argument: the element of %a
; selected by %b is replaced with the constant 5 and the vector is stored to
; %out.
; Both expectations load the first eight elements with one dwordx8 load and
; the ninth with a separate dword load, copy everything into v0-v8, move a
; loaded scalar into m0 and issue a single v_movreld_b32 with the inline
; constant 5. The result is written back as one dword store (offset 32) plus
; two dwordx4 stores.
1202define amdgpu_kernel void @dynamic_insertelement_v9i32(ptr addrspace(1) %out, <9 x i32> %a, i32 %b) nounwind {
1203; SI-LABEL: dynamic_insertelement_v9i32:
1204; SI:       ; %bb.0:
1205; SI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
1206; SI-NEXT:    s_load_dwordx8 s[12:19], s[8:9], 0x10
1207; SI-NEXT:    s_load_dword s4, s[8:9], 0x18
1208; SI-NEXT:    s_load_dword s5, s[8:9], 0x20
1209; SI-NEXT:    s_mov_b32 s3, 0x100f000
1210; SI-NEXT:    s_mov_b32 s2, -1
1211; SI-NEXT:    s_waitcnt lgkmcnt(0)
1212; SI-NEXT:    v_mov_b32_e32 v0, s12
1213; SI-NEXT:    v_mov_b32_e32 v1, s13
1214; SI-NEXT:    v_mov_b32_e32 v2, s14
1215; SI-NEXT:    v_mov_b32_e32 v3, s15
1216; SI-NEXT:    v_mov_b32_e32 v4, s16
1217; SI-NEXT:    v_mov_b32_e32 v5, s17
1218; SI-NEXT:    v_mov_b32_e32 v6, s18
1219; SI-NEXT:    v_mov_b32_e32 v7, s19
1220; SI-NEXT:    v_mov_b32_e32 v8, s4
1221; SI-NEXT:    s_mov_b32 m0, s5
1222; SI-NEXT:    v_movreld_b32_e32 v0, 5
1223; SI-NEXT:    buffer_store_dword v8, off, s[0:3], 0 offset:32
1224; SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
1225; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
1226; SI-NEXT:    s_endpgm
1227;
1228; VI-LABEL: dynamic_insertelement_v9i32:
1229; VI:       ; %bb.0:
1230; VI-NEXT:    s_load_dwordx8 s[12:19], s[8:9], 0x40
1231; VI-NEXT:    s_load_dword s4, s[8:9], 0x60
1232; VI-NEXT:    s_load_dword s5, s[8:9], 0x80
1233; VI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
1234; VI-NEXT:    s_mov_b32 s3, 0x1100f000
1235; VI-NEXT:    s_mov_b32 s2, -1
1236; VI-NEXT:    s_waitcnt lgkmcnt(0)
1237; VI-NEXT:    v_mov_b32_e32 v0, s12
1238; VI-NEXT:    v_mov_b32_e32 v1, s13
1239; VI-NEXT:    v_mov_b32_e32 v2, s14
1240; VI-NEXT:    v_mov_b32_e32 v3, s15
1241; VI-NEXT:    v_mov_b32_e32 v4, s16
1242; VI-NEXT:    v_mov_b32_e32 v5, s17
1243; VI-NEXT:    v_mov_b32_e32 v6, s18
1244; VI-NEXT:    v_mov_b32_e32 v7, s19
1245; VI-NEXT:    v_mov_b32_e32 v8, s4
1246; VI-NEXT:    s_mov_b32 m0, s5
1247; VI-NEXT:    v_movreld_b32_e32 v0, 5
1248; VI-NEXT:    buffer_store_dword v8, off, s[0:3], 0 offset:32
1249; VI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
1250; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
1251; VI-NEXT:    s_endpgm
1252  %vecins = insertelement <9 x i32> %a, i32 5, i32 %b
1253  store <9 x i32> %vecins, ptr addrspace(1) %out, align 32
1254  ret void
1255}
1256
; Dynamic-index insert into a <10 x i32> kernel argument: the element of %a
; selected by %b is replaced with the constant 5 and the vector is stored to
; %out.
; Both expectations load the vector with one dwordx8 and one dwordx2 scalar
; load, copy it into v0-v9, move a loaded scalar into m0 and issue a single
; v_movreld_b32 with the inline constant 5. The result is written back with
; two dwordx4 stores and one dwordx2 store.
1257define amdgpu_kernel void @dynamic_insertelement_v10i32(ptr addrspace(1) %out, <10 x i32> %a, i32 %b) nounwind {
1258; SI-LABEL: dynamic_insertelement_v10i32:
1259; SI:       ; %bb.0:
1260; SI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
1261; SI-NEXT:    s_load_dwordx8 s[12:19], s[8:9], 0x10
1262; SI-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x18
1263; SI-NEXT:    s_load_dword s6, s[8:9], 0x20
1264; SI-NEXT:    s_mov_b32 s3, 0x100f000
1265; SI-NEXT:    s_mov_b32 s2, -1
1266; SI-NEXT:    s_waitcnt lgkmcnt(0)
1267; SI-NEXT:    v_mov_b32_e32 v0, s12
1268; SI-NEXT:    v_mov_b32_e32 v1, s13
1269; SI-NEXT:    v_mov_b32_e32 v2, s14
1270; SI-NEXT:    v_mov_b32_e32 v3, s15
1271; SI-NEXT:    v_mov_b32_e32 v4, s16
1272; SI-NEXT:    v_mov_b32_e32 v5, s17
1273; SI-NEXT:    v_mov_b32_e32 v6, s18
1274; SI-NEXT:    v_mov_b32_e32 v7, s19
1275; SI-NEXT:    v_mov_b32_e32 v8, s4
1276; SI-NEXT:    v_mov_b32_e32 v9, s5
1277; SI-NEXT:    s_mov_b32 m0, s6
1278; SI-NEXT:    v_movreld_b32_e32 v0, 5
1279; SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
1280; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
1281; SI-NEXT:    buffer_store_dwordx2 v[8:9], off, s[0:3], 0 offset:32
1282; SI-NEXT:    s_endpgm
1283;
1284; VI-LABEL: dynamic_insertelement_v10i32:
1285; VI:       ; %bb.0:
1286; VI-NEXT:    s_load_dwordx8 s[12:19], s[8:9], 0x40
1287; VI-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x60
1288; VI-NEXT:    s_load_dword s6, s[8:9], 0x80
1289; VI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
1290; VI-NEXT:    s_mov_b32 s3, 0x1100f000
1291; VI-NEXT:    s_waitcnt lgkmcnt(0)
1292; VI-NEXT:    v_mov_b32_e32 v0, s12
1293; VI-NEXT:    v_mov_b32_e32 v1, s13
1294; VI-NEXT:    v_mov_b32_e32 v2, s14
1295; VI-NEXT:    v_mov_b32_e32 v3, s15
1296; VI-NEXT:    v_mov_b32_e32 v4, s16
1297; VI-NEXT:    v_mov_b32_e32 v5, s17
1298; VI-NEXT:    v_mov_b32_e32 v6, s18
1299; VI-NEXT:    v_mov_b32_e32 v7, s19
1300; VI-NEXT:    v_mov_b32_e32 v8, s4
1301; VI-NEXT:    v_mov_b32_e32 v9, s5
1302; VI-NEXT:    s_mov_b32 m0, s6
1303; VI-NEXT:    s_mov_b32 s2, -1
1304; VI-NEXT:    v_movreld_b32_e32 v0, 5
1305; VI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
1306; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
1307; VI-NEXT:    buffer_store_dwordx2 v[8:9], off, s[0:3], 0 offset:32
1308; VI-NEXT:    s_endpgm
1309  %vecins = insertelement <10 x i32> %a, i32 5, i32 %b
1310  store <10 x i32> %vecins, ptr addrspace(1) %out, align 32
1311  ret void
1312}
1313
; Dynamic-index insert into a <11 x i32> kernel argument: the element of %a
; selected by %b is replaced with the constant 5 and the vector is stored to
; %out.
; In both expectations the tail of the vector is fetched with a dwordx4 load
; into s[4:7], and s7 is then overwritten by a separate dword load; that is
; why two s_waitcnt lgkmcnt(0) instructions appear, one before the reload of
; s7 and one before s7 is copied into m0.
; The insert is a single v_movreld_b32 with the inline constant 5, and the
; result is stored as two dwordx4 stores and one dwordx3 store.
1314define amdgpu_kernel void @dynamic_insertelement_v11i32(ptr addrspace(1) %out, <11 x i32> %a, i32 %b) nounwind {
1315; SI-LABEL: dynamic_insertelement_v11i32:
1316; SI:       ; %bb.0:
1317; SI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
1318; SI-NEXT:    s_load_dwordx8 s[12:19], s[8:9], 0x10
1319; SI-NEXT:    s_load_dwordx4 s[4:7], s[8:9], 0x18
1320; SI-NEXT:    s_waitcnt lgkmcnt(0)
1321; SI-NEXT:    s_load_dword s7, s[8:9], 0x20
1322; SI-NEXT:    s_mov_b32 s3, 0x100f000
1323; SI-NEXT:    s_mov_b32 s2, -1
1324; SI-NEXT:    v_mov_b32_e32 v0, s12
1325; SI-NEXT:    v_mov_b32_e32 v1, s13
1326; SI-NEXT:    v_mov_b32_e32 v2, s14
1327; SI-NEXT:    v_mov_b32_e32 v3, s15
1328; SI-NEXT:    v_mov_b32_e32 v4, s16
1329; SI-NEXT:    v_mov_b32_e32 v5, s17
1330; SI-NEXT:    v_mov_b32_e32 v6, s18
1331; SI-NEXT:    v_mov_b32_e32 v7, s19
1332; SI-NEXT:    v_mov_b32_e32 v8, s4
1333; SI-NEXT:    v_mov_b32_e32 v9, s5
1334; SI-NEXT:    v_mov_b32_e32 v10, s6
1335; SI-NEXT:    s_waitcnt lgkmcnt(0)
1336; SI-NEXT:    s_mov_b32 m0, s7
1337; SI-NEXT:    v_movreld_b32_e32 v0, 5
1338; SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
1339; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
1340; SI-NEXT:    buffer_store_dwordx3 v[8:10], off, s[0:3], 0 offset:32
1341; SI-NEXT:    s_endpgm
1342;
1343; VI-LABEL: dynamic_insertelement_v11i32:
1344; VI:       ; %bb.0:
1345; VI-NEXT:    s_load_dwordx4 s[4:7], s[8:9], 0x60
1346; VI-NEXT:    s_load_dwordx8 s[12:19], s[8:9], 0x40
1347; VI-NEXT:    s_waitcnt lgkmcnt(0)
1348; VI-NEXT:    s_load_dword s7, s[8:9], 0x80
1349; VI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
1350; VI-NEXT:    s_mov_b32 s3, 0x1100f000
1351; VI-NEXT:    v_mov_b32_e32 v8, s4
1352; VI-NEXT:    v_mov_b32_e32 v0, s12
1353; VI-NEXT:    v_mov_b32_e32 v1, s13
1354; VI-NEXT:    v_mov_b32_e32 v2, s14
1355; VI-NEXT:    v_mov_b32_e32 v3, s15
1356; VI-NEXT:    v_mov_b32_e32 v4, s16
1357; VI-NEXT:    v_mov_b32_e32 v5, s17
1358; VI-NEXT:    v_mov_b32_e32 v6, s18
1359; VI-NEXT:    v_mov_b32_e32 v7, s19
1360; VI-NEXT:    v_mov_b32_e32 v9, s5
1361; VI-NEXT:    v_mov_b32_e32 v10, s6
1362; VI-NEXT:    s_waitcnt lgkmcnt(0)
1363; VI-NEXT:    s_mov_b32 m0, s7
1364; VI-NEXT:    s_mov_b32 s2, -1
1365; VI-NEXT:    v_movreld_b32_e32 v0, 5
1366; VI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
1367; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
1368; VI-NEXT:    buffer_store_dwordx3 v[8:10], off, s[0:3], 0 offset:32
1369; VI-NEXT:    s_endpgm
1370  %vecins = insertelement <11 x i32> %a, i32 5, i32 %b
1371  store <11 x i32> %vecins, ptr addrspace(1) %out, align 32
1372  ret void
1373}
1374
; Dynamic-index insert into a <12 x i32> kernel argument: the element of %a
; selected by %b is replaced with the constant 5 and the vector is stored to
; %out.
; Both expectations load the vector with one dwordx8 and one dwordx4 scalar
; load, copy it into v0-v11, move a loaded scalar into m0 and issue a single
; v_movreld_b32 with the inline constant 5. The result is written back with
; three dwordx4 stores.
1375define amdgpu_kernel void @dynamic_insertelement_v12i32(ptr addrspace(1) %out, <12 x i32> %a, i32 %b) nounwind {
1376; SI-LABEL: dynamic_insertelement_v12i32:
1377; SI:       ; %bb.0:
1378; SI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
1379; SI-NEXT:    s_load_dwordx8 s[12:19], s[8:9], 0x10
1380; SI-NEXT:    s_load_dwordx4 s[4:7], s[8:9], 0x18
1381; SI-NEXT:    s_load_dword s8, s[8:9], 0x20
1382; SI-NEXT:    s_mov_b32 s3, 0x100f000
1383; SI-NEXT:    s_mov_b32 s2, -1
1384; SI-NEXT:    s_waitcnt lgkmcnt(0)
1385; SI-NEXT:    v_mov_b32_e32 v0, s12
1386; SI-NEXT:    v_mov_b32_e32 v1, s13
1387; SI-NEXT:    v_mov_b32_e32 v2, s14
1388; SI-NEXT:    v_mov_b32_e32 v3, s15
1389; SI-NEXT:    v_mov_b32_e32 v4, s16
1390; SI-NEXT:    v_mov_b32_e32 v5, s17
1391; SI-NEXT:    v_mov_b32_e32 v6, s18
1392; SI-NEXT:    v_mov_b32_e32 v7, s19
1393; SI-NEXT:    v_mov_b32_e32 v8, s4
1394; SI-NEXT:    v_mov_b32_e32 v9, s5
1395; SI-NEXT:    v_mov_b32_e32 v10, s6
1396; SI-NEXT:    v_mov_b32_e32 v11, s7
1397; SI-NEXT:    s_mov_b32 m0, s8
1398; SI-NEXT:    v_movreld_b32_e32 v0, 5
1399; SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
1400; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
1401; SI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32
1402; SI-NEXT:    s_endpgm
1403;
1404; VI-LABEL: dynamic_insertelement_v12i32:
1405; VI:       ; %bb.0:
1406; VI-NEXT:    s_load_dwordx8 s[12:19], s[8:9], 0x40
1407; VI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
1408; VI-NEXT:    s_load_dwordx4 s[4:7], s[8:9], 0x60
1409; VI-NEXT:    s_load_dword s8, s[8:9], 0x80
1410; VI-NEXT:    s_mov_b32 s3, 0x1100f000
1411; VI-NEXT:    s_waitcnt lgkmcnt(0)
1412; VI-NEXT:    v_mov_b32_e32 v0, s12
1413; VI-NEXT:    v_mov_b32_e32 v1, s13
1414; VI-NEXT:    v_mov_b32_e32 v2, s14
1415; VI-NEXT:    v_mov_b32_e32 v3, s15
1416; VI-NEXT:    v_mov_b32_e32 v4, s16
1417; VI-NEXT:    v_mov_b32_e32 v5, s17
1418; VI-NEXT:    v_mov_b32_e32 v6, s18
1419; VI-NEXT:    v_mov_b32_e32 v7, s19
1420; VI-NEXT:    v_mov_b32_e32 v8, s4
1421; VI-NEXT:    v_mov_b32_e32 v9, s5
1422; VI-NEXT:    v_mov_b32_e32 v10, s6
1423; VI-NEXT:    v_mov_b32_e32 v11, s7
1424; VI-NEXT:    s_mov_b32 m0, s8
1425; VI-NEXT:    s_mov_b32 s2, -1
1426; VI-NEXT:    v_movreld_b32_e32 v0, 5
1427; VI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
1428; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
1429; VI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32
1430; VI-NEXT:    s_endpgm
1431  %vecins = insertelement <12 x i32> %a, i32 5, i32 %b
1432  store <12 x i32> %vecins, ptr addrspace(1) %out, align 32
1433  ret void
1434}
1435
; Dynamic-index insert into a <16 x i32> kernel argument: the element of %a
; selected by %b is replaced with the constant 5 and the vector is stored to
; %out (the store is declared align 64).
; Both expectations fetch the vector with a single s_load_dwordx16, copy it
; into v0-v15, move a loaded scalar into m0 and issue a single v_movreld_b32
; with the inline constant 5. The result is written back with four dwordx4
; stores.
1436define amdgpu_kernel void @dynamic_insertelement_v16i32(ptr addrspace(1) %out, <16 x i32> %a, i32 %b) nounwind {
1437; SI-LABEL: dynamic_insertelement_v16i32:
1438; SI:       ; %bb.0:
1439; SI-NEXT:    s_load_dwordx16 s[12:27], s[8:9], 0x10
1440; SI-NEXT:    s_load_dword s4, s[8:9], 0x20
1441; SI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
1442; SI-NEXT:    s_mov_b32 s3, 0x100f000
1443; SI-NEXT:    s_mov_b32 s2, -1
1444; SI-NEXT:    s_waitcnt lgkmcnt(0)
1445; SI-NEXT:    v_mov_b32_e32 v0, s12
1446; SI-NEXT:    v_mov_b32_e32 v1, s13
1447; SI-NEXT:    v_mov_b32_e32 v2, s14
1448; SI-NEXT:    v_mov_b32_e32 v3, s15
1449; SI-NEXT:    v_mov_b32_e32 v4, s16
1450; SI-NEXT:    v_mov_b32_e32 v5, s17
1451; SI-NEXT:    v_mov_b32_e32 v6, s18
1452; SI-NEXT:    v_mov_b32_e32 v7, s19
1453; SI-NEXT:    v_mov_b32_e32 v8, s20
1454; SI-NEXT:    v_mov_b32_e32 v9, s21
1455; SI-NEXT:    v_mov_b32_e32 v10, s22
1456; SI-NEXT:    v_mov_b32_e32 v11, s23
1457; SI-NEXT:    v_mov_b32_e32 v12, s24
1458; SI-NEXT:    v_mov_b32_e32 v13, s25
1459; SI-NEXT:    v_mov_b32_e32 v14, s26
1460; SI-NEXT:    v_mov_b32_e32 v15, s27
1461; SI-NEXT:    s_mov_b32 m0, s4
1462; SI-NEXT:    v_movreld_b32_e32 v0, 5
1463; SI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48
1464; SI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32
1465; SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
1466; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
1467; SI-NEXT:    s_endpgm
1468;
1469; VI-LABEL: dynamic_insertelement_v16i32:
1470; VI:       ; %bb.0:
1471; VI-NEXT:    s_load_dwordx16 s[12:27], s[8:9], 0x40
1472; VI-NEXT:    s_load_dword s4, s[8:9], 0x80
1473; VI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
1474; VI-NEXT:    s_mov_b32 s3, 0x1100f000
1475; VI-NEXT:    s_mov_b32 s2, -1
1476; VI-NEXT:    s_waitcnt lgkmcnt(0)
1477; VI-NEXT:    v_mov_b32_e32 v0, s12
1478; VI-NEXT:    v_mov_b32_e32 v1, s13
1479; VI-NEXT:    v_mov_b32_e32 v2, s14
1480; VI-NEXT:    v_mov_b32_e32 v3, s15
1481; VI-NEXT:    v_mov_b32_e32 v4, s16
1482; VI-NEXT:    v_mov_b32_e32 v5, s17
1483; VI-NEXT:    v_mov_b32_e32 v6, s18
1484; VI-NEXT:    v_mov_b32_e32 v7, s19
1485; VI-NEXT:    v_mov_b32_e32 v8, s20
1486; VI-NEXT:    v_mov_b32_e32 v9, s21
1487; VI-NEXT:    v_mov_b32_e32 v10, s22
1488; VI-NEXT:    v_mov_b32_e32 v11, s23
1489; VI-NEXT:    v_mov_b32_e32 v12, s24
1490; VI-NEXT:    v_mov_b32_e32 v13, s25
1491; VI-NEXT:    v_mov_b32_e32 v14, s26
1492; VI-NEXT:    v_mov_b32_e32 v15, s27
1493; VI-NEXT:    s_mov_b32 m0, s4
1494; VI-NEXT:    v_movreld_b32_e32 v0, 5
1495; VI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48
1496; VI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32
1497; VI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
1498; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
1499; VI-NEXT:    s_endpgm
1500  %vecins = insertelement <16 x i32> %a, i32 5, i32 %b
1501  store <16 x i32> %vecins, ptr addrspace(1) %out, align 64
1502  ret void
1503}
1504
; Dynamic-index insert into a <2 x i16> kernel argument: the element of %a
; selected by %b is replaced with the constant 5 and the vector is stored to
; %out.
; Both expectations handle the packed vector as one 32-bit scalar and do the
; insert with bit operations only: s3 is shifted left by 4 (a multiply by 16),
; that amount is used to shift the 0xffff lane mask into place, s_andn2_b32
; clears the selected lane in s2, s_and_b32 keeps the matching lane of
; 0x50005 (the value 5 in both 16-bit halves), and s_or_b32 merges the two.
; The result is written with a single dword store.
1505define amdgpu_kernel void @dynamic_insertelement_v2i16(ptr addrspace(1) %out, <2 x i16> %a, i32 %b) nounwind {
1506; SI-LABEL: dynamic_insertelement_v2i16:
1507; SI:       ; %bb.0:
1508; SI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
1509; SI-NEXT:    s_mov_b32 s7, 0x100f000
1510; SI-NEXT:    s_mov_b32 s6, -1
1511; SI-NEXT:    s_waitcnt lgkmcnt(0)
1512; SI-NEXT:    s_mov_b32 s4, s0
1513; SI-NEXT:    s_lshl_b32 s0, s3, 4
1514; SI-NEXT:    s_lshl_b32 s0, 0xffff, s0
1515; SI-NEXT:    s_mov_b32 s5, s1
1516; SI-NEXT:    s_andn2_b32 s1, s2, s0
1517; SI-NEXT:    s_and_b32 s0, s0, 0x50005
1518; SI-NEXT:    s_or_b32 s0, s0, s1
1519; SI-NEXT:    v_mov_b32_e32 v0, s0
1520; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1521; SI-NEXT:    s_endpgm
1522;
1523; VI-LABEL: dynamic_insertelement_v2i16:
1524; VI:       ; %bb.0:
1525; VI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
1526; VI-NEXT:    s_mov_b32 s7, 0x1100f000
1527; VI-NEXT:    s_mov_b32 s6, -1
1528; VI-NEXT:    s_waitcnt lgkmcnt(0)
1529; VI-NEXT:    s_mov_b32 s4, s0
1530; VI-NEXT:    s_lshl_b32 s0, s3, 4
1531; VI-NEXT:    s_lshl_b32 s0, 0xffff, s0
1532; VI-NEXT:    s_mov_b32 s5, s1
1533; VI-NEXT:    s_andn2_b32 s1, s2, s0
1534; VI-NEXT:    s_and_b32 s0, s0, 0x50005
1535; VI-NEXT:    s_or_b32 s0, s0, s1
1536; VI-NEXT:    v_mov_b32_e32 v0, s0
1537; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1538; VI-NEXT:    s_endpgm
1539  %vecins = insertelement <2 x i16> %a, i16 5, i32 %b
1540  store <2 x i16> %vecins, ptr addrspace(1) %out, align 8
1541  ret void
1542}
1543
; Insert the constant i16 5 into <3 x i16> %a at the runtime index %b and
; store the result to %out.
; Expected code on both targets uses a 64-bit lane mask (s_lshl_b64 of 0xffff
; by index * 16) and writes the 6-byte result as a short at offset 4 followed
; by a dword at offset 0. The targets differ only in how 0x50005 is applied:
; kaveri uses two 32-bit ands with the literal, tonga copies the literal into
; an SGPR pair and uses one 64-bit and.
1544define amdgpu_kernel void @dynamic_insertelement_v3i16(ptr addrspace(1) %out, <3 x i16> %a, i32 %b) nounwind {
1545; SI-LABEL: dynamic_insertelement_v3i16:
1546; SI:       ; %bb.0:
1547; SI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
1548; SI-NEXT:    s_load_dword s8, s[8:9], 0x4
1549; SI-NEXT:    s_mov_b32 s7, 0x100f000
1550; SI-NEXT:    s_mov_b32 s6, -1
1551; SI-NEXT:    s_waitcnt lgkmcnt(0)
1552; SI-NEXT:    s_mov_b32 s4, s0
1553; SI-NEXT:    s_lshl_b32 s0, s8, 4
1554; SI-NEXT:    s_mov_b32 s5, s1
1555; SI-NEXT:    s_lshl_b64 s[0:1], 0xffff, s0
1556; SI-NEXT:    s_and_b32 s9, s1, 0x50005
1557; SI-NEXT:    s_and_b32 s8, s0, 0x50005
1558; SI-NEXT:    s_andn2_b64 s[0:1], s[2:3], s[0:1]
1559; SI-NEXT:    s_or_b64 s[0:1], s[8:9], s[0:1]
1560; SI-NEXT:    v_mov_b32_e32 v0, s1
1561; SI-NEXT:    buffer_store_short v0, off, s[4:7], 0 offset:4
1562; SI-NEXT:    v_mov_b32_e32 v0, s0
1563; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1564; SI-NEXT:    s_endpgm
1565;
1566; VI-LABEL: dynamic_insertelement_v3i16:
1567; VI:       ; %bb.0:
1568; VI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
1569; VI-NEXT:    s_load_dword s8, s[8:9], 0x10
1570; VI-NEXT:    s_mov_b32 s7, 0x1100f000
1571; VI-NEXT:    s_mov_b32 s6, -1
1572; VI-NEXT:    s_waitcnt lgkmcnt(0)
1573; VI-NEXT:    s_mov_b32 s4, s0
1574; VI-NEXT:    s_lshl_b32 s0, s8, 4
1575; VI-NEXT:    s_mov_b32 s8, 0x50005
1576; VI-NEXT:    s_mov_b32 s5, s1
1577; VI-NEXT:    s_lshl_b64 s[0:1], 0xffff, s0
1578; VI-NEXT:    s_mov_b32 s9, s8
1579; VI-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[0:1]
1580; VI-NEXT:    s_and_b64 s[0:1], s[0:1], s[8:9]
1581; VI-NEXT:    s_or_b64 s[0:1], s[0:1], s[2:3]
1582; VI-NEXT:    v_mov_b32_e32 v0, s1
1583; VI-NEXT:    buffer_store_short v0, off, s[4:7], 0 offset:4
1584; VI-NEXT:    v_mov_b32_e32 v0, s0
1585; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1586; VI-NEXT:    s_endpgm
1587  %vecins = insertelement <3 x i16> %a, i16 5, i32 %b
1588  store <3 x i16> %vecins, ptr addrspace(1) %out, align 8
1589  ret void
1590}
1591
; Insert the constant i8 5 into <2 x i8> %a at the runtime index %b and store
; the result to %out. Unnamed [8 x i32] arguments separate %out, %a and %b in
; the kernel argument list (presumably so each one is fetched by its own
; scalar load, as the three s_load instructions in the checks suggest).
; Expected code: index * 8 (shift left by 3) positions a 0xff lane mask and
; the splat 0x505 is merged under it before a short store. kaveri clears the
; lane with s_andn2_b32; tonga instead inverts the mask with an xor against
; 0xffff and then ands.
1592define amdgpu_kernel void @dynamic_insertelement_v2i8(ptr addrspace(1) %out, [8 x i32], <2 x i8> %a, [8 x i32], i32 %b) nounwind {
1593; SI-LABEL: dynamic_insertelement_v2i8:
1594; SI:       ; %bb.0:
1595; SI-NEXT:    s_load_dword s4, s[8:9], 0x13
1596; SI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
1597; SI-NEXT:    s_load_dword s5, s[8:9], 0xa
1598; SI-NEXT:    s_mov_b32 s3, 0x100f000
1599; SI-NEXT:    s_mov_b32 s2, -1
1600; SI-NEXT:    s_waitcnt lgkmcnt(0)
1601; SI-NEXT:    s_lshl_b32 s4, s4, 3
1602; SI-NEXT:    s_lshl_b32 s4, 0xff, s4
1603; SI-NEXT:    s_andn2_b32 s5, s5, s4
1604; SI-NEXT:    s_and_b32 s4, s4, 0x505
1605; SI-NEXT:    s_or_b32 s4, s4, s5
1606; SI-NEXT:    v_mov_b32_e32 v0, s4
1607; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
1608; SI-NEXT:    s_endpgm
1609;
1610; VI-LABEL: dynamic_insertelement_v2i8:
1611; VI:       ; %bb.0:
1612; VI-NEXT:    s_load_dword s4, s[8:9], 0x4c
1613; VI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
1614; VI-NEXT:    s_load_dword s5, s[8:9], 0x28
1615; VI-NEXT:    s_mov_b32 s3, 0x1100f000
1616; VI-NEXT:    s_mov_b32 s2, -1
1617; VI-NEXT:    s_waitcnt lgkmcnt(0)
1618; VI-NEXT:    s_lshl_b32 s4, s4, 3
1619; VI-NEXT:    s_lshl_b32 s4, 0xff, s4
1620; VI-NEXT:    s_and_b32 s6, s4, 0x505
1621; VI-NEXT:    s_xor_b32 s4, s4, 0xffff
1622; VI-NEXT:    s_and_b32 s4, s4, s5
1623; VI-NEXT:    s_or_b32 s4, s6, s4
1624; VI-NEXT:    v_mov_b32_e32 v0, s4
1625; VI-NEXT:    buffer_store_short v0, off, s[0:3], 0
1626; VI-NEXT:    s_endpgm
1627  %vecins = insertelement <2 x i8> %a, i8 5, i32 %b
1628  store <2 x i8> %vecins, ptr addrspace(1) %out, align 8
1629  ret void
1630}
1631
1632; FIXME: post legalize i16 and i32 shifts aren't merged because of
1633; isTypeDesirableForOp in SimplifyDemandedBits
; Insert the constant i8 5 into <3 x i8> %a at the runtime index %b and store
; the result to %out. Unnamed [8 x i32] arguments separate the named kernel
; arguments.
; Expected code is identical on both targets apart from the buffer descriptor
; word: a 0xff lane mask shifted by index * 8 merges the splat 0x5050505, and
; the 3-byte result is written as a short at offset 0 plus a byte (bits 16 and
; up, obtained with s_lshr_b32 by 16) at offset 2.
1634define amdgpu_kernel void @dynamic_insertelement_v3i8(ptr addrspace(1) %out, [8 x i32], <3 x i8> %a, [8 x i32], i32 %b) nounwind {
1635; SI-LABEL: dynamic_insertelement_v3i8:
1636; SI:       ; %bb.0:
1637; SI-NEXT:    s_load_dword s4, s[8:9], 0x13
1638; SI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
1639; SI-NEXT:    s_load_dword s5, s[8:9], 0xa
1640; SI-NEXT:    s_mov_b32 s3, 0x100f000
1641; SI-NEXT:    s_mov_b32 s2, -1
1642; SI-NEXT:    s_waitcnt lgkmcnt(0)
1643; SI-NEXT:    s_lshl_b32 s4, s4, 3
1644; SI-NEXT:    s_lshl_b32 s4, 0xff, s4
1645; SI-NEXT:    s_andn2_b32 s5, s5, s4
1646; SI-NEXT:    s_and_b32 s4, s4, 0x5050505
1647; SI-NEXT:    s_or_b32 s4, s4, s5
1648; SI-NEXT:    s_lshr_b32 s5, s4, 16
1649; SI-NEXT:    v_mov_b32_e32 v0, s4
1650; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
1651; SI-NEXT:    v_mov_b32_e32 v0, s5
1652; SI-NEXT:    buffer_store_byte v0, off, s[0:3], 0 offset:2
1653; SI-NEXT:    s_endpgm
1654;
1655; VI-LABEL: dynamic_insertelement_v3i8:
1656; VI:       ; %bb.0:
1657; VI-NEXT:    s_load_dword s4, s[8:9], 0x4c
1658; VI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
1659; VI-NEXT:    s_load_dword s5, s[8:9], 0x28
1660; VI-NEXT:    s_mov_b32 s3, 0x1100f000
1661; VI-NEXT:    s_mov_b32 s2, -1
1662; VI-NEXT:    s_waitcnt lgkmcnt(0)
1663; VI-NEXT:    s_lshl_b32 s4, s4, 3
1664; VI-NEXT:    s_lshl_b32 s4, 0xff, s4
1665; VI-NEXT:    s_andn2_b32 s5, s5, s4
1666; VI-NEXT:    s_and_b32 s4, s4, 0x5050505
1667; VI-NEXT:    s_or_b32 s4, s4, s5
1668; VI-NEXT:    s_lshr_b32 s5, s4, 16
1669; VI-NEXT:    v_mov_b32_e32 v0, s4
1670; VI-NEXT:    buffer_store_short v0, off, s[0:3], 0
1671; VI-NEXT:    v_mov_b32_e32 v0, s5
1672; VI-NEXT:    buffer_store_byte v0, off, s[0:3], 0 offset:2
1673; VI-NEXT:    s_endpgm
1674  %vecins = insertelement <3 x i8> %a, i8 5, i32 %b
1675  store <3 x i8> %vecins, ptr addrspace(1) %out, align 4
1676  ret void
1677}
1678
; Insert the constant i8 5 into <4 x i8> %a at the runtime index %b and store
; the result to %out. Unnamed [8 x i32] arguments separate the named kernel
; arguments.
; Expected code on both targets: a 0xff lane mask shifted by index * 8 merges
; the splat 0x5050505 into the packed vector, which is written with a single
; dword store.
1679define amdgpu_kernel void @dynamic_insertelement_v4i8(ptr addrspace(1) %out, [8 x i32], <4 x i8> %a, [8 x i32], i32 %b) nounwind {
1680; SI-LABEL: dynamic_insertelement_v4i8:
1681; SI:       ; %bb.0:
1682; SI-NEXT:    s_load_dword s4, s[8:9], 0x13
1683; SI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
1684; SI-NEXT:    s_load_dword s5, s[8:9], 0xa
1685; SI-NEXT:    s_mov_b32 s3, 0x100f000
1686; SI-NEXT:    s_mov_b32 s2, -1
1687; SI-NEXT:    s_waitcnt lgkmcnt(0)
1688; SI-NEXT:    s_lshl_b32 s4, s4, 3
1689; SI-NEXT:    s_lshl_b32 s4, 0xff, s4
1690; SI-NEXT:    s_andn2_b32 s5, s5, s4
1691; SI-NEXT:    s_and_b32 s4, s4, 0x5050505
1692; SI-NEXT:    s_or_b32 s4, s4, s5
1693; SI-NEXT:    v_mov_b32_e32 v0, s4
1694; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1695; SI-NEXT:    s_endpgm
1696;
1697; VI-LABEL: dynamic_insertelement_v4i8:
1698; VI:       ; %bb.0:
1699; VI-NEXT:    s_load_dword s4, s[8:9], 0x4c
1700; VI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
1701; VI-NEXT:    s_load_dword s5, s[8:9], 0x28
1702; VI-NEXT:    s_mov_b32 s3, 0x1100f000
1703; VI-NEXT:    s_mov_b32 s2, -1
1704; VI-NEXT:    s_waitcnt lgkmcnt(0)
1705; VI-NEXT:    s_lshl_b32 s4, s4, 3
1706; VI-NEXT:    s_lshl_b32 s4, 0xff, s4
1707; VI-NEXT:    s_andn2_b32 s5, s5, s4
1708; VI-NEXT:    s_and_b32 s4, s4, 0x5050505
1709; VI-NEXT:    s_or_b32 s4, s4, s5
1710; VI-NEXT:    v_mov_b32_e32 v0, s4
1711; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1712; VI-NEXT:    s_endpgm
1713  %vecins = insertelement <4 x i8> %a, i8 5, i32 %b
1714  store <4 x i8> %vecins, ptr addrspace(1) %out, align 4
1715  ret void
1716}
1717
; Load <8 x i8> through the addrspace(4) pointer %a.ptr, insert the constant
; i8 5 at the runtime index %b, and store the result to %out.
; Expected code on both targets: the vector is fetched with a scalar
; s_load_dwordx2, a 64-bit 0xff lane mask (s_lshl_b64 by index * 8) clears the
; target byte with s_andn2_b64, the splat 0x5050505 is and'ed into each mask
; half separately, and the merged pair is written with one dwordx2 store.
1718define amdgpu_kernel void @s_dynamic_insertelement_v8i8(ptr addrspace(1) %out, ptr addrspace(4) %a.ptr, i32 %b) nounwind {
1719; SI-LABEL: s_dynamic_insertelement_v8i8:
1720; SI:       ; %bb.0:
1721; SI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
1722; SI-NEXT:    s_load_dword s8, s[8:9], 0x4
1723; SI-NEXT:    s_mov_b32 s7, 0x100f000
1724; SI-NEXT:    s_mov_b32 s6, -1
1725; SI-NEXT:    s_waitcnt lgkmcnt(0)
1726; SI-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
1727; SI-NEXT:    s_mov_b32 s4, s0
1728; SI-NEXT:    s_lshl_b32 s0, s8, 3
1729; SI-NEXT:    s_mov_b32 s5, s1
1730; SI-NEXT:    s_lshl_b64 s[0:1], 0xff, s0
1731; SI-NEXT:    s_and_b32 s9, s1, 0x5050505
1732; SI-NEXT:    s_waitcnt lgkmcnt(0)
1733; SI-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[0:1]
1734; SI-NEXT:    s_and_b32 s8, s0, 0x5050505
1735; SI-NEXT:    s_or_b64 s[0:1], s[8:9], s[2:3]
1736; SI-NEXT:    v_mov_b32_e32 v0, s0
1737; SI-NEXT:    v_mov_b32_e32 v1, s1
1738; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1739; SI-NEXT:    s_endpgm
1740;
1741; VI-LABEL: s_dynamic_insertelement_v8i8:
1742; VI:       ; %bb.0:
1743; VI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
1744; VI-NEXT:    s_load_dword s8, s[8:9], 0x10
1745; VI-NEXT:    s_mov_b32 s7, 0x1100f000
1746; VI-NEXT:    s_mov_b32 s6, -1
1747; VI-NEXT:    s_waitcnt lgkmcnt(0)
1748; VI-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
1749; VI-NEXT:    s_mov_b32 s4, s0
1750; VI-NEXT:    s_lshl_b32 s0, s8, 3
1751; VI-NEXT:    s_mov_b32 s5, s1
1752; VI-NEXT:    s_lshl_b64 s[0:1], 0xff, s0
1753; VI-NEXT:    s_and_b32 s9, s1, 0x5050505
1754; VI-NEXT:    s_waitcnt lgkmcnt(0)
1755; VI-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[0:1]
1756; VI-NEXT:    s_and_b32 s8, s0, 0x5050505
1757; VI-NEXT:    s_or_b64 s[0:1], s[8:9], s[2:3]
1758; VI-NEXT:    v_mov_b32_e32 v0, s0
1759; VI-NEXT:    v_mov_b32_e32 v1, s1
1760; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1761; VI-NEXT:    s_endpgm
1762  %a = load <8 x i8>, ptr addrspace(4) %a.ptr, align 4
1763  %vecins = insertelement <8 x i8> %a, i8 5, i32 %b
1764  store <8 x i8> %vecins, ptr addrspace(1) %out, align 8
1765  ret void
1766}
1767
; Insert the constant i8 5 into <16 x i8> %a at the runtime index %b and store
; the result to %out.
; Unlike the smaller i8 vectors in this file, the expected code does not use a
; shifted lane mask. For every element index from 15 down to 0 it extracts the
; byte (s_lshr_b32 by 24 / 16 / 8, or the low byte directly), compares %b with
; the index (s_cmp_lg_u32), keeps the original byte or picks 5 (s_cselect_b32),
; and repacks four bytes per dword with shifts, ands and ors; the four dwords
; go out in a single dwordx4 store.
; The two targets differ only in how the upper two bytes of each dword are
; recombined: kaveri shifts them by 24 and 16 and ors the results, tonga first
; forms a 16-bit pair (high byte shifted by 8, or'ed with the next byte) and
; then shifts the pair by 16.
1768define amdgpu_kernel void @dynamic_insertelement_v16i8(ptr addrspace(1) %out, <16 x i8> %a, i32 %b) nounwind {
1769; SI-LABEL: dynamic_insertelement_v16i8:
1770; SI:       ; %bb.0:
1771; SI-NEXT:    s_load_dwordx4 s[4:7], s[8:9], 0x4
1772; SI-NEXT:    s_load_dword s10, s[8:9], 0x8
1773; SI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
1774; SI-NEXT:    s_mov_b32 s3, 0x100f000
1775; SI-NEXT:    s_mov_b32 s2, -1
1776; SI-NEXT:    s_waitcnt lgkmcnt(0)
1777; SI-NEXT:    s_lshr_b32 s8, s7, 24
1778; SI-NEXT:    s_cmp_lg_u32 s10, 15
1779; SI-NEXT:    s_cselect_b32 s8, s8, 5
1780; SI-NEXT:    s_lshl_b32 s8, s8, 24
1781; SI-NEXT:    s_lshr_b32 s9, s7, 16
1782; SI-NEXT:    s_cmp_lg_u32 s10, 14
1783; SI-NEXT:    s_cselect_b32 s9, s9, 5
1784; SI-NEXT:    s_and_b32 s9, s9, 0xff
1785; SI-NEXT:    s_lshl_b32 s9, s9, 16
1786; SI-NEXT:    s_or_b32 s8, s8, s9
1787; SI-NEXT:    s_lshr_b32 s9, s7, 8
1788; SI-NEXT:    s_cmp_lg_u32 s10, 13
1789; SI-NEXT:    s_cselect_b32 s9, s9, 5
1790; SI-NEXT:    s_lshl_b32 s9, s9, 8
1791; SI-NEXT:    s_cmp_lg_u32 s10, 12
1792; SI-NEXT:    s_cselect_b32 s7, s7, 5
1793; SI-NEXT:    s_and_b32 s7, s7, 0xff
1794; SI-NEXT:    s_or_b32 s7, s7, s9
1795; SI-NEXT:    s_and_b32 s7, s7, 0xffff
1796; SI-NEXT:    s_or_b32 s7, s7, s8
1797; SI-NEXT:    s_lshr_b32 s8, s6, 24
1798; SI-NEXT:    s_cmp_lg_u32 s10, 11
1799; SI-NEXT:    s_cselect_b32 s8, s8, 5
1800; SI-NEXT:    s_lshl_b32 s8, s8, 24
1801; SI-NEXT:    s_lshr_b32 s9, s6, 16
1802; SI-NEXT:    s_cmp_lg_u32 s10, 10
1803; SI-NEXT:    s_cselect_b32 s9, s9, 5
1804; SI-NEXT:    s_and_b32 s9, s9, 0xff
1805; SI-NEXT:    s_lshl_b32 s9, s9, 16
1806; SI-NEXT:    s_or_b32 s8, s8, s9
1807; SI-NEXT:    s_lshr_b32 s9, s6, 8
1808; SI-NEXT:    s_cmp_lg_u32 s10, 9
1809; SI-NEXT:    s_cselect_b32 s9, s9, 5
1810; SI-NEXT:    s_lshl_b32 s9, s9, 8
1811; SI-NEXT:    s_cmp_lg_u32 s10, 8
1812; SI-NEXT:    s_cselect_b32 s6, s6, 5
1813; SI-NEXT:    s_and_b32 s6, s6, 0xff
1814; SI-NEXT:    s_or_b32 s6, s6, s9
1815; SI-NEXT:    s_and_b32 s6, s6, 0xffff
1816; SI-NEXT:    s_or_b32 s6, s6, s8
1817; SI-NEXT:    s_lshr_b32 s8, s5, 24
1818; SI-NEXT:    s_cmp_lg_u32 s10, 7
1819; SI-NEXT:    s_cselect_b32 s8, s8, 5
1820; SI-NEXT:    s_lshl_b32 s8, s8, 24
1821; SI-NEXT:    s_lshr_b32 s9, s5, 16
1822; SI-NEXT:    s_cmp_lg_u32 s10, 6
1823; SI-NEXT:    s_cselect_b32 s9, s9, 5
1824; SI-NEXT:    s_and_b32 s9, s9, 0xff
1825; SI-NEXT:    s_lshl_b32 s9, s9, 16
1826; SI-NEXT:    s_or_b32 s8, s8, s9
1827; SI-NEXT:    s_lshr_b32 s9, s5, 8
1828; SI-NEXT:    s_cmp_lg_u32 s10, 5
1829; SI-NEXT:    s_cselect_b32 s9, s9, 5
1830; SI-NEXT:    s_lshl_b32 s9, s9, 8
1831; SI-NEXT:    s_cmp_lg_u32 s10, 4
1832; SI-NEXT:    s_cselect_b32 s5, s5, 5
1833; SI-NEXT:    s_and_b32 s5, s5, 0xff
1834; SI-NEXT:    s_or_b32 s5, s5, s9
1835; SI-NEXT:    s_and_b32 s5, s5, 0xffff
1836; SI-NEXT:    s_or_b32 s5, s5, s8
1837; SI-NEXT:    s_lshr_b32 s8, s4, 24
1838; SI-NEXT:    s_cmp_lg_u32 s10, 3
1839; SI-NEXT:    s_cselect_b32 s8, s8, 5
1840; SI-NEXT:    s_lshl_b32 s8, s8, 24
1841; SI-NEXT:    s_lshr_b32 s9, s4, 16
1842; SI-NEXT:    s_cmp_lg_u32 s10, 2
1843; SI-NEXT:    s_cselect_b32 s9, s9, 5
1844; SI-NEXT:    s_and_b32 s9, s9, 0xff
1845; SI-NEXT:    s_lshl_b32 s9, s9, 16
1846; SI-NEXT:    s_or_b32 s8, s8, s9
1847; SI-NEXT:    s_lshr_b32 s9, s4, 8
1848; SI-NEXT:    s_cmp_lg_u32 s10, 1
1849; SI-NEXT:    s_cselect_b32 s9, s9, 5
1850; SI-NEXT:    s_lshl_b32 s9, s9, 8
1851; SI-NEXT:    s_cmp_lg_u32 s10, 0
1852; SI-NEXT:    s_cselect_b32 s4, s4, 5
1853; SI-NEXT:    s_and_b32 s4, s4, 0xff
1854; SI-NEXT:    s_or_b32 s4, s4, s9
1855; SI-NEXT:    s_and_b32 s4, s4, 0xffff
1856; SI-NEXT:    s_or_b32 s4, s4, s8
1857; SI-NEXT:    v_mov_b32_e32 v0, s4
1858; SI-NEXT:    v_mov_b32_e32 v1, s5
1859; SI-NEXT:    v_mov_b32_e32 v2, s6
1860; SI-NEXT:    v_mov_b32_e32 v3, s7
1861; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
1862; SI-NEXT:    s_endpgm
1863;
1864; VI-LABEL: dynamic_insertelement_v16i8:
1865; VI:       ; %bb.0:
1866; VI-NEXT:    s_load_dwordx4 s[4:7], s[8:9], 0x10
1867; VI-NEXT:    s_load_dword s10, s[8:9], 0x20
1868; VI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
1869; VI-NEXT:    s_mov_b32 s3, 0x1100f000
1870; VI-NEXT:    s_mov_b32 s2, -1
1871; VI-NEXT:    s_waitcnt lgkmcnt(0)
1872; VI-NEXT:    s_lshr_b32 s8, s7, 24
1873; VI-NEXT:    s_cmp_lg_u32 s10, 15
1874; VI-NEXT:    s_cselect_b32 s8, s8, 5
1875; VI-NEXT:    s_lshl_b32 s8, s8, 8
1876; VI-NEXT:    s_lshr_b32 s9, s7, 16
1877; VI-NEXT:    s_cmp_lg_u32 s10, 14
1878; VI-NEXT:    s_cselect_b32 s9, s9, 5
1879; VI-NEXT:    s_and_b32 s9, s9, 0xff
1880; VI-NEXT:    s_or_b32 s8, s9, s8
1881; VI-NEXT:    s_lshl_b32 s8, s8, 16
1882; VI-NEXT:    s_lshr_b32 s9, s7, 8
1883; VI-NEXT:    s_cmp_lg_u32 s10, 13
1884; VI-NEXT:    s_cselect_b32 s9, s9, 5
1885; VI-NEXT:    s_lshl_b32 s9, s9, 8
1886; VI-NEXT:    s_cmp_lg_u32 s10, 12
1887; VI-NEXT:    s_cselect_b32 s7, s7, 5
1888; VI-NEXT:    s_and_b32 s7, s7, 0xff
1889; VI-NEXT:    s_or_b32 s7, s7, s9
1890; VI-NEXT:    s_and_b32 s7, s7, 0xffff
1891; VI-NEXT:    s_or_b32 s7, s7, s8
1892; VI-NEXT:    s_lshr_b32 s8, s6, 24
1893; VI-NEXT:    s_cmp_lg_u32 s10, 11
1894; VI-NEXT:    s_cselect_b32 s8, s8, 5
1895; VI-NEXT:    s_lshl_b32 s8, s8, 8
1896; VI-NEXT:    s_lshr_b32 s9, s6, 16
1897; VI-NEXT:    s_cmp_lg_u32 s10, 10
1898; VI-NEXT:    s_cselect_b32 s9, s9, 5
1899; VI-NEXT:    s_and_b32 s9, s9, 0xff
1900; VI-NEXT:    s_or_b32 s8, s9, s8
1901; VI-NEXT:    s_lshl_b32 s8, s8, 16
1902; VI-NEXT:    s_lshr_b32 s9, s6, 8
1903; VI-NEXT:    s_cmp_lg_u32 s10, 9
1904; VI-NEXT:    s_cselect_b32 s9, s9, 5
1905; VI-NEXT:    s_lshl_b32 s9, s9, 8
1906; VI-NEXT:    s_cmp_lg_u32 s10, 8
1907; VI-NEXT:    s_cselect_b32 s6, s6, 5
1908; VI-NEXT:    s_and_b32 s6, s6, 0xff
1909; VI-NEXT:    s_or_b32 s6, s6, s9
1910; VI-NEXT:    s_and_b32 s6, s6, 0xffff
1911; VI-NEXT:    s_or_b32 s6, s6, s8
1912; VI-NEXT:    s_lshr_b32 s8, s5, 24
1913; VI-NEXT:    s_cmp_lg_u32 s10, 7
1914; VI-NEXT:    s_cselect_b32 s8, s8, 5
1915; VI-NEXT:    s_lshl_b32 s8, s8, 8
1916; VI-NEXT:    s_lshr_b32 s9, s5, 16
1917; VI-NEXT:    s_cmp_lg_u32 s10, 6
1918; VI-NEXT:    s_cselect_b32 s9, s9, 5
1919; VI-NEXT:    s_and_b32 s9, s9, 0xff
1920; VI-NEXT:    s_or_b32 s8, s9, s8
1921; VI-NEXT:    s_lshl_b32 s8, s8, 16
1922; VI-NEXT:    s_lshr_b32 s9, s5, 8
1923; VI-NEXT:    s_cmp_lg_u32 s10, 5
1924; VI-NEXT:    s_cselect_b32 s9, s9, 5
1925; VI-NEXT:    s_lshl_b32 s9, s9, 8
1926; VI-NEXT:    s_cmp_lg_u32 s10, 4
1927; VI-NEXT:    s_cselect_b32 s5, s5, 5
1928; VI-NEXT:    s_and_b32 s5, s5, 0xff
1929; VI-NEXT:    s_or_b32 s5, s5, s9
1930; VI-NEXT:    s_and_b32 s5, s5, 0xffff
1931; VI-NEXT:    s_or_b32 s5, s5, s8
1932; VI-NEXT:    s_lshr_b32 s8, s4, 24
1933; VI-NEXT:    s_cmp_lg_u32 s10, 3
1934; VI-NEXT:    s_cselect_b32 s8, s8, 5
1935; VI-NEXT:    s_lshl_b32 s8, s8, 8
1936; VI-NEXT:    s_lshr_b32 s9, s4, 16
1937; VI-NEXT:    s_cmp_lg_u32 s10, 2
1938; VI-NEXT:    s_cselect_b32 s9, s9, 5
1939; VI-NEXT:    s_and_b32 s9, s9, 0xff
1940; VI-NEXT:    s_or_b32 s8, s9, s8
1941; VI-NEXT:    s_lshl_b32 s8, s8, 16
1942; VI-NEXT:    s_lshr_b32 s9, s4, 8
1943; VI-NEXT:    s_cmp_lg_u32 s10, 1
1944; VI-NEXT:    s_cselect_b32 s9, s9, 5
1945; VI-NEXT:    s_lshl_b32 s9, s9, 8
1946; VI-NEXT:    s_cmp_lg_u32 s10, 0
1947; VI-NEXT:    s_cselect_b32 s4, s4, 5
1948; VI-NEXT:    s_and_b32 s4, s4, 0xff
1949; VI-NEXT:    s_or_b32 s4, s4, s9
1950; VI-NEXT:    s_and_b32 s4, s4, 0xffff
1951; VI-NEXT:    s_or_b32 s4, s4, s8
1952; VI-NEXT:    v_mov_b32_e32 v0, s4
1953; VI-NEXT:    v_mov_b32_e32 v1, s5
1954; VI-NEXT:    v_mov_b32_e32 v2, s6
1955; VI-NEXT:    v_mov_b32_e32 v3, s7
1956; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
1957; VI-NEXT:    s_endpgm
1958  %vecins = insertelement <16 x i8> %a, i8 5, i32 %b
1959  store <16 x i8> %vecins, ptr addrspace(1) %out, align 16
1960  ret void
1961}
1962
1963; This test requires handling INSERT_SUBREG in SIFixSGPRCopies.  Check that
1964; the compiler doesn't crash.
; Shape of the IR: element 0 of a <2 x i32> is set to %a in the entry block;
; element 1 is filled in one of two successor blocks (from %in when %a == 0,
; from %in + 1 element otherwise); the two vectors meet in a phi and the
; result is stored to %out.
; The checks pin the current branch layout, including the extra trampoline
; block .LBB42_4 that only branches back to the %if block.
; NOTE(review): the base vector is `undef`; confirm whether `poison` is
; preferred here before changing it, and regenerate the checks if so.
1965define amdgpu_kernel void @insert_split_bb(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %a, i32 %b) {
1966; SI-LABEL: insert_split_bb:
1967; SI:       ; %bb.0: ; %entry
1968; SI-NEXT:    s_load_dword s4, s[8:9], 0x4
1969; SI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
1970; SI-NEXT:    s_waitcnt lgkmcnt(0)
1971; SI-NEXT:    s_cmp_lg_u32 s4, 0
1972; SI-NEXT:    s_cbranch_scc0 .LBB42_4
1973; SI-NEXT:  ; %bb.1: ; %else
1974; SI-NEXT:    s_load_dword s5, s[2:3], 0x1
1975; SI-NEXT:    s_mov_b64 s[6:7], 0
1976; SI-NEXT:    s_andn2_b64 vcc, exec, s[6:7]
1977; SI-NEXT:    s_waitcnt lgkmcnt(0)
1978; SI-NEXT:    s_mov_b64 vcc, vcc
1979; SI-NEXT:    s_cbranch_vccnz .LBB42_3
1980; SI-NEXT:  .LBB42_2: ; %if
1981; SI-NEXT:    s_load_dword s5, s[2:3], 0x0
1982; SI-NEXT:  .LBB42_3: ; %endif
1983; SI-NEXT:    s_waitcnt lgkmcnt(0)
1984; SI-NEXT:    v_mov_b32_e32 v0, s4
1985; SI-NEXT:    s_mov_b32 s3, 0x100f000
1986; SI-NEXT:    s_mov_b32 s2, -1
1987; SI-NEXT:    v_mov_b32_e32 v1, s5
1988; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1989; SI-NEXT:    s_endpgm
1990; SI-NEXT:  .LBB42_4:
1991; SI-NEXT:    s_branch .LBB42_2
1992;
1993; VI-LABEL: insert_split_bb:
1994; VI:       ; %bb.0: ; %entry
1995; VI-NEXT:    s_load_dword s4, s[8:9], 0x10
1996; VI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
1997; VI-NEXT:    s_waitcnt lgkmcnt(0)
1998; VI-NEXT:    s_cmp_lg_u32 s4, 0
1999; VI-NEXT:    s_cbranch_scc0 .LBB42_4
2000; VI-NEXT:  ; %bb.1: ; %else
2001; VI-NEXT:    s_load_dword s5, s[2:3], 0x4
2002; VI-NEXT:    s_cbranch_execnz .LBB42_3
2003; VI-NEXT:  .LBB42_2: ; %if
2004; VI-NEXT:    s_waitcnt lgkmcnt(0)
2005; VI-NEXT:    s_load_dword s5, s[2:3], 0x0
2006; VI-NEXT:  .LBB42_3: ; %endif
2007; VI-NEXT:    s_waitcnt lgkmcnt(0)
2008; VI-NEXT:    v_mov_b32_e32 v0, s4
2009; VI-NEXT:    s_mov_b32 s3, 0x1100f000
2010; VI-NEXT:    s_mov_b32 s2, -1
2011; VI-NEXT:    v_mov_b32_e32 v1, s5
2012; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2013; VI-NEXT:    s_endpgm
2014; VI-NEXT:  .LBB42_4:
2015; VI-NEXT:    s_branch .LBB42_2
2016entry:
2017  %0 = insertelement <2 x i32> undef, i32 %a, i32 0
2018  %1 = icmp eq i32 %a, 0
2019  br i1 %1, label %if, label %else
2020
2021if:
2022  %2 = load i32, ptr addrspace(1) %in
2023  %3 = insertelement <2 x i32> %0, i32 %2, i32 1
2024  br label %endif
2025
2026else:
2027  %4 = getelementptr i32, ptr addrspace(1) %in, i32 1
2028  %5 = load i32, ptr addrspace(1) %4
2029  %6 = insertelement <2 x i32> %0, i32 %5, i32 1
2030  br label %endif
2031
2032endif:
2033  %7 = phi <2 x i32> [%3, %if], [%6, %else]
2034  store <2 x i32> %7, ptr addrspace(1) %out
2035  ret void
2036}
2037
; Insert the constant double 8.0 into <2 x double> %a at the runtime index %b
; and store the result to %out. Unnamed [8 x i32] arguments separate the
; named kernel arguments.
; Expected code on both targets: one s_cmp_eq_u32 per element index (1, then
; 0), each followed by two s_cselect_b32 that replace the element's high dword
; with 0x40200000 and its low dword with 0 (the two halves of 8.0 as an IEEE
; double); the vector is written with a single dwordx4 store.
2038define amdgpu_kernel void @dynamic_insertelement_v2f64(ptr addrspace(1) %out, [8 x i32], <2 x double> %a, [8 x i32], i32 %b) nounwind {
2039; SI-LABEL: dynamic_insertelement_v2f64:
2040; SI:       ; %bb.0:
2041; SI-NEXT:    s_load_dword s10, s[8:9], 0x18
2042; SI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0xc
2043; SI-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
2044; SI-NEXT:    s_mov_b32 s7, 0x100f000
2045; SI-NEXT:    s_mov_b32 s6, -1
2046; SI-NEXT:    s_waitcnt lgkmcnt(0)
2047; SI-NEXT:    s_cmp_eq_u32 s10, 1
2048; SI-NEXT:    s_cselect_b32 s3, 0x40200000, s3
2049; SI-NEXT:    s_cselect_b32 s2, 0, s2
2050; SI-NEXT:    s_cmp_eq_u32 s10, 0
2051; SI-NEXT:    s_cselect_b32 s1, 0x40200000, s1
2052; SI-NEXT:    s_cselect_b32 s0, 0, s0
2053; SI-NEXT:    v_mov_b32_e32 v0, s0
2054; SI-NEXT:    v_mov_b32_e32 v1, s1
2055; SI-NEXT:    v_mov_b32_e32 v2, s2
2056; SI-NEXT:    v_mov_b32_e32 v3, s3
2057; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
2058; SI-NEXT:    s_endpgm
2059;
2060; VI-LABEL: dynamic_insertelement_v2f64:
2061; VI:       ; %bb.0:
2062; VI-NEXT:    s_load_dword s10, s[8:9], 0x60
2063; VI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x30
2064; VI-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
2065; VI-NEXT:    s_mov_b32 s7, 0x1100f000
2066; VI-NEXT:    s_mov_b32 s6, -1
2067; VI-NEXT:    s_waitcnt lgkmcnt(0)
2068; VI-NEXT:    s_cmp_eq_u32 s10, 1
2069; VI-NEXT:    s_cselect_b32 s3, 0x40200000, s3
2070; VI-NEXT:    s_cselect_b32 s2, 0, s2
2071; VI-NEXT:    s_cmp_eq_u32 s10, 0
2072; VI-NEXT:    s_cselect_b32 s1, 0x40200000, s1
2073; VI-NEXT:    s_cselect_b32 s0, 0, s0
2074; VI-NEXT:    v_mov_b32_e32 v0, s0
2075; VI-NEXT:    v_mov_b32_e32 v1, s1
2076; VI-NEXT:    v_mov_b32_e32 v2, s2
2077; VI-NEXT:    v_mov_b32_e32 v3, s3
2078; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
2079; VI-NEXT:    s_endpgm
2080  %vecins = insertelement <2 x double> %a, double 8.0, i32 %b
2081  store <2 x double> %vecins, ptr addrspace(1) %out, align 16
2082  ret void
2083}
2084
; Insert the constant i64 5 into <2 x i64> %a at the runtime index %b and
; store the result to %out.
; Expected code on both targets: one s_cmp_eq_u32 per element index (1, then
; 0), each followed by two s_cselect_b32 that replace the element's high dword
; with 0 and its low dword with 5; the vector is written with a single dwordx4
; store.
2085define amdgpu_kernel void @dynamic_insertelement_v2i64(ptr addrspace(1) %out, <2 x i64> %a, i32 %b) nounwind {
2086; SI-LABEL: dynamic_insertelement_v2i64:
2087; SI:       ; %bb.0:
2088; SI-NEXT:    s_load_dword s10, s[8:9], 0x8
2089; SI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x4
2090; SI-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
2091; SI-NEXT:    s_mov_b32 s7, 0x100f000
2092; SI-NEXT:    s_mov_b32 s6, -1
2093; SI-NEXT:    s_waitcnt lgkmcnt(0)
2094; SI-NEXT:    s_cmp_eq_u32 s10, 1
2095; SI-NEXT:    s_cselect_b32 s3, 0, s3
2096; SI-NEXT:    s_cselect_b32 s2, 5, s2
2097; SI-NEXT:    s_cmp_eq_u32 s10, 0
2098; SI-NEXT:    s_cselect_b32 s1, 0, s1
2099; SI-NEXT:    s_cselect_b32 s0, 5, s0
2100; SI-NEXT:    v_mov_b32_e32 v0, s0
2101; SI-NEXT:    v_mov_b32_e32 v1, s1
2102; SI-NEXT:    v_mov_b32_e32 v2, s2
2103; SI-NEXT:    v_mov_b32_e32 v3, s3
2104; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
2105; SI-NEXT:    s_endpgm
2106;
2107; VI-LABEL: dynamic_insertelement_v2i64:
2108; VI:       ; %bb.0:
2109; VI-NEXT:    s_load_dword s10, s[8:9], 0x20
2110; VI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x10
2111; VI-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
2112; VI-NEXT:    s_mov_b32 s7, 0x1100f000
2113; VI-NEXT:    s_mov_b32 s6, -1
2114; VI-NEXT:    s_waitcnt lgkmcnt(0)
2115; VI-NEXT:    s_cmp_eq_u32 s10, 1
2116; VI-NEXT:    s_cselect_b32 s3, 0, s3
2117; VI-NEXT:    s_cselect_b32 s2, 5, s2
2118; VI-NEXT:    s_cmp_eq_u32 s10, 0
2119; VI-NEXT:    s_cselect_b32 s1, 0, s1
2120; VI-NEXT:    s_cselect_b32 s0, 5, s0
2121; VI-NEXT:    v_mov_b32_e32 v0, s0
2122; VI-NEXT:    v_mov_b32_e32 v1, s1
2123; VI-NEXT:    v_mov_b32_e32 v2, s2
2124; VI-NEXT:    v_mov_b32_e32 v3, s3
2125; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
2126; VI-NEXT:    s_endpgm
2127  %vecins = insertelement <2 x i64> %a, i64 5, i32 %b
2128  store <2 x i64> %vecins, ptr addrspace(1) %out, align 8
2129  ret void
2130}
2131
; Insert the constant i64 5 into <3 x i64> %a at the runtime index %b and
; store the result to %out.
; Expected code on both targets: the first two elements arrive in one
; s_load_dwordx4 and the third in a separate s_load_dwordx2; one s_cmp_eq_u32
; per element index (1, 0, 2) drives two s_cselect_b32 each (high dword 0, low
; dword 5). The third element is written first with a dwordx2 store at offset
; 16, then the first two with a dwordx4 store at offset 0.
2132define amdgpu_kernel void @dynamic_insertelement_v3i64(ptr addrspace(1) %out, <3 x i64> %a, i32 %b) nounwind {
2133; SI-LABEL: dynamic_insertelement_v3i64:
2134; SI:       ; %bb.0:
2135; SI-NEXT:    s_load_dword s10, s[8:9], 0x10
2136; SI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
2137; SI-NEXT:    s_load_dwordx4 s[4:7], s[8:9], 0x8
2138; SI-NEXT:    s_load_dwordx2 s[8:9], s[8:9], 0xc
2139; SI-NEXT:    s_mov_b32 s3, 0x100f000
2140; SI-NEXT:    s_waitcnt lgkmcnt(0)
2141; SI-NEXT:    s_cmp_eq_u32 s10, 1
2142; SI-NEXT:    s_mov_b32 s2, -1
2143; SI-NEXT:    s_cselect_b32 s7, 0, s7
2144; SI-NEXT:    s_cselect_b32 s6, 5, s6
2145; SI-NEXT:    s_cmp_eq_u32 s10, 0
2146; SI-NEXT:    s_cselect_b32 s5, 0, s5
2147; SI-NEXT:    s_cselect_b32 s4, 5, s4
2148; SI-NEXT:    s_cmp_eq_u32 s10, 2
2149; SI-NEXT:    s_cselect_b32 s9, 0, s9
2150; SI-NEXT:    s_cselect_b32 s8, 5, s8
2151; SI-NEXT:    v_mov_b32_e32 v0, s8
2152; SI-NEXT:    v_mov_b32_e32 v1, s9
2153; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0 offset:16
2154; SI-NEXT:    v_mov_b32_e32 v0, s4
2155; SI-NEXT:    v_mov_b32_e32 v1, s5
2156; SI-NEXT:    v_mov_b32_e32 v2, s6
2157; SI-NEXT:    v_mov_b32_e32 v3, s7
2158; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
2159; SI-NEXT:    s_endpgm
2160;
2161; VI-LABEL: dynamic_insertelement_v3i64:
2162; VI:       ; %bb.0:
2163; VI-NEXT:    s_load_dword s10, s[8:9], 0x40
2164; VI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
2165; VI-NEXT:    s_load_dwordx4 s[4:7], s[8:9], 0x20
2166; VI-NEXT:    s_load_dwordx2 s[8:9], s[8:9], 0x30
2167; VI-NEXT:    s_mov_b32 s3, 0x1100f000
2168; VI-NEXT:    s_waitcnt lgkmcnt(0)
2169; VI-NEXT:    s_cmp_eq_u32 s10, 1
2170; VI-NEXT:    s_mov_b32 s2, -1
2171; VI-NEXT:    s_cselect_b32 s7, 0, s7
2172; VI-NEXT:    s_cselect_b32 s6, 5, s6
2173; VI-NEXT:    s_cmp_eq_u32 s10, 0
2174; VI-NEXT:    s_cselect_b32 s5, 0, s5
2175; VI-NEXT:    s_cselect_b32 s4, 5, s4
2176; VI-NEXT:    s_cmp_eq_u32 s10, 2
2177; VI-NEXT:    s_cselect_b32 s9, 0, s9
2178; VI-NEXT:    s_cselect_b32 s8, 5, s8
2179; VI-NEXT:    v_mov_b32_e32 v0, s8
2180; VI-NEXT:    v_mov_b32_e32 v1, s9
2181; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0 offset:16
2182; VI-NEXT:    v_mov_b32_e32 v0, s4
2183; VI-NEXT:    v_mov_b32_e32 v1, s5
2184; VI-NEXT:    v_mov_b32_e32 v2, s6
2185; VI-NEXT:    v_mov_b32_e32 v3, s7
2186; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
2187; VI-NEXT:    s_endpgm
2188  %vecins = insertelement <3 x i64> %a, i64 5, i32 %b
2189  store <3 x i64> %vecins, ptr addrspace(1) %out, align 32
2190  ret void
2191}
2192
; Insert the constant double 8.0 into <4 x double> %a at the runtime index %b
; and store the result to %out.
; Expected code on both targets: the whole vector arrives in one
; s_load_dwordx8; one s_cmp_eq_u32 per element index (1, 0, 3, 2) drives two
; s_cselect_b32 each (high dword 0x40200000, low dword 0). The upper half is
; written with a dwordx4 store at offset 16 and the lower half at offset 0,
; with an s_nop 0 between the first store and the VGPR rewrites.
2193define amdgpu_kernel void @dynamic_insertelement_v4f64(ptr addrspace(1) %out, <4 x double> %a, i32 %b) nounwind {
2194; SI-LABEL: dynamic_insertelement_v4f64:
2195; SI:       ; %bb.0:
2196; SI-NEXT:    s_load_dword s12, s[8:9], 0x10
2197; SI-NEXT:    s_load_dwordx8 s[0:7], s[8:9], 0x8
2198; SI-NEXT:    s_load_dwordx2 s[8:9], s[8:9], 0x0
2199; SI-NEXT:    s_mov_b32 s11, 0x100f000
2200; SI-NEXT:    s_mov_b32 s10, -1
2201; SI-NEXT:    s_waitcnt lgkmcnt(0)
2202; SI-NEXT:    s_cmp_eq_u32 s12, 1
2203; SI-NEXT:    s_cselect_b32 s3, 0x40200000, s3
2204; SI-NEXT:    s_cselect_b32 s2, 0, s2
2205; SI-NEXT:    s_cmp_eq_u32 s12, 0
2206; SI-NEXT:    s_cselect_b32 s1, 0x40200000, s1
2207; SI-NEXT:    s_cselect_b32 s0, 0, s0
2208; SI-NEXT:    s_cmp_eq_u32 s12, 3
2209; SI-NEXT:    s_cselect_b32 s7, 0x40200000, s7
2210; SI-NEXT:    s_cselect_b32 s6, 0, s6
2211; SI-NEXT:    s_cmp_eq_u32 s12, 2
2212; SI-NEXT:    s_cselect_b32 s5, 0x40200000, s5
2213; SI-NEXT:    s_cselect_b32 s4, 0, s4
2214; SI-NEXT:    v_mov_b32_e32 v0, s4
2215; SI-NEXT:    v_mov_b32_e32 v1, s5
2216; SI-NEXT:    v_mov_b32_e32 v2, s6
2217; SI-NEXT:    v_mov_b32_e32 v3, s7
2218; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:16
2219; SI-NEXT:    s_nop 0
2220; SI-NEXT:    v_mov_b32_e32 v0, s0
2221; SI-NEXT:    v_mov_b32_e32 v1, s1
2222; SI-NEXT:    v_mov_b32_e32 v2, s2
2223; SI-NEXT:    v_mov_b32_e32 v3, s3
2224; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0
2225; SI-NEXT:    s_endpgm
2226;
2227; VI-LABEL: dynamic_insertelement_v4f64:
2228; VI:       ; %bb.0:
2229; VI-NEXT:    s_load_dword s12, s[8:9], 0x40
2230; VI-NEXT:    s_load_dwordx8 s[0:7], s[8:9], 0x20
2231; VI-NEXT:    s_load_dwordx2 s[8:9], s[8:9], 0x0
2232; VI-NEXT:    s_mov_b32 s11, 0x1100f000
2233; VI-NEXT:    s_mov_b32 s10, -1
2234; VI-NEXT:    s_waitcnt lgkmcnt(0)
2235; VI-NEXT:    s_cmp_eq_u32 s12, 1
2236; VI-NEXT:    s_cselect_b32 s3, 0x40200000, s3
2237; VI-NEXT:    s_cselect_b32 s2, 0, s2
2238; VI-NEXT:    s_cmp_eq_u32 s12, 0
2239; VI-NEXT:    s_cselect_b32 s1, 0x40200000, s1
2240; VI-NEXT:    s_cselect_b32 s0, 0, s0
2241; VI-NEXT:    s_cmp_eq_u32 s12, 3
2242; VI-NEXT:    s_cselect_b32 s7, 0x40200000, s7
2243; VI-NEXT:    s_cselect_b32 s6, 0, s6
2244; VI-NEXT:    s_cmp_eq_u32 s12, 2
2245; VI-NEXT:    s_cselect_b32 s5, 0x40200000, s5
2246; VI-NEXT:    s_cselect_b32 s4, 0, s4
2247; VI-NEXT:    v_mov_b32_e32 v0, s4
2248; VI-NEXT:    v_mov_b32_e32 v1, s5
2249; VI-NEXT:    v_mov_b32_e32 v2, s6
2250; VI-NEXT:    v_mov_b32_e32 v3, s7
2251; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:16
2252; VI-NEXT:    s_nop 0
2253; VI-NEXT:    v_mov_b32_e32 v0, s0
2254; VI-NEXT:    v_mov_b32_e32 v1, s1
2255; VI-NEXT:    v_mov_b32_e32 v2, s2
2256; VI-NEXT:    v_mov_b32_e32 v3, s3
2257; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0
2258; VI-NEXT:    s_endpgm
2259  %vecins = insertelement <4 x double> %a, double 8.0, i32 %b
2260  store <4 x double> %vecins, ptr addrspace(1) %out, align 16
2261  ret void
2262}
2263
; Insert the constant double 8.0 into <8 x double> %a at the runtime index %b
; and store the result to %out. This kernel takes its attributes from group
; #0 instead of an inline keyword.
; Expected code on both targets does not use compare/select: the vector is
; copied into v0..v15, the index is doubled (s_lshl_b32 by 1, two dwords per
; element) and moved into m0, and two v_movreld_b32 writes place the low dword
; (0, relative to v0) and the high dword (0x40200000 held in v16, relative to
; v1). Four dwordx4 stores write the result, highest offset first.
2264define amdgpu_kernel void @dynamic_insertelement_v8f64(ptr addrspace(1) %out, <8 x double> %a, i32 %b) #0 {
2265; SI-LABEL: dynamic_insertelement_v8f64:
2266; SI:       ; %bb.0:
2267; SI-NEXT:    s_load_dword s4, s[8:9], 0x20
2268; SI-NEXT:    s_load_dwordx16 s[12:27], s[8:9], 0x10
2269; SI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
2270; SI-NEXT:    v_mov_b32_e32 v16, 0x40200000
2271; SI-NEXT:    s_mov_b32 s3, 0x100f000
2272; SI-NEXT:    s_waitcnt lgkmcnt(0)
2273; SI-NEXT:    s_lshl_b32 s4, s4, 1
2274; SI-NEXT:    v_mov_b32_e32 v0, s12
2275; SI-NEXT:    v_mov_b32_e32 v1, s13
2276; SI-NEXT:    v_mov_b32_e32 v2, s14
2277; SI-NEXT:    v_mov_b32_e32 v3, s15
2278; SI-NEXT:    v_mov_b32_e32 v4, s16
2279; SI-NEXT:    v_mov_b32_e32 v5, s17
2280; SI-NEXT:    v_mov_b32_e32 v6, s18
2281; SI-NEXT:    v_mov_b32_e32 v7, s19
2282; SI-NEXT:    v_mov_b32_e32 v8, s20
2283; SI-NEXT:    v_mov_b32_e32 v9, s21
2284; SI-NEXT:    v_mov_b32_e32 v10, s22
2285; SI-NEXT:    v_mov_b32_e32 v11, s23
2286; SI-NEXT:    v_mov_b32_e32 v12, s24
2287; SI-NEXT:    v_mov_b32_e32 v13, s25
2288; SI-NEXT:    v_mov_b32_e32 v14, s26
2289; SI-NEXT:    v_mov_b32_e32 v15, s27
2290; SI-NEXT:    s_mov_b32 m0, s4
2291; SI-NEXT:    v_movreld_b32_e32 v0, 0
2292; SI-NEXT:    s_mov_b32 s2, -1
2293; SI-NEXT:    v_movreld_b32_e32 v1, v16
2294; SI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48
2295; SI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32
2296; SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
2297; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
2298; SI-NEXT:    s_endpgm
2299;
2300; VI-LABEL: dynamic_insertelement_v8f64:
2301; VI:       ; %bb.0:
2302; VI-NEXT:    s_load_dword s4, s[8:9], 0x80
2303; VI-NEXT:    s_load_dwordx16 s[12:27], s[8:9], 0x40
2304; VI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
2305; VI-NEXT:    v_mov_b32_e32 v16, 0x40200000
2306; VI-NEXT:    s_mov_b32 s3, 0x1100f000
2307; VI-NEXT:    s_waitcnt lgkmcnt(0)
2308; VI-NEXT:    s_lshl_b32 s4, s4, 1
2309; VI-NEXT:    v_mov_b32_e32 v0, s12
2310; VI-NEXT:    v_mov_b32_e32 v1, s13
2311; VI-NEXT:    v_mov_b32_e32 v2, s14
2312; VI-NEXT:    v_mov_b32_e32 v3, s15
2313; VI-NEXT:    v_mov_b32_e32 v4, s16
2314; VI-NEXT:    v_mov_b32_e32 v5, s17
2315; VI-NEXT:    v_mov_b32_e32 v6, s18
2316; VI-NEXT:    v_mov_b32_e32 v7, s19
2317; VI-NEXT:    v_mov_b32_e32 v8, s20
2318; VI-NEXT:    v_mov_b32_e32 v9, s21
2319; VI-NEXT:    v_mov_b32_e32 v10, s22
2320; VI-NEXT:    v_mov_b32_e32 v11, s23
2321; VI-NEXT:    v_mov_b32_e32 v12, s24
2322; VI-NEXT:    v_mov_b32_e32 v13, s25
2323; VI-NEXT:    v_mov_b32_e32 v14, s26
2324; VI-NEXT:    v_mov_b32_e32 v15, s27
2325; VI-NEXT:    s_mov_b32 m0, s4
2326; VI-NEXT:    v_movreld_b32_e32 v0, 0
2327; VI-NEXT:    s_mov_b32 s2, -1
2328; VI-NEXT:    v_movreld_b32_e32 v1, v16
2329; VI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48
2330; VI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32
2331; VI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
2332; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
2333; VI-NEXT:    s_endpgm
2334  %vecins = insertelement <8 x double> %a, double 8.0, i32 %b
2335  store <8 x double> %vecins, ptr addrspace(1) %out, align 16
2336  ret void
2337}
2338
2339declare <4 x float> @llvm.amdgcn.image.gather4.lz.2d.v4f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
2340
2341attributes #0 = { nounwind }
2342attributes #1 = { nounwind readnone }
2343