xref: /llvm-project/llvm/test/CodeGen/ARM/neon-copy.ll (revision 7b3bbd83c0c24087072ec5b22a76799ab31f87d5)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -mtriple=armv7a-none-eabihf -mattr=+neon -verify-machineinstrs | FileCheck %s
3
; Inserts the scalar i8 argument into lane 15 of the <16 x i8> argument.
; The assertions expect a single vmov.8 into d1[7] followed by the return.
4define <16 x i8> @ins16bw(<16 x i8> %tmp1, i8 %tmp2) {
5; CHECK-LABEL: ins16bw:
6; CHECK:       @ %bb.0:
7; CHECK-NEXT:    vmov.8 d1[7], r0
8; CHECK-NEXT:    bx lr
9  %tmp3 = insertelement <16 x i8> %tmp1, i8 %tmp2, i32 15
10  ret <16 x i8> %tmp3
11}
12
; Inserts the scalar i16 argument into lane 6 of the <8 x i16> argument.
; The assertions expect a single vmov.16 into d1[2] followed by the return.
13define <8 x i16> @ins8hw(<8 x i16> %tmp1, i16 %tmp2) {
14; CHECK-LABEL: ins8hw:
15; CHECK:       @ %bb.0:
16; CHECK-NEXT:    vmov.16 d1[2], r0
17; CHECK-NEXT:    bx lr
18  %tmp3 = insertelement <8 x i16> %tmp1, i16 %tmp2, i32 6
19  ret <8 x i16> %tmp3
20}
21
; Inserts the scalar i32 argument into lane 2 of the <4 x i32> argument.
; The assertions expect a single vmov.32 into d1[0] followed by the return.
22define <4 x i32> @ins4sw(<4 x i32> %tmp1, i32 %tmp2) {
23; CHECK-LABEL: ins4sw:
24; CHECK:       @ %bb.0:
25; CHECK-NEXT:    vmov.32 d1[0], r0
26; CHECK-NEXT:    bx lr
27  %tmp3 = insertelement <4 x i32> %tmp1, i32 %tmp2, i32 2
28  ret <4 x i32> %tmp3
29}
30
; Inserts the scalar i64 argument into lane 1 of the <2 x i64> argument.
; The assertions expect the value to be written as two 32-bit halves:
; r0 into d1[0] and r1 into d1[1].
31define <2 x i64> @ins2dw(<2 x i64> %tmp1, i64 %tmp2) {
32; CHECK-LABEL: ins2dw:
33; CHECK:       @ %bb.0:
34; CHECK-NEXT:    vmov.32 d1[0], r0
35; CHECK-NEXT:    vmov.32 d1[1], r1
36; CHECK-NEXT:    bx lr
37  %tmp3 = insertelement <2 x i64> %tmp1, i64 %tmp2, i32 1
38  ret <2 x i64> %tmp3
39}
40
; Inserts the scalar i8 argument into lane 5 of the <8 x i8> argument.
; The assertions expect a single vmov.8 into d0[5] followed by the return.
41define <8 x i8> @ins8bw(<8 x i8> %tmp1, i8 %tmp2) {
42; CHECK-LABEL: ins8bw:
43; CHECK:       @ %bb.0:
44; CHECK-NEXT:    vmov.8 d0[5], r0
45; CHECK-NEXT:    bx lr
46  %tmp3 = insertelement <8 x i8> %tmp1, i8 %tmp2, i32 5
47  ret <8 x i8> %tmp3
48}
49
; Inserts the scalar i16 argument into lane 3 of the <4 x i16> argument.
; The assertions expect a single vmov.16 into d0[3] followed by the return.
50define <4 x i16> @ins4hw(<4 x i16> %tmp1, i16 %tmp2) {
51; CHECK-LABEL: ins4hw:
52; CHECK:       @ %bb.0:
53; CHECK-NEXT:    vmov.16 d0[3], r0
54; CHECK-NEXT:    bx lr
55  %tmp3 = insertelement <4 x i16> %tmp1, i16 %tmp2, i32 3
56  ret <4 x i16> %tmp3
57}
58
; Inserts the scalar i32 argument into lane 1 of the <2 x i32> argument.
; The assertions expect a single vmov.32 into d0[1] followed by the return.
59define <2 x i32> @ins2sw(<2 x i32> %tmp1, i32 %tmp2) {
60; CHECK-LABEL: ins2sw:
61; CHECK:       @ %bb.0:
62; CHECK-NEXT:    vmov.32 d0[1], r0
63; CHECK-NEXT:    bx lr
64  %tmp3 = insertelement <2 x i32> %tmp1, i32 %tmp2, i32 1
65  ret <2 x i32> %tmp3
66}
67
; Copies lane 2 of %tmp1 into lane 15 of %tmp2 (both <16 x i8>) and returns
; the updated %tmp2. The assertions expect the byte to travel through r0
; (vmov.u8 out of d0[2], vmov.8 into d3[7]) and then q1 to be copied to q0.
68define <16 x i8> @ins16b16(<16 x i8> %tmp1, <16 x i8> %tmp2) {
69; CHECK-LABEL: ins16b16:
70; CHECK:       @ %bb.0:
71; CHECK-NEXT:    vmov.u8 r0, d0[2]
72; CHECK-NEXT:    vmov.8 d3[7], r0
73; CHECK-NEXT:    vorr q0, q1, q1
74; CHECK-NEXT:    bx lr
75  %tmp3 = extractelement <16 x i8> %tmp1, i32 2
76  %tmp4 = insertelement <16 x i8> %tmp2, i8 %tmp3, i32 15
77  ret <16 x i8> %tmp4
78}
79
; Copies lane 2 of %tmp1 into lane 7 of %tmp2 (both <8 x i16>) and returns
; the updated %tmp2. The assertions expect the halfword to travel through r0
; (vmov.u16 out of d0[2], vmov.16 into d3[3]) and then q1 to be copied to q0.
80define <8 x i16> @ins8h8(<8 x i16> %tmp1, <8 x i16> %tmp2) {
81; CHECK-LABEL: ins8h8:
82; CHECK:       @ %bb.0:
83; CHECK-NEXT:    vmov.u16 r0, d0[2]
84; CHECK-NEXT:    vmov.16 d3[3], r0
85; CHECK-NEXT:    vorr q0, q1, q1
86; CHECK-NEXT:    bx lr
87  %tmp3 = extractelement <8 x i16> %tmp1, i32 2
88  %tmp4 = insertelement <8 x i16> %tmp2, i16 %tmp3, i32 7
89  ret <8 x i16> %tmp4
90}
91
; Copies lane 2 of %tmp1 into lane 1 of %tmp2 (both <4 x i32>) and returns
; the updated %tmp2. The assertions expect the word to travel through r0
; (vmov.32 out of d1[0], vmov.32 into d2[1]) and then q1 to be copied to q0.
92define <4 x i32> @ins4s4(<4 x i32> %tmp1, <4 x i32> %tmp2) {
93; CHECK-LABEL: ins4s4:
94; CHECK:       @ %bb.0:
95; CHECK-NEXT:    vmov.32 r0, d1[0]
96; CHECK-NEXT:    vmov.32 d2[1], r0
97; CHECK-NEXT:    vorr q0, q1, q1
98; CHECK-NEXT:    bx lr
99  %tmp3 = extractelement <4 x i32> %tmp1, i32 2
100  %tmp4 = insertelement <4 x i32> %tmp2, i32 %tmp3, i32 1
101  ret <4 x i32> %tmp4
102}
103
; Copies lane 0 of %tmp1 into lane 1 of %tmp2 (both <2 x i64>) and returns
; the updated %tmp2. The assertions expect d0 to be moved to r0/r1, the two
; halves to be written into d3[0] and d3[1], and q1 to be copied to q0.
104define <2 x i64> @ins2d2(<2 x i64> %tmp1, <2 x i64> %tmp2) {
105; CHECK-LABEL: ins2d2:
106; CHECK:       @ %bb.0:
107; CHECK-NEXT:    vmov r0, r1, d0
108; CHECK-NEXT:    vmov.32 d3[0], r0
109; CHECK-NEXT:    vmov.32 d3[1], r1
110; CHECK-NEXT:    vorr q0, q1, q1
111; CHECK-NEXT:    bx lr
112  %tmp3 = extractelement <2 x i64> %tmp1, i32 0
113  %tmp4 = insertelement <2 x i64> %tmp2, i64 %tmp3, i32 1
114  ret <2 x i64> %tmp4
115}
116
; Copies lane 2 of %tmp1 into lane 1 of %tmp2 (both <4 x float>) and returns
; the updated %tmp2. The assertions expect a direct S-register move
; (vmov.f32 s5, s2) and then q1 to be copied to q0.
117define <4 x float> @ins4f4(<4 x float> %tmp1, <4 x float> %tmp2) {
118; CHECK-LABEL: ins4f4:
119; CHECK:       @ %bb.0:
120; CHECK-NEXT:    vmov.f32 s5, s2
121; CHECK-NEXT:    vorr q0, q1, q1
122; CHECK-NEXT:    bx lr
123  %tmp3 = extractelement <4 x float> %tmp1, i32 2
124  %tmp4 = insertelement <4 x float> %tmp2, float %tmp3, i32 1
125  ret <4 x float> %tmp4
126}
127
; Copies lane 0 of %tmp1 into lane 1 of %tmp2 (both <2 x double>) and returns
; the updated %tmp2. The assertions expect a D-register copy (vorr d3, d0, d0)
; and then q1 to be copied to q0.
128define <2 x double> @ins2df2(<2 x double> %tmp1, <2 x double> %tmp2) {
129; CHECK-LABEL: ins2df2:
130; CHECK:       @ %bb.0:
131; CHECK-NEXT:    vorr d3, d0, d0
132; CHECK-NEXT:    vorr q0, q1, q1
133; CHECK-NEXT:    bx lr
134  %tmp3 = extractelement <2 x double> %tmp1, i32 0
135  %tmp4 = insertelement <2 x double> %tmp2, double %tmp3, i32 1
136  ret <2 x double> %tmp4
137}
138
; Copies lane 2 of the <8 x i8> %tmp1 into lane 15 of the <16 x i8> %tmp2 and
; returns the updated %tmp2. The assertions expect the byte to travel through
; r0 (vmov.u8 out of d0[2], vmov.8 into d3[7]) and then q1 to be copied to q0.
139define <16 x i8> @ins8b16(<8 x i8> %tmp1, <16 x i8> %tmp2) {
140; CHECK-LABEL: ins8b16:
141; CHECK:       @ %bb.0:
142; CHECK-NEXT:    vmov.u8 r0, d0[2]
143; CHECK-NEXT:    vmov.8 d3[7], r0
144; CHECK-NEXT:    vorr q0, q1, q1
145; CHECK-NEXT:    bx lr
146  %tmp3 = extractelement <8 x i8> %tmp1, i32 2
147  %tmp4 = insertelement <16 x i8> %tmp2, i8 %tmp3, i32 15
148  ret <16 x i8> %tmp4
149}
150
; Copies lane 2 of the <4 x i16> %tmp1 into lane 7 of the <8 x i16> %tmp2 and
; returns the updated %tmp2. The assertions expect the halfword to travel
; through r0 (vmov.u16 out of d0[2], vmov.16 into d3[3]) and then q1 to be
; copied to q0.
151define <8 x i16> @ins4h8(<4 x i16> %tmp1, <8 x i16> %tmp2) {
152; CHECK-LABEL: ins4h8:
153; CHECK:       @ %bb.0:
154; CHECK-NEXT:    vmov.u16 r0, d0[2]
155; CHECK-NEXT:    vmov.16 d3[3], r0
156; CHECK-NEXT:    vorr q0, q1, q1
157; CHECK-NEXT:    bx lr
158  %tmp3 = extractelement <4 x i16> %tmp1, i32 2
159  %tmp4 = insertelement <8 x i16> %tmp2, i16 %tmp3, i32 7
160  ret <8 x i16> %tmp4
161}
162
; Copies lane 1 of the <2 x i32> %tmp1 into lane 1 of the <4 x i32> %tmp2 and
; returns the updated %tmp2. The assertions expect the word to travel through
; r0 (vmov.32 out of d0[1], vmov.32 into d2[1]) and then q1 to be copied to q0.
163define <4 x i32> @ins2s4(<2 x i32> %tmp1, <4 x i32> %tmp2) {
164; CHECK-LABEL: ins2s4:
165; CHECK:       @ %bb.0:
166; CHECK-NEXT:    vmov.32 r0, d0[1]
167; CHECK-NEXT:    vmov.32 d2[1], r0
168; CHECK-NEXT:    vorr q0, q1, q1
169; CHECK-NEXT:    bx lr
170  %tmp3 = extractelement <2 x i32> %tmp1, i32 1
171  %tmp4 = insertelement <4 x i32> %tmp2, i32 %tmp3, i32 1
172  ret <4 x i32> %tmp4
173}
174
; Copies the only lane of the <1 x i64> %tmp1 into lane 1 of the <2 x i64>
; %tmp2 and returns the updated %tmp2. The assertions expect the value to be
; read as two 32-bit halves from d0 into r0/r1, written into d3[0] and d3[1],
; and then q1 to be copied to q0.
175define <2 x i64> @ins1d2(<1 x i64> %tmp1, <2 x i64> %tmp2) {
176; CHECK-LABEL: ins1d2:
177; CHECK:       @ %bb.0:
178; CHECK-NEXT:    vmov.32 r0, d0[0]
179; CHECK-NEXT:    vmov.32 r1, d0[1]
180; CHECK-NEXT:    vmov.32 d3[0], r0
181; CHECK-NEXT:    vmov.32 d3[1], r1
182; CHECK-NEXT:    vorr q0, q1, q1
183; CHECK-NEXT:    bx lr
184  %tmp3 = extractelement <1 x i64> %tmp1, i32 0
185  %tmp4 = insertelement <2 x i64> %tmp2, i64 %tmp3, i32 1
186  ret <2 x i64> %tmp4
187}
188
; Copies lane 1 of the <2 x float> %tmp1 into lane 1 of the <4 x float> %tmp2
; and returns the updated %tmp2. The assertions expect a direct S-register
; move (vmov.f32 s5, s1) and then q1 to be copied to q0.
189define <4 x float> @ins2f4(<2 x float> %tmp1, <4 x float> %tmp2) {
190; CHECK-LABEL: ins2f4:
191; CHECK:       @ %bb.0:
192; CHECK-NEXT:    vmov.f32 s5, s1
193; CHECK-NEXT:    vorr q0, q1, q1
194; CHECK-NEXT:    bx lr
195  %tmp3 = extractelement <2 x float> %tmp1, i32 1
196  %tmp4 = insertelement <4 x float> %tmp2, float %tmp3, i32 1
197  ret <4 x float> %tmp4
198}
199
; Copies the only lane of the <1 x double> %tmp1 into lane 1 of the
; <2 x double> %tmp2 and returns the updated %tmp2. The assertions expect a
; D-register copy (vorr d3, d0, d0) and then q1 to be copied to q0.
200define <2 x double> @ins1f2(<1 x double> %tmp1, <2 x double> %tmp2) {
201; CHECK-LABEL: ins1f2:
202; CHECK:       @ %bb.0:
203; CHECK-NEXT:    vorr d3, d0, d0
204; CHECK-NEXT:    vorr q0, q1, q1
205; CHECK-NEXT:    bx lr
206  %tmp3 = extractelement <1 x double> %tmp1, i32 0
207  %tmp4 = insertelement <2 x double> %tmp2, double %tmp3, i32 1
208  ret <2 x double> %tmp4
209}
210
; Same extract/insert as @ins1f2, but the <2 x double> destination is declared
; as the first parameter and the <1 x double> source as the second. With this
; parameter order the assertions expect only one move (vmov.f64 d1, d2).
211define <2 x double> @ins1f2_args_flipped(<2 x double> %tmp2, <1 x double> %tmp1) {
212; CHECK-LABEL: ins1f2_args_flipped:
213; CHECK:       @ %bb.0:
214; CHECK-NEXT:    vmov.f64 d1, d2
215; CHECK-NEXT:    bx lr
216  %tmp3 = extractelement <1 x double> %tmp1, i32 0
217  %tmp4 = insertelement <2 x double> %tmp2, double %tmp3, i32 1
218  ret <2 x double> %tmp4
219}
220
; Copies lane 2 of the <16 x i8> %tmp1 into lane 7 of the <8 x i8> %tmp2 and
; returns the updated %tmp2. The assertions expect the byte to travel through
; r0 (vmov.u8 out of d0[2], vmov.8 into d2[7]) and then d2 to be copied to d0.
221define <8 x i8> @ins16b8(<16 x i8> %tmp1, <8 x i8> %tmp2) {
222; CHECK-LABEL: ins16b8:
223; CHECK:       @ %bb.0:
224; CHECK-NEXT:    vmov.u8 r0, d0[2]
225; CHECK-NEXT:    vmov.8 d2[7], r0
226; CHECK-NEXT:    vorr d0, d2, d2
227; CHECK-NEXT:    bx lr
228  %tmp3 = extractelement <16 x i8> %tmp1, i32 2
229  %tmp4 = insertelement <8 x i8> %tmp2, i8 %tmp3, i32 7
230  ret <8 x i8> %tmp4
231}
232
; Copies lane 2 of the <8 x i16> %tmp1 into lane 3 of the <4 x i16> %tmp2 and
; returns the updated %tmp2. The assertions expect the halfword to travel
; through r0 (vmov.u16 out of d0[2], vmov.16 into d2[3]) and then d2 to be
; copied to d0.
233define <4 x i16> @ins8h4(<8 x i16> %tmp1, <4 x i16> %tmp2) {
234; CHECK-LABEL: ins8h4:
235; CHECK:       @ %bb.0:
236; CHECK-NEXT:    vmov.u16 r0, d0[2]
237; CHECK-NEXT:    vmov.16 d2[3], r0
238; CHECK-NEXT:    vorr d0, d2, d2
239; CHECK-NEXT:    bx lr
240  %tmp3 = extractelement <8 x i16> %tmp1, i32 2
241  %tmp4 = insertelement <4 x i16> %tmp2, i16 %tmp3, i32 3
242  ret <4 x i16> %tmp4
243}
244
; Copies lane 2 of the <4 x i32> %tmp1 into lane 1 of the <2 x i32> %tmp2 and
; returns the updated %tmp2. The assertions expect the word to travel through
; r0 (vmov.32 out of d1[0], vmov.32 into d2[1]) and then d2 to be copied to d0.
245define <2 x i32> @ins4s2(<4 x i32> %tmp1, <2 x i32> %tmp2) {
246; CHECK-LABEL: ins4s2:
247; CHECK:       @ %bb.0:
248; CHECK-NEXT:    vmov.32 r0, d1[0]
249; CHECK-NEXT:    vmov.32 d2[1], r0
250; CHECK-NEXT:    vorr d0, d2, d2
251; CHECK-NEXT:    bx lr
252  %tmp3 = extractelement <4 x i32> %tmp1, i32 2
253  %tmp4 = insertelement <1 x i64> %tmp2, i64 %tmp3, i32 0
254  ret <2 x i32> %tmp4
255}
256
; Copies lane 0 of the <2 x i64> %tmp1 into the only lane of the <1 x i64>
; %tmp2, so every lane of the result comes from %tmp1. The assertions expect
; no real instructions before the return, only a register kill annotation.
257define <1 x i64> @ins2d1(<2 x i64> %tmp1, <1 x i64> %tmp2) {
258; CHECK-LABEL: ins2d1:
259; CHECK:       @ %bb.0:
260; CHECK-NEXT:    @ kill: def $d0 killed $d0 killed $q0
261; CHECK-NEXT:    bx lr
262  %tmp3 = extractelement <2 x i64> %tmp1, i32 0
263  %tmp4 = insertelement <1 x i64> %tmp2, i64 %tmp3, i32 0
264  ret <1 x i64> %tmp4
265}
266
; Copies lane 2 of the <4 x float> %tmp1 into lane 1 of the <2 x float> %tmp2
; and returns the updated %tmp2. The assertions expect a direct S-register
; move (vmov.f32 s5, s2) and then d2 to be copied to d0.
267define <2 x float> @ins4f2(<4 x float> %tmp1, <2 x float> %tmp2) {
268; CHECK-LABEL: ins4f2:
269; CHECK:       @ %bb.0:
270; CHECK-NEXT:    vmov.f32 s5, s2
271; CHECK-NEXT:    vmov.f64 d0, d2
272; CHECK-NEXT:    bx lr
273  %tmp3 = extractelement <4 x float> %tmp1, i32 2
274  %tmp4 = insertelement <2 x float> %tmp2, float %tmp3, i32 1
275  ret <2 x float> %tmp4
276}
277
; Copies lane 1 of the <2 x double> %tmp1 into the only lane of the
; <1 x double> %tmp2. The assertions expect a single D-register move
; (vmov.f64 d0, d1) followed by the return.
278define <1 x double> @ins2f1(<2 x double> %tmp1, <1 x double> %tmp2) {
279; CHECK-LABEL: ins2f1:
280; CHECK:       @ %bb.0:
281; CHECK-NEXT:    vmov.f64 d0, d1
282; CHECK-NEXT:    bx lr
283  %tmp3 = extractelement <2 x double> %tmp1, i32 1
284  %tmp4 = insertelement <1 x double> %tmp2, double %tmp3, i32 0
285  ret <1 x double> %tmp4
286}
287
; Copies lane 2 of %tmp1 into lane 4 of %tmp2 (both <8 x i8>) and returns the
; updated %tmp2. The assertions expect the byte to travel through r0
; (vmov.u8 out of d0[2], vmov.8 into d1[4]) and then d1 to be copied to d0.
288define <8 x i8> @ins8b8(<8 x i8> %tmp1, <8 x i8> %tmp2) {
289; CHECK-LABEL: ins8b8:
290; CHECK:       @ %bb.0:
291; CHECK-NEXT:    vmov.u8 r0, d0[2]
292; CHECK-NEXT:    vmov.8 d1[4], r0
293; CHECK-NEXT:    vorr d0, d1, d1
294; CHECK-NEXT:    bx lr
295  %tmp3 = extractelement <8 x i8> %tmp1, i32 2
296  %tmp4 = insertelement <8 x i8> %tmp2, i8 %tmp3, i32 4
297  ret <8 x i8> %tmp4
298}
299
; Copies lane 2 of %tmp1 into lane 3 of %tmp2 (both <4 x i16>) and returns the
; updated %tmp2. The assertions expect the halfword to travel through r0
; (vmov.u16 out of d0[2], vmov.16 into d1[3]) and then d1 to be copied to d0.
300define <4 x i16> @ins4h4(<4 x i16> %tmp1, <4 x i16> %tmp2) {
301; CHECK-LABEL: ins4h4:
302; CHECK:       @ %bb.0:
303; CHECK-NEXT:    vmov.u16 r0, d0[2]
304; CHECK-NEXT:    vmov.16 d1[3], r0
305; CHECK-NEXT:    vorr d0, d1, d1
306; CHECK-NEXT:    bx lr
307  %tmp3 = extractelement <4 x i16> %tmp1, i32 2
308  %tmp4 = insertelement <4 x i16> %tmp2, i16 %tmp3, i32 3
309  ret <4 x i16> %tmp4
310}
311
; Copies lane 0 of %tmp1 into lane 1 of %tmp2 (both <2 x i32>) and returns the
; updated %tmp2. The assertions expect the word to travel through r0
; (vmov.32 out of d0[0], vmov.32 into d1[1]) and then d1 to be copied to d0.
312define <2 x i32> @ins2s2(<2 x i32> %tmp1, <2 x i32> %tmp2) {
313; CHECK-LABEL: ins2s2:
314; CHECK:       @ %bb.0:
315; CHECK-NEXT:    vmov.32 r0, d0[0]
316; CHECK-NEXT:    vmov.32 d1[1], r0
317; CHECK-NEXT:    vorr d0, d1, d1
318; CHECK-NEXT:    bx lr
319  %tmp3 = extractelement <2 x i32> %tmp1, i32 0
320  %tmp4 = insertelement <2 x i32> %tmp2, i32 %tmp3, i32 1
321  ret <2 x i32> %tmp4
322}
323
; Copies the only lane of %tmp1 into the only lane of %tmp2 (both <1 x i64>),
; so the result holds exactly the value of %tmp1. The assertions expect the
; function body to be just the return.
324define <1 x i64> @ins1d1(<1 x i64> %tmp1, <1 x i64> %tmp2) {
325; CHECK-LABEL: ins1d1:
326; CHECK:       @ %bb.0:
327; CHECK-NEXT:    bx lr
328  %tmp3 = extractelement <1 x i64> %tmp1, i32 0
329  %tmp4 = insertelement <1 x i64> %tmp2, i64 %tmp3, i32 0
330  ret <1 x i64> %tmp4
331}
332
; Copies lane 0 of %tmp1 into lane 1 of %tmp2 (both <2 x float>) and returns
; the updated %tmp2. The assertions expect a direct S-register move
; (vmov.f32 s3, s0) and then d1 to be copied to d0.
333define <2 x float> @ins2f2(<2 x float> %tmp1, <2 x float> %tmp2) {
334; CHECK-LABEL: ins2f2:
335; CHECK:       @ %bb.0:
336; CHECK-NEXT:    vmov.f32 s3, s0
337; CHECK-NEXT:    vmov.f64 d0, d1
338; CHECK-NEXT:    bx lr
339  %tmp3 = extractelement <2 x float> %tmp1, i32 0
340  %tmp4 = insertelement <2 x float> %tmp2, float %tmp3, i32 1
341  ret <2 x float> %tmp4
342}
343
; Copies the only lane of %tmp1 into the only lane of %tmp2 (both
; <1 x double>), so the result holds exactly the value of %tmp1. The
; assertions expect the function body to be just the return.
344define <1 x double> @ins1df1(<1 x double> %tmp1, <1 x double> %tmp2) {
345; CHECK-LABEL: ins1df1:
346; CHECK:       @ %bb.0:
347; CHECK-NEXT:    bx lr
348  %tmp3 = extractelement <1 x double> %tmp1, i32 0
349  %tmp4 = insertelement <1 x double> %tmp2, double %tmp3, i32 0
350  ret <1 x double> %tmp4
351}
352
; Extracts lane 8 of a <16 x i8> and zero-extends it to i32.
; The assertions expect one unsigned lane move (vmov.u8 r0, d1[0]) with no
; separate extension instruction.
353define i32 @umovw16b(<16 x i8> %tmp1) {
354; CHECK-LABEL: umovw16b:
355; CHECK:       @ %bb.0:
356; CHECK-NEXT:    vmov.u8 r0, d1[0]
357; CHECK-NEXT:    bx lr
358  %tmp3 = extractelement <16 x i8> %tmp1, i32 8
359  %tmp4 = zext i8 %tmp3 to i32
360  ret i32 %tmp4
361}
362
; Extracts lane 2 of an <8 x i16> and zero-extends it to i32.
; The assertions expect one unsigned lane move (vmov.u16 r0, d0[2]) with no
; separate extension instruction.
363define i32 @umovw8h(<8 x i16> %tmp1) {
364; CHECK-LABEL: umovw8h:
365; CHECK:       @ %bb.0:
366; CHECK-NEXT:    vmov.u16 r0, d0[2]
367; CHECK-NEXT:    bx lr
368  %tmp3 = extractelement <8 x i16> %tmp1, i32 2
369  %tmp4 = zext i16 %tmp3 to i32
370  ret i32 %tmp4
371}
372
; Extracts lane 2 of a <4 x i32> and returns it unchanged.
; The assertions expect a single vmov.32 r0, d1[0].
373define i32 @umovw4s(<4 x i32> %tmp1) {
374; CHECK-LABEL: umovw4s:
375; CHECK:       @ %bb.0:
376; CHECK-NEXT:    vmov.32 r0, d1[0]
377; CHECK-NEXT:    bx lr
378  %tmp3 = extractelement <4 x i32> %tmp1, i32 2
379  ret i32 %tmp3
380}
381
; Extracts lane 1 of a <2 x i64> and returns it as an i64.
; The assertions expect a single two-register move (vmov r0, r1, d1).
382define i64 @umovx2d(<2 x i64> %tmp1) {
383; CHECK-LABEL: umovx2d:
384; CHECK:       @ %bb.0:
385; CHECK-NEXT:    vmov r0, r1, d1
386; CHECK-NEXT:    bx lr
387  %tmp3 = extractelement <2 x i64> %tmp1, i32 1
388  ret i64 %tmp3
389}
390
; Extracts lane 7 of an <8 x i8> and zero-extends it to i32.
; The assertions expect one unsigned lane move (vmov.u8 r0, d0[7]) with no
; separate extension instruction.
391define i32 @umovw8b(<8 x i8> %tmp1) {
392; CHECK-LABEL: umovw8b:
393; CHECK:       @ %bb.0:
394; CHECK-NEXT:    vmov.u8 r0, d0[7]
395; CHECK-NEXT:    bx lr
396  %tmp3 = extractelement <8 x i8> %tmp1, i32 7
397  %tmp4 = zext i8 %tmp3 to i32
398  ret i32 %tmp4
399}
400
; Extracts lane 2 of a <4 x i16> and zero-extends it to i32.
; The assertions expect one unsigned lane move (vmov.u16 r0, d0[2]) with no
; separate extension instruction.
401define i32 @umovw4h(<4 x i16> %tmp1) {
402; CHECK-LABEL: umovw4h:
403; CHECK:       @ %bb.0:
404; CHECK-NEXT:    vmov.u16 r0, d0[2]
405; CHECK-NEXT:    bx lr
406  %tmp3 = extractelement <4 x i16> %tmp1, i32 2
407  %tmp4 = zext i16 %tmp3 to i32
408  ret i32 %tmp4
409}
410
; Extracts lane 1 of a <2 x i32> and returns it unchanged.
; The assertions expect a single vmov.32 r0, d0[1].
411define i32 @umovw2s(<2 x i32> %tmp1) {
412; CHECK-LABEL: umovw2s:
413; CHECK:       @ %bb.0:
414; CHECK-NEXT:    vmov.32 r0, d0[1]
415; CHECK-NEXT:    bx lr
416  %tmp3 = extractelement <2 x i32> %tmp1, i32 1
417  ret i32 %tmp3
418}
419
; Extracts the only lane of a <1 x i64> and returns it as an i64.
; The assertions expect the value to be read as two 32-bit halves:
; d0[0] into r0 and d0[1] into r1.
420define i64 @umovx1d(<1 x i64> %tmp1) {
421; CHECK-LABEL: umovx1d:
422; CHECK:       @ %bb.0:
423; CHECK-NEXT:    vmov.32 r0, d0[0]
424; CHECK-NEXT:    vmov.32 r1, d0[1]
425; CHECK-NEXT:    bx lr
426  %tmp3 = extractelement <1 x i64> %tmp1, i32 0
427  ret i64 %tmp3
428}
429
; Extracts lane 8 of a <16 x i8>, sign-extends it to i32 and returns the value
; added to itself. The assertions expect a signed lane move
; (vmov.s8 r0, d1[0]) followed by add r0, r0, r0.
430define i32 @smovw16b(<16 x i8> %tmp1) {
431; CHECK-LABEL: smovw16b:
432; CHECK:       @ %bb.0:
433; CHECK-NEXT:    vmov.s8 r0, d1[0]
434; CHECK-NEXT:    add r0, r0, r0
435; CHECK-NEXT:    bx lr
436  %tmp3 = extractelement <16 x i8> %tmp1, i32 8
437  %tmp4 = sext i8 %tmp3 to i32
438  %tmp5 = add i32 %tmp4, %tmp4
439  ret i32 %tmp5
440}
441
; Extracts lane 2 of an <8 x i16>, sign-extends it to i32 and returns the
; value added to itself. The assertions expect a signed lane move
; (vmov.s16 r0, d0[2]) followed by add r0, r0, r0.
442define i32 @smovw8h(<8 x i16> %tmp1) {
443; CHECK-LABEL: smovw8h:
444; CHECK:       @ %bb.0:
445; CHECK-NEXT:    vmov.s16 r0, d0[2]
446; CHECK-NEXT:    add r0, r0, r0
447; CHECK-NEXT:    bx lr
448  %tmp3 = extractelement <8 x i16> %tmp1, i32 2
449  %tmp4 = sext i16 %tmp3 to i32
450  %tmp5 = add i32 %tmp4, %tmp4
451  ret i32 %tmp5
452}
453
; Extracts lane 8 of a <16 x i8> and sign-extends it to i64.
; The assertions expect a signed lane move (vmov.s8 r0, d1[0]) followed by
; asr r1, r0, #31, which fills r1 with copies of r0's sign bit.
454define i64 @smovx16b(<16 x i8> %tmp1) {
455; CHECK-LABEL: smovx16b:
456; CHECK:       @ %bb.0:
457; CHECK-NEXT:    vmov.s8 r0, d1[0]
458; CHECK-NEXT:    asr r1, r0, #31
459; CHECK-NEXT:    bx lr
460  %tmp3 = extractelement <16 x i8> %tmp1, i32 8
461  %tmp4 = sext i8 %tmp3 to i64
462  ret i64 %tmp4
463}
464
; Extracts lane 2 of an <8 x i16> and sign-extends it to i64.
; The assertions expect a signed lane move (vmov.s16 r0, d0[2]) followed by
; asr r1, r0, #31, which fills r1 with copies of r0's sign bit.
465define i64 @smovx8h(<8 x i16> %tmp1) {
466; CHECK-LABEL: smovx8h:
467; CHECK:       @ %bb.0:
468; CHECK-NEXT:    vmov.s16 r0, d0[2]
469; CHECK-NEXT:    asr r1, r0, #31
470; CHECK-NEXT:    bx lr
471  %tmp3 = extractelement <8 x i16> %tmp1, i32 2
472  %tmp4 = sext i16 %tmp3 to i64
473  ret i64 %tmp4
474}
475
; Extracts lane 2 of a <4 x i32> and sign-extends it to i64.
; The assertions expect vmov.32 r0, d1[0] followed by asr r1, r0, #31, which
; fills r1 with copies of r0's sign bit.
476define i64 @smovx4s(<4 x i32> %tmp1) {
477; CHECK-LABEL: smovx4s:
478; CHECK:       @ %bb.0:
479; CHECK-NEXT:    vmov.32 r0, d1[0]
480; CHECK-NEXT:    asr r1, r0, #31
481; CHECK-NEXT:    bx lr
482  %tmp3 = extractelement <4 x i32> %tmp1, i32 2
483  %tmp4 = sext i32 %tmp3 to i64
484  ret i64 %tmp4
485}
486
; Extracts lane 4 of an <8 x i8>, sign-extends it to i32 and returns the value
; added to itself. The assertions expect a signed lane move
; (vmov.s8 r0, d0[4]) followed by add r0, r0, r0.
487define i32 @smovw8b(<8 x i8> %tmp1) {
488; CHECK-LABEL: smovw8b:
489; CHECK:       @ %bb.0:
490; CHECK-NEXT:    vmov.s8 r0, d0[4]
491; CHECK-NEXT:    add r0, r0, r0
492; CHECK-NEXT:    bx lr
493  %tmp3 = extractelement <8 x i8> %tmp1, i32 4
494  %tmp4 = sext i8 %tmp3 to i32
495  %tmp5 = add i32 %tmp4, %tmp4
496  ret i32 %tmp5
497}
498
; Extracts lane 2 of a <4 x i16>, sign-extends it to i32 and returns the value
; added to itself. The assertions expect a signed lane move
; (vmov.s16 r0, d0[2]) followed by add r0, r0, r0.
499define i32 @smovw4h(<4 x i16> %tmp1) {
500; CHECK-LABEL: smovw4h:
501; CHECK:       @ %bb.0:
502; CHECK-NEXT:    vmov.s16 r0, d0[2]
503; CHECK-NEXT:    add r0, r0, r0
504; CHECK-NEXT:    bx lr
505  %tmp3 = extractelement <4 x i16> %tmp1, i32 2
506  %tmp4 = sext i16 %tmp3 to i32
507  %tmp5 = add i32 %tmp4, %tmp4
508  ret i32 %tmp5
509}
510
; Extracts lane 6 of an <8 x i8> and sign-extends it to i32.
; The assertions expect one signed lane move (vmov.s8 r0, d0[6]) with no
; separate extension instruction.
511define i32 @smovx8b(<8 x i8> %tmp1) {
512; CHECK-LABEL: smovx8b:
513; CHECK:       @ %bb.0:
514; CHECK-NEXT:    vmov.s8 r0, d0[6]
515; CHECK-NEXT:    bx lr
516  %tmp3 = extractelement <8 x i8> %tmp1, i32 6
517  %tmp4 = sext i8 %tmp3 to i32
518  ret i32 %tmp4
519}
520
; Extracts lane 2 of a <4 x i16> and sign-extends it to i32.
; The assertions expect one signed lane move (vmov.s16 r0, d0[2]) with no
; separate extension instruction.
521define i32 @smovx4h(<4 x i16> %tmp1) {
522; CHECK-LABEL: smovx4h:
523; CHECK:       @ %bb.0:
524; CHECK-NEXT:    vmov.s16 r0, d0[2]
525; CHECK-NEXT:    bx lr
526  %tmp3 = extractelement <4 x i16> %tmp1, i32 2
527  %tmp4 = sext i16 %tmp3 to i32
528  ret i32 %tmp4
529}
530
; Extracts lane 1 of a <2 x i32> and sign-extends it to i64.
; The assertions expect vmov.32 r0, d0[1] followed by asr r1, r0, #31, which
; fills r1 with copies of r0's sign bit.
531define i64 @smovx2s(<2 x i32> %tmp1) {
532; CHECK-LABEL: smovx2s:
533; CHECK:       @ %bb.0:
534; CHECK-NEXT:    vmov.32 r0, d0[1]
535; CHECK-NEXT:    asr r1, r0, #31
536; CHECK-NEXT:    bx lr
537  %tmp3 = extractelement <2 x i32> %tmp1, i32 1
538  %tmp4 = sext i32 %tmp3 to i64
539  ret i64 %tmp4
540}
541
; Two-input <8 x i8> shuffle that keeps every lane of %v1 except lane 5, which
; is taken from mask index 11 (lane 3 of %v2). The assertions expect a vtbl.8
; over {d0, d1} whose index vector is loaded from a constant-pool entry
; containing the same eight mask values.
542define <8 x i8> @test_vcopy_lane_s8(<8 x i8> %v1, <8 x i8> %v2) {
543; CHECK-LABEL: test_vcopy_lane_s8:
544; CHECK:       @ %bb.0:
545; CHECK-NEXT:    @ kill: def $d1 killed $d1 killed $q0 def $q0
546; CHECK-NEXT:    vldr d16, .LCPI50_0
547; CHECK-NEXT:    @ kill: def $d0 killed $d0 killed $q0 def $q0
548; CHECK-NEXT:    vtbl.8 d0, {d0, d1}, d16
549; CHECK-NEXT:    bx lr
550; CHECK-NEXT:    .p2align 3
551; CHECK-NEXT:  @ %bb.1:
552; CHECK-NEXT:  .LCPI50_0:
553; CHECK-NEXT:    .byte 0 @ 0x0
554; CHECK-NEXT:    .byte 1 @ 0x1
555; CHECK-NEXT:    .byte 2 @ 0x2
556; CHECK-NEXT:    .byte 3 @ 0x3
557; CHECK-NEXT:    .byte 4 @ 0x4
558; CHECK-NEXT:    .byte 11 @ 0xb
559; CHECK-NEXT:    .byte 6 @ 0x6
560; CHECK-NEXT:    .byte 7 @ 0x7
561  %vset_lane = shufflevector <8 x i8> %v1, <8 x i8> %v2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 11, i32 6, i32 7>
562  ret <8 x i8> %vset_lane
563}
564
; Two-input <16 x i8> shuffle that keeps every lane of %v1 except lane 14,
; which is taken from mask index 22 (lane 6 of %v2). The assertions expect
; only d1 to be rewritten, by a vtbl.8 over {d1, d2}; within that two-register
; table the replaced byte is selected by index 14.
565define <16 x i8> @test_vcopyq_laneq_s8(<16 x i8> %v1, <16 x i8> %v2) {
566; CHECK-LABEL: test_vcopyq_laneq_s8:
567; CHECK:       @ %bb.0:
568; CHECK-NEXT:    @ kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
569; CHECK-NEXT:    vldr d16, .LCPI51_0
570; CHECK-NEXT:    @ kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
571; CHECK-NEXT:    vtbl.8 d1, {d1, d2}, d16
572; CHECK-NEXT:    @ kill: def $q0 killed $q0 killed $q0_q1
573; CHECK-NEXT:    bx lr
574; CHECK-NEXT:    .p2align 3
575; CHECK-NEXT:  @ %bb.1:
576; CHECK-NEXT:  .LCPI51_0:
577; CHECK-NEXT:    .byte 0 @ 0x0
578; CHECK-NEXT:    .byte 1 @ 0x1
579; CHECK-NEXT:    .byte 2 @ 0x2
580; CHECK-NEXT:    .byte 3 @ 0x3
581; CHECK-NEXT:    .byte 4 @ 0x4
582; CHECK-NEXT:    .byte 5 @ 0x5
583; CHECK-NEXT:    .byte 14 @ 0xe
584; CHECK-NEXT:    .byte 7 @ 0x7
585  %vset_lane = shufflevector <16 x i8> %v1, <16 x i8> %v2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 22, i32 15>
586  ret <16 x i8> %vset_lane
587}
588
; Two-input <8 x i8> shuffle with the operand roles swapped: lanes 0-6 come
; from lanes 0-6 of %v2 (mask indices 8-14) and lane 7 comes from lane 0 of
; %v1. The assertions expect a vtbl.8 over {d0, d1} whose index vector is
; loaded from a constant-pool entry containing the same eight mask values.
589define <8 x i8> @test_vcopy_lane_swap_s8(<8 x i8> %v1, <8 x i8> %v2) {
590; CHECK-LABEL: test_vcopy_lane_swap_s8:
591; CHECK:       @ %bb.0:
592; CHECK-NEXT:    @ kill: def $d1 killed $d1 killed $q0 def $q0
593; CHECK-NEXT:    vldr d16, .LCPI52_0
594; CHECK-NEXT:    @ kill: def $d0 killed $d0 killed $q0 def $q0
595; CHECK-NEXT:    vtbl.8 d0, {d0, d1}, d16
596; CHECK-NEXT:    bx lr
597; CHECK-NEXT:    .p2align 3
598; CHECK-NEXT:  @ %bb.1:
599; CHECK-NEXT:  .LCPI52_0:
600; CHECK-NEXT:    .byte 8 @ 0x8
601; CHECK-NEXT:    .byte 9 @ 0x9
602; CHECK-NEXT:    .byte 10 @ 0xa
603; CHECK-NEXT:    .byte 11 @ 0xb
604; CHECK-NEXT:    .byte 12 @ 0xc
605; CHECK-NEXT:    .byte 13 @ 0xd
606; CHECK-NEXT:    .byte 14 @ 0xe
607; CHECK-NEXT:    .byte 0 @ 0x0
608  %vset_lane = shufflevector <8 x i8> %v1, <8 x i8> %v2, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 0>
609  ret <8 x i8> %vset_lane
610}
611
; Two-input <16 x i8> shuffle with the operand roles swapped: lane 0 comes
; from lane 15 of %v1 and lanes 1-15 come from lanes 1-15 of %v2 (mask
; indices 17-31). The assertions expect only d2 to be rewritten, by a vtbl.8
; over {d1, d2}, and then q1 to be copied to q0.
612define <16 x i8> @test_vcopyq_laneq_swap_s8(<16 x i8> %v1, <16 x i8> %v2) {
613; CHECK-LABEL: test_vcopyq_laneq_swap_s8:
614; CHECK:       @ %bb.0:
615; CHECK-NEXT:    @ kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
616; CHECK-NEXT:    vldr d16, .LCPI53_0
617; CHECK-NEXT:    @ kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
618; CHECK-NEXT:    vtbl.8 d2, {d1, d2}, d16
619; CHECK-NEXT:    vorr q0, q1, q1
620; CHECK-NEXT:    bx lr
621; CHECK-NEXT:    .p2align 3
622; CHECK-NEXT:  @ %bb.1:
623; CHECK-NEXT:  .LCPI53_0:
624; CHECK-NEXT:    .byte 7 @ 0x7
625; CHECK-NEXT:    .byte 9 @ 0x9
626; CHECK-NEXT:    .byte 10 @ 0xa
627; CHECK-NEXT:    .byte 11 @ 0xb
628; CHECK-NEXT:    .byte 12 @ 0xc
629; CHECK-NEXT:    .byte 13 @ 0xd
630; CHECK-NEXT:    .byte 14 @ 0xe
631; CHECK-NEXT:    .byte 15 @ 0xf
632  %vset_lane = shufflevector <16 x i8> %v1, <16 x i8> %v2, <16 x i32> <i32 15, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
633  ret <16 x i8> %vset_lane
634}
635
; Builds an <8 x i8> by inserting the same scalar %v1 into each of the eight
; lanes of an undef vector. The assertions expect the whole insert chain to
; become a single vdup.8 d0, r0.
636define <8 x i8> @test_vdup_n_u8(i8 %v1) #0 {
637; CHECK-LABEL: test_vdup_n_u8:
638; CHECK:       @ %bb.0:
639; CHECK-NEXT:    vdup.8 d0, r0
640; CHECK-NEXT:    bx lr
641  %vecinit.i = insertelement <8 x i8> undef, i8 %v1, i32 0
642  %vecinit1.i = insertelement <8 x i8> %vecinit.i, i8 %v1, i32 1
643  %vecinit2.i = insertelement <8 x i8> %vecinit1.i, i8 %v1, i32 2
644  %vecinit3.i = insertelement <8 x i8> %vecinit2.i, i8 %v1, i32 3
645  %vecinit4.i = insertelement <8 x i8> %vecinit3.i, i8 %v1, i32 4
646  %vecinit5.i = insertelement <8 x i8> %vecinit4.i, i8 %v1, i32 5
647  %vecinit6.i = insertelement <8 x i8> %vecinit5.i, i8 %v1, i32 6
648  %vecinit7.i = insertelement <8 x i8> %vecinit6.i, i8 %v1, i32 7
649  ret <8 x i8> %vecinit7.i
650}
651
; Builds a <4 x i16> by inserting the same scalar %v1 into each of the four
; lanes of an undef vector. The assertions expect the whole insert chain to
; become a single vdup.16 d0, r0.
652define <4 x i16> @test_vdup_n_u16(i16 %v1) #0 {
653; CHECK-LABEL: test_vdup_n_u16:
654; CHECK:       @ %bb.0:
655; CHECK-NEXT:    vdup.16 d0, r0
656; CHECK-NEXT:    bx lr
657  %vecinit.i = insertelement <4 x i16> undef, i16 %v1, i32 0
658  %vecinit1.i = insertelement <4 x i16> %vecinit.i, i16 %v1, i32 1
659  %vecinit2.i = insertelement <4 x i16> %vecinit1.i, i16 %v1, i32 2
660  %vecinit3.i = insertelement <4 x i16> %vecinit2.i, i16 %v1, i32 3
661  ret <4 x i16> %vecinit3.i
662}
663
; Builds a <2 x i32> by inserting the same scalar %v1 into both lanes of an
; undef vector. The assertions expect the insert chain to become a single
; vdup.32 d0, r0.
664define <2 x i32> @test_vdup_n_u32(i32 %v1) #0 {
665; CHECK-LABEL: test_vdup_n_u32:
666; CHECK:       @ %bb.0:
667; CHECK-NEXT:    vdup.32 d0, r0
668; CHECK-NEXT:    bx lr
669  %vecinit.i = insertelement <2 x i32> undef, i32 %v1, i32 0
670  %vecinit1.i = insertelement <2 x i32> %vecinit.i, i32 %v1, i32 1
671  ret <2 x i32> %vecinit1.i
672}
673
; Builds a <1 x i64> by inserting the scalar %v1 into the only lane of an
; undef vector. The assertions expect the value to be written as two 32-bit
; halves: r0 into d0[0] and r1 into d0[1].
674define <1 x i64> @test_vdup_n_u64(i64 %v1) #0 {
675; CHECK-LABEL: test_vdup_n_u64:
676; CHECK:       @ %bb.0:
677; CHECK-NEXT:    vmov.32 d0[0], r0
678; CHECK-NEXT:    vmov.32 d0[1], r1
679; CHECK-NEXT:    bx lr
680  %vecinit.i = insertelement <1 x i64> undef, i64 %v1, i32 0
681  ret <1 x i64> %vecinit.i
682}
683
; Builds a <16 x i8> by inserting the same scalar %v1 into each of the sixteen
; lanes of an undef vector. The assertions expect the whole insert chain to
; become a single vdup.8 q0, r0.
684define <16 x i8> @test_vdupq_n_u8(i8 %v1) #0 {
685; CHECK-LABEL: test_vdupq_n_u8:
686; CHECK:       @ %bb.0:
687; CHECK-NEXT:    vdup.8 q0, r0
688; CHECK-NEXT:    bx lr
689  %vecinit.i = insertelement <16 x i8> undef, i8 %v1, i32 0
690  %vecinit1.i = insertelement <16 x i8> %vecinit.i, i8 %v1, i32 1
691  %vecinit2.i = insertelement <16 x i8> %vecinit1.i, i8 %v1, i32 2
692  %vecinit3.i = insertelement <16 x i8> %vecinit2.i, i8 %v1, i32 3
693  %vecinit4.i = insertelement <16 x i8> %vecinit3.i, i8 %v1, i32 4
694  %vecinit5.i = insertelement <16 x i8> %vecinit4.i, i8 %v1, i32 5
695  %vecinit6.i = insertelement <16 x i8> %vecinit5.i, i8 %v1, i32 6
696  %vecinit7.i = insertelement <16 x i8> %vecinit6.i, i8 %v1, i32 7
697  %vecinit8.i = insertelement <16 x i8> %vecinit7.i, i8 %v1, i32 8
698  %vecinit9.i = insertelement <16 x i8> %vecinit8.i, i8 %v1, i32 9
699  %vecinit10.i = insertelement <16 x i8> %vecinit9.i, i8 %v1, i32 10
700  %vecinit11.i = insertelement <16 x i8> %vecinit10.i, i8 %v1, i32 11
701  %vecinit12.i = insertelement <16 x i8> %vecinit11.i, i8 %v1, i32 12
702  %vecinit13.i = insertelement <16 x i8> %vecinit12.i, i8 %v1, i32 13
703  %vecinit14.i = insertelement <16 x i8> %vecinit13.i, i8 %v1, i32 14
704  %vecinit15.i = insertelement <16 x i8> %vecinit14.i, i8 %v1, i32 15
705  ret <16 x i8> %vecinit15.i
706}
707
; Builds an <8 x i16> by inserting the same scalar %v1 into each of the eight
; lanes of an undef vector. The assertions expect the whole insert chain to
; become a single vdup.16 q0, r0.
708define <8 x i16> @test_vdupq_n_u16(i16 %v1) #0 {
709; CHECK-LABEL: test_vdupq_n_u16:
710; CHECK:       @ %bb.0:
711; CHECK-NEXT:    vdup.16 q0, r0
712; CHECK-NEXT:    bx lr
713  %vecinit.i = insertelement <8 x i16> undef, i16 %v1, i32 0
714  %vecinit1.i = insertelement <8 x i16> %vecinit.i, i16 %v1, i32 1
715  %vecinit2.i = insertelement <8 x i16> %vecinit1.i, i16 %v1, i32 2
716  %vecinit3.i = insertelement <8 x i16> %vecinit2.i, i16 %v1, i32 3
717  %vecinit4.i = insertelement <8 x i16> %vecinit3.i, i16 %v1, i32 4
718  %vecinit5.i = insertelement <8 x i16> %vecinit4.i, i16 %v1, i32 5
719  %vecinit6.i = insertelement <8 x i16> %vecinit5.i, i16 %v1, i32 6
720  %vecinit7.i = insertelement <8 x i16> %vecinit6.i, i16 %v1, i32 7
721  ret <8 x i16> %vecinit7.i
722}
723
; Builds a <4 x i32> by inserting the same scalar %v1 into each of the four
; lanes of an undef vector. The assertions expect the whole insert chain to
; become a single vdup.32 q0, r0.
724define <4 x i32> @test_vdupq_n_u32(i32 %v1) #0 {
725; CHECK-LABEL: test_vdupq_n_u32:
726; CHECK:       @ %bb.0:
727; CHECK-NEXT:    vdup.32 q0, r0
728; CHECK-NEXT:    bx lr
729  %vecinit.i = insertelement <4 x i32> undef, i32 %v1, i32 0
730  %vecinit1.i = insertelement <4 x i32> %vecinit.i, i32 %v1, i32 1
731  %vecinit2.i = insertelement <4 x i32> %vecinit1.i, i32 %v1, i32 2
732  %vecinit3.i = insertelement <4 x i32> %vecinit2.i, i32 %v1, i32 3
733  ret <4 x i32> %vecinit3.i
734}
735
; Builds a <2 x i64> by inserting the same scalar %v1 into both lanes of an
; undef vector. The assertions expect d0 to be filled from r0/r1 with two
; vmov.32 writes and then copied into d1 (vorr d1, d0, d0).
736define <2 x i64> @test_vdupq_n_u64(i64 %v1) #0 {
737; CHECK-LABEL: test_vdupq_n_u64:
738; CHECK:       @ %bb.0:
739; CHECK-NEXT:    vmov.32 d0[0], r0
740; CHECK-NEXT:    vmov.32 d0[1], r1
741; CHECK-NEXT:    vorr d1, d0, d0
742; CHECK-NEXT:    bx lr
743  %vecinit.i = insertelement <2 x i64> undef, i64 %v1, i32 0
744  %vecinit1.i = insertelement <2 x i64> %vecinit.i, i64 %v1, i32 1
745  ret <2 x i64> %vecinit1.i
746}
747
; Splats lane 5 of an <8 x i8> across all eight lanes of the <8 x i8> result.
; The assertions expect a single vdup.8 d0, d0[5].
748define <8 x i8> @test_vdup_lane_s8(<8 x i8> %v1) #0 {
749; CHECK-LABEL: test_vdup_lane_s8:
750; CHECK:       @ %bb.0:
751; CHECK-NEXT:    vdup.8 d0, d0[5]
752; CHECK-NEXT:    bx lr
753  %shuffle = shufflevector <8 x i8> %v1, <8 x i8> undef, <8 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
754  ret <8 x i8> %shuffle
755}
756
; Splats lane 2 of a <4 x i16> across all four lanes of the <4 x i16> result.
; The assertions expect a single vdup.16 d0, d0[2].
757define <4 x i16> @test_vdup_lane_s16(<4 x i16> %v1) #0 {
758; CHECK-LABEL: test_vdup_lane_s16:
759; CHECK:       @ %bb.0:
760; CHECK-NEXT:    vdup.16 d0, d0[2]
761; CHECK-NEXT:    bx lr
762  %shuffle = shufflevector <4 x i16> %v1, <4 x i16> undef, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
763  ret <4 x i16> %shuffle
764}
765
; Splats lane 1 of a <2 x i32> across both lanes of the <2 x i32> result.
; The assertions expect a single vdup.32 d0, d0[1].
766define <2 x i32> @test_vdup_lane_s32(<2 x i32> %v1) #0 {
767; CHECK-LABEL: test_vdup_lane_s32:
768; CHECK:       @ %bb.0:
769; CHECK-NEXT:    vdup.32 d0, d0[1]
770; CHECK-NEXT:    bx lr
771  %shuffle = shufflevector <2 x i32> %v1, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
772  ret <2 x i32> %shuffle
773}
774
; Splats lane 5 of an <8 x i8> across all sixteen lanes of a <16 x i8> result.
; The assertions expect a single vdup.8 q0, d0[5], preceded only by a
; register kill annotation.
775define <16 x i8> @test_vdupq_lane_s8(<8 x i8> %v1) #0 {
776; CHECK-LABEL: test_vdupq_lane_s8:
777; CHECK:       @ %bb.0:
778; CHECK-NEXT:    @ kill: def $d0 killed $d0 def $q0
779; CHECK-NEXT:    vdup.8 q0, d0[5]
780; CHECK-NEXT:    bx lr
781  %shuffle = shufflevector <8 x i8> %v1, <8 x i8> undef, <16 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
782  ret <16 x i8> %shuffle
783}
784
; Splats lane 2 of a <4 x i16> across all eight lanes of an <8 x i16> result.
; The assertions expect a single vdup.16 q0, d0[2], preceded only by a
; register kill annotation.
785define <8 x i16> @test_vdupq_lane_s16(<4 x i16> %v1) #0 {
786; CHECK-LABEL: test_vdupq_lane_s16:
787; CHECK:       @ %bb.0:
788; CHECK-NEXT:    @ kill: def $d0 killed $d0 def $q0
789; CHECK-NEXT:    vdup.16 q0, d0[2]
790; CHECK-NEXT:    bx lr
791  %shuffle = shufflevector <4 x i16> %v1, <4 x i16> undef, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
792  ret <8 x i16> %shuffle
793}
794
; Splats lane 1 of a <2 x i32> across all four lanes of a <4 x i32> result.
; The assertions expect a single vdup.32 q0, d0[1], preceded only by a
; register kill annotation.
795define <4 x i32> @test_vdupq_lane_s32(<2 x i32> %v1) #0 {
796; CHECK-LABEL: test_vdupq_lane_s32:
797; CHECK:       @ %bb.0:
798; CHECK-NEXT:    @ kill: def $d0 killed $d0 def $q0
799; CHECK-NEXT:    vdup.32 q0, d0[1]
800; CHECK-NEXT:    bx lr
801  %shuffle = shufflevector <2 x i32> %v1, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
802  ret <4 x i32> %shuffle
803}
804
; Splats the only lane of a <1 x i64> into both lanes of a <2 x i64> result
; (the zeroinitializer mask selects lane 0 twice). The assertions expect a
; single D-register move (vmov.f64 d1, d0) rather than a vdup.
805define <2 x i64> @test_vdupq_lane_s64(<1 x i64> %v1) #0 {
806; CHECK-LABEL: test_vdupq_lane_s64:
807; CHECK:       @ %bb.0:
808; CHECK-NEXT:    @ kill: def $d0 killed $d0 def $q0
809; CHECK-NEXT:    vmov.f64 d1, d0
810; CHECK-NEXT:    bx lr
811  %shuffle = shufflevector <1 x i64> %v1, <1 x i64> undef, <2 x i32> zeroinitializer
812  ret <2 x i64> %shuffle
813}
814
; Splats lane 5 of a <16 x i8> across all eight lanes of an <8 x i8> result.
; The assertions expect a single vdup.8 d0, d0[5].
815define <8 x i8> @test_vdup_laneq_s8(<16 x i8> %v1) #0 {
816; CHECK-LABEL: test_vdup_laneq_s8:
817; CHECK:       @ %bb.0:
818; CHECK-NEXT:    vdup.8 d0, d0[5]
819; CHECK-NEXT:    bx lr
820  %shuffle = shufflevector <16 x i8> %v1, <16 x i8> undef, <8 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
821  ret <8 x i8> %shuffle
822}
823
; Splats lane 2 of an <8 x i16> across all four lanes of a <4 x i16> result.
; The assertions expect a single vdup.16 d0, d0[2].
824define <4 x i16> @test_vdup_laneq_s16(<8 x i16> %v1) #0 {
825; CHECK-LABEL: test_vdup_laneq_s16:
826; CHECK:       @ %bb.0:
827; CHECK-NEXT:    vdup.16 d0, d0[2]
828; CHECK-NEXT:    bx lr
829  %shuffle = shufflevector <8 x i16> %v1, <8 x i16> undef, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
830  ret <4 x i16> %shuffle
831}
832
; Splats lane 1 of a <4 x i32> across both lanes of a <2 x i32> result.
; The assertions expect a single vdup.32 d0, d0[1].
833define <2 x i32> @test_vdup_laneq_s32(<4 x i32> %v1) #0 {
834; CHECK-LABEL: test_vdup_laneq_s32:
835; CHECK:       @ %bb.0:
836; CHECK-NEXT:    vdup.32 d0, d0[1]
837; CHECK-NEXT:    bx lr
838  %shuffle = shufflevector <4 x i32> %v1, <4 x i32> undef, <2 x i32> <i32 1, i32 1>
839  ret <2 x i32> %shuffle
840}
841
; Splats lane 5 of a <16 x i8> across all sixteen lanes of the <16 x i8>
; result. The assertions expect a single vdup.8 q0, d0[5].
842define <16 x i8> @test_vdupq_laneq_s8(<16 x i8> %v1) #0 {
843; CHECK-LABEL: test_vdupq_laneq_s8:
844; CHECK:       @ %bb.0:
845; CHECK-NEXT:    vdup.8 q0, d0[5]
846; CHECK-NEXT:    bx lr
847  %shuffle = shufflevector <16 x i8> %v1, <16 x i8> undef, <16 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
848  ret <16 x i8> %shuffle
849}
850
; Splats lane 2 of an <8 x i16> across all eight lanes of the <8 x i16>
; result. The assertions expect a single vdup.16 q0, d0[2].
851define <8 x i16> @test_vdupq_laneq_s16(<8 x i16> %v1) #0 {
852; CHECK-LABEL: test_vdupq_laneq_s16:
853; CHECK:       @ %bb.0:
854; CHECK-NEXT:    vdup.16 q0, d0[2]
855; CHECK-NEXT:    bx lr
856  %shuffle = shufflevector <8 x i16> %v1, <8 x i16> undef, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
857  ret <8 x i16> %shuffle
858}
859
; Splats lane 1 of a <4 x i32> across all four lanes of the <4 x i32> result.
; The assertions expect a single vdup.32 q0, d0[1].
860define <4 x i32> @test_vdupq_laneq_s32(<4 x i32> %v1) #0 {
861; CHECK-LABEL: test_vdupq_laneq_s32:
862; CHECK:       @ %bb.0:
863; CHECK-NEXT:    vdup.32 q0, d0[1]
864; CHECK-NEXT:    bx lr
865  %shuffle = shufflevector <4 x i32> %v1, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
866  ret <4 x i32> %shuffle
867}
868
; Splats lane 0 of a <2 x i64> into both lanes of the <2 x i64> result (the
; zeroinitializer mask selects lane 0 twice). The assertions expect a single
; D-register move (vmov.f64 d1, d0) rather than a vdup.
869define <2 x i64> @test_vdupq_laneq_s64(<2 x i64> %v1) #0 {
870; CHECK-LABEL: test_vdupq_laneq_s64:
871; CHECK:       @ %bb.0:
872; CHECK-NEXT:    vmov.f64 d1, d0
873; CHECK-NEXT:    bx lr
874  %shuffle = shufflevector <2 x i64> %v1, <2 x i64> undef, <2 x i32> zeroinitializer
875  ret <2 x i64> %shuffle
876}
877
; Bitcasts an <8 x i8> argument to i64 and returns it.
; The assertions expect a single two-register move (vmov r0, r1, d0).
878define i64 @test_bitcastv8i8toi64(<8 x i8> %in) {
879; CHECK-LABEL: test_bitcastv8i8toi64:
880; CHECK:       @ %bb.0:
881; CHECK-NEXT:    vmov r0, r1, d0
882; CHECK-NEXT:    bx lr
883   %res = bitcast <8 x i8> %in to i64
884   ret i64 %res
885}
886
; Bitcasts a <4 x i16> argument to i64 and returns it.
; The assertions expect a single two-register move (vmov r0, r1, d0).
887define i64 @test_bitcastv4i16toi64(<4 x i16> %in) {
888; CHECK-LABEL: test_bitcastv4i16toi64:
889; CHECK:       @ %bb.0:
890; CHECK-NEXT:    vmov r0, r1, d0
891; CHECK-NEXT:    bx lr
892   %res = bitcast <4 x i16> %in to i64
893   ret i64 %res
894}
895
; Bitcasts a <2 x i32> argument to i64 and returns it.
; The assertions expect a single two-register move (vmov r0, r1, d0).
896define i64 @test_bitcastv2i32toi64(<2 x i32> %in) {
897; CHECK-LABEL: test_bitcastv2i32toi64:
898; CHECK:       @ %bb.0:
899; CHECK-NEXT:    vmov r0, r1, d0
900; CHECK-NEXT:    bx lr
901   %res = bitcast <2 x i32> %in to i64
902   ret i64 %res
903}
904
; Bitcasts a <2 x float> argument to i64 and returns it.
; The assertions expect a single two-register move (vmov r0, r1, d0).
905define i64 @test_bitcastv2f32toi64(<2 x float> %in) {
906; CHECK-LABEL: test_bitcastv2f32toi64:
907; CHECK:       @ %bb.0:
908; CHECK-NEXT:    vmov r0, r1, d0
909; CHECK-NEXT:    bx lr
910   %res = bitcast <2 x float> %in to i64
911   ret i64 %res
912}
913
; Bitcasts a <1 x i64> argument to i64 and returns it.
; The assertions expect a single two-register move (vmov r0, r1, d0).
914define i64 @test_bitcastv1i64toi64(<1 x i64> %in) {
915; CHECK-LABEL: test_bitcastv1i64toi64:
916; CHECK:       @ %bb.0:
917; CHECK-NEXT:    vmov r0, r1, d0
918; CHECK-NEXT:    bx lr
919   %res = bitcast <1 x i64> %in to i64
920   ret i64 %res
921}
922
; Bitcast a <1 x double> argument to a scalar i64 return value.
; The CHECK lines expect one vmov moving d0 into the r0/r1 pair.
923define i64 @test_bitcastv1f64toi64(<1 x double> %in) {
924; CHECK-LABEL: test_bitcastv1f64toi64:
925; CHECK:       @ %bb.0:
926; CHECK-NEXT:    vmov r0, r1, d0
927; CHECK-NEXT:    bx lr
928   %res = bitcast <1 x double> %in to i64
929   ret i64 %res
930}
931
; Bitcast a scalar i64 argument to a <8 x i8> return value.
; The CHECK lines expect one vmov moving the r0/r1 pair into d0.
932define <8 x i8> @test_bitcasti64tov8i8(i64 %in) {
933; CHECK-LABEL: test_bitcasti64tov8i8:
934; CHECK:       @ %bb.0:
935; CHECK-NEXT:    vmov d0, r0, r1
936; CHECK-NEXT:    bx lr
937   %res = bitcast i64 %in to <8 x i8>
938   ret <8 x i8> %res
939}
940
; Bitcast a scalar i64 argument to a <4 x i16> return value.
; The CHECK lines expect one vmov moving the r0/r1 pair into d0.
941define <4 x i16> @test_bitcasti64tov4i16(i64 %in) {
942; CHECK-LABEL: test_bitcasti64tov4i16:
943; CHECK:       @ %bb.0:
944; CHECK-NEXT:    vmov d0, r0, r1
945; CHECK-NEXT:    bx lr
946   %res = bitcast i64 %in to <4 x i16>
947   ret <4 x i16> %res
948}
949
; Bitcast a scalar i64 argument to a <2 x i32> return value.
; The CHECK lines expect one vmov moving the r0/r1 pair into d0.
950define <2 x i32> @test_bitcasti64tov2i32(i64 %in) {
951; CHECK-LABEL: test_bitcasti64tov2i32:
952; CHECK:       @ %bb.0:
953; CHECK-NEXT:    vmov d0, r0, r1
954; CHECK-NEXT:    bx lr
955   %res = bitcast i64 %in to <2 x i32>
956   ret <2 x i32> %res
957}
958
; Bitcast a scalar i64 argument to a <2 x float> return value.
; The CHECK lines expect one vmov moving the r0/r1 pair into d0.
959define <2 x float> @test_bitcasti64tov2f32(i64 %in) {
960; CHECK-LABEL: test_bitcasti64tov2f32:
961; CHECK:       @ %bb.0:
962; CHECK-NEXT:    vmov d0, r0, r1
963; CHECK-NEXT:    bx lr
964   %res = bitcast i64 %in to <2 x float>
965   ret <2 x float> %res
966}
967
; Bitcast a scalar i64 argument to a <1 x i64> return value.
; The CHECK lines expect one vmov moving the r0/r1 pair into d0.
968define <1 x i64> @test_bitcasti64tov1i64(i64 %in) {
969; CHECK-LABEL: test_bitcasti64tov1i64:
970; CHECK:       @ %bb.0:
971; CHECK-NEXT:    vmov d0, r0, r1
972; CHECK-NEXT:    bx lr
973   %res = bitcast i64 %in to <1 x i64>
974   ret <1 x i64> %res
975}
976
; Bitcast a scalar i64 argument to a <1 x double> return value.
; The CHECK lines expect one vmov moving the r0/r1 pair into d0.
977define <1 x double> @test_bitcasti64tov1f64(i64 %in) {
978; CHECK-LABEL: test_bitcasti64tov1f64:
979; CHECK:       @ %bb.0:
980; CHECK-NEXT:    vmov d0, r0, r1
981; CHECK-NEXT:    bx lr
982   %res = bitcast i64 %in to <1 x double>
983   ret <1 x double> %res
984}
985
; Negate a <8 x i8> (0 - %a), reinterpret the bits as <1 x double>, then
; convert that double to <1 x i64> with fptosi.
; The CHECK lines expect the negate as vneg.s8 and the fptosi as a call to
; __aeabi_d2lz, with the result moved back into d0 one 32-bit lane at a time.
986define <1 x i64> @test_bitcastv8i8tov1f64(<8 x i8> %a) #0 {
987; CHECK-LABEL: test_bitcastv8i8tov1f64:
988; CHECK:       @ %bb.0:
989; CHECK-NEXT:    .save {r11, lr}
990; CHECK-NEXT:    push {r11, lr}
991; CHECK-NEXT:    vneg.s8 d16, d0
992; CHECK-NEXT:    vmov r0, r1, d16
993; CHECK-NEXT:    bl __aeabi_d2lz
994; CHECK-NEXT:    vmov.32 d0[0], r0
995; CHECK-NEXT:    vmov.32 d0[1], r1
996; CHECK-NEXT:    pop {r11, pc}
997  %sub.i = sub <8 x i8> zeroinitializer, %a
998  %1 = bitcast <8 x i8> %sub.i to <1 x double>
999  %vcvt.i = fptosi <1 x double> %1 to <1 x i64>
1000  ret <1 x i64> %vcvt.i
1001}
1002
; Negate a <4 x i16> (0 - %a), reinterpret the bits as <1 x double>, then
; convert that double to <1 x i64> with fptosi.
; The CHECK lines expect the negate as vneg.s16 and the fptosi as a call to
; __aeabi_d2lz, with the result moved back into d0 one 32-bit lane at a time.
1003define <1 x i64> @test_bitcastv4i16tov1f64(<4 x i16> %a) #0 {
1004; CHECK-LABEL: test_bitcastv4i16tov1f64:
1005; CHECK:       @ %bb.0:
1006; CHECK-NEXT:    .save {r11, lr}
1007; CHECK-NEXT:    push {r11, lr}
1008; CHECK-NEXT:    vneg.s16 d16, d0
1009; CHECK-NEXT:    vmov r0, r1, d16
1010; CHECK-NEXT:    bl __aeabi_d2lz
1011; CHECK-NEXT:    vmov.32 d0[0], r0
1012; CHECK-NEXT:    vmov.32 d0[1], r1
1013; CHECK-NEXT:    pop {r11, pc}
1014  %sub.i = sub <4 x i16> zeroinitializer, %a
1015  %1 = bitcast <4 x i16> %sub.i to <1 x double>
1016  %vcvt.i = fptosi <1 x double> %1 to <1 x i64>
1017  ret <1 x i64> %vcvt.i
1018}
1019
; Negate a <2 x i32> (0 - %a), reinterpret the bits as <1 x double>, then
; convert that double to <1 x i64> with fptosi.
; The CHECK lines expect the negate as vneg.s32 and the fptosi as a call to
; __aeabi_d2lz, with the result moved back into d0 one 32-bit lane at a time.
1020define <1 x i64> @test_bitcastv2i32tov1f64(<2 x i32> %a) #0 {
1021; CHECK-LABEL: test_bitcastv2i32tov1f64:
1022; CHECK:       @ %bb.0:
1023; CHECK-NEXT:    .save {r11, lr}
1024; CHECK-NEXT:    push {r11, lr}
1025; CHECK-NEXT:    vneg.s32 d16, d0
1026; CHECK-NEXT:    vmov r0, r1, d16
1027; CHECK-NEXT:    bl __aeabi_d2lz
1028; CHECK-NEXT:    vmov.32 d0[0], r0
1029; CHECK-NEXT:    vmov.32 d0[1], r1
1030; CHECK-NEXT:    pop {r11, pc}
1031  %sub.i = sub <2 x i32> zeroinitializer, %a
1032  %1 = bitcast <2 x i32> %sub.i to <1 x double>
1033  %vcvt.i = fptosi <1 x double> %1 to <1 x i64>
1034  ret <1 x i64> %vcvt.i
1035}
1036
; Negate a <1 x i64> (0 - %a), reinterpret the bits as <1 x double>, then
; convert that double to <1 x i64> with fptosi.
; Unlike the narrower-element variants, the CHECK lines expect the negate as
; an explicit zero (vmov.i32 #0x0) followed by vsub.i64 rather than a vneg.
; The fptosi is expected as a call to __aeabi_d2lz.
1037define <1 x i64> @test_bitcastv1i64tov1f64(<1 x i64> %a) #0 {
1038; CHECK-LABEL: test_bitcastv1i64tov1f64:
1039; CHECK:       @ %bb.0:
1040; CHECK-NEXT:    .save {r11, lr}
1041; CHECK-NEXT:    push {r11, lr}
1042; CHECK-NEXT:    vmov.i32 d16, #0x0
1043; CHECK-NEXT:    vsub.i64 d16, d16, d0
1044; CHECK-NEXT:    vmov r0, r1, d16
1045; CHECK-NEXT:    bl __aeabi_d2lz
1046; CHECK-NEXT:    vmov.32 d0[0], r0
1047; CHECK-NEXT:    vmov.32 d0[1], r1
1048; CHECK-NEXT:    pop {r11, pc}
1049  %sub.i = sub <1 x i64> zeroinitializer, %a
1050  %1 = bitcast <1 x i64> %sub.i to <1 x double>
1051  %vcvt.i = fptosi <1 x double> %1 to <1 x i64>
1052  ret <1 x i64> %vcvt.i
1053}
1054
; Negate a <2 x float> (written as fsub from -0.0), reinterpret the bits as
; <1 x double>, then convert that double to <1 x i64> with fptosi.
; The CHECK lines expect the negate as vneg.f32 and the fptosi as a call to
; __aeabi_d2lz, with the result moved back into d0 one 32-bit lane at a time.
1055define <1 x i64> @test_bitcastv2f32tov1f64(<2 x float> %a) #0 {
1056; CHECK-LABEL: test_bitcastv2f32tov1f64:
1057; CHECK:       @ %bb.0:
1058; CHECK-NEXT:    .save {r11, lr}
1059; CHECK-NEXT:    push {r11, lr}
1060; CHECK-NEXT:    vneg.f32 d16, d0
1061; CHECK-NEXT:    vmov r0, r1, d16
1062; CHECK-NEXT:    bl __aeabi_d2lz
1063; CHECK-NEXT:    vmov.32 d0[0], r0
1064; CHECK-NEXT:    vmov.32 d0[1], r1
1065; CHECK-NEXT:    pop {r11, pc}
1066  %sub.i = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %a
1067  %1 = bitcast <2 x float> %sub.i to <1 x double>
1068  %vcvt.i = fptosi <1 x double> %1 to <1 x i64>
1069  ret <1 x i64> %vcvt.i
1070}
1071
; Convert a <1 x i64> to <1 x double> with sitofp, reinterpret the bits as
; <8 x i8>, then negate the result (0 - x).
; The CHECK lines expect the sitofp as a call to __aeabi_l2d and the negate
; as vneg.s8 writing directly to d0.
1072define <8 x i8> @test_bitcastv1f64tov8i8(<1 x i64> %a) #0 {
1073; CHECK-LABEL: test_bitcastv1f64tov8i8:
1074; CHECK:       @ %bb.0:
1075; CHECK-NEXT:    .save {r11, lr}
1076; CHECK-NEXT:    push {r11, lr}
1077; CHECK-NEXT:    vmov.32 r0, d0[0]
1078; CHECK-NEXT:    vmov.32 r1, d0[1]
1079; CHECK-NEXT:    bl __aeabi_l2d
1080; CHECK-NEXT:    vmov d16, r0, r1
1081; CHECK-NEXT:    vneg.s8 d0, d16
1082; CHECK-NEXT:    pop {r11, pc}
1083  %vcvt.i = sitofp <1 x i64> %a to <1 x double>
1084  %1 = bitcast <1 x double> %vcvt.i to <8 x i8>
1085  %sub.i = sub <8 x i8> zeroinitializer, %1
1086  ret <8 x i8> %sub.i
1087}
1088
; Convert a <1 x i64> to <1 x double> with sitofp, reinterpret the bits as
; <4 x i16>, then negate the result (0 - x).
; The CHECK lines expect the sitofp as a call to __aeabi_l2d and the negate
; as vneg.s16 writing directly to d0.
1089define <4 x i16> @test_bitcastv1f64tov4i16(<1 x i64> %a) #0 {
1090; CHECK-LABEL: test_bitcastv1f64tov4i16:
1091; CHECK:       @ %bb.0:
1092; CHECK-NEXT:    .save {r11, lr}
1093; CHECK-NEXT:    push {r11, lr}
1094; CHECK-NEXT:    vmov.32 r0, d0[0]
1095; CHECK-NEXT:    vmov.32 r1, d0[1]
1096; CHECK-NEXT:    bl __aeabi_l2d
1097; CHECK-NEXT:    vmov d16, r0, r1
1098; CHECK-NEXT:    vneg.s16 d0, d16
1099; CHECK-NEXT:    pop {r11, pc}
1100  %vcvt.i = sitofp <1 x i64> %a to <1 x double>
1101  %1 = bitcast <1 x double> %vcvt.i to <4 x i16>
1102  %sub.i = sub <4 x i16> zeroinitializer, %1
1103  ret <4 x i16> %sub.i
1104}
1105
; Convert a <1 x i64> to <1 x double> with sitofp, reinterpret the bits as
; <2 x i32>, then negate the result (0 - x).
; The CHECK lines expect the sitofp as a call to __aeabi_l2d and the negate
; as vneg.s32 writing directly to d0.
1106define <2 x i32> @test_bitcastv1f64tov2i32(<1 x i64> %a) #0 {
1107; CHECK-LABEL: test_bitcastv1f64tov2i32:
1108; CHECK:       @ %bb.0:
1109; CHECK-NEXT:    .save {r11, lr}
1110; CHECK-NEXT:    push {r11, lr}
1111; CHECK-NEXT:    vmov.32 r0, d0[0]
1112; CHECK-NEXT:    vmov.32 r1, d0[1]
1113; CHECK-NEXT:    bl __aeabi_l2d
1114; CHECK-NEXT:    vmov d16, r0, r1
1115; CHECK-NEXT:    vneg.s32 d0, d16
1116; CHECK-NEXT:    pop {r11, pc}
1117  %vcvt.i = sitofp <1 x i64> %a to <1 x double>
1118  %1 = bitcast <1 x double> %vcvt.i to <2 x i32>
1119  %sub.i = sub <2 x i32> zeroinitializer, %1
1120  ret <2 x i32> %sub.i
1121}
1122
; Convert a <1 x i64> to <1 x double> with sitofp, reinterpret the bits as
; <1 x i64>, then negate the result (0 - x).
; The CHECK lines expect the sitofp as a call to __aeabi_l2d. The 64-bit
; negate is expected as an explicit zero (vmov.i32 #0x0) plus vsub.i64
; rather than a vneg.
1123define <1 x i64> @test_bitcastv1f64tov1i64(<1 x i64> %a) #0 {
1124; CHECK-LABEL: test_bitcastv1f64tov1i64:
1125; CHECK:       @ %bb.0:
1126; CHECK-NEXT:    .save {r11, lr}
1127; CHECK-NEXT:    push {r11, lr}
1128; CHECK-NEXT:    vmov.32 r0, d0[0]
1129; CHECK-NEXT:    vmov.32 r1, d0[1]
1130; CHECK-NEXT:    bl __aeabi_l2d
1131; CHECK-NEXT:    vmov.i32 d16, #0x0
1132; CHECK-NEXT:    vmov d17, r0, r1
1133; CHECK-NEXT:    vsub.i64 d0, d16, d17
1134; CHECK-NEXT:    pop {r11, pc}
1135  %vcvt.i = sitofp <1 x i64> %a to <1 x double>
1136  %1 = bitcast <1 x double> %vcvt.i to <1 x i64>
1137  %sub.i = sub <1 x i64> zeroinitializer, %1
1138  ret <1 x i64> %sub.i
1139}
1140
; Convert a <1 x i64> to <1 x double> with sitofp, reinterpret the bits as
; <2 x float>, then negate the result (written as fsub from -0.0).
; The CHECK lines expect the sitofp as a call to __aeabi_l2d and the negate
; as vneg.f32 writing directly to d0.
1141define <2 x float> @test_bitcastv1f64tov2f32(<1 x i64> %a) #0 {
1142; CHECK-LABEL: test_bitcastv1f64tov2f32:
1143; CHECK:       @ %bb.0:
1144; CHECK-NEXT:    .save {r11, lr}
1145; CHECK-NEXT:    push {r11, lr}
1146; CHECK-NEXT:    vmov.32 r0, d0[0]
1147; CHECK-NEXT:    vmov.32 r1, d0[1]
1148; CHECK-NEXT:    bl __aeabi_l2d
1149; CHECK-NEXT:    vmov d16, r0, r1
1150; CHECK-NEXT:    vneg.f32 d0, d16
1151; CHECK-NEXT:    pop {r11, pc}
1152  %vcvt.i = sitofp <1 x i64> %a to <1 x double>
1153  %1 = bitcast <1 x double> %vcvt.i to <2 x float>
1154  %sub.i = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %1
1155  ret <2 x float> %sub.i
1156}
1157
1158; Test insert element into an undef vector
; Insert an i8 scalar into lane 0 of an undef <8 x i8>.
; The CHECK lines expect a single lane move, vmov.8 d0[0], r0.
1159define <8 x i8> @scalar_to_vector_v8i8(i8 %a) {
1160; CHECK-LABEL: scalar_to_vector_v8i8:
1161; CHECK:       @ %bb.0:
1162; CHECK-NEXT:    vmov.8 d0[0], r0
1163; CHECK-NEXT:    bx lr
1164  %b = insertelement <8 x i8> undef, i8 %a, i32 0
1165  ret <8 x i8> %b
1166}
1167
; Insert an i8 scalar into lane 0 of an undef <16 x i8>.
; The CHECK lines expect a single lane move, vmov.8 d0[0], r0; no instruction
; is expected for the remaining undef lanes.
1168define <16 x i8> @scalar_to_vector_v16i8(i8 %a) {
1169; CHECK-LABEL: scalar_to_vector_v16i8:
1170; CHECK:       @ %bb.0:
1171; CHECK-NEXT:    vmov.8 d0[0], r0
1172; CHECK-NEXT:    bx lr
1173  %b = insertelement <16 x i8> undef, i8 %a, i32 0
1174  ret <16 x i8> %b
1175}
1176
; Insert an i16 scalar into lane 0 of an undef <4 x i16>.
; The CHECK lines expect a single lane move, vmov.16 d0[0], r0.
1177define <4 x i16> @scalar_to_vector_v4i16(i16 %a) {
1178; CHECK-LABEL: scalar_to_vector_v4i16:
1179; CHECK:       @ %bb.0:
1180; CHECK-NEXT:    vmov.16 d0[0], r0
1181; CHECK-NEXT:    bx lr
1182  %b = insertelement <4 x i16> undef, i16 %a, i32 0
1183  ret <4 x i16> %b
1184}
1185
; Insert an i16 scalar into lane 0 of an undef <8 x i16>.
; The CHECK lines expect a single lane move, vmov.16 d0[0], r0; no
; instruction is expected for the remaining undef lanes.
1186define <8 x i16> @scalar_to_vector_v8i16(i16 %a) {
1187; CHECK-LABEL: scalar_to_vector_v8i16:
1188; CHECK:       @ %bb.0:
1189; CHECK-NEXT:    vmov.16 d0[0], r0
1190; CHECK-NEXT:    bx lr
1191  %b = insertelement <8 x i16> undef, i16 %a, i32 0
1192  ret <8 x i16> %b
1193}
1194
; Insert an i32 scalar into lane 0 of an undef <2 x i32>.
; The CHECK lines expect a single lane move, vmov.32 d0[0], r0.
1195define <2 x i32> @scalar_to_vector_v2i32(i32 %a) {
1196; CHECK-LABEL: scalar_to_vector_v2i32:
1197; CHECK:       @ %bb.0:
1198; CHECK-NEXT:    vmov.32 d0[0], r0
1199; CHECK-NEXT:    bx lr
1200  %b = insertelement <2 x i32> undef, i32 %a, i32 0
1201  ret <2 x i32> %b
1202}
1203
; Insert an i32 scalar into lane 0 of an undef <4 x i32>.
; The CHECK lines expect a single lane move, vmov.32 d0[0], r0; no
; instruction is expected for the remaining undef lanes.
1204define <4 x i32> @scalar_to_vector_v4i32(i32 %a) {
1205; CHECK-LABEL: scalar_to_vector_v4i32:
1206; CHECK:       @ %bb.0:
1207; CHECK-NEXT:    vmov.32 d0[0], r0
1208; CHECK-NEXT:    bx lr
1209  %b = insertelement <4 x i32> undef, i32 %a, i32 0
1210  ret <4 x i32> %b
1211}
1212
; Insert an i64 scalar into lane 0 of an undef <2 x i64>.
; The CHECK lines expect the 64-bit value to be written as two 32-bit lane
; moves, r0 into d0[0] and r1 into d0[1].
1213define <2 x i64> @scalar_to_vector_v2i64(i64 %a) {
1214; CHECK-LABEL: scalar_to_vector_v2i64:
1215; CHECK:       @ %bb.0:
1216; CHECK-NEXT:    vmov.32 d0[0], r0
1217; CHECK-NEXT:    vmov.32 d0[1], r1
1218; CHECK-NEXT:    bx lr
1219  %b = insertelement <2 x i64> undef, i64 %a, i32 0
1220  ret <2 x i64> %b
1221}
1222
; Extract the only element of a <1 x i8> and insert it into every lane of a
; <8 x i8>, one insertelement at a time.
; The CHECK lines expect the insert chain to become a single vdup.8 from r0.
1223define <8 x i8> @testDUPv1i8(<1 x i8> %a) {
1224; CHECK-LABEL: testDUPv1i8:
1225; CHECK:       @ %bb.0:
1226; CHECK-NEXT:    vdup.8 d0, r0
1227; CHECK-NEXT:    bx lr
1228  %b = extractelement <1 x i8> %a, i32 0
1229  %c = insertelement <8 x i8> undef, i8 %b, i32 0
1230  %d = insertelement <8 x i8> %c, i8 %b, i32 1
1231  %e = insertelement <8 x i8> %d, i8 %b, i32 2
1232  %f = insertelement <8 x i8> %e, i8 %b, i32 3
1233  %g = insertelement <8 x i8> %f, i8 %b, i32 4
1234  %h = insertelement <8 x i8> %g, i8 %b, i32 5
1235  %i = insertelement <8 x i8> %h, i8 %b, i32 6
1236  %j = insertelement <8 x i8> %i, i8 %b, i32 7
1237  ret <8 x i8> %j
1238}
1239
; Extract the only element of a <1 x i16> and insert it into every lane of a
; <8 x i16>, one insertelement at a time.
; The CHECK lines expect the insert chain to become a single vdup.16 from r0
; into q0.
1240define <8 x i16> @testDUPv1i16(<1 x i16> %a) {
1241; CHECK-LABEL: testDUPv1i16:
1242; CHECK:       @ %bb.0:
1243; CHECK-NEXT:    vdup.16 q0, r0
1244; CHECK-NEXT:    bx lr
1245  %b = extractelement <1 x i16> %a, i32 0
1246  %c = insertelement <8 x i16> undef, i16 %b, i32 0
1247  %d = insertelement <8 x i16> %c, i16 %b, i32 1
1248  %e = insertelement <8 x i16> %d, i16 %b, i32 2
1249  %f = insertelement <8 x i16> %e, i16 %b, i32 3
1250  %g = insertelement <8 x i16> %f, i16 %b, i32 4
1251  %h = insertelement <8 x i16> %g, i16 %b, i32 5
1252  %i = insertelement <8 x i16> %h, i16 %b, i32 6
1253  %j = insertelement <8 x i16> %i, i16 %b, i32 7
1254  ret <8 x i16> %j
1255}
1256
; Extract the only element of a <1 x i32> and insert it into every lane of a
; <4 x i32>, one insertelement at a time.
; The CHECK lines expect the insert chain to become a single vdup.32 from r0
; into q0.
1257define <4 x i32> @testDUPv1i32(<1 x i32> %a) {
1258; CHECK-LABEL: testDUPv1i32:
1259; CHECK:       @ %bb.0:
1260; CHECK-NEXT:    vdup.32 q0, r0
1261; CHECK-NEXT:    bx lr
1262  %b = extractelement <1 x i32> %a, i32 0
1263  %c = insertelement <4 x i32> undef, i32 %b, i32 0
1264  %d = insertelement <4 x i32> %c, i32 %b, i32 1
1265  %e = insertelement <4 x i32> %d, i32 %b, i32 2
1266  %f = insertelement <4 x i32> %e, i32 %b, i32 3
1267  ret <4 x i32> %f
1268}
1269
; Rebuild a <8 x i8> from lanes 0..7 of a <16 x i8>, i.e. take its low half,
; using one extractelement/insertelement pair per lane.
; The CHECK lines expect no data-movement instruction at all: only a
; register "kill" annotation narrowing q0 to d0, then the return.
1270define <8 x i8> @getl(<16 x i8> %x) #0 {
1271; CHECK-LABEL: getl:
1272; CHECK:       @ %bb.0:
1273; CHECK-NEXT:    @ kill: def $d0 killed $d0 killed $q0
1274; CHECK-NEXT:    bx lr
1275  %vecext = extractelement <16 x i8> %x, i32 0
1276  %vecinit = insertelement <8 x i8> undef, i8 %vecext, i32 0
1277  %vecext1 = extractelement <16 x i8> %x, i32 1
1278  %vecinit2 = insertelement <8 x i8> %vecinit, i8 %vecext1, i32 1
1279  %vecext3 = extractelement <16 x i8> %x, i32 2
1280  %vecinit4 = insertelement <8 x i8> %vecinit2, i8 %vecext3, i32 2
1281  %vecext5 = extractelement <16 x i8> %x, i32 3
1282  %vecinit6 = insertelement <8 x i8> %vecinit4, i8 %vecext5, i32 3
1283  %vecext7 = extractelement <16 x i8> %x, i32 4
1284  %vecinit8 = insertelement <8 x i8> %vecinit6, i8 %vecext7, i32 4
1285  %vecext9 = extractelement <16 x i8> %x, i32 5
1286  %vecinit10 = insertelement <8 x i8> %vecinit8, i8 %vecext9, i32 5
1287  %vecext11 = extractelement <16 x i8> %x, i32 6
1288  %vecinit12 = insertelement <8 x i8> %vecinit10, i8 %vecext11, i32 6
1289  %vecext13 = extractelement <16 x i8> %x, i32 7
1290  %vecinit14 = insertelement <8 x i8> %vecinit12, i8 %vecext13, i32 7
1291  ret <8 x i8> %vecinit14
1292}
1293
; Build a <4 x i16> whose lane 0 comes from a variable-index extract of %x
; (index %idx) and whose lanes 1..3 come from constant lanes 1..3 of %x.
; The CHECK lines expect the variable extract to go through a stack slot:
; the index is masked with #7 and scaled by 2, the whole vector is stored
; with vst1.64, and the element is reloaded with vld1.16. Lanes 1..3 are
; expected as plain vmov.u16 / vmov.16 lane moves.
1294define <4 x i16> @test_extracts_inserts_varidx_extract(<8 x i16> %x, i32 %idx) {
1295; CHECK-LABEL: test_extracts_inserts_varidx_extract:
1296; CHECK:       @ %bb.0:
1297; CHECK-NEXT:    .save {r11}
1298; CHECK-NEXT:    push {r11}
1299; CHECK-NEXT:    .setfp r11, sp
1300; CHECK-NEXT:    mov r11, sp
1301; CHECK-NEXT:    .pad #28
1302; CHECK-NEXT:    sub sp, sp, #28
1303; CHECK-NEXT:    bfc sp, #0, #4
1304; CHECK-NEXT:    vmov.u16 r1, d0[1]
1305; CHECK-NEXT:    and r0, r0, #7
1306; CHECK-NEXT:    vmov.u16 r2, d0[2]
1307; CHECK-NEXT:    mov r3, sp
1308; CHECK-NEXT:    vmov.u16 r12, d0[3]
1309; CHECK-NEXT:    lsl r0, r0, #1
1310; CHECK-NEXT:    vst1.64 {d0, d1}, [r3:128], r0
1311; CHECK-NEXT:    vld1.16 {d0[0]}, [r3:16]
1312; CHECK-NEXT:    vmov.16 d0[1], r1
1313; CHECK-NEXT:    vmov.16 d0[2], r2
1314; CHECK-NEXT:    vmov.16 d0[3], r12
1315; CHECK-NEXT:    mov sp, r11
1316; CHECK-NEXT:    pop {r11}
1317; CHECK-NEXT:    bx lr
1318  %tmp = extractelement <8 x i16> %x, i32 %idx
1319  %tmp2 = insertelement <4 x i16> undef, i16 %tmp, i32 0
1320  %tmp3 = extractelement <8 x i16> %x, i32 1
1321  %tmp4 = insertelement <4 x i16> %tmp2, i16 %tmp3, i32 1
1322  %tmp5 = extractelement <8 x i16> %x, i32 2
1323  %tmp6 = insertelement <4 x i16> %tmp4, i16 %tmp5, i32 2
1324  %tmp7 = extractelement <8 x i16> %x, i32 3
1325  %tmp8 = insertelement <4 x i16> %tmp6, i16 %tmp7, i32 3
1326  ret <4 x i16> %tmp8
1327}
1328
; Build a <4 x i16> by inserting lane 0 of %x at a variable index %idx of an
; undef vector, then overwriting lanes 1..3 with constant lanes 1..3 of %x.
; The CHECK lines expect the variable insert to go through an 8-byte stack
; slot: the index is masked with #3, the element is stored with vst1.16 at
; sp + idx*2, and the vector is reloaded with vldr. Lanes 1..3 are expected
; as plain vmov.u16 / vmov.16 lane moves.
1329define <4 x i16> @test_extracts_inserts_varidx_insert(<8 x i16> %x, i32 %idx) {
1330; CHECK-LABEL: test_extracts_inserts_varidx_insert:
1331; CHECK:       @ %bb.0:
1332; CHECK-NEXT:    .pad #8
1333; CHECK-NEXT:    sub sp, sp, #8
1334; CHECK-NEXT:    vmov.u16 r1, d0[1]
1335; CHECK-NEXT:    and r0, r0, #3
1336; CHECK-NEXT:    vmov.u16 r2, d0[2]
1337; CHECK-NEXT:    mov r3, sp
1338; CHECK-NEXT:    vmov.u16 r12, d0[3]
1339; CHECK-NEXT:    orr r0, r3, r0, lsl #1
1340; CHECK-NEXT:    vst1.16 {d0[0]}, [r0:16]
1341; CHECK-NEXT:    vldr d0, [sp]
1342; CHECK-NEXT:    vmov.16 d0[1], r1
1343; CHECK-NEXT:    vmov.16 d0[2], r2
1344; CHECK-NEXT:    vmov.16 d0[3], r12
1345; CHECK-NEXT:    add sp, sp, #8
1346; CHECK-NEXT:    bx lr
1347  %tmp = extractelement <8 x i16> %x, i32 0
1348  %tmp2 = insertelement <4 x i16> undef, i16 %tmp, i32 %idx
1349  %tmp3 = extractelement <8 x i16> %x, i32 1
1350  %tmp4 = insertelement <4 x i16> %tmp2, i16 %tmp3, i32 1
1351  %tmp5 = extractelement <8 x i16> %x, i32 2
1352  %tmp6 = insertelement <4 x i16> %tmp4, i16 %tmp5, i32 2
1353  %tmp7 = extractelement <8 x i16> %x, i32 3
1354  %tmp8 = insertelement <4 x i16> %tmp6, i16 %tmp7, i32 3
1355  ret <4 x i16> %tmp8
1356}
1357
; Extract lane 1 of a <2 x i32>, truncate it to i16, and splat it into all
; four lanes of a <4 x i16>.
; The CHECK lines expect the element to pass through r0 into a scratch lane
; (d16[1]) and then be broadcast with vdup.16.
1358define <4 x i16> @test_dup_v2i32_v4i16(<2 x i32> %a) {
1359; CHECK-LABEL: test_dup_v2i32_v4i16:
1360; CHECK:       @ %bb.0: @ %entry
1361; CHECK-NEXT:    vmov.32 r0, d0[1]
1362; CHECK-NEXT:    vmov.16 d16[1], r0
1363; CHECK-NEXT:    vdup.16 d0, d16[1]
1364; CHECK-NEXT:    bx lr
1365entry:
1366  %x = extractelement <2 x i32> %a, i32 1
1367  %vget_lane = trunc i32 %x to i16
1368  %vecinit.i = insertelement <4 x i16> undef, i16 %vget_lane, i32 0
1369  %vecinit1.i = insertelement <4 x i16> %vecinit.i, i16 %vget_lane, i32 1
1370  %vecinit2.i = insertelement <4 x i16> %vecinit1.i, i16 %vget_lane, i32 2
1371  %vecinit3.i = insertelement <4 x i16> %vecinit2.i, i16 %vget_lane, i32 3
1372  ret <4 x i16> %vecinit3.i
1373}
1374
; Extract lane 3 of a <4 x i32>, truncate it to i16, and splat it into all
; eight lanes of a <8 x i16>.
; The CHECK lines expect the element to be read from d1[1], passed through
; r0 into a scratch lane (d16[3]), and broadcast into q0 with vdup.16.
1375define <8 x i16> @test_dup_v4i32_v8i16(<4 x i32> %a) {
1376; CHECK-LABEL: test_dup_v4i32_v8i16:
1377; CHECK:       @ %bb.0: @ %entry
1378; CHECK-NEXT:    vmov.32 r0, d1[1]
1379; CHECK-NEXT:    vmov.16 d16[3], r0
1380; CHECK-NEXT:    vdup.16 q0, d16[3]
1381; CHECK-NEXT:    bx lr
1382entry:
1383  %x = extractelement <4 x i32> %a, i32 3
1384  %vget_lane = trunc i32 %x to i16
1385  %vecinit.i = insertelement <8 x i16> undef, i16 %vget_lane, i32 0
1386  %vecinit1.i = insertelement <8 x i16> %vecinit.i, i16 %vget_lane, i32 1
1387  %vecinit2.i = insertelement <8 x i16> %vecinit1.i, i16 %vget_lane, i32 2
1388  %vecinit3.i = insertelement <8 x i16> %vecinit2.i, i16 %vget_lane, i32 3
1389  %vecinit4.i = insertelement <8 x i16> %vecinit3.i, i16 %vget_lane, i32 4
1390  %vecinit5.i = insertelement <8 x i16> %vecinit4.i, i16 %vget_lane, i32 5
1391  %vecinit6.i = insertelement <8 x i16> %vecinit5.i, i16 %vget_lane, i32 6
1392  %vecinit7.i = insertelement <8 x i16> %vecinit6.i, i16 %vget_lane, i32 7
1393  ret <8 x i16> %vecinit7.i
1394}
1395
; Extract the only element of a <1 x i64>, truncate it to i16, and splat it
; into all four lanes of a <4 x i16>.
; The CHECK lines expect the low 32 bits (d0[0]) to pass through r0 into a
; scratch lane (d16[0]) and then be broadcast with vdup.16.
1396define <4 x i16> @test_dup_v1i64_v4i16(<1 x i64> %a) {
1397; CHECK-LABEL: test_dup_v1i64_v4i16:
1398; CHECK:       @ %bb.0: @ %entry
1399; CHECK-NEXT:    vmov.32 r0, d0[0]
1400; CHECK-NEXT:    vmov.16 d16[0], r0
1401; CHECK-NEXT:    vdup.16 d0, d16[0]
1402; CHECK-NEXT:    bx lr
1403entry:
1404  %x = extractelement <1 x i64> %a, i32 0
1405  %vget_lane = trunc i64 %x to i16
1406  %vecinit.i = insertelement <4 x i16> undef, i16 %vget_lane, i32 0
1407  %vecinit1.i = insertelement <4 x i16> %vecinit.i, i16 %vget_lane, i32 1
1408  %vecinit2.i = insertelement <4 x i16> %vecinit1.i, i16 %vget_lane, i32 2
1409  %vecinit3.i = insertelement <4 x i16> %vecinit2.i, i16 %vget_lane, i32 3
1410  ret <4 x i16> %vecinit3.i
1411}
1412
; Extract the only element of a <1 x i64>, truncate it to i32, and splat it
; into both lanes of a <2 x i32>.
; The CHECK lines expect a single in-register vdup.32 d0, d0[0] with no trip
; through a core register.
1413define <2 x i32> @test_dup_v1i64_v2i32(<1 x i64> %a) {
1414; CHECK-LABEL: test_dup_v1i64_v2i32:
1415; CHECK:       @ %bb.0: @ %entry
1416; CHECK-NEXT:    vdup.32 d0, d0[0]
1417; CHECK-NEXT:    bx lr
1418entry:
1419  %x = extractelement <1 x i64> %a, i32 0
1420  %vget_lane = trunc i64 %x to i32
1421  %vecinit.i = insertelement <2 x i32> undef, i32 %vget_lane, i32 0
1422  %vecinit1.i = insertelement <2 x i32> %vecinit.i, i32 %vget_lane, i32 1
1423  ret <2 x i32> %vecinit1.i
1424}
1425
; Extract lane 1 of a <2 x i64>, truncate it to i16, and splat it into all
; eight lanes of a <8 x i16>.
; The CHECK lines expect the low 32 bits of that lane (d1[0]) to pass through
; r0 into a scratch lane (d16[2]) and be broadcast into q0 with vdup.16.
1426define <8 x i16> @test_dup_v2i64_v8i16(<2 x i64> %a) {
1427; CHECK-LABEL: test_dup_v2i64_v8i16:
1428; CHECK:       @ %bb.0: @ %entry
1429; CHECK-NEXT:    vmov.32 r0, d1[0]
1430; CHECK-NEXT:    vmov.16 d16[2], r0
1431; CHECK-NEXT:    vdup.16 q0, d16[2]
1432; CHECK-NEXT:    bx lr
1433entry:
1434  %x = extractelement <2 x i64> %a, i32 1
1435  %vget_lane = trunc i64 %x to i16
1436  %vecinit.i = insertelement <8 x i16> undef, i16 %vget_lane, i32 0
1437  %vecinit1.i = insertelement <8 x i16> %vecinit.i, i16 %vget_lane, i32 1
1438  %vecinit2.i = insertelement <8 x i16> %vecinit1.i, i16 %vget_lane, i32 2
1439  %vecinit3.i = insertelement <8 x i16> %vecinit2.i, i16 %vget_lane, i32 3
1440  %vecinit4.i = insertelement <8 x i16> %vecinit3.i, i16 %vget_lane, i32 4
1441  %vecinit5.i = insertelement <8 x i16> %vecinit4.i, i16 %vget_lane, i32 5
1442  %vecinit6.i = insertelement <8 x i16> %vecinit5.i, i16 %vget_lane, i32 6
1443  %vecinit7.i = insertelement <8 x i16> %vecinit6.i, i16 %vget_lane, i32 7
1444  ret <8 x i16> %vecinit7.i
1445}
1446
; Extract lane 1 of a <2 x i64>, truncate it to i32, and splat it into all
; four lanes of a <4 x i32>.
; The CHECK lines expect a single in-register vdup.32 q0, d1[0].
1447define <4 x i32> @test_dup_v2i64_v4i32(<2 x i64> %a) {
1448; CHECK-LABEL: test_dup_v2i64_v4i32:
1449; CHECK:       @ %bb.0: @ %entry
1450; CHECK-NEXT:    vdup.32 q0, d1[0]
1451; CHECK-NEXT:    bx lr
1452entry:
1453  %x = extractelement <2 x i64> %a, i32 1
1454  %vget_lane = trunc i64 %x to i32
1455  %vecinit.i = insertelement <4 x i32> undef, i32 %vget_lane, i32 0
1456  %vecinit1.i = insertelement <4 x i32> %vecinit.i, i32 %vget_lane, i32 1
1457  %vecinit2.i = insertelement <4 x i32> %vecinit1.i, i32 %vget_lane, i32 2
1458  %vecinit3.i = insertelement <4 x i32> %vecinit2.i, i32 %vget_lane, i32 3
1459  ret <4 x i32> %vecinit3.i
1460}
1461
; Extract lane 1 of a <4 x i32>, truncate it to i16, and splat it into all
; four lanes of a <4 x i16>.
; The CHECK lines expect the element to pass through r0 into a scratch lane
; (d16[1]) and then be broadcast with vdup.16.
1462define <4 x i16> @test_dup_v4i32_v4i16(<4 x i32> %a) {
1463; CHECK-LABEL: test_dup_v4i32_v4i16:
1464; CHECK:       @ %bb.0: @ %entry
1465; CHECK-NEXT:    vmov.32 r0, d0[1]
1466; CHECK-NEXT:    vmov.16 d16[1], r0
1467; CHECK-NEXT:    vdup.16 d0, d16[1]
1468; CHECK-NEXT:    bx lr
1469entry:
1470  %x = extractelement <4 x i32> %a, i32 1
1471  %vget_lane = trunc i32 %x to i16
1472  %vecinit.i = insertelement <4 x i16> undef, i16 %vget_lane, i32 0
1473  %vecinit1.i = insertelement <4 x i16> %vecinit.i, i16 %vget_lane, i32 1
1474  %vecinit2.i = insertelement <4 x i16> %vecinit1.i, i16 %vget_lane, i32 2
1475  %vecinit3.i = insertelement <4 x i16> %vecinit2.i, i16 %vget_lane, i32 3
1476  ret <4 x i16> %vecinit3.i
1477}
1478
; Extract lane 0 of a <2 x i64>, truncate it to i16, and splat it into all
; four lanes of a <4 x i16>.
; The CHECK lines expect the low 32 bits (d0[0]) to pass through r0 into a
; scratch lane (d16[0]) and then be broadcast with vdup.16.
1479define <4 x i16> @test_dup_v2i64_v4i16(<2 x i64> %a) {
1480; CHECK-LABEL: test_dup_v2i64_v4i16:
1481; CHECK:       @ %bb.0: @ %entry
1482; CHECK-NEXT:    vmov.32 r0, d0[0]
1483; CHECK-NEXT:    vmov.16 d16[0], r0
1484; CHECK-NEXT:    vdup.16 d0, d16[0]
1485; CHECK-NEXT:    bx lr
1486entry:
1487  %x = extractelement <2 x i64> %a, i32 0
1488  %vget_lane = trunc i64 %x to i16
1489  %vecinit.i = insertelement <4 x i16> undef, i16 %vget_lane, i32 0
1490  %vecinit1.i = insertelement <4 x i16> %vecinit.i, i16 %vget_lane, i32 1
1491  %vecinit2.i = insertelement <4 x i16> %vecinit1.i, i16 %vget_lane, i32 2
1492  %vecinit3.i = insertelement <4 x i16> %vecinit2.i, i16 %vget_lane, i32 3
1493  ret <4 x i16> %vecinit3.i
1494}
1495
; Extract lane 0 of a <2 x i64>, truncate it to i32, and splat it into both
; lanes of a <2 x i32>.
; The CHECK lines expect a single in-register vdup.32 d0, d0[0].
1496define <2 x i32> @test_dup_v2i64_v2i32(<2 x i64> %a) {
1497; CHECK-LABEL: test_dup_v2i64_v2i32:
1498; CHECK:       @ %bb.0: @ %entry
1499; CHECK-NEXT:    vdup.32 d0, d0[0]
1500; CHECK-NEXT:    bx lr
1501entry:
1502  %x = extractelement <2 x i64> %a, i32 0
1503  %vget_lane = trunc i64 %x to i32
1504  %vecinit.i = insertelement <2 x i32> undef, i32 %vget_lane, i32 0
1505  %vecinit1.i = insertelement <2 x i32> %vecinit.i, i32 %vget_lane, i32 1
1506  ret <2 x i32> %vecinit1.i
1507}
1508
; Place lane 0 of %a into lane 1 of an otherwise-undef <2 x i32>; lane 0 of
; the result is left undef.
; The CHECK lines expect this as a splat of lane 0, vdup.32 d0, d0[0].
1509define <2 x i32> @test_concat_undef_v1i32(<2 x i32> %a) {
1510; CHECK-LABEL: test_concat_undef_v1i32:
1511; CHECK:       @ %bb.0: @ %entry
1512; CHECK-NEXT:    vdup.32 d0, d0[0]
1513; CHECK-NEXT:    bx lr
1514entry:
1515  %0 = extractelement <2 x i32> %a, i32 0
1516  %vecinit1.i = insertelement <2 x i32> undef, i32 %0, i32 1
1517  ret <2 x i32> %vecinit1.i
1518}
1519
; Insert lane 0 of %a into both lanes of a <2 x i32>.
; The CHECK lines expect a single splat, vdup.32 d0, d0[0].
1520define <2 x i32> @test_concat_same_v1i32_v1i32(<2 x i32> %a) {
1521; CHECK-LABEL: test_concat_same_v1i32_v1i32:
1522; CHECK:       @ %bb.0: @ %entry
1523; CHECK-NEXT:    vdup.32 d0, d0[0]
1524; CHECK-NEXT:    bx lr
1525entry:
1526  %0 = extractelement <2 x i32> %a, i32 0
1527  %vecinit.i = insertelement <2 x i32> undef, i32 %0, i32 0
1528  %vecinit1.i = insertelement <2 x i32> %vecinit.i, i32 %0, i32 1
1529  ret <2 x i32> %vecinit1.i
1530}
1531
1532
; Concatenate the low half of %x (lanes 0..7) with the low half of %y (mask
; indices 16..23) using a single shufflevector.
; The CHECK lines expect one D-register copy, vmov.f64 d1, d2.
1533define <16 x i8> @test_concat_v16i8_v16i8_v16i8(<16 x i8> %x, <16 x i8> %y) #0 {
1534; CHECK-LABEL: test_concat_v16i8_v16i8_v16i8:
1535; CHECK:       @ %bb.0: @ %entry
1536; CHECK-NEXT:    vmov.f64 d1, d2
1537; CHECK-NEXT:    bx lr
1538entry:
1539  %vecinit30 = shufflevector <16 x i8> %x, <16 x i8> %y, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
1540  ret <16 x i8> %vecinit30
1541}
1542
; Widen a <8 x i8> %x into the low half of a <16 x i8> lane by lane, then
; shuffle in the low half of %y (mask indices 16..23) as the high half.
; The CHECK lines expect the per-lane copies to disappear (only a register
; "kill" annotation remains), leaving one D-register copy, vmov.f64 d1, d2.
1543define <16 x i8> @test_concat_v16i8_v8i8_v16i8(<8 x i8> %x, <16 x i8> %y) #0 {
1544; CHECK-LABEL: test_concat_v16i8_v8i8_v16i8:
1545; CHECK:       @ %bb.0: @ %entry
1546; CHECK-NEXT:    @ kill: def $d0 killed $d0 def $q0
1547; CHECK-NEXT:    vmov.f64 d1, d2
1548; CHECK-NEXT:    bx lr
1549entry:
1550  %vecext = extractelement <8 x i8> %x, i32 0
1551  %vecinit = insertelement <16 x i8> undef, i8 %vecext, i32 0
1552  %vecext1 = extractelement <8 x i8> %x, i32 1
1553  %vecinit2 = insertelement <16 x i8> %vecinit, i8 %vecext1, i32 1
1554  %vecext3 = extractelement <8 x i8> %x, i32 2
1555  %vecinit4 = insertelement <16 x i8> %vecinit2, i8 %vecext3, i32 2
1556  %vecext5 = extractelement <8 x i8> %x, i32 3
1557  %vecinit6 = insertelement <16 x i8> %vecinit4, i8 %vecext5, i32 3
1558  %vecext7 = extractelement <8 x i8> %x, i32 4
1559  %vecinit8 = insertelement <16 x i8> %vecinit6, i8 %vecext7, i32 4
1560  %vecext9 = extractelement <8 x i8> %x, i32 5
1561  %vecinit10 = insertelement <16 x i8> %vecinit8, i8 %vecext9, i32 5
1562  %vecext11 = extractelement <8 x i8> %x, i32 6
1563  %vecinit12 = insertelement <16 x i8> %vecinit10, i8 %vecext11, i32 6
1564  %vecext13 = extractelement <8 x i8> %x, i32 7
1565  %vecinit14 = insertelement <16 x i8> %vecinit12, i8 %vecext13, i32 7
1566  %vecinit30 = shufflevector <16 x i8> %vecinit14, <16 x i8> %y, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
1567  ret <16 x i8> %vecinit30
1568}
1569
; Build a <16 x i8> lane by lane: lanes 0..7 from the low half of the
; <16 x i8> %x, lanes 8..15 from the <8 x i8> %y.
; The CHECK lines expect the whole extract/insert chain to become one
; D-register copy, vmov.f64 d1, d2.
1570define <16 x i8> @test_concat_v16i8_v16i8_v8i8(<16 x i8> %x, <8 x i8> %y) #0 {
1571; CHECK-LABEL: test_concat_v16i8_v16i8_v8i8:
1572; CHECK:       @ %bb.0: @ %entry
1573; CHECK-NEXT:    vmov.f64 d1, d2
1574; CHECK-NEXT:    bx lr
1575entry:
1576  %vecext = extractelement <16 x i8> %x, i32 0
1577  %vecinit = insertelement <16 x i8> undef, i8 %vecext, i32 0
1578  %vecext1 = extractelement <16 x i8> %x, i32 1
1579  %vecinit2 = insertelement <16 x i8> %vecinit, i8 %vecext1, i32 1
1580  %vecext3 = extractelement <16 x i8> %x, i32 2
1581  %vecinit4 = insertelement <16 x i8> %vecinit2, i8 %vecext3, i32 2
1582  %vecext5 = extractelement <16 x i8> %x, i32 3
1583  %vecinit6 = insertelement <16 x i8> %vecinit4, i8 %vecext5, i32 3
1584  %vecext7 = extractelement <16 x i8> %x, i32 4
1585  %vecinit8 = insertelement <16 x i8> %vecinit6, i8 %vecext7, i32 4
1586  %vecext9 = extractelement <16 x i8> %x, i32 5
1587  %vecinit10 = insertelement <16 x i8> %vecinit8, i8 %vecext9, i32 5
1588  %vecext11 = extractelement <16 x i8> %x, i32 6
1589  %vecinit12 = insertelement <16 x i8> %vecinit10, i8 %vecext11, i32 6
1590  %vecext13 = extractelement <16 x i8> %x, i32 7
1591  %vecinit14 = insertelement <16 x i8> %vecinit12, i8 %vecext13, i32 7
1592  %vecext15 = extractelement <8 x i8> %y, i32 0
1593  %vecinit16 = insertelement <16 x i8> %vecinit14, i8 %vecext15, i32 8
1594  %vecext17 = extractelement <8 x i8> %y, i32 1
1595  %vecinit18 = insertelement <16 x i8> %vecinit16, i8 %vecext17, i32 9
1596  %vecext19 = extractelement <8 x i8> %y, i32 2
1597  %vecinit20 = insertelement <16 x i8> %vecinit18, i8 %vecext19, i32 10
1598  %vecext21 = extractelement <8 x i8> %y, i32 3
1599  %vecinit22 = insertelement <16 x i8> %vecinit20, i8 %vecext21, i32 11
1600  %vecext23 = extractelement <8 x i8> %y, i32 4
1601  %vecinit24 = insertelement <16 x i8> %vecinit22, i8 %vecext23, i32 12
1602  %vecext25 = extractelement <8 x i8> %y, i32 5
1603  %vecinit26 = insertelement <16 x i8> %vecinit24, i8 %vecext25, i32 13
1604  %vecext27 = extractelement <8 x i8> %y, i32 6
1605  %vecinit28 = insertelement <16 x i8> %vecinit26, i8 %vecext27, i32 14
1606  %vecext29 = extractelement <8 x i8> %y, i32 7
1607  %vecinit30 = insertelement <16 x i8> %vecinit28, i8 %vecext29, i32 15
1608  ret <16 x i8> %vecinit30
1609}
1610
; Build a <16 x i8> lane by lane: lanes 0..7 from the <8 x i8> %x, lanes
; 8..15 from the <8 x i8> %y.
; The CHECK lines expect no data-movement instruction at all: only register
; "kill" annotations for d1 and d0, then the return.
1611define <16 x i8> @test_concat_v16i8_v8i8_v8i8(<8 x i8> %x, <8 x i8> %y) #0 {
1612; CHECK-LABEL: test_concat_v16i8_v8i8_v8i8:
1613; CHECK:       @ %bb.0: @ %entry
1614; CHECK-NEXT:    @ kill: def $d1 killed $d1 killed $q0 def $q0
1615; CHECK-NEXT:    @ kill: def $d0 killed $d0 killed $q0 def $q0
1616; CHECK-NEXT:    bx lr
1617entry:
1618  %vecext = extractelement <8 x i8> %x, i32 0
1619  %vecinit = insertelement <16 x i8> undef, i8 %vecext, i32 0
1620  %vecext1 = extractelement <8 x i8> %x, i32 1
1621  %vecinit2 = insertelement <16 x i8> %vecinit, i8 %vecext1, i32 1
1622  %vecext3 = extractelement <8 x i8> %x, i32 2
1623  %vecinit4 = insertelement <16 x i8> %vecinit2, i8 %vecext3, i32 2
1624  %vecext5 = extractelement <8 x i8> %x, i32 3
1625  %vecinit6 = insertelement <16 x i8> %vecinit4, i8 %vecext5, i32 3
1626  %vecext7 = extractelement <8 x i8> %x, i32 4
1627  %vecinit8 = insertelement <16 x i8> %vecinit6, i8 %vecext7, i32 4
1628  %vecext9 = extractelement <8 x i8> %x, i32 5
1629  %vecinit10 = insertelement <16 x i8> %vecinit8, i8 %vecext9, i32 5
1630  %vecext11 = extractelement <8 x i8> %x, i32 6
1631  %vecinit12 = insertelement <16 x i8> %vecinit10, i8 %vecext11, i32 6
1632  %vecext13 = extractelement <8 x i8> %x, i32 7
1633  %vecinit14 = insertelement <16 x i8> %vecinit12, i8 %vecext13, i32 7
1634  %vecext15 = extractelement <8 x i8> %y, i32 0
1635  %vecinit16 = insertelement <16 x i8> %vecinit14, i8 %vecext15, i32 8
1636  %vecext17 = extractelement <8 x i8> %y, i32 1
1637  %vecinit18 = insertelement <16 x i8> %vecinit16, i8 %vecext17, i32 9
1638  %vecext19 = extractelement <8 x i8> %y, i32 2
1639  %vecinit20 = insertelement <16 x i8> %vecinit18, i8 %vecext19, i32 10
1640  %vecext21 = extractelement <8 x i8> %y, i32 3
1641  %vecinit22 = insertelement <16 x i8> %vecinit20, i8 %vecext21, i32 11
1642  %vecext23 = extractelement <8 x i8> %y, i32 4
1643  %vecinit24 = insertelement <16 x i8> %vecinit22, i8 %vecext23, i32 12
1644  %vecext25 = extractelement <8 x i8> %y, i32 5
1645  %vecinit26 = insertelement <16 x i8> %vecinit24, i8 %vecext25, i32 13
1646  %vecext27 = extractelement <8 x i8> %y, i32 6
1647  %vecinit28 = insertelement <16 x i8> %vecinit26, i8 %vecext27, i32 14
1648  %vecext29 = extractelement <8 x i8> %y, i32 7
1649  %vecinit30 = insertelement <16 x i8> %vecinit28, i8 %vecext29, i32 15
1650  ret <16 x i8> %vecinit30
1651}
1652
; Concatenate the low half of %x (lanes 0..3) with the low half of %y (mask
; indices 8..11) using a single shufflevector.
; The CHECK lines expect one D-register copy, vmov.f64 d1, d2.
1653define <8 x i16> @test_concat_v8i16_v8i16_v8i16(<8 x i16> %x, <8 x i16> %y) #0 {
1654; CHECK-LABEL: test_concat_v8i16_v8i16_v8i16:
1655; CHECK:       @ %bb.0: @ %entry
1656; CHECK-NEXT:    vmov.f64 d1, d2
1657; CHECK-NEXT:    bx lr
1658entry:
1659  %vecinit14 = shufflevector <8 x i16> %x, <8 x i16> %y, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
1660  ret <8 x i16> %vecinit14
1661}
1662
; Widen a <4 x i16> %x into the low half of a <8 x i16> lane by lane, then
; shuffle in the low half of %y (mask indices 8..11) as the high half.
; The CHECK lines expect the per-lane copies to disappear (only a register
; "kill" annotation remains), leaving one D-register copy, vmov.f64 d1, d2.
1663define <8 x i16> @test_concat_v8i16_v4i16_v8i16(<4 x i16> %x, <8 x i16> %y) #0 {
1664; CHECK-LABEL: test_concat_v8i16_v4i16_v8i16:
1665; CHECK:       @ %bb.0: @ %entry
1666; CHECK-NEXT:    @ kill: def $d0 killed $d0 def $q0
1667; CHECK-NEXT:    vmov.f64 d1, d2
1668; CHECK-NEXT:    bx lr
1669entry:
1670  %vecext = extractelement <4 x i16> %x, i32 0
1671  %vecinit = insertelement <8 x i16> undef, i16 %vecext, i32 0
1672  %vecext1 = extractelement <4 x i16> %x, i32 1
1673  %vecinit2 = insertelement <8 x i16> %vecinit, i16 %vecext1, i32 1
1674  %vecext3 = extractelement <4 x i16> %x, i32 2
1675  %vecinit4 = insertelement <8 x i16> %vecinit2, i16 %vecext3, i32 2
1676  %vecext5 = extractelement <4 x i16> %x, i32 3
1677  %vecinit6 = insertelement <8 x i16> %vecinit4, i16 %vecext5, i32 3
1678  %vecinit14 = shufflevector <8 x i16> %vecinit6, <8 x i16> %y, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
1679  ret <8 x i16> %vecinit14
1680}
1681
; Build a <8 x i16> lane by lane: lanes 0..3 from the low half of the
; <8 x i16> %x, lanes 4..7 from the <4 x i16> %y.
; The CHECK lines expect the whole extract/insert chain to become one
; D-register copy, vmov.f64 d1, d2.
1682define <8 x i16> @test_concat_v8i16_v8i16_v4i16(<8 x i16> %x, <4 x i16> %y) #0 {
1683; CHECK-LABEL: test_concat_v8i16_v8i16_v4i16:
1684; CHECK:       @ %bb.0: @ %entry
1685; CHECK-NEXT:    vmov.f64 d1, d2
1686; CHECK-NEXT:    bx lr
1687entry:
1688  %vecext = extractelement <8 x i16> %x, i32 0
1689  %vecinit = insertelement <8 x i16> undef, i16 %vecext, i32 0
1690  %vecext1 = extractelement <8 x i16> %x, i32 1
1691  %vecinit2 = insertelement <8 x i16> %vecinit, i16 %vecext1, i32 1
1692  %vecext3 = extractelement <8 x i16> %x, i32 2
1693  %vecinit4 = insertelement <8 x i16> %vecinit2, i16 %vecext3, i32 2
1694  %vecext5 = extractelement <8 x i16> %x, i32 3
1695  %vecinit6 = insertelement <8 x i16> %vecinit4, i16 %vecext5, i32 3
1696  %vecext7 = extractelement <4 x i16> %y, i32 0
1697  %vecinit8 = insertelement <8 x i16> %vecinit6, i16 %vecext7, i32 4
1698  %vecext9 = extractelement <4 x i16> %y, i32 1
1699  %vecinit10 = insertelement <8 x i16> %vecinit8, i16 %vecext9, i32 5
1700  %vecext11 = extractelement <4 x i16> %y, i32 2
1701  %vecinit12 = insertelement <8 x i16> %vecinit10, i16 %vecext11, i32 6
1702  %vecext13 = extractelement <4 x i16> %y, i32 3
1703  %vecinit14 = insertelement <8 x i16> %vecinit12, i16 %vecext13, i32 7
1704  ret <8 x i16> %vecinit14
1705}
1706
; Both arguments are <4 x i16>: the four lanes of %x are inserted into lanes
; 0-3 and the four lanes of %y into lanes 4-7 of the <8 x i16> result. The
; autogenerated assertions expect no data-moving instruction at all, only
; register kill annotations ahead of the return.
1707define <8 x i16> @test_concat_v8i16_v4i16_v4i16(<4 x i16> %x, <4 x i16> %y) #0 {
1708; CHECK-LABEL: test_concat_v8i16_v4i16_v4i16:
1709; CHECK:       @ %bb.0: @ %entry
1710; CHECK-NEXT:    @ kill: def $d1 killed $d1 killed $q0 def $q0
1711; CHECK-NEXT:    @ kill: def $d0 killed $d0 killed $q0 def $q0
1712; CHECK-NEXT:    bx lr
1713entry:
1714  %vecext = extractelement <4 x i16> %x, i32 0
1715  %vecinit = insertelement <8 x i16> undef, i16 %vecext, i32 0
1716  %vecext1 = extractelement <4 x i16> %x, i32 1
1717  %vecinit2 = insertelement <8 x i16> %vecinit, i16 %vecext1, i32 1
1718  %vecext3 = extractelement <4 x i16> %x, i32 2
1719  %vecinit4 = insertelement <8 x i16> %vecinit2, i16 %vecext3, i32 2
1720  %vecext5 = extractelement <4 x i16> %x, i32 3
1721  %vecinit6 = insertelement <8 x i16> %vecinit4, i16 %vecext5, i32 3
1722  %vecext7 = extractelement <4 x i16> %y, i32 0
1723  %vecinit8 = insertelement <8 x i16> %vecinit6, i16 %vecext7, i32 4
1724  %vecext9 = extractelement <4 x i16> %y, i32 1
1725  %vecinit10 = insertelement <8 x i16> %vecinit8, i16 %vecext9, i32 5
1726  %vecext11 = extractelement <4 x i16> %y, i32 2
1727  %vecinit12 = insertelement <8 x i16> %vecinit10, i16 %vecext11, i32 6
1728  %vecext13 = extractelement <4 x i16> %y, i32 3
1729  %vecinit14 = insertelement <8 x i16> %vecinit12, i16 %vecext13, i32 7
1730  ret <8 x i16> %vecinit14
1731}
1732
; Single shufflevector of two <4 x i32> arguments with mask <0, 1, 4, 5>: the
; result holds lanes 0-1 of %x followed by lanes 0-1 of %y. The autogenerated
; assertions expect one `vmov.f64 d1, d2` and then the return.
1733define <4 x i32> @test_concat_v4i32_v4i32_v4i32(<4 x i32> %x, <4 x i32> %y) #0 {
1734; CHECK-LABEL: test_concat_v4i32_v4i32_v4i32:
1735; CHECK:       @ %bb.0: @ %entry
1736; CHECK-NEXT:    vmov.f64 d1, d2
1737; CHECK-NEXT:    bx lr
1738entry:
1739  %vecinit6 = shufflevector <4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
1740  ret <4 x i32> %vecinit6
1741}
1742
; %x is a <2 x i32>: both of its lanes are inserted into lanes 0-1 of an
; otherwise undef <4 x i32>, and a shufflevector with mask <0, 1, 4, 5> then
; appends lanes 0-1 of %y. The autogenerated assertions expect a register kill
; annotation, one `vmov.f64 d1, d2`, and the return.
1743define <4 x i32> @test_concat_v4i32_v2i32_v4i32(<2 x i32> %x, <4 x i32> %y) #0 {
1744; CHECK-LABEL: test_concat_v4i32_v2i32_v4i32:
1745; CHECK:       @ %bb.0: @ %entry
1746; CHECK-NEXT:    @ kill: def $d0 killed $d0 def $q0
1747; CHECK-NEXT:    vmov.f64 d1, d2
1748; CHECK-NEXT:    bx lr
1749entry:
1750  %vecext = extractelement <2 x i32> %x, i32 0
1751  %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
1752  %vecext1 = extractelement <2 x i32> %x, i32 1
1753  %vecinit2 = insertelement <4 x i32> %vecinit, i32 %vecext1, i32 1
1754  %vecinit6 = shufflevector <4 x i32> %vecinit2, <4 x i32> %y, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
1755  ret <4 x i32> %vecinit6
1756}
1757
; Builds the result from extractelement/insertelement pairs only: lanes 0-1
; come from lanes 0-1 of the <4 x i32> %x, lanes 2-3 from both lanes of the
; <2 x i32> %y. The autogenerated assertions expect one `vmov.f64 d1, d2`
; followed by the return.
1758define <4 x i32> @test_concat_v4i32_v4i32_v2i32(<4 x i32> %x, <2 x i32> %y) #0 {
1759; CHECK-LABEL: test_concat_v4i32_v4i32_v2i32:
1760; CHECK:       @ %bb.0: @ %entry
1761; CHECK-NEXT:    vmov.f64 d1, d2
1762; CHECK-NEXT:    bx lr
1763entry:
1764  %vecext = extractelement <4 x i32> %x, i32 0
1765  %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
1766  %vecext1 = extractelement <4 x i32> %x, i32 1
1767  %vecinit2 = insertelement <4 x i32> %vecinit, i32 %vecext1, i32 1
1768  %vecext3 = extractelement <2 x i32> %y, i32 0
1769  %vecinit4 = insertelement <4 x i32> %vecinit2, i32 %vecext3, i32 2
1770  %vecext5 = extractelement <2 x i32> %y, i32 1
1771  %vecinit6 = insertelement <4 x i32> %vecinit4, i32 %vecext5, i32 3
1772  ret <4 x i32> %vecinit6
1773}
1774
; Single widening shufflevector of two <2 x i32> arguments with mask
; <0, 1, 2, 3>: the <4 x i32> result is both lanes of %x followed by both lanes
; of %y. The autogenerated assertions expect no data-moving instruction, only
; register kill annotations ahead of the return.
1775define <4 x i32> @test_concat_v4i32_v2i32_v2i32(<2 x i32> %x, <2 x i32> %y) #0 {
1776; CHECK-LABEL: test_concat_v4i32_v2i32_v2i32:
1777; CHECK:       @ %bb.0: @ %entry
1778; CHECK-NEXT:    @ kill: def $d1 killed $d1 killed $q0 def $q0
1779; CHECK-NEXT:    @ kill: def $d0 killed $d0 killed $q0 def $q0
1780; CHECK-NEXT:    bx lr
1781entry:
1782  %vecinit6 = shufflevector <2 x i32> %x, <2 x i32> %y, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
1783  ret <4 x i32> %vecinit6
1784}
1785
; Single shufflevector of two <2 x i64> arguments with mask <0, 2>: the result
; holds lane 0 of %x followed by lane 0 of %y. The autogenerated assertions
; expect one `vmov.f64 d1, d2` and then the return.
1786define <2 x i64> @test_concat_v2i64_v2i64_v2i64(<2 x i64> %x, <2 x i64> %y) #0 {
1787; CHECK-LABEL: test_concat_v2i64_v2i64_v2i64:
1788; CHECK:       @ %bb.0: @ %entry
1789; CHECK-NEXT:    vmov.f64 d1, d2
1790; CHECK-NEXT:    bx lr
1791entry:
1792  %vecinit2 = shufflevector <2 x i64> %x, <2 x i64> %y, <2 x i32> <i32 0, i32 2>
1793  ret <2 x i64> %vecinit2
1794}
1795
; %x is a <1 x i64>: its only element is inserted into lane 0 of an otherwise
; undef <2 x i64>, and a shufflevector with mask <0, 2> then pairs it with lane
; 0 of %y. The autogenerated assertions expect a register kill annotation, one
; `vmov.f64 d1, d2`, and the return.
1796define <2 x i64> @test_concat_v2i64_v1i64_v2i64(<1 x i64> %x, <2 x i64> %y) #0 {
1797; CHECK-LABEL: test_concat_v2i64_v1i64_v2i64:
1798; CHECK:       @ %bb.0: @ %entry
1799; CHECK-NEXT:    @ kill: def $d0 killed $d0 def $q0
1800; CHECK-NEXT:    vmov.f64 d1, d2
1801; CHECK-NEXT:    bx lr
1802entry:
1803  %vecext = extractelement <1 x i64> %x, i32 0
1804  %vecinit = insertelement <2 x i64> undef, i64 %vecext, i32 0
1805  %vecinit2 = shufflevector <2 x i64> %vecinit, <2 x i64> %y, <2 x i32> <i32 0, i32 2>
1806  ret <2 x i64> %vecinit2
1807}
1808
; Builds the result from extractelement/insertelement pairs only: lane 0 comes
; from lane 0 of the <2 x i64> %x, lane 1 from the only element of the
; <1 x i64> %y. The autogenerated assertions expect one `vmov.f64 d1, d2`
; followed by the return.
1809define <2 x i64> @test_concat_v2i64_v2i64_v1i64(<2 x i64> %x, <1 x i64> %y) #0 {
1810; CHECK-LABEL: test_concat_v2i64_v2i64_v1i64:
1811; CHECK:       @ %bb.0: @ %entry
1812; CHECK-NEXT:    vmov.f64 d1, d2
1813; CHECK-NEXT:    bx lr
1814entry:
1815  %vecext = extractelement <2 x i64> %x, i32 0
1816  %vecinit = insertelement <2 x i64> undef, i64 %vecext, i32 0
1817  %vecext1 = extractelement <1 x i64> %y, i32 0
1818  %vecinit2 = insertelement <2 x i64> %vecinit, i64 %vecext1, i32 1
1819  ret <2 x i64> %vecinit2
1820}
1821
; Both arguments are <1 x i64>: the element of %x is inserted into lane 0 and
; the element of %y into lane 1 of the <2 x i64> result. The autogenerated
; assertions expect no data-moving instruction, only register kill annotations
; ahead of the return.
1822define <2 x i64> @test_concat_v2i64_v1i64_v1i64(<1 x i64> %x, <1 x i64> %y) #0 {
1823; CHECK-LABEL: test_concat_v2i64_v1i64_v1i64:
1824; CHECK:       @ %bb.0: @ %entry
1825; CHECK-NEXT:    @ kill: def $d1 killed $d1 killed $q0 def $q0
1826; CHECK-NEXT:    @ kill: def $d0 killed $d0 killed $q0 def $q0
1827; CHECK-NEXT:    bx lr
1828entry:
1829  %vecext = extractelement <1 x i64> %x, i32 0
1830  %vecinit = insertelement <2 x i64> undef, i64 %vecext, i32 0
1831  %vecext1 = extractelement <1 x i64> %y, i32 0
1832  %vecinit2 = insertelement <2 x i64> %vecinit, i64 %vecext1, i32 1
1833  ret <2 x i64> %vecinit2
1834}
1835
1836
; Splat of a constant: the all-zero shuffle mask copies lane 0 of a
; zeroinitializer <1 x i16> into every lane of the <4 x i16> result. The
; autogenerated assertions expect a single `vmov.i32 d0, #0x0`.
1837define <4 x i16> @concat_vector_v4i16_const() {
1838; CHECK-LABEL: concat_vector_v4i16_const:
1839; CHECK:       @ %bb.0:
1840; CHECK-NEXT:    vmov.i32 d0, #0x0
1841; CHECK-NEXT:    bx lr
1842 %r = shufflevector <1 x i16> zeroinitializer, <1 x i16> undef, <4 x i32> zeroinitializer
1843 ret <4 x i16> %r
1844}
1845
; Splat of a non-zero constant: the all-zero shuffle mask copies lane 0 of
; <1 x i16> <i16 1> into every lane of the <4 x i16> result. The autogenerated
; assertions expect a single `vmov.i16 d0, #0x1`.
1846define <4 x i16> @concat_vector_v4i16_const_one() {
1847; CHECK-LABEL: concat_vector_v4i16_const_one:
1848; CHECK:       @ %bb.0:
1849; CHECK-NEXT:    vmov.i16 d0, #0x1
1850; CHECK-NEXT:    bx lr
1851 %r = shufflevector <1 x i16> <i16 1>, <1 x i16> undef, <4 x i32> zeroinitializer
1852 ret <4 x i16> %r
1853}
1854
; Splat of a constant: the all-zero shuffle mask copies lane 0 of a
; zeroinitializer <1 x i32> into every lane of the <4 x i32> result. The
; autogenerated assertions expect a single `vmov.i32 q0, #0x0`.
1855define <4 x i32> @concat_vector_v4i32_const() {
1856; CHECK-LABEL: concat_vector_v4i32_const:
1857; CHECK:       @ %bb.0:
1858; CHECK-NEXT:    vmov.i32 q0, #0x0
1859; CHECK-NEXT:    bx lr
1860 %r = shufflevector <1 x i32> zeroinitializer, <1 x i32> undef, <4 x i32> zeroinitializer
1861 ret <4 x i32> %r
1862}
1863
; Splat of a constant: the all-zero shuffle mask copies lane 0 of a
; zeroinitializer <1 x i8> into every lane of the <8 x i8> result. The
; autogenerated assertions expect a single `vmov.i32 d0, #0x0`.
1864define <8 x i8> @concat_vector_v8i8_const() {
1865; CHECK-LABEL: concat_vector_v8i8_const:
1866; CHECK:       @ %bb.0:
1867; CHECK-NEXT:    vmov.i32 d0, #0x0
1868; CHECK-NEXT:    bx lr
1869 %r = shufflevector <1 x i8> zeroinitializer, <1 x i8> undef, <8 x i32> zeroinitializer
1870 ret <8 x i8> %r
1871}
1872
; Splat of a constant: the all-zero shuffle mask copies lane 0 of a
; zeroinitializer <1 x i16> into every lane of the <8 x i16> result. The
; autogenerated assertions expect a single `vmov.i32 q0, #0x0`.
1873define <8 x i16> @concat_vector_v8i16_const() {
1874; CHECK-LABEL: concat_vector_v8i16_const:
1875; CHECK:       @ %bb.0:
1876; CHECK-NEXT:    vmov.i32 q0, #0x0
1877; CHECK-NEXT:    bx lr
1878 %r = shufflevector <1 x i16> zeroinitializer, <1 x i16> undef, <8 x i32> zeroinitializer
1879 ret <8 x i16> %r
1880}
1881
; Splat of a non-zero constant: the all-zero shuffle mask copies lane 0 of
; <1 x i16> <i16 1> into every lane of the <8 x i16> result. The autogenerated
; assertions expect a single `vmov.i16 q0, #0x1`.
1882define <8 x i16> @concat_vector_v8i16_const_one() {
1883; CHECK-LABEL: concat_vector_v8i16_const_one:
1884; CHECK:       @ %bb.0:
1885; CHECK-NEXT:    vmov.i16 q0, #0x1
1886; CHECK-NEXT:    bx lr
1887 %r = shufflevector <1 x i16> <i16 1>, <1 x i16> undef, <8 x i32> zeroinitializer
1888 ret <8 x i16> %r
1889}
1890
; Splat of a constant: the all-zero shuffle mask copies lane 0 of a
; zeroinitializer <1 x i8> into every lane of the <16 x i8> result. The
; autogenerated assertions expect a single `vmov.i32 q0, #0x0`.
1891define <16 x i8> @concat_vector_v16i8_const() {
1892; CHECK-LABEL: concat_vector_v16i8_const:
1893; CHECK:       @ %bb.0:
1894; CHECK-NEXT:    vmov.i32 q0, #0x0
1895; CHECK-NEXT:    bx lr
1896 %r = shufflevector <1 x i8> zeroinitializer, <1 x i8> undef, <16 x i32> zeroinitializer
1897 ret <16 x i8> %r
1898}
1899
; Splat of an argument: the all-zero shuffle mask copies the only lane of the
; <1 x i16> %a into every lane of the <4 x i16> result. The autogenerated
; assertions expect a single `vdup.16 d0, r0`.
1900define <4 x i16> @concat_vector_v4i16(<1 x i16> %a) {
1901; CHECK-LABEL: concat_vector_v4i16:
1902; CHECK:       @ %bb.0:
1903; CHECK-NEXT:    vdup.16 d0, r0
1904; CHECK-NEXT:    bx lr
1905 %r = shufflevector <1 x i16> %a, <1 x i16> undef, <4 x i32> zeroinitializer
1906 ret <4 x i16> %r
1907}
1908
; Splat of an argument: the all-zero shuffle mask copies the only lane of the
; <1 x i32> %a into every lane of the <4 x i32> result. The autogenerated
; assertions expect a single `vdup.32 q0, r0`.
1909define <4 x i32> @concat_vector_v4i32(<1 x i32> %a) {
1910; CHECK-LABEL: concat_vector_v4i32:
1911; CHECK:       @ %bb.0:
1912; CHECK-NEXT:    vdup.32 q0, r0
1913; CHECK-NEXT:    bx lr
1914 %r = shufflevector <1 x i32> %a, <1 x i32> undef, <4 x i32> zeroinitializer
1915 ret <4 x i32> %r
1916}
1917
; Splat of an argument: the all-zero shuffle mask copies the only lane of the
; <1 x i8> %a into every lane of the <8 x i8> result. The autogenerated
; assertions expect a single `vdup.8 d0, r0`.
1918define <8 x i8> @concat_vector_v8i8(<1 x i8> %a) {
1919; CHECK-LABEL: concat_vector_v8i8:
1920; CHECK:       @ %bb.0:
1921; CHECK-NEXT:    vdup.8 d0, r0
1922; CHECK-NEXT:    bx lr
1923 %r = shufflevector <1 x i8> %a, <1 x i8> undef, <8 x i32> zeroinitializer
1924 ret <8 x i8> %r
1925}
1926
; Splat of an argument: the all-zero shuffle mask copies the only lane of the
; <1 x i16> %a into every lane of the <8 x i16> result. The autogenerated
; assertions expect a single `vdup.16 q0, r0`.
1927define <8 x i16> @concat_vector_v8i16(<1 x i16> %a) {
1928; CHECK-LABEL: concat_vector_v8i16:
1929; CHECK:       @ %bb.0:
1930; CHECK-NEXT:    vdup.16 q0, r0
1931; CHECK-NEXT:    bx lr
1932 %r = shufflevector <1 x i16> %a, <1 x i16> undef, <8 x i32> zeroinitializer
1933 ret <8 x i16> %r
1934}
1935
; Splat of an argument: the all-zero shuffle mask copies the only lane of the
; <1 x i8> %a into every lane of the <16 x i8> result. The autogenerated
; assertions expect a single `vdup.8 q0, r0`.
1936define <16 x i8> @concat_vector_v16i8(<1 x i8> %a) {
1937; CHECK-LABEL: concat_vector_v16i8:
1938; CHECK:       @ %bb.0:
1939; CHECK-NEXT:    vdup.8 q0, r0
1940; CHECK-NEXT:    bx lr
1941 %r = shufflevector <1 x i8> %a, <1 x i8> undef, <16 x i32> zeroinitializer
1942 ret <16 x i8> %r
1943}
1944