xref: /llvm-project/llvm/test/CodeGen/Thumb2/mve-shuffle.ll (revision 7b3bbd83c0c24087072ec5b22a76799ab31f87d5)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve,+fullfp16 -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-LV
3; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-LV,CHECKFP
4; RUN: llc -early-live-intervals -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve,+fullfp16 -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-LIS
5; RUN: llc -early-live-intervals -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-LIS,CHECKFP
6
; Full reversal of a single <4 x i32> source (mask 3,2,1,0).
; Expected output: four per-lane vmov.f32 copies into q1, then a q-register move back to q0.
7define arm_aapcs_vfpcc <4 x i32> @shuffle1_i32(<4 x i32> %src) {
8; CHECK-LABEL: shuffle1_i32:
9; CHECK:       @ %bb.0: @ %entry
10; CHECK-NEXT:    vmov.f32 s4, s3
11; CHECK-NEXT:    vmov.f32 s5, s2
12; CHECK-NEXT:    vmov.f32 s6, s1
13; CHECK-NEXT:    vmov.f32 s7, s0
14; CHECK-NEXT:    vmov q0, q1
15; CHECK-NEXT:    bx lr
16entry:
17  %out = shufflevector <4 x i32> %src, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
18  ret <4 x i32> %out
19}
20
; Identity shuffle of a <4 x i32> (mask 0,1,2,3).
; Expected output: nothing but the return.
21define arm_aapcs_vfpcc <4 x i32> @shuffle2_i32(<4 x i32> %src) {
22; CHECK-LABEL: shuffle2_i32:
23; CHECK:       @ %bb.0: @ %entry
24; CHECK-NEXT:    bx lr
25entry:
26  %out = shufflevector <4 x i32> %src, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
27  ret <4 x i32> %out
28}
29
; Swaps the first and last lanes of a <4 x i32>, keeping the middle two (mask 3,1,2,0).
; Expected output: four per-lane vmov.f32 copies into q1, then a q-register move back to q0.
30define arm_aapcs_vfpcc <4 x i32> @shuffle3_i32(<4 x i32> %src) {
31; CHECK-LABEL: shuffle3_i32:
32; CHECK:       @ %bb.0: @ %entry
33; CHECK-NEXT:    vmov.f32 s4, s3
34; CHECK-NEXT:    vmov.f32 s5, s1
35; CHECK-NEXT:    vmov.f32 s6, s2
36; CHECK-NEXT:    vmov.f32 s7, s0
37; CHECK-NEXT:    vmov q0, q1
38; CHECK-NEXT:    bx lr
39entry:
40  %out = shufflevector <4 x i32> %src, <4 x i32> undef, <4 x i32> <i32 3, i32 1, i32 2, i32 0>
41  ret <4 x i32> %out
42}
43
; Swaps the two i32 lanes inside each 64-bit half (mask 1,0,3,2).
; Expected output: a single vrev64.32 into q1, then a q-register move back to q0.
44define arm_aapcs_vfpcc <4 x i32> @shuffle5_i32(<4 x i32> %src) {
45; CHECK-LABEL: shuffle5_i32:
46; CHECK:       @ %bb.0: @ %entry
47; CHECK-NEXT:    vrev64.32 q1, q0
48; CHECK-NEXT:    vmov q0, q1
49; CHECK-NEXT:    bx lr
50entry:
51  %out = shufflevector <4 x i32> %src, <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
52  ret <4 x i32> %out
53}
54
; Mostly-undef mask (undef,undef,undef,3): the only defined lane already sits in place.
; Expected output: nothing but the return.
55define arm_aapcs_vfpcc <4 x i32> @shuffle6_i32(<4 x i32> %src) {
56; CHECK-LABEL: shuffle6_i32:
57; CHECK:       @ %bb.0: @ %entry
58; CHECK-NEXT:    bx lr
59entry:
60  %out = shufflevector <4 x i32> %src, <4 x i32> undef, <4 x i32> <i32 undef, i32 undef, i32 undef, i32 3>
61  ret <4 x i32> %out
62}
63
; Identity on %src1 except lane 2, which takes %src1 lane 1 (mask 0,1,1,3); %src2 is unused.
; Expected output: a single in-place lane copy.
64define arm_aapcs_vfpcc <4 x i32> @oneoff11_i32(<4 x i32> %src1, <4 x i32> %src2) {
65; CHECK-LABEL: oneoff11_i32:
66; CHECK:       @ %bb.0: @ %entry
67; CHECK-NEXT:    vmov.f32 s2, s1
68; CHECK-NEXT:    bx lr
69entry:
70  %out = shufflevector <4 x i32> %src1, <4 x i32> %src2, <4 x i32> <i32 0, i32 1, i32 1, i32 3>
71  ret <4 x i32> %out
72}
73
; Identity on %src1 except lane 0, which takes %src2 lane 0 (mask 4,1,2,3).
; Expected output: a single lane copy from q1 into q0.
74define arm_aapcs_vfpcc <4 x i32> @oneoff12_i32(<4 x i32> %src1, <4 x i32> %src2) {
75; CHECK-LABEL: oneoff12_i32:
76; CHECK:       @ %bb.0: @ %entry
77; CHECK-NEXT:    vmov.f32 s0, s4
78; CHECK-NEXT:    bx lr
79entry:
80  %out = shufflevector <4 x i32> %src1, <4 x i32> %src2, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
81  ret <4 x i32> %out
82}
83
; Lanes 0-2 come from %src2 and lane 3 takes %src1 lane 0 (mask 4,5,6,0).
; Expected output: one lane copy into q1, then q1 is moved to the return register q0.
84define arm_aapcs_vfpcc <4 x i32> @oneoff21_i32(<4 x i32> %src1, <4 x i32> %src2) {
85; CHECK-LABEL: oneoff21_i32:
86; CHECK:       @ %bb.0: @ %entry
87; CHECK-NEXT:    vmov.f32 s7, s0
88; CHECK-NEXT:    vmov q0, q1
89; CHECK-NEXT:    bx lr
90entry:
91  %out = shufflevector <4 x i32> %src1, <4 x i32> %src2, <4 x i32> <i32 4, i32 5, i32 6, i32 0>
92  ret <4 x i32> %out
93}
94
; Identity on %src2 except lane 2, which takes %src2 lane 0 (mask 4,5,4,7); %src1 is unused.
; Expected output: q1 is moved to q0 first, then one in-place lane copy.
95define arm_aapcs_vfpcc <4 x i32> @oneoff22_i32(<4 x i32> %src1, <4 x i32> %src2) {
96; CHECK-LABEL: oneoff22_i32:
97; CHECK:       @ %bb.0: @ %entry
98; CHECK-NEXT:    vmov q0, q1
99; CHECK-NEXT:    vmov.f32 s2, s0
100; CHECK-NEXT:    bx lr
101entry:
102  %out = shufflevector <4 x i32> %src1, <4 x i32> %src2, <4 x i32> <i32 4, i32 5, i32 4, i32 7>
103  ret <4 x i32> %out
104}
105
; %src1 with lane 1 taken from %src2 lane 0 and lane 2 left undefined (mask 0,4,undef,3).
; Expected output: a single lane copy from q1 into q0.
106define arm_aapcs_vfpcc <4 x i32> @oneoffundef_i32(<4 x i32> %src1, <4 x i32> %src2) {
107; CHECK-LABEL: oneoffundef_i32:
108; CHECK:       @ %bb.0: @ %entry
109; CHECK-NEXT:    vmov.f32 s1, s4
110; CHECK-NEXT:    bx lr
111entry:
112  %out = shufflevector <4 x i32> %src1, <4 x i32> %src2, <4 x i32> <i32 0, i32 4, i32 undef, i32 3>
113  ret <4 x i32> %out
114}
115
; Stride-2 de-interleave of an <8 x i32>: %s1 takes the even elements, %s2 the odd ones,
; and the two <4 x i32> results are added.
; Expected output: per-lane vmov.f32 copies that assemble both operands, then one vadd.i32.
116define arm_aapcs_vfpcc <4 x i32> @shuffle2step_i32(<8 x i32> %src) {
117; CHECK-LABEL: shuffle2step_i32:
118; CHECK:       @ %bb.0: @ %entry
119; CHECK-NEXT:    vmov.f32 s8, s1
120; CHECK-NEXT:    vmov.f32 s9, s3
121; CHECK-NEXT:    vmov.f32 s1, s2
122; CHECK-NEXT:    vmov.f32 s10, s5
123; CHECK-NEXT:    vmov.f32 s11, s7
124; CHECK-NEXT:    vmov.f32 s2, s4
125; CHECK-NEXT:    vmov.f32 s3, s6
126; CHECK-NEXT:    vadd.i32 q0, q0, q2
127; CHECK-NEXT:    bx lr
128entry:
129  %s1 = shufflevector <8 x i32> %src, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
130  %s2 = shufflevector <8 x i32> %src, <8 x i32> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
131  %r = add <4 x i32> %s1, %s2
132  ret <4 x i32> %r
133}
134
; Stride-3 de-interleave: three shuffles pick elements 0,3,6,9 / 1,4,7,10 / 2,5,8,11 and the
; three <4 x i32> results are summed. %src is <16 x i32>, but only elements 0-11 are referenced.
; Expected output: per-lane vmov.f32 copies and two vadd.i32, with d8/d9 saved and restored.
135define arm_aapcs_vfpcc <4 x i32> @shuffle3step_i32(<16 x i32> %src) {
136; CHECK-LABEL: shuffle3step_i32:
137; CHECK:       @ %bb.0: @ %entry
138; CHECK-NEXT:    .vsave {d8, d9}
139; CHECK-NEXT:    vpush {d8, d9}
140; CHECK-NEXT:    vmov.f32 s13, s4
141; CHECK-NEXT:    vmov.f32 s14, s7
142; CHECK-NEXT:    vmov.f32 s18, s6
143; CHECK-NEXT:    vmov.f32 s12, s1
144; CHECK-NEXT:    vmov.f32 s15, s10
145; CHECK-NEXT:    vmov.f32 s16, s0
146; CHECK-NEXT:    vmov.f32 s17, s3
147; CHECK-NEXT:    vmov.f32 s19, s9
148; CHECK-NEXT:    vadd.i32 q3, q4, q3
149; CHECK-NEXT:    vmov.f32 s4, s2
150; CHECK-NEXT:    vmov.f32 s6, s8
151; CHECK-NEXT:    vmov.f32 s7, s11
152; CHECK-NEXT:    vadd.i32 q0, q3, q1
153; CHECK-NEXT:    vpop {d8, d9}
154; CHECK-NEXT:    bx lr
155entry:
156  %s1 = shufflevector <16 x i32> %src, <16 x i32> undef, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
157  %s2 = shufflevector <16 x i32> %src, <16 x i32> undef, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
158  %s3 = shufflevector <16 x i32> %src, <16 x i32> undef, <4 x i32> <i32 2, i32 5, i32 8, i32 11>
159  %a = add <4 x i32> %s1, %s2
160  %r = add <4 x i32> %a, %s3
161  ret <4 x i32> %r
162}
163
; Stride-4 de-interleave of a <16 x i32>: four shuffles pick elements k, k+4, k+8, k+12 for
; k = 0..3; the results are added pairwise (%s1+%s2, %s3+%s4) and the two sums are added.
; Expected output: per-lane vmov.f32 copies and three vadd.i32, with d8-d11 saved and restored.
164define arm_aapcs_vfpcc <4 x i32> @shuffle4step_i32(<16 x i32> %src) {
165; CHECK-LABEL: shuffle4step_i32:
166; CHECK:       @ %bb.0: @ %entry
167; CHECK-NEXT:    .vsave {d8, d9, d10, d11}
168; CHECK-NEXT:    vpush {d8, d9, d10, d11}
169; CHECK-NEXT:    vmov.f32 s16, s3
170; CHECK-NEXT:    vmov.f32 s20, s2
171; CHECK-NEXT:    vmov.f32 s17, s7
172; CHECK-NEXT:    vmov.f32 s18, s11
173; CHECK-NEXT:    vmov.f32 s19, s15
174; CHECK-NEXT:    vmov.f32 s21, s6
175; CHECK-NEXT:    vmov.f32 s22, s10
176; CHECK-NEXT:    vmov.f32 s23, s14
177; CHECK-NEXT:    vadd.i32 q4, q5, q4
178; CHECK-NEXT:    vmov.f32 s20, s1
179; CHECK-NEXT:    vmov.f32 s21, s5
180; CHECK-NEXT:    vmov.f32 s22, s9
181; CHECK-NEXT:    vmov.f32 s23, s13
182; CHECK-NEXT:    vmov.f32 s1, s4
183; CHECK-NEXT:    vmov.f32 s2, s8
184; CHECK-NEXT:    vmov.f32 s3, s12
185; CHECK-NEXT:    vadd.i32 q0, q0, q5
186; CHECK-NEXT:    vadd.i32 q0, q0, q4
187; CHECK-NEXT:    vpop {d8, d9, d10, d11}
188; CHECK-NEXT:    bx lr
189entry:
190  %s1 = shufflevector <16 x i32> %src, <16 x i32> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
191  %s2 = shufflevector <16 x i32> %src, <16 x i32> undef, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
192  %s3 = shufflevector <16 x i32> %src, <16 x i32> undef, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
193  %s4 = shufflevector <16 x i32> %src, <16 x i32> undef, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
194  %a1 = add <4 x i32> %s1, %s2
195  %a2 = add <4 x i32> %s3, %s4
196  %r = add <4 x i32> %a1, %a2
197  ret <4 x i32> %r
198}
199
200; i16
201
; Full reversal of a single <8 x i16> source (mask 7..0).
; Expected output: vrev64.16 into q1, then four vmov.f32 copies that swap the two 64-bit halves.
202define arm_aapcs_vfpcc <8 x i16> @shuffle1_i16(<8 x i16> %src) {
203; CHECK-LABEL: shuffle1_i16:
204; CHECK:       @ %bb.0: @ %entry
205; CHECK-NEXT:    vrev64.16 q1, q0
206; CHECK-NEXT:    vmov.f32 s0, s6
207; CHECK-NEXT:    vmov.f32 s1, s7
208; CHECK-NEXT:    vmov.f32 s2, s4
209; CHECK-NEXT:    vmov.f32 s3, s5
210; CHECK-NEXT:    bx lr
211entry:
212  %out = shufflevector <8 x i16> %src, <8 x i16> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
213  ret <8 x i16> %out
214}
215
; Identity shuffle of an <8 x i16> (mask 0..7).
; Expected output: nothing but the return.
216define arm_aapcs_vfpcc <8 x i16> @shuffle2_i16(<8 x i16> %src) {
217; CHECK-LABEL: shuffle2_i16:
218; CHECK:       @ %bb.0: @ %entry
219; CHECK-NEXT:    bx lr
220entry:
221  %out = shufflevector <8 x i16> %src, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
222  ret <8 x i16> %out
223}
224
; Irregular permutation of an <8 x i16> (mask 4,5,7,6,3,1,2,0).
; Expected output: a mix of vmovx.f16 / vins.f16 half-lane moves and vmov.f32 lane copies.
225define arm_aapcs_vfpcc <8 x i16> @shuffle3_i16(<8 x i16> %src) {
226; CHECK-LABEL: shuffle3_i16:
227; CHECK:       @ %bb.0: @ %entry
228; CHECK-NEXT:    vmov q1, q0
229; CHECK-NEXT:    vmovx.f16 s2, s5
230; CHECK-NEXT:    vmovx.f16 s0, s4
231; CHECK-NEXT:    vins.f16 s5, s4
232; CHECK-NEXT:    vins.f16 s2, s0
233; CHECK-NEXT:    vmov.f32 s3, s5
234; CHECK-NEXT:    vmovx.f16 s1, s7
235; CHECK-NEXT:    vmov.f32 s0, s6
236; CHECK-NEXT:    vins.f16 s1, s7
237; CHECK-NEXT:    bx lr
238entry:
239  %out = shufflevector <8 x i16> %src, <8 x i16> undef, <8 x i32> <i32 4, i32 5, i32 7, i32 6, i32 3, i32 1, i32 2, i32 0>
240  ret <8 x i16> %out
241}
242
; Reverses the four i16 lanes inside each 64-bit half (mask 3,2,1,0,7,6,5,4).
; Expected output: a single vrev64.16 into q1, then a q-register move back to q0.
243define arm_aapcs_vfpcc <8 x i16> @shuffle5_i16(<8 x i16> %src) {
244; CHECK-LABEL: shuffle5_i16:
245; CHECK:       @ %bb.0: @ %entry
246; CHECK-NEXT:    vrev64.16 q1, q0
247; CHECK-NEXT:    vmov q0, q1
248; CHECK-NEXT:    bx lr
249entry:
250  %out = shufflevector <8 x i16> %src, <8 x i16> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
251  ret <8 x i16> %out
252}
253
; Swaps each adjacent pair of i16 lanes (mask 1,0,3,2,5,4,7,6).
; Expected output: a single in-place vrev32.16.
254define arm_aapcs_vfpcc <8 x i16> @shuffle6_i16(<8 x i16> %src) {
255; CHECK-LABEL: shuffle6_i16:
256; CHECK:       @ %bb.0: @ %entry
257; CHECK-NEXT:    vrev32.16 q0, q0
258; CHECK-NEXT:    bx lr
259entry:
260  %out = shufflevector <8 x i16> %src, <8 x i16> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
261  ret <8 x i16> %out
262}
263
; Identity on %src1 except lane 2, which takes %src1 lane 1; %src2 is unused.
; Expected output: one lane extract to r0 and one lane insert back into q0.
264define arm_aapcs_vfpcc <8 x i16> @oneoff11_i16(<8 x i16> %src1, <8 x i16> %src2) {
265; CHECK-LABEL: oneoff11_i16:
266; CHECK:       @ %bb.0: @ %entry
267; CHECK-NEXT:    vmov.u16 r0, q0[1]
268; CHECK-NEXT:    vmov.16 q0[2], r0
269; CHECK-NEXT:    bx lr
270entry:
271  %out = shufflevector <8 x i16> %src1, <8 x i16> %src2, <8 x i32> <i32 0, i32 1, i32 1, i32 3, i32 4, i32 5, i32 6, i32 7>
272  ret <8 x i16> %out
273}
274
; Identity on %src1 except lane 0, which takes %src2 lane 0 (mask index 8).
; Expected output: one lane extract from q1 to r0 and one lane insert into q0.
275define arm_aapcs_vfpcc <8 x i16> @oneoff12_i16(<8 x i16> %src1, <8 x i16> %src2) {
276; CHECK-LABEL: oneoff12_i16:
277; CHECK:       @ %bb.0: @ %entry
278; CHECK-NEXT:    vmov.u16 r0, q1[0]
279; CHECK-NEXT:    vmov.16 q0[0], r0
280; CHECK-NEXT:    bx lr
281entry:
282  %out = shufflevector <8 x i16> %src1, <8 x i16> %src2, <8 x i32> <i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
283  ret <8 x i16> %out
284}
285
; Identity on %src2 except lane 3, which takes %src1 lane 0.
; Expected output: a single vins.f16 into q1, then q1 is moved to the return register q0.
286define arm_aapcs_vfpcc <8 x i16> @oneoff21_i16(<8 x i16> %src1, <8 x i16> %src2) {
287; CHECK-LABEL: oneoff21_i16:
288; CHECK:       @ %bb.0: @ %entry
289; CHECK-NEXT:    vins.f16 s5, s0
290; CHECK-NEXT:    vmov q0, q1
291; CHECK-NEXT:    bx lr
292entry:
293  %out = shufflevector <8 x i16> %src1, <8 x i16> %src2, <8 x i32> <i32 8, i32 9, i32 10, i32 0, i32 12, i32 13, i32 14, i32 15>
294  ret <8 x i16> %out
295}
296
; Identity on %src2 except lane 0, which takes %src2 lane 6 (mask index 14); %src1 is unused.
; Expected output: q1 is moved to q0, then one lane extract from q1 and one lane insert into q0.
297define arm_aapcs_vfpcc <8 x i16> @oneoff22_i16(<8 x i16> %src1, <8 x i16> %src2) {
298; CHECK-LABEL: oneoff22_i16:
299; CHECK:       @ %bb.0: @ %entry
300; CHECK-NEXT:    vmov q0, q1
301; CHECK-NEXT:    vmov.u16 r0, q1[6]
302; CHECK-NEXT:    vmov.16 q0[0], r0
303; CHECK-NEXT:    bx lr
304entry:
305  %out = shufflevector <8 x i16> %src1, <8 x i16> %src2, <8 x i32> <i32 14, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
306  ret <8 x i16> %out
307}
308
; %src2 with lane 5 taken from %src1 lane 3; lanes 2 and 3 are left undefined.
; Expected output: one lane extract from q0, one lane insert into q1, then q1 is moved to q0.
309define arm_aapcs_vfpcc <8 x i16> @oneoffundef_i16(<8 x i16> %src1, <8 x i16> %src2) {
310; CHECK-LABEL: oneoffundef_i16:
311; CHECK:       @ %bb.0: @ %entry
312; CHECK-NEXT:    vmov.u16 r0, q0[3]
313; CHECK-NEXT:    vmov.16 q1[5], r0
314; CHECK-NEXT:    vmov q0, q1
315; CHECK-NEXT:    bx lr
316entry:
317  %out = shufflevector <8 x i16> %src1, <8 x i16> %src2, <8 x i32> <i32 8, i32 9, i32 undef, i32 undef, i32 12, i32 3, i32 14, i32 15>
318  ret <8 x i16> %out
319}
320
; Stride-2 de-interleave of a <16 x i16>: %s1 takes the even elements, %s2 the odd ones,
; and the two <8 x i16> results are added.
; Expected output: goes through a 32-byte stack area, using vstrh.32 stores (with vshr.u32 #16
; applied first for one operand) and vldrw.u32 reloads before the vadd.i16.
321define arm_aapcs_vfpcc <8 x i16> @shuffle2step_i16(<16 x i16> %src) {
322; CHECK-LABEL: shuffle2step_i16:
323; CHECK:       @ %bb.0: @ %entry
324; CHECK-NEXT:    .pad #32
325; CHECK-NEXT:    sub sp, #32
326; CHECK-NEXT:    mov r0, sp
327; CHECK-NEXT:    vshr.u32 q2, q1, #16
328; CHECK-NEXT:    vstrh.32 q2, [r0, #8]
329; CHECK-NEXT:    vshr.u32 q2, q0, #16
330; CHECK-NEXT:    add r1, sp, #16
331; CHECK-NEXT:    vstrh.32 q2, [r0]
332; CHECK-NEXT:    vstrh.32 q1, [r1, #8]
333; CHECK-NEXT:    vstrh.32 q0, [r1]
334; CHECK-NEXT:    vldrw.u32 q0, [r0]
335; CHECK-NEXT:    vldrw.u32 q1, [r1]
336; CHECK-NEXT:    vadd.i16 q0, q1, q0
337; CHECK-NEXT:    add sp, #32
338; CHECK-NEXT:    bx lr
339entry:
340  %s1 = shufflevector <16 x i16> %src, <16 x i16> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
341  %s2 = shufflevector <16 x i16> %src, <16 x i16> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
342  %r = add <8 x i16> %s1, %s2
343  ret <8 x i16> %r
344}
345
; Stride-3 de-interleave: three shuffles pick elements k, k+3, ..., k+21 for k = 0..2 and the
; three <8 x i16> results are summed. %src is <32 x i16>, but only elements 0-23 are referenced.
; Expected output: vmovx.f16 / vins.f16 half-lane moves plus vmov.f32 copies, then two vadd.i16.
346define arm_aapcs_vfpcc <8 x i16> @shuffle3step_i16(<32 x i16> %src) {
347; CHECK-LABEL: shuffle3step_i16:
348; CHECK:       @ %bb.0: @ %entry
349; CHECK-NEXT:    .vsave {d8, d9}
350; CHECK-NEXT:    vpush {d8, d9}
351; CHECK-NEXT:    vmovx.f16 s12, s0
352; CHECK-NEXT:    vmov.f32 s16, s1
353; CHECK-NEXT:    vins.f16 s12, s2
354; CHECK-NEXT:    vmovx.f16 s2, s2
355; CHECK-NEXT:    vins.f16 s16, s2
356; CHECK-NEXT:    vmovx.f16 s2, s5
357; CHECK-NEXT:    vmov.f32 s17, s4
358; CHECK-NEXT:    vmovx.f16 s13, s3
359; CHECK-NEXT:    vins.f16 s17, s2
360; CHECK-NEXT:    vmov.f32 s18, s7
361; CHECK-NEXT:    vmovx.f16 s2, s8
362; CHECK-NEXT:    vmov.f32 s19, s10
363; CHECK-NEXT:    vins.f16 s18, s2
364; CHECK-NEXT:    vmovx.f16 s2, s11
365; CHECK-NEXT:    vins.f16 s19, s2
366; CHECK-NEXT:    vmovx.f16 s2, s1
367; CHECK-NEXT:    vins.f16 s0, s2
368; CHECK-NEXT:    vmovx.f16 s2, s4
369; CHECK-NEXT:    vins.f16 s3, s2
370; CHECK-NEXT:    vmovx.f16 s2, s7
371; CHECK-NEXT:    vmovx.f16 s4, s10
372; CHECK-NEXT:    vmovx.f16 s14, s6
373; CHECK-NEXT:    vmovx.f16 s15, s9
374; CHECK-NEXT:    vins.f16 s6, s2
375; CHECK-NEXT:    vins.f16 s9, s4
376; CHECK-NEXT:    vmov.f32 s1, s3
377; CHECK-NEXT:    vins.f16 s14, s8
378; CHECK-NEXT:    vins.f16 s15, s11
379; CHECK-NEXT:    vins.f16 s13, s5
380; CHECK-NEXT:    vmov.f32 s2, s6
381; CHECK-NEXT:    vmov.f32 s3, s9
382; CHECK-NEXT:    vadd.i16 q0, q0, q3
383; CHECK-NEXT:    vadd.i16 q0, q0, q4
384; CHECK-NEXT:    vpop {d8, d9}
385; CHECK-NEXT:    bx lr
386entry:
387  %s1 = shufflevector <32 x i16> %src, <32 x i16> undef, <8 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21>
388  %s2 = shufflevector <32 x i16> %src, <32 x i16> undef, <8 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22>
389  %s3 = shufflevector <32 x i16> %src, <32 x i16> undef, <8 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23>
390  %a = add <8 x i16> %s1, %s2
391  %r = add <8 x i16> %a, %s3
392  ret <8 x i16> %r
393}
394
; Stride-4 de-interleave of a <32 x i16>: four shuffles pick elements k, k+4, ..., k+28 for
; k = 0..3; the results are added pairwise (%s1+%s2, %s3+%s4) and the two sums are added.
; Expected output: vmovx.f16 / vins.f16 half-lane moves plus vmov.f32 copies, then three vadd.i16.
395define arm_aapcs_vfpcc <8 x i16> @shuffle4step_i16(<32 x i16> %src) {
396; CHECK-LABEL: shuffle4step_i16:
397; CHECK:       @ %bb.0: @ %entry
398; CHECK-NEXT:    .vsave {d8, d9, d10, d11}
399; CHECK-NEXT:    vpush {d8, d9, d10, d11}
400; CHECK-NEXT:    vmovx.f16 s18, s9
401; CHECK-NEXT:    vmovx.f16 s16, s11
402; CHECK-NEXT:    vins.f16 s18, s16
403; CHECK-NEXT:    vmovx.f16 s19, s13
404; CHECK-NEXT:    vmovx.f16 s16, s15
405; CHECK-NEXT:    vmovx.f16 s20, s3
406; CHECK-NEXT:    vins.f16 s19, s16
407; CHECK-NEXT:    vmovx.f16 s16, s1
408; CHECK-NEXT:    vins.f16 s16, s20
409; CHECK-NEXT:    vmovx.f16 s17, s5
410; CHECK-NEXT:    vmovx.f16 s20, s7
411; CHECK-NEXT:    vins.f16 s9, s11
412; CHECK-NEXT:    vins.f16 s13, s15
413; CHECK-NEXT:    vins.f16 s5, s7
414; CHECK-NEXT:    vins.f16 s1, s3
415; CHECK-NEXT:    vins.f16 s17, s20
416; CHECK-NEXT:    vmov.f32 s20, s1
417; CHECK-NEXT:    vmovx.f16 s1, s10
418; CHECK-NEXT:    vmov.f32 s22, s9
419; CHECK-NEXT:    vmov.f32 s23, s13
420; CHECK-NEXT:    vmov.f32 s21, s5
421; CHECK-NEXT:    vadd.i16 q4, q5, q4
422; CHECK-NEXT:    vmovx.f16 s22, s8
423; CHECK-NEXT:    vins.f16 s22, s1
424; CHECK-NEXT:    vmovx.f16 s23, s12
425; CHECK-NEXT:    vmovx.f16 s1, s14
426; CHECK-NEXT:    vmovx.f16 s20, s0
427; CHECK-NEXT:    vins.f16 s23, s1
428; CHECK-NEXT:    vmovx.f16 s1, s2
429; CHECK-NEXT:    vins.f16 s20, s1
430; CHECK-NEXT:    vmovx.f16 s21, s4
431; CHECK-NEXT:    vmovx.f16 s1, s6
432; CHECK-NEXT:    vins.f16 s12, s14
433; CHECK-NEXT:    vins.f16 s8, s10
434; CHECK-NEXT:    vins.f16 s4, s6
435; CHECK-NEXT:    vins.f16 s21, s1
436; CHECK-NEXT:    vins.f16 s0, s2
437; CHECK-NEXT:    vmov.f32 s3, s12
438; CHECK-NEXT:    vmov.f32 s1, s4
439; CHECK-NEXT:    vmov.f32 s2, s8
440; CHECK-NEXT:    vadd.i16 q0, q0, q5
441; CHECK-NEXT:    vadd.i16 q0, q0, q4
442; CHECK-NEXT:    vpop {d8, d9, d10, d11}
443; CHECK-NEXT:    bx lr
444entry:
445  %s1 = shufflevector <32 x i16> %src, <32 x i16> undef, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28>
446  %s2 = shufflevector <32 x i16> %src, <32 x i16> undef, <8 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29>
447  %s3 = shufflevector <32 x i16> %src, <32 x i16> undef, <8 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30>
448  %s4 = shufflevector <32 x i16> %src, <32 x i16> undef, <8 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31>
449  %a1 = add <8 x i16> %s1, %s2
450  %a2 = add <8 x i16> %s3, %s4
451  %r = add <8 x i16> %a1, %a2
452  ret <8 x i16> %r
453}
454
455; i8
456
; Full reversal of a single <16 x i8> source (mask 15..0).
; Expected output: vrev64.8 into q1, then four vmov.f32 copies that swap the two 64-bit halves.
457define arm_aapcs_vfpcc <16 x i8> @shuffle1_i8(<16 x i8> %src) {
458; CHECK-LABEL: shuffle1_i8:
459; CHECK:       @ %bb.0: @ %entry
460; CHECK-NEXT:    vrev64.8 q1, q0
461; CHECK-NEXT:    vmov.f32 s0, s6
462; CHECK-NEXT:    vmov.f32 s1, s7
463; CHECK-NEXT:    vmov.f32 s2, s4
464; CHECK-NEXT:    vmov.f32 s3, s5
465; CHECK-NEXT:    bx lr
466entry:
467  %out = shufflevector <16 x i8> %src, <16 x i8> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
468  ret <16 x i8> %out
469}
470
; Identity shuffle of a <16 x i8> (mask 0..15).
; Expected output: nothing but the return.
471define arm_aapcs_vfpcc <16 x i8> @shuffle2_i8(<16 x i8> %src) {
472; CHECK-LABEL: shuffle2_i8:
473; CHECK:       @ %bb.0: @ %entry
474; CHECK-NEXT:    bx lr
475entry:
476  %out = shufflevector <16 x i8> %src, <16 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
477  ret <16 x i8> %out
478}
479
; Irregular permutation of a <16 x i8> with no repeating structure in the mask.
; Expected output: the source is copied to q1, then each of the 16 result lanes is built with
; a vmov.u8 extract to r0 followed by a vmov.8 insert into q0.
480define arm_aapcs_vfpcc <16 x i8> @shuffle3_i8(<16 x i8> %src) {
481; CHECK-LABEL: shuffle3_i8:
482; CHECK:       @ %bb.0: @ %entry
483; CHECK-NEXT:    vmov q1, q0
484; CHECK-NEXT:    vmov.u8 r0, q0[4]
485; CHECK-NEXT:    vmov.8 q0[0], r0
486; CHECK-NEXT:    vmov.u8 r0, q1[5]
487; CHECK-NEXT:    vmov.8 q0[1], r0
488; CHECK-NEXT:    vmov.u8 r0, q1[15]
489; CHECK-NEXT:    vmov.8 q0[2], r0
490; CHECK-NEXT:    vmov.u8 r0, q1[7]
491; CHECK-NEXT:    vmov.8 q0[3], r0
492; CHECK-NEXT:    vmov.u8 r0, q1[14]
493; CHECK-NEXT:    vmov.8 q0[4], r0
494; CHECK-NEXT:    vmov.u8 r0, q1[9]
495; CHECK-NEXT:    vmov.8 q0[5], r0
496; CHECK-NEXT:    vmov.u8 r0, q1[6]
497; CHECK-NEXT:    vmov.8 q0[6], r0
498; CHECK-NEXT:    vmov.u8 r0, q1[3]
499; CHECK-NEXT:    vmov.8 q0[7], r0
500; CHECK-NEXT:    vmov.u8 r0, q1[10]
501; CHECK-NEXT:    vmov.8 q0[8], r0
502; CHECK-NEXT:    vmov.u8 r0, q1[12]
503; CHECK-NEXT:    vmov.8 q0[9], r0
504; CHECK-NEXT:    vmov.u8 r0, q1[1]
505; CHECK-NEXT:    vmov.8 q0[10], r0
506; CHECK-NEXT:    vmov.u8 r0, q1[13]
507; CHECK-NEXT:    vmov.8 q0[11], r0
508; CHECK-NEXT:    vmov.u8 r0, q1[2]
509; CHECK-NEXT:    vmov.8 q0[12], r0
510; CHECK-NEXT:    vmov.u8 r0, q1[8]
511; CHECK-NEXT:    vmov.8 q0[13], r0
512; CHECK-NEXT:    vmov.u8 r0, q1[0]
513; CHECK-NEXT:    vmov.8 q0[14], r0
514; CHECK-NEXT:    vmov.u8 r0, q1[11]
515; CHECK-NEXT:    vmov.8 q0[15], r0
516; CHECK-NEXT:    bx lr
517entry:
518  %out = shufflevector <16 x i8> %src, <16 x i8> undef, <16 x i32> <i32 4, i32 5, i32 15, i32 7, i32 14, i32 9, i32 6, i32 3, i32 10, i32 12, i32 1, i32 13, i32 2, i32 8, i32 0, i32 11>
519  ret <16 x i8> %out
520}
521
; Reverses the eight bytes inside each 64-bit half (mask 7..0, 15..8).
; Expected output: a single vrev64.8 into q1, then a q-register move back to q0.
522define arm_aapcs_vfpcc <16 x i8> @shuffle5_i8(<16 x i8> %src) {
523; CHECK-LABEL: shuffle5_i8:
524; CHECK:       @ %bb.0: @ %entry
525; CHECK-NEXT:    vrev64.8 q1, q0
526; CHECK-NEXT:    vmov q0, q1
527; CHECK-NEXT:    bx lr
528entry:
529  %out = shufflevector <16 x i8> %src, <16 x i8> undef, <16 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8>
530  ret <16 x i8> %out
531}
532
; Reverses the four bytes inside each 32-bit group (mask 3,2,1,0, 7,6,5,4, ...).
; Expected output: a single in-place vrev32.8.
533define arm_aapcs_vfpcc <16 x i8> @shuffle6_i8(<16 x i8> %src) {
534; CHECK-LABEL: shuffle6_i8:
535; CHECK:       @ %bb.0: @ %entry
536; CHECK-NEXT:    vrev32.8 q0, q0
537; CHECK-NEXT:    bx lr
538entry:
539  %out = shufflevector <16 x i8> %src, <16 x i8> undef, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
540  ret <16 x i8> %out
541}
542
; Swaps each adjacent pair of bytes (mask 1,0, 3,2, 5,4, ...).
; Expected output: a single in-place vrev16.8.
543define arm_aapcs_vfpcc <16 x i8> @shuffle7_i8(<16 x i8> %src) {
544; CHECK-LABEL: shuffle7_i8:
545; CHECK:       @ %bb.0: @ %entry
546; CHECK-NEXT:    vrev16.8 q0, q0
547; CHECK-NEXT:    bx lr
548entry:
549  %out = shufflevector <16 x i8> %src, <16 x i8> undef, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
550  ret <16 x i8> %out
551}
552
; Identity on %src1 except lane 2, which takes %src1 lane 1; %src2 is unused.
; Expected output: one lane extract to r0 and one lane insert back into q0.
553define arm_aapcs_vfpcc <16 x i8> @oneoff11_i8(<16 x i8> %src1, <16 x i8> %src2) {
554; CHECK-LABEL: oneoff11_i8:
555; CHECK:       @ %bb.0: @ %entry
556; CHECK-NEXT:    vmov.u8 r0, q0[1]
557; CHECK-NEXT:    vmov.8 q0[2], r0
558; CHECK-NEXT:    bx lr
559entry:
560  %out = shufflevector <16 x i8> %src1, <16 x i8> %src2, <16 x i32> <i32 0, i32 1, i32 1, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
561  ret <16 x i8> %out
562}
563
; Identity on %src1 except lane 0, which takes %src2 lane 4 (mask index 20).
; Expected output: one lane extract from q1 to r0 and one lane insert into q0.
564define arm_aapcs_vfpcc <16 x i8> @oneoff12_i8(<16 x i8> %src1, <16 x i8> %src2) {
565; CHECK-LABEL: oneoff12_i8:
566; CHECK:       @ %bb.0: @ %entry
567; CHECK-NEXT:    vmov.u8 r0, q1[4]
568; CHECK-NEXT:    vmov.8 q0[0], r0
569; CHECK-NEXT:    bx lr
570entry:
571  %out = shufflevector <16 x i8> %src1, <16 x i8> %src2, <16 x i32> <i32 20, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
572  ret <16 x i8> %out
573}
574
; Identity on %src2 except lane 3, which takes %src1 lane 0.
; Expected output: one lane extract from q0, one lane insert into q1, then q1 is moved to q0.
575define arm_aapcs_vfpcc <16 x i8> @oneoff21_i8(<16 x i8> %src1, <16 x i8> %src2) {
576; CHECK-LABEL: oneoff21_i8:
577; CHECK:       @ %bb.0: @ %entry
578; CHECK-NEXT:    vmov.u8 r0, q0[0]
579; CHECK-NEXT:    vmov.8 q1[3], r0
580; CHECK-NEXT:    vmov q0, q1
581; CHECK-NEXT:    bx lr
582entry:
583  %out = shufflevector <16 x i8> %src1, <16 x i8> %src2, <16 x i32> <i32 16, i32 17, i32 18, i32 0, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
584  ret <16 x i8> %out
585}
586
; Identity on %src2 except lane 9, which takes %src2 lane 15 (mask index 31); %src1 is unused.
; Expected output: q1 is moved to q0, then one lane extract from q1 and one lane insert into q0.
587define arm_aapcs_vfpcc <16 x i8> @oneoff22_i8(<16 x i8> %src1, <16 x i8> %src2) {
588; CHECK-LABEL: oneoff22_i8:
589; CHECK:       @ %bb.0: @ %entry
590; CHECK-NEXT:    vmov q0, q1
591; CHECK-NEXT:    vmov.u8 r0, q1[15]
592; CHECK-NEXT:    vmov.8 q0[9], r0
593; CHECK-NEXT:    bx lr
594entry:
595  %out = shufflevector <16 x i8> %src1, <16 x i8> %src2, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 31, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
596  ret <16 x i8> %out
597}
598
; %src1 with lane 1 taken from %src1 lane 2; lanes 0, 4, 8 and 12 are left undefined and every
; other lane stays in place. %src2 is unused.
; Expected output: one lane extract to r0 and one lane insert back into q0.
599define arm_aapcs_vfpcc <16 x i8> @oneoffundef_i8(<16 x i8> %src1, <16 x i8> %src2) {
600; CHECK-LABEL: oneoffundef_i8:
601; CHECK:       @ %bb.0: @ %entry
602; CHECK-NEXT:    vmov.u8 r0, q0[2]
603; CHECK-NEXT:    vmov.8 q0[1], r0
604; CHECK-NEXT:    bx lr
605entry:
606  %out = shufflevector <16 x i8> %src1, <16 x i8> %src2, <16 x i32> <i32 undef, i32 2, i32 2, i32 3, i32 undef, i32 5, i32 6, i32 7, i32 undef, i32 9, i32 10, i32 11, i32 undef, i32 13, i32 14, i32 15>
607  ret <16 x i8> %out
608}
609
; Stride-2 de-interleave of a <32 x i8>: %s1 takes the even elements, %s2 the odd ones,
; and the two <16 x i8> results are added.
; Expected output: goes through a 32-byte stack area, using vstrb.16 stores (with vshr.u16 #8
; applied first for one operand) and vldrw.u32 reloads before the vadd.i8.
610define arm_aapcs_vfpcc <16 x i8> @shuffle2step_i8(<32 x i8> %src) {
611; CHECK-LABEL: shuffle2step_i8:
612; CHECK:       @ %bb.0: @ %entry
613; CHECK-NEXT:    .pad #32
614; CHECK-NEXT:    sub sp, #32
615; CHECK-NEXT:    mov r0, sp
616; CHECK-NEXT:    vshr.u16 q2, q1, #8
617; CHECK-NEXT:    vstrb.16 q2, [r0, #8]
618; CHECK-NEXT:    vshr.u16 q2, q0, #8
619; CHECK-NEXT:    add r1, sp, #16
620; CHECK-NEXT:    vstrb.16 q2, [r0]
621; CHECK-NEXT:    vstrb.16 q1, [r1, #8]
622; CHECK-NEXT:    vstrb.16 q0, [r1]
623; CHECK-NEXT:    vldrw.u32 q0, [r0]
624; CHECK-NEXT:    vldrw.u32 q1, [r1]
625; CHECK-NEXT:    vadd.i8 q0, q1, q0
626; CHECK-NEXT:    add sp, #32
627; CHECK-NEXT:    bx lr
628entry:
629  %s1 = shufflevector <32 x i8> %src, <32 x i8> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
630  %s2 = shufflevector <32 x i8> %src, <32 x i8> undef, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
631  %r = add <16 x i8> %s1, %s2
632  ret <16 x i8> %r
633}
634
; Stride-3 de-interleave: three shuffles pick elements k, k+3, ..., k+45 for k = 0..2 and the
; three <16 x i8> results are summed. %src is <64 x i8>, but only elements 0-47 are referenced.
; Expected output: every result byte is moved with a vmov.u8 extract / vmov.8 insert pair; the
; upper halves are assembled in a separate q register and merged with vmov.f32 copies before
; each vadd.i8.
635define arm_aapcs_vfpcc <16 x i8> @shuffle3step_i8(<64 x i8> %src) {
636; CHECK-LABEL: shuffle3step_i8:
637; CHECK:       @ %bb.0: @ %entry
638; CHECK-NEXT:    .vsave {d8, d9, d10, d11}
639; CHECK-NEXT:    vpush {d8, d9, d10, d11}
640; CHECK-NEXT:    vmov.u8 r0, q0[1]
641; CHECK-NEXT:    vmov.8 q3[0], r0
642; CHECK-NEXT:    vmov.u8 r0, q0[4]
643; CHECK-NEXT:    vmov.8 q3[1], r0
644; CHECK-NEXT:    vmov.u8 r0, q0[7]
645; CHECK-NEXT:    vmov.8 q3[2], r0
646; CHECK-NEXT:    vmov.u8 r0, q0[10]
647; CHECK-NEXT:    vmov.8 q3[3], r0
648; CHECK-NEXT:    vmov.u8 r0, q0[13]
649; CHECK-NEXT:    vmov.8 q3[4], r0
650; CHECK-NEXT:    vmov.u8 r0, q1[0]
651; CHECK-NEXT:    vmov.8 q3[5], r0
652; CHECK-NEXT:    vmov.u8 r0, q1[3]
653; CHECK-NEXT:    vmov.8 q3[6], r0
654; CHECK-NEXT:    vmov.u8 r0, q1[9]
655; CHECK-NEXT:    vmov.8 q4[8], r0
656; CHECK-NEXT:    vmov.u8 r0, q1[12]
657; CHECK-NEXT:    vmov.8 q4[9], r0
658; CHECK-NEXT:    vmov.u8 r0, q1[15]
659; CHECK-NEXT:    vmov.8 q4[10], r0
660; CHECK-NEXT:    vmov.u8 r0, q2[2]
661; CHECK-NEXT:    vmov.8 q4[11], r0
662; CHECK-NEXT:    vmov.u8 r0, q2[5]
663; CHECK-NEXT:    vmov.8 q4[12], r0
664; CHECK-NEXT:    vmov.u8 r0, q2[8]
665; CHECK-NEXT:    vmov.8 q4[13], r0
666; CHECK-NEXT:    vmov.u8 r0, q2[11]
667; CHECK-NEXT:    vmov.8 q4[14], r0
668; CHECK-NEXT:    vmov.u8 r0, q2[14]
669; CHECK-NEXT:    vmov.8 q4[15], r0
670; CHECK-NEXT:    vmov.u8 r0, q1[6]
671; CHECK-NEXT:    vmov.8 q3[7], r0
672; CHECK-NEXT:    vmov.u8 r0, q0[0]
673; CHECK-NEXT:    vmov.f32 s14, s18
674; CHECK-NEXT:    vmov.f32 s15, s19
675; CHECK-NEXT:    vmov.8 q4[0], r0
676; CHECK-NEXT:    vmov.u8 r0, q0[3]
677; CHECK-NEXT:    vmov.8 q4[1], r0
678; CHECK-NEXT:    vmov.u8 r0, q0[6]
679; CHECK-NEXT:    vmov.8 q4[2], r0
680; CHECK-NEXT:    vmov.u8 r0, q0[9]
681; CHECK-NEXT:    vmov.8 q4[3], r0
682; CHECK-NEXT:    vmov.u8 r0, q0[12]
683; CHECK-NEXT:    vmov.8 q4[4], r0
684; CHECK-NEXT:    vmov.u8 r0, q0[15]
685; CHECK-NEXT:    vmov.8 q4[5], r0
686; CHECK-NEXT:    vmov.u8 r0, q1[2]
687; CHECK-NEXT:    vmov.8 q4[6], r0
688; CHECK-NEXT:    vmov.u8 r0, q1[8]
689; CHECK-NEXT:    vmov.8 q5[8], r0
690; CHECK-NEXT:    vmov.u8 r0, q1[11]
691; CHECK-NEXT:    vmov.8 q5[9], r0
692; CHECK-NEXT:    vmov.u8 r0, q1[14]
693; CHECK-NEXT:    vmov.8 q5[10], r0
694; CHECK-NEXT:    vmov.u8 r0, q2[1]
695; CHECK-NEXT:    vmov.8 q5[11], r0
696; CHECK-NEXT:    vmov.u8 r0, q2[4]
697; CHECK-NEXT:    vmov.8 q5[12], r0
698; CHECK-NEXT:    vmov.u8 r0, q2[7]
699; CHECK-NEXT:    vmov.8 q5[13], r0
700; CHECK-NEXT:    vmov.u8 r0, q2[10]
701; CHECK-NEXT:    vmov.8 q5[14], r0
702; CHECK-NEXT:    vmov.u8 r0, q2[13]
703; CHECK-NEXT:    vmov.8 q5[15], r0
704; CHECK-NEXT:    vmov.u8 r0, q1[5]
705; CHECK-NEXT:    vmov.8 q4[7], r0
706; CHECK-NEXT:    vmov.u8 r0, q0[2]
707; CHECK-NEXT:    vmov.f32 s18, s22
708; CHECK-NEXT:    vmov.f32 s19, s23
709; CHECK-NEXT:    vadd.i8 q3, q4, q3
710; CHECK-NEXT:    vmov.8 q4[0], r0
711; CHECK-NEXT:    vmov.u8 r0, q0[5]
712; CHECK-NEXT:    vmov.8 q4[1], r0
713; CHECK-NEXT:    vmov.u8 r0, q0[8]
714; CHECK-NEXT:    vmov.8 q4[2], r0
715; CHECK-NEXT:    vmov.u8 r0, q0[11]
716; CHECK-NEXT:    vmov.8 q4[3], r0
717; CHECK-NEXT:    vmov.u8 r0, q0[14]
718; CHECK-NEXT:    vmov.8 q4[4], r0
719; CHECK-NEXT:    vmov.u8 r0, q1[1]
720; CHECK-NEXT:    vmov.8 q4[5], r0
721; CHECK-NEXT:    vmov.u8 r0, q1[4]
722; CHECK-NEXT:    vmov.8 q4[6], r0
723; CHECK-NEXT:    vmov.u8 r0, q1[10]
724; CHECK-NEXT:    vmov.8 q0[8], r0
725; CHECK-NEXT:    vmov.u8 r0, q1[13]
726; CHECK-NEXT:    vmov.8 q0[9], r0
727; CHECK-NEXT:    vmov.u8 r0, q2[0]
728; CHECK-NEXT:    vmov.8 q0[10], r0
729; CHECK-NEXT:    vmov.u8 r0, q2[3]
730; CHECK-NEXT:    vmov.8 q0[11], r0
731; CHECK-NEXT:    vmov.u8 r0, q2[6]
732; CHECK-NEXT:    vmov.8 q0[12], r0
733; CHECK-NEXT:    vmov.u8 r0, q2[9]
734; CHECK-NEXT:    vmov.8 q0[13], r0
735; CHECK-NEXT:    vmov.u8 r0, q2[12]
736; CHECK-NEXT:    vmov.8 q0[14], r0
737; CHECK-NEXT:    vmov.u8 r0, q2[15]
738; CHECK-NEXT:    vmov.8 q0[15], r0
739; CHECK-NEXT:    vmov.u8 r0, q1[7]
740; CHECK-NEXT:    vmov.8 q4[7], r0
741; CHECK-NEXT:    vmov.f32 s18, s2
742; CHECK-NEXT:    vmov.f32 s19, s3
743; CHECK-NEXT:    vadd.i8 q0, q3, q4
744; CHECK-NEXT:    vpop {d8, d9, d10, d11}
745; CHECK-NEXT:    bx lr
746entry:
747  %s1 = shufflevector <64 x i8> %src, <64 x i8> undef, <16 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21, i32 24, i32 27, i32 30, i32 33, i32 36, i32 39, i32 42, i32 45>
748  %s2 = shufflevector <64 x i8> %src, <64 x i8> undef, <16 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22, i32 25, i32 28, i32 31, i32 34, i32 37, i32 40, i32 43, i32 46>
749  %s3 = shufflevector <64 x i8> %src, <64 x i8> undef, <16 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23, i32 26, i32 29, i32 32, i32 35, i32 38, i32 41, i32 44, i32 47>
750  %a = add <16 x i8> %s1, %s2
751  %r = add <16 x i8> %a, %s3
752  ret <16 x i8> %r
753}
754
; shuffle4step_i8: de-interleaves a <64 x i8> with stride 4 into four
; <16 x i8> vectors (%s1..%s4, starting at lanes 0, 1, 2 and 3) and returns
; (%s1 + %s2) + (%s3 + %s4).
; The expected output builds each strided vector one byte at a time with
; vmov.u8 / vmov.8 pairs, filling the low and high halves in separate
; q registers and merging them with two vmov.f32 copies. It gathers the
; lane-3 and lane-2 groups first, then the lane-1 and lane-0 groups, and
; combines them with three vadd.i8. d8-d13 are saved and restored around it.
755define arm_aapcs_vfpcc <16 x i8> @shuffle4step_i8(<64 x i8> %src) {
756; CHECK-LABEL: shuffle4step_i8:
757; CHECK:       @ %bb.0: @ %entry
758; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13}
759; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13}
760; CHECK-NEXT:    vmov.u8 r0, q0[3]
761; CHECK-NEXT:    vmov.8 q4[0], r0
762; CHECK-NEXT:    vmov.u8 r0, q0[7]
763; CHECK-NEXT:    vmov.8 q4[1], r0
764; CHECK-NEXT:    vmov.u8 r0, q0[11]
765; CHECK-NEXT:    vmov.8 q4[2], r0
766; CHECK-NEXT:    vmov.u8 r0, q0[15]
767; CHECK-NEXT:    vmov.8 q4[3], r0
768; CHECK-NEXT:    vmov.u8 r0, q1[3]
769; CHECK-NEXT:    vmov.8 q4[4], r0
770; CHECK-NEXT:    vmov.u8 r0, q1[7]
771; CHECK-NEXT:    vmov.8 q4[5], r0
772; CHECK-NEXT:    vmov.u8 r0, q1[11]
773; CHECK-NEXT:    vmov.8 q4[6], r0
774; CHECK-NEXT:    vmov.u8 r0, q2[3]
775; CHECK-NEXT:    vmov.8 q5[8], r0
776; CHECK-NEXT:    vmov.u8 r0, q2[7]
777; CHECK-NEXT:    vmov.8 q5[9], r0
778; CHECK-NEXT:    vmov.u8 r0, q2[11]
779; CHECK-NEXT:    vmov.8 q5[10], r0
780; CHECK-NEXT:    vmov.u8 r0, q2[15]
781; CHECK-NEXT:    vmov.8 q5[11], r0
782; CHECK-NEXT:    vmov.u8 r0, q3[3]
783; CHECK-NEXT:    vmov.8 q5[12], r0
784; CHECK-NEXT:    vmov.u8 r0, q3[7]
785; CHECK-NEXT:    vmov.8 q5[13], r0
786; CHECK-NEXT:    vmov.u8 r0, q3[11]
787; CHECK-NEXT:    vmov.8 q5[14], r0
788; CHECK-NEXT:    vmov.u8 r0, q3[15]
789; CHECK-NEXT:    vmov.8 q5[15], r0
790; CHECK-NEXT:    vmov.u8 r0, q1[15]
791; CHECK-NEXT:    vmov.8 q4[7], r0
792; CHECK-NEXT:    vmov.u8 r0, q0[2]
793; CHECK-NEXT:    vmov.f32 s18, s22
794; CHECK-NEXT:    vmov.f32 s19, s23
795; CHECK-NEXT:    vmov.8 q5[0], r0
796; CHECK-NEXT:    vmov.u8 r0, q0[6]
797; CHECK-NEXT:    vmov.8 q5[1], r0
798; CHECK-NEXT:    vmov.u8 r0, q0[10]
799; CHECK-NEXT:    vmov.8 q5[2], r0
800; CHECK-NEXT:    vmov.u8 r0, q0[14]
801; CHECK-NEXT:    vmov.8 q5[3], r0
802; CHECK-NEXT:    vmov.u8 r0, q1[2]
803; CHECK-NEXT:    vmov.8 q5[4], r0
804; CHECK-NEXT:    vmov.u8 r0, q1[6]
805; CHECK-NEXT:    vmov.8 q5[5], r0
806; CHECK-NEXT:    vmov.u8 r0, q1[10]
807; CHECK-NEXT:    vmov.8 q5[6], r0
808; CHECK-NEXT:    vmov.u8 r0, q2[2]
809; CHECK-NEXT:    vmov.8 q6[8], r0
810; CHECK-NEXT:    vmov.u8 r0, q2[6]
811; CHECK-NEXT:    vmov.8 q6[9], r0
812; CHECK-NEXT:    vmov.u8 r0, q2[10]
813; CHECK-NEXT:    vmov.8 q6[10], r0
814; CHECK-NEXT:    vmov.u8 r0, q2[14]
815; CHECK-NEXT:    vmov.8 q6[11], r0
816; CHECK-NEXT:    vmov.u8 r0, q3[2]
817; CHECK-NEXT:    vmov.8 q6[12], r0
818; CHECK-NEXT:    vmov.u8 r0, q3[6]
819; CHECK-NEXT:    vmov.8 q6[13], r0
820; CHECK-NEXT:    vmov.u8 r0, q3[10]
821; CHECK-NEXT:    vmov.8 q6[14], r0
822; CHECK-NEXT:    vmov.u8 r0, q3[14]
823; CHECK-NEXT:    vmov.8 q6[15], r0
824; CHECK-NEXT:    vmov.u8 r0, q1[14]
825; CHECK-NEXT:    vmov.8 q5[7], r0
826; CHECK-NEXT:    vmov.u8 r0, q0[1]
827; CHECK-NEXT:    vmov.f32 s22, s26
828; CHECK-NEXT:    vmov.f32 s23, s27
829; CHECK-NEXT:    vadd.i8 q4, q5, q4
830; CHECK-NEXT:    vmov.8 q5[0], r0
831; CHECK-NEXT:    vmov.u8 r0, q0[5]
832; CHECK-NEXT:    vmov.8 q5[1], r0
833; CHECK-NEXT:    vmov.u8 r0, q0[9]
834; CHECK-NEXT:    vmov.8 q5[2], r0
835; CHECK-NEXT:    vmov.u8 r0, q0[13]
836; CHECK-NEXT:    vmov.8 q5[3], r0
837; CHECK-NEXT:    vmov.u8 r0, q1[1]
838; CHECK-NEXT:    vmov.8 q5[4], r0
839; CHECK-NEXT:    vmov.u8 r0, q1[5]
840; CHECK-NEXT:    vmov.8 q5[5], r0
841; CHECK-NEXT:    vmov.u8 r0, q1[9]
842; CHECK-NEXT:    vmov.8 q5[6], r0
843; CHECK-NEXT:    vmov.u8 r0, q2[1]
844; CHECK-NEXT:    vmov.8 q6[8], r0
845; CHECK-NEXT:    vmov.u8 r0, q2[5]
846; CHECK-NEXT:    vmov.8 q6[9], r0
847; CHECK-NEXT:    vmov.u8 r0, q2[9]
848; CHECK-NEXT:    vmov.8 q6[10], r0
849; CHECK-NEXT:    vmov.u8 r0, q2[13]
850; CHECK-NEXT:    vmov.8 q6[11], r0
851; CHECK-NEXT:    vmov.u8 r0, q3[1]
852; CHECK-NEXT:    vmov.8 q6[12], r0
853; CHECK-NEXT:    vmov.u8 r0, q3[5]
854; CHECK-NEXT:    vmov.8 q6[13], r0
855; CHECK-NEXT:    vmov.u8 r0, q3[9]
856; CHECK-NEXT:    vmov.8 q6[14], r0
857; CHECK-NEXT:    vmov.u8 r0, q3[13]
858; CHECK-NEXT:    vmov.8 q6[15], r0
859; CHECK-NEXT:    vmov.u8 r0, q1[13]
860; CHECK-NEXT:    vmov.8 q5[7], r0
861; CHECK-NEXT:    vmov.u8 r0, q0[0]
862; CHECK-NEXT:    vmov.f32 s22, s26
863; CHECK-NEXT:    vmov.f32 s23, s27
864; CHECK-NEXT:    vmov.8 q6[0], r0
865; CHECK-NEXT:    vmov.u8 r0, q0[4]
866; CHECK-NEXT:    vmov.8 q6[1], r0
867; CHECK-NEXT:    vmov.u8 r0, q0[8]
868; CHECK-NEXT:    vmov.8 q6[2], r0
869; CHECK-NEXT:    vmov.u8 r0, q0[12]
870; CHECK-NEXT:    vmov.8 q6[3], r0
871; CHECK-NEXT:    vmov.u8 r0, q1[0]
872; CHECK-NEXT:    vmov.8 q6[4], r0
873; CHECK-NEXT:    vmov.u8 r0, q1[4]
874; CHECK-NEXT:    vmov.8 q6[5], r0
875; CHECK-NEXT:    vmov.u8 r0, q1[8]
876; CHECK-NEXT:    vmov.8 q6[6], r0
877; CHECK-NEXT:    vmov.u8 r0, q2[0]
878; CHECK-NEXT:    vmov.8 q0[8], r0
879; CHECK-NEXT:    vmov.u8 r0, q2[4]
880; CHECK-NEXT:    vmov.8 q0[9], r0
881; CHECK-NEXT:    vmov.u8 r0, q2[8]
882; CHECK-NEXT:    vmov.8 q0[10], r0
883; CHECK-NEXT:    vmov.u8 r0, q2[12]
884; CHECK-NEXT:    vmov.8 q0[11], r0
885; CHECK-NEXT:    vmov.u8 r0, q3[0]
886; CHECK-NEXT:    vmov.8 q0[12], r0
887; CHECK-NEXT:    vmov.u8 r0, q3[4]
888; CHECK-NEXT:    vmov.8 q0[13], r0
889; CHECK-NEXT:    vmov.u8 r0, q3[8]
890; CHECK-NEXT:    vmov.8 q0[14], r0
891; CHECK-NEXT:    vmov.u8 r0, q3[12]
892; CHECK-NEXT:    vmov.8 q0[15], r0
893; CHECK-NEXT:    vmov.u8 r0, q1[12]
894; CHECK-NEXT:    vmov.8 q6[7], r0
895; CHECK-NEXT:    vmov.f32 s26, s2
896; CHECK-NEXT:    vmov.f32 s27, s3
897; CHECK-NEXT:    vadd.i8 q0, q6, q5
898; CHECK-NEXT:    vadd.i8 q0, q0, q4
899; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13}
900; CHECK-NEXT:    bx lr
901entry:
; %sN selects every fourth byte of %src, starting at lane N-1.
902  %s1 = shufflevector <64 x i8> %src, <64 x i8> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28, i32 32, i32 36, i32 40, i32 44, i32 48, i32 52, i32 56, i32 60>
903  %s2 = shufflevector <64 x i8> %src, <64 x i8> undef, <16 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29, i32 33, i32 37, i32 41, i32 45, i32 49, i32 53, i32 57, i32 61>
904  %s3 = shufflevector <64 x i8> %src, <64 x i8> undef, <16 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30, i32 34, i32 38, i32 42, i32 46, i32 50, i32 54, i32 58, i32 62>
905  %s4 = shufflevector <64 x i8> %src, <64 x i8> undef, <16 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31, i32 35, i32 39, i32 43, i32 47, i32 51, i32 55, i32 59, i32 63>
; Pairwise reduction: (s1 + s2) + (s3 + s4).
906  %a1 = add <16 x i8> %s1, %s2
907  %a2 = add <16 x i8> %s3, %s4
908  %r = add <16 x i8> %a1, %a2
909  ret <16 x i8> %r
910}
911
912; i64
913
; shuffle1_i64: identity shuffle (mask <0, 1>) of a <2 x i64>.
; The expected output contains no data movement, only the return.
914define arm_aapcs_vfpcc <2 x i64> @shuffle1_i64(<2 x i64> %src) {
915; CHECK-LABEL: shuffle1_i64:
916; CHECK:       @ %bb.0: @ %entry
917; CHECK-NEXT:    bx lr
918entry:
919  %out = shufflevector <2 x i64> %src, <2 x i64> undef, <2 x i32> <i32 0, i32 1>
920  ret <2 x i64> %out
921}
922
; shuffle2_i64: swaps the two 64-bit lanes of a <2 x i64> (mask <1, 0>).
; The expected output moves each 64-bit lane as two 32-bit s-register copies
; into q1 and then copies q1 back to q0.
923define arm_aapcs_vfpcc <2 x i64> @shuffle2_i64(<2 x i64> %src) {
924; CHECK-LABEL: shuffle2_i64:
925; CHECK:       @ %bb.0: @ %entry
926; CHECK-NEXT:    vmov.f32 s4, s2
927; CHECK-NEXT:    vmov.f32 s6, s0
928; CHECK-NEXT:    vmov.f32 s5, s3
929; CHECK-NEXT:    vmov.f32 s7, s1
930; CHECK-NEXT:    vmov q0, q1
931; CHECK-NEXT:    bx lr
932entry:
933  %out = shufflevector <2 x i64> %src, <2 x i64> undef, <2 x i32> <i32 1, i32 0>
934  ret <2 x i64> %out
935}
936
; shuffle3_i64: mask <undef, 1> keeps lane 1 in place and leaves lane 0
; unconstrained. The expected output is just the return, with no data
; movement.
937define arm_aapcs_vfpcc <2 x i64> @shuffle3_i64(<2 x i64> %src) {
938; CHECK-LABEL: shuffle3_i64:
939; CHECK:       @ %bb.0: @ %entry
940; CHECK-NEXT:    bx lr
941entry:
942  %out = shufflevector <2 x i64> %src, <2 x i64> undef, <2 x i32> <i32 undef, i32 1>
943  ret <2 x i64> %out
944}
945
946; f32
947
; shuffle1_f32: full lane reversal of a <4 x float> (mask <3, 2, 1, 0>).
; The expected output is four single s-register copies into q1 followed by
; a q1 -> q0 copy.
948define arm_aapcs_vfpcc <4 x float> @shuffle1_f32(<4 x float> %src) {
949; CHECK-LABEL: shuffle1_f32:
950; CHECK:       @ %bb.0: @ %entry
951; CHECK-NEXT:    vmov.f32 s4, s3
952; CHECK-NEXT:    vmov.f32 s5, s2
953; CHECK-NEXT:    vmov.f32 s6, s1
954; CHECK-NEXT:    vmov.f32 s7, s0
955; CHECK-NEXT:    vmov q0, q1
956; CHECK-NEXT:    bx lr
957entry:
958  %out = shufflevector <4 x float> %src, <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
959  ret <4 x float> %out
960}
961
; shuffle2_f32: identity shuffle (mask <0, 1, 2, 3>) of a <4 x float>.
; The expected output contains only the return.
962define arm_aapcs_vfpcc <4 x float> @shuffle2_f32(<4 x float> %src) {
963; CHECK-LABEL: shuffle2_f32:
964; CHECK:       @ %bb.0: @ %entry
965; CHECK-NEXT:    bx lr
966entry:
967  %out = shufflevector <4 x float> %src, <4 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
968  ret <4 x float> %out
969}
970
; shuffle3_f32: swaps the first and last lanes of a <4 x float> and keeps the
; middle two in place (mask <3, 1, 2, 0>). The expected output is four
; s-register copies into q1 followed by a q1 -> q0 copy.
971define arm_aapcs_vfpcc <4 x float> @shuffle3_f32(<4 x float> %src) {
972; CHECK-LABEL: shuffle3_f32:
973; CHECK:       @ %bb.0: @ %entry
974; CHECK-NEXT:    vmov.f32 s4, s3
975; CHECK-NEXT:    vmov.f32 s5, s1
976; CHECK-NEXT:    vmov.f32 s6, s2
977; CHECK-NEXT:    vmov.f32 s7, s0
978; CHECK-NEXT:    vmov q0, q1
979; CHECK-NEXT:    bx lr
980entry:
981  %out = shufflevector <4 x float> %src, <4 x float> undef, <4 x i32> <i32 3, i32 1, i32 2, i32 0>
982  ret <4 x float> %out
983}
984
; shuffle5_f32: swaps the two lanes inside each 64-bit half of a <4 x float>
; (mask <1, 0, 3, 2>). The expected output is a single vrev64.32 into q1
; plus a copy back to q0.
985define arm_aapcs_vfpcc <4 x float> @shuffle5_f32(<4 x float> %src) {
986; CHECK-LABEL: shuffle5_f32:
987; CHECK:       @ %bb.0: @ %entry
988; CHECK-NEXT:    vrev64.32 q1, q0
989; CHECK-NEXT:    vmov q0, q1
990; CHECK-NEXT:    bx lr
991entry:
992  %out = shufflevector <4 x float> %src, <4 x float> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
993  ret <4 x float> %out
994}
995
; oneoff11_f32: %src1 with only lane 2 replaced, by %src1's own lane 1
; (mask <0, 1, 1, 3>); %src2 is not referenced by the mask.
; The expected output is a single s-register copy.
996define arm_aapcs_vfpcc <4 x float> @oneoff11_f32(<4 x float> %src1, <4 x float> %src2) {
997; CHECK-LABEL: oneoff11_f32:
998; CHECK:       @ %bb.0: @ %entry
999; CHECK-NEXT:    vmov.f32 s2, s1
1000; CHECK-NEXT:    bx lr
1001entry:
1002  %out = shufflevector <4 x float> %src1, <4 x float> %src2, <4 x i32> <i32 0, i32 1, i32 1, i32 3>
1003  ret <4 x float> %out
1004}
1005
; oneoff12_f32: %src1 with only lane 0 replaced, by lane 0 of %src2
; (mask <4, 1, 2, 3>). The expected output is a single s-register copy.
1006define arm_aapcs_vfpcc <4 x float> @oneoff12_f32(<4 x float> %src1, <4 x float> %src2) {
1007; CHECK-LABEL: oneoff12_f32:
1008; CHECK:       @ %bb.0: @ %entry
1009; CHECK-NEXT:    vmov.f32 s0, s4
1010; CHECK-NEXT:    bx lr
1011entry:
1012  %out = shufflevector <4 x float> %src1, <4 x float> %src2, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
1013  ret <4 x float> %out
1014}
1015
; oneoff21_f32: %src2 with only lane 3 replaced, by lane 0 of %src1
; (mask <4, 5, 6, 0>). The expected output patches q1 in place and then
; copies it to q0.
1016define arm_aapcs_vfpcc <4 x float> @oneoff21_f32(<4 x float> %src1, <4 x float> %src2) {
1017; CHECK-LABEL: oneoff21_f32:
1018; CHECK:       @ %bb.0: @ %entry
1019; CHECK-NEXT:    vmov.f32 s7, s0
1020; CHECK-NEXT:    vmov q0, q1
1021; CHECK-NEXT:    bx lr
1022entry:
1023  %out = shufflevector <4 x float> %src1, <4 x float> %src2, <4 x i32> <i32 4, i32 5, i32 6, i32 0>
1024  ret <4 x float> %out
1025}
1026
; oneoff22_f32: %src2 with only lane 2 replaced, by %src2's own lane 0
; (mask <4, 5, 4, 7>); %src1 is not referenced by the mask.
; The expected output copies q1 to q0 and then patches one lane.
1027define arm_aapcs_vfpcc <4 x float> @oneoff22_f32(<4 x float> %src1, <4 x float> %src2) {
1028; CHECK-LABEL: oneoff22_f32:
1029; CHECK:       @ %bb.0: @ %entry
1030; CHECK-NEXT:    vmov q0, q1
1031; CHECK-NEXT:    vmov.f32 s2, s0
1032; CHECK-NEXT:    bx lr
1033entry:
1034  %out = shufflevector <4 x float> %src1, <4 x float> %src2, <4 x i32> <i32 4, i32 5, i32 4, i32 7>
1035  ret <4 x float> %out
1036}
1037
; shuffle2step_f32: splits an <8 x float> into its even lanes (%s1) and odd
; lanes (%s2) and returns their element-wise fadd.
; The assertions below use only the check prefix that the file header
; attaches to the -mattr=+mve.fp configurations.
1038define arm_aapcs_vfpcc <4 x float> @shuffle2step_f32(<8 x float> %src) {
1039; CHECKFP-LABEL: shuffle2step_f32:
1040; CHECKFP:       @ %bb.0: @ %entry
1041; CHECKFP-NEXT:    vmov.f32 s8, s1
1042; CHECKFP-NEXT:    vmov.f32 s9, s3
1043; CHECKFP-NEXT:    vmov.f32 s1, s2
1044; CHECKFP-NEXT:    vmov.f32 s10, s5
1045; CHECKFP-NEXT:    vmov.f32 s11, s7
1046; CHECKFP-NEXT:    vmov.f32 s2, s4
1047; CHECKFP-NEXT:    vmov.f32 s3, s6
1048; CHECKFP-NEXT:    vadd.f32 q0, q0, q2
1049; CHECKFP-NEXT:    bx lr
1050entry:
1051  %s1 = shufflevector <8 x float> %src, <8 x float> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
1052  %s2 = shufflevector <8 x float> %src, <8 x float> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
1053  %r = fadd <4 x float> %s1, %s2
1054  ret <4 x float> %r
1055}
1056
; shuffle3step_f32: stride-3 de-interleave of a <16 x float> into %s1, %s2
; and %s3 (starting at lanes 0, 1 and 2), returning (%s1 + %s2) + %s3.
; Only source lanes 0..11 are referenced by the masks.
; The assertions below use only the check prefix that the file header
; attaches to the -mattr=+mve.fp configurations.
1057define arm_aapcs_vfpcc <4 x float> @shuffle3step_f32(<16 x float> %src) {
1058; CHECKFP-LABEL: shuffle3step_f32:
1059; CHECKFP:       @ %bb.0: @ %entry
1060; CHECKFP-NEXT:    .vsave {d8, d9}
1061; CHECKFP-NEXT:    vpush {d8, d9}
1062; CHECKFP-NEXT:    vmov.f32 s13, s4
1063; CHECKFP-NEXT:    vmov.f32 s14, s7
1064; CHECKFP-NEXT:    vmov.f32 s18, s6
1065; CHECKFP-NEXT:    vmov.f32 s12, s1
1066; CHECKFP-NEXT:    vmov.f32 s15, s10
1067; CHECKFP-NEXT:    vmov.f32 s16, s0
1068; CHECKFP-NEXT:    vmov.f32 s17, s3
1069; CHECKFP-NEXT:    vmov.f32 s19, s9
1070; CHECKFP-NEXT:    vadd.f32 q3, q4, q3
1071; CHECKFP-NEXT:    vmov.f32 s4, s2
1072; CHECKFP-NEXT:    vmov.f32 s6, s8
1073; CHECKFP-NEXT:    vmov.f32 s7, s11
1074; CHECKFP-NEXT:    vadd.f32 q0, q3, q1
1075; CHECKFP-NEXT:    vpop {d8, d9}
1076; CHECKFP-NEXT:    bx lr
1077entry:
1078  %s1 = shufflevector <16 x float> %src, <16 x float> undef, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
1079  %s2 = shufflevector <16 x float> %src, <16 x float> undef, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
1080  %s3 = shufflevector <16 x float> %src, <16 x float> undef, <4 x i32> <i32 2, i32 5, i32 8, i32 11>
1081  %a = fadd <4 x float> %s1, %s2
1082  %r = fadd <4 x float> %a, %s3
1083  ret <4 x float> %r
1084}
1085
; shuffle4step_f32: stride-4 de-interleave of a <16 x float> into %s1..%s4
; (starting at lanes 0, 1, 2 and 3), returning (%s1 + %s2) + (%s3 + %s4).
; The assertions below use only the check prefix that the file header
; attaches to the -mattr=+mve.fp configurations.
1086define arm_aapcs_vfpcc <4 x float> @shuffle4step_f32(<16 x float> %src) {
1087; CHECKFP-LABEL: shuffle4step_f32:
1088; CHECKFP:       @ %bb.0: @ %entry
1089; CHECKFP-NEXT:    .vsave {d8, d9, d10, d11}
1090; CHECKFP-NEXT:    vpush {d8, d9, d10, d11}
1091; CHECKFP-NEXT:    vmov.f32 s16, s3
1092; CHECKFP-NEXT:    vmov.f32 s20, s2
1093; CHECKFP-NEXT:    vmov.f32 s17, s7
1094; CHECKFP-NEXT:    vmov.f32 s18, s11
1095; CHECKFP-NEXT:    vmov.f32 s19, s15
1096; CHECKFP-NEXT:    vmov.f32 s21, s6
1097; CHECKFP-NEXT:    vmov.f32 s22, s10
1098; CHECKFP-NEXT:    vmov.f32 s23, s14
1099; CHECKFP-NEXT:    vadd.f32 q4, q5, q4
1100; CHECKFP-NEXT:    vmov.f32 s20, s1
1101; CHECKFP-NEXT:    vmov.f32 s21, s5
1102; CHECKFP-NEXT:    vmov.f32 s22, s9
1103; CHECKFP-NEXT:    vmov.f32 s23, s13
1104; CHECKFP-NEXT:    vmov.f32 s1, s4
1105; CHECKFP-NEXT:    vmov.f32 s2, s8
1106; CHECKFP-NEXT:    vmov.f32 s3, s12
1107; CHECKFP-NEXT:    vadd.f32 q0, q0, q5
1108; CHECKFP-NEXT:    vadd.f32 q0, q0, q4
1109; CHECKFP-NEXT:    vpop {d8, d9, d10, d11}
1110; CHECKFP-NEXT:    bx lr
1111entry:
1112  %s1 = shufflevector <16 x float> %src, <16 x float> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
1113  %s2 = shufflevector <16 x float> %src, <16 x float> undef, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
1114  %s3 = shufflevector <16 x float> %src, <16 x float> undef, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
1115  %s4 = shufflevector <16 x float> %src, <16 x float> undef, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
1116  %a1 = fadd <4 x float> %s1, %s2
1117  %a2 = fadd <4 x float> %s3, %s4
1118  %r = fadd <4 x float> %a1, %a2
1119  ret <4 x float> %r
1120}
1121
1122; f16
1123
; shuffle1_f16: full lane reversal of an <8 x half> (mask <7, 6, ..., 0>).
; The expected output reverses the halves within each 64-bit half using
; vrev64.16 and then swaps the two 64-bit halves with four s-register copies.
1124define arm_aapcs_vfpcc <8 x half> @shuffle1_f16(<8 x half> %src) {
1125; CHECK-LABEL: shuffle1_f16:
1126; CHECK:       @ %bb.0: @ %entry
1127; CHECK-NEXT:    vrev64.16 q1, q0
1128; CHECK-NEXT:    vmov.f32 s0, s6
1129; CHECK-NEXT:    vmov.f32 s1, s7
1130; CHECK-NEXT:    vmov.f32 s2, s4
1131; CHECK-NEXT:    vmov.f32 s3, s5
1132; CHECK-NEXT:    bx lr
1133entry:
1134  %out = shufflevector <8 x half> %src, <8 x half> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
1135  ret <8 x half> %out
1136}
1137
; shuffle2_f16: identity shuffle of an <8 x half>.
; The expected output contains only the return.
1138define arm_aapcs_vfpcc <8 x half> @shuffle2_f16(<8 x half> %src) {
1139; CHECK-LABEL: shuffle2_f16:
1140; CHECK:       @ %bb.0: @ %entry
1141; CHECK-NEXT:    bx lr
1142entry:
1143  %out = shufflevector <8 x half> %src, <8 x half> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
1144  ret <8 x half> %out
1145}
1146
; shuffle3_f16: irregular single-source permutation of an <8 x half>
; (mask <4, 5, 7, 6, 3, 1, 2, 0>). The expected output mixes whole
; s-register copies with vmovx.f16 / vins.f16 on individual half lanes.
1147define arm_aapcs_vfpcc <8 x half> @shuffle3_f16(<8 x half> %src) {
1148; CHECK-LABEL: shuffle3_f16:
1149; CHECK:       @ %bb.0: @ %entry
1150; CHECK-NEXT:    vmov q1, q0
1151; CHECK-NEXT:    vmovx.f16 s2, s5
1152; CHECK-NEXT:    vmovx.f16 s0, s4
1153; CHECK-NEXT:    vins.f16 s5, s4
1154; CHECK-NEXT:    vins.f16 s2, s0
1155; CHECK-NEXT:    vmov.f32 s3, s5
1156; CHECK-NEXT:    vmovx.f16 s1, s7
1157; CHECK-NEXT:    vmov.f32 s0, s6
1158; CHECK-NEXT:    vins.f16 s1, s7
1159; CHECK-NEXT:    bx lr
1160entry:
1161  %out = shufflevector <8 x half> %src, <8 x half> undef, <8 x i32> <i32 4, i32 5, i32 7, i32 6, i32 3, i32 1, i32 2, i32 0>
1162  ret <8 x half> %out
1163}
1164
; shuffle5_f16: reverses the four half lanes inside each 64-bit half of an
; <8 x half> (mask <3, 2, 1, 0, 7, 6, 5, 4>). The expected output is a
; single vrev64.16 into q1 plus a copy back to q0.
1165define arm_aapcs_vfpcc <8 x half> @shuffle5_f16(<8 x half> %src) {
1166; CHECK-LABEL: shuffle5_f16:
1167; CHECK:       @ %bb.0: @ %entry
1168; CHECK-NEXT:    vrev64.16 q1, q0
1169; CHECK-NEXT:    vmov q0, q1
1170; CHECK-NEXT:    bx lr
1171entry:
1172  %out = shufflevector <8 x half> %src, <8 x half> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
1173  ret <8 x half> %out
1174}
1175
; shuffle6_f16: swaps each adjacent pair of half lanes in an <8 x half>
; (mask <1, 0, 3, 2, 5, 4, 7, 6>). The expected output is a single in-place
; vrev32.16.
1176define arm_aapcs_vfpcc <8 x half> @shuffle6_f16(<8 x half> %src) {
1177; CHECK-LABEL: shuffle6_f16:
1178; CHECK:       @ %bb.0: @ %entry
1179; CHECK-NEXT:    vrev32.16 q0, q0
1180; CHECK-NEXT:    bx lr
1181entry:
1182  %out = shufflevector <8 x half> %src, <8 x half> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
1183  ret <8 x half> %out
1184}
1185
; oneoff11_f16: %src1 with only lane 2 replaced, by %src1's own lane 1;
; %src2 is not referenced by the mask. The expected output extracts the
; upper half of s0 with vmovx.f16, moves it through r0 and inserts it into
; lane 2 of q0.
1186define arm_aapcs_vfpcc <8 x half> @oneoff11_f16(<8 x half> %src1, <8 x half> %src2) {
1187; CHECK-LABEL: oneoff11_f16:
1188; CHECK:       @ %bb.0: @ %entry
1189; CHECK-NEXT:    vmovx.f16 s4, s0
1190; CHECK-NEXT:    vmov r0, s4
1191; CHECK-NEXT:    vmov.16 q0[2], r0
1192; CHECK-NEXT:    bx lr
1193entry:
1194  %out = shufflevector <8 x half> %src1, <8 x half> %src2, <8 x i32> <i32 0, i32 1, i32 1, i32 3, i32 4, i32 5, i32 6, i32 7>
1195  ret <8 x half> %out
1196}
1197
; oneoff12_f16: %src1 with only lane 0 replaced, by lane 0 of %src2
; (mask index 8). The expected output moves s4 through r0 and inserts it
; into lane 0 of q0.
1198define arm_aapcs_vfpcc <8 x half> @oneoff12_f16(<8 x half> %src1, <8 x half> %src2) {
1199; CHECK-LABEL: oneoff12_f16:
1200; CHECK:       @ %bb.0: @ %entry
1201; CHECK-NEXT:    vmov r0, s4
1202; CHECK-NEXT:    vmov.16 q0[0], r0
1203; CHECK-NEXT:    bx lr
1204entry:
1205  %out = shufflevector <8 x half> %src1, <8 x half> %src2, <8 x i32> <i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
1206  ret <8 x half> %out
1207}
1208
; oneoff21_f16: %src2 with only lane 3 replaced, by lane 0 of %src1
; (mask <8, 9, 10, 0, 12, 13, 14, 15>). The expected output is one vins.f16
; into s5 followed by a q1 -> q0 copy.
1209define arm_aapcs_vfpcc <8 x half> @oneoff21_f16(<8 x half> %src1, <8 x half> %src2) {
1210; CHECK-LABEL: oneoff21_f16:
1211; CHECK:       @ %bb.0: @ %entry
1212; CHECK-NEXT:    vins.f16 s5, s0
1213; CHECK-NEXT:    vmov q0, q1
1214; CHECK-NEXT:    bx lr
1215entry:
1216  %out = shufflevector <8 x half> %src1, <8 x half> %src2, <8 x i32> <i32 8, i32 9, i32 10, i32 0, i32 12, i32 13, i32 14, i32 15>
1217  ret <8 x half> %out
1218}
1219
; oneoff22_f16: %src2 with only lane 0 replaced, by %src2's own lane 6
; (mask index 14); %src1 is not referenced by the mask. The expected output
; copies q1 to q0, then moves s3 through r0 into lane 0 of q0.
1220define arm_aapcs_vfpcc <8 x half> @oneoff22_f16(<8 x half> %src1, <8 x half> %src2) {
1221; CHECK-LABEL: oneoff22_f16:
1222; CHECK:       @ %bb.0: @ %entry
1223; CHECK-NEXT:    vmov q0, q1
1224; CHECK-NEXT:    vmov r0, s3
1225; CHECK-NEXT:    vmov.16 q0[0], r0
1226; CHECK-NEXT:    bx lr
1227entry:
1228  %out = shufflevector <8 x half> %src1, <8 x half> %src2, <8 x i32> <i32 14, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
1229  ret <8 x half> %out
1230}
1231
; shuffle2step_f16: splits a <16 x half> into its even lanes (%s1) and odd
; lanes (%s2) and returns their element-wise fadd.
; The expected output extracts upper halves with vmovx.f16 and packs pairs
; with vins.f16 before a single vadd.f16.
; The assertions below use only the check prefix that the file header
; attaches to the -mattr=+mve.fp configurations.
1232define arm_aapcs_vfpcc <8 x half> @shuffle2step_f16(<16 x half> %src) {
1233; CHECKFP-LABEL: shuffle2step_f16:
1234; CHECKFP:       @ %bb.0: @ %entry
1235; CHECKFP-NEXT:    vmovx.f16 s8, s0
1236; CHECKFP-NEXT:    vmovx.f16 s10, s1
1237; CHECKFP-NEXT:    vins.f16 s8, s10
1238; CHECKFP-NEXT:    vmovx.f16 s9, s2
1239; CHECKFP-NEXT:    vmovx.f16 s10, s3
1240; CHECKFP-NEXT:    vmovx.f16 s12, s5
1241; CHECKFP-NEXT:    vins.f16 s9, s10
1242; CHECKFP-NEXT:    vmovx.f16 s10, s4
1243; CHECKFP-NEXT:    vins.f16 s10, s12
1244; CHECKFP-NEXT:    vmovx.f16 s11, s6
1245; CHECKFP-NEXT:    vmovx.f16 s12, s7
1246; CHECKFP-NEXT:    vins.f16 s2, s3
1247; CHECKFP-NEXT:    vins.f16 s6, s7
1248; CHECKFP-NEXT:    vins.f16 s4, s5
1249; CHECKFP-NEXT:    vins.f16 s0, s1
1250; CHECKFP-NEXT:    vmov.f32 s1, s2
1251; CHECKFP-NEXT:    vins.f16 s11, s12
1252; CHECKFP-NEXT:    vmov.f32 s2, s4
1253; CHECKFP-NEXT:    vmov.f32 s3, s6
1254; CHECKFP-NEXT:    vadd.f16 q0, q0, q2
1255; CHECKFP-NEXT:    bx lr
1256entry:
1257  %s1 = shufflevector <16 x half> %src, <16 x half> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
1258  %s2 = shufflevector <16 x half> %src, <16 x half> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
1259  %r = fadd <8 x half> %s1, %s2
1260  ret <8 x half> %r
1261}
1262
; shuffle3step_f16: stride-3 de-interleave of a <32 x half> into %s1, %s2
; and %s3 (starting at lanes 0, 1 and 2), returning (%s1 + %s2) + %s3.
; Only source lanes 0..23 are referenced by the masks.
; The assertions below use only the check prefix that the file header
; attaches to the -mattr=+mve.fp configurations.
1263define arm_aapcs_vfpcc <8 x half> @shuffle3step_f16(<32 x half> %src) {
1264; CHECKFP-LABEL: shuffle3step_f16:
1265; CHECKFP:       @ %bb.0: @ %entry
1266; CHECKFP-NEXT:    .vsave {d8, d9}
1267; CHECKFP-NEXT:    vpush {d8, d9}
1268; CHECKFP-NEXT:    vmov.f32 s13, s4
1269; CHECKFP-NEXT:    vmovx.f16 s4, s4
1270; CHECKFP-NEXT:    vmovx.f16 s17, s3
1271; CHECKFP-NEXT:    vins.f16 s3, s4
1272; CHECKFP-NEXT:    vmovx.f16 s4, s7
1273; CHECKFP-NEXT:    vmovx.f16 s18, s6
1274; CHECKFP-NEXT:    vmovx.f16 s16, s0
1275; CHECKFP-NEXT:    vins.f16 s6, s4
1276; CHECKFP-NEXT:    vmovx.f16 s14, s2
1277; CHECKFP-NEXT:    vmov.f32 s12, s1
1278; CHECKFP-NEXT:    vmovx.f16 s4, s10
1279; CHECKFP-NEXT:    vmovx.f16 s19, s9
1280; CHECKFP-NEXT:    vins.f16 s12, s14
1281; CHECKFP-NEXT:    vmovx.f16 s14, s5
1282; CHECKFP-NEXT:    vins.f16 s16, s2
1283; CHECKFP-NEXT:    vmovx.f16 s2, s11
1284; CHECKFP-NEXT:    vmovx.f16 s15, s8
1285; CHECKFP-NEXT:    vins.f16 s18, s8
1286; CHECKFP-NEXT:    vmovx.f16 s8, s1
1287; CHECKFP-NEXT:    vins.f16 s9, s4
1288; CHECKFP-NEXT:    vins.f16 s13, s14
1289; CHECKFP-NEXT:    vmov.f32 s14, s7
1290; CHECKFP-NEXT:    vins.f16 s10, s2
1291; CHECKFP-NEXT:    vmov.f32 s1, s3
1292; CHECKFP-NEXT:    vins.f16 s19, s11
1293; CHECKFP-NEXT:    vins.f16 s17, s5
1294; CHECKFP-NEXT:    vins.f16 s0, s8
1295; CHECKFP-NEXT:    vmov.f32 s2, s6
1296; CHECKFP-NEXT:    vmov.f32 s3, s9
1297; CHECKFP-NEXT:    vins.f16 s14, s15
1298; CHECKFP-NEXT:    vmov.f32 s15, s10
1299; CHECKFP-NEXT:    vadd.f16 q0, q0, q4
1300; CHECKFP-NEXT:    vadd.f16 q0, q0, q3
1301; CHECKFP-NEXT:    vpop {d8, d9}
1302; CHECKFP-NEXT:    bx lr
1303entry:
1304  %s1 = shufflevector <32 x half> %src, <32 x half> undef, <8 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21>
1305  %s2 = shufflevector <32 x half> %src, <32 x half> undef, <8 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22>
1306  %s3 = shufflevector <32 x half> %src, <32 x half> undef, <8 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23>
1307  %a = fadd <8 x half> %s1, %s2
1308  %r = fadd <8 x half> %a, %s3
1309  ret <8 x half> %r
1310}
1311
; shuffle4step_f16: stride-4 de-interleave of a <32 x half> into %s1..%s4
; (starting at lanes 0, 1, 2 and 3), returning (%s1 + %s2) + (%s3 + %s4).
; The expected output saves d8-d13 and assembles the strided vectors with
; vmovx.f16 / vins.f16 before three vadd.f16.
; The assertions below use only the check prefix that the file header
; attaches to the -mattr=+mve.fp configurations.
1312define arm_aapcs_vfpcc <8 x half> @shuffle4step_f16(<32 x half> %src) {
1313; CHECKFP-LABEL: shuffle4step_f16:
1314; CHECKFP:       @ %bb.0: @ %entry
1315; CHECKFP-NEXT:    .vsave {d8, d9, d10, d11, d12, d13}
1316; CHECKFP-NEXT:    vpush {d8, d9, d10, d11, d12, d13}
1317; CHECKFP-NEXT:    vmovx.f16 s18, s9
1318; CHECKFP-NEXT:    vmovx.f16 s16, s11
1319; CHECKFP-NEXT:    vins.f16 s18, s16
1320; CHECKFP-NEXT:    vmovx.f16 s19, s13
1321; CHECKFP-NEXT:    vmovx.f16 s16, s15
1322; CHECKFP-NEXT:    vmovx.f16 s22, s8
1323; CHECKFP-NEXT:    vins.f16 s19, s16
1324; CHECKFP-NEXT:    vmovx.f16 s16, s1
1325; CHECKFP-NEXT:    vmovx.f16 s20, s3
1326; CHECKFP-NEXT:    vins.f16 s1, s3
1327; CHECKFP-NEXT:    vmovx.f16 s3, s10
1328; CHECKFP-NEXT:    vins.f16 s16, s20
1329; CHECKFP-NEXT:    vmovx.f16 s17, s5
1330; CHECKFP-NEXT:    vmovx.f16 s20, s7
1331; CHECKFP-NEXT:    vins.f16 s22, s3
1332; CHECKFP-NEXT:    vmovx.f16 s23, s12
1333; CHECKFP-NEXT:    vmovx.f16 s3, s14
1334; CHECKFP-NEXT:    vins.f16 s17, s20
1335; CHECKFP-NEXT:    vins.f16 s23, s3
1336; CHECKFP-NEXT:    vmovx.f16 s20, s0
1337; CHECKFP-NEXT:    vmovx.f16 s3, s2
1338; CHECKFP-NEXT:    vins.f16 s9, s11
1339; CHECKFP-NEXT:    vins.f16 s13, s15
1340; CHECKFP-NEXT:    vins.f16 s5, s7
1341; CHECKFP-NEXT:    vins.f16 s20, s3
1342; CHECKFP-NEXT:    vmovx.f16 s21, s4
1343; CHECKFP-NEXT:    vmovx.f16 s3, s6
1344; CHECKFP-NEXT:    vins.f16 s8, s10
1345; CHECKFP-NEXT:    vins.f16 s12, s14
1346; CHECKFP-NEXT:    vins.f16 s4, s6
1347; CHECKFP-NEXT:    vins.f16 s21, s3
1348; CHECKFP-NEXT:    vins.f16 s0, s2
1349; CHECKFP-NEXT:    vmov.f32 s24, s1
1350; CHECKFP-NEXT:    vmov.f32 s26, s9
1351; CHECKFP-NEXT:    vmov.f32 s27, s13
1352; CHECKFP-NEXT:    vmov.f32 s25, s5
1353; CHECKFP-NEXT:    vmov.f32 s2, s8
1354; CHECKFP-NEXT:    vadd.f16 q4, q6, q4
1355; CHECKFP-NEXT:    vmov.f32 s3, s12
1356; CHECKFP-NEXT:    vmov.f32 s1, s4
1357; CHECKFP-NEXT:    vadd.f16 q0, q0, q5
1358; CHECKFP-NEXT:    vadd.f16 q0, q0, q4
1359; CHECKFP-NEXT:    vpop {d8, d9, d10, d11, d12, d13}
1360; CHECKFP-NEXT:    bx lr
1361entry:
1362  %s1 = shufflevector <32 x half> %src, <32 x half> undef, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28>
1363  %s2 = shufflevector <32 x half> %src, <32 x half> undef, <8 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29>
1364  %s3 = shufflevector <32 x half> %src, <32 x half> undef, <8 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30>
1365  %s4 = shufflevector <32 x half> %src, <32 x half> undef, <8 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31>
1366  %a1 = fadd <8 x half> %s1, %s2
1367  %a2 = fadd <8 x half> %s3, %s4
1368  %r = fadd <8 x half> %a1, %a2
1369  ret <8 x half> %r
1370}
1371
1372; f64
1373
; shuffle1_f64: identity shuffle (mask <0, 1>) of a <2 x double>.
; The expected output contains only the return.
1374define arm_aapcs_vfpcc <2 x double> @shuffle1_f64(<2 x double> %src) {
1375; CHECK-LABEL: shuffle1_f64:
1376; CHECK:       @ %bb.0: @ %entry
1377; CHECK-NEXT:    bx lr
1378entry:
1379  %out = shufflevector <2 x double> %src, <2 x double> undef, <2 x i32> <i32 0, i32 1>
1380  ret <2 x double> %out
1381}
1382
; shuffle2_f64: swaps the two lanes of a <2 x double> (mask <1, 0>).
; The expected output moves each 64-bit lane as two 32-bit s-register copies
; into q1 and then copies q1 back to q0.
1383define arm_aapcs_vfpcc <2 x double> @shuffle2_f64(<2 x double> %src) {
1384; CHECK-LABEL: shuffle2_f64:
1385; CHECK:       @ %bb.0: @ %entry
1386; CHECK-NEXT:    vmov.f32 s4, s2
1387; CHECK-NEXT:    vmov.f32 s6, s0
1388; CHECK-NEXT:    vmov.f32 s5, s3
1389; CHECK-NEXT:    vmov.f32 s7, s1
1390; CHECK-NEXT:    vmov q0, q1
1391; CHECK-NEXT:    bx lr
1392entry:
1393  %out = shufflevector <2 x double> %src, <2 x double> undef, <2 x i32> <i32 1, i32 0>
1394  ret <2 x double> %out
1395}
1396
; shuffle3_f64: mask <undef, 1> keeps lane 1 in place and leaves lane 0
; unconstrained. The expected output is just the return, with no data
; movement.
1397define arm_aapcs_vfpcc <2 x double> @shuffle3_f64(<2 x double> %src) {
1398; CHECK-LABEL: shuffle3_f64:
1399; CHECK:       @ %bb.0: @ %entry
1400; CHECK-NEXT:    bx lr
1401entry:
1402  %out = shufflevector <2 x double> %src, <2 x double> undef, <2 x i32> <i32 undef, i32 1>
1403  ret <2 x double> %out
1404}
1405
; shuffle4_f64: widening two-source shuffle producing a <4 x double> from two
; <2 x double> inputs. Mask <3, 1, 2, 0> selects, in order,
; %src2[1], %src1[1], %src2[0], %src1[0].
; The expected output moves every 64-bit lane as a pair of s-register copies.
1406define arm_aapcs_vfpcc <4 x double> @shuffle4_f64(<2 x double> %src1, <2 x double> %src2) {
1407; CHECK-LABEL: shuffle4_f64:
1408; CHECK:       @ %bb.0: @ %entry
1409; CHECK-NEXT:    vmov.f32 s8, s6
1410; CHECK-NEXT:    vmov.f32 s6, s0
1411; CHECK-NEXT:    vmov.f32 s9, s7
1412; CHECK-NEXT:    vmov.f32 s7, s1
1413; CHECK-NEXT:    vmov.f32 s10, s2
1414; CHECK-NEXT:    vmov.f32 s11, s3
1415; CHECK-NEXT:    vmov q0, q2
1416; CHECK-NEXT:    bx lr
1417entry:
1418  %out = shufflevector <2 x double> %src1, <2 x double> %src2, <4 x i32> <i32 3, i32 1, i32 2, i32 0>
1419  ret <4 x double> %out
1420}
; shuffle5_f64: widening two-source shuffle producing a <4 x double>.
; Mask <3, 2, 1, 0> selects %src2[1], %src2[0], %src1[1], %src1[0], i.e. the
; concatenation of the two inputs in fully reversed lane order.
1421define arm_aapcs_vfpcc <4 x double> @shuffle5_f64(<2 x double> %src1, <2 x double> %src2) {
1422; CHECK-LABEL: shuffle5_f64:
1423; CHECK:       @ %bb.0: @ %entry
1424; CHECK-NEXT:    vmov.f32 s8, s6
1425; CHECK-NEXT:    vmov.f32 s10, s4
1426; CHECK-NEXT:    vmov.f32 s4, s2
1427; CHECK-NEXT:    vmov.f32 s6, s0
1428; CHECK-NEXT:    vmov.f32 s9, s7
1429; CHECK-NEXT:    vmov.f32 s11, s5
1430; CHECK-NEXT:    vmov.f32 s5, s3
1431; CHECK-NEXT:    vmov.f32 s7, s1
1432; CHECK-NEXT:    vmov q0, q2
1433; CHECK-NEXT:    bx lr
1434entry:
1435  %out = shufflevector <2 x double> %src1, <2 x double> %src2, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
1436  ret <4 x double> %out
1437}
; shuffle6_f64: keeps lane 0 of %src1 and takes lane 1 from %src2
; (mask <0, 3>). The expected output overwrites the upper 64 bits of q0
; with two s-register copies.
1438define arm_aapcs_vfpcc <2 x double> @shuffle6_f64(<2 x double> %src1, <2 x double> %src2) {
1439; CHECK-LABEL: shuffle6_f64:
1440; CHECK:       @ %bb.0: @ %entry
1441; CHECK-NEXT:    vmov.f32 s2, s6
1442; CHECK-NEXT:    vmov.f32 s3, s7
1443; CHECK-NEXT:    bx lr
1444entry:
1445  %out = shufflevector <2 x double> %src1, <2 x double> %src2, <2 x i32> <i32 0, i32 3>
1446  ret <2 x double> %out
1447}
; shuffle7_f64: result lane 0 is lane 1 of %src2 and result lane 1 is lane 1
; of %src1 (mask <3, 1>). The expected output overwrites the lower 64 bits
; of q0 with two s-register copies.
1448define arm_aapcs_vfpcc <2 x double> @shuffle7_f64(<2 x double> %src1, <2 x double> %src2) {
1449; CHECK-LABEL: shuffle7_f64:
1450; CHECK:       @ %bb.0: @ %entry
1451; CHECK-NEXT:    vmov.f32 s0, s6
1452; CHECK-NEXT:    vmov.f32 s1, s7
1453; CHECK-NEXT:    bx lr
1454entry:
1455  %out = shufflevector <2 x double> %src1, <2 x double> %src2, <2 x i32> <i32 3, i32 1>
1456  ret <2 x double> %out
1457}
; shuffle8_f64: result lane 0 is lane 0 of %src2 and result lane 1 is lane 1
; of %src1 (mask <2, 1>). The expected output patches the upper 64 bits of
; q1 from q0 and then copies q1 to q0.
1458define arm_aapcs_vfpcc <2 x double> @shuffle8_f64(<2 x double> %src1, <2 x double> %src2) {
1459; CHECK-LABEL: shuffle8_f64:
1460; CHECK:       @ %bb.0: @ %entry
1461; CHECK-NEXT:    vmov.f32 s6, s2
1462; CHECK-NEXT:    vmov.f32 s7, s3
1463; CHECK-NEXT:    vmov q0, q1
1464; CHECK-NEXT:    bx lr
1465entry:
1466  %out = shufflevector <2 x double> %src1, <2 x double> %src2, <2 x i32> <i32 2, i32 1>
1467  ret <2 x double> %out
1468}
; shuffle9_f64: interleaves two <4 x double> inputs lane by lane into an
; <8 x double> (mask <0, 4, 1, 5, 2, 6, 3, 7>).
; Two separate expectation blocks follow: one for the configurations run
; without -early-live-intervals and one for those run with it (see the
; prefix assignments in the file header). The two blocks differ in their
; copy sequences.
1469define arm_aapcs_vfpcc <8 x double> @shuffle9_f64(<4 x double> %src1, <4 x double> %src2) {
1470; CHECK-LV-LABEL: shuffle9_f64:
1471; CHECK-LV:       @ %bb.0: @ %entry
1472; CHECK-LV-NEXT:    .vsave {d8, d9, d10, d11}
1473; CHECK-LV-NEXT:    vpush {d8, d9, d10, d11}
1474; CHECK-LV-NEXT:    vmov q5, q2
1475; CHECK-LV-NEXT:    vmov.f32 s16, s0
1476; CHECK-LV-NEXT:    vmov.f32 s18, s20
1477; CHECK-LV-NEXT:    vmov.f32 s20, s2
1478; CHECK-LV-NEXT:    vmov.f32 s10, s12
1479; CHECK-LV-NEXT:    vmov.f32 s19, s21
1480; CHECK-LV-NEXT:    vmov.f32 s8, s4
1481; CHECK-LV-NEXT:    vmov.f32 s17, s1
1482; CHECK-LV-NEXT:    vmov.f32 s21, s3
1483; CHECK-LV-NEXT:    vmov q0, q4
1484; CHECK-LV-NEXT:    vmov.f32 s12, s6
1485; CHECK-LV-NEXT:    vmov.f32 s11, s13
1486; CHECK-LV-NEXT:    vmov.f32 s9, s5
1487; CHECK-LV-NEXT:    vmov.f32 s13, s7
1488; CHECK-LV-NEXT:    vmov q1, q5
1489; CHECK-LV-NEXT:    vpop {d8, d9, d10, d11}
1490; CHECK-LV-NEXT:    bx lr
1491;
1492; CHECK-LIS-LABEL: shuffle9_f64:
1493; CHECK-LIS:       @ %bb.0: @ %entry
1494; CHECK-LIS-NEXT:    .vsave {d8, d9, d10, d11}
1495; CHECK-LIS-NEXT:    vpush {d8, d9, d10, d11}
1496; CHECK-LIS-NEXT:    vmov q5, q2
1497; CHECK-LIS-NEXT:    vmov q4, q0
1498; CHECK-LIS-NEXT:    vmov.f32 s2, s20
1499; CHECK-LIS-NEXT:    vmov.f32 s20, s18
1500; CHECK-LIS-NEXT:    vmov.f32 s10, s12
1501; CHECK-LIS-NEXT:    vmov.f32 s3, s21
1502; CHECK-LIS-NEXT:    vmov.f32 s8, s4
1503; CHECK-LIS-NEXT:    vmov.f32 s21, s19
1504; CHECK-LIS-NEXT:    vmov.f32 s12, s6
1505; CHECK-LIS-NEXT:    vmov.f32 s11, s13
1506; CHECK-LIS-NEXT:    vmov.f32 s9, s5
1507; CHECK-LIS-NEXT:    vmov.f32 s13, s7
1508; CHECK-LIS-NEXT:    vmov q1, q5
1509; CHECK-LIS-NEXT:    vpop {d8, d9, d10, d11}
1510; CHECK-LIS-NEXT:    bx lr
1511entry:
1512  %out = shufflevector <4 x double> %src1, <4 x double> %src2, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
1513  ret <8 x double> %out
1514}
1515
1516
1517
1518
; shuffle4_i64: i64 counterpart of the two-source widening shuffle with mask
; <3, 1, 2, 0>: selects %src2[1], %src1[1], %src2[0], %src1[0] into a
; <4 x i64>. The expected output moves every 64-bit lane as a pair of
; s-register copies.
1519define arm_aapcs_vfpcc <4 x i64> @shuffle4_i64(<2 x i64> %src1, <2 x i64> %src2) {
1520; CHECK-LABEL: shuffle4_i64:
1521; CHECK:       @ %bb.0: @ %entry
1522; CHECK-NEXT:    vmov.f32 s8, s6
1523; CHECK-NEXT:    vmov.f32 s6, s0
1524; CHECK-NEXT:    vmov.f32 s9, s7
1525; CHECK-NEXT:    vmov.f32 s7, s1
1526; CHECK-NEXT:    vmov.f32 s10, s2
1527; CHECK-NEXT:    vmov.f32 s11, s3
1528; CHECK-NEXT:    vmov q0, q2
1529; CHECK-NEXT:    bx lr
1530entry:
1531  %out = shufflevector <2 x i64> %src1, <2 x i64> %src2, <4 x i32> <i32 3, i32 1, i32 2, i32 0>
1532  ret <4 x i64> %out
1533}
; shuffle5_i64: widening two-source shuffle producing a <4 x i64>.
; Mask <3, 2, 1, 0> selects %src2[1], %src2[0], %src1[1], %src1[0], i.e. the
; concatenation of the two inputs in fully reversed lane order.
1534define arm_aapcs_vfpcc <4 x i64> @shuffle5_i64(<2 x i64> %src1, <2 x i64> %src2) {
1535; CHECK-LABEL: shuffle5_i64:
1536; CHECK:       @ %bb.0: @ %entry
1537; CHECK-NEXT:    vmov.f32 s8, s6
1538; CHECK-NEXT:    vmov.f32 s10, s4
1539; CHECK-NEXT:    vmov.f32 s4, s2
1540; CHECK-NEXT:    vmov.f32 s6, s0
1541; CHECK-NEXT:    vmov.f32 s9, s7
1542; CHECK-NEXT:    vmov.f32 s11, s5
1543; CHECK-NEXT:    vmov.f32 s5, s3
1544; CHECK-NEXT:    vmov.f32 s7, s1
1545; CHECK-NEXT:    vmov q0, q2
1546; CHECK-NEXT:    bx lr
1547entry:
1548  %out = shufflevector <2 x i64> %src1, <2 x i64> %src2, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
1549  ret <4 x i64> %out
1550}
; shuffle6_i64: keeps lane 0 of %src1 and takes lane 1 from %src2
; (mask <0, 3>). The expected output overwrites the upper 64 bits of q0
; with two s-register copies.
1551define arm_aapcs_vfpcc <2 x i64> @shuffle6_i64(<2 x i64> %src1, <2 x i64> %src2) {
1552; CHECK-LABEL: shuffle6_i64:
1553; CHECK:       @ %bb.0: @ %entry
1554; CHECK-NEXT:    vmov.f32 s2, s6
1555; CHECK-NEXT:    vmov.f32 s3, s7
1556; CHECK-NEXT:    bx lr
1557entry:
1558  %out = shufflevector <2 x i64> %src1, <2 x i64> %src2, <2 x i32> <i32 0, i32 3>
1559  ret <2 x i64> %out
1560}
; shuffle7_i64: result lane 0 is lane 1 of %src2 and result lane 1 is lane 1
; of %src1 (mask <3, 1>). The expected output overwrites the lower 64 bits
; of q0 with two s-register copies.
1561define arm_aapcs_vfpcc <2 x i64> @shuffle7_i64(<2 x i64> %src1, <2 x i64> %src2) {
1562; CHECK-LABEL: shuffle7_i64:
1563; CHECK:       @ %bb.0: @ %entry
1564; CHECK-NEXT:    vmov.f32 s0, s6
1565; CHECK-NEXT:    vmov.f32 s1, s7
1566; CHECK-NEXT:    bx lr
1567entry:
1568  %out = shufflevector <2 x i64> %src1, <2 x i64> %src2, <2 x i32> <i32 3, i32 1>
1569  ret <2 x i64> %out
1570}
; shuffle8_i64: result lane 0 is lane 0 of %src2 and result lane 1 is lane 1
; of %src1 (mask <2, 1>). The expected output patches the upper 64 bits of
; q1 from q0 and then copies q1 to q0.
1571define arm_aapcs_vfpcc <2 x i64> @shuffle8_i64(<2 x i64> %src1, <2 x i64> %src2) {
1572; CHECK-LABEL: shuffle8_i64:
1573; CHECK:       @ %bb.0: @ %entry
1574; CHECK-NEXT:    vmov.f32 s6, s2
1575; CHECK-NEXT:    vmov.f32 s7, s3
1576; CHECK-NEXT:    vmov q0, q1
1577; CHECK-NEXT:    bx lr
1578entry:
1579  %out = shufflevector <2 x i64> %src1, <2 x i64> %src2, <2 x i32> <i32 2, i32 1>
1580  ret <2 x i64> %out
1581}
; shuffle9_i64: interleaves two <4 x i64> inputs lane by lane into an
; <8 x i64> (mask <0, 4, 1, 5, 2, 6, 3, 7>).
; Two separate expectation blocks follow: one for the configurations run
; without -early-live-intervals and one for those run with it (see the
; prefix assignments in the file header). The two blocks differ in their
; copy sequences.
1582define arm_aapcs_vfpcc <8 x i64> @shuffle9_i64(<4 x i64> %src1, <4 x i64> %src2) {
1583; CHECK-LV-LABEL: shuffle9_i64:
1584; CHECK-LV:       @ %bb.0: @ %entry
1585; CHECK-LV-NEXT:    .vsave {d8, d9, d10, d11}
1586; CHECK-LV-NEXT:    vpush {d8, d9, d10, d11}
1587; CHECK-LV-NEXT:    vmov q5, q2
1588; CHECK-LV-NEXT:    vmov.f32 s16, s0
1589; CHECK-LV-NEXT:    vmov.f32 s18, s20
1590; CHECK-LV-NEXT:    vmov.f32 s20, s2
1591; CHECK-LV-NEXT:    vmov.f32 s10, s12
1592; CHECK-LV-NEXT:    vmov.f32 s19, s21
1593; CHECK-LV-NEXT:    vmov.f32 s8, s4
1594; CHECK-LV-NEXT:    vmov.f32 s17, s1
1595; CHECK-LV-NEXT:    vmov.f32 s21, s3
1596; CHECK-LV-NEXT:    vmov q0, q4
1597; CHECK-LV-NEXT:    vmov.f32 s12, s6
1598; CHECK-LV-NEXT:    vmov.f32 s11, s13
1599; CHECK-LV-NEXT:    vmov.f32 s9, s5
1600; CHECK-LV-NEXT:    vmov.f32 s13, s7
1601; CHECK-LV-NEXT:    vmov q1, q5
1602; CHECK-LV-NEXT:    vpop {d8, d9, d10, d11}
1603; CHECK-LV-NEXT:    bx lr
1604;
1605; CHECK-LIS-LABEL: shuffle9_i64:
1606; CHECK-LIS:       @ %bb.0: @ %entry
1607; CHECK-LIS-NEXT:    .vsave {d8, d9, d10, d11}
1608; CHECK-LIS-NEXT:    vpush {d8, d9, d10, d11}
1609; CHECK-LIS-NEXT:    vmov q5, q2
1610; CHECK-LIS-NEXT:    vmov q4, q0
1611; CHECK-LIS-NEXT:    vmov.f32 s2, s20
1612; CHECK-LIS-NEXT:    vmov.f32 s20, s18
1613; CHECK-LIS-NEXT:    vmov.f32 s10, s12
1614; CHECK-LIS-NEXT:    vmov.f32 s3, s21
1615; CHECK-LIS-NEXT:    vmov.f32 s8, s4
1616; CHECK-LIS-NEXT:    vmov.f32 s21, s19
1617; CHECK-LIS-NEXT:    vmov.f32 s12, s6
1618; CHECK-LIS-NEXT:    vmov.f32 s11, s13
1619; CHECK-LIS-NEXT:    vmov.f32 s9, s5
1620; CHECK-LIS-NEXT:    vmov.f32 s13, s7
1621; CHECK-LIS-NEXT:    vmov q1, q5
1622; CHECK-LIS-NEXT:    vpop {d8, d9, d10, d11}
1623; CHECK-LIS-NEXT:    bx lr
1624entry:
1625  %out = shufflevector <4 x i64> %src1, <4 x i64> %src2, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
1626  ret <8 x i64> %out
1627}
1628
1629
; insert_i32 - inserts the scalar i32 %a into lane 0 of an undef <4 x i32>.
; The expected code is a single lane move, vmov.32 q0[0], r0.
1630define arm_aapcs_vfpcc <4 x i32> @insert_i32(i32 %a) {
1631; CHECK-LABEL: insert_i32:
1632; CHECK:       @ %bb.0: @ %entry
1633; CHECK-NEXT:    vmov.32 q0[0], r0
1634; CHECK-NEXT:    bx lr
1635entry:
1636  %res = insertelement <4 x i32> undef, i32 %a, i32 0
1637  ret <4 x i32> %res
1638}
1639
; insert_i16 - inserts the scalar i16 %a into lane 0 of an undef <8 x i16>.
; The expected code is a single lane move, vmov.16 q0[0], r0.
1640define arm_aapcs_vfpcc <8 x i16> @insert_i16(i16 %a) {
1641; CHECK-LABEL: insert_i16:
1642; CHECK:       @ %bb.0: @ %entry
1643; CHECK-NEXT:    vmov.16 q0[0], r0
1644; CHECK-NEXT:    bx lr
1645entry:
1646  %res = insertelement <8 x i16> undef, i16 %a, i32 0
1647  ret <8 x i16> %res
1648}
1649
; insert_i8 - inserts the scalar i8 %a into lane 0 of an undef <16 x i8>.
; The expected code is a single lane move, vmov.8 q0[0], r0.
1650define arm_aapcs_vfpcc <16 x i8> @insert_i8(i8 %a) {
1651; CHECK-LABEL: insert_i8:
1652; CHECK:       @ %bb.0: @ %entry
1653; CHECK-NEXT:    vmov.8 q0[0], r0
1654; CHECK-NEXT:    bx lr
1655entry:
1656  %res = insertelement <16 x i8> undef, i8 %a, i32 0
1657  ret <16 x i8> %res
1658}
1659
; insert_i64 - inserts the scalar i64 %a into element 0 of an undef <2 x i64>.
; The expected code writes the value as two 32-bit lane moves: r0 into q0[0]
; and r1 into q0[1].
1660define arm_aapcs_vfpcc <2 x i64> @insert_i64(i64 %a) {
1661; CHECK-LABEL: insert_i64:
1662; CHECK:       @ %bb.0: @ %entry
1663; CHECK-NEXT:    vmov.32 q0[0], r0
1664; CHECK-NEXT:    vmov.32 q0[1], r1
1665; CHECK-NEXT:    bx lr
1666entry:
1667  %res = insertelement <2 x i64> undef, i64 %a, i32 0
1668  ret <2 x i64> %res
1669}
1670
; insert_f32 - inserts the scalar float %a into lane 0 of an undef <4 x float>.
; The expected code contains no instruction other than the return.
; NOTE(review): presumably because the float argument already arrives in the
; register that overlaps lane 0 of the result - confirm against the ABI.
1671define arm_aapcs_vfpcc <4 x float> @insert_f32(float %a) {
1672; CHECK-LABEL: insert_f32:
1673; CHECK:       @ %bb.0: @ %entry
1674; CHECK-NEXT:    bx lr
1675entry:
1676  %res = insertelement <4 x float> undef, float %a, i32 0
1677  ret <4 x float> %res
1678}
1679
; insert_f16 - inserts the scalar half %a into lane 0 of an undef <8 x half>.
; The expected code contains no instruction other than the return.
; NOTE(review): presumably because the half argument already arrives in the
; register that overlaps lane 0 of the result - confirm against the ABI.
1680define arm_aapcs_vfpcc <8 x half> @insert_f16(half %a) {
1681; CHECK-LABEL: insert_f16:
1682; CHECK:       @ %bb.0: @ %entry
1683; CHECK-NEXT:    bx lr
1684entry:
1685  %res = insertelement <8 x half> undef, half %a, i32 0
1686  ret <8 x half> %res
1687}
1688
; insert_f64 - inserts the scalar double %a into element 0 of an undef
; <2 x double>. The expected code contains no instruction other than the
; return.
; NOTE(review): presumably because the double argument already arrives in the
; register that overlaps element 0 of the result - confirm against the ABI.
1689define arm_aapcs_vfpcc <2 x double> @insert_f64(double %a) {
1690; CHECK-LABEL: insert_f64:
1691; CHECK:       @ %bb.0: @ %entry
1692; CHECK-NEXT:    bx lr
1693entry:
1694  %res = insertelement <2 x double> undef, double %a, i32 0
1695  ret <2 x double> %res
1696}
1697
; scalar_to_vector_i32 - builds a <4 x i16> from element 0 of %v followed by
; elements 1..3 of a constant vector (7, 1, 9), selected by shuffle mask
; <0, 9, 10, 11>, then bitcasts the four halfwords to an i64 return value.
; The expected code loads the constant lanes from a literal pool entry
; (4 zero bytes, then .long 7, 1, 9), extracts lane 0 of the input with
; vmov.u16, inserts it into lane 0 with vmov.32, narrows the vector to the
; stack with vstrh.32, and reloads the 8-byte result into r0/r1 with ldrd
; (which also releases the 8-byte stack slot via its post-increment).
1698define arm_aapcs_vfpcc i64 @scalar_to_vector_i32(<8 x i16> %v) {
1699; CHECK-LABEL: scalar_to_vector_i32:
1700; CHECK:       @ %bb.0: @ %entry
1701; CHECK-NEXT:    .pad #8
1702; CHECK-NEXT:    sub sp, #8
1703; CHECK-NEXT:    adr r2, .LCPI88_0
1704; CHECK-NEXT:    vmov.u16 r0, q0[0]
1705; CHECK-NEXT:    vldrw.u32 q0, [r2]
1706; CHECK-NEXT:    mov r1, sp
1707; CHECK-NEXT:    vmov.32 q0[0], r0
1708; CHECK-NEXT:    vstrh.32 q0, [r1]
1709; CHECK-NEXT:    ldrd r0, r1, [sp], #8
1710; CHECK-NEXT:    bx lr
1711; CHECK-NEXT:    .p2align 4
1712; CHECK-NEXT:  @ %bb.1:
1713; CHECK-NEXT:  .LCPI88_0:
1714; CHECK-NEXT:    .zero 4
1715; CHECK-NEXT:    .long 7 @ 0x7
1716; CHECK-NEXT:    .long 1 @ 0x1
1717; CHECK-NEXT:    .long 9 @ 0x9
1718entry:
1719  %f = shufflevector <8 x i16> %v, <8 x i16> <i16 undef, i16 7, i16 1, i16 9, i16 undef, i16 undef, i16 undef, i16 undef>, <4 x i32> <i32 0, i32 9, i32 10, i32 11>
1720  %0 = bitcast <4 x i16> %f to i64
1721  ret i64 %0
1722}
1723
1724
; extract_i32_0 - returns lane 0 of a <4 x i32>. The expected code is a single
; move from s0 to r0.
1725define arm_aapcs_vfpcc i32 @extract_i32_0(<4 x i32> %a) {
1726; CHECK-LABEL: extract_i32_0:
1727; CHECK:       @ %bb.0: @ %entry
1728; CHECK-NEXT:    vmov r0, s0
1729; CHECK-NEXT:    bx lr
1730entry:
1731  %res = extractelement <4 x i32> %a, i32 0
1732  ret i32 %res
1733}
1734
; extract_i32_3 - returns lane 3 of a <4 x i32>. The expected code is a single
; move from s3 to r0.
1735define arm_aapcs_vfpcc i32 @extract_i32_3(<4 x i32> %a) {
1736; CHECK-LABEL: extract_i32_3:
1737; CHECK:       @ %bb.0: @ %entry
1738; CHECK-NEXT:    vmov r0, s3
1739; CHECK-NEXT:    bx lr
1740entry:
1741  %res = extractelement <4 x i32> %a, i32 3
1742  ret i32 %res
1743}
1744
; extract_i16_0 - returns lane 0 of an <8 x i16>. The expected code is a single
; lane extract, vmov.u16 r0, q0[0].
1745define arm_aapcs_vfpcc i16 @extract_i16_0(<8 x i16> %a) {
1746; CHECK-LABEL: extract_i16_0:
1747; CHECK:       @ %bb.0: @ %entry
1748; CHECK-NEXT:    vmov.u16 r0, q0[0]
1749; CHECK-NEXT:    bx lr
1750entry:
1751  %res = extractelement <8 x i16> %a, i32 0
1752  ret i16 %res
1753}
1754
; extract_i16_3 - returns lane 3 of an <8 x i16>. The expected code is a single
; lane extract, vmov.u16 r0, q0[3].
1755define arm_aapcs_vfpcc i16 @extract_i16_3(<8 x i16> %a) {
1756; CHECK-LABEL: extract_i16_3:
1757; CHECK:       @ %bb.0: @ %entry
1758; CHECK-NEXT:    vmov.u16 r0, q0[3]
1759; CHECK-NEXT:    bx lr
1760entry:
1761  %res = extractelement <8 x i16> %a, i32 3
1762  ret i16 %res
1763}
1764
; extract_i8_0 - returns lane 0 of a <16 x i8>. The expected code is a single
; lane extract, vmov.u8 r0, q0[0].
1765define arm_aapcs_vfpcc i8 @extract_i8_0(<16 x i8> %a) {
1766; CHECK-LABEL: extract_i8_0:
1767; CHECK:       @ %bb.0: @ %entry
1768; CHECK-NEXT:    vmov.u8 r0, q0[0]
1769; CHECK-NEXT:    bx lr
1770entry:
1771  %res = extractelement <16 x i8> %a, i32 0
1772  ret i8 %res
1773}
1774
; extract_i8_3 - returns lane 3 of a <16 x i8>. The expected code is a single
; lane extract, vmov.u8 r0, q0[3].
1775define arm_aapcs_vfpcc i8 @extract_i8_3(<16 x i8> %a) {
1776; CHECK-LABEL: extract_i8_3:
1777; CHECK:       @ %bb.0: @ %entry
1778; CHECK-NEXT:    vmov.u8 r0, q0[3]
1779; CHECK-NEXT:    bx lr
1780entry:
1781  %res = extractelement <16 x i8> %a, i32 3
1782  ret i8 %res
1783}
1784
; extract_i64_0 - returns element 0 of a <2 x i64>. The expected code is a
; single move of d0 into the r0/r1 pair.
1785define arm_aapcs_vfpcc i64 @extract_i64_0(<2 x i64> %a) {
1786; CHECK-LABEL: extract_i64_0:
1787; CHECK:       @ %bb.0: @ %entry
1788; CHECK-NEXT:    vmov r0, r1, d0
1789; CHECK-NEXT:    bx lr
1790entry:
1791  %res = extractelement <2 x i64> %a, i32 0
1792  ret i64 %res
1793}
1794
; extract_i64_1 - returns element 1 of a <2 x i64>. The expected code is a
; single move of d1 into the r0/r1 pair.
1795define arm_aapcs_vfpcc i64 @extract_i64_1(<2 x i64> %a) {
1796; CHECK-LABEL: extract_i64_1:
1797; CHECK:       @ %bb.0: @ %entry
1798; CHECK-NEXT:    vmov r0, r1, d1
1799; CHECK-NEXT:    bx lr
1800entry:
1801  %res = extractelement <2 x i64> %a, i32 1
1802  ret i64 %res
1803}
1804
; extract_f32_0 - returns lane 0 of a <4 x float>. The expected code contains
; no instruction other than the return.
; NOTE(review): presumably lane 0 already sits in the float return register -
; confirm against the ABI.
1805define arm_aapcs_vfpcc float @extract_f32_0(<4 x float> %a) {
1806; CHECK-LABEL: extract_f32_0:
1807; CHECK:       @ %bb.0: @ %entry
1808; CHECK-NEXT:    bx lr
1809entry:
1810  %res = extractelement <4 x float> %a, i32 0
1811  ret float %res
1812}
1813
; extract_f32_3 - returns lane 3 of a <4 x float>. The expected code is a
; single register copy from s3 into s0.
1814define arm_aapcs_vfpcc float @extract_f32_3(<4 x float> %a) {
1815; CHECK-LABEL: extract_f32_3:
1816; CHECK:       @ %bb.0: @ %entry
1817; CHECK-NEXT:    vmov.f32 s0, s3
1818; CHECK-NEXT:    bx lr
1819entry:
1820  %res = extractelement <4 x float> %a, i32 3
1821  ret float %res
1822}
1823
; extract_f16_0 - returns lane 0 of an <8 x half>. The expected code contains
; no instruction other than the return.
; NOTE(review): presumably lane 0 already sits in the half return register -
; confirm against the ABI.
1824define arm_aapcs_vfpcc half @extract_f16_0(<8 x half> %a) {
1825; CHECK-LABEL: extract_f16_0:
1826; CHECK:       @ %bb.0: @ %entry
1827; CHECK-NEXT:    bx lr
1828entry:
1829  %res = extractelement <8 x half> %a, i32 0
1830  ret half %res
1831}
1832
; extract_f16_3 - returns lane 3 of an <8 x half>. The expected code is a
; single vmovx.f16 from s1 into s0.
1833define arm_aapcs_vfpcc half @extract_f16_3(<8 x half> %a) {
1834; CHECK-LABEL: extract_f16_3:
1835; CHECK:       @ %bb.0: @ %entry
1836; CHECK-NEXT:    vmovx.f16 s0, s1
1837; CHECK-NEXT:    bx lr
1838entry:
1839  %res = extractelement <8 x half> %a, i32 3
1840  ret half %res
1841}
1842
; extract_f64_0 - returns element 0 of a <2 x double>. The expected code
; contains no instruction other than the return.
; NOTE(review): presumably element 0 already sits in the double return
; register - confirm against the ABI.
1843define arm_aapcs_vfpcc double @extract_f64_0(<2 x double> %a) {
1844; CHECK-LABEL: extract_f64_0:
1845; CHECK:       @ %bb.0: @ %entry
1846; CHECK-NEXT:    bx lr
1847entry:
1848  %res = extractelement <2 x double> %a, i32 0
1849  ret double %res
1850}
1851
; extract_f64_1 - returns element 1 of a <2 x double>. The expected code moves
; the value as two 32-bit register copies: s2 into s0 and s3 into s1.
1852define arm_aapcs_vfpcc double @extract_f64_1(<2 x double> %a) {
1853; CHECK-LABEL: extract_f64_1:
1854; CHECK:       @ %bb.0: @ %entry
1855; CHECK-NEXT:    vmov.f32 s0, s2
1856; CHECK-NEXT:    vmov.f32 s1, s3
1857; CHECK-NEXT:    bx lr
1858entry:
1859  %res = extractelement <2 x double> %a, i32 1
1860  ret double %res
1861}
1862
1863