xref: /llvm-project/llvm/test/CodeGen/ARM/vdup.ll (revision 752819e813d1de1e76b4b509ad6fbb97b52d2d03)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -mtriple=arm-eabi -float-abi=soft -mattr=+neon -verify-machineinstrs | FileCheck %s
3
; Splat built by inserting the scalar %A into every lane of an <8 x i8>
; (starting from zeroinitializer). The checks expect a single vdup.8 from r0
; into d16, followed by a vmov of d16 into r0/r1 before returning.
4define <8 x i8> @v_dup8(i8 %A) nounwind {
5; CHECK-LABEL: v_dup8:
6; CHECK:       @ %bb.0:
7; CHECK-NEXT:    vdup.8 d16, r0
8; CHECK-NEXT:    vmov r0, r1, d16
9; CHECK-NEXT:    mov pc, lr
10	%tmp1 = insertelement <8 x i8> zeroinitializer, i8 %A, i32 0
11	%tmp2 = insertelement <8 x i8> %tmp1, i8 %A, i32 1
12	%tmp3 = insertelement <8 x i8> %tmp2, i8 %A, i32 2
13	%tmp4 = insertelement <8 x i8> %tmp3, i8 %A, i32 3
14	%tmp5 = insertelement <8 x i8> %tmp4, i8 %A, i32 4
15	%tmp6 = insertelement <8 x i8> %tmp5, i8 %A, i32 5
16	%tmp7 = insertelement <8 x i8> %tmp6, i8 %A, i32 6
17	%tmp8 = insertelement <8 x i8> %tmp7, i8 %A, i32 7
18	ret <8 x i8> %tmp8
19}
20
; Splat built by inserting the scalar %A into all four lanes of a <4 x i16>.
; The checks expect a single vdup.16 from r0 into d16, then a vmov of d16
; into r0/r1.
21define <4 x i16> @v_dup16(i16 %A) nounwind {
22; CHECK-LABEL: v_dup16:
23; CHECK:       @ %bb.0:
24; CHECK-NEXT:    vdup.16 d16, r0
25; CHECK-NEXT:    vmov r0, r1, d16
26; CHECK-NEXT:    mov pc, lr
27	%tmp1 = insertelement <4 x i16> zeroinitializer, i16 %A, i32 0
28	%tmp2 = insertelement <4 x i16> %tmp1, i16 %A, i32 1
29	%tmp3 = insertelement <4 x i16> %tmp2, i16 %A, i32 2
30	%tmp4 = insertelement <4 x i16> %tmp3, i16 %A, i32 3
31	ret <4 x i16> %tmp4
32}
33
; Splat built by inserting the scalar %A into both lanes of a <2 x i32>.
; The checks expect a single vdup.32 from r0 into d16, then a vmov of d16
; into r0/r1.
34define <2 x i32> @v_dup32(i32 %A) nounwind {
35; CHECK-LABEL: v_dup32:
36; CHECK:       @ %bb.0:
37; CHECK-NEXT:    vdup.32 d16, r0
38; CHECK-NEXT:    vmov r0, r1, d16
39; CHECK-NEXT:    mov pc, lr
40	%tmp1 = insertelement <2 x i32> zeroinitializer, i32 %A, i32 0
41	%tmp2 = insertelement <2 x i32> %tmp1, i32 %A, i32 1
42	ret <2 x i32> %tmp2
43}
44
; Floating-point counterpart of v_dup32: the float %A is inserted into both
; lanes of a <2 x float>. The checks expect the same vdup.32 from r0 as in the
; integer case.
45define <2 x float> @v_dupfloat(float %A) nounwind {
46; CHECK-LABEL: v_dupfloat:
47; CHECK:       @ %bb.0:
48; CHECK-NEXT:    vdup.32 d16, r0
49; CHECK-NEXT:    vmov r0, r1, d16
50; CHECK-NEXT:    mov pc, lr
51	%tmp1 = insertelement <2 x float> zeroinitializer, float %A, i32 0
52	%tmp2 = insertelement <2 x float> %tmp1, float %A, i32 1
53	ret <2 x float> %tmp2
54}
55
; 128-bit variant of v_dup8: %A is inserted into all sixteen lanes of a
; <16 x i8>. The checks expect a single vdup.8 from r0 into q8, then d16 and
; d17 moved into r0/r1 and r2/r3.
56define <16 x i8> @v_dupQ8(i8 %A) nounwind {
57; CHECK-LABEL: v_dupQ8:
58; CHECK:       @ %bb.0:
59; CHECK-NEXT:    vdup.8 q8, r0
60; CHECK-NEXT:    vmov r0, r1, d16
61; CHECK-NEXT:    vmov r2, r3, d17
62; CHECK-NEXT:    mov pc, lr
63	%tmp1 = insertelement <16 x i8> zeroinitializer, i8 %A, i32 0
64	%tmp2 = insertelement <16 x i8> %tmp1, i8 %A, i32 1
65	%tmp3 = insertelement <16 x i8> %tmp2, i8 %A, i32 2
66	%tmp4 = insertelement <16 x i8> %tmp3, i8 %A, i32 3
67	%tmp5 = insertelement <16 x i8> %tmp4, i8 %A, i32 4
68	%tmp6 = insertelement <16 x i8> %tmp5, i8 %A, i32 5
69	%tmp7 = insertelement <16 x i8> %tmp6, i8 %A, i32 6
70	%tmp8 = insertelement <16 x i8> %tmp7, i8 %A, i32 7
71	%tmp9 = insertelement <16 x i8> %tmp8, i8 %A, i32 8
72	%tmp10 = insertelement <16 x i8> %tmp9, i8 %A, i32 9
73	%tmp11 = insertelement <16 x i8> %tmp10, i8 %A, i32 10
74	%tmp12 = insertelement <16 x i8> %tmp11, i8 %A, i32 11
75	%tmp13 = insertelement <16 x i8> %tmp12, i8 %A, i32 12
76	%tmp14 = insertelement <16 x i8> %tmp13, i8 %A, i32 13
77	%tmp15 = insertelement <16 x i8> %tmp14, i8 %A, i32 14
78	%tmp16 = insertelement <16 x i8> %tmp15, i8 %A, i32 15
79	ret <16 x i8> %tmp16
80}
81
; 128-bit variant of v_dup16: %A is inserted into all eight lanes of an
; <8 x i16>. The checks expect a single vdup.16 from r0 into q8, then d16 and
; d17 moved into r0/r1 and r2/r3.
82define <8 x i16> @v_dupQ16(i16 %A) nounwind {
83; CHECK-LABEL: v_dupQ16:
84; CHECK:       @ %bb.0:
85; CHECK-NEXT:    vdup.16 q8, r0
86; CHECK-NEXT:    vmov r0, r1, d16
87; CHECK-NEXT:    vmov r2, r3, d17
88; CHECK-NEXT:    mov pc, lr
89	%tmp1 = insertelement <8 x i16> zeroinitializer, i16 %A, i32 0
90	%tmp2 = insertelement <8 x i16> %tmp1, i16 %A, i32 1
91	%tmp3 = insertelement <8 x i16> %tmp2, i16 %A, i32 2
92	%tmp4 = insertelement <8 x i16> %tmp3, i16 %A, i32 3
93	%tmp5 = insertelement <8 x i16> %tmp4, i16 %A, i32 4
94	%tmp6 = insertelement <8 x i16> %tmp5, i16 %A, i32 5
95	%tmp7 = insertelement <8 x i16> %tmp6, i16 %A, i32 6
96	%tmp8 = insertelement <8 x i16> %tmp7, i16 %A, i32 7
97	ret <8 x i16> %tmp8
98}
99
; <4 x i32> splat of %A built from four insertelements. Unlike the other
; Q-sized cases in this file, the checks expect no NEON instruction at all:
; r1, r2 and r3 are each copied from r0 with plain mov instructions.
100define <4 x i32> @v_dupQ32(i32 %A) nounwind {
101; CHECK-LABEL: v_dupQ32:
102; CHECK:       @ %bb.0:
103; CHECK-NEXT:    mov r1, r0
104; CHECK-NEXT:    mov r2, r0
105; CHECK-NEXT:    mov r3, r0
106; CHECK-NEXT:    mov pc, lr
107	%tmp1 = insertelement <4 x i32> zeroinitializer, i32 %A, i32 0
108	%tmp2 = insertelement <4 x i32> %tmp1, i32 %A, i32 1
109	%tmp3 = insertelement <4 x i32> %tmp2, i32 %A, i32 2
110	%tmp4 = insertelement <4 x i32> %tmp3, i32 %A, i32 3
111	ret <4 x i32> %tmp4
112}
113
; <4 x float> splat of %A built from four insertelements. The checks expect a
; vdup.32 from r0 into q8, then d16 and d17 moved into r0/r1 and r2/r3.
114define <4 x float> @v_dupQfloat(float %A) nounwind {
115; CHECK-LABEL: v_dupQfloat:
116; CHECK:       @ %bb.0:
117; CHECK-NEXT:    vdup.32 q8, r0
118; CHECK-NEXT:    vmov r0, r1, d16
119; CHECK-NEXT:    vmov r2, r3, d17
120; CHECK-NEXT:    mov pc, lr
121	%tmp1 = insertelement <4 x float> zeroinitializer, float %A, i32 0
122	%tmp2 = insertelement <4 x float> %tmp1, float %A, i32 1
123	%tmp3 = insertelement <4 x float> %tmp2, float %A, i32 2
124	%tmp4 = insertelement <4 x float> %tmp3, float %A, i32 3
125	ret <4 x float> %tmp4
126}
127
128; Check to make sure it works with shuffles, too.
129
; Splat written as insertelement into lane 0 of undef followed by a
; shufflevector with an all-zero mask (every result lane selects element 0).
; The checks expect the same vdup.8 from r0 as the insertelement-chain form.
130define <8 x i8> @v_shuffledup8(i8 %A) nounwind {
131; CHECK-LABEL: v_shuffledup8:
132; CHECK:       @ %bb.0:
133; CHECK-NEXT:    vdup.8 d16, r0
134; CHECK-NEXT:    vmov r0, r1, d16
135; CHECK-NEXT:    mov pc, lr
136	%tmp1 = insertelement <8 x i8> undef, i8 %A, i32 0
137	%tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <8 x i32> zeroinitializer
138	ret <8 x i8> %tmp2
139}
140
; <4 x i16> splat written as insertelement into lane 0 plus a shufflevector
; with an all-zero mask. The checks expect a vdup.16 from r0 into d16.
141define <4 x i16> @v_shuffledup16(i16 %A) nounwind {
142; CHECK-LABEL: v_shuffledup16:
143; CHECK:       @ %bb.0:
144; CHECK-NEXT:    vdup.16 d16, r0
145; CHECK-NEXT:    vmov r0, r1, d16
146; CHECK-NEXT:    mov pc, lr
147	%tmp1 = insertelement <4 x i16> undef, i16 %A, i32 0
148	%tmp2 = shufflevector <4 x i16> %tmp1, <4 x i16> undef, <4 x i32> zeroinitializer
149	ret <4 x i16> %tmp2
150}
151
; <2 x i32> splat written as insertelement into lane 0 plus a shufflevector
; with an all-zero mask. The checks expect a vdup.32 from r0 into d16.
152define <2 x i32> @v_shuffledup32(i32 %A) nounwind {
153; CHECK-LABEL: v_shuffledup32:
154; CHECK:       @ %bb.0:
155; CHECK-NEXT:    vdup.32 d16, r0
156; CHECK-NEXT:    vmov r0, r1, d16
157; CHECK-NEXT:    mov pc, lr
158	%tmp1 = insertelement <2 x i32> undef, i32 %A, i32 0
159	%tmp2 = shufflevector <2 x i32> %tmp1, <2 x i32> undef, <2 x i32> zeroinitializer
160	ret <2 x i32> %tmp2
161}
162
; <2 x float> splat written as insertelement into lane 0 plus a shufflevector
; with an all-zero mask. The checks expect a vdup.32 from r0 into d16.
163define <2 x float> @v_shuffledupfloat(float %A) nounwind {
164; CHECK-LABEL: v_shuffledupfloat:
165; CHECK:       @ %bb.0:
166; CHECK-NEXT:    vdup.32 d16, r0
167; CHECK-NEXT:    vmov r0, r1, d16
168; CHECK-NEXT:    mov pc, lr
169	%tmp1 = insertelement <2 x float> undef, float %A, i32 0
170	%tmp2 = shufflevector <2 x float> %tmp1, <2 x float> undef, <2 x i32> zeroinitializer
171	ret <2 x float> %tmp2
172}
173
; <16 x i8> splat written as insertelement into lane 0 plus a shufflevector
; with an all-zero mask. The checks expect a vdup.8 from r0 into q8, then d16
; and d17 moved into r0/r1 and r2/r3.
174define <16 x i8> @v_shuffledupQ8(i8 %A) nounwind {
175; CHECK-LABEL: v_shuffledupQ8:
176; CHECK:       @ %bb.0:
177; CHECK-NEXT:    vdup.8 q8, r0
178; CHECK-NEXT:    vmov r0, r1, d16
179; CHECK-NEXT:    vmov r2, r3, d17
180; CHECK-NEXT:    mov pc, lr
181	%tmp1 = insertelement <16 x i8> undef, i8 %A, i32 0
182	%tmp2 = shufflevector <16 x i8> %tmp1, <16 x i8> undef, <16 x i32> zeroinitializer
183	ret <16 x i8> %tmp2
184}
185
; <8 x i16> splat written as insertelement into lane 0 plus a shufflevector
; with an all-zero mask. The checks expect a vdup.16 from r0 into q8, then d16
; and d17 moved into r0/r1 and r2/r3.
186define <8 x i16> @v_shuffledupQ16(i16 %A) nounwind {
187; CHECK-LABEL: v_shuffledupQ16:
188; CHECK:       @ %bb.0:
189; CHECK-NEXT:    vdup.16 q8, r0
190; CHECK-NEXT:    vmov r0, r1, d16
191; CHECK-NEXT:    vmov r2, r3, d17
192; CHECK-NEXT:    mov pc, lr
193	%tmp1 = insertelement <8 x i16> undef, i16 %A, i32 0
194	%tmp2 = shufflevector <8 x i16> %tmp1, <8 x i16> undef, <8 x i32> zeroinitializer
195	ret <8 x i16> %tmp2
196}
197
; <4 x i32> splat written as insertelement into lane 0 plus a shufflevector
; with an all-zero mask. In contrast to v_dupQ32 (which is checked as plain
; GPR copies), the checks here expect a vdup.32 from r0 into q8 followed by
; vmovs of d16/d17 into r0-r3.
198define <4 x i32> @v_shuffledupQ32(i32 %A) nounwind {
199; CHECK-LABEL: v_shuffledupQ32:
200; CHECK:       @ %bb.0:
201; CHECK-NEXT:    vdup.32 q8, r0
202; CHECK-NEXT:    vmov r0, r1, d16
203; CHECK-NEXT:    vmov r2, r3, d17
204; CHECK-NEXT:    mov pc, lr
205	%tmp1 = insertelement <4 x i32> undef, i32 %A, i32 0
206	%tmp2 = shufflevector <4 x i32> %tmp1, <4 x i32> undef, <4 x i32> zeroinitializer
207	ret <4 x i32> %tmp2
208}
209
; <4 x float> splat written as insertelement into lane 0 plus a shufflevector
; with an all-zero mask. The checks expect a vdup.32 from r0 into q8, then d16
; and d17 moved into r0/r1 and r2/r3.
210define <4 x float> @v_shuffledupQfloat(float %A) nounwind {
211; CHECK-LABEL: v_shuffledupQfloat:
212; CHECK:       @ %bb.0:
213; CHECK-NEXT:    vdup.32 q8, r0
214; CHECK-NEXT:    vmov r0, r1, d16
215; CHECK-NEXT:    vmov r2, r3, d17
216; CHECK-NEXT:    mov pc, lr
217	%tmp1 = insertelement <4 x float> undef, float %A, i32 0
218	%tmp2 = shufflevector <4 x float> %tmp1, <4 x float> undef, <4 x i32> zeroinitializer
219	ret <4 x float> %tmp2
220}
221
; Lane splat: every result lane selects element 1 of the <8 x i8> argument.
; The function uses the arm_aapcs_vfpcc calling convention, and the checks
; expect a single in-place vdup.8 d0, d0[1] with no GPR traffic.
222define arm_aapcs_vfpcc <8 x i8> @vduplane8(<8 x i8> %A) nounwind {
223; CHECK-LABEL: vduplane8:
224; CHECK:       @ %bb.0:
225; CHECK-NEXT:    vdup.8 d0, d0[1]
226; CHECK-NEXT:    mov pc, lr
227	%tmp2 = shufflevector <8 x i8> %A, <8 x i8> undef, <8 x i32> < i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1 >
228	ret <8 x i8> %tmp2
229}
230
; Lane splat: every result lane selects element 1 of the <4 x i16> argument
; (arm_aapcs_vfpcc calling convention). The checks expect vdup.16 d0, d0[1].
231define arm_aapcs_vfpcc <4 x i16> @vduplane16(<4 x i16> %A) nounwind {
232; CHECK-LABEL: vduplane16:
233; CHECK:       @ %bb.0:
234; CHECK-NEXT:    vdup.16 d0, d0[1]
235; CHECK-NEXT:    mov pc, lr
236	%tmp2 = shufflevector <4 x i16> %A, <4 x i16> undef, <4 x i32> < i32 1, i32 1, i32 1, i32 1 >
237	ret <4 x i16> %tmp2
238}
239
; Lane splat: both result lanes select element 1 of the <2 x i32> argument
; (arm_aapcs_vfpcc calling convention). The checks expect vdup.32 d0, d0[1].
240define arm_aapcs_vfpcc <2 x i32> @vduplane32(<2 x i32> %A) nounwind {
241; CHECK-LABEL: vduplane32:
242; CHECK:       @ %bb.0:
243; CHECK-NEXT:    vdup.32 d0, d0[1]
244; CHECK-NEXT:    mov pc, lr
245	%tmp2 = shufflevector <2 x i32> %A, <2 x i32> undef, <2 x i32> < i32 1, i32 1 >
246	ret <2 x i32> %tmp2
247}
248
; Lane splat: both result lanes select element 1 of the <2 x float> argument
; (arm_aapcs_vfpcc calling convention). The checks expect vdup.32 d0, d0[1],
; the same instruction as the <2 x i32> case.
249define arm_aapcs_vfpcc <2 x float> @vduplanefloat(<2 x float> %A) nounwind {
250; CHECK-LABEL: vduplanefloat:
251; CHECK:       @ %bb.0:
252; CHECK-NEXT:    vdup.32 d0, d0[1]
253; CHECK-NEXT:    mov pc, lr
254	%tmp2 = shufflevector <2 x float> %A, <2 x float> undef, <2 x i32> < i32 1, i32 1 >
255	ret <2 x float> %tmp2
256}
257
; Widening lane splat: element 1 of the 64-bit <8 x i8> argument is
; replicated into all sixteen lanes of a <16 x i8> result (arm_aapcs_vfpcc).
; The checks expect vdup.8 q0, d0[1], preceded by the register-kill
; annotation that marks d0 as part of q0.
258define arm_aapcs_vfpcc <16 x i8> @vduplaneQ8(<8 x i8> %A) nounwind {
259; CHECK-LABEL: vduplaneQ8:
260; CHECK:       @ %bb.0:
261; CHECK-NEXT:    @ kill: def $d0 killed $d0 def $q0
262; CHECK-NEXT:    vdup.8 q0, d0[1]
263; CHECK-NEXT:    mov pc, lr
264	%tmp2 = shufflevector <8 x i8> %A, <8 x i8> undef, <16 x i32> < i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1 >
265	ret <16 x i8> %tmp2
266}
267
; Widening lane splat: element 1 of the <4 x i16> argument is replicated into
; all eight lanes of an <8 x i16> result (arm_aapcs_vfpcc). The checks expect
; vdup.16 q0, d0[1] after the d0/q0 register-kill annotation.
268define arm_aapcs_vfpcc <8 x i16> @vduplaneQ16(<4 x i16> %A) nounwind {
269; CHECK-LABEL: vduplaneQ16:
270; CHECK:       @ %bb.0:
271; CHECK-NEXT:    @ kill: def $d0 killed $d0 def $q0
272; CHECK-NEXT:    vdup.16 q0, d0[1]
273; CHECK-NEXT:    mov pc, lr
274	%tmp2 = shufflevector <4 x i16> %A, <4 x i16> undef, <8 x i32> < i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1 >
275	ret <8 x i16> %tmp2
276}
277
; Widening lane splat: element 1 of the <2 x i32> argument is replicated into
; all four lanes of a <4 x i32> result (arm_aapcs_vfpcc). The checks expect
; vdup.32 q0, d0[1] after the d0/q0 register-kill annotation.
278define arm_aapcs_vfpcc <4 x i32> @vduplaneQ32(<2 x i32> %A) nounwind {
279; CHECK-LABEL: vduplaneQ32:
280; CHECK:       @ %bb.0:
281; CHECK-NEXT:    @ kill: def $d0 killed $d0 def $q0
282; CHECK-NEXT:    vdup.32 q0, d0[1]
283; CHECK-NEXT:    mov pc, lr
284	%tmp2 = shufflevector <2 x i32> %A, <2 x i32> undef, <4 x i32> < i32 1, i32 1, i32 1, i32 1 >
285	ret <4 x i32> %tmp2
286}
287
; Widening lane splat: element 1 of the <2 x float> argument is replicated
; into all four lanes of a <4 x float> result (arm_aapcs_vfpcc). The checks
; expect vdup.32 q0, d0[1] after the d0/q0 register-kill annotation.
288define arm_aapcs_vfpcc <4 x float> @vduplaneQfloat(<2 x float> %A) nounwind {
289; CHECK-LABEL: vduplaneQfloat:
290; CHECK:       @ %bb.0:
291; CHECK-NEXT:    @ kill: def $d0 killed $d0 def $q0
292; CHECK-NEXT:    vdup.32 q0, d0[1]
293; CHECK-NEXT:    mov pc, lr
294	%tmp2 = shufflevector <2 x float> %A, <2 x float> undef, <4 x i32> < i32 1, i32 1, i32 1, i32 1 >
295	ret <4 x float> %tmp2
296}
297
; Splat of element 1 of a <2 x i64> into both lanes. The checks expect only
; GPR moves (r2 into r0, r3 into r1) and no NEON instruction.
298define <2 x i64> @foo(<2 x i64> %arg0_int64x1_t) nounwind readnone {
299; CHECK-LABEL: foo:
300; CHECK:       @ %bb.0: @ %entry
301; CHECK-NEXT:    mov r0, r2
302; CHECK-NEXT:    mov r1, r3
303; CHECK-NEXT:    mov pc, lr
304entry:
305  %0 = shufflevector <2 x i64> %arg0_int64x1_t, <2 x i64> undef, <2 x i32> <i32 1, i32 1>
306  ret <2 x i64> %0
307}
308
; Splat of element 0 of a <2 x i64> into both lanes. The checks expect only
; GPR moves (r0 into r2, r1 into r3) and no NEON instruction.
309define <2 x i64> @bar(<2 x i64> %arg0_int64x1_t) nounwind readnone {
310; CHECK-LABEL: bar:
311; CHECK:       @ %bb.0: @ %entry
312; CHECK-NEXT:    mov r2, r0
313; CHECK-NEXT:    mov r3, r1
314; CHECK-NEXT:    mov pc, lr
315entry:
316  %0 = shufflevector <2 x i64> %arg0_int64x1_t, <2 x i64> undef, <2 x i32> <i32 0, i32 0>
317  ret <2 x i64> %0
318}
319
; <2 x double> counterpart of foo: element 1 is duplicated into both lanes.
; The checks expect the same GPR-only sequence (r2 into r0, r3 into r1).
320define <2 x double> @baz(<2 x double> %arg0_int64x1_t) nounwind readnone {
321; CHECK-LABEL: baz:
322; CHECK:       @ %bb.0: @ %entry
323; CHECK-NEXT:    mov r0, r2
324; CHECK-NEXT:    mov r1, r3
325; CHECK-NEXT:    mov pc, lr
326entry:
327  %0 = shufflevector <2 x double> %arg0_int64x1_t, <2 x double> undef, <2 x i32> <i32 1, i32 1>
328  ret <2 x double> %0
329}
330
; <2 x double> counterpart of bar: element 0 is duplicated into both lanes.
; The checks expect the same GPR-only sequence (r0 into r2, r1 into r3).
331define <2 x double> @qux(<2 x double> %arg0_int64x1_t) nounwind readnone {
332; CHECK-LABEL: qux:
333; CHECK:       @ %bb.0: @ %entry
334; CHECK-NEXT:    mov r2, r0
335; CHECK-NEXT:    mov r3, r1
336; CHECK-NEXT:    mov pc, lr
337entry:
338  %0 = shufflevector <2 x double> %arg0_int64x1_t, <2 x double> undef, <2 x i32> <i32 0, i32 0>
339  ret <2 x double> %0
340}
341
342; Radar 7373643
; Stores a splat of the constant i8 -128 to %ptr. The checks expect the
; constant vector to be materialized directly with vmov.i8 d16, #0x80 (no
; vdup from a GPR) and written with a single vstr.
343define void @redundantVdup(ptr %ptr) nounwind {
344; CHECK-LABEL: redundantVdup:
345; CHECK:       @ %bb.0:
346; CHECK-NEXT:    vmov.i8 d16, #0x80
347; CHECK-NEXT:    vstr d16, [r0]
348; CHECK-NEXT:    mov pc, lr
349  %1 = insertelement <8 x i8> undef, i8 -128, i32 0
350  %2 = shufflevector <8 x i8> %1, <8 x i8> undef, <8 x i32> zeroinitializer
351  store <8 x i8> %2, ptr %ptr, align 8
352  ret void
353}
354
; Partial splat: %x goes into lanes 0-2 of a <4 x i32> and %y into lane 3.
; The checks expect GPR moves only: r1 (%y) is copied to r3 first, then r0
; (%x) is copied into r1 and r2.
355define <4 x i32> @tdupi(i32 %x, i32 %y) {
356; CHECK-LABEL: tdupi:
357; CHECK:       @ %bb.0:
358; CHECK-NEXT:    mov r3, r1
359; CHECK-NEXT:    mov r1, r0
360; CHECK-NEXT:    mov r2, r0
361; CHECK-NEXT:    mov pc, lr
362  %1 = insertelement <4 x i32> undef, i32 %x, i32 0
363  %2 = insertelement <4 x i32> %1, i32 %x, i32 1
364  %3 = insertelement <4 x i32> %2, i32 %x, i32 2
365  %4 = insertelement <4 x i32> %3, i32 %y, i32 3
366  ret <4 x i32> %4
367}
368
; Floating-point partial splat: %x goes into lanes 0-2 of a <4 x float> and
; %y into lane 3. The checks expect a vdup.32 of r0 across q0, the last lane
; overwritten with vmov s3, r1, and then d0/d1 moved into r0-r3.
369define <4 x float> @tdupf(float %x, float %y) {
370; CHECK-LABEL: tdupf:
371; CHECK:       @ %bb.0:
372; CHECK-NEXT:    vdup.32 q0, r0
373; CHECK-NEXT:    vmov s3, r1
374; CHECK-NEXT:    vmov r0, r1, d0
375; CHECK-NEXT:    vmov r2, r3, d1
376; CHECK-NEXT:    mov pc, lr
377  %1 = insertelement <4 x float> undef, float %x, i32 0
378  %2 = insertelement <4 x float> %1, float %x, i32 1
379  %3 = insertelement <4 x float> %2, float %x, i32 2
380  %4 = insertelement <4 x float> %3, float %y, i32 3
381  ret <4 x float> %4
382}
383
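384; This test splats an element extracted from one vector into another. It was
385; written to check that the value isn't moved out to GPRs first.
; NOTE(review): the current expected output does read the lane into r0
; (vmov.32 r0, d16[1]) and copies it with mov, presumably because this file
; is run with -float-abi=soft -- confirm the comment still matches the intent.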
; Extracts lane 1 of %invec, inserts it into lanes 0-2 of the result, and
; places the constant 255 in lane 3. The checks expect the lane to be read
; with vmov.32 r0, d16[1], copied into r1 and r2, and 255 materialized in r3.
386define <4 x i32> @tduplane(<4 x i32> %invec) {
387; CHECK-LABEL: tduplane:
388; CHECK:       @ %bb.0:
389; CHECK-NEXT:    vmov d16, r0, r1
390; CHECK-NEXT:    mov r3, #255
391; CHECK-NEXT:    vmov.32 r0, d16[1]
392; CHECK-NEXT:    mov r1, r0
393; CHECK-NEXT:    mov r2, r0
394; CHECK-NEXT:    mov pc, lr
395  %in = extractelement <4 x i32> %invec, i32 1
396  %1 = insertelement <4 x i32> undef, i32 %in, i32 0
397  %2 = insertelement <4 x i32> %1, i32 %in, i32 1
398  %3 = insertelement <4 x i32> %2, i32 %in, i32 2
399  %4 = insertelement <4 x i32> %3, i32 255, i32 3
400  ret <4 x i32> %4
401}
402
; Splat of element 3 of a <4 x float> into a <2 x float>. The checks expect
; only the upper half of the input (r2/r3) to be moved into d16 and the
; element then duplicated with vdup.32 d16, d16[1].
403define <2 x float> @check_f32(<4 x float> %v) nounwind {
404; CHECK-LABEL: check_f32:
405; CHECK:       @ %bb.0:
406; CHECK-NEXT:    vmov d16, r2, r3
407; CHECK-NEXT:    vdup.32 d16, d16[1]
408; CHECK-NEXT:    vmov r0, r1, d16
409; CHECK-NEXT:    mov pc, lr
410  %x = extractelement <4 x float> %v, i32 3
411  %1 = insertelement  <2 x float> undef, float %x, i32 0
412  %2 = insertelement  <2 x float> %1, float %x, i32 1
413  ret <2 x float> %2
414}
415
; Integer counterpart of check_f32: element 3 of a <4 x i32> is splatted into
; a <2 x i32>. The checks expect r2/r3 moved into d16 followed by
; vdup.32 d16, d16[1].
416define <2 x i32> @check_i32(<4 x i32> %v) nounwind {
417; CHECK-LABEL: check_i32:
418; CHECK:       @ %bb.0:
419; CHECK-NEXT:    vmov d16, r2, r3
420; CHECK-NEXT:    vdup.32 d16, d16[1]
421; CHECK-NEXT:    vmov r0, r1, d16
422; CHECK-NEXT:    mov pc, lr
423  %x = extractelement <4 x i32> %v, i32 3
424  %1 = insertelement  <2 x i32> undef, i32 %x, i32 0
425  %2 = insertelement  <2 x i32> %1, i32 %x, i32 1
426  ret <2 x i32> %2
427}
428
; Element 3 of an <8 x i16> is inserted into lanes 0 and 1 of a <4 x i16>;
; lanes 2 and 3 of the result are left undef. The checks expect the low half
; of the input (r0/r1) moved into d16 and a full vdup.16 d16, d16[3].
429define <4 x i16> @check_i16(<8 x i16> %v) nounwind {
430; CHECK-LABEL: check_i16:
431; CHECK:       @ %bb.0:
432; CHECK-NEXT:    vmov d16, r0, r1
433; CHECK-NEXT:    vdup.16 d16, d16[3]
434; CHECK-NEXT:    vmov r0, r1, d16
435; CHECK-NEXT:    mov pc, lr
436  %x = extractelement <8 x i16> %v, i32 3
437  %1 = insertelement  <4 x i16> undef, i16 %x, i32 0
438  %2 = insertelement  <4 x i16> %1, i16 %x, i32 1
439  ret <4 x i16> %2
440}
441
; Element 3 of a <16 x i8> is inserted into lanes 0 and 1 of an <8 x i8>; the
; remaining result lanes are left undef. The checks expect the low half of
; the input (r0/r1) moved into d16 and a full vdup.8 d16, d16[3].
442define <8 x i8> @check_i8(<16 x i8> %v) nounwind {
443; CHECK-LABEL: check_i8:
444; CHECK:       @ %bb.0:
445; CHECK-NEXT:    vmov d16, r0, r1
446; CHECK-NEXT:    vdup.8 d16, d16[3]
447; CHECK-NEXT:    vmov r0, r1, d16
448; CHECK-NEXT:    mov pc, lr
449  %x = extractelement <16 x i8> %v, i32 3
450  %1 = insertelement  <8  x i8> undef, i8 %x, i32 0
451  %2 = insertelement  <8  x i8> %1, i8 %x, i32 1
452  ret <8 x i8> %2
453}
454
455; Check that an SPR splat produces a vdup.
456
; Converts the i16 %q to float, splats it across a <2 x float>, and subtracts
; %p from the splat. The checks expect %q to be sign-extended in r2 (lsl/asr
; by 16), converted in s0 with vcvt.f32.s32, and then splatted straight from
; that register with vdup.32 d17, d0[0] before the vsub.f32.
457define <2 x float> @check_spr_splat2(<2 x float> %p, i16 %q) {
458; CHECK-LABEL: check_spr_splat2:
459; CHECK:       @ %bb.0:
460; CHECK-NEXT:    lsl r2, r2, #16
461; CHECK-NEXT:    vmov d16, r0, r1
462; CHECK-NEXT:    asr r2, r2, #16
463; CHECK-NEXT:    vmov s0, r2
464; CHECK-NEXT:    vcvt.f32.s32 s0, s0
465; CHECK-NEXT:    vdup.32 d17, d0[0]
466; CHECK-NEXT:    vsub.f32 d16, d17, d16
467; CHECK-NEXT:    vmov r0, r1, d16
468; CHECK-NEXT:    mov pc, lr
469  %conv = sitofp i16 %q to float
470  %splat.splatinsert = insertelement <2 x float> undef, float %conv, i32 0
471  %splat.splat = shufflevector <2 x float> %splat.splatinsert, <2 x float> undef, <2 x i32> zeroinitializer
472  %sub = fsub <2 x float> %splat.splat, %p
473  ret <2 x float> %sub
474}
475
; 128-bit variant of check_spr_splat2: the i16 %q is converted to float,
; splatted across a <4 x float>, and %p is subtracted from the splat. The
; checks expect %q to be loaded from the stack with ldrsh, converted in s0,
; and splatted with vdup.32 q9, d0[0] before the vsub.f32 on Q registers.
476define <4 x float> @check_spr_splat4(<4 x float> %p, i16 %q) {
477; CHECK-LABEL: check_spr_splat4:
478; CHECK:       @ %bb.0:
479; CHECK-NEXT:    ldrsh r12, [sp]
480; CHECK-NEXT:    vmov d17, r2, r3
481; CHECK-NEXT:    vmov d16, r0, r1
482; CHECK-NEXT:    vmov s0, r12
483; CHECK-NEXT:    vcvt.f32.s32 s0, s0
484; CHECK-NEXT:    vdup.32 q9, d0[0]
485; CHECK-NEXT:    vsub.f32 q8, q9, q8
486; CHECK-NEXT:    vmov r0, r1, d16
487; CHECK-NEXT:    vmov r2, r3, d17
488; CHECK-NEXT:    mov pc, lr
489  %conv = sitofp i16 %q to float
490  %splat.splatinsert = insertelement <4 x float> undef, float %conv, i32 0
491  %splat.splat = shufflevector <4 x float> %splat.splatinsert, <4 x float> undef, <4 x i32> zeroinitializer
492  %sub = fsub <4 x float> %splat.splat, %p
493  ret <4 x float> %sub
494}
495; Same codegen as the test above: the converted scalar is splatted with vdup.32 from d0[0], so the shuffle index is irrelevant.
; Variant of check_spr_splat4 in which the converted scalar is inserted at
; lane 1 and the shuffle mask selects lane 1 for every result element. The
; expected instruction sequence is identical, including vdup.32 q9, d0[0].
496define <4 x float> @check_spr_splat4_lane1(<4 x float> %p, i16 %q) {
497; CHECK-LABEL: check_spr_splat4_lane1:
498; CHECK:       @ %bb.0:
499; CHECK-NEXT:    ldrsh r12, [sp]
500; CHECK-NEXT:    vmov d17, r2, r3
501; CHECK-NEXT:    vmov d16, r0, r1
502; CHECK-NEXT:    vmov s0, r12
503; CHECK-NEXT:    vcvt.f32.s32 s0, s0
504; CHECK-NEXT:    vdup.32 q9, d0[0]
505; CHECK-NEXT:    vsub.f32 q8, q9, q8
506; CHECK-NEXT:    vmov r0, r1, d16
507; CHECK-NEXT:    vmov r2, r3, d17
508; CHECK-NEXT:    mov pc, lr
509  %conv = sitofp i16 %q to float
510  %splat.splatinsert = insertelement <4 x float> undef, float %conv, i32 1
511  %splat.splat = shufflevector <4 x float> %splat.splatinsert, <4 x float> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
512  %sub = fsub <4 x float> %splat.splat, %p
513  ret <4 x float> %sub
514}
515
516; Also make sure we don't barf on variable-index extractelts, where we almost
517; could have generated a vdup.
518
; Extracts a byte from %v at the run-time index %idx and inserts it into
; lanes 0 and 1 of an <8 x i8>. No lane-indexed vdup is expected here: the
; checks show the vector stored to a 16-byte-aligned stack slot with vst1.64,
; the address post-incremented by (%idx & 15), and the selected byte reloaded
; into every lane with vld1.8 {d16[]}.
519define <8 x i8> @check_i8_varidx(<16 x i8> %v, i32 %idx) {
520; CHECK-LABEL: check_i8_varidx:
521; CHECK:       @ %bb.0:
522; CHECK-NEXT:    .save {r11}
523; CHECK-NEXT:    push {r11}
524; CHECK-NEXT:    .setfp r11, sp
525; CHECK-NEXT:    mov r11, sp
526; CHECK-NEXT:    .pad #28
527; CHECK-NEXT:    sub sp, sp, #28
528; CHECK-NEXT:    bic sp, sp, #15
529; CHECK-NEXT:    ldr r12, [r11, #4]
530; CHECK-NEXT:    vmov d17, r2, r3
531; CHECK-NEXT:    vmov d16, r0, r1
532; CHECK-NEXT:    mov r1, sp
533; CHECK-NEXT:    and r0, r12, #15
534; CHECK-NEXT:    vst1.64 {d16, d17}, [r1:128], r0
535; CHECK-NEXT:    vld1.8 {d16[]}, [r1]
536; CHECK-NEXT:    vmov r0, r1, d16
537; CHECK-NEXT:    mov sp, r11
538; CHECK-NEXT:    pop {r11}
539; CHECK-NEXT:    mov pc, lr
540  %x = extractelement <16 x i8> %v, i32 %idx
541  %1 = insertelement  <8 x i8> undef, i8 %x, i32 0
542  %2 = insertelement  <8 x i8> %1, i8 %x, i32 1
543  ret <8 x i8> %2
544}
545