xref: /llvm-project/llvm/test/CodeGen/ARM/vdup.ll (revision c6dfaa0e836c1b63366b1010157538d9c346a8b3)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -mtriple=arm-eabi -float-abi=soft -mattr=+neon -verify-machineinstrs | FileCheck %s
3
; Splat of a scalar built with an insertelement chain: %A is written into all
; eight lanes (0-7) of an <8 x i8>. The CHECK lines expect the chain to
; collapse into a single "vdup.8 d16, r0".
4define <8 x i8> @v_dup8(i8 %A) nounwind {
5; CHECK-LABEL: v_dup8:
6; CHECK:       @ %bb.0:
7; CHECK-NEXT:    vdup.8 d16, r0
8; CHECK-NEXT:    vmov r0, r1, d16
9; CHECK-NEXT:    mov pc, lr
10	%tmp1 = insertelement <8 x i8> zeroinitializer, i8 %A, i32 0
11	%tmp2 = insertelement <8 x i8> %tmp1, i8 %A, i32 1
12	%tmp3 = insertelement <8 x i8> %tmp2, i8 %A, i32 2
13	%tmp4 = insertelement <8 x i8> %tmp3, i8 %A, i32 3
14	%tmp5 = insertelement <8 x i8> %tmp4, i8 %A, i32 4
15	%tmp6 = insertelement <8 x i8> %tmp5, i8 %A, i32 5
16	%tmp7 = insertelement <8 x i8> %tmp6, i8 %A, i32 6
17	%tmp8 = insertelement <8 x i8> %tmp7, i8 %A, i32 7
18	ret <8 x i8> %tmp8
19}
20
; %A is inserted into all four lanes (0-3) of a <4 x i16>. The CHECK lines
; expect a single "vdup.16 d16, r0".
21define <4 x i16> @v_dup16(i16 %A) nounwind {
22; CHECK-LABEL: v_dup16:
23; CHECK:       @ %bb.0:
24; CHECK-NEXT:    vdup.16 d16, r0
25; CHECK-NEXT:    vmov r0, r1, d16
26; CHECK-NEXT:    mov pc, lr
27	%tmp1 = insertelement <4 x i16> zeroinitializer, i16 %A, i32 0
28	%tmp2 = insertelement <4 x i16> %tmp1, i16 %A, i32 1
29	%tmp3 = insertelement <4 x i16> %tmp2, i16 %A, i32 2
30	%tmp4 = insertelement <4 x i16> %tmp3, i16 %A, i32 3
31	ret <4 x i16> %tmp4
32}
33
; %A is inserted into both lanes of a <2 x i32>. The CHECK lines expect a
; single "vdup.32 d16, r0".
34define <2 x i32> @v_dup32(i32 %A) nounwind {
35; CHECK-LABEL: v_dup32:
36; CHECK:       @ %bb.0:
37; CHECK-NEXT:    vdup.32 d16, r0
38; CHECK-NEXT:    vmov r0, r1, d16
39; CHECK-NEXT:    mov pc, lr
40	%tmp1 = insertelement <2 x i32> zeroinitializer, i32 %A, i32 0
41	%tmp2 = insertelement <2 x i32> %tmp1, i32 %A, i32 1
42	ret <2 x i32> %tmp2
43}
44
; Float variant: %A is inserted into both lanes of a <2 x float>. The CHECK
; lines expect the same "vdup.32 d16, r0" as the i32 case, i.e. the float is
; duplicated straight from r0.
45define <2 x float> @v_dupfloat(float %A) nounwind {
46; CHECK-LABEL: v_dupfloat:
47; CHECK:       @ %bb.0:
48; CHECK-NEXT:    vdup.32 d16, r0
49; CHECK-NEXT:    vmov r0, r1, d16
50; CHECK-NEXT:    mov pc, lr
51	%tmp1 = insertelement <2 x float> zeroinitializer, float %A, i32 0
52	%tmp2 = insertelement <2 x float> %tmp1, float %A, i32 1
53	ret <2 x float> %tmp2
54}
55
; 128-bit variant: %A is inserted into all sixteen lanes (0-15) of a
; <16 x i8>. The CHECK lines expect a single "vdup.8 q8, r0", after which the
; result is returned through r0-r3 (two vmov instructions from d16/d17).
56define <16 x i8> @v_dupQ8(i8 %A) nounwind {
57; CHECK-LABEL: v_dupQ8:
58; CHECK:       @ %bb.0:
59; CHECK-NEXT:    vdup.8 q8, r0
60; CHECK-NEXT:    vmov r0, r1, d16
61; CHECK-NEXT:    vmov r2, r3, d17
62; CHECK-NEXT:    mov pc, lr
63	%tmp1 = insertelement <16 x i8> zeroinitializer, i8 %A, i32 0
64	%tmp2 = insertelement <16 x i8> %tmp1, i8 %A, i32 1
65	%tmp3 = insertelement <16 x i8> %tmp2, i8 %A, i32 2
66	%tmp4 = insertelement <16 x i8> %tmp3, i8 %A, i32 3
67	%tmp5 = insertelement <16 x i8> %tmp4, i8 %A, i32 4
68	%tmp6 = insertelement <16 x i8> %tmp5, i8 %A, i32 5
69	%tmp7 = insertelement <16 x i8> %tmp6, i8 %A, i32 6
70	%tmp8 = insertelement <16 x i8> %tmp7, i8 %A, i32 7
71	%tmp9 = insertelement <16 x i8> %tmp8, i8 %A, i32 8
72	%tmp10 = insertelement <16 x i8> %tmp9, i8 %A, i32 9
73	%tmp11 = insertelement <16 x i8> %tmp10, i8 %A, i32 10
74	%tmp12 = insertelement <16 x i8> %tmp11, i8 %A, i32 11
75	%tmp13 = insertelement <16 x i8> %tmp12, i8 %A, i32 12
76	%tmp14 = insertelement <16 x i8> %tmp13, i8 %A, i32 13
77	%tmp15 = insertelement <16 x i8> %tmp14, i8 %A, i32 14
78	%tmp16 = insertelement <16 x i8> %tmp15, i8 %A, i32 15
79	ret <16 x i8> %tmp16
80}
81
; %A is inserted into all eight lanes (0-7) of an <8 x i16>. The CHECK lines
; expect a single "vdup.16 q8, r0".
82define <8 x i16> @v_dupQ16(i16 %A) nounwind {
83; CHECK-LABEL: v_dupQ16:
84; CHECK:       @ %bb.0:
85; CHECK-NEXT:    vdup.16 q8, r0
86; CHECK-NEXT:    vmov r0, r1, d16
87; CHECK-NEXT:    vmov r2, r3, d17
88; CHECK-NEXT:    mov pc, lr
89	%tmp1 = insertelement <8 x i16> zeroinitializer, i16 %A, i32 0
90	%tmp2 = insertelement <8 x i16> %tmp1, i16 %A, i32 1
91	%tmp3 = insertelement <8 x i16> %tmp2, i16 %A, i32 2
92	%tmp4 = insertelement <8 x i16> %tmp3, i16 %A, i32 3
93	%tmp5 = insertelement <8 x i16> %tmp4, i16 %A, i32 4
94	%tmp6 = insertelement <8 x i16> %tmp5, i16 %A, i32 5
95	%tmp7 = insertelement <8 x i16> %tmp6, i16 %A, i32 6
96	%tmp8 = insertelement <8 x i16> %tmp7, i16 %A, i32 7
97	ret <8 x i16> %tmp8
98}
99
; %A is inserted into all four lanes (0-3) of a <4 x i32>. The CHECK lines
; expect a single "vdup.32 q8, r0".
100define <4 x i32> @v_dupQ32(i32 %A) nounwind {
101; CHECK-LABEL: v_dupQ32:
102; CHECK:       @ %bb.0:
103; CHECK-NEXT:    vdup.32 q8, r0
104; CHECK-NEXT:    vmov r0, r1, d16
105; CHECK-NEXT:    vmov r2, r3, d17
106; CHECK-NEXT:    mov pc, lr
107	%tmp1 = insertelement <4 x i32> zeroinitializer, i32 %A, i32 0
108	%tmp2 = insertelement <4 x i32> %tmp1, i32 %A, i32 1
109	%tmp3 = insertelement <4 x i32> %tmp2, i32 %A, i32 2
110	%tmp4 = insertelement <4 x i32> %tmp3, i32 %A, i32 3
111	ret <4 x i32> %tmp4
112}
113
; Float variant of the 128-bit splat: %A is inserted into all four lanes of a
; <4 x float>. The CHECK lines expect a single "vdup.32 q8, r0".
114define <4 x float> @v_dupQfloat(float %A) nounwind {
115; CHECK-LABEL: v_dupQfloat:
116; CHECK:       @ %bb.0:
117; CHECK-NEXT:    vdup.32 q8, r0
118; CHECK-NEXT:    vmov r0, r1, d16
119; CHECK-NEXT:    vmov r2, r3, d17
120; CHECK-NEXT:    mov pc, lr
121	%tmp1 = insertelement <4 x float> zeroinitializer, float %A, i32 0
122	%tmp2 = insertelement <4 x float> %tmp1, float %A, i32 1
123	%tmp3 = insertelement <4 x float> %tmp2, float %A, i32 2
124	%tmp4 = insertelement <4 x float> %tmp3, float %A, i32 3
125	ret <4 x float> %tmp4
126}
127
128; Check to make sure it works with shuffles, too.
129
; Splat expressed as insertelement into lane 0 of undef followed by a
; shufflevector with an all-zero mask (<8 x i8>). The CHECK lines expect
; "vdup.8 d16, r0".
130define <8 x i8> @v_shuffledup8(i8 %A) nounwind {
131; CHECK-LABEL: v_shuffledup8:
132; CHECK:       @ %bb.0:
133; CHECK-NEXT:    vdup.8 d16, r0
134; CHECK-NEXT:    vmov r0, r1, d16
135; CHECK-NEXT:    mov pc, lr
136	%tmp1 = insertelement <8 x i8> undef, i8 %A, i32 0
137	%tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <8 x i32> zeroinitializer
138	ret <8 x i8> %tmp2
139}
140
; Insert into lane 0 of undef, then shuffle with an all-zero mask
; (<4 x i16>). The CHECK lines expect "vdup.16 d16, r0".
141define <4 x i16> @v_shuffledup16(i16 %A) nounwind {
142; CHECK-LABEL: v_shuffledup16:
143; CHECK:       @ %bb.0:
144; CHECK-NEXT:    vdup.16 d16, r0
145; CHECK-NEXT:    vmov r0, r1, d16
146; CHECK-NEXT:    mov pc, lr
147	%tmp1 = insertelement <4 x i16> undef, i16 %A, i32 0
148	%tmp2 = shufflevector <4 x i16> %tmp1, <4 x i16> undef, <4 x i32> zeroinitializer
149	ret <4 x i16> %tmp2
150}
151
; Insert into lane 0 of undef, then shuffle with an all-zero mask
; (<2 x i32>). The CHECK lines expect "vdup.32 d16, r0".
152define <2 x i32> @v_shuffledup32(i32 %A) nounwind {
153; CHECK-LABEL: v_shuffledup32:
154; CHECK:       @ %bb.0:
155; CHECK-NEXT:    vdup.32 d16, r0
156; CHECK-NEXT:    vmov r0, r1, d16
157; CHECK-NEXT:    mov pc, lr
158	%tmp1 = insertelement <2 x i32> undef, i32 %A, i32 0
159	%tmp2 = shufflevector <2 x i32> %tmp1, <2 x i32> undef, <2 x i32> zeroinitializer
160	ret <2 x i32> %tmp2
161}
162
; Float variant: insert into lane 0 of undef, then shuffle with an all-zero
; mask (<2 x float>). The CHECK lines expect "vdup.32 d16, r0".
163define <2 x float> @v_shuffledupfloat(float %A) nounwind {
164; CHECK-LABEL: v_shuffledupfloat:
165; CHECK:       @ %bb.0:
166; CHECK-NEXT:    vdup.32 d16, r0
167; CHECK-NEXT:    vmov r0, r1, d16
168; CHECK-NEXT:    mov pc, lr
169	%tmp1 = insertelement <2 x float> undef, float %A, i32 0
170	%tmp2 = shufflevector <2 x float> %tmp1, <2 x float> undef, <2 x i32> zeroinitializer
171	ret <2 x float> %tmp2
172}
173
; 128-bit variant: insert into lane 0 of undef, then shuffle with an all-zero
; mask (<16 x i8>). The CHECK lines expect "vdup.8 q8, r0".
174define <16 x i8> @v_shuffledupQ8(i8 %A) nounwind {
175; CHECK-LABEL: v_shuffledupQ8:
176; CHECK:       @ %bb.0:
177; CHECK-NEXT:    vdup.8 q8, r0
178; CHECK-NEXT:    vmov r0, r1, d16
179; CHECK-NEXT:    vmov r2, r3, d17
180; CHECK-NEXT:    mov pc, lr
181	%tmp1 = insertelement <16 x i8> undef, i8 %A, i32 0
182	%tmp2 = shufflevector <16 x i8> %tmp1, <16 x i8> undef, <16 x i32> zeroinitializer
183	ret <16 x i8> %tmp2
184}
185
; Insert into lane 0 of undef, then shuffle with an all-zero mask
; (<8 x i16>). The CHECK lines expect "vdup.16 q8, r0".
186define <8 x i16> @v_shuffledupQ16(i16 %A) nounwind {
187; CHECK-LABEL: v_shuffledupQ16:
188; CHECK:       @ %bb.0:
189; CHECK-NEXT:    vdup.16 q8, r0
190; CHECK-NEXT:    vmov r0, r1, d16
191; CHECK-NEXT:    vmov r2, r3, d17
192; CHECK-NEXT:    mov pc, lr
193	%tmp1 = insertelement <8 x i16> undef, i16 %A, i32 0
194	%tmp2 = shufflevector <8 x i16> %tmp1, <8 x i16> undef, <8 x i32> zeroinitializer
195	ret <8 x i16> %tmp2
196}
197
; Insert into lane 0 of undef, then shuffle with an all-zero mask
; (<4 x i32>). The CHECK lines expect "vdup.32 q8, r0".
198define <4 x i32> @v_shuffledupQ32(i32 %A) nounwind {
199; CHECK-LABEL: v_shuffledupQ32:
200; CHECK:       @ %bb.0:
201; CHECK-NEXT:    vdup.32 q8, r0
202; CHECK-NEXT:    vmov r0, r1, d16
203; CHECK-NEXT:    vmov r2, r3, d17
204; CHECK-NEXT:    mov pc, lr
205	%tmp1 = insertelement <4 x i32> undef, i32 %A, i32 0
206	%tmp2 = shufflevector <4 x i32> %tmp1, <4 x i32> undef, <4 x i32> zeroinitializer
207	ret <4 x i32> %tmp2
208}
209
; Float variant: insert into lane 0 of undef, then shuffle with an all-zero
; mask (<4 x float>). The CHECK lines expect "vdup.32 q8, r0".
210define <4 x float> @v_shuffledupQfloat(float %A) nounwind {
211; CHECK-LABEL: v_shuffledupQfloat:
212; CHECK:       @ %bb.0:
213; CHECK-NEXT:    vdup.32 q8, r0
214; CHECK-NEXT:    vmov r0, r1, d16
215; CHECK-NEXT:    vmov r2, r3, d17
216; CHECK-NEXT:    mov pc, lr
217	%tmp1 = insertelement <4 x float> undef, float %A, i32 0
218	%tmp2 = shufflevector <4 x float> %tmp1, <4 x float> undef, <4 x i32> zeroinitializer
219	ret <4 x float> %tmp2
220}
221
; Lane splat: loads an <8 x i8> from %A and shuffles it with an all-ones mask,
; i.e. every result lane is source lane 1. The CHECK lines expect a vldr
; followed by the lane form "vdup.8 d16, d16[1]".
222define <8 x i8> @vduplane8(<8 x i8>* %A) nounwind {
223; CHECK-LABEL: vduplane8:
224; CHECK:       @ %bb.0:
225; CHECK-NEXT:    vldr d16, [r0]
226; CHECK-NEXT:    vdup.8 d16, d16[1]
227; CHECK-NEXT:    vmov r0, r1, d16
228; CHECK-NEXT:    mov pc, lr
229	%tmp1 = load <8 x i8>, <8 x i8>* %A
230	%tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <8 x i32> < i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1 >
231	ret <8 x i8> %tmp2
232}
233
; Loads a <4 x i16> and broadcasts source lane 1 to every result lane via
; shufflevector. The CHECK lines expect "vdup.16 d16, d16[1]".
234define <4 x i16> @vduplane16(<4 x i16>* %A) nounwind {
235; CHECK-LABEL: vduplane16:
236; CHECK:       @ %bb.0:
237; CHECK-NEXT:    vldr d16, [r0]
238; CHECK-NEXT:    vdup.16 d16, d16[1]
239; CHECK-NEXT:    vmov r0, r1, d16
240; CHECK-NEXT:    mov pc, lr
241	%tmp1 = load <4 x i16>, <4 x i16>* %A
242	%tmp2 = shufflevector <4 x i16> %tmp1, <4 x i16> undef, <4 x i32> < i32 1, i32 1, i32 1, i32 1 >
243	ret <4 x i16> %tmp2
244}
245
; Loads a <2 x i32> and broadcasts source lane 1 to both result lanes via
; shufflevector. The CHECK lines expect "vdup.32 d16, d16[1]".
246define <2 x i32> @vduplane32(<2 x i32>* %A) nounwind {
247; CHECK-LABEL: vduplane32:
248; CHECK:       @ %bb.0:
249; CHECK-NEXT:    vldr d16, [r0]
250; CHECK-NEXT:    vdup.32 d16, d16[1]
251; CHECK-NEXT:    vmov r0, r1, d16
252; CHECK-NEXT:    mov pc, lr
253	%tmp1 = load <2 x i32>, <2 x i32>* %A
254	%tmp2 = shufflevector <2 x i32> %tmp1, <2 x i32> undef, <2 x i32> < i32 1, i32 1 >
255	ret <2 x i32> %tmp2
256}
257
; Float variant: loads a <2 x float> and broadcasts source lane 1 to both
; result lanes. The CHECK lines expect "vdup.32 d16, d16[1]".
258define <2 x float> @vduplanefloat(<2 x float>* %A) nounwind {
259; CHECK-LABEL: vduplanefloat:
260; CHECK:       @ %bb.0:
261; CHECK-NEXT:    vldr d16, [r0]
262; CHECK-NEXT:    vdup.32 d16, d16[1]
263; CHECK-NEXT:    vmov r0, r1, d16
264; CHECK-NEXT:    mov pc, lr
265	%tmp1 = load <2 x float>, <2 x float>* %A
266	%tmp2 = shufflevector <2 x float> %tmp1, <2 x float> undef, <2 x i32> < i32 1, i32 1 >
267	ret <2 x float> %tmp2
268}
269
; Widening lane splat: the source is a 64-bit <8 x i8> but the shuffle mask
; has sixteen entries (all 1), so the result is a <16 x i8>. The CHECK lines
; expect a 64-bit vldr followed by "vdup.8 q8, d16[1]".
270define <16 x i8> @vduplaneQ8(<8 x i8>* %A) nounwind {
271; CHECK-LABEL: vduplaneQ8:
272; CHECK:       @ %bb.0:
273; CHECK-NEXT:    vldr d16, [r0]
274; CHECK-NEXT:    vdup.8 q8, d16[1]
275; CHECK-NEXT:    vmov r0, r1, d16
276; CHECK-NEXT:    vmov r2, r3, d17
277; CHECK-NEXT:    mov pc, lr
278	%tmp1 = load <8 x i8>, <8 x i8>* %A
279	%tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <16 x i32> < i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1 >
280	ret <16 x i8> %tmp2
281}
282
; Widening lane splat: <4 x i16> source, eight-entry all-ones mask, <8 x i16>
; result. The CHECK lines expect "vdup.16 q8, d16[1]".
283define <8 x i16> @vduplaneQ16(<4 x i16>* %A) nounwind {
284; CHECK-LABEL: vduplaneQ16:
285; CHECK:       @ %bb.0:
286; CHECK-NEXT:    vldr d16, [r0]
287; CHECK-NEXT:    vdup.16 q8, d16[1]
288; CHECK-NEXT:    vmov r0, r1, d16
289; CHECK-NEXT:    vmov r2, r3, d17
290; CHECK-NEXT:    mov pc, lr
291	%tmp1 = load <4 x i16>, <4 x i16>* %A
292	%tmp2 = shufflevector <4 x i16> %tmp1, <4 x i16> undef, <8 x i32> < i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1 >
293	ret <8 x i16> %tmp2
294}
295
; Widening lane splat: <2 x i32> source, four-entry all-ones mask, <4 x i32>
; result. The CHECK lines expect "vdup.32 q8, d16[1]".
296define <4 x i32> @vduplaneQ32(<2 x i32>* %A) nounwind {
297; CHECK-LABEL: vduplaneQ32:
298; CHECK:       @ %bb.0:
299; CHECK-NEXT:    vldr d16, [r0]
300; CHECK-NEXT:    vdup.32 q8, d16[1]
301; CHECK-NEXT:    vmov r0, r1, d16
302; CHECK-NEXT:    vmov r2, r3, d17
303; CHECK-NEXT:    mov pc, lr
304	%tmp1 = load <2 x i32>, <2 x i32>* %A
305	%tmp2 = shufflevector <2 x i32> %tmp1, <2 x i32> undef, <4 x i32> < i32 1, i32 1, i32 1, i32 1 >
306	ret <4 x i32> %tmp2
307}
308
; Float variant of the widening lane splat: <2 x float> source, four-entry
; all-ones mask, <4 x float> result. The CHECK lines expect
; "vdup.32 q8, d16[1]".
309define <4 x float> @vduplaneQfloat(<2 x float>* %A) nounwind {
310; CHECK-LABEL: vduplaneQfloat:
311; CHECK:       @ %bb.0:
312; CHECK-NEXT:    vldr d16, [r0]
313; CHECK-NEXT:    vdup.32 q8, d16[1]
314; CHECK-NEXT:    vmov r0, r1, d16
315; CHECK-NEXT:    vmov r2, r3, d17
316; CHECK-NEXT:    mov pc, lr
317	%tmp1 = load <2 x float>, <2 x float>* %A
318	%tmp2 = shufflevector <2 x float> %tmp1, <2 x float> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1 >
319	ret <4 x float> %tmp2
320}
321
; 64-bit element splat of lane 1 of a <2 x i64> argument. The CHECK lines
; expect no NEON instruction at all: only core-register moves that copy r2/r3
; into r0/r1.
322define <2 x i64> @foo(<2 x i64> %arg0_int64x1_t) nounwind readnone {
323; CHECK-LABEL: foo:
324; CHECK:       @ %bb.0: @ %entry
325; CHECK-NEXT:    mov r0, r2
326; CHECK-NEXT:    mov r1, r3
327; CHECK-NEXT:    mov pc, lr
328entry:
329  %0 = shufflevector <2 x i64> %arg0_int64x1_t, <2 x i64> undef, <2 x i32> <i32 1, i32 1>
330  ret <2 x i64> %0
331}
332
; 64-bit element splat of lane 0 of a <2 x i64> argument. The CHECK lines
; expect only core-register moves that copy r0/r1 into r2/r3.
333define <2 x i64> @bar(<2 x i64> %arg0_int64x1_t) nounwind readnone {
334; CHECK-LABEL: bar:
335; CHECK:       @ %bb.0: @ %entry
336; CHECK-NEXT:    mov r2, r0
337; CHECK-NEXT:    mov r3, r1
338; CHECK-NEXT:    mov pc, lr
339entry:
340  %0 = shufflevector <2 x i64> %arg0_int64x1_t, <2 x i64> undef, <2 x i32> <i32 0, i32 0>
341  ret <2 x i64> %0
342}
343
; Splat of lane 1 of a <2 x double> argument. The CHECK lines expect only
; core-register moves that copy r2/r3 into r0/r1.
344define <2 x double> @baz(<2 x double> %arg0_int64x1_t) nounwind readnone {
345; CHECK-LABEL: baz:
346; CHECK:       @ %bb.0: @ %entry
347; CHECK-NEXT:    mov r0, r2
348; CHECK-NEXT:    mov r1, r3
349; CHECK-NEXT:    mov pc, lr
350entry:
351  %0 = shufflevector <2 x double> %arg0_int64x1_t, <2 x double> undef, <2 x i32> <i32 1, i32 1>
352  ret <2 x double> %0
353}
354
; Splat of lane 0 of a <2 x double> argument. The CHECK lines expect only
; core-register moves that copy r0/r1 into r2/r3.
355define <2 x double> @qux(<2 x double> %arg0_int64x1_t) nounwind readnone {
356; CHECK-LABEL: qux:
357; CHECK:       @ %bb.0: @ %entry
358; CHECK-NEXT:    mov r2, r0
359; CHECK-NEXT:    mov r3, r1
360; CHECK-NEXT:    mov pc, lr
361entry:
362  %0 = shufflevector <2 x double> %arg0_int64x1_t, <2 x double> undef, <2 x i32> <i32 0, i32 0>
363  ret <2 x double> %0
364}
365
366; Radar 7373643
; Splat of the constant i8 -128 stored to %ptr. The CHECK lines expect the
; constant to be materialised directly with "vmov.i8 d16, #0x80" and stored
; with vstr; no vdup appears in the expected output.
367define void @redundantVdup(<8 x i8>* %ptr) nounwind {
368; CHECK-LABEL: redundantVdup:
369; CHECK:       @ %bb.0:
370; CHECK-NEXT:    vmov.i8 d16, #0x80
371; CHECK-NEXT:    vstr d16, [r0]
372; CHECK-NEXT:    mov pc, lr
373  %1 = insertelement <8 x i8> undef, i8 -128, i32 0
374  %2 = shufflevector <8 x i8> %1, <8 x i8> undef, <8 x i32> zeroinitializer
375  store <8 x i8> %2, <8 x i8>* %ptr, align 8
376  ret void
377}
378
; Partial splat: lanes 0-2 of the <4 x i32> receive %x and lane 3 receives %y.
; The CHECK lines expect a full "vdup.32 q8, r0" followed by a single lane
; insert "vmov.32 d17[1], r1" for the odd element.
379define <4 x i32> @tdupi(i32 %x, i32 %y) {
380; CHECK-LABEL: tdupi:
381; CHECK:       @ %bb.0:
382; CHECK-NEXT:    vdup.32 q8, r0
383; CHECK-NEXT:    vmov.32 d17[1], r1
384; CHECK-NEXT:    vmov r0, r1, d16
385; CHECK-NEXT:    vmov r2, r3, d17
386; CHECK-NEXT:    mov pc, lr
387  %1 = insertelement <4 x i32> undef, i32 %x, i32 0
388  %2 = insertelement <4 x i32> %1, i32 %x, i32 1
389  %3 = insertelement <4 x i32> %2, i32 %x, i32 2
390  %4 = insertelement <4 x i32> %3, i32 %y, i32 3
391  ret <4 x i32> %4
392}
393
; Float partial splat: lanes 0-2 of the <4 x float> receive %x and lane 3
; receives %y. The CHECK lines expect "vdup.32 q0, r0" followed by
; "vmov s3, r1" to overwrite the last lane.
394define <4 x float> @tdupf(float %x, float %y) {
395; CHECK-LABEL: tdupf:
396; CHECK:       @ %bb.0:
397; CHECK-NEXT:    vdup.32 q0, r0
398; CHECK-NEXT:    vmov s3, r1
399; CHECK-NEXT:    vmov r0, r1, d0
400; CHECK-NEXT:    vmov r2, r3, d1
401; CHECK-NEXT:    mov pc, lr
402  %1 = insertelement <4 x float> undef, float %x, i32 0
403  %2 = insertelement <4 x float> %1, float %x, i32 1
404  %3 = insertelement <4 x float> %2, float %x, i32 2
405  %4 = insertelement <4 x float> %3, float %y, i32 3
406  ret <4 x float> %4
407}
408
409; This test checks that when splatting an element from a vector into another,
410; the value isn't moved out to GPRs first.
; Lane 1 of %invec is extracted and inserted into lanes 0-2 of the result;
; lane 3 is the constant 255. The CHECK lines expect the lane form
; "vdup.32 q8, d16[1]" and then "vmov.32 d17[1], r0" with r0 holding #255.
411define <4 x i32> @tduplane(<4 x i32> %invec) {
412; CHECK-LABEL: tduplane:
413; CHECK:       @ %bb.0:
414; CHECK-NEXT:    vmov d16, r0, r1
415; CHECK-NEXT:    mov r0, #255
416; CHECK-NEXT:    vdup.32 q8, d16[1]
417; CHECK-NEXT:    vmov.32 d17[1], r0
418; CHECK-NEXT:    vmov r0, r1, d16
419; CHECK-NEXT:    vmov r2, r3, d17
420; CHECK-NEXT:    mov pc, lr
421  %in = extractelement <4 x i32> %invec, i32 1
422  %1 = insertelement <4 x i32> undef, i32 %in, i32 0
423  %2 = insertelement <4 x i32> %1, i32 %in, i32 1
424  %3 = insertelement <4 x i32> %2, i32 %in, i32 2
425  %4 = insertelement <4 x i32> %3, i32 255, i32 3
426  ret <4 x i32> %4
427}
428
; Narrowing lane splat: lane 3 of a <4 x float> is extracted and inserted into
; both lanes of a <2 x float>. The CHECK lines expect "vdup.32 d16, d17[1]".
429define <2 x float> @check_f32(<4 x float> %v) nounwind {
430; CHECK-LABEL: check_f32:
431; CHECK:       @ %bb.0:
432; CHECK-NEXT:    vmov d17, r2, r3
433; CHECK-NEXT:    vmov d16, r0, r1
434; CHECK-NEXT:    vdup.32 d16, d17[1]
435; CHECK-NEXT:    vmov r0, r1, d16
436; CHECK-NEXT:    mov pc, lr
437  %x = extractelement <4 x float> %v, i32 3
438  %1 = insertelement  <2 x float> undef, float %x, i32 0
439  %2 = insertelement  <2 x float> %1, float %x, i32 1
440  ret <2 x float> %2
441}
442
; Narrowing lane splat: lane 3 of a <4 x i32> is extracted and inserted into
; both lanes of a <2 x i32>. The CHECK lines expect "vdup.32 d16, d17[1]".
443define <2 x i32> @check_i32(<4 x i32> %v) nounwind {
444; CHECK-LABEL: check_i32:
445; CHECK:       @ %bb.0:
446; CHECK-NEXT:    vmov d17, r2, r3
447; CHECK-NEXT:    vmov d16, r0, r1
448; CHECK-NEXT:    vdup.32 d16, d17[1]
449; CHECK-NEXT:    vmov r0, r1, d16
450; CHECK-NEXT:    mov pc, lr
451  %x = extractelement <4 x i32> %v, i32 3
452  %1 = insertelement  <2 x i32> undef, i32 %x, i32 0
453  %2 = insertelement  <2 x i32> %1, i32 %x, i32 1
454  ret <2 x i32> %2
455}
456
; Lane 3 of an <8 x i16> is extracted and inserted into lanes 0 and 1 of a
; <4 x i16>; lanes 2 and 3 of the result are left undef. The CHECK lines
; expect "vdup.16 d16, d16[3]".
457define <4 x i16> @check_i16(<8 x i16> %v) nounwind {
458; CHECK-LABEL: check_i16:
459; CHECK:       @ %bb.0:
460; CHECK-NEXT:    vmov d17, r2, r3
461; CHECK-NEXT:    vmov d16, r0, r1
462; CHECK-NEXT:    vdup.16 d16, d16[3]
463; CHECK-NEXT:    vmov r0, r1, d16
464; CHECK-NEXT:    mov pc, lr
465  %x = extractelement <8 x i16> %v, i32 3
466  %1 = insertelement  <4 x i16> undef, i16 %x, i32 0
467  %2 = insertelement  <4 x i16> %1, i16 %x, i32 1
468  ret <4 x i16> %2
469}
470
; Lane 3 of a <16 x i8> is extracted and inserted into lanes 0 and 1 of an
; <8 x i8>; the remaining result lanes are left undef. The CHECK lines expect
; "vdup.8 d16, d16[3]".
471define <8 x i8> @check_i8(<16 x i8> %v) nounwind {
472; CHECK-LABEL: check_i8:
473; CHECK:       @ %bb.0:
474; CHECK-NEXT:    vmov d17, r2, r3
475; CHECK-NEXT:    vmov d16, r0, r1
476; CHECK-NEXT:    vdup.8 d16, d16[3]
477; CHECK-NEXT:    vmov r0, r1, d16
478; CHECK-NEXT:    mov pc, lr
479  %x = extractelement <16 x i8> %v, i32 3
480  %1 = insertelement  <8  x i8> undef, i8 %x, i32 0
481  %2 = insertelement  <8  x i8> %1, i8 %x, i32 1
482  ret <8 x i8> %2
483}
484
485; Check that an SPR splat produces a vdup.
486
; %q is converted to float with sitofp, splatted into a <2 x float>, and %p is
; subtracted from the splat. The CHECK lines expect the converted value in s0
; to be broadcast with "vdup.32 d17, d0[0]" before the vsub.f32.
487define <2 x float> @check_spr_splat2(<2 x float> %p, i16 %q) {
488; CHECK-LABEL: check_spr_splat2:
489; CHECK:       @ %bb.0:
490; CHECK-NEXT:    lsl r2, r2, #16
491; CHECK-NEXT:    vmov d16, r0, r1
492; CHECK-NEXT:    asr r2, r2, #16
493; CHECK-NEXT:    vmov s0, r2
494; CHECK-NEXT:    vcvt.f32.s32 s0, s0
495; CHECK-NEXT:    vdup.32 d17, d0[0]
496; CHECK-NEXT:    vsub.f32 d16, d17, d16
497; CHECK-NEXT:    vmov r0, r1, d16
498; CHECK-NEXT:    mov pc, lr
499  %conv = sitofp i16 %q to float
500  %splat.splatinsert = insertelement <2 x float> undef, float %conv, i32 0
501  %splat.splat = shufflevector <2 x float> %splat.splatinsert, <2 x float> undef, <2 x i32> zeroinitializer
502  %sub = fsub <2 x float> %splat.splat, %p
503  ret <2 x float> %sub
504}
505
; 128-bit version: %q is converted to float, splatted into a <4 x float>, and
; %p is subtracted from the splat. The CHECK lines expect %q to be loaded with
; "ldrsh r12, [sp]" and the converted value in s0 to be broadcast with
; "vdup.32 q9, d0[0]".
506define <4 x float> @check_spr_splat4(<4 x float> %p, i16 %q) {
507; CHECK-LABEL: check_spr_splat4:
508; CHECK:       @ %bb.0:
509; CHECK-NEXT:    ldrsh r12, [sp]
510; CHECK-NEXT:    vmov d17, r2, r3
511; CHECK-NEXT:    vmov d16, r0, r1
512; CHECK-NEXT:    vmov s0, r12
513; CHECK-NEXT:    vcvt.f32.s32 s0, s0
514; CHECK-NEXT:    vdup.32 q9, d0[0]
515; CHECK-NEXT:    vsub.f32 q8, q9, q8
516; CHECK-NEXT:    vmov r0, r1, d16
517; CHECK-NEXT:    vmov r2, r3, d17
518; CHECK-NEXT:    mov pc, lr
519  %conv = sitofp i16 %q to float
520  %splat.splatinsert = insertelement <4 x float> undef, float %conv, i32 0
521  %splat.splat = shufflevector <4 x float> %splat.splatinsert, <4 x float> undef, <4 x i32> zeroinitializer
522  %sub = fsub <4 x float> %splat.splat, %p
523  ret <4 x float> %sub
524}
525; Same codegen as the test above: the scalar is splatted with vdup.32 from d0[0], so the shuffle index is irrelevant.
; The converted scalar is inserted into lane 1 (not lane 0) and splatted with
; an all-ones shuffle mask. The CHECK lines still expect the broadcast to come
; from "vdup.32 q9, d0[0]".
526define <4 x float> @check_spr_splat4_lane1(<4 x float> %p, i16 %q) {
527; CHECK-LABEL: check_spr_splat4_lane1:
528; CHECK:       @ %bb.0:
529; CHECK-NEXT:    ldrsh r12, [sp]
530; CHECK-NEXT:    vmov d17, r2, r3
531; CHECK-NEXT:    vmov d16, r0, r1
532; CHECK-NEXT:    vmov s0, r12
533; CHECK-NEXT:    vcvt.f32.s32 s0, s0
534; CHECK-NEXT:    vdup.32 q9, d0[0]
535; CHECK-NEXT:    vsub.f32 q8, q9, q8
536; CHECK-NEXT:    vmov r0, r1, d16
537; CHECK-NEXT:    vmov r2, r3, d17
538; CHECK-NEXT:    mov pc, lr
539  %conv = sitofp i16 %q to float
540  %splat.splatinsert = insertelement <4 x float> undef, float %conv, i32 1
541  %splat.splat = shufflevector <4 x float> %splat.splatinsert, <4 x float> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
542  %sub = fsub <4 x float> %splat.splat, %p
543  ret <4 x float> %sub
544}
545
546; Also make sure we don't barf on variable-index extractelts, where we almost
547; could have generated a vdup.
548
; The extract index %idx is a run-time value, so no lane-immediate vdup can be
; used. The CHECK lines expect the vector to be stored to an aligned stack
; slot (vst1.64), the index to be masked with "and r0, r12, #15", and the
; selected byte to be re-loaded into every lane with "vld1.8 {d16[]}, [r1]".
549define <8 x i8> @check_i8_varidx(<16 x i8> %v, i32 %idx) {
550; CHECK-LABEL: check_i8_varidx:
551; CHECK:       @ %bb.0:
552; CHECK-NEXT:    .save {r11}
553; CHECK-NEXT:    push {r11}
554; CHECK-NEXT:    .setfp r11, sp
555; CHECK-NEXT:    mov r11, sp
556; CHECK-NEXT:    .pad #28
557; CHECK-NEXT:    sub sp, sp, #28
558; CHECK-NEXT:    bic sp, sp, #15
559; CHECK-NEXT:    ldr r12, [r11, #4]
560; CHECK-NEXT:    vmov d17, r2, r3
561; CHECK-NEXT:    vmov d16, r0, r1
562; CHECK-NEXT:    mov r1, sp
563; CHECK-NEXT:    and r0, r12, #15
564; CHECK-NEXT:    vst1.64 {d16, d17}, [r1:128], r0
565; CHECK-NEXT:    vld1.8 {d16[]}, [r1]
566; CHECK-NEXT:    vmov r0, r1, d16
567; CHECK-NEXT:    mov sp, r11
568; CHECK-NEXT:    pop {r11}
569; CHECK-NEXT:    mov pc, lr
570  %x = extractelement <16 x i8> %v, i32 %idx
571  %1 = insertelement  <8 x i8> undef, i8 %x, i32 0
572  %2 = insertelement  <8 x i8> %1, i8 %x, i32 1
573  ret <8 x i8> %2
574}
575