xref: /llvm-project/llvm/test/CodeGen/AArch64/arm64-dup.ll (revision 61510b51c33464a6bc15e4cf5b1ee07e2e0ec1c9)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -mtriple=arm64-eabi -aarch64-neon-syntax=apple | FileCheck %s --check-prefixes=CHECK,CHECK-SD
3; RUN: llc < %s -mtriple=arm64-eabi -aarch64-neon-syntax=apple -global-isel | FileCheck %s --check-prefixes=CHECK,CHECK-GI
4
; Fills all eight i8 lanes of a zero vector with %A, one insertelement per
; lane; the expected lowering is a single dup.8b from w0.
5define <8 x i8> @v_dup8(i8 %A) nounwind {
6; CHECK-LABEL: v_dup8:
7; CHECK:       // %bb.0:
8; CHECK-NEXT:    dup.8b v0, w0
9; CHECK-NEXT:    ret
10  %lane0 = insertelement <8 x i8> zeroinitializer, i8 %A, i32 0
11  %lane1 = insertelement <8 x i8> %lane0, i8 %A, i32 1
12  %lane2 = insertelement <8 x i8> %lane1, i8 %A, i32 2
13  %lane3 = insertelement <8 x i8> %lane2, i8 %A, i32 3
14  %lane4 = insertelement <8 x i8> %lane3, i8 %A, i32 4
15  %lane5 = insertelement <8 x i8> %lane4, i8 %A, i32 5
16  %lane6 = insertelement <8 x i8> %lane5, i8 %A, i32 6
17  %lane7 = insertelement <8 x i8> %lane6, i8 %A, i32 7
18  ret <8 x i8> %lane7
19}
20
; Fills all four i16 lanes of a zero vector with %A; the expected lowering is
; a single dup.4h from w0.
21define <4 x i16> @v_dup16(i16 %A) nounwind {
22; CHECK-LABEL: v_dup16:
23; CHECK:       // %bb.0:
24; CHECK-NEXT:    dup.4h v0, w0
25; CHECK-NEXT:    ret
26  %lane0 = insertelement <4 x i16> zeroinitializer, i16 %A, i32 0
27  %lane1 = insertelement <4 x i16> %lane0, i16 %A, i32 1
28  %lane2 = insertelement <4 x i16> %lane1, i16 %A, i32 2
29  %lane3 = insertelement <4 x i16> %lane2, i16 %A, i32 3
30  ret <4 x i16> %lane3
31}
32
; Fills both i32 lanes of a zero vector with %A; the expected lowering is a
; single dup.2s from w0.
33define <2 x i32> @v_dup32(i32 %A) nounwind {
34; CHECK-LABEL: v_dup32:
35; CHECK:       // %bb.0:
36; CHECK-NEXT:    dup.2s v0, w0
37; CHECK-NEXT:    ret
38  %lane0 = insertelement <2 x i32> zeroinitializer, i32 %A, i32 0
39  %lane1 = insertelement <2 x i32> %lane0, i32 %A, i32 1
40  ret <2 x i32> %lane1
41}
42
; Fills both float lanes of a zero vector with %A; the expected lowering
; duplicates lane 0 of the incoming FP register (dup.2s v0, v0[0]).
43define <2 x float> @v_dupfloat(float %A) nounwind {
44; CHECK-LABEL: v_dupfloat:
45; CHECK:       // %bb.0:
46; CHECK-NEXT:    // kill: def $s0 killed $s0 def $q0
47; CHECK-NEXT:    dup.2s v0, v0[0]
48; CHECK-NEXT:    ret
49  %lane0 = insertelement <2 x float> zeroinitializer, float %A, i32 0
50  %lane1 = insertelement <2 x float> %lane0, float %A, i32 1
51  ret <2 x float> %lane1
52}
53
; 128-bit variant: fills all sixteen i8 lanes of a zero vector with %A; the
; expected lowering is a single dup.16b from w0.
54define <16 x i8> @v_dupQ8(i8 %A) nounwind {
55; CHECK-LABEL: v_dupQ8:
56; CHECK:       // %bb.0:
57; CHECK-NEXT:    dup.16b v0, w0
58; CHECK-NEXT:    ret
59  %lane0 = insertelement <16 x i8> zeroinitializer, i8 %A, i32 0
60  %lane1 = insertelement <16 x i8> %lane0, i8 %A, i32 1
61  %lane2 = insertelement <16 x i8> %lane1, i8 %A, i32 2
62  %lane3 = insertelement <16 x i8> %lane2, i8 %A, i32 3
63  %lane4 = insertelement <16 x i8> %lane3, i8 %A, i32 4
64  %lane5 = insertelement <16 x i8> %lane4, i8 %A, i32 5
65  %lane6 = insertelement <16 x i8> %lane5, i8 %A, i32 6
66  %lane7 = insertelement <16 x i8> %lane6, i8 %A, i32 7
67  %lane8 = insertelement <16 x i8> %lane7, i8 %A, i32 8
68  %lane9 = insertelement <16 x i8> %lane8, i8 %A, i32 9
69  %lane10 = insertelement <16 x i8> %lane9, i8 %A, i32 10
70  %lane11 = insertelement <16 x i8> %lane10, i8 %A, i32 11
71  %lane12 = insertelement <16 x i8> %lane11, i8 %A, i32 12
72  %lane13 = insertelement <16 x i8> %lane12, i8 %A, i32 13
73  %lane14 = insertelement <16 x i8> %lane13, i8 %A, i32 14
74  %lane15 = insertelement <16 x i8> %lane14, i8 %A, i32 15
75  ret <16 x i8> %lane15
76}
77
; 128-bit variant: fills all eight i16 lanes of a zero vector with %A; the
; expected lowering is a single dup.8h from w0.
78define <8 x i16> @v_dupQ16(i16 %A) nounwind {
79; CHECK-LABEL: v_dupQ16:
80; CHECK:       // %bb.0:
81; CHECK-NEXT:    dup.8h v0, w0
82; CHECK-NEXT:    ret
83  %lane0 = insertelement <8 x i16> zeroinitializer, i16 %A, i32 0
84  %lane1 = insertelement <8 x i16> %lane0, i16 %A, i32 1
85  %lane2 = insertelement <8 x i16> %lane1, i16 %A, i32 2
86  %lane3 = insertelement <8 x i16> %lane2, i16 %A, i32 3
87  %lane4 = insertelement <8 x i16> %lane3, i16 %A, i32 4
88  %lane5 = insertelement <8 x i16> %lane4, i16 %A, i32 5
89  %lane6 = insertelement <8 x i16> %lane5, i16 %A, i32 6
90  %lane7 = insertelement <8 x i16> %lane6, i16 %A, i32 7
91  ret <8 x i16> %lane7
92}
93
; 128-bit variant: fills all four i32 lanes of a zero vector with %A; the
; expected lowering is a single dup.4s from w0.
94define <4 x i32> @v_dupQ32(i32 %A) nounwind {
95; CHECK-LABEL: v_dupQ32:
96; CHECK:       // %bb.0:
97; CHECK-NEXT:    dup.4s v0, w0
98; CHECK-NEXT:    ret
99  %lane0 = insertelement <4 x i32> zeroinitializer, i32 %A, i32 0
100  %lane1 = insertelement <4 x i32> %lane0, i32 %A, i32 1
101  %lane2 = insertelement <4 x i32> %lane1, i32 %A, i32 2
102  %lane3 = insertelement <4 x i32> %lane2, i32 %A, i32 3
103  ret <4 x i32> %lane3
104}
105
; Splats the constant 10 across a <4 x i16> and also stores the same scalar
; constant through %p. The vector is expected to come from a movi immediate
; while the store uses a separately materialised w8. %y is unused.
106define <4 x i16> @v_dup16_const(i16 %y, ptr %p) {
107; CHECK-LABEL: v_dup16_const:
108; CHECK:       // %bb.0:
109; CHECK-NEXT:    movi.4h v0, #10
110; CHECK-NEXT:    mov w8, #10 // =0xa
111; CHECK-NEXT:    strh w8, [x1]
112; CHECK-NEXT:    ret
113  %seed = insertelement <4 x i16> undef, i16 10, i32 0
114  %splat = shufflevector <4 x i16> %seed, <4 x i16> undef, <4 x i32> zeroinitializer
115  store i16 10, ptr %p
116  ret <4 x i16> %splat
117}
118
; 128-bit variant: fills all four float lanes of a zero vector with %A; the
; expected lowering duplicates lane 0 of the incoming FP register.
119define <4 x float> @v_dupQfloat(float %A) nounwind {
120; CHECK-LABEL: v_dupQfloat:
121; CHECK:       // %bb.0:
122; CHECK-NEXT:    // kill: def $s0 killed $s0 def $q0
123; CHECK-NEXT:    dup.4s v0, v0[0]
124; CHECK-NEXT:    ret
125  %lane0 = insertelement <4 x float> zeroinitializer, float %A, i32 0
126  %lane1 = insertelement <4 x float> %lane0, float %A, i32 1
127  %lane2 = insertelement <4 x float> %lane1, float %A, i32 2
128  %lane3 = insertelement <4 x float> %lane2, float %A, i32 3
129  ret <4 x float> %lane3
130}
131
132; Check to make sure it works with shuffles, too.
133
; Splat expressed as insert-into-lane-0 followed by an all-zero shuffle mask;
; expected to lower to dup.8b from w0.
134define <8 x i8> @v_shuffledup8(i8 %A) nounwind {
135; CHECK-LABEL: v_shuffledup8:
136; CHECK:       // %bb.0:
137; CHECK-NEXT:    dup.8b v0, w0
138; CHECK-NEXT:    ret
139  %seed = insertelement <8 x i8> undef, i8 %A, i32 0
140  %splat = shufflevector <8 x i8> %seed, <8 x i8> undef, <8 x i32> zeroinitializer
141  ret <8 x i8> %splat
142}
143
; Splat expressed as insert-into-lane-0 followed by an all-zero shuffle mask;
; expected to lower to dup.4h from w0.
144define <4 x i16> @v_shuffledup16(i16 %A) nounwind {
145; CHECK-LABEL: v_shuffledup16:
146; CHECK:       // %bb.0:
147; CHECK-NEXT:    dup.4h v0, w0
148; CHECK-NEXT:    ret
149  %seed = insertelement <4 x i16> undef, i16 %A, i32 0
150  %splat = shufflevector <4 x i16> %seed, <4 x i16> undef, <4 x i32> zeroinitializer
151  ret <4 x i16> %splat
152}
153
; Splat expressed as insert-into-lane-0 followed by an all-zero shuffle mask;
; expected to lower to dup.2s from w0.
154define <2 x i32> @v_shuffledup32(i32 %A) nounwind {
155; CHECK-LABEL: v_shuffledup32:
156; CHECK:       // %bb.0:
157; CHECK-NEXT:    dup.2s v0, w0
158; CHECK-NEXT:    ret
159  %seed = insertelement <2 x i32> undef, i32 %A, i32 0
160  %splat = shufflevector <2 x i32> %seed, <2 x i32> undef, <2 x i32> zeroinitializer
161  ret <2 x i32> %splat
162}
163
; Float splat expressed as insert-into-lane-0 plus an all-zero shuffle mask;
; expected to lower to a lane-0 dup.2s of the incoming FP register.
164define <2 x float> @v_shuffledupfloat(float %A) nounwind {
165; CHECK-LABEL: v_shuffledupfloat:
166; CHECK:       // %bb.0:
167; CHECK-NEXT:    // kill: def $s0 killed $s0 def $q0
168; CHECK-NEXT:    dup.2s v0, v0[0]
169; CHECK-NEXT:    ret
170  %seed = insertelement <2 x float> undef, float %A, i32 0
171  %splat = shufflevector <2 x float> %seed, <2 x float> undef, <2 x i32> zeroinitializer
172  ret <2 x float> %splat
173}
174
; 128-bit splat expressed as insert-into-lane-0 plus an all-zero shuffle mask;
; expected to lower to dup.16b from w0.
175define <16 x i8> @v_shuffledupQ8(i8 %A) nounwind {
176; CHECK-LABEL: v_shuffledupQ8:
177; CHECK:       // %bb.0:
178; CHECK-NEXT:    dup.16b v0, w0
179; CHECK-NEXT:    ret
180  %seed = insertelement <16 x i8> undef, i8 %A, i32 0
181  %splat = shufflevector <16 x i8> %seed, <16 x i8> undef, <16 x i32> zeroinitializer
182  ret <16 x i8> %splat
183}
184
; 128-bit splat expressed as insert-into-lane-0 plus an all-zero shuffle mask;
; expected to lower to dup.8h from w0.
185define <8 x i16> @v_shuffledupQ16(i16 %A) nounwind {
186; CHECK-LABEL: v_shuffledupQ16:
187; CHECK:       // %bb.0:
188; CHECK-NEXT:    dup.8h v0, w0
189; CHECK-NEXT:    ret
190  %seed = insertelement <8 x i16> undef, i16 %A, i32 0
191  %splat = shufflevector <8 x i16> %seed, <8 x i16> undef, <8 x i32> zeroinitializer
192  ret <8 x i16> %splat
193}
194
; 128-bit splat expressed as insert-into-lane-0 plus an all-zero shuffle mask;
; expected to lower to dup.4s from w0.
195define <4 x i32> @v_shuffledupQ32(i32 %A) nounwind {
196; CHECK-LABEL: v_shuffledupQ32:
197; CHECK:       // %bb.0:
198; CHECK-NEXT:    dup.4s v0, w0
199; CHECK-NEXT:    ret
200  %seed = insertelement <4 x i32> undef, i32 %A, i32 0
201  %splat = shufflevector <4 x i32> %seed, <4 x i32> undef, <4 x i32> zeroinitializer
202  ret <4 x i32> %splat
203}
204
; 128-bit float splat expressed as insert-into-lane-0 plus an all-zero shuffle
; mask; expected to lower to a lane-0 dup.4s of the incoming FP register.
205define <4 x float> @v_shuffledupQfloat(float %A) nounwind {
206; CHECK-LABEL: v_shuffledupQfloat:
207; CHECK:       // %bb.0:
208; CHECK-NEXT:    // kill: def $s0 killed $s0 def $q0
209; CHECK-NEXT:    dup.4s v0, v0[0]
210; CHECK-NEXT:    ret
211  %seed = insertelement <4 x float> undef, float %A, i32 0
212  %splat = shufflevector <4 x float> %seed, <4 x float> undef, <4 x i32> zeroinitializer
213  ret <4 x float> %splat
214}
215
; Broadcasts lane 1 of a 64-bit i8 vector to every lane of the result;
; expected to lower to dup.8b v0, v0[1].
216define <8 x i8> @vduplane8(<8 x i8> %A) nounwind {
217; CHECK-LABEL: vduplane8:
218; CHECK:       // %bb.0:
219; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
220; CHECK-NEXT:    dup.8b v0, v0[1]
221; CHECK-NEXT:    ret
222  %splat = shufflevector <8 x i8> %A, <8 x i8> undef, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
223  ret <8 x i8> %splat
224}
225
; Broadcasts lane 1 of a 64-bit i16 vector to every lane of the result;
; expected to lower to dup.4h v0, v0[1].
226define <4 x i16> @vduplane16(<4 x i16> %A) nounwind {
227; CHECK-LABEL: vduplane16:
228; CHECK:       // %bb.0:
229; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
230; CHECK-NEXT:    dup.4h v0, v0[1]
231; CHECK-NEXT:    ret
232  %splat = shufflevector <4 x i16> %A, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
233  ret <4 x i16> %splat
234}
235
; Broadcasts lane 1 of a 64-bit i32 vector to both lanes of the result;
; expected to lower to dup.2s v0, v0[1].
236define <2 x i32> @vduplane32(<2 x i32> %A) nounwind {
237; CHECK-LABEL: vduplane32:
238; CHECK:       // %bb.0:
239; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
240; CHECK-NEXT:    dup.2s v0, v0[1]
241; CHECK-NEXT:    ret
242  %splat = shufflevector <2 x i32> %A, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
243  ret <2 x i32> %splat
244}
245
; Broadcasts lane 1 of a 64-bit float vector to both lanes of the result;
; expected to lower to dup.2s v0, v0[1].
246define <2 x float> @vduplanefloat(<2 x float> %A) nounwind {
247; CHECK-LABEL: vduplanefloat:
248; CHECK:       // %bb.0:
249; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
250; CHECK-NEXT:    dup.2s v0, v0[1]
251; CHECK-NEXT:    ret
252  %splat = shufflevector <2 x float> %A, <2 x float> undef, <2 x i32> <i32 1, i32 1>
253  ret <2 x float> %splat
254}
255
; Widening lane broadcast: lane 1 of a 64-bit i8 vector is replicated into
; all sixteen lanes of a 128-bit result (dup.16b v0, v0[1]).
256define <16 x i8> @vduplaneQ8(<8 x i8> %A) nounwind {
257; CHECK-LABEL: vduplaneQ8:
258; CHECK:       // %bb.0:
259; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
260; CHECK-NEXT:    dup.16b v0, v0[1]
261; CHECK-NEXT:    ret
262  %splat = shufflevector <8 x i8> %A, <8 x i8> undef, <16 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
263  ret <16 x i8> %splat
264}
265
; Widening lane broadcast: lane 1 of a 64-bit i16 vector is replicated into
; all eight lanes of a 128-bit result (dup.8h v0, v0[1]).
266define <8 x i16> @vduplaneQ16(<4 x i16> %A) nounwind {
267; CHECK-LABEL: vduplaneQ16:
268; CHECK:       // %bb.0:
269; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
270; CHECK-NEXT:    dup.8h v0, v0[1]
271; CHECK-NEXT:    ret
272  %splat = shufflevector <4 x i16> %A, <4 x i16> undef, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
273  ret <8 x i16> %splat
274}
275
; Widening lane broadcast: lane 1 of a 64-bit i32 vector is replicated into
; all four lanes of a 128-bit result (dup.4s v0, v0[1]).
276define <4 x i32> @vduplaneQ32(<2 x i32> %A) nounwind {
277; CHECK-LABEL: vduplaneQ32:
278; CHECK:       // %bb.0:
279; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
280; CHECK-NEXT:    dup.4s v0, v0[1]
281; CHECK-NEXT:    ret
282  %splat = shufflevector <2 x i32> %A, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
283  ret <4 x i32> %splat
284}
285
; Widening lane broadcast: lane 1 of a 64-bit float vector is replicated into
; all four lanes of a 128-bit result (dup.4s v0, v0[1]).
286define <4 x float> @vduplaneQfloat(<2 x float> %A) nounwind {
287; CHECK-LABEL: vduplaneQfloat:
288; CHECK:       // %bb.0:
289; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
290; CHECK-NEXT:    dup.4s v0, v0[1]
291; CHECK-NEXT:    ret
292  %splat = shufflevector <2 x float> %A, <2 x float> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
293  ret <4 x float> %splat
294}
295
; Broadcasts the high i64 lane (lane 1) of the argument to both result lanes;
; expected to lower to dup.2d v0, v0[1].
296define <2 x i64> @foo(<2 x i64> %arg0_int64x1_t) nounwind readnone {
297; CHECK-LABEL: foo:
298; CHECK:       // %bb.0: // %entry
299; CHECK-NEXT:    dup.2d v0, v0[1]
300; CHECK-NEXT:    ret
301entry:
302  %splat = shufflevector <2 x i64> %arg0_int64x1_t, <2 x i64> undef, <2 x i32> <i32 1, i32 1>
303  ret <2 x i64> %splat
304}
305
; Broadcasts the low i64 lane (lane 0) of the argument to both result lanes;
; expected to lower to dup.2d v0, v0[0].
306define <2 x i64> @bar(<2 x i64> %arg0_int64x1_t) nounwind readnone {
307; CHECK-LABEL: bar:
308; CHECK:       // %bb.0: // %entry
309; CHECK-NEXT:    dup.2d v0, v0[0]
310; CHECK-NEXT:    ret
311entry:
312  %splat = shufflevector <2 x i64> %arg0_int64x1_t, <2 x i64> undef, <2 x i32> <i32 0, i32 0>
313  ret <2 x i64> %splat
314}
315
; Broadcasts the high double lane (lane 1) of the argument to both result
; lanes; expected to lower to dup.2d v0, v0[1].
316define <2 x double> @baz(<2 x double> %arg0_int64x1_t) nounwind readnone {
317; CHECK-LABEL: baz:
318; CHECK:       // %bb.0: // %entry
319; CHECK-NEXT:    dup.2d v0, v0[1]
320; CHECK-NEXT:    ret
321entry:
322  %splat = shufflevector <2 x double> %arg0_int64x1_t, <2 x double> undef, <2 x i32> <i32 1, i32 1>
323  ret <2 x double> %splat
324}
325
; Broadcasts the low double lane (lane 0) of the argument to both result
; lanes; expected to lower to dup.2d v0, v0[0].
326define <2 x double> @qux(<2 x double> %arg0_int64x1_t) nounwind readnone {
327; CHECK-LABEL: qux:
328; CHECK:       // %bb.0: // %entry
329; CHECK-NEXT:    dup.2d v0, v0[0]
330; CHECK-NEXT:    ret
331entry:
332  %splat = shufflevector <2 x double> %arg0_int64x1_t, <2 x double> undef, <2 x i32> <i32 0, i32 0>
333  ret <2 x double> %splat
334}
335
; Builds a <2 x i32> from two different scalars, so no splat is possible.
; SelectionDAG is expected to seed lane 0 with fmov; GlobalISel is expected
; to use a lane insert for both elements.
336define <2 x i32> @f(i32 %a, i32 %b) nounwind readnone {
337; CHECK-SD-LABEL: f:
338; CHECK-SD:       // %bb.0:
339; CHECK-SD-NEXT:    fmov s0, w0
340; CHECK-SD-NEXT:    mov.s v0[1], w1
341; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 killed $q0
342; CHECK-SD-NEXT:    ret
343;
344; CHECK-GI-LABEL: f:
345; CHECK-GI:       // %bb.0:
346; CHECK-GI-NEXT:    mov.s v0[0], w0
347; CHECK-GI-NEXT:    mov.s v0[1], w1
348; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
349; CHECK-GI-NEXT:    ret
350  %with_a = insertelement <2 x i32> undef, i32 %a, i32 0
351  %with_ab = insertelement <2 x i32> %with_a, i32 %b, i32 1
352  ret <2 x i32> %with_ab
353}
354
; Builds the <4 x i32> pattern <a, b, b, a> from two scalars; the expected
; lowering inserts each lane individually rather than using a dup.
355define <4 x i32> @g(i32 %a, i32 %b) nounwind readnone {
356; CHECK-SD-LABEL: g:
357; CHECK-SD:       // %bb.0:
358; CHECK-SD-NEXT:    fmov s0, w0
359; CHECK-SD-NEXT:    mov.s v0[1], w1
360; CHECK-SD-NEXT:    mov.s v0[2], w1
361; CHECK-SD-NEXT:    mov.s v0[3], w0
362; CHECK-SD-NEXT:    ret
363;
364; CHECK-GI-LABEL: g:
365; CHECK-GI:       // %bb.0:
366; CHECK-GI-NEXT:    mov.s v0[0], w0
367; CHECK-GI-NEXT:    mov.s v0[1], w1
368; CHECK-GI-NEXT:    mov.s v0[2], w1
369; CHECK-GI-NEXT:    mov.s v0[3], w0
370; CHECK-GI-NEXT:    ret
371  %e0 = insertelement <4 x i32> undef, i32 %a, i32 0
372  %e1 = insertelement <4 x i32> %e0, i32 %b, i32 1
373  %e2 = insertelement <4 x i32> %e1, i32 %b, i32 2
374  %e3 = insertelement <4 x i32> %e2, i32 %a, i32 3
375  ret <4 x i32> %e3
376}
377
; Builds a <2 x i64> from two different 64-bit scalars; the expected lowering
; places %a in lane 0 and inserts %b into lane 1.
378define <2 x i64> @h(i64 %a, i64 %b) nounwind readnone {
379; CHECK-SD-LABEL: h:
380; CHECK-SD:       // %bb.0:
381; CHECK-SD-NEXT:    fmov d0, x0
382; CHECK-SD-NEXT:    mov.d v0[1], x1
383; CHECK-SD-NEXT:    ret
384;
385; CHECK-GI-LABEL: h:
386; CHECK-GI:       // %bb.0:
387; CHECK-GI-NEXT:    mov.d v0[0], x0
388; CHECK-GI-NEXT:    mov.d v0[1], x1
389; CHECK-GI-NEXT:    ret
390  %with_a = insertelement <2 x i64> undef, i64 %a, i32 0
391  %with_ab = insertelement <2 x i64> %with_a, i64 %b, i32 1
392  ret <2 x i64> %with_ab
393}
394
395; We used to spot this as a BUILD_VECTOR implementable by dup, but assume that
396; the single value needed was of the same type as the vector. This is false if
397; the scalar corresponding to the vector type is illegal (e.g. a <4 x i16>
398; BUILD_VECTOR will have an i32 as its source). In that case, the operation is
399; not a simple "dup vD.4h, vN.h[idx]" after all, and we crashed.
400;
401; *However*, it is a dup vD.4h, vN.h[2*idx].
; Lane 3 of a <4 x i32> is truncated to i16 and written to lane 3 of an
; otherwise-undef <4 x i16>. SelectionDAG is expected to pick h-lane 6 of the
; source (2 * 3); GlobalISel is expected to go through w8.
402define <4 x i16> @test_build_illegal(<4 x i32> %in) {
403; CHECK-SD-LABEL: test_build_illegal:
404; CHECK-SD:       // %bb.0:
405; CHECK-SD-NEXT:    dup.4h v0, v0[6]
406; CHECK-SD-NEXT:    ret
407;
408; CHECK-GI-LABEL: test_build_illegal:
409; CHECK-GI:       // %bb.0:
410; CHECK-GI-NEXT:    mov.s w8, v0[3]
411; CHECK-GI-NEXT:    mov.h v0[3], w8
412; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
413; CHECK-GI-NEXT:    ret
414  %wide = extractelement <4 x i32> %in, i32 3
415  %narrow = trunc i32 %wide to i16
416  %res = insertelement <4 x i16> undef, i16 %narrow, i32 3
417
418  ret <4 x i16> %res
419}
420
421; We used to inherit an already extract_subvectored v4i16 from
422; SelectionDAGBuilder here. We then added a DUPLANE on top of that, preventing
423; the formation of an indexed-by-7 MLS.
; Computes %a - (splat(%v[7]) * %b) on <4 x i16>. SelectionDAG is expected to
; fold the splat into an indexed mls; GlobalISel is expected to emit a
; separate dup first.
; NOTE(review): attribute group #0 is referenced here but its definition is
; not visible in this part of the file - confirm it exists.
424define <4 x i16> @test_high_splat(<4 x i16> %a, <4 x i16> %b, <8 x i16> %v) #0 {
425; CHECK-SD-LABEL: test_high_splat:
426; CHECK-SD:       // %bb.0: // %entry
427; CHECK-SD-NEXT:    mls.4h v0, v1, v2[7]
428; CHECK-SD-NEXT:    ret
429;
430; CHECK-GI-LABEL: test_high_splat:
431; CHECK-GI:       // %bb.0: // %entry
432; CHECK-GI-NEXT:    dup.8h v2, v2[7]
433; CHECK-GI-NEXT:    mls.4h v0, v2, v1
434; CHECK-GI-NEXT:    ret
435entry:
436  %hi_splat = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
437  %prod = mul <4 x i16> %hi_splat, %b
438  %acc = sub <4 x i16> %a, %prod
439  ret <4 x i16> %acc
440}
441
442; Also test the DUP path in the PerfectShuffle generator.
443
; Two-input shuffle producing <a[0], a[0], b[0], b[1]> on <4 x i16>.
; SelectionDAG is expected to use trn1 plus a 32-bit lane move; GlobalISel is
; expected to fall back to a table lookup with a constant-pool mask.
444define <4 x i16> @test_perfectshuffle_dupext_v4i16(<4 x i16> %a, <4 x i16> %b) nounwind {
445; CHECK-SD-LABEL: test_perfectshuffle_dupext_v4i16:
446; CHECK-SD:       // %bb.0:
447; CHECK-SD-NEXT:    trn1.4h v0, v0, v0
448; CHECK-SD-NEXT:    // kill: def $d1 killed $d1 def $q1
449; CHECK-SD-NEXT:    mov.s v0[1], v1[0]
450; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 killed $q0
451; CHECK-SD-NEXT:    ret
452;
453; CHECK-GI-LABEL: test_perfectshuffle_dupext_v4i16:
454; CHECK-GI:       // %bb.0:
455; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
456; CHECK-GI-NEXT:    // kill: def $d1 killed $d1 def $q1
457; CHECK-GI-NEXT:    adrp x8, .LCPI34_0
458; CHECK-GI-NEXT:    mov.d v0[1], v1[0]
459; CHECK-GI-NEXT:    ldr d1, [x8, :lo12:.LCPI34_0]
460; CHECK-GI-NEXT:    tbl.16b v0, { v0 }, v1
461; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
462; CHECK-GI-NEXT:    ret
463  %mixed = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 0, i32 4, i32 5>
464  ret <4 x i16> %mixed
465}
466
; Half-precision counterpart of the <a[0], a[0], b[0], b[1]> shuffle.
; SelectionDAG is expected to use trn1 plus a 32-bit lane move; GlobalISel is
; expected to fall back to a table lookup with a constant-pool mask.
467define <4 x half> @test_perfectshuffle_dupext_v4f16(<4 x half> %a, <4 x half> %b) nounwind {
468; CHECK-SD-LABEL: test_perfectshuffle_dupext_v4f16:
469; CHECK-SD:       // %bb.0:
470; CHECK-SD-NEXT:    trn1.4h v0, v0, v0
471; CHECK-SD-NEXT:    // kill: def $d1 killed $d1 def $q1
472; CHECK-SD-NEXT:    mov.s v0[1], v1[0]
473; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 killed $q0
474; CHECK-SD-NEXT:    ret
475;
476; CHECK-GI-LABEL: test_perfectshuffle_dupext_v4f16:
477; CHECK-GI:       // %bb.0:
478; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
479; CHECK-GI-NEXT:    // kill: def $d1 killed $d1 def $q1
480; CHECK-GI-NEXT:    adrp x8, .LCPI35_0
481; CHECK-GI-NEXT:    mov.d v0[1], v1[0]
482; CHECK-GI-NEXT:    ldr d1, [x8, :lo12:.LCPI35_0]
483; CHECK-GI-NEXT:    tbl.16b v0, { v0 }, v1
484; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
485; CHECK-GI-NEXT:    ret
486  %mixed = shufflevector <4 x half> %a, <4 x half> %b, <4 x i32> <i32 0, i32 0, i32 4, i32 5>
487  ret <4 x half> %mixed
488}
489
; 128-bit i32 version of the <a[0], a[0], b[0], b[1]> shuffle. SelectionDAG
; is expected to use trn1 plus a 64-bit lane move; GlobalISel is expected to
; use a two-register table lookup.
490define <4 x i32> @test_perfectshuffle_dupext_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
491; CHECK-SD-LABEL: test_perfectshuffle_dupext_v4i32:
492; CHECK-SD:       // %bb.0:
493; CHECK-SD-NEXT:    trn1.4s v0, v0, v0
494; CHECK-SD-NEXT:    mov.d v0[1], v1[0]
495; CHECK-SD-NEXT:    ret
496;
497; CHECK-GI-LABEL: test_perfectshuffle_dupext_v4i32:
498; CHECK-GI:       // %bb.0:
499; CHECK-GI-NEXT:    adrp x8, .LCPI36_0
500; CHECK-GI-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
501; CHECK-GI-NEXT:    ldr q2, [x8, :lo12:.LCPI36_0]
502; CHECK-GI-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
503; CHECK-GI-NEXT:    tbl.16b v0, { v0, v1 }, v2
504; CHECK-GI-NEXT:    ret
505  %mixed = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 0, i32 4, i32 5>
506  ret <4 x i32> %mixed
507}
508
; 128-bit float version of the <a[0], a[0], b[0], b[1]> shuffle. SelectionDAG
; is expected to use trn1 plus a 64-bit lane move; GlobalISel is expected to
; use a two-register table lookup.
509define <4 x float> @test_perfectshuffle_dupext_v4f32(<4 x float> %a, <4 x float> %b) nounwind {
510; CHECK-SD-LABEL: test_perfectshuffle_dupext_v4f32:
511; CHECK-SD:       // %bb.0:
512; CHECK-SD-NEXT:    trn1.4s v0, v0, v0
513; CHECK-SD-NEXT:    mov.d v0[1], v1[0]
514; CHECK-SD-NEXT:    ret
515;
516; CHECK-GI-LABEL: test_perfectshuffle_dupext_v4f32:
517; CHECK-GI:       // %bb.0:
518; CHECK-GI-NEXT:    adrp x8, .LCPI37_0
519; CHECK-GI-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
520; CHECK-GI-NEXT:    ldr q2, [x8, :lo12:.LCPI37_0]
521; CHECK-GI-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
522; CHECK-GI-NEXT:    tbl.16b v0, { v0, v1 }, v2
523; CHECK-GI-NEXT:    ret
524  %mixed = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 0, i32 4, i32 5>
525  ret <4 x float> %mixed
526}
527
; The second shuffle only reads lanes 2 and 3 of the first, and both of those
; hold %x[0], so it is really a splat of %x lane 0. SelectionDAG is expected
; to see through this and emit dup.4s v0, v0[0]; GlobalISel is expected to
; use two table lookups. Both shuffle results are stored.
528define void @disguised_dup(<4 x float> %x, ptr %p1, ptr %p2) {
529; CHECK-SD-LABEL: disguised_dup:
530; CHECK-SD:       // %bb.0:
531; CHECK-SD-NEXT:    ext.16b v1, v0, v0, #4
532; CHECK-SD-NEXT:    mov.s v1[2], v0[0]
533; CHECK-SD-NEXT:    dup.4s v0, v0[0]
534; CHECK-SD-NEXT:    str q1, [x0]
535; CHECK-SD-NEXT:    str q0, [x1]
536; CHECK-SD-NEXT:    ret
537;
538; CHECK-GI-LABEL: disguised_dup:
539; CHECK-GI:       // %bb.0:
540; CHECK-GI-NEXT:    adrp x8, .LCPI38_1
541; CHECK-GI-NEXT:    // kill: def $q0 killed $q0 def $q0_q1
542; CHECK-GI-NEXT:    ldr q2, [x8, :lo12:.LCPI38_1]
543; CHECK-GI-NEXT:    adrp x8, .LCPI38_0
544; CHECK-GI-NEXT:    tbl.16b v0, { v0, v1 }, v2
545; CHECK-GI-NEXT:    ldr q2, [x8, :lo12:.LCPI38_0]
546; CHECK-GI-NEXT:    tbl.16b v2, { v0, v1 }, v2
547; CHECK-GI-NEXT:    str q0, [x0]
548; CHECK-GI-NEXT:    str q2, [x1]
549; CHECK-GI-NEXT:    ret
550  %permuted = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 1, i32 2, i32 0, i32 0>
551  %splat = shufflevector <4 x float> %permuted, <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 2, i32 3>
552  store <4 x float> %permuted, ptr %p1, align 8
553  store <4 x float> %splat, ptr %p2, align 8
554  ret void
555}
556
; Adds the splat constant 8421378 (0x808002) to a <2 x i32>. SelectionDAG is
; expected to build the scalar with mov/movk and dup it; GlobalISel is
; expected to load the vector from a constant pool.
557define <2 x i32> @dup_const2(<2 x i32> %A) nounwind {
558; CHECK-SD-LABEL: dup_const2:
559; CHECK-SD:       // %bb.0:
560; CHECK-SD-NEXT:    mov w8, #32770 // =0x8002
561; CHECK-SD-NEXT:    movk w8, #128, lsl #16
562; CHECK-SD-NEXT:    dup.2s v1, w8
563; CHECK-SD-NEXT:    add.2s v0, v0, v1
564; CHECK-SD-NEXT:    ret
565;
566; CHECK-GI-LABEL: dup_const2:
567; CHECK-GI:       // %bb.0:
568; CHECK-GI-NEXT:    adrp x8, .LCPI39_0
569; CHECK-GI-NEXT:    ldr d1, [x8, :lo12:.LCPI39_0]
570; CHECK-GI-NEXT:    add.2s v0, v0, v1
571; CHECK-GI-NEXT:    ret
572  %sum = add <2 x i32> %A, <i32 8421378, i32 8421378>
573  ret <2 x i32> %sum
574}
575
; Adds the splat constant 8421377 (0x808001) to a <4 x i32> and returns only
; the low two lanes. SelectionDAG is expected to narrow the add to 64 bits;
; GlobalISel is expected to do the full 128-bit add.
576define <2 x i32> @dup_const4_ext(<4 x i32> %A) nounwind {
577; CHECK-SD-LABEL: dup_const4_ext:
578; CHECK-SD:       // %bb.0:
579; CHECK-SD-NEXT:    mov w8, #32769 // =0x8001
580; CHECK-SD-NEXT:    movk w8, #128, lsl #16
581; CHECK-SD-NEXT:    dup.2s v1, w8
582; CHECK-SD-NEXT:    add.2s v0, v0, v1
583; CHECK-SD-NEXT:    ret
584;
585; CHECK-GI-LABEL: dup_const4_ext:
586; CHECK-GI:       // %bb.0:
587; CHECK-GI-NEXT:    adrp x8, .LCPI40_0
588; CHECK-GI-NEXT:    ldr q1, [x8, :lo12:.LCPI40_0]
589; CHECK-GI-NEXT:    add.4s v0, v0, v1
590; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
591; CHECK-GI-NEXT:    ret
592  %sum4 = add <4 x i32> %A, <i32 8421377, i32 8421377, i32 8421377, i32 8421377>
593  %low_half = shufflevector <4 x i32> %sum4, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
594  ret <2 x i32> %low_half
595}
596
; Uses the same splat constant 8421376 (0x808000) at both 64-bit and 128-bit
; width. SelectionDAG is expected to materialise one dup.4s and reuse it for
; both adds; GlobalISel is expected to load two constant-pool entries.
597define <4 x i32> @dup_const24(<2 x i32> %A, <2 x i32> %B, <4 x i32> %C) nounwind {
598; CHECK-SD-LABEL: dup_const24:
599; CHECK-SD:       // %bb.0:
600; CHECK-SD-NEXT:    mov w8, #32768 // =0x8000
601; CHECK-SD-NEXT:    // kill: def $d1 killed $d1 def $q1
602; CHECK-SD-NEXT:    movk w8, #128, lsl #16
603; CHECK-SD-NEXT:    dup.4s v3, w8
604; CHECK-SD-NEXT:    add.2s v0, v0, v3
605; CHECK-SD-NEXT:    mov.d v0[1], v1[0]
606; CHECK-SD-NEXT:    add.4s v1, v2, v3
607; CHECK-SD-NEXT:    eor.16b v0, v1, v0
608; CHECK-SD-NEXT:    ret
609;
610; CHECK-GI-LABEL: dup_const24:
611; CHECK-GI:       // %bb.0:
612; CHECK-GI-NEXT:    adrp x8, .LCPI41_1
613; CHECK-GI-NEXT:    // kill: def $d1 killed $d1 def $q1
614; CHECK-GI-NEXT:    ldr d3, [x8, :lo12:.LCPI41_1]
615; CHECK-GI-NEXT:    adrp x8, .LCPI41_0
616; CHECK-GI-NEXT:    add.2s v0, v0, v3
617; CHECK-GI-NEXT:    ldr q3, [x8, :lo12:.LCPI41_0]
618; CHECK-GI-NEXT:    mov.d v0[1], v1[0]
619; CHECK-GI-NEXT:    add.4s v1, v2, v3
620; CHECK-GI-NEXT:    eor.16b v0, v1, v0
621; CHECK-GI-NEXT:    ret
622  %a_biased = add <2 x i32> %A, <i32 8421376, i32 8421376>
623  %concat = shufflevector <2 x i32> %a_biased, <2 x i32> %B, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
624  %c_biased = add <4 x i32> %C, <i32 8421376, i32 8421376, i32 8421376, i32 8421376>
625  %mixed = xor <4 x i32> %c_biased, %concat
626  ret <4 x i32> %mixed
627}
628
; Reinterprets an i64 as <4 x i16> and splats lane 0 into eight lanes.
; SelectionDAG is expected to dup straight from w0; GlobalISel is expected
; to move the scalar to a vector register first.
629define <8 x i16> @bitcast_i64_v8i16(i64 %a) {
630; CHECK-SD-LABEL: bitcast_i64_v8i16:
631; CHECK-SD:       // %bb.0:
632; CHECK-SD-NEXT:    dup.8h v0, w0
633; CHECK-SD-NEXT:    ret
634;
635; CHECK-GI-LABEL: bitcast_i64_v8i16:
636; CHECK-GI:       // %bb.0:
637; CHECK-GI-NEXT:    fmov d0, x0
638; CHECK-GI-NEXT:    dup.8h v0, v0[0]
639; CHECK-GI-NEXT:    ret
640  %as_vec = bitcast i64 %a to <4 x i16>
641  %splat = shufflevector <4 x i16> %as_vec, <4 x i16> poison, <8 x i32> zeroinitializer
642  ret <8 x i16> %splat
643}
644
; Reinterprets an i64 as <4 x i16> and splats lane 1 into eight lanes; the
; expected lowering moves the scalar to d0 and then uses a lane-1 dup.8h.
645define <8 x i16> @bitcast_i64_v8i16_lane1(i64 %a) {
646; CHECK-LABEL: bitcast_i64_v8i16_lane1:
647; CHECK:       // %bb.0:
648; CHECK-NEXT:    fmov d0, x0
649; CHECK-NEXT:    dup.8h v0, v0[1]
650; CHECK-NEXT:    ret
651  %as_vec = bitcast i64 %a to <4 x i16>
652  %splat = shufflevector <4 x i16> %as_vec, <4 x i16> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
653  ret <8 x i16> %splat
654}
655
; Reinterprets a double as <4 x i16> and splats lane 0 into eight lanes; the
; value already lives in d0, so only a lane-0 dup.8h is expected.
656define <8 x i16> @bitcast_f64_v8i16(double %a) {
657; CHECK-LABEL: bitcast_f64_v8i16:
658; CHECK:       // %bb.0:
659; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
660; CHECK-NEXT:    dup.8h v0, v0[0]
661; CHECK-NEXT:    ret
662  %as_vec = bitcast double %a to <4 x i16>
663  %splat = shufflevector <4 x i16> %as_vec, <4 x i16> poison, <8 x i32> zeroinitializer
664  ret <8 x i16> %splat
665}
666
; Reinterprets an i64 as <4 x half> and splats lane 0 into eight lanes; the
; expected lowering moves the scalar to d0 and then uses a lane-0 dup.8h.
667define <8 x half> @bitcast_i64_v8f16(i64 %a) {
668; CHECK-LABEL: bitcast_i64_v8f16:
669; CHECK:       // %bb.0:
670; CHECK-NEXT:    fmov d0, x0
671; CHECK-NEXT:    dup.8h v0, v0[0]
672; CHECK-NEXT:    ret
673  %as_vec = bitcast i64 %a to <4 x half>
674  %splat = shufflevector <4 x half> %as_vec, <4 x half> poison, <8 x i32> zeroinitializer
675  ret <8 x half> %splat
676}
677
; Reinterprets an i64 as <1 x i64> and splats it into two lanes. Despite the
; "v2f64" in the name, the result type is <2 x i64>. SelectionDAG is expected
; to go through d0; GlobalISel is expected to dup directly from x0.
678define <2 x i64> @bitcast_i64_v2f64(i64 %a) {
679; CHECK-SD-LABEL: bitcast_i64_v2f64:
680; CHECK-SD:       // %bb.0:
681; CHECK-SD-NEXT:    fmov d0, x0
682; CHECK-SD-NEXT:    dup.2d v0, v0[0]
683; CHECK-SD-NEXT:    ret
684;
685; CHECK-GI-LABEL: bitcast_i64_v2f64:
686; CHECK-GI:       // %bb.0:
687; CHECK-GI-NEXT:    dup.2d v0, x0
688; CHECK-GI-NEXT:    ret
689  %as_vec = bitcast i64 %a to <1 x i64>
690  %splat = shufflevector <1 x i64> %as_vec, <1 x i64> poison, <2 x i32> zeroinitializer
691  ret <2 x i64> %splat
692}
693
; Same-width vector bitcast (<2 x double> to <2 x i64>) followed by a lane-0
; splat; the bitcast is expected to be free, leaving only dup.2d.
694define <2 x i64> @bitcast_v2f64_v2i64(<2 x double> %a) {
695; CHECK-LABEL: bitcast_v2f64_v2i64:
696; CHECK:       // %bb.0:
697; CHECK-NEXT:    dup.2d v0, v0[0]
698; CHECK-NEXT:    ret
699  %as_vec = bitcast <2 x double> %a to <2 x i64>
700  %splat = shufflevector <2 x i64> %as_vec, <2 x i64> poison, <2 x i32> zeroinitializer
701  ret <2 x i64> %splat
702}
703
; Bitcast from narrower elements (<8 x i16>) to <2 x i64> followed by a
; lane-0 splat; expected to lower to a single dup.2d v0, v0[0].
704define <2 x i64> @bitcast_v8i16_v2i64(<8 x i16> %a) {
705; CHECK-LABEL: bitcast_v8i16_v2i64:
706; CHECK:       // %bb.0:
707; CHECK-NEXT:    dup.2d v0, v0[0]
708; CHECK-NEXT:    ret
709  %as_vec = bitcast <8 x i16> %a to <2 x i64>
710  %splat = shufflevector <2 x i64> %as_vec, <2 x i64> poison, <2 x i32> zeroinitializer
711  ret <2 x i64> %splat
712}
713
; Bitcast from wider elements to <8 x i16> followed by a lane-0 splat;
; expected to lower to a single dup.8h v0, v0[0]. Despite the "v2f64" in the
; name, the argument type is <2 x i64>.
714define <8 x i16> @bitcast_v2f64_v8i16(<2 x i64> %a) {
715; CHECK-LABEL: bitcast_v2f64_v8i16:
716; CHECK:       // %bb.0:
717; CHECK-NEXT:    dup.8h v0, v0[0]
718; CHECK-NEXT:    ret
719  %as_vec = bitcast <2 x i64> %a to <8 x i16>
720  %splat = shufflevector <8 x i16> %as_vec, <8 x i16> poison, <8 x i32> zeroinitializer
721  ret <8 x i16> %splat
722}
723
; Returns a constant <4 x i16> whose lanes all equal 9211 (0x23fb).
; SelectionDAG is expected to materialise the scalar in w8 and dup it;
; GlobalISel is expected to load the whole vector from a constant pool.
724define <4 x i16> @dup_i16_v4i16_constant() {
725; CHECK-SD-LABEL: dup_i16_v4i16_constant:
726; CHECK-SD:       // %bb.0:
727; CHECK-SD-NEXT:    mov w8, #9211 // =0x23fb
728; CHECK-SD-NEXT:    dup.4h v0, w8
729; CHECK-SD-NEXT:    ret
730;
731; CHECK-GI-LABEL: dup_i16_v4i16_constant:
732; CHECK-GI:       // %bb.0:
733; CHECK-GI-NEXT:    adrp x8, .LCPI50_0
734; CHECK-GI-NEXT:    ldr d0, [x8, :lo12:.LCPI50_0]
735; CHECK-GI-NEXT:    ret
736  ret <4 x i16> <i16 9211, i16 9211, i16 9211, i16 9211>
737}
738