xref: /llvm-project/llvm/test/CodeGen/AArch64/shuffle-tbl34.ll (revision f6947e479e14e7904aa0b2539a95f5dfdc8f9295)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=aarch64 < %s | FileCheck %s
3
4; CHECK: .LCPI0_0:
5; CHECK: 	.byte	0                               // 0x0
6; CHECK: 	.byte	16                              // 0x10
7; CHECK: 	.byte	32                              // 0x20
8; CHECK: 	.byte	48                              // 0x30
9; CHECK: 	.byte	2                               // 0x2
10; CHECK: 	.byte	18                              // 0x12
11; CHECK: 	.byte	34                              // 0x22
12; CHECK: 	.byte	50                              // 0x32
13; CHECK: 	.byte	4                               // 0x4
14; CHECK: 	.byte	20                              // 0x14
15; CHECK: 	.byte	36                              // 0x24
16; CHECK: 	.byte	52                              // 0x34
17; CHECK: 	.byte	6                               // 0x6
18; CHECK: 	.byte	22                              // 0x16
19; CHECK: 	.byte	38                              // 0x26
20; CHECK: 	.byte	54                              // 0x36
; Four-input interleave of <4 x i8> vectors into a <16 x i8> result.
; %x = concat(%a, %b) and %y = concat(%c, %d) use identity masks; %z then takes
; lane i of each of the four inputs in turn (mask 0,4,8,12, 1,5,9,13, ...).
; The expected assembly is one four-register tbl whose index vector is loaded
; from the constant pool.
21define <16 x i8> @shuffle4_v4i8_16(<4 x i8> %a, <4 x i8> %b, <4 x i8> %c, <4 x i8> %d) {
22; CHECK-LABEL: shuffle4_v4i8_16:
23; CHECK:       // %bb.0:
24; CHECK-NEXT:    // kill: def $d3 killed $d3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
25; CHECK-NEXT:    adrp x8, .LCPI0_0
26; CHECK-NEXT:    // kill: def $d2 killed $d2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
27; CHECK-NEXT:    ldr q4, [x8, :lo12:.LCPI0_0]
28; CHECK-NEXT:    // kill: def $d1 killed $d1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
29; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
30; CHECK-NEXT:    tbl v0.16b, { v0.16b, v1.16b, v2.16b, v3.16b }, v4.16b
31; CHECK-NEXT:    ret
32  %x = shufflevector <4 x i8> %a, <4 x i8> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
33  %y = shufflevector <4 x i8> %c, <4 x i8> %d, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
34  %z = shufflevector <8 x i8> %x, <8 x i8> %y, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
35  ret <16 x i8> %z
36}
37
38; CHECK: .LCPI1_0:
39; CHECK: 	.byte	0                               // 0x0
40; CHECK: 	.byte	16                              // 0x10
41; CHECK: 	.byte	32                              // 0x20
42; CHECK: 	.byte	48                              // 0x30
43; CHECK: 	.byte	2                               // 0x2
44; CHECK: 	.byte	18                              // 0x12
45; CHECK: 	.byte	34                              // 0x22
46; CHECK: 	.byte	50                              // 0x32
; Same four-input <4 x i8> interleave as shuffle4_v4i8_16, but the final mask
; has only 8 entries (0,4,8,12, 1,5,9,13), so the result is <8 x i8>.
; The expected assembly is a four-register tbl in its 8-byte form (v0.8b),
; with a 64-bit index vector loaded via ldr d4.
47define <8 x i8> @shuffle4_v4i8_8(<4 x i8> %a, <4 x i8> %b, <4 x i8> %c, <4 x i8> %d) {
48; CHECK-LABEL: shuffle4_v4i8_8:
49; CHECK:       // %bb.0:
50; CHECK-NEXT:    // kill: def $d3 killed $d3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
51; CHECK-NEXT:    adrp x8, .LCPI1_0
52; CHECK-NEXT:    // kill: def $d2 killed $d2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
53; CHECK-NEXT:    ldr d4, [x8, :lo12:.LCPI1_0]
54; CHECK-NEXT:    // kill: def $d1 killed $d1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
55; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
56; CHECK-NEXT:    tbl v0.8b, { v0.16b, v1.16b, v2.16b, v3.16b }, v4.8b
57; CHECK-NEXT:    ret
58  %x = shufflevector <4 x i8> %a, <4 x i8> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
59  %y = shufflevector <4 x i8> %c, <4 x i8> %d, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
60  %z = shufflevector <8 x i8> %x, <8 x i8> %y, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13>
61  ret <8 x i8> %z
62}
63
64; CHECK: .LCPI2_0:
65; CHECK: 	.byte	0                               // 0x0
66; CHECK: 	.byte	3                               // 0x3
67; CHECK: 	.byte	2                               // 0x2
68; CHECK: 	.byte	1                               // 0x1
69; CHECK: 	.byte	12                              // 0xc
70; CHECK: 	.byte	15                              // 0xf
71; CHECK: 	.byte	14                              // 0xe
72; CHECK: 	.byte	12                              // 0xc
73; CHECK: .LCPI2_1:
74; CHECK: 	.byte	4                               // 0x4
75; CHECK: 	.byte	7                               // 0x7
76; CHECK: 	.byte	6                               // 0x6
77; CHECK: 	.byte	7                               // 0x7
78; CHECK: 	.byte	8                               // 0x8
79; CHECK: 	.byte	10                              // 0xa
80; CHECK: 	.byte	9                               // 0x9
81; CHECK: 	.byte	11                              // 0xb
82; CHECK: 	.section	.rodata.cst16,"aM",@progbits,16
83; CHECK: 	.p2align	4
84; CHECK: .LCPI2_2:
85; CHECK: 	.byte	0                               // 0x0
86; CHECK: 	.byte	4                               // 0x4
87; CHECK: 	.byte	16                              // 0x10
88; CHECK: 	.byte	20                              // 0x14
89; CHECK: 	.byte	1                               // 0x1
90; CHECK: 	.byte	5                               // 0x5
91; CHECK: 	.byte	17                              // 0x11
92; CHECK: 	.byte	21                              // 0x15
93; CHECK: 	.byte	2                               // 0x2
94; CHECK: 	.byte	6                               // 0x6
95; CHECK: 	.byte	18                              // 0x12
96; CHECK: 	.byte	22                              // 0x16
97; CHECK: 	.byte	3                               // 0x3
98; CHECK: 	.byte	7                               // 0x7
99; CHECK: 	.byte	19                              // 0x13
100; CHECK: 	.byte	23                              // 0x17
; Two-level shuffle over four <8 x i8> inputs where the inner shuffles are
; arbitrary permutations (not plain concatenations): %x mixes lanes of %a and
; %b, %y mixes lanes of %c and %d, and %z interleaves %x and %y into <16 x i8>.
; The expected assembly keeps the two levels separate: each input pair is
; joined with mov v.d[1] and permuted by a single-register tbl, then a
; two-register tbl produces the final result.
101define <16 x i8> @shuffle4_v8i8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c, <8 x i8> %d) {
102; CHECK-LABEL: shuffle4_v8i8:
103; CHECK:       // %bb.0:
104; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
105; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
106; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
107; CHECK-NEXT:    // kill: def $d3 killed $d3 def $q3
108; CHECK-NEXT:    adrp x8, .LCPI2_0
109; CHECK-NEXT:    mov v0.d[1], v1.d[0]
110; CHECK-NEXT:    mov v2.d[1], v3.d[0]
111; CHECK-NEXT:    ldr d1, [x8, :lo12:.LCPI2_0]
112; CHECK-NEXT:    adrp x8, .LCPI2_1
113; CHECK-NEXT:    ldr d3, [x8, :lo12:.LCPI2_1]
114; CHECK-NEXT:    adrp x8, .LCPI2_2
115; CHECK-NEXT:    tbl v0.8b, { v0.16b }, v1.8b
116; CHECK-NEXT:    tbl v1.8b, { v2.16b }, v3.8b
117; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI2_2]
118; CHECK-NEXT:    tbl v0.16b, { v0.16b, v1.16b }, v2.16b
119; CHECK-NEXT:    ret
120  %x = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 3, i32 2, i32 1, i32 12, i32 15, i32 14, i32 12>
121  %y = shufflevector <8 x i8> %c, <8 x i8> %d, <8 x i32> <i32 4, i32 7, i32 6, i32 7, i32 8, i32 10, i32 9, i32 11>
122  %z = shufflevector <8 x i8> %x, <8 x i8> %y, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
123  ret <16 x i8> %z
124}
125
126; CHECK: .LCPI3_0:
127; CHECK: 	.byte	0                               // 0x0
128; CHECK: 	.byte	3                               // 0x3
129; CHECK: 	.byte	2                               // 0x2
130; CHECK: 	.byte	1                               // 0x1
131; CHECK: 	.byte	12                              // 0xc
132; CHECK: 	.byte	15                              // 0xf
133; CHECK: 	.byte	14                              // 0xe
134; CHECK: 	.byte	12                              // 0xc
135; CHECK: 	.byte	255                             // 0xff
136; CHECK: 	.byte	255                             // 0xff
137; CHECK: 	.byte	255                             // 0xff
138; CHECK: 	.byte	255                             // 0xff
139; CHECK: 	.byte	255                             // 0xff
140; CHECK: 	.byte	255                             // 0xff
141; CHECK: 	.byte	255                             // 0xff
142; CHECK: 	.byte	255                             // 0xff
143; CHECK: .LCPI3_1:
144; CHECK: 	.byte	4                               // 0x4
145; CHECK: 	.byte	7                               // 0x7
146; CHECK: 	.byte	6                               // 0x6
147; CHECK: 	.byte	7                               // 0x7
148; CHECK: 	.byte	8                               // 0x8
149; CHECK: 	.byte	10                              // 0xa
150; CHECK: 	.byte	9                               // 0x9
151; CHECK: 	.byte	11                              // 0xb
152; CHECK: 	.byte	255                             // 0xff
153; CHECK: 	.byte	255                             // 0xff
154; CHECK: 	.byte	255                             // 0xff
155; CHECK: 	.byte	255                             // 0xff
156; CHECK: 	.byte	255                             // 0xff
157; CHECK: 	.byte	255                             // 0xff
158; CHECK: 	.byte	255                             // 0xff
159; CHECK: 	.byte	255                             // 0xff
160; CHECK: .LCPI3_2:
161; CHECK: 	.byte	16                              // 0x10
162; CHECK: 	.byte	20                              // 0x14
163; CHECK: 	.byte	0                               // 0x0
164; CHECK: 	.byte	4                               // 0x4
165; CHECK: 	.byte	17                              // 0x11
166; CHECK: 	.byte	21                              // 0x15
167; CHECK: 	.byte	1                               // 0x1
168; CHECK: 	.byte	5                               // 0x5
169; CHECK: 	.byte	18                              // 0x12
170; CHECK: 	.byte	22                              // 0x16
171; CHECK: 	.byte	2                               // 0x2
172; CHECK: 	.byte	6                               // 0x6
173; CHECK: 	.byte	19                              // 0x13
174; CHECK: 	.byte	23                              // 0x17
175; CHECK: 	.byte	3                               // 0x3
176; CHECK: 	.byte	7                               // 0x7
; Variant of shuffle4_v8i8 with <16 x i8> inputs and the same mask values.
; Because every inner mask index is below 16, the inner shuffles only read
; lanes from their first operand (%a for %x, %c for %y); %b and %d are unused.
; The expected assembly is two single-register tbl instructions with 16-byte
; index vectors, followed by a two-register tbl for the final interleave.
177define <16 x i8> @shuffle4_v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d) {
178; CHECK-LABEL: shuffle4_v16i8:
179; CHECK:       // %bb.0:
180; CHECK-NEXT:    adrp x8, .LCPI3_0
181; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI3_0]
182; CHECK-NEXT:    adrp x8, .LCPI3_1
183; CHECK-NEXT:    ldr q3, [x8, :lo12:.LCPI3_1]
184; CHECK-NEXT:    adrp x8, .LCPI3_2
185; CHECK-NEXT:    tbl v1.16b, { v0.16b }, v1.16b
186; CHECK-NEXT:    tbl v0.16b, { v2.16b }, v3.16b
187; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI3_2]
188; CHECK-NEXT:    tbl v0.16b, { v0.16b, v1.16b }, v2.16b
189; CHECK-NEXT:    ret
190  %x = shufflevector <16 x i8> %a, <16 x i8> %b, <8 x i32> <i32 0, i32 3, i32 2, i32 1, i32 12, i32 15, i32 14, i32 12>
191  %y = shufflevector <16 x i8> %c, <16 x i8> %d, <8 x i32> <i32 4, i32 7, i32 6, i32 7, i32 8, i32 10, i32 9, i32 11>
192  %z = shufflevector <8 x i8> %x, <8 x i8> %y, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
193  ret <16 x i8> %z
194}
195
196; CHECK: .LCPI4_0:
197; CHECK: 	.byte	0                               // 0x0
198; CHECK: 	.byte	1                               // 0x1
199; CHECK: 	.byte	8                               // 0x8
200; CHECK: 	.byte	9                               // 0x9
201; CHECK: 	.byte	16                              // 0x10
202; CHECK: 	.byte	17                              // 0x11
203; CHECK: 	.byte	24                              // 0x18
204; CHECK: 	.byte	25                              // 0x19
205; CHECK: 	.byte	2                               // 0x2
206; CHECK: 	.byte	3                               // 0x3
207; CHECK: 	.byte	10                              // 0xa
208; CHECK: 	.byte	11                              // 0xb
209; CHECK: 	.byte	18                              // 0x12
210; CHECK: 	.byte	19                              // 0x13
211; CHECK: 	.byte	26                              // 0x1a
212; CHECK: 	.byte	27                              // 0x1b
; Four-input interleave with 16-bit elements: %x = concat(%a, %b),
; %y = concat(%c, %d), and %z takes lanes 0 and 1 of each input in turn
; (mask 0,4,8,12, 1,5,9,13) to form <8 x i16>.
; The expected assembly joins each input pair into one 128-bit register with
; mov v.d[1] and then uses a single two-register tbl.
213define <8 x i16> @shuffle4_v8i16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c, <4 x i16> %d) {
214; CHECK-LABEL: shuffle4_v8i16:
215; CHECK:       // %bb.0:
216; CHECK-NEXT:    fmov d5, d2
217; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
218; CHECK-NEXT:    // kill: def $d3 killed $d3 def $q3
219; CHECK-NEXT:    adrp x8, .LCPI4_0
220; CHECK-NEXT:    fmov d4, d0
221; CHECK-NEXT:    ldr q0, [x8, :lo12:.LCPI4_0]
222; CHECK-NEXT:    mov v4.d[1], v1.d[0]
223; CHECK-NEXT:    mov v5.d[1], v3.d[0]
224; CHECK-NEXT:    tbl v0.16b, { v4.16b, v5.16b }, v0.16b
225; CHECK-NEXT:    ret
226  %x = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
227  %y = shufflevector <4 x i16> %c, <4 x i16> %d, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
228  %z = shufflevector <8 x i16> %x, <8 x i16> %y, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13>
229  ret <8 x i16> %z
230}
231
; Four-input shuffle with 32-bit elements. After concatenating %a,%b into %x
; and %c,%d into %y, the final mask 15,10,5,0 picks one lane from each input:
; %d[3], %c[2], %b[1], %a[0].
; The expected assembly contains no tbl and no constant-pool load; the result
; is built from zip1 / rev64 / ext / zip2 / mov.
232define <4 x i32> @shuffle4_v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x i32> %d) {
233; CHECK-LABEL: shuffle4_v4i32:
234; CHECK:       // %bb.0:
235; CHECK-NEXT:    zip1 v1.4s, v1.4s, v1.4s
236; CHECK-NEXT:    rev64 v3.4s, v3.4s
237; CHECK-NEXT:    ext v1.16b, v1.16b, v0.16b, #4
238; CHECK-NEXT:    zip2 v0.4s, v3.4s, v2.4s
239; CHECK-NEXT:    mov v0.d[1], v1.d[1]
240; CHECK-NEXT:    ret
241  %x = shufflevector <4 x i32> %a, <4 x i32> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
242  %y = shufflevector <4 x i32> %c, <4 x i32> %d, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
243  %z = shufflevector <8 x i32> %x, <8 x i32> %y, <4 x i32> <i32 15, i32 10, i32 5, i32 0>
244  ret <4 x i32> %z
245}
246
247; CHECK: .LCPI6_0:
248; CHECK: 	.byte	0                               // 0x0
249; CHECK: 	.byte	7                               // 0x7
250; CHECK: 	.byte	255                             // 0xff
251; CHECK: 	.byte	1                               // 0x1
252; CHECK: 	.byte	255                             // 0xff
253; CHECK: 	.byte	255                             // 0xff
254; CHECK: 	.byte	255                             // 0xff
255; CHECK: 	.byte	255                             // 0xff
256; CHECK: 	.section	.rodata.cst16,"aM",@progbits,16
257; CHECK: 	.p2align	4
258; CHECK: .LCPI6_1:
259; CHECK: 	.byte	0                               // 0x0
260; CHECK: 	.byte	16                              // 0x10
261; CHECK: 	.byte	19                              // 0x13
262; CHECK: 	.byte	3                               // 0x3
263; CHECK: 	.byte	1                               // 0x1
264; CHECK: 	.byte	17                              // 0x11
265; CHECK: 	.byte	0                               // 0x0
266; CHECK: 	.byte	1                               // 0x1
267; CHECK: 	.byte	0                               // 0x0
268; CHECK: 	.byte	16                              // 0x10
269; CHECK: 	.byte	19                              // 0x13
270; CHECK: 	.byte	3                               // 0x3
271; CHECK: 	.byte	1                               // 0x1
272; CHECK: 	.byte	17                              // 0x11
273; CHECK: 	.byte	0                               // 0x0
274; CHECK: 	.byte	1                               // 0x1
; Narrowing-then-widening case: the inner shuffles reduce each <8 x i8> pair
; to <4 x i8> using mask 0,7,5,1 (all indices below 8, so only %a and %c are
; read), and the outer shuffle expands the two <4 x i8> values to <16 x i8>
; with an 8-entry pattern repeated twice.
; In the expected assembly both inner tbl instructions reuse the same index
; register (v1), and a two-register tbl produces the result.
275define <16 x i8> @shuffle4_v8i8_v16i8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c, <8 x i8> %d) {
276; CHECK-LABEL: shuffle4_v8i8_v16i8:
277; CHECK:       // %bb.0:
278; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
279; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
280; CHECK-NEXT:    adrp x8, .LCPI6_0
281; CHECK-NEXT:    mov v2.d[1], v2.d[0]
282; CHECK-NEXT:    mov v0.d[1], v0.d[0]
283; CHECK-NEXT:    ldr d1, [x8, :lo12:.LCPI6_0]
284; CHECK-NEXT:    adrp x8, .LCPI6_1
285; CHECK-NEXT:    tbl v3.8b, { v2.16b }, v1.8b
286; CHECK-NEXT:    tbl v2.8b, { v0.16b }, v1.8b
287; CHECK-NEXT:    ldr q0, [x8, :lo12:.LCPI6_1]
288; CHECK-NEXT:    tbl v0.16b, { v2.16b, v3.16b }, v0.16b
289; CHECK-NEXT:    ret
290  %x = shufflevector <8 x i8> %a, <8 x i8> %b, <4 x i32> <i32 0, i32 7, i32 5, i32 1>
291  %y = shufflevector <8 x i8> %c, <8 x i8> %d, <4 x i32> <i32 0, i32 7, i32 5, i32 1>
292  %z = shufflevector <4 x i8> %x, <4 x i8> %y, <16 x i32> <i32 0, i32 4, i32 7, i32 3, i32 1, i32 5, i32 0, i32 1, i32 0, i32 4, i32 7, i32 3, i32 1, i32 5, i32 0, i32 1>
293  ret <16 x i8> %z
294}
295
296; CHECK: .LCPI7_0:
297; CHECK: 	.byte	0                               // 0x0
298; CHECK: 	.byte	7                               // 0x7
299; CHECK: 	.byte	255                             // 0xff
300; CHECK: 	.byte	1                               // 0x1
301; CHECK: 	.byte	255                             // 0xff
302; CHECK: 	.byte	255                             // 0xff
303; CHECK: 	.byte	255                             // 0xff
304; CHECK: 	.byte	255                             // 0xff
305; CHECK: .LCPI7_1:
306; CHECK: 	.byte	0                               // 0x0
307; CHECK: 	.byte	8                               // 0x8
308; CHECK: 	.byte	11                              // 0xb
309; CHECK: 	.byte	3                               // 0x3
310; CHECK: 	.byte	1                               // 0x1
311; CHECK: 	.byte	9                               // 0x9
312; CHECK: 	.byte	0                               // 0x0
313; CHECK: 	.byte	1                               // 0x1
; Same inner shuffles as shuffle4_v8i8_v16i8 (mask 0,7,5,1 reading only %a and
; %c), but the outer shuffle yields <8 x i8> from a single 8-entry mask.
; In the expected assembly the two inner tbl results are combined into one
; register with mov v0.d[1], v2.d[0], and the final step is a single-register
; 8-byte tbl.
314define <8 x i8> @shuffle4_v8i8_v8i8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c, <8 x i8> %d) {
315; CHECK-LABEL: shuffle4_v8i8_v8i8:
316; CHECK:       // %bb.0:
317; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
318; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
319; CHECK-NEXT:    adrp x8, .LCPI7_0
320; CHECK-NEXT:    mov v2.d[1], v2.d[0]
321; CHECK-NEXT:    mov v0.d[1], v0.d[0]
322; CHECK-NEXT:    ldr d1, [x8, :lo12:.LCPI7_0]
323; CHECK-NEXT:    adrp x8, .LCPI7_1
324; CHECK-NEXT:    tbl v2.8b, { v2.16b }, v1.8b
325; CHECK-NEXT:    tbl v0.8b, { v0.16b }, v1.8b
326; CHECK-NEXT:    ldr d1, [x8, :lo12:.LCPI7_1]
327; CHECK-NEXT:    mov v0.d[1], v2.d[0]
328; CHECK-NEXT:    tbl v0.8b, { v0.16b }, v1.8b
329; CHECK-NEXT:    ret
330  %x = shufflevector <8 x i8> %a, <8 x i8> %b, <4 x i32> <i32 0, i32 7, i32 5, i32 1>
331  %y = shufflevector <8 x i8> %c, <8 x i8> %d, <4 x i32> <i32 0, i32 7, i32 5, i32 1>
332  %z = shufflevector <4 x i8> %x, <4 x i8> %y, <8 x i32> <i32 0, i32 4, i32 8, i32 3, i32 1, i32 5, i32 0, i32 1>
333  ret <8 x i8> %z
334}
335
336; CHECK: .LCPI8_0:
337; CHECK: 	.byte	0                               // 0x0
338; CHECK: 	.byte	1                               // 0x1
339; CHECK: 	.byte	8                               // 0x8
340; CHECK: 	.byte	9                               // 0x9
341; CHECK: 	.byte	16                              // 0x10
342; CHECK: 	.byte	17                              // 0x11
343; CHECK: 	.byte	24                              // 0x18
344; CHECK: 	.byte	25                              // 0x19
345; CHECK: 	.byte	2                               // 0x2
346; CHECK: 	.byte	3                               // 0x3
347; CHECK: 	.byte	10                              // 0xa
348; CHECK: 	.byte	11                              // 0xb
349; CHECK: 	.byte	18                              // 0x12
350; CHECK: 	.byte	19                              // 0x13
351; CHECK: 	.byte	26                              // 0x1a
352; CHECK: 	.byte	27                              // 0x1b
; Four-input <4 x i8> interleave where the concatenated halves are
; zero-extended to <8 x i16> before the final shuffle (mask 0,4,8,12, 1,5,9,13).
; The expected assembly matches shuffle4_v8i16 with two extra bic instructions
; (#255, lsl #8) that clear the upper byte of every 16-bit lane ahead of the
; two-register tbl.
353define <8 x i16> @shuffle4_v4i8_zext(<4 x i8> %a, <4 x i8> %b, <4 x i8> %c, <4 x i8> %d) {
354; CHECK-LABEL: shuffle4_v4i8_zext:
355; CHECK:       // %bb.0:
356; CHECK-NEXT:    fmov d5, d2
357; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
358; CHECK-NEXT:    // kill: def $d3 killed $d3 def $q3
359; CHECK-NEXT:    adrp x8, .LCPI8_0
360; CHECK-NEXT:    fmov d4, d0
361; CHECK-NEXT:    ldr q0, [x8, :lo12:.LCPI8_0]
362; CHECK-NEXT:    mov v4.d[1], v1.d[0]
363; CHECK-NEXT:    mov v5.d[1], v3.d[0]
364; CHECK-NEXT:    bic v4.8h, #255, lsl #8
365; CHECK-NEXT:    bic v5.8h, #255, lsl #8
366; CHECK-NEXT:    tbl v0.16b, { v4.16b, v5.16b }, v0.16b
367; CHECK-NEXT:    ret
368  %x = shufflevector <4 x i8> %a, <4 x i8> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
369  %y = shufflevector <4 x i8> %c, <4 x i8> %d, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
370  %xe = zext <8 x i8> %x to <8 x i16>
371  %ye = zext <8 x i8> %y to <8 x i16>
372  %z = shufflevector <8 x i16> %xe, <8 x i16> %ye, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13>
373  ret <8 x i16> %z
374}
375
376; CHECK: .LCPI9_0:
377; CHECK: 	.byte	0                               // 0x0
378; CHECK: 	.byte	16                              // 0x10
379; CHECK: 	.byte	32                              // 0x20
380; CHECK: 	.byte	48                              // 0x30
381; CHECK: 	.byte	2                               // 0x2
382; CHECK: 	.byte	18                              // 0x12
383; CHECK: 	.byte	34                              // 0x22
384; CHECK: 	.byte	50                              // 0x32
385; CHECK: 	.byte	4                               // 0x4
386; CHECK: 	.byte	20                              // 0x14
387; CHECK: 	.byte	36                              // 0x24
388; CHECK: 	.byte	52                              // 0x34
389; CHECK: 	.byte	6                               // 0x6
390; CHECK: 	.byte	22                              // 0x16
391; CHECK: 	.byte	38                              // 0x26
392; CHECK: 	.byte	54                              // 0x36
; Four-input interleave where each input is first truncated from <4 x i16> to
; <4 x i8>; the shuffles are otherwise the same as in shuffle4_v4i8_16.
; The expected assembly has no separate narrowing instruction: it is the same
; adrp / ldr / four-register tbl sequence as shuffle4_v4i8_16, differing only
; in the constant-pool label.
393define <16 x i8> @shuffle4_v4i16_trunc(<4 x i16> %ae, <4 x i16> %be, <4 x i16> %ce, <4 x i16> %de) {
394; CHECK-LABEL: shuffle4_v4i16_trunc:
395; CHECK:       // %bb.0:
396; CHECK-NEXT:    // kill: def $d3 killed $d3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
397; CHECK-NEXT:    adrp x8, .LCPI9_0
398; CHECK-NEXT:    // kill: def $d2 killed $d2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
399; CHECK-NEXT:    ldr q4, [x8, :lo12:.LCPI9_0]
400; CHECK-NEXT:    // kill: def $d1 killed $d1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
401; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
402; CHECK-NEXT:    tbl v0.16b, { v0.16b, v1.16b, v2.16b, v3.16b }, v4.16b
403; CHECK-NEXT:    ret
404  %a = trunc <4 x i16> %ae to <4 x i8>
405  %b = trunc <4 x i16> %be to <4 x i8>
406  %c = trunc <4 x i16> %ce to <4 x i8>
407  %d = trunc <4 x i16> %de to <4 x i8>
408  %x = shufflevector <4 x i8> %a, <4 x i8> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
409  %y = shufflevector <4 x i8> %c, <4 x i8> %d, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
410  %z = shufflevector <8 x i8> %x, <8 x i8> %y, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
411  ret <16 x i8> %z
412}
413
414; CHECK: .LCPI10_0:
415; CHECK: 	.byte	0                               // 0x0
416; CHECK: 	.byte	16                              // 0x10
417; CHECK: 	.byte	32                              // 0x20
418; CHECK: 	.byte	48                              // 0x30
419; CHECK: 	.byte	2                               // 0x2
420; CHECK: 	.byte	18                              // 0x12
421; CHECK: 	.byte	34                              // 0x22
422; CHECK: 	.byte	50                              // 0x32
423; CHECK: 	.byte	4                               // 0x4
424; CHECK: 	.byte	20                              // 0x14
425; CHECK: 	.byte	36                              // 0x24
426; CHECK: 	.byte	52                              // 0x34
427; CHECK: 	.byte	6                               // 0x6
428; CHECK: 	.byte	22                              // 0x16
429; CHECK: 	.byte	38                              // 0x26
430; CHECK: 	.byte	54                              // 0x36
431; CHECK: 	.text
; Four-input interleave where each input is truncated from <4 x i32> to
; <4 x i8> before the same concatenate-and-interleave shuffles as
; shuffle4_v4i8_16.
; The expected assembly narrows each input once with xtn (4s -> 4h) into
; v4..v7 and then uses a single four-register tbl over those registers.
432define <16 x i8> @shuffle4_v4i32_trunc(<4 x i32> %ae, <4 x i32> %be, <4 x i32> %ce, <4 x i32> %de) {
433; CHECK-LABEL: shuffle4_v4i32_trunc:
434; CHECK:       // %bb.0:
435; CHECK-NEXT:    xtn v4.4h, v0.4s
436; CHECK-NEXT:    adrp x8, .LCPI10_0
437; CHECK-NEXT:    xtn v5.4h, v1.4s
438; CHECK-NEXT:    ldr q0, [x8, :lo12:.LCPI10_0]
439; CHECK-NEXT:    xtn v6.4h, v2.4s
440; CHECK-NEXT:    xtn v7.4h, v3.4s
441; CHECK-NEXT:    tbl v0.16b, { v4.16b, v5.16b, v6.16b, v7.16b }, v0.16b
442; CHECK-NEXT:    ret
443  %a = trunc <4 x i32> %ae to <4 x i8>
444  %b = trunc <4 x i32> %be to <4 x i8>
445  %c = trunc <4 x i32> %ce to <4 x i8>
446  %d = trunc <4 x i32> %de to <4 x i8>
447  %x = shufflevector <4 x i8> %a, <4 x i8> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
448  %y = shufflevector <4 x i8> %c, <4 x i8> %d, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
449  %z = shufflevector <8 x i8> %x, <8 x i8> %y, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
450  ret <16 x i8> %z
451}
452
453; CHECK: .LCPI11_0:
454; CHECK: 	.byte	0                               // 0x0
455; CHECK: 	.byte	16                              // 0x10
456; CHECK: 	.byte	32                              // 0x20
457; CHECK: 	.byte	2                               // 0x2
458; CHECK: 	.byte	18                              // 0x12
459; CHECK: 	.byte	34                              // 0x22
460; CHECK: 	.byte	4                               // 0x4
461; CHECK: 	.byte	20                              // 0x14
462; CHECK: 	.byte	36                              // 0x24
463; CHECK: 	.byte	6                               // 0x6
464; CHECK: 	.byte	22                              // 0x16
465; CHECK: 	.byte	38                              // 0x26
466; CHECK: 	.byte	255                             // 0xff
467; CHECK: 	.byte	255                             // 0xff
468; CHECK: 	.byte	255                             // 0xff
469; CHECK: 	.byte	255                             // 0xff
; Three-input interleave of <4 x i8> vectors into <12 x i8>.
; %x = concat(%a, %b); %y pairs %c with undef; %z takes lane i of %a, %b, %c
; in turn (mask 0,4,8, 1,5,9, ...), never selecting an undef lane.
; The expected assembly is a single three-register tbl.
470define <12 x i8> @shuffle3_v4i8(<4 x i8> %a, <4 x i8> %b, <4 x i8> %c) {
471; CHECK-LABEL: shuffle3_v4i8:
472; CHECK:       // %bb.0:
473; CHECK-NEXT:    // kill: def $d2 killed $d2 killed $q0_q1_q2 def $q0_q1_q2
474; CHECK-NEXT:    adrp x8, .LCPI11_0
475; CHECK-NEXT:    ldr q3, [x8, :lo12:.LCPI11_0]
476; CHECK-NEXT:    // kill: def $d1 killed $d1 killed $q0_q1_q2 def $q0_q1_q2
477; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0_q1_q2 def $q0_q1_q2
478; CHECK-NEXT:    tbl v0.16b, { v0.16b, v1.16b, v2.16b }, v3.16b
479; CHECK-NEXT:    ret
480  %x = shufflevector <4 x i8> %a, <4 x i8> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
481  %y = shufflevector <4 x i8> %c, <4 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
482  %z = shufflevector <8 x i8> %x, <8 x i8> %y, <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
483  ret <12 x i8> %z
484}
485
486; CHECK: .LCPI12_0:
487; CHECK: 	.byte	0                               // 0x0
488; CHECK: 	.byte	1                               // 0x1
489; CHECK: 	.byte	8                               // 0x8
490; CHECK: 	.byte	9                               // 0x9
491; CHECK: 	.byte	16                              // 0x10
492; CHECK: 	.byte	17                              // 0x11
493; CHECK: 	.byte	2                               // 0x2
494; CHECK: 	.byte	3                               // 0x3
495; CHECK: 	.byte	10                              // 0xa
496; CHECK: 	.byte	11                              // 0xb
497; CHECK: 	.byte	18                              // 0x12
498; CHECK: 	.byte	19                              // 0x13
499; CHECK: 	.byte	4                               // 0x4
500; CHECK: 	.byte	5                               // 0x5
501; CHECK: 	.byte	12                              // 0xc
502; CHECK: 	.byte	13                              // 0xd
; Three-input interleave with 16-bit elements: %x = concat(%a, %b), %y pairs
; %c with undef, and the 8-entry mask 0,4,8, 1,5,9, 2,6 yields <8 x i16>.
; The expected assembly joins %a and %b into one register with mov v2.d[1],
; keeps %c in a second register, and uses one two-register tbl.
503define <8 x i16> @shuffle3_v4i16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c) {
504; CHECK-LABEL: shuffle3_v4i16:
505; CHECK:       // %bb.0:
506; CHECK-NEXT:    fmov d3, d2
507; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
508; CHECK-NEXT:    adrp x8, .LCPI12_0
509; CHECK-NEXT:    fmov d2, d0
510; CHECK-NEXT:    ldr q0, [x8, :lo12:.LCPI12_0]
511; CHECK-NEXT:    mov v2.d[1], v1.d[0]
512; CHECK-NEXT:    tbl v0.16b, { v2.16b, v3.16b }, v0.16b
513; CHECK-NEXT:    ret
514  %x = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
515  %y = shufflevector <4 x i16> %c, <4 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
516  %z = shufflevector <8 x i16> %x, <8 x i16> %y, <8 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6>
517  ret <8 x i16> %z
518}
519
; Three-input shuffle with 32-bit elements. The final mask 0,4,8,1 selects
; %a[0], %b[0], %c[0], %a[1] from the concatenated operands.
; The expected assembly contains no tbl and no constant-pool load; the result
; is built from trn1 plus element moves.
520define <4 x i32> @shuffle3_v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
521; CHECK-LABEL: shuffle3_v4i32:
522; CHECK:       // %bb.0:
523; CHECK-NEXT:    trn1 v1.4s, v0.4s, v1.4s
524; CHECK-NEXT:    mov v1.d[1], v0.d[0]
525; CHECK-NEXT:    mov v1.s[2], v2.s[0]
526; CHECK-NEXT:    mov v0.16b, v1.16b
527; CHECK-NEXT:    ret
528  %x = shufflevector <4 x i32> %a, <4 x i32> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
529  %y = shufflevector <4 x i32> %c, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
530  %z = shufflevector <8 x i32> %x, <8 x i32> %y, <4 x i32> <i32 0, i32 4, i32 8, i32 1>
531  ret <4 x i32> %z
532}
533
534; CHECK: .LCPI14_0:
535; CHECK: 	.byte	4                               // 0x4
536; CHECK: 	.byte	8                               // 0x8
537; CHECK: 	.byte	255                             // 0xff
538; CHECK: 	.byte	255                             // 0xff
539; CHECK: 	.byte	14                              // 0xe
540; CHECK: 	.byte	3                               // 0x3
541; CHECK: 	.byte	255                             // 0xff
542; CHECK: 	.byte	255                             // 0xff
543; CHECK: 	.section	.rodata.cst16,"aM",@progbits,16
544; CHECK: 	.p2align	4
545; CHECK: .LCPI14_1:
546; CHECK: 	.byte	255                             // 0xff
547; CHECK: 	.byte	255                             // 0xff
548; CHECK: 	.byte	15                              // 0xf
549; CHECK: 	.byte	27                              // 0x1b
550; CHECK: 	.byte	255                             // 0xff
551; CHECK: 	.byte	255                             // 0xff
552; CHECK: 	.byte	24                              // 0x18
553; CHECK: 	.byte	12                              // 0xc
554; CHECK: 	.byte	255                             // 0xff
555; CHECK: 	.byte	255                             // 0xff
556; CHECK: 	.byte	255                             // 0xff
557; CHECK: 	.byte	255                             // 0xff
558; CHECK: 	.byte	255                             // 0xff
559; CHECK: 	.byte	255                             // 0xff
560; CHECK: 	.byte	255                             // 0xff
561; CHECK: 	.byte	255                             // 0xff
; Builds an <8 x i8> result one lane at a time with extractelement /
; insertelement, drawing from four sources of mixed width (%a and %c are
; <8 x i8>; %b and %d are <16 x i8>). Result lanes, in order:
;   a[4], c[0], b[15], d[11], c[6], a[3], d[8], b[12].
; The expected assembly uses a single-register tbl over the joined %a/%c
; register and a two-register tbl over %b/%d, then merges the two partial
; results with trn1 / trn2.
562define <8 x i8> @insert4_v8i8(<8 x i8> %a, <16 x i8> %b, <8 x i8> %c, <16 x i8> %d) {
563; CHECK-LABEL: insert4_v8i8:
564; CHECK:       // %bb.0:
565; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
566; CHECK-NEXT:    mov v4.16b, v3.16b
567; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
568; CHECK-NEXT:    adrp x8, .LCPI14_0
569; CHECK-NEXT:    adrp x9, .LCPI14_1
570; CHECK-NEXT:    mov v0.d[1], v2.d[0]
571; CHECK-NEXT:    mov v3.16b, v1.16b
572; CHECK-NEXT:    ldr d1, [x8, :lo12:.LCPI14_0]
573; CHECK-NEXT:    ldr q2, [x9, :lo12:.LCPI14_1]
574; CHECK-NEXT:    tbl v0.8b, { v0.16b }, v1.8b
575; CHECK-NEXT:    tbl v1.16b, { v3.16b, v4.16b }, v2.16b
576; CHECK-NEXT:    trn1 v0.4h, v1.4h, v0.4h
577; CHECK-NEXT:    trn2 v0.4h, v0.4h, v1.4h
578; CHECK-NEXT:    ret
579  %e1 = extractelement <8 x i8> %a, i32 4
580  %e2 = extractelement <8 x i8> %c, i32 0
581  %e3 = extractelement <16 x i8> %b, i32 15
582  %e4 = extractelement <16 x i8> %d, i32 11
583  %e5 = extractelement <8 x i8> %c, i32 6
584  %e6 = extractelement <8 x i8> %a, i32 3
585  %e7 = extractelement <16 x i8> %d, i32 8
586  %e8 = extractelement <16 x i8> %b, i32 12
587  %i1 = insertelement <8 x i8> undef, i8 %e1, i32 0
588  %i2 = insertelement <8 x i8> %i1, i8 %e2, i32 1
589  %i3 = insertelement <8 x i8> %i2, i8 %e3, i32 2
590  %i4 = insertelement <8 x i8> %i3, i8 %e4, i32 3
591  %i5 = insertelement <8 x i8> %i4, i8 %e5, i32 4
592  %i6 = insertelement <8 x i8> %i5, i8 %e6, i32 5
593  %i7 = insertelement <8 x i8> %i6, i8 %e7, i32 6
594  %i8 = insertelement <8 x i8> %i7, i8 %e8, i32 7
595  ret <8 x i8> %i8
596}
597
598; CHECK: .LCPI15_0:
599; CHECK: .byte 255                             // 0xff
600; CHECK: .byte 255                             // 0xff
601; CHECK: .byte 15                              // 0xf
602; CHECK: .byte 27                              // 0x1b
603; CHECK: .byte 255                             // 0xff
604; CHECK: .byte 255                             // 0xff
605; CHECK: .byte 24                              // 0x18
606; CHECK: .byte 12                              // 0xc
607; CHECK: .byte 255                             // 0xff
608; CHECK: .byte 255                             // 0xff
609; CHECK: .byte 15                              // 0xf
610; CHECK: .byte 27                              // 0x1b
611; CHECK: .byte 255                             // 0xff
612; CHECK: .byte 255                             // 0xff
613; CHECK: .byte 24                              // 0x18
614; CHECK: .byte 12                              // 0xc
615; CHECK: .LCPI15_1:
616; CHECK: .byte 20                              // 0x14
617; CHECK: .byte 24                              // 0x18
618; CHECK: .byte 2                               // 0x2
619; CHECK: .byte 3                               // 0x3
620; CHECK: .byte 30                              // 0x1e
621; CHECK: .byte 19                              // 0x13
622; CHECK: .byte 6                               // 0x6
623; CHECK: .byte 7                               // 0x7
624; CHECK: .byte 20                              // 0x14
625; CHECK: .byte 24                              // 0x18
626; CHECK: .byte 10                              // 0xa
627; CHECK: .byte 11                              // 0xb
628; CHECK: .byte 30                              // 0x1e
629; CHECK: .byte 19                              // 0x13
630; CHECK: .byte 14                              // 0xe
631; CHECK: .byte 15                              // 0xf
; insert4_v16i8: 16-lane variant of the extract/insert pattern. The result is
; built lane by lane from scalars taken out of %a, %c (<8 x i8>) and %b, %d
; (<16 x i8>); lanes 8..15 repeat the same eight picks as lanes 0..7:
;   a[4], c[0], b[15], d[11], c[6], a[3], d[8], b[12].
; The assertions below expect two chained two-register tbl instructions: the
; first indexes { v3, v4 } with the .LCPI15_0 constant and writes v31, the
; second indexes { v31, v0 } with the .LCPI15_1 constant, where v0 has had d2
; moved into its upper half by "mov v0.d[1], v2.d[0]".
632define <16 x i8> @insert4_v16i8(<8 x i8> %a, <16 x i8> %b, <8 x i8> %c, <16 x i8> %d) {
633; CHECK-LABEL: insert4_v16i8:
634; CHECK:       // %bb.0:
635; CHECK-NEXT:    mov v4.16b, v3.16b
636; CHECK-NEXT:    adrp x8, .LCPI15_0
637; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q31_q0
638; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
639; CHECK-NEXT:    mov v3.16b, v1.16b
640; CHECK-NEXT:    ldr q5, [x8, :lo12:.LCPI15_0]
641; CHECK-NEXT:    mov v0.d[1], v2.d[0]
642; CHECK-NEXT:    adrp x8, .LCPI15_1
643; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI15_1]
644; CHECK-NEXT:    tbl v31.16b, { v3.16b, v4.16b }, v5.16b
645; CHECK-NEXT:    tbl v0.16b, { v31.16b, v0.16b }, v1.16b
646; CHECK-NEXT:    ret
; First group of eight scalars (destined for result lanes 0..7).
647  %e1 = extractelement <8 x i8> %a, i32 4
648  %e2 = extractelement <8 x i8> %c, i32 0
649  %e3 = extractelement <16 x i8> %b, i32 15
650  %e4 = extractelement <16 x i8> %d, i32 11
651  %e5 = extractelement <8 x i8> %c, i32 6
652  %e6 = extractelement <8 x i8> %a, i32 3
653  %e7 = extractelement <16 x i8> %d, i32 8
654  %e8 = extractelement <16 x i8> %b, i32 12
; Second group: the same sources and indices again (result lanes 8..15).
655  %e9 = extractelement <8 x i8> %a, i32 4
656  %e10 = extractelement <8 x i8> %c, i32 0
657  %e11 = extractelement <16 x i8> %b, i32 15
658  %e12 = extractelement <16 x i8> %d, i32 11
659  %e13 = extractelement <8 x i8> %c, i32 6
660  %e14 = extractelement <8 x i8> %a, i32 3
661  %e15 = extractelement <16 x i8> %d, i32 8
662  %e16 = extractelement <16 x i8> %b, i32 12
; Insert all sixteen scalars into consecutive lanes 0..15, starting from undef.
663  %i1 = insertelement <16 x i8> undef, i8 %e1, i32 0
664  %i2 = insertelement <16 x i8> %i1, i8 %e2, i32 1
665  %i3 = insertelement <16 x i8> %i2, i8 %e3, i32 2
666  %i4 = insertelement <16 x i8> %i3, i8 %e4, i32 3
667  %i5 = insertelement <16 x i8> %i4, i8 %e5, i32 4
668  %i6 = insertelement <16 x i8> %i5, i8 %e6, i32 5
669  %i7 = insertelement <16 x i8> %i6, i8 %e7, i32 6
670  %i8 = insertelement <16 x i8> %i7, i8 %e8, i32 7
671  %i9 = insertelement <16 x i8> %i8, i8 %e9, i32 8
672  %i10 = insertelement <16 x i8> %i9, i8 %e10, i32 9
673  %i11 = insertelement <16 x i8> %i10, i8 %e11, i32 10
674  %i12 = insertelement <16 x i8> %i11, i8 %e12, i32 11
675  %i13 = insertelement <16 x i8> %i12, i8 %e13, i32 12
676  %i14 = insertelement <16 x i8> %i13, i8 %e14, i32 13
677  %i15 = insertelement <16 x i8> %i14, i8 %e15, i32 14
678  %i16 = insertelement <16 x i8> %i15, i8 %e16, i32 15
679  ret <16 x i8> %i16
680}
681
682
683; CHECK: .LCPI16_0:
684; CHECK: 	.byte	0
685; CHECK: 	.byte	1
686; CHECK: 	.byte	4
687; CHECK: 	.byte	5
688; CHECK: 	.byte	16
689; CHECK: 	.byte	17
690; CHECK: 	.byte	20
691; CHECK: 	.byte	21
692; CHECK: 	.byte	32
693; CHECK: 	.byte	33
694; CHECK: 	.byte	36
695; CHECK: 	.byte	37
696; CHECK: 	.byte	48
697; CHECK: 	.byte	49
698; CHECK: 	.byte	52
699; CHECK: 	.byte	53
; test: floors eight <2 x double> inputs, converts each to <2 x i16>,
; concatenates them pairwise into two <8 x i16> vectors, and finally shuffles
; the pair so that all even-indexed elements come first, followed by all
; odd-indexed elements.
; The assertions below expect frintm + fcvtzs + xtn per input, then two
; four-register tbl instructions that both use the single .LCPI16_0 index
; vector, and a closing uzp1/uzp2 pair producing the two result registers.
700define <16 x i16> @test(<2 x double> %l213, <2 x double> %l231, <2 x double> %l249, <2 x double> %l267, <2 x double> %l285, <2 x double> %l303, <2 x double> %l321, <2 x double> %l339) {
701; CHECK-LABEL: test:
702; CHECK:       // %bb.0:
703; CHECK-NEXT:    frintm v0.2d, v0.2d
704; CHECK-NEXT:    frintm v4.2d, v4.2d
705; CHECK-NEXT:    adrp x8, .LCPI16_0
706; CHECK-NEXT:    frintm v1.2d, v1.2d
707; CHECK-NEXT:    frintm v5.2d, v5.2d
708; CHECK-NEXT:    frintm v2.2d, v2.2d
709; CHECK-NEXT:    frintm v6.2d, v6.2d
710; CHECK-NEXT:    frintm v3.2d, v3.2d
711; CHECK-NEXT:    frintm v7.2d, v7.2d
712; CHECK-NEXT:    fcvtzs v0.2d, v0.2d
713; CHECK-NEXT:    fcvtzs v4.2d, v4.2d
714; CHECK-NEXT:    fcvtzs v1.2d, v1.2d
715; CHECK-NEXT:    fcvtzs v5.2d, v5.2d
716; CHECK-NEXT:    fcvtzs v2.2d, v2.2d
717; CHECK-NEXT:    fcvtzs v6.2d, v6.2d
718; CHECK-NEXT:    fcvtzs v3.2d, v3.2d
719; CHECK-NEXT:    fcvtzs v7.2d, v7.2d
720; CHECK-NEXT:    xtn v16.2s, v0.2d
721; CHECK-NEXT:    xtn v20.2s, v4.2d
722; CHECK-NEXT:    ldr q0, [x8, :lo12:.LCPI16_0]
723; CHECK-NEXT:    xtn v17.2s, v1.2d
724; CHECK-NEXT:    xtn v21.2s, v5.2d
725; CHECK-NEXT:    xtn v18.2s, v2.2d
726; CHECK-NEXT:    xtn v22.2s, v6.2d
727; CHECK-NEXT:    xtn v19.2s, v3.2d
728; CHECK-NEXT:    xtn v23.2s, v7.2d
729; CHECK-NEXT:    tbl v1.16b, { v16.16b, v17.16b, v18.16b, v19.16b }, v0.16b
730; CHECK-NEXT:    tbl v2.16b, { v20.16b, v21.16b, v22.16b, v23.16b }, v0.16b
731; CHECK-NEXT:    uzp1 v0.8h, v1.8h, v2.8h
732; CHECK-NEXT:    uzp2 v1.8h, v1.8h, v2.8h
733; CHECK-NEXT:    ret
; Per-input floor followed by a signed conversion to <2 x i16>.
734  %l214 = call fast <2 x double> @llvm.floor.v2f64(<2 x double> %l213)
735  %l215 = fptosi <2 x double> %l214 to <2 x i16>
736  %l232 = call fast <2 x double> @llvm.floor.v2f64(<2 x double> %l231)
737  %l233 = fptosi <2 x double> %l232 to <2 x i16>
738  %l250 = call fast <2 x double> @llvm.floor.v2f64(<2 x double> %l249)
739  %l251 = fptosi <2 x double> %l250 to <2 x i16>
740  %l268 = call fast <2 x double> @llvm.floor.v2f64(<2 x double> %l267)
741  %l269 = fptosi <2 x double> %l268 to <2 x i16>
742  %l286 = call fast <2 x double> @llvm.floor.v2f64(<2 x double> %l285)
743  %l287 = fptosi <2 x double> %l286 to <2 x i16>
744  %l304 = call fast <2 x double> @llvm.floor.v2f64(<2 x double> %l303)
745  %l305 = fptosi <2 x double> %l304 to <2 x i16>
746  %l322 = call fast <2 x double> @llvm.floor.v2f64(<2 x double> %l321)
747  %l323 = fptosi <2 x double> %l322 to <2 x i16>
748  %l340 = call fast <2 x double> @llvm.floor.v2f64(<2 x double> %l339)
749  %l341 = fptosi <2 x double> %l340 to <2 x i16>
; Identity-mask shuffles: concatenate 2-element pairs into 4-element vectors.
750  %l342 = shufflevector <2 x i16> %l215, <2 x i16> %l233, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
751  %l343 = shufflevector <2 x i16> %l251, <2 x i16> %l269, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
752  %l344 = shufflevector <2 x i16> %l287, <2 x i16> %l305, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
753  %l345 = shufflevector <2 x i16> %l323, <2 x i16> %l341, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
; Concatenate again into two 8-element vectors.
754  %l346 = shufflevector <4 x i16> %l342, <4 x i16> %l343, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
755  %l347 = shufflevector <4 x i16> %l344, <4 x i16> %l345, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; Final mask picks indices 0,2,...,14 and then 1,3,...,15 of the 16 concatenated elements.
756  %interleaved.vec = shufflevector <8 x i16> %l346, <8 x i16> %l347, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
757  ret <16 x i16> %interleaved.vec
758}
759
; Declaration of the <2 x double> floor intrinsic called by @test above.
760declare <2 x double> @llvm.floor.v2f64(<2 x double> %l213)
761