xref: /llvm-project/llvm/test/CodeGen/AArch64/sve-fixed-length-permute-rev.ll (revision 8232ab76d0bae090b1720a8d096c795e400d2525)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
3; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
4
5target triple = "aarch64-unknown-linux-gnu"
6
7; REVB pattern for shuffle v32i8 -> v16i16
8define void @test_revbv16i16(ptr %a) #0 {
9; CHECK-LABEL: test_revbv16i16:
10; CHECK:       // %bb.0:
11; CHECK-NEXT:    ptrue p0.b, vl32
12; CHECK-NEXT:    ptrue p1.h
13; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
14; CHECK-NEXT:    revb z0.h, p1/m, z0.h
15; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
16; CHECK-NEXT:    ret
  ; The mask swaps the two bytes of every adjacent pair (1,0,3,2,...). Several
  ; of the last eight indices are undef; the checks still expect one REVB on
  ; .h elements.
17  %tmp1 = load <32 x i8>, ptr %a
18  %tmp2 = shufflevector <32 x i8> %tmp1, <32 x i8> undef, <32 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14, i32 17, i32 16, i32 19, i32 18, i32 21, i32 20, i32 23, i32 22, i32 undef, i32 24, i32 27, i32 undef, i32 29, i32 28, i32 undef, i32 undef>
19  store <32 x i8> %tmp2, ptr %a
20  ret void
21}
22
23; REVB pattern for shuffle v32i8 -> v8i32
24define void @test_revbv8i32(ptr %a) #0 {
25; CHECK-LABEL: test_revbv8i32:
26; CHECK:       // %bb.0:
27; CHECK-NEXT:    ptrue p0.b, vl32
28; CHECK-NEXT:    ptrue p1.s
29; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
30; CHECK-NEXT:    revb z0.s, p1/m, z0.s
31; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
32; CHECK-NEXT:    ret
  ; The mask reverses the bytes inside every group of four (3,2,1,0,...). The
  ; last two groups contain undef indices; the checks still expect one REVB on
  ; .s elements.
33  %tmp1 = load <32 x i8>, ptr %a
34  %tmp2 = shufflevector <32 x i8> %tmp1, <32 x i8> undef, <32 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12, i32 19, i32 18, i32 17, i32 16, i32 23, i32 22, i32 21, i32 20, i32 27, i32 undef, i32 undef, i32 undef, i32 31, i32 30, i32 29, i32 undef>
35  store <32 x i8> %tmp2, ptr %a
36  ret void
37}
38
39; REVB pattern for shuffle v32i8 -> v4i64
40define void @test_revbv4i64(ptr %a) #0 {
41; CHECK-LABEL: test_revbv4i64:
42; CHECK:       // %bb.0:
43; CHECK-NEXT:    ptrue p0.b, vl32
44; CHECK-NEXT:    ptrue p1.d
45; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
46; CHECK-NEXT:    revb z0.d, p1/m, z0.d
47; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
48; CHECK-NEXT:    ret
  ; The mask reverses the bytes inside every group of eight (7,6,...,0). The
  ; last group contains undef indices; the checks still expect one REVB on
  ; .d elements.
49  %tmp1 = load <32 x i8>, ptr %a
50  %tmp2 = shufflevector <32 x i8> %tmp1, <32 x i8> undef, <32 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 31, i32 30, i32 29, i32 undef, i32 27, i32 undef, i32 undef, i32 undef>
51  store <32 x i8> %tmp2, ptr %a
52  ret void
53}
54
55; REVH pattern for shuffle v16i16 -> v8i32
56define void @test_revhv8i32(ptr %a) #0 {
57; CHECK-LABEL: test_revhv8i32:
58; CHECK:       // %bb.0:
59; CHECK-NEXT:    ptrue p0.h, vl16
60; CHECK-NEXT:    ptrue p1.s
61; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
62; CHECK-NEXT:    revh z0.s, p1/m, z0.s
63; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
64; CHECK-NEXT:    ret
  ; The mask swaps every adjacent pair of i16 elements (1,0,3,2,...); the
  ; checks expect one REVH on .s elements.
65  %tmp1 = load <16 x i16>, ptr %a
66  %tmp2 = shufflevector <16 x i16> %tmp1, <16 x i16> undef, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
67  store <16 x i16> %tmp2, ptr %a
68  ret void
69}
70
71; REVH pattern for shuffle v16f16 -> v8f32
72define void @test_revhv8f32(ptr %a) #0 {
73; CHECK-LABEL: test_revhv8f32:
74; CHECK:       // %bb.0:
75; CHECK-NEXT:    ptrue p0.h, vl16
76; CHECK-NEXT:    ptrue p1.s
77; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
78; CHECK-NEXT:    revh z0.s, p1/m, z0.s
79; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
80; CHECK-NEXT:    ret
  ; Floating-point variant: the mask swaps every adjacent pair of half
  ; elements; the checks expect the same REVH on .s elements as the i16 test.
81  %tmp1 = load <16 x half>, ptr %a
82  %tmp2 = shufflevector <16 x half> %tmp1, <16 x half> undef, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
83  store <16 x half> %tmp2, ptr %a
84  ret void
85}
86
87; REVH pattern for shuffle v16i16 -> v4i64
88define void @test_revhv4i64(ptr %a) #0 {
89; CHECK-LABEL: test_revhv4i64:
90; CHECK:       // %bb.0:
91; CHECK-NEXT:    ptrue p0.h, vl16
92; CHECK-NEXT:    ptrue p1.d
93; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
94; CHECK-NEXT:    revh z0.d, p1/m, z0.d
95; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
96; CHECK-NEXT:    ret
  ; The mask reverses the i16 elements inside every group of four
  ; (3,2,1,0,...); the checks expect one REVH on .d elements.
97  %tmp1 = load <16 x i16>, ptr %a
98  %tmp2 = shufflevector <16 x i16> %tmp1, <16 x i16> undef, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
99  store <16 x i16> %tmp2, ptr %a
100  ret void
101}
102
103; REVW pattern for shuffle v8i32 -> v4i64
104define void @test_revwv4i64(ptr %a) #0 {
105; CHECK-LABEL: test_revwv4i64:
106; CHECK:       // %bb.0:
107; CHECK-NEXT:    ptrue p0.s, vl8
108; CHECK-NEXT:    ptrue p1.d
109; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
110; CHECK-NEXT:    revw z0.d, p1/m, z0.d
111; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
112; CHECK-NEXT:    ret
  ; The mask swaps every adjacent pair of i32 elements (1,0,3,2,...); the
  ; checks expect one REVW on .d elements.
113  %tmp1 = load <8 x i32>, ptr %a
114  %tmp2 = shufflevector <8 x i32> %tmp1, <8 x i32> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
115  store <8 x i32> %tmp2, ptr %a
116  ret void
117}
118
119; REVW pattern for shuffle v8f32 -> v4f64
120define void @test_revwv4f64(ptr %a) #0 {
121; CHECK-LABEL: test_revwv4f64:
122; CHECK:       // %bb.0:
123; CHECK-NEXT:    ptrue p0.s, vl8
124; CHECK-NEXT:    ptrue p1.d
125; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
126; CHECK-NEXT:    revw z0.d, p1/m, z0.d
127; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
128; CHECK-NEXT:    ret
  ; Floating-point variant: the mask swaps every adjacent pair of float
  ; elements; the checks expect the same REVW on .d elements as the i32 test.
129  %tmp1 = load <8 x float>, ptr %a
130  %tmp2 = shufflevector <8 x float> %tmp1, <8 x float> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
131  store <8 x float> %tmp2, ptr %a
132  ret void
133}
134
135; Don't use SVE for 128-bit vectors
136define <16 x i8> @test_revv16i8(ptr %a) #0 {
137; CHECK-LABEL: test_revv16i8:
138; CHECK:       // %bb.0:
139; CHECK-NEXT:    ldr q0, [x0]
140; CHECK-NEXT:    rev64 v0.16b, v0.16b
141; CHECK-NEXT:    ret
  ; <16 x i8> is a 128-bit vector. The mask reverses the bytes inside each
  ; 8-byte half, and the checks expect a NEON ldr/rev64 sequence with no SVE
  ; instructions. The result is returned by value rather than stored.
142  %tmp1 = load <16 x i8>, ptr %a
143  %tmp2 = shufflevector <16 x i8> %tmp1, <16 x i8> undef, <16 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8>
144  ret <16 x i8> %tmp2
145}
146
147; REVW pattern for shuffle two v8i32 inputs with the second input available.
148define void @test_revwv8i32v8i32(ptr %a, ptr %b) #0 {
149; CHECK-LABEL: test_revwv8i32v8i32:
150; CHECK:       // %bb.0:
151; CHECK-NEXT:    ptrue p0.s, vl8
152; CHECK-NEXT:    ptrue p1.d
153; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x1]
154; CHECK-NEXT:    revw z0.d, p1/m, z0.d
155; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
156; CHECK-NEXT:    ret
  ; All mask indices are in 8..15, so only %tmp2 (loaded from %b) is selected,
  ; with adjacent i32 pairs swapped. The checks expect a single load from x1,
  ; one REVW, and a store to x0; no load from %a appears in the checks.
157  %tmp1 = load <8 x i32>, ptr %a
158  %tmp2 = load <8 x i32>, ptr %b
159  %tmp3 = shufflevector <8 x i32> %tmp1, <8 x i32> %tmp2, <8 x i32> <i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
160  store <8 x i32> %tmp3, ptr %a
161  ret void
162}
163
164; REVH pattern for shuffle v32i16 with 256 bits and 512 bits SVE.
165define void @test_revhv32i16(ptr %a) #0 {
166; VBITS_GE_256-LABEL: test_revhv32i16:
167; VBITS_GE_256:       // %bb.0:
168; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
169; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
170; VBITS_GE_256-NEXT:    ptrue p1.d
171; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
172; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
173; VBITS_GE_256-NEXT:    revh z0.d, p1/m, z0.d
174; VBITS_GE_256-NEXT:    revh z1.d, p1/m, z1.d
175; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x0, x8, lsl #1]
176; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x0]
177; VBITS_GE_256-NEXT:    ret
178;
179; VBITS_GE_512-LABEL: test_revhv32i16:
180; VBITS_GE_512:       // %bb.0:
181; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
182; VBITS_GE_512-NEXT:    ptrue p1.d
183; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
184; VBITS_GE_512-NEXT:    revh z0.d, p1/m, z0.d
185; VBITS_GE_512-NEXT:    st1h { z0.h }, p0, [x0]
186; VBITS_GE_512-NEXT:    ret
  ; <32 x i16> is 512 bits wide. The VBITS_GE_256 checks expect two 16-element
  ; halves, each with its own REVH; the VBITS_GE_512 checks expect a single
  ; REVH. The mask reverses i16 elements in groups of four and has undef
  ; indices in its last two groups.
187  %tmp1 = load <32 x i16>, ptr %a
188  %tmp2 = shufflevector <32 x i16> %tmp1, <32 x i16> undef, <32 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12, i32 19, i32 18, i32 17, i32 16, i32 23, i32 22, i32 21, i32 20, i32 27, i32 undef, i32 undef, i32 undef, i32 31, i32 30, i32 29, i32 undef>
189  store <32 x i16> %tmp2, ptr %a
190  ret void
191}
192
193; Only reversing bytes / halfwords / words within elements is supported
194define void @test_rev_elts_fail(ptr %a) #1 {
195; CHECK-LABEL: test_rev_elts_fail:
196; CHECK:       // %bb.0:
197; CHECK-NEXT:    ptrue p0.d
198; CHECK-NEXT:    adrp x8, .LCPI11_0
199; CHECK-NEXT:    add x8, x8, :lo12:.LCPI11_0
200; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
201; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x8]
202; CHECK-NEXT:    tbl z0.d, { z0.d }, z1.d
203; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
204; CHECK-NEXT:    ret
  ; Negative test: the mask swaps adjacent whole i64 elements (1,0,3,2). The
  ; checks expect a TBL whose index vector is loaded from the constant pool,
  ; not a REV* instruction.
205  %tmp1 = load <4 x i64>, ptr %a
206  %tmp2 = shufflevector <4 x i64> %tmp1, <4 x i64> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
207  store <4 x i64> %tmp2, ptr %a
208  ret void
209}
210
211; This is the same test as above, but with sve2p1 it can use the REVD instruction to reverse
212; the double-words within quad-words.
213define void @test_revdv4i64_sve2p1(ptr %a) #2 {
214; CHECK-LABEL: test_revdv4i64_sve2p1:
215; CHECK:       // %bb.0:
216; CHECK-NEXT:    ptrue p0.d, vl4
217; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
218; CHECK-NEXT:    revd z0.q, p0/m, z0.q
219; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
220; CHECK-NEXT:    ret
  ; The mask swaps adjacent i64 elements (1,0,3,2). With attribute group #2
  ; (+sve2p1) the checks expect REVD on .q elements, governed by the same vl4
  ; predicate that is used for the load and store.
221  %tmp1 = load <4 x i64>, ptr %a
222  %tmp2 = shufflevector <4 x i64> %tmp1, <4 x i64> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
223  store <4 x i64> %tmp2, ptr %a
224  ret void
225}
226
227define void @test_revdv4f64_sve2p1(ptr %a) #2 {
228; CHECK-LABEL: test_revdv4f64_sve2p1:
229; CHECK:       // %bb.0:
230; CHECK-NEXT:    ptrue p0.d, vl4
231; CHECK-NEXT:    ptrue p1.d
232; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
233; CHECK-NEXT:    revd z0.q, p1/m, z0.q
234; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
235; CHECK-NEXT:    ret
  ; Floating-point variant of the REVD test: the mask swaps adjacent double
  ; elements (1,0,3,2). Here the checks expect REVD to be governed by a
  ; separate all-lanes predicate (p1.d) rather than the vl4 predicate used for
  ; the load and store.
236  %tmp1 = load <4 x double>, ptr %a
237  %tmp2 = shufflevector <4 x double> %tmp1, <4 x double> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
238  store <4 x double> %tmp2, ptr %a
239  ret void
240}
241
242; REV instruction will reverse the order of all elements in the vector.
243; When the vector length and the target register size are inconsistent,
244; the correctness of generated REV instruction for shuffle pattern cannot be guaranteed.
245
246; sve-vector-bits-min=256, sve-vector-bits-max is not set, REV inst can't be generated.
247define void @test_revv8i32(ptr %a) #0 {
248; VBITS_GE_256-LABEL: test_revv8i32:
249; VBITS_GE_256:       // %bb.0:
250; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
251; VBITS_GE_256-NEXT:    index z0.s, #7, #-1
252; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
253; VBITS_GE_256-NEXT:    tbl z0.s, { z1.s }, z0.s
254; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0]
255; VBITS_GE_256-NEXT:    ret
256;
257; VBITS_GE_512-LABEL: test_revv8i32:
258; VBITS_GE_512:       // %bb.0:
259; VBITS_GE_512-NEXT:    ptrue p0.s, vl8
260; VBITS_GE_512-NEXT:    adrp x8, .LCPI14_0
261; VBITS_GE_512-NEXT:    add x8, x8, :lo12:.LCPI14_0
262; VBITS_GE_512-NEXT:    ptrue p1.s, vl16
263; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
264; VBITS_GE_512-NEXT:    ld1w { z1.s }, p1/z, [x8]
265; VBITS_GE_512-NEXT:    tbl z0.s, { z0.s }, z1.s
266; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x0]
267; VBITS_GE_512-NEXT:    ret
  ; The mask reverses all eight i32 elements (7..0). Neither prefix expects a
  ; REV: VBITS_GE_256 builds the TBL indices with INDEX #7, #-1, while
  ; VBITS_GE_512 loads them from the constant pool.
268  %tmp1 = load <8 x i32>, ptr %a
269  %tmp2 = shufflevector <8 x i32> %tmp1, <8 x i32> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
270  store <8 x i32> %tmp2, ptr %a
271  ret void
272}
273
274; REV pattern for v32i8 shuffle with vscale_range(2,2)
275define void @test_revv32i8_vl256(ptr %a) #1 {
276; CHECK-LABEL: test_revv32i8_vl256:
277; CHECK:       // %bb.0:
278; CHECK-NEXT:    ptrue p0.b
279; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
280; CHECK-NEXT:    rev z0.b, z0.b
281; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
282; CHECK-NEXT:    ret
  ; The mask reverses all 32 bytes (31..0). With attribute group #1
  ; (vscale_range(2,2)) the checks expect an unpredicated REV on .b elements.
283  %tmp1 = load <32 x i8>, ptr %a
284  %tmp2 = shufflevector <32 x i8> %tmp1, <32 x i8> undef, <32 x i32> <i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
285  store <32 x i8> %tmp2, ptr %a
286  ret void
287}
288
289; REV pattern for v16i16 shuffle with vscale_range(2,2)
290define void @test_revv16i16_vl256(ptr %a) #1 {
291; CHECK-LABEL: test_revv16i16_vl256:
292; CHECK:       // %bb.0:
293; CHECK-NEXT:    ptrue p0.h
294; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
295; CHECK-NEXT:    rev z0.h, z0.h
296; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
297; CHECK-NEXT:    ret
  ; The mask reverses all sixteen i16 elements, with the first index left
  ; undef; the checks still expect an unpredicated REV on .h elements.
298  %tmp1 = load <16 x i16>, ptr %a
299  %tmp2 = shufflevector <16 x i16> %tmp1, <16 x i16> undef, <16 x i32> <i32 undef, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
300  store <16 x i16> %tmp2, ptr %a
301  ret void
302}
303
304; REV pattern for v8f32 shuffle with vscale_range(2,2)
305define void @test_revv8f32_vl256(ptr %a) #1 {
306; CHECK-LABEL: test_revv8f32_vl256:
307; CHECK:       // %bb.0:
308; CHECK-NEXT:    ptrue p0.s
309; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
310; CHECK-NEXT:    rev z0.s, z0.s
311; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
312; CHECK-NEXT:    ret
  ; The mask reverses all eight float elements (7..0); the checks expect an
  ; unpredicated REV on .s elements.
313  %tmp1 = load <8 x float>, ptr %a
314  %tmp2 = shufflevector <8 x float> %tmp1, <8 x float> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
315  store <8 x float> %tmp2, ptr %a
316  ret void
317}
318
319; REV pattern for v4f64 shuffle with vscale_range(2,2)
320define void @test_revv4f64_vl256(ptr %a) #1 {
321; CHECK-LABEL: test_revv4f64_vl256:
322; CHECK:       // %bb.0:
323; CHECK-NEXT:    ptrue p0.d
324; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
325; CHECK-NEXT:    rev z0.d, z0.d
326; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
327; CHECK-NEXT:    ret
  ; The mask reverses all four double elements (3..0); the checks expect an
  ; unpredicated REV on .d elements.
328  %tmp1 = load <4 x double>, ptr %a
329  %tmp2 = shufflevector <4 x double> %tmp1, <4 x double> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
330  store <4 x double> %tmp2, ptr %a
331  ret void
332}
333
334; REV pattern for shuffle two v8i32 inputs with the second input available, vscale_range(2,2).
335define void @test_revv8i32v8i32(ptr %a, ptr %b) #1 {
336; CHECK-LABEL: test_revv8i32v8i32:
337; CHECK:       // %bb.0:
338; CHECK-NEXT:    ptrue p0.s
339; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x1]
340; CHECK-NEXT:    rev z0.s, z0.s
341; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
342; CHECK-NEXT:    ret
  ; All mask indices are in 15..8, so the result is %tmp2 (loaded from %b)
  ; fully reversed. The checks expect a single load from x1, one REV on .s
  ; elements, and a store to x0; no load from %a appears in the checks.
343  %tmp1 = load <8 x i32>, ptr %a
344  %tmp2 = load <8 x i32>, ptr %b
345  %tmp3 = shufflevector <8 x i32> %tmp1, <8 x i32> %tmp2, <8 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8>
346  store <8 x i32> %tmp3, ptr %a
347  ret void
348}
349
350; Illegal REV pattern.
351define void @test_rev_fail(ptr %a) #1 {
352; CHECK-LABEL: test_rev_fail:
353; CHECK:       // %bb.0:
354; CHECK-NEXT:    ptrue p0.h
355; CHECK-NEXT:    adrp x8, .LCPI20_0
356; CHECK-NEXT:    add x8, x8, :lo12:.LCPI20_0
357; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
358; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x8]
359; CHECK-NEXT:    tbl z0.h, { z0.h }, z1.h
360; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
361; CHECK-NEXT:    ret
  ; Negative test: the mask reverses each half of eight i16 elements
  ; separately (7..0, then 15..8), which is not a whole-vector reversal. The
  ; checks expect a TBL with indices loaded from the constant pool.
362  %tmp1 = load <16 x i16>, ptr %a
363  %tmp2 = shufflevector <16 x i16> %tmp1, <16 x i16> undef, <16 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8>
364  store <16 x i16> %tmp2, ptr %a
365  ret void
366}
367
368; Don't use SVE for 128-bit shuffle with two inputs
369define void @test_revv8i16v8i16(ptr %a, ptr %b, ptr %c) #1 {
370; CHECK-LABEL: test_revv8i16v8i16:
371; CHECK:       // %bb.0:
372; CHECK-NEXT:    stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
373; CHECK-NEXT:    sub x9, sp, #48
374; CHECK-NEXT:    mov x29, sp
375; CHECK-NEXT:    and sp, x9, #0xffffffffffffffe0
376; CHECK-NEXT:    .cfi_def_cfa w29, 16
377; CHECK-NEXT:    .cfi_offset w30, -8
378; CHECK-NEXT:    .cfi_offset w29, -16
379; CHECK-NEXT:    mov x8, sp
380; CHECK-NEXT:    ldr q0, [x1]
381; CHECK-NEXT:    ldr q1, [x0]
382; CHECK-NEXT:    orr x9, x8, #0x1e
383; CHECK-NEXT:    orr x10, x8, #0x1c
384; CHECK-NEXT:    ptrue p0.h
385; CHECK-NEXT:    st1 { v0.h }[4], [x9]
386; CHECK-NEXT:    orr x9, x8, #0x18
387; CHECK-NEXT:    st1 { v0.h }[7], [x9]
388; CHECK-NEXT:    orr x9, x8, #0xe
389; CHECK-NEXT:    st1 { v1.h }[4], [x9]
390; CHECK-NEXT:    orr x9, x8, #0xc
391; CHECK-NEXT:    st1 { v1.h }[5], [x9]
392; CHECK-NEXT:    orr x9, x8, #0x8
393; CHECK-NEXT:    st1 { v0.h }[5], [x10]
394; CHECK-NEXT:    orr x10, x8, #0x10
395; CHECK-NEXT:    st1 { v1.h }[7], [x9]
396; CHECK-NEXT:    orr x9, x8, #0x4
397; CHECK-NEXT:    st1 { v0.h }[3], [x10]
398; CHECK-NEXT:    mov w10, #26 // =0x1a
399; CHECK-NEXT:    st1 { v1.h }[1], [x9]
400; CHECK-NEXT:    orr x9, x8, #0x2
401; CHECK-NEXT:    st1 { v1.h }[2], [x9]
402; CHECK-NEXT:    orr x9, x8, x10
403; CHECK-NEXT:    mov w10, #20 // =0x14
404; CHECK-NEXT:    st1 { v0.h }[6], [x9]
405; CHECK-NEXT:    orr x9, x8, x10
406; CHECK-NEXT:    mov w10, #18 // =0x12
407; CHECK-NEXT:    st1 { v0.h }[1], [x9]
408; CHECK-NEXT:    orr x9, x8, x10
409; CHECK-NEXT:    st1 { v0.h }[2], [x9]
410; CHECK-NEXT:    mov w9, #10 // =0xa
411; CHECK-NEXT:    orr x9, x8, x9
412; CHECK-NEXT:    st1 { v1.h }[3], [x8]
413; CHECK-NEXT:    st1 { v1.h }[6], [x9]
414; CHECK-NEXT:    str h0, [sp, #22]
415; CHECK-NEXT:    str h1, [sp, #6]
416; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x8]
417; CHECK-NEXT:    st1h { z0.h }, p0, [x2]
418; CHECK-NEXT:    mov sp, x29
419; CHECK-NEXT:    ldp x29, x30, [sp], #16 // 16-byte Folded Reload
420; CHECK-NEXT:    ret
  ; Two 128-bit <8 x i16> inputs are shuffled into one <16 x i16> result; the
  ; mask reverses the i16 elements in groups of four across both inputs. The
  ; checks expect both inputs to be loaded as NEON q registers and the result
  ; to be assembled lane by lane in a stack slot, then copied to %c with
  ; ld1h/st1h. No rev* or tbl instruction is expected.
421  %tmp1 = load <8 x i16>, ptr %a
422  %tmp2 = load <8 x i16>, ptr %b
423  %tmp3 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
424  store <16 x i16> %tmp3, ptr %c
425  ret void
426}
427
; #0: SVE only; the vector length bound comes from the RUN lines'
; -aarch64-sve-vector-bits-min option.
428attributes #0 = { "target-features"="+sve" }
; #1: SVE with vscale_range(2,2); used by the *_vl256 and *_fail tests.
429attributes #1 = { "target-features"="+sve" vscale_range(2,2) }
; #2: SVE2p1; used by the REVD tests.
430attributes #2 = { "target-features"="+sve2p1" }
431