xref: /llvm-project/llvm/test/CodeGen/ARM/bf16-shuffle.ll (revision 7b3bbd83c0c24087072ec5b22a76799ab31f87d5)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=arm-eabi -mattr=+v8.6a,+neon -float-abi=hard < %s | FileCheck %s --check-prefixes=CHECK,CHECK-NOFP16
3; RUN: llc -mtriple=arm-eabi -mattr=+v8.6a,+neon,+bf16 -float-abi=hard < %s | FileCheck %s --check-prefixes=CHECK,CHECK-NOFP16
4; RUN: llc -mtriple=arm-eabi -mattr=+v8.6a,+neon,+fullfp16,+bf16 -float-abi=hard < %s | FileCheck %s --check-prefixes=CHECK,CHECK-FP16
5
; Aggregate return types used by the zip/uzp/trn tests in this file: each one
; wraps an array of two bfloat vectors (4-lane and 8-lane respectively).
; NOTE(review): the names say "float16" although the element type is bfloat;
; confirm whether that is intentional before renaming.
6%struct.float16x4x2_t = type { [2 x <4 x bfloat>] }
7%struct.float16x8x2_t = type { [2 x <8 x bfloat>] }
8
; 64-bit bitwise select on bfloat vectors. The <4 x i16> mask %a and both
; <4 x bfloat> operands are bitcast to <8 x i8>, passed to
; @llvm.arm.neon.vbsl.v8i8, and the result is bitcast back to <4 x bfloat>.
; Expected output: the bitcasts fold away, leaving one `vbsl d0, d1, d2`.
9define dso_local <4 x bfloat> @test_vbsl_bf16(<4 x i16> %a, <4 x bfloat> %b, <4 x bfloat> %c) {
10; CHECK-LABEL: test_vbsl_bf16:
11; CHECK:       @ %bb.0: @ %entry
12; CHECK-NEXT:    vbsl d0, d1, d2
13; CHECK-NEXT:    bx lr
14entry:
15  %0 = bitcast <4 x i16> %a to <8 x i8>
16  %1 = bitcast <4 x bfloat> %b to <8 x i8>
17  %2 = bitcast <4 x bfloat> %c to <8 x i8>
18  %vbsl_v.i = tail call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> %0, <8 x i8> %1, <8 x i8> %2)
19  %3 = bitcast <8 x i8> %vbsl_v.i to <4 x bfloat>
20  ret <4 x bfloat> %3
21}
22
; 128-bit bitwise select on bfloat vectors. The <8 x i16> mask %a and both
; <8 x bfloat> operands are bitcast to <16 x i8>, passed to
; @llvm.arm.neon.vbsl.v16i8, and the result is bitcast back to <8 x bfloat>.
; Expected output: the bitcasts fold away, leaving one `vbsl q0, q1, q2`.
23define dso_local <8 x bfloat> @test_vbslq_bf16(<8 x i16> %a, <8 x bfloat> %b, <8 x bfloat> %c) {
24; CHECK-LABEL: test_vbslq_bf16:
25; CHECK:       @ %bb.0: @ %entry
26; CHECK-NEXT:    vbsl q0, q1, q2
27; CHECK-NEXT:    bx lr
28entry:
29  %0 = bitcast <8 x i16> %a to <16 x i8>
30  %1 = bitcast <8 x bfloat> %b to <16 x i8>
31  %2 = bitcast <8 x bfloat> %c to <16 x i8>
32  %vbslq_v.i = tail call <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8> %0, <16 x i8> %1, <16 x i8> %2)
33  %3 = bitcast <16 x i8> %vbslq_v.i to <8 x bfloat>
34  ret <8 x bfloat> %3
35}
36
; Interleave (zip) of two <4 x bfloat> vectors. The first shuffle interleaves
; the low halves of %a and %b (mask 0,4,1,5), the second the high halves
; (mask 2,6,3,7); both results are returned together in one struct.
; Expected output: the shuffle pair is matched to a single `vzip.16 d0, d1`.
37define dso_local %struct.float16x4x2_t @test_vzip_bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
38; CHECK-LABEL: test_vzip_bf16:
39; CHECK:       @ %bb.0: @ %entry
40; CHECK-NEXT:    vzip.16 d0, d1
41; CHECK-NEXT:    bx lr
42entry:
43  %vzip.i = shufflevector <4 x bfloat> %a, <4 x bfloat> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
44  %vzip1.i = shufflevector <4 x bfloat> %a, <4 x bfloat> %b, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
45  %.fca.0.0.insert = insertvalue %struct.float16x4x2_t undef, <4 x bfloat> %vzip.i, 0, 0
46  %.fca.0.1.insert = insertvalue %struct.float16x4x2_t %.fca.0.0.insert, <4 x bfloat> %vzip1.i, 0, 1
47  ret %struct.float16x4x2_t %.fca.0.1.insert
48}
49
; Interleave (zip) of two <8 x bfloat> vectors. The first shuffle interleaves
; elements 0..3 of %a with elements 0..3 of %b (mask 0,8,1,9,...), the second
; does the same for elements 4..7 (mask 4,12,5,13,...); both results are
; returned in one struct.
; Expected output: the shuffle pair is matched to a single `vzip.16 q0, q1`.
50define dso_local %struct.float16x8x2_t @test_vzipq_bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
51; CHECK-LABEL: test_vzipq_bf16:
52; CHECK:       @ %bb.0: @ %entry
53; CHECK-NEXT:    vzip.16 q0, q1
54; CHECK-NEXT:    bx lr
55entry:
56  %vzip.i = shufflevector <8 x bfloat> %a, <8 x bfloat> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
57  %vzip1.i = shufflevector <8 x bfloat> %a, <8 x bfloat> %b, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
58  %.fca.0.0.insert = insertvalue %struct.float16x8x2_t undef, <8 x bfloat> %vzip.i, 0, 0
59  %.fca.0.1.insert = insertvalue %struct.float16x8x2_t %.fca.0.0.insert, <8 x bfloat> %vzip1.i, 0, 1
60  ret %struct.float16x8x2_t %.fca.0.1.insert
61}
62
; De-interleave (unzip) of two <4 x bfloat> vectors. The first shuffle gathers
; the even-indexed elements of the concatenation of %a and %b (mask 0,2,4,6),
; the second gathers the odd-indexed ones (mask 1,3,5,7); both results are
; returned in one struct.
; Expected output: the shuffle pair is matched to a single `vuzp.16 d0, d1`.
63define dso_local %struct.float16x4x2_t @test_vuzp_bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
64; CHECK-LABEL: test_vuzp_bf16:
65; CHECK:       @ %bb.0: @ %entry
66; CHECK-NEXT:    vuzp.16 d0, d1
67; CHECK-NEXT:    bx lr
68entry:
69  %vuzp.i = shufflevector <4 x bfloat> %a, <4 x bfloat> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
70  %vuzp1.i = shufflevector <4 x bfloat> %a, <4 x bfloat> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
71  %.fca.0.0.insert = insertvalue %struct.float16x4x2_t undef, <4 x bfloat> %vuzp.i, 0, 0
72  %.fca.0.1.insert = insertvalue %struct.float16x4x2_t %.fca.0.0.insert, <4 x bfloat> %vuzp1.i, 0, 1
73  ret %struct.float16x4x2_t %.fca.0.1.insert
74}
75
; De-interleave (unzip) of two <8 x bfloat> vectors. The first shuffle gathers
; the even-indexed elements of the concatenation of %a and %b (mask 0,2,...,14),
; the second gathers the odd-indexed ones (mask 1,3,...,15); both results are
; returned in one struct.
; Expected output: the shuffle pair is matched to a single `vuzp.16 q0, q1`.
76define dso_local %struct.float16x8x2_t @test_vuzpq_bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
77; CHECK-LABEL: test_vuzpq_bf16:
78; CHECK:       @ %bb.0: @ %entry
79; CHECK-NEXT:    vuzp.16 q0, q1
80; CHECK-NEXT:    bx lr
81entry:
82  %vuzp.i = shufflevector <8 x bfloat> %a, <8 x bfloat> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
83  %vuzp1.i = shufflevector <8 x bfloat> %a, <8 x bfloat> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
84  %.fca.0.0.insert = insertvalue %struct.float16x8x2_t undef, <8 x bfloat> %vuzp.i, 0, 0
85  %.fca.0.1.insert = insertvalue %struct.float16x8x2_t %.fca.0.0.insert, <8 x bfloat> %vuzp1.i, 0, 1
86  ret %struct.float16x8x2_t %.fca.0.1.insert
87}
88
; Transpose of two <4 x bfloat> vectors. The first shuffle pairs the
; even-indexed elements of %a with the even-indexed elements of %b
; (mask 0,4,2,6), the second pairs the odd-indexed ones (mask 1,5,3,7); both
; results are returned in one struct.
; Expected output: the shuffle pair is matched to a single `vtrn.16 d0, d1`.
89define dso_local %struct.float16x4x2_t @test_vtrn_bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
90; CHECK-LABEL: test_vtrn_bf16:
91; CHECK:       @ %bb.0: @ %entry
92; CHECK-NEXT:    vtrn.16 d0, d1
93; CHECK-NEXT:    bx lr
94entry:
95  %vtrn.i = shufflevector <4 x bfloat> %a, <4 x bfloat> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
96  %vtrn1.i = shufflevector <4 x bfloat> %a, <4 x bfloat> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
97  %.fca.0.0.insert = insertvalue %struct.float16x4x2_t undef, <4 x bfloat> %vtrn.i, 0, 0
98  %.fca.0.1.insert = insertvalue %struct.float16x4x2_t %.fca.0.0.insert, <4 x bfloat> %vtrn1.i, 0, 1
99  ret %struct.float16x4x2_t %.fca.0.1.insert
100}
101
; Transpose of two <8 x bfloat> vectors. The first shuffle pairs the
; even-indexed elements of %a with the even-indexed elements of %b
; (mask 0,8,2,10,...), the second pairs the odd-indexed ones
; (mask 1,9,3,11,...); both results are returned in one struct.
; Expected output: the shuffle pair is matched to a single `vtrn.16 q0, q1`.
102define dso_local %struct.float16x8x2_t @test_vtrnq_bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
103; CHECK-LABEL: test_vtrnq_bf16:
104; CHECK:       @ %bb.0: @ %entry
105; CHECK-NEXT:    vtrn.16 q0, q1
106; CHECK-NEXT:    bx lr
107entry:
108  %vtrn.i = shufflevector <8 x bfloat> %a, <8 x bfloat> %b, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
109  %vtrn1.i = shufflevector <8 x bfloat> %a, <8 x bfloat> %b, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
110  %.fca.0.0.insert = insertvalue %struct.float16x8x2_t undef, <8 x bfloat> %vtrn.i, 0, 0
111  %.fca.0.1.insert = insertvalue %struct.float16x8x2_t %.fca.0.0.insert, <8 x bfloat> %vtrn1.i, 0, 1
112  ret %struct.float16x8x2_t %.fca.0.1.insert
113}
114
; Splat of a scalar bfloat into <4 x bfloat>. The scalar arrives as a float;
; its bit pattern is truncated to the low 16 bits (bitcast to i32, trunc to
; i16), reinterpreted as bfloat, inserted into lane 0 and broadcast with an
; all-zero shuffle mask. The IR body is the same as @test_vdup_n_bf16.
; Expected output depends on the RUN configuration:
;   - without +fullfp16: the value is stored to a 4-byte stack slot with strh
;     and reloaded with a duplicating `vld1.16 {d0[]}`;
;   - with +fullfp16: a single `vdup.16 d0, d0[0]` straight from the argument
;     register.
115define dso_local <4 x bfloat> @test_vmov_n_bf16(float %a.coerce) {
116; CHECK-NOFP16-LABEL: test_vmov_n_bf16:
117; CHECK-NOFP16:       @ %bb.0: @ %entry
118; CHECK-NOFP16-NEXT:    .pad #4
119; CHECK-NOFP16-NEXT:    sub sp, sp, #4
120; CHECK-NOFP16-NEXT:    vmov r0, s0
121; CHECK-NOFP16-NEXT:    strh r0, [sp, #2]
122; CHECK-NOFP16-NEXT:    add r0, sp, #2
123; CHECK-NOFP16-NEXT:    vld1.16 {d0[]}, [r0:16]
124; CHECK-NOFP16-NEXT:    add sp, sp, #4
125; CHECK-NOFP16-NEXT:    bx lr
126;
127; CHECK-FP16-LABEL: test_vmov_n_bf16:
128; CHECK-FP16:       @ %bb.0: @ %entry
129; CHECK-FP16-NEXT:    @ kill: def $s0 killed $s0 def $d0
130; CHECK-FP16-NEXT:    vdup.16 d0, d0[0]
131; CHECK-FP16-NEXT:    bx lr
132entry:
133  %0 = bitcast float %a.coerce to i32
134  %tmp.0.extract.trunc = trunc i32 %0 to i16
135  %1 = bitcast i16 %tmp.0.extract.trunc to bfloat
136  %vecinit = insertelement <4 x bfloat> undef, bfloat %1, i32 0
137  %vecinit4 = shufflevector <4 x bfloat> %vecinit, <4 x bfloat> undef, <4 x i32> zeroinitializer
138  ret <4 x bfloat> %vecinit4
139}
140
; Splat of a scalar bfloat into <8 x bfloat>. The scalar arrives as a float;
; its bit pattern is truncated to the low 16 bits (bitcast to i32, trunc to
; i16), reinterpreted as bfloat, inserted into lane 0 and broadcast with an
; all-zero shuffle mask. The IR body is the same as @test_vdupq_n_bf16.
; Expected output depends on the RUN configuration:
;   - without +fullfp16: the value is stored to a 4-byte stack slot with strh
;     and reloaded with a duplicating `vld1.16 {d0[], d1[]}`;
;   - with +fullfp16: a single `vdup.16 q0, d0[0]` straight from the argument
;     register.
141define dso_local <8 x bfloat> @test_vmovq_n_bf16(float %a.coerce) {
142; CHECK-NOFP16-LABEL: test_vmovq_n_bf16:
143; CHECK-NOFP16:       @ %bb.0: @ %entry
144; CHECK-NOFP16-NEXT:    .pad #4
145; CHECK-NOFP16-NEXT:    sub sp, sp, #4
146; CHECK-NOFP16-NEXT:    vmov r0, s0
147; CHECK-NOFP16-NEXT:    strh r0, [sp, #2]
148; CHECK-NOFP16-NEXT:    add r0, sp, #2
149; CHECK-NOFP16-NEXT:    vld1.16 {d0[], d1[]}, [r0:16]
150; CHECK-NOFP16-NEXT:    add sp, sp, #4
151; CHECK-NOFP16-NEXT:    bx lr
152;
153; CHECK-FP16-LABEL: test_vmovq_n_bf16:
154; CHECK-FP16:       @ %bb.0: @ %entry
155; CHECK-FP16-NEXT:    @ kill: def $s0 killed $s0 def $d0
156; CHECK-FP16-NEXT:    vdup.16 q0, d0[0]
157; CHECK-FP16-NEXT:    bx lr
158entry:
159  %0 = bitcast float %a.coerce to i32
160  %tmp.0.extract.trunc = trunc i32 %0 to i16
161  %1 = bitcast i16 %tmp.0.extract.trunc to bfloat
162  %vecinit = insertelement <8 x bfloat> undef, bfloat %1, i32 0
163  %vecinit8 = shufflevector <8 x bfloat> %vecinit, <8 x bfloat> undef, <8 x i32> zeroinitializer
164  ret <8 x bfloat> %vecinit8
165}
166
; Splat of a scalar bfloat into <4 x bfloat>. The scalar arrives as a float;
; its bit pattern is truncated to the low 16 bits (bitcast to i32, trunc to
; i16), reinterpreted as bfloat, inserted into lane 0 and broadcast with an
; all-zero shuffle mask. The IR body is the same as @test_vmov_n_bf16.
; Expected output depends on the RUN configuration:
;   - without +fullfp16: the value is stored to a 4-byte stack slot with strh
;     and reloaded with a duplicating `vld1.16 {d0[]}`;
;   - with +fullfp16: a single `vdup.16 d0, d0[0]` straight from the argument
;     register.
167define dso_local <4 x bfloat> @test_vdup_n_bf16(float %a.coerce) {
168; CHECK-NOFP16-LABEL: test_vdup_n_bf16:
169; CHECK-NOFP16:       @ %bb.0: @ %entry
170; CHECK-NOFP16-NEXT:    .pad #4
171; CHECK-NOFP16-NEXT:    sub sp, sp, #4
172; CHECK-NOFP16-NEXT:    vmov r0, s0
173; CHECK-NOFP16-NEXT:    strh r0, [sp, #2]
174; CHECK-NOFP16-NEXT:    add r0, sp, #2
175; CHECK-NOFP16-NEXT:    vld1.16 {d0[]}, [r0:16]
176; CHECK-NOFP16-NEXT:    add sp, sp, #4
177; CHECK-NOFP16-NEXT:    bx lr
178;
179; CHECK-FP16-LABEL: test_vdup_n_bf16:
180; CHECK-FP16:       @ %bb.0: @ %entry
181; CHECK-FP16-NEXT:    @ kill: def $s0 killed $s0 def $d0
182; CHECK-FP16-NEXT:    vdup.16 d0, d0[0]
183; CHECK-FP16-NEXT:    bx lr
184entry:
185  %0 = bitcast float %a.coerce to i32
186  %tmp.0.extract.trunc = trunc i32 %0 to i16
187  %1 = bitcast i16 %tmp.0.extract.trunc to bfloat
188  %vecinit = insertelement <4 x bfloat> undef, bfloat %1, i32 0
189  %vecinit4 = shufflevector <4 x bfloat> %vecinit, <4 x bfloat> undef, <4 x i32> zeroinitializer
190  ret <4 x bfloat> %vecinit4
191}
192
; Splat of a scalar bfloat into <8 x bfloat>. The scalar arrives as a float;
; its bit pattern is truncated to the low 16 bits (bitcast to i32, trunc to
; i16), reinterpreted as bfloat, inserted into lane 0 and broadcast with an
; all-zero shuffle mask. The IR body is the same as @test_vmovq_n_bf16.
; Expected output depends on the RUN configuration:
;   - without +fullfp16: the value is stored to a 4-byte stack slot with strh
;     and reloaded with a duplicating `vld1.16 {d0[], d1[]}`;
;   - with +fullfp16: a single `vdup.16 q0, d0[0]` straight from the argument
;     register.
193define dso_local <8 x bfloat> @test_vdupq_n_bf16(float %a.coerce) {
194; CHECK-NOFP16-LABEL: test_vdupq_n_bf16:
195; CHECK-NOFP16:       @ %bb.0: @ %entry
196; CHECK-NOFP16-NEXT:    .pad #4
197; CHECK-NOFP16-NEXT:    sub sp, sp, #4
198; CHECK-NOFP16-NEXT:    vmov r0, s0
199; CHECK-NOFP16-NEXT:    strh r0, [sp, #2]
200; CHECK-NOFP16-NEXT:    add r0, sp, #2
201; CHECK-NOFP16-NEXT:    vld1.16 {d0[], d1[]}, [r0:16]
202; CHECK-NOFP16-NEXT:    add sp, sp, #4
203; CHECK-NOFP16-NEXT:    bx lr
204;
205; CHECK-FP16-LABEL: test_vdupq_n_bf16:
206; CHECK-FP16:       @ %bb.0: @ %entry
207; CHECK-FP16-NEXT:    @ kill: def $s0 killed $s0 def $d0
208; CHECK-FP16-NEXT:    vdup.16 q0, d0[0]
209; CHECK-FP16-NEXT:    bx lr
210entry:
211  %0 = bitcast float %a.coerce to i32
212  %tmp.0.extract.trunc = trunc i32 %0 to i16
213  %1 = bitcast i16 %tmp.0.extract.trunc to bfloat
214  %vecinit = insertelement <8 x bfloat> undef, bfloat %1, i32 0
215  %vecinit8 = shufflevector <8 x bfloat> %vecinit, <8 x bfloat> undef, <8 x i32> zeroinitializer
216  ret <8 x bfloat> %vecinit8
217}
218
; Broadcast of lane 3 of a <4 x bfloat> vector to all four result lanes
; (shuffle mask 3,3,3,3).
; Expected output: a single lane-indexed `vdup.16 d0, d0[3]`.
219define dso_local <4 x bfloat> @test_vdup_lane_bf16(<4 x bfloat> %a) {
220; CHECK-LABEL: test_vdup_lane_bf16:
221; CHECK:       @ %bb.0: @ %entry
222; CHECK-NEXT:    vdup.16 d0, d0[3]
223; CHECK-NEXT:    bx lr
224entry:
225  %shuffle = shufflevector <4 x bfloat> %a, <4 x bfloat> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
226  ret <4 x bfloat> %shuffle
227}
228
; Broadcast of lane 3 of a <4 x bfloat> vector to all eight lanes of an
; <8 x bfloat> result (widening shuffle with an all-3 mask).
; Expected output: a single lane-indexed `vdup.16 q0, d0[3]`.
229define dso_local <8 x bfloat> @test_vdupq_lane_bf16(<4 x bfloat> %a) {
230; CHECK-LABEL: test_vdupq_lane_bf16:
231; CHECK:       @ %bb.0: @ %entry
232; CHECK-NEXT:    @ kill: def $d0 killed $d0 def $q0
233; CHECK-NEXT:    vdup.16 q0, d0[3]
234; CHECK-NEXT:    bx lr
235entry:
236  %shuffle = shufflevector <4 x bfloat> %a, <4 x bfloat> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
237  ret <8 x bfloat> %shuffle
238}
239
; Extraction of a 4-element window from the concatenation of two <4 x bfloat>
; vectors, starting at element 2 (mask 2,3,4,5).
; Expected output: a single `vext.16 d0, d0, d1, #2`, whose immediate is the
; starting element index.
240define dso_local <4 x bfloat> @test_vext_bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
241; CHECK-LABEL: test_vext_bf16:
242; CHECK:       @ %bb.0: @ %entry
243; CHECK-NEXT:    vext.16 d0, d0, d1, #2
244; CHECK-NEXT:    bx lr
245entry:
246  %vext = shufflevector <4 x bfloat> %a, <4 x bfloat> %b, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
247  ret <4 x bfloat> %vext
248}
249
; Extraction of an 8-element window from the concatenation of two
; <8 x bfloat> vectors, starting at element 5 (mask 5,6,...,12).
; Expected output: a single `vext.16 q0, q0, q1, #5`, whose immediate is the
; starting element index.
250define dso_local <8 x bfloat> @test_vextq_bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
251; CHECK-LABEL: test_vextq_bf16:
252; CHECK:       @ %bb.0: @ %entry
253; CHECK-NEXT:    vext.16 q0, q0, q1, #5
254; CHECK-NEXT:    bx lr
255entry:
256  %vext = shufflevector <8 x bfloat> %a, <8 x bfloat> %b, <8 x i32> <i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12>
257  ret <8 x bfloat> %vext
258}
259
; Extraction of the upper four elements (mask 4,5,6,7) of an <8 x bfloat>
; vector as a <4 x bfloat>. The selected window is exactly the second
; d-register of the source.
; Expected output: a plain register copy, `vmov.f64 d0, d1`, with no vext.
260define dso_local <4 x bfloat> @test_vext_aligned_bf16(<8 x bfloat> %a) {
261; CHECK-LABEL: test_vext_aligned_bf16:
262; CHECK:       @ %bb.0: @ %entry
263; CHECK-NEXT:    vmov.f64 d0, d1
264; CHECK-NEXT:    bx lr
265entry:
266  %vext = shufflevector <8 x bfloat> %a, <8 x bfloat> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
267  ret <4 x bfloat> %vext
268}
269
; Extraction of four elements starting at element 3 (mask 3,4,5,6) of an
; <8 x bfloat> vector as a <4 x bfloat>. The window straddles both
; d-registers of the source.
; Expected output: a single `vext.16 d0, d0, d1, #3` across the two halves.
270define dso_local <4 x bfloat> @test_vext_unaligned_bf16(<8 x bfloat> %a) {
271; CHECK-LABEL: test_vext_unaligned_bf16:
272; CHECK:       @ %bb.0: @ %entry
273; CHECK-NEXT:    vext.16 d0, d0, d1, #3
274; CHECK-NEXT:    bx lr
275entry:
276  %vext = shufflevector <8 x bfloat> %a, <8 x bfloat> undef, <4 x i32> <i32 3, i32 4, i32 5, i32 6>
277  ret <4 x bfloat> %vext
278}
279
; Stride-3 gather, phase 0: selects elements 0,3,6,...,21 of a <32 x bfloat>
; source into an <8 x bfloat> result. No single NEON permute matches this
; mask, so the expected output is a mix of vext/vrev32 permutes and per-lane
; moves through core registers.
; The two expected sequences differ only in how odd-numbered 16-bit lanes are
; read: without +fullfp16 they are read with `vmov.u16 rN, dM[i]`; with
; +fullfp16 they are read with `vmovx.f16` followed by `vmov rN, sM`.
280define arm_aapcs_vfpcc <8 x bfloat> @shuffle3step0_bf16(<32 x bfloat> %src) {
281; CHECK-NOFP16-LABEL: shuffle3step0_bf16:
282; CHECK-NOFP16:       @ %bb.0: @ %entry
283; CHECK-NOFP16-NEXT:    vmov r1, s0
284; CHECK-NOFP16-NEXT:    vmov.u16 r0, d0[3]
285; CHECK-NOFP16-NEXT:    vrev32.16 d16, d3
286; CHECK-NOFP16-NEXT:    vext.16 d17, d4, d5, #2
287; CHECK-NOFP16-NEXT:    vext.16 d16, d16, d3, #1
288; CHECK-NOFP16-NEXT:    vext.16 d16, d17, d16, #2
289; CHECK-NOFP16-NEXT:    vext.16 d16, d16, d17, #1
290; CHECK-NOFP16-NEXT:    vext.16 d17, d16, d16, #1
291; CHECK-NOFP16-NEXT:    vmov.16 d16[0], r1
292; CHECK-NOFP16-NEXT:    vmov.16 d16[1], r0
293; CHECK-NOFP16-NEXT:    vmov r0, s3
294; CHECK-NOFP16-NEXT:    vmov.16 d16[2], r0
295; CHECK-NOFP16-NEXT:    vmov.u16 r0, d2[1]
296; CHECK-NOFP16-NEXT:    vmov.16 d16[3], r0
297; CHECK-NOFP16-NEXT:    vorr q0, q8, q8
298; CHECK-NOFP16-NEXT:    bx lr
299;
300; CHECK-FP16-LABEL: shuffle3step0_bf16:
301; CHECK-FP16:       @ %bb.0: @ %entry
302; CHECK-FP16-NEXT:    vmov r1, s0
303; CHECK-FP16-NEXT:    vext.16 d17, d4, d5, #2
304; CHECK-FP16-NEXT:    vmovx.f16 s8, s1
305; CHECK-FP16-NEXT:    vrev32.16 d16, d3
306; CHECK-FP16-NEXT:    vmov r0, s8
307; CHECK-FP16-NEXT:    vext.16 d16, d16, d3, #1
308; CHECK-FP16-NEXT:    vext.16 d16, d17, d16, #2
309; CHECK-FP16-NEXT:    vext.16 d16, d16, d17, #1
310; CHECK-FP16-NEXT:    vext.16 d17, d16, d16, #1
311; CHECK-FP16-NEXT:    vmov.16 d16[0], r1
312; CHECK-FP16-NEXT:    vmov.16 d16[1], r0
313; CHECK-FP16-NEXT:    vmov r0, s3
314; CHECK-FP16-NEXT:    vmovx.f16 s0, s4
315; CHECK-FP16-NEXT:    vmov.16 d16[2], r0
316; CHECK-FP16-NEXT:    vmov r0, s0
317; CHECK-FP16-NEXT:    vmov.16 d16[3], r0
318; CHECK-FP16-NEXT:    vorr q0, q8, q8
319; CHECK-FP16-NEXT:    bx lr
320entry:
321  %s1 = shufflevector <32 x bfloat> %src, <32 x bfloat> undef, <8 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21>
322  ret <8 x bfloat> %s1
323}
324
; Stride-3 gather, phase 1: selects elements 1,4,7,...,22 of a <32 x bfloat>
; source into an <8 x bfloat> result. The expected output first copies q0 to
; q3 (so q0 can be rebuilt in place) and then assembles the result one lane
; at a time through core registers, plus one `vdup.16` used to reach
; element 13.
; The two expected sequences differ only in how odd-numbered 16-bit lanes are
; read: without +fullfp16 they are read with `vmov.u16 rN, dM[i]`; with
; +fullfp16 they are read with `vmovx.f16` followed by `vmov rN, sM`.
325define arm_aapcs_vfpcc <8 x bfloat> @shuffle3step1_bf16(<32 x bfloat> %src) {
326; CHECK-NOFP16-LABEL: shuffle3step1_bf16:
327; CHECK-NOFP16:       @ %bb.0: @ %entry
328; CHECK-NOFP16-NEXT:    vorr q3, q0, q0
329; CHECK-NOFP16-NEXT:    vmov.u16 r1, d6[1]
330; CHECK-NOFP16-NEXT:    vmov r0, s14
331; CHECK-NOFP16-NEXT:    vmov.16 d0[0], r1
332; CHECK-NOFP16-NEXT:    vmov.16 d0[1], r0
333; CHECK-NOFP16-NEXT:    vmov.u16 r0, d7[3]
334; CHECK-NOFP16-NEXT:    vmov.16 d0[2], r0
335; CHECK-NOFP16-NEXT:    vmov r0, s5
336; CHECK-NOFP16-NEXT:    vdup.16 q1, d3[1]
337; CHECK-NOFP16-NEXT:    vmov r1, s4
338; CHECK-NOFP16-NEXT:    vmov.16 d0[3], r0
339; CHECK-NOFP16-NEXT:    vmov r0, s8
340; CHECK-NOFP16-NEXT:    vmov.16 d1[0], r1
341; CHECK-NOFP16-NEXT:    vmov.16 d1[1], r0
342; CHECK-NOFP16-NEXT:    vmov.u16 r0, d4[3]
343; CHECK-NOFP16-NEXT:    vmov.16 d1[2], r0
344; CHECK-NOFP16-NEXT:    vmov r0, s11
345; CHECK-NOFP16-NEXT:    vmov.16 d1[3], r0
346; CHECK-NOFP16-NEXT:    bx lr
347;
348; CHECK-FP16-LABEL: shuffle3step1_bf16:
349; CHECK-FP16:       @ %bb.0: @ %entry
350; CHECK-FP16-NEXT:    vorr q3, q0, q0
351; CHECK-FP16-NEXT:    vmovx.f16 s0, s12
352; CHECK-FP16-NEXT:    vmovx.f16 s12, s15
353; CHECK-FP16-NEXT:    vmov r1, s0
354; CHECK-FP16-NEXT:    vmov r0, s14
355; CHECK-FP16-NEXT:    vmov.16 d0[0], r1
356; CHECK-FP16-NEXT:    vmov.16 d0[1], r0
357; CHECK-FP16-NEXT:    vmov r0, s12
358; CHECK-FP16-NEXT:    vmov.16 d0[2], r0
359; CHECK-FP16-NEXT:    vmov r0, s5
360; CHECK-FP16-NEXT:    vdup.16 q1, d3[1]
361; CHECK-FP16-NEXT:    vmov r1, s4
362; CHECK-FP16-NEXT:    vmovx.f16 s4, s9
363; CHECK-FP16-NEXT:    vmov.16 d0[3], r0
364; CHECK-FP16-NEXT:    vmov r0, s8
365; CHECK-FP16-NEXT:    vmov.16 d1[0], r1
366; CHECK-FP16-NEXT:    vmov.16 d1[1], r0
367; CHECK-FP16-NEXT:    vmov r0, s4
368; CHECK-FP16-NEXT:    vmov.16 d1[2], r0
369; CHECK-FP16-NEXT:    vmov r0, s11
370; CHECK-FP16-NEXT:    vmov.16 d1[3], r0
371; CHECK-FP16-NEXT:    bx lr
372entry:
373  %s1 = shufflevector <32 x bfloat> %src, <32 x bfloat> undef, <8 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22>
374  ret <8 x bfloat> %s1
375}
376
; Stride-3 gather, phase 2: selects elements 2,5,8,...,23 of a <32 x bfloat>
; source into an <8 x bfloat> result. In the expected output the low half of
; the result (d0) is built with a chain of vext permutes, while the high half
; (d1) is assembled one lane at a time through core registers, plus one
; `vdup.16` used to reach element 14.
; The two expected sequences differ only in how odd-numbered 16-bit lanes are
; read: without +fullfp16 they are read with `vmov.u16 rN, dM[i]`; with
; +fullfp16 they are read with `vmovx.f16` followed by `vmov rN, sM`.
377define arm_aapcs_vfpcc <8 x bfloat> @shuffle3step2_bf16(<32 x bfloat> %src) {
378; CHECK-NOFP16-LABEL: shuffle3step2_bf16:
379; CHECK-NOFP16:       @ %bb.0: @ %entry
380; CHECK-NOFP16-NEXT:    vext.16 d16, d0, d1, #2
381; CHECK-NOFP16-NEXT:    vmov.u16 r0, d4[1]
382; CHECK-NOFP16-NEXT:    vext.16 d17, d16, d2, #3
383; CHECK-NOFP16-NEXT:    vext.16 d16, d2, d16, #1
384; CHECK-NOFP16-NEXT:    vdup.16 q1, d3[2]
385; CHECK-NOFP16-NEXT:    vext.16 d16, d16, d17, #2
386; CHECK-NOFP16-NEXT:    vmov r1, s4
387; CHECK-NOFP16-NEXT:    vext.16 d0, d16, d16, #1
388; CHECK-NOFP16-NEXT:    vmov.16 d1[0], r1
389; CHECK-NOFP16-NEXT:    vmov.16 d1[1], r0
390; CHECK-NOFP16-NEXT:    vmov r0, s10
391; CHECK-NOFP16-NEXT:    vmov.16 d1[2], r0
392; CHECK-NOFP16-NEXT:    vmov.u16 r0, d5[3]
393; CHECK-NOFP16-NEXT:    vmov.16 d1[3], r0
394; CHECK-NOFP16-NEXT:    bx lr
395;
396; CHECK-FP16-LABEL: shuffle3step2_bf16:
397; CHECK-FP16:       @ %bb.0: @ %entry
398; CHECK-FP16-NEXT:    vext.16 d16, d0, d1, #2
399; CHECK-FP16-NEXT:    vmovx.f16 s12, s8
400; CHECK-FP16-NEXT:    vmov r0, s12
401; CHECK-FP16-NEXT:    vext.16 d17, d16, d2, #3
402; CHECK-FP16-NEXT:    vext.16 d16, d2, d16, #1
403; CHECK-FP16-NEXT:    vdup.16 q1, d3[2]
404; CHECK-FP16-NEXT:    vext.16 d16, d16, d17, #2
405; CHECK-FP16-NEXT:    vmov r1, s4
406; CHECK-FP16-NEXT:    vmovx.f16 s4, s11
407; CHECK-FP16-NEXT:    vext.16 d0, d16, d16, #1
408; CHECK-FP16-NEXT:    vmov.16 d1[0], r1
409; CHECK-FP16-NEXT:    vmov.16 d1[1], r0
410; CHECK-FP16-NEXT:    vmov r0, s10
411; CHECK-FP16-NEXT:    vmov.16 d1[2], r0
412; CHECK-FP16-NEXT:    vmov r0, s4
413; CHECK-FP16-NEXT:    vmov.16 d1[3], r0
414; CHECK-FP16-NEXT:    bx lr
415entry:
416  %s1 = shufflevector <32 x bfloat> %src, <32 x bfloat> undef, <8 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23>
417  ret <8 x bfloat> %s1
418}
419
420
; Full reversal of a <4 x bfloat> vector (mask 3,2,1,0), i.e. reversal of the
; 16-bit elements within the single 64-bit group.
; Expected output: a single `vrev64.16 d0, d0`.
421define dso_local <4 x bfloat> @test_vrev64_bf16(<4 x bfloat> %a) {
422; CHECK-LABEL: test_vrev64_bf16:
423; CHECK:       @ %bb.0: @ %entry
424; CHECK-NEXT:    vrev64.16 d0, d0
425; CHECK-NEXT:    bx lr
426entry:
427  %shuffle.i = shufflevector <4 x bfloat> %a, <4 x bfloat> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
428  ret <4 x bfloat> %shuffle.i
429}
430
; Reversal of the 16-bit elements within each group of four in an
; <8 x bfloat> vector (mask 3,2,1,0,7,6,5,4); the two groups keep their
; positions.
; Expected output: a single `vrev64.16 q0, q0`.
431define dso_local <8 x bfloat> @test_vrev64q_bf16(<8 x bfloat> %a) {
432; CHECK-LABEL: test_vrev64q_bf16:
433; CHECK:       @ %bb.0: @ %entry
434; CHECK-NEXT:    vrev64.16 q0, q0
435; CHECK-NEXT:    bx lr
436entry:
437  %shuffle.i = shufflevector <8 x bfloat> %a, <8 x bfloat> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
438  ret <8 x bfloat> %shuffle.i
439}
440
; Swap of adjacent element pairs in a <4 x bfloat> vector (mask 1,0,3,2),
; i.e. reversal of the 16-bit elements within each 32-bit group.
; Expected output: a single `vrev32.16 d0, d0`.
441define dso_local <4 x bfloat> @test_vrev32_bf16(<4 x bfloat> %a) {
442; CHECK-LABEL: test_vrev32_bf16:
443; CHECK:       @ %bb.0: @ %entry
444; CHECK-NEXT:    vrev32.16 d0, d0
445; CHECK-NEXT:    bx lr
446entry:
447  %shuffle.i = shufflevector <4 x bfloat> %a, <4 x bfloat> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
448  ret <4 x bfloat> %shuffle.i
449}
450
; Swap of adjacent element pairs in an <8 x bfloat> vector
; (mask 1,0,3,2,5,4,7,6), i.e. reversal of the 16-bit elements within each
; 32-bit group.
; Expected output: a single `vrev32.16 q0, q0`.
451define dso_local <8 x bfloat> @test_vrev32q_bf16(<8 x bfloat> %a) {
452; CHECK-LABEL: test_vrev32q_bf16:
453; CHECK:       @ %bb.0: @ %entry
454; CHECK-NEXT:    vrev32.16 q0, q0
455; CHECK-NEXT:    bx lr
456entry:
457  %shuffle.i = shufflevector <8 x bfloat> %a, <8 x bfloat> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
458  ret <8 x bfloat> %shuffle.i
459}
460
; Load-and-splat into <4 x bfloat>: one bfloat is loaded from %b (align 2)
; and written to every lane with a chain of four insertelement instructions.
; Expected output: the load and the inserts fold into a single duplicating
; load, `vld1.16 {d0[]}, [r0:16]`.
461define <4 x bfloat> @test_vld_dup1_4xbfloat(ptr %b) {
462; CHECK-LABEL: test_vld_dup1_4xbfloat:
463; CHECK:       @ %bb.0: @ %entry
464; CHECK-NEXT:    vld1.16 {d0[]}, [r0:16]
465; CHECK-NEXT:    bx lr
466entry:
467  %b1 = load bfloat, ptr %b, align 2
468  %vecinit = insertelement <4 x bfloat> undef, bfloat %b1, i32 0
469  %vecinit2 = insertelement <4 x bfloat> %vecinit, bfloat %b1, i32 1
470  %vecinit3 = insertelement <4 x bfloat> %vecinit2, bfloat %b1, i32 2
471  %vecinit4 = insertelement <4 x bfloat> %vecinit3, bfloat %b1, i32 3
472  ret <4 x bfloat> %vecinit4
473}
474
; Load-and-splat into <8 x bfloat>: one bfloat is loaded from %b (align 2),
; inserted into lane 0 and broadcast with an all-zero shuffle mask.
; Expected output: the load and the splat fold into a single duplicating
; load, `vld1.16 {d0[], d1[]}, [r0:16]`.
475define <8 x bfloat> @test_vld_dup1_8xbfloat(ptr %b) local_unnamed_addr {
476; CHECK-LABEL: test_vld_dup1_8xbfloat:
477; CHECK:       @ %bb.0: @ %entry
478; CHECK-NEXT:    vld1.16 {d0[], d1[]}, [r0:16]
479; CHECK-NEXT:    bx lr
480entry:
481  %b1 = load bfloat, ptr %b, align 2
482  %vecinit = insertelement <8 x bfloat> undef, bfloat %b1, i32 0
483  %vecinit8 = shufflevector <8 x bfloat> %vecinit, <8 x bfloat> undef, <8 x i32> zeroinitializer
484  ret <8 x bfloat> %vecinit8
485}
486
; Concatenation of a <4 x bfloat> vector with itself into an <8 x bfloat>
; (both shuffle operands are %a, mask 0..7).
; Expected output: the low half is already in d0, so only the high half needs
; a copy, `vmov.f64 d1, d0`.
487define <8 x bfloat> @test_shufflevector8xbfloat(<4 x bfloat> %a) {
488; CHECK-LABEL: test_shufflevector8xbfloat:
489; CHECK:       @ %bb.0: @ %entry
490; CHECK-NEXT:    @ kill: def $d0 killed $d0 def $q0
491; CHECK-NEXT:    vmov.f64 d1, d0
492; CHECK-NEXT:    bx lr
493entry:
494  %r = shufflevector <4 x bfloat> %a, <4 x bfloat> %a, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
495  ret <8 x bfloat> %r
496}
497
; NEON bitwise-select intrinsics (64-bit and 128-bit byte-vector forms)
; called by @test_vbsl_bf16 and @test_vbslq_bf16 respectively.
498declare <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8>, <8 x i8>, <8 x i8>)
499declare <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8>, <16 x i8>, <16 x i8>)
500