; xref: /llvm-project/llvm/test/CodeGen/AArch64/bf16-shuffle.ll (revision f6947e479e14e7904aa0b2539a95f5dfdc8f9295)
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=aarch64 -mattr=+v8.6a,+neon < %s | FileCheck %s
; RUN: llc -mtriple=aarch64 -mattr=+v8.6a,+neon,+bf16 < %s | FileCheck %s
; RUN: llc -mtriple=aarch64 -mattr=+v8.6a,+neon,+fullfp16,+bf16 < %s | FileCheck %s

; All three invocations above use the same default check prefix, so the
; expected output must be identical with and without +bf16 / +fullfp16.

; Two-vector aggregates used as return types in this file. Despite the
; "float16" in their names, the element type is bfloat.
%struct.float16x4x2_t = type { [2 x <4 x bfloat>] }
%struct.float16x8x2_t = type { [2 x <8 x bfloat>] }

; Interleaves <4 x bfloat> %a and %b: mask <0,4,1,5> pairs up the low halves
; and mask <2,6,3,7> pairs up the high halves. Both results are returned in
; one struct. Expected lowering: zip1 + zip2 on .4h, with the zip1 result
; built in v2 and then moved to d0.
define dso_local %struct.float16x4x2_t @test_vzip_bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
; CHECK-LABEL: test_vzip_bf16:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    zip1 v2.4h, v0.4h, v1.4h
; CHECK-NEXT:    zip2 v1.4h, v0.4h, v1.4h
; CHECK-NEXT:    fmov d0, d2
; CHECK-NEXT:    ret
entry:
  %vzip.i = shufflevector <4 x bfloat> %a, <4 x bfloat> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
  %vzip1.i = shufflevector <4 x bfloat> %a, <4 x bfloat> %b, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
  %.fca.0.0.insert = insertvalue %struct.float16x4x2_t undef, <4 x bfloat> %vzip.i, 0, 0
  %.fca.0.1.insert = insertvalue %struct.float16x4x2_t %.fca.0.0.insert, <4 x bfloat> %vzip1.i, 0, 1
  ret %struct.float16x4x2_t %.fca.0.1.insert
}

; 128-bit variant of the zip test: interleaves the low halves (mask
; <0,8,1,9,2,10,3,11>) and the high halves (mask <4,12,...,7,15>) of two
; <8 x bfloat> inputs. Expected lowering: zip1 + zip2 on .8h, with the zip1
; result built in v2 and then copied to v0.
define dso_local %struct.float16x8x2_t @test_vzipq_bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
; CHECK-LABEL: test_vzipq_bf16:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    zip1 v2.8h, v0.8h, v1.8h
; CHECK-NEXT:    zip2 v1.8h, v0.8h, v1.8h
; CHECK-NEXT:    mov v0.16b, v2.16b
; CHECK-NEXT:    ret
entry:
  %vzip.i = shufflevector <8 x bfloat> %a, <8 x bfloat> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
  %vzip1.i = shufflevector <8 x bfloat> %a, <8 x bfloat> %b, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
  %.fca.0.0.insert = insertvalue %struct.float16x8x2_t undef, <8 x bfloat> %vzip.i, 0, 0
  %.fca.0.1.insert = insertvalue %struct.float16x8x2_t %.fca.0.0.insert, <8 x bfloat> %vzip1.i, 0, 1
  ret %struct.float16x8x2_t %.fca.0.1.insert
}

; De-interleaves the concatenation of <4 x bfloat> %a and %b: mask <0,2,4,6>
; picks the even-indexed elements and mask <1,3,5,7> the odd-indexed ones.
; Expected lowering: uzp1 + uzp2 on .4h, with the uzp1 result built in v2 and
; then moved to d0.
define dso_local %struct.float16x4x2_t @test_vuzp_bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
; CHECK-LABEL: test_vuzp_bf16:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    uzp1 v2.4h, v0.4h, v1.4h
; CHECK-NEXT:    uzp2 v1.4h, v0.4h, v1.4h
; CHECK-NEXT:    fmov d0, d2
; CHECK-NEXT:    ret
entry:
  %vuzp.i = shufflevector <4 x bfloat> %a, <4 x bfloat> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %vuzp1.i = shufflevector <4 x bfloat> %a, <4 x bfloat> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  %.fca.0.0.insert = insertvalue %struct.float16x4x2_t undef, <4 x bfloat> %vuzp.i, 0, 0
  %.fca.0.1.insert = insertvalue %struct.float16x4x2_t %.fca.0.0.insert, <4 x bfloat> %vuzp1.i, 0, 1
  ret %struct.float16x4x2_t %.fca.0.1.insert
}

; 128-bit variant of the unzip test: selects the even-indexed (mask
; <0,2,...,14>) and odd-indexed (mask <1,3,...,15>) elements of the
; concatenation of two <8 x bfloat> inputs. Expected lowering: uzp1 + uzp2 on
; .8h, with the uzp1 result built in v2 and then copied to v0.
define dso_local %struct.float16x8x2_t @test_vuzpq_bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
; CHECK-LABEL: test_vuzpq_bf16:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    uzp1 v2.8h, v0.8h, v1.8h
; CHECK-NEXT:    uzp2 v1.8h, v0.8h, v1.8h
; CHECK-NEXT:    mov v0.16b, v2.16b
; CHECK-NEXT:    ret
entry:
  %vuzp.i = shufflevector <8 x bfloat> %a, <8 x bfloat> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
  %vuzp1.i = shufflevector <8 x bfloat> %a, <8 x bfloat> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  %.fca.0.0.insert = insertvalue %struct.float16x8x2_t undef, <8 x bfloat> %vuzp.i, 0, 0
  %.fca.0.1.insert = insertvalue %struct.float16x8x2_t %.fca.0.0.insert, <8 x bfloat> %vuzp1.i, 0, 1
  ret %struct.float16x8x2_t %.fca.0.1.insert
}

; Transpose-style shuffle of <4 x bfloat> %a and %b: mask <0,4,2,6> takes the
; even lanes of both inputs, mask <1,5,3,7> the odd lanes. Expected lowering:
; trn1 + trn2 on .4h, with the trn1 result built in v2 and then moved to d0.
define dso_local %struct.float16x4x2_t @test_vtrn_bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
; CHECK-LABEL: test_vtrn_bf16:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    trn1 v2.4h, v0.4h, v1.4h
; CHECK-NEXT:    trn2 v1.4h, v0.4h, v1.4h
; CHECK-NEXT:    fmov d0, d2
; CHECK-NEXT:    ret
entry:
  %vtrn.i = shufflevector <4 x bfloat> %a, <4 x bfloat> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
  %vtrn1.i = shufflevector <4 x bfloat> %a, <4 x bfloat> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
  %.fca.0.0.insert = insertvalue %struct.float16x4x2_t undef, <4 x bfloat> %vtrn.i, 0, 0
  %.fca.0.1.insert = insertvalue %struct.float16x4x2_t %.fca.0.0.insert, <4 x bfloat> %vtrn1.i, 0, 1
  ret %struct.float16x4x2_t %.fca.0.1.insert
}

; 128-bit variant of the transpose test: mask <0,8,2,10,4,12,6,14> takes the
; even lanes of both <8 x bfloat> inputs, mask <1,9,3,11,5,13,7,15> the odd
; lanes. Expected lowering: trn1 + trn2 on .8h, with the trn1 result built in
; v2 and then copied to v0.
define dso_local %struct.float16x8x2_t @test_vtrnq_bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
; CHECK-LABEL: test_vtrnq_bf16:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    trn1 v2.8h, v0.8h, v1.8h
; CHECK-NEXT:    trn2 v1.8h, v0.8h, v1.8h
; CHECK-NEXT:    mov v0.16b, v2.16b
; CHECK-NEXT:    ret
entry:
  %vtrn.i = shufflevector <8 x bfloat> %a, <8 x bfloat> %b, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
  %vtrn1.i = shufflevector <8 x bfloat> %a, <8 x bfloat> %b, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
  %.fca.0.0.insert = insertvalue %struct.float16x8x2_t undef, <8 x bfloat> %vtrn.i, 0, 0
  %.fca.0.1.insert = insertvalue %struct.float16x8x2_t %.fca.0.0.insert, <8 x bfloat> %vtrn1.i, 0, 1
  ret %struct.float16x8x2_t %.fca.0.1.insert
}

; Splats a scalar bfloat into <4 x bfloat>. The scalar arrives as a float
; argument; its low 16 bits are extracted (bitcast to i32, trunc to i16,
; bitcast to bfloat) and broadcast with a zeroinitializer shuffle mask.
; Expected lowering: a single dup of lane h[0] into .4h.
define dso_local <4 x bfloat> @test_vmov_n_bf16(float %a.coerce) {
; CHECK-LABEL: test_vmov_n_bf16:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    // kill: def $s0 killed $s0 def $q0
; CHECK-NEXT:    dup v0.4h, v0.h[0]
; CHECK-NEXT:    ret
entry:
  %0 = bitcast float %a.coerce to i32
  %tmp.0.extract.trunc = trunc i32 %0 to i16
  %1 = bitcast i16 %tmp.0.extract.trunc to bfloat
  %vecinit = insertelement <4 x bfloat> undef, bfloat %1, i32 0
  %vecinit4 = shufflevector <4 x bfloat> %vecinit, <4 x bfloat> undef, <4 x i32> zeroinitializer
  ret <4 x bfloat> %vecinit4
}

; Splats a scalar bfloat into <8 x bfloat>. The scalar arrives as a float
; argument; its low 16 bits are extracted (bitcast to i32, trunc to i16,
; bitcast to bfloat) and broadcast with a zeroinitializer shuffle mask.
; Expected lowering: a single dup of lane h[0] into .8h.
define dso_local <8 x bfloat> @test_vmovq_n_bf16(float %a.coerce) {
; CHECK-LABEL: test_vmovq_n_bf16:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    // kill: def $s0 killed $s0 def $q0
; CHECK-NEXT:    dup v0.8h, v0.h[0]
; CHECK-NEXT:    ret
entry:
  %0 = bitcast float %a.coerce to i32
  %tmp.0.extract.trunc = trunc i32 %0 to i16
  %1 = bitcast i16 %tmp.0.extract.trunc to bfloat
  %vecinit = insertelement <8 x bfloat> undef, bfloat %1, i32 0
  %vecinit8 = shufflevector <8 x bfloat> %vecinit, <8 x bfloat> undef, <8 x i32> zeroinitializer
  ret <8 x bfloat> %vecinit8
}

; Same IR body as test_vmov_n_bf16 under a different function name: the low
; 16 bits of the float argument are reinterpreted as bfloat and splatted into
; <4 x bfloat>. Expected lowering: a single dup of lane h[0] into .4h.
define dso_local <4 x bfloat> @test_vdup_n_bf16(float %a.coerce) {
; CHECK-LABEL: test_vdup_n_bf16:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    // kill: def $s0 killed $s0 def $q0
; CHECK-NEXT:    dup v0.4h, v0.h[0]
; CHECK-NEXT:    ret
entry:
  %0 = bitcast float %a.coerce to i32
  %tmp.0.extract.trunc = trunc i32 %0 to i16
  %1 = bitcast i16 %tmp.0.extract.trunc to bfloat
  %vecinit = insertelement <4 x bfloat> undef, bfloat %1, i32 0
  %vecinit4 = shufflevector <4 x bfloat> %vecinit, <4 x bfloat> undef, <4 x i32> zeroinitializer
  ret <4 x bfloat> %vecinit4
}

; Same IR body as test_vmovq_n_bf16 under a different function name: the low
; 16 bits of the float argument are reinterpreted as bfloat and splatted into
; <8 x bfloat>. Expected lowering: a single dup of lane h[0] into .8h.
define dso_local <8 x bfloat> @test_vdupq_n_bf16(float %a.coerce) {
; CHECK-LABEL: test_vdupq_n_bf16:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    // kill: def $s0 killed $s0 def $q0
; CHECK-NEXT:    dup v0.8h, v0.h[0]
; CHECK-NEXT:    ret
entry:
  %0 = bitcast float %a.coerce to i32
  %tmp.0.extract.trunc = trunc i32 %0 to i16
  %1 = bitcast i16 %tmp.0.extract.trunc to bfloat
  %vecinit = insertelement <8 x bfloat> undef, bfloat %1, i32 0
  %vecinit8 = shufflevector <8 x bfloat> %vecinit, <8 x bfloat> undef, <8 x i32> zeroinitializer
  ret <8 x bfloat> %vecinit8
}

; Broadcasts lane 3 of a <4 x bfloat> input to all four result lanes
; (mask <3,3,3,3>). Expected lowering: a single dup from h[3] into .4h.
define dso_local <4 x bfloat> @test_vdup_lane_bf16(<4 x bfloat> %a) {
; CHECK-LABEL: test_vdup_lane_bf16:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT:    dup v0.4h, v0.h[3]
; CHECK-NEXT:    ret
entry:
  %shuffle = shufflevector <4 x bfloat> %a, <4 x bfloat> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
  ret <4 x bfloat> %shuffle
}

; Broadcasts lane 3 of a <4 x bfloat> input to all eight lanes of an
; <8 x bfloat> result (widening splat shuffle). Expected lowering: a single
; dup from h[3] into .8h.
define dso_local <8 x bfloat> @test_vdupq_lane_bf16(<4 x bfloat> %a) {
; CHECK-LABEL: test_vdupq_lane_bf16:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT:    dup v0.8h, v0.h[3]
; CHECK-NEXT:    ret
entry:
  %shuffle = shufflevector <4 x bfloat> %a, <4 x bfloat> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
  ret <8 x bfloat> %shuffle
}

; Extracts a 4-element window starting at element 2 of the concatenation of
; %a and %b (mask <2,3,4,5>). Expected lowering: ext on .8b with byte offset
; #4, i.e. 2 elements of 2 bytes each.
define dso_local <4 x bfloat> @test_vext_bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
; CHECK-LABEL: test_vext_bf16:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    ext v0.8b, v0.8b, v1.8b, #4
; CHECK-NEXT:    ret
entry:
  %vext = shufflevector <4 x bfloat> %a, <4 x bfloat> %b, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
  ret <4 x bfloat> %vext
}

; Extracts an 8-element window starting at element 5 of the concatenation of
; %a and %b (mask <5,...,12>). Expected lowering: ext on .16b with byte
; offset #10, i.e. 5 elements of 2 bytes each.
define dso_local <8 x bfloat> @test_vextq_bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
; CHECK-LABEL: test_vextq_bf16:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    ext v0.16b, v0.16b, v1.16b, #10
; CHECK-NEXT:    ret
entry:
  %vext = shufflevector <8 x bfloat> %a, <8 x bfloat> %b, <8 x i32> <i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12>
  ret <8 x bfloat> %vext
}

; Extracts the upper half (elements 4..7) of an <8 x bfloat> input as a
; <4 x bfloat> result. Expected lowering: ext of the register with itself at
; byte offset #8; the result is then read from d0.
define dso_local <4 x bfloat> @test_vext_aligned_bf16(<8 x bfloat> %a) {
; CHECK-LABEL: test_vext_aligned_bf16:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
; CHECK-NEXT:    ret
entry:
  %vext = shufflevector <8 x bfloat> %a, <8 x bfloat> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  ret <4 x bfloat> %vext
}

; Extracts elements 3..6 of an <8 x bfloat> input as a <4 x bfloat> result;
; the window does not start on a 64-bit half boundary. Expected lowering: ext
; of the register with itself at byte offset #6 (3 elements of 2 bytes); the
; result is then read from d0.
define dso_local <4 x bfloat> @test_vext_unaligned_bf16(<8 x bfloat> %a) {
; CHECK-LABEL: test_vext_unaligned_bf16:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    ext v0.16b, v0.16b, v0.16b, #6
; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
; CHECK-NEXT:    ret
entry:
  %vext = shufflevector <8 x bfloat> %a, <8 x bfloat> undef, <4 x i32> <i32 3, i32 4, i32 5, i32 6>
  ret <4 x bfloat> %vext
}

; Selects every third element of a <32 x bfloat> source starting at element 0
; (indices 0,3,...,21, all within the first 24 elements). Expected lowering:
; two chained two-register tbl lookups whose byte masks are loaded from the
; constant pool; the second tbl reads the first result plus a copy of the
; original v2.
define <8 x bfloat> @shuffle3step0_bf16(<32 x bfloat> %src) {
; CHECK-LABEL: shuffle3step0_bf16:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    adrp x8, .LCPI16_0
; CHECK-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
; CHECK-NEXT:    mov v3.16b, v2.16b
; CHECK-NEXT:    ldr q4, [x8, :lo12:.LCPI16_0]
; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
; CHECK-NEXT:    adrp x8, .LCPI16_1
; CHECK-NEXT:    tbl v2.16b, { v0.16b, v1.16b }, v4.16b
; CHECK-NEXT:    ldr q0, [x8, :lo12:.LCPI16_1]
; CHECK-NEXT:    tbl v0.16b, { v2.16b, v3.16b }, v0.16b
; CHECK-NEXT:    ret
entry:
  %s1 = shufflevector <32 x bfloat> %src, <32 x bfloat> undef, <8 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21>
  ret <8 x bfloat> %s1
}

; Selects every third element of a <32 x bfloat> source starting at element 1
; (indices 1,4,...,22, all within the first 24 elements). Expected lowering:
; two chained two-register tbl lookups whose byte masks are loaded from the
; constant pool; the second tbl reads the first result plus a copy of the
; original v2.
define <8 x bfloat> @shuffle3step1_bf16(<32 x bfloat> %src) {
; CHECK-LABEL: shuffle3step1_bf16:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    adrp x8, .LCPI17_0
; CHECK-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
; CHECK-NEXT:    mov v3.16b, v2.16b
; CHECK-NEXT:    ldr q4, [x8, :lo12:.LCPI17_0]
; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
; CHECK-NEXT:    adrp x8, .LCPI17_1
; CHECK-NEXT:    tbl v2.16b, { v0.16b, v1.16b }, v4.16b
; CHECK-NEXT:    ldr q0, [x8, :lo12:.LCPI17_1]
; CHECK-NEXT:    tbl v0.16b, { v2.16b, v3.16b }, v0.16b
; CHECK-NEXT:    ret
entry:
  %s1 = shufflevector <32 x bfloat> %src, <32 x bfloat> undef, <8 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22>
  ret <8 x bfloat> %s1
}

; Selects every third element of a <32 x bfloat> source starting at element 2
; (indices 2,5,...,23, all within the first 24 elements). Expected lowering:
; two chained two-register tbl lookups whose byte masks are loaded from the
; constant pool; the second tbl reads the first result plus a copy of the
; original v2.
define <8 x bfloat> @shuffle3step2_bf16(<32 x bfloat> %src) {
; CHECK-LABEL: shuffle3step2_bf16:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    adrp x8, .LCPI18_0
; CHECK-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
; CHECK-NEXT:    mov v3.16b, v2.16b
; CHECK-NEXT:    ldr q4, [x8, :lo12:.LCPI18_0]
; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
; CHECK-NEXT:    adrp x8, .LCPI18_1
; CHECK-NEXT:    tbl v2.16b, { v0.16b, v1.16b }, v4.16b
; CHECK-NEXT:    ldr q0, [x8, :lo12:.LCPI18_1]
; CHECK-NEXT:    tbl v0.16b, { v2.16b, v3.16b }, v0.16b
; CHECK-NEXT:    ret
entry:
  %s1 = shufflevector <32 x bfloat> %src, <32 x bfloat> undef, <8 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23>
  ret <8 x bfloat> %s1
}


; Reverses the four elements of a <4 x bfloat> vector (mask <3,2,1,0>).
; Expected lowering: a single rev64 on .4h.
define dso_local <4 x bfloat> @test_vrev64_bf16(<4 x bfloat> %a) {
; CHECK-LABEL: test_vrev64_bf16:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    rev64 v0.4h, v0.4h
; CHECK-NEXT:    ret
entry:
  %shuffle.i = shufflevector <4 x bfloat> %a, <4 x bfloat> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
  ret <4 x bfloat> %shuffle.i
}

; Reverses the elements within each group of four in an <8 x bfloat> vector
; (mask <3,2,1,0,7,6,5,4>). Expected lowering: a single rev64 on .8h.
define dso_local <8 x bfloat> @test_vrev64q_bf16(<8 x bfloat> %a) {
; CHECK-LABEL: test_vrev64q_bf16:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    rev64 v0.8h, v0.8h
; CHECK-NEXT:    ret
entry:
  %shuffle.i = shufflevector <8 x bfloat> %a, <8 x bfloat> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
  ret <8 x bfloat> %shuffle.i
}

; Swaps each adjacent pair of elements in a <4 x bfloat> vector
; (mask <1,0,3,2>). Expected lowering: a single rev32 on .4h.
define dso_local <4 x bfloat> @test_vrev32_bf16(<4 x bfloat> %a) {
; CHECK-LABEL: test_vrev32_bf16:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    rev32 v0.4h, v0.4h
; CHECK-NEXT:    ret
entry:
  %shuffle.i = shufflevector <4 x bfloat> %a, <4 x bfloat> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
  ret <4 x bfloat> %shuffle.i
}

; Swaps each adjacent pair of elements in an <8 x bfloat> vector
; (mask <1,0,3,2,5,4,7,6>). Expected lowering: a single rev32 on .8h.
define dso_local <8 x bfloat> @test_vrev32q_bf16(<8 x bfloat> %a) {
; CHECK-LABEL: test_vrev32q_bf16:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    rev32 v0.8h, v0.8h
; CHECK-NEXT:    ret
entry:
  %shuffle.i = shufflevector <8 x bfloat> %a, <8 x bfloat> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
  ret <8 x bfloat> %shuffle.i
}

; Loads one bfloat from %b and inserts it into every lane of a <4 x bfloat>
; vector with four insertelement instructions. Expected lowering: the load
; and splat fold into a single ld1r on .4h.
define <4 x bfloat> @test_vld_dup1_4xbfloat(ptr %b) {
; CHECK-LABEL: test_vld_dup1_4xbfloat:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    ld1r { v0.4h }, [x0]
; CHECK-NEXT:    ret
entry:
  %b1 = load bfloat, ptr %b, align 2
  %vecinit = insertelement <4 x bfloat> undef, bfloat %b1, i32 0
  %vecinit2 = insertelement <4 x bfloat> %vecinit, bfloat %b1, i32 1
  %vecinit3 = insertelement <4 x bfloat> %vecinit2, bfloat %b1, i32 2
  %vecinit4 = insertelement <4 x bfloat> %vecinit3, bfloat %b1, i32 3
  ret <4 x bfloat> %vecinit4
}

; Loads one bfloat from %b, inserts it into lane 0 and splats it across an
; <8 x bfloat> vector with a zeroinitializer shuffle mask. Expected lowering:
; the load and splat fold into a single ld1r on .8h.
define <8 x bfloat> @test_vld_dup1_8xbfloat(ptr %b) local_unnamed_addr {
; CHECK-LABEL: test_vld_dup1_8xbfloat:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    ld1r { v0.8h }, [x0]
; CHECK-NEXT:    ret
entry:
  %b1 = load bfloat, ptr %b, align 2
  %vecinit = insertelement <8 x bfloat> undef, bfloat %b1, i32 0
  %vecinit8 = shufflevector <8 x bfloat> %vecinit, <8 x bfloat> undef, <8 x i32> zeroinitializer
  ret <8 x bfloat> %vecinit8
}

; Concatenates a <4 x bfloat> vector with itself: both shuffle operands are
; %a and the mask is the identity <0,...,7>. Expected lowering: copy the low
; 64-bit lane of v0 into its high lane with mov v0.d[1], v0.d[0].
define <8 x bfloat> @test_shufflevector8xbfloat(<4 x bfloat> %a) {
; CHECK-LABEL: test_shufflevector8xbfloat:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT:    mov v0.d[1], v0.d[0]
; CHECK-NEXT:    ret
entry:
  %r = shufflevector <4 x bfloat> %a, <4 x bfloat> %a, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x bfloat> %r
}
