; xref: /llvm-project/llvm/test/CodeGen/Thumb2/mve-vcvt16.ll (revision 2c49586e1b9ab917877a6af8e1669854899687d3)
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve,+fullfp16 -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-MVE
; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-MVEFP

; fpext <4 x half> -> <4 x float>. Both run lines share one expectation: four
; scalar vcvtt/vcvtb.f32.f16 conversions, written from lane s3 down to s0.
define arm_aapcs_vfpcc <4 x float> @fpext_4(<4 x half> %src1) {
; CHECK-LABEL: fpext_4:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vcvtt.f32.f16 s3, s1
; CHECK-NEXT:    vcvtb.f32.f16 s2, s1
; CHECK-NEXT:    vcvtt.f32.f16 s1, s0
; CHECK-NEXT:    vcvtb.f32.f16 s0, s0
; CHECK-NEXT:    bx lr
entry:
  %out = fpext <4 x half> %src1 to <4 x float>
  ret <4 x float> %out
}

; fpext <8 x half> -> <8 x float>. Both run lines share one expectation: eight
; scalar conversions, with the first four results written to s8-s11 and then
; copied by a trailing "vmov q0, q2".
define arm_aapcs_vfpcc <8 x float> @fpext_8(<8 x half> %src1) {
; CHECK-LABEL: fpext_8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vcvtt.f32.f16 s11, s1
; CHECK-NEXT:    vcvtb.f32.f16 s10, s1
; CHECK-NEXT:    vcvtt.f32.f16 s9, s0
; CHECK-NEXT:    vcvtb.f32.f16 s8, s0
; CHECK-NEXT:    vcvtt.f32.f16 s7, s3
; CHECK-NEXT:    vcvtb.f32.f16 s6, s3
; CHECK-NEXT:    vcvtt.f32.f16 s5, s2
; CHECK-NEXT:    vcvtb.f32.f16 s4, s2
; CHECK-NEXT:    vmov q0, q2
; CHECK-NEXT:    bx lr
entry:
  %out = fpext <8 x half> %src1 to <8 x float>
  ret <8 x float> %out
}


; fptrunc <4 x float> -> <4 x half>. Both run lines share one expectation: four
; scalar vcvtb/vcvtt.f16.f32 conversions whose results land in s0 and s1.
define arm_aapcs_vfpcc <4 x half> @fptrunc_4(<4 x float> %src1) {
; CHECK-LABEL: fptrunc_4:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vcvtb.f16.f32 s0, s0
; CHECK-NEXT:    vcvtt.f16.f32 s0, s1
; CHECK-NEXT:    vcvtb.f16.f32 s1, s2
; CHECK-NEXT:    vcvtt.f16.f32 s1, s3
; CHECK-NEXT:    bx lr
entry:
  %out = fptrunc <4 x float> %src1 to <4 x half>
  ret <4 x half> %out
}

; fptrunc <8 x float> -> <8 x half>. Both run lines share one expectation: eight
; scalar vcvtb/vcvtt.f16.f32 conversions reading s0-s7 and writing s0-s3.
define arm_aapcs_vfpcc <8 x half> @fptrunc_8(<8 x float> %src1) {
; CHECK-LABEL: fptrunc_8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vcvtb.f16.f32 s0, s0
; CHECK-NEXT:    vcvtt.f16.f32 s0, s1
; CHECK-NEXT:    vcvtb.f16.f32 s1, s2
; CHECK-NEXT:    vcvtb.f16.f32 s2, s4
; CHECK-NEXT:    vcvtt.f16.f32 s1, s3
; CHECK-NEXT:    vcvtb.f16.f32 s3, s6
; CHECK-NEXT:    vcvtt.f16.f32 s2, s5
; CHECK-NEXT:    vcvtt.f16.f32 s3, s7
; CHECK-NEXT:    bx lr
entry:
  %out = fptrunc <8 x float> %src1 to <8 x half>
  ret <8 x half> %out
}


; fptrunc of an interleaving shuffle: %src1 lanes go to the even result lanes,
; %src2 lanes to the odd ones. The +mve,+fullfp16 run expects eight scalar
; conversions; the +mve.fp run expects one vector vcvtb (from q0) followed by
; one vector vcvtt (from q1).
define arm_aapcs_vfpcc <8 x half> @shuffle_trunc1(<4 x float> %src1, <4 x float> %src2) {
; CHECK-MVE-LABEL: shuffle_trunc1:
; CHECK-MVE:       @ %bb.0: @ %entry
; CHECK-MVE-NEXT:    vcvtb.f16.f32 s0, s0
; CHECK-MVE-NEXT:    vcvtb.f16.f32 s1, s1
; CHECK-MVE-NEXT:    vcvtb.f16.f32 s2, s2
; CHECK-MVE-NEXT:    vcvtb.f16.f32 s3, s3
; CHECK-MVE-NEXT:    vcvtt.f16.f32 s0, s4
; CHECK-MVE-NEXT:    vcvtt.f16.f32 s1, s5
; CHECK-MVE-NEXT:    vcvtt.f16.f32 s2, s6
; CHECK-MVE-NEXT:    vcvtt.f16.f32 s3, s7
; CHECK-MVE-NEXT:    bx lr
;
; CHECK-MVEFP-LABEL: shuffle_trunc1:
; CHECK-MVEFP:       @ %bb.0: @ %entry
; CHECK-MVEFP-NEXT:    vcvtb.f16.f32 q0, q0
; CHECK-MVEFP-NEXT:    vcvtt.f16.f32 q0, q1
; CHECK-MVEFP-NEXT:    bx lr
entry:
  %strided.vec = shufflevector <4 x float> %src1, <4 x float> %src2, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
  %out = fptrunc <8 x float> %strided.vec to <8 x half>
  ret <8 x half> %out
}

; Same as shuffle_trunc1 but with the interleave order swapped: %src2 lanes go
; to the even result lanes and %src1 lanes to the odd ones. Both runs expect
; one extra q-register vmov compared with shuffle_trunc1.
define arm_aapcs_vfpcc <8 x half> @shuffle_trunc2(<4 x float> %src1, <4 x float> %src2) {
; CHECK-MVE-LABEL: shuffle_trunc2:
; CHECK-MVE:       @ %bb.0: @ %entry
; CHECK-MVE-NEXT:    vmov q2, q0
; CHECK-MVE-NEXT:    vcvtb.f16.f32 s0, s4
; CHECK-MVE-NEXT:    vcvtb.f16.f32 s1, s5
; CHECK-MVE-NEXT:    vcvtb.f16.f32 s2, s6
; CHECK-MVE-NEXT:    vcvtb.f16.f32 s3, s7
; CHECK-MVE-NEXT:    vcvtt.f16.f32 s0, s8
; CHECK-MVE-NEXT:    vcvtt.f16.f32 s1, s9
; CHECK-MVE-NEXT:    vcvtt.f16.f32 s2, s10
; CHECK-MVE-NEXT:    vcvtt.f16.f32 s3, s11
; CHECK-MVE-NEXT:    bx lr
;
; CHECK-MVEFP-LABEL: shuffle_trunc2:
; CHECK-MVEFP:       @ %bb.0: @ %entry
; CHECK-MVEFP-NEXT:    vcvtb.f16.f32 q1, q1
; CHECK-MVEFP-NEXT:    vcvtt.f16.f32 q1, q0
; CHECK-MVEFP-NEXT:    vmov q0, q1
; CHECK-MVEFP-NEXT:    bx lr
entry:
  %strided.vec = shufflevector <4 x float> %src1, <4 x float> %src2, <8 x i32> <i32 4, i32 0, i32 5, i32 1, i32 6, i32 2, i32 7, i32 3>
  %out = fptrunc <8 x float> %strided.vec to <8 x half>
  ret <8 x half> %out
}

; Double-width version of shuffle_trunc1: two <8 x float> inputs are interleaved
; (%src1 in even lanes, %src2 in odd lanes) and truncated to <16 x half>. The
; +mve,+fullfp16 run expects sixteen scalar conversions; the +mve.fp run expects
; two vector vcvtb followed by two vector vcvtt.
define arm_aapcs_vfpcc <16 x half> @shuffle_trunc3(<8 x float> %src1, <8 x float> %src2) {
; CHECK-MVE-LABEL: shuffle_trunc3:
; CHECK-MVE:       @ %bb.0: @ %entry
; CHECK-MVE-NEXT:    vcvtb.f16.f32 s0, s0
; CHECK-MVE-NEXT:    vcvtb.f16.f32 s1, s1
; CHECK-MVE-NEXT:    vcvtb.f16.f32 s2, s2
; CHECK-MVE-NEXT:    vcvtb.f16.f32 s3, s3
; CHECK-MVE-NEXT:    vcvtb.f16.f32 s4, s4
; CHECK-MVE-NEXT:    vcvtb.f16.f32 s5, s5
; CHECK-MVE-NEXT:    vcvtb.f16.f32 s6, s6
; CHECK-MVE-NEXT:    vcvtb.f16.f32 s7, s7
; CHECK-MVE-NEXT:    vcvtt.f16.f32 s0, s8
; CHECK-MVE-NEXT:    vcvtt.f16.f32 s1, s9
; CHECK-MVE-NEXT:    vcvtt.f16.f32 s2, s10
; CHECK-MVE-NEXT:    vcvtt.f16.f32 s3, s11
; CHECK-MVE-NEXT:    vcvtt.f16.f32 s4, s12
; CHECK-MVE-NEXT:    vcvtt.f16.f32 s5, s13
; CHECK-MVE-NEXT:    vcvtt.f16.f32 s6, s14
; CHECK-MVE-NEXT:    vcvtt.f16.f32 s7, s15
; CHECK-MVE-NEXT:    bx lr
;
; CHECK-MVEFP-LABEL: shuffle_trunc3:
; CHECK-MVEFP:       @ %bb.0: @ %entry
; CHECK-MVEFP-NEXT:    vcvtb.f16.f32 q0, q0
; CHECK-MVEFP-NEXT:    vcvtb.f16.f32 q1, q1
; CHECK-MVEFP-NEXT:    vcvtt.f16.f32 q0, q2
; CHECK-MVEFP-NEXT:    vcvtt.f16.f32 q1, q3
; CHECK-MVEFP-NEXT:    bx lr
entry:
  %strided.vec = shufflevector <8 x float> %src1, <8 x float> %src2, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
  %out = fptrunc <16 x float> %strided.vec to <16 x half>
  ret <16 x half> %out
}

; Double-width version of shuffle_trunc2: %src2 lanes go to the even result
; lanes and %src1 lanes to the odd ones. The +mve,+fullfp16 run expects d8/d9
; to be pushed and popped around the scalar conversions (q4 is used as a
; temporary); the +mve.fp run expects vector conversions plus two vmovs.
define arm_aapcs_vfpcc <16 x half> @shuffle_trunc4(<8 x float> %src1, <8 x float> %src2) {
; CHECK-MVE-LABEL: shuffle_trunc4:
; CHECK-MVE:       @ %bb.0: @ %entry
; CHECK-MVE-NEXT:    .vsave {d8, d9}
; CHECK-MVE-NEXT:    vpush {d8, d9}
; CHECK-MVE-NEXT:    vmov q4, q0
; CHECK-MVE-NEXT:    vcvtb.f16.f32 s0, s8
; CHECK-MVE-NEXT:    vcvtb.f16.f32 s1, s9
; CHECK-MVE-NEXT:    vcvtb.f16.f32 s2, s10
; CHECK-MVE-NEXT:    vcvtb.f16.f32 s3, s11
; CHECK-MVE-NEXT:    vcvtb.f16.f32 s8, s12
; CHECK-MVE-NEXT:    vcvtb.f16.f32 s9, s13
; CHECK-MVE-NEXT:    vcvtb.f16.f32 s10, s14
; CHECK-MVE-NEXT:    vcvtb.f16.f32 s11, s15
; CHECK-MVE-NEXT:    vcvtt.f16.f32 s8, s4
; CHECK-MVE-NEXT:    vcvtt.f16.f32 s9, s5
; CHECK-MVE-NEXT:    vcvtt.f16.f32 s10, s6
; CHECK-MVE-NEXT:    vcvtt.f16.f32 s11, s7
; CHECK-MVE-NEXT:    vcvtt.f16.f32 s0, s16
; CHECK-MVE-NEXT:    vcvtt.f16.f32 s1, s17
; CHECK-MVE-NEXT:    vcvtt.f16.f32 s2, s18
; CHECK-MVE-NEXT:    vcvtt.f16.f32 s3, s19
; CHECK-MVE-NEXT:    vmov q1, q2
; CHECK-MVE-NEXT:    vpop {d8, d9}
; CHECK-MVE-NEXT:    bx lr
;
; CHECK-MVEFP-LABEL: shuffle_trunc4:
; CHECK-MVEFP:       @ %bb.0: @ %entry
; CHECK-MVEFP-NEXT:    vcvtb.f16.f32 q2, q2
; CHECK-MVEFP-NEXT:    vcvtb.f16.f32 q3, q3
; CHECK-MVEFP-NEXT:    vcvtt.f16.f32 q2, q0
; CHECK-MVEFP-NEXT:    vcvtt.f16.f32 q3, q1
; CHECK-MVEFP-NEXT:    vmov q0, q2
; CHECK-MVEFP-NEXT:    vmov q1, q3
; CHECK-MVEFP-NEXT:    bx lr
entry:
  %strided.vec = shufflevector <8 x float> %src1, <8 x float> %src2, <16 x i32> <i32 8, i32 0, i32 9, i32 1, i32 10, i32 2, i32 11, i32 3, i32 12, i32 4, i32 13, i32 5, i32 14, i32 6, i32 15, i32 7>
  %out = fptrunc <16 x float> %strided.vec to <16 x half>
  ret <16 x half> %out
}

; Truncate-then-interleave: each <4 x float> input is truncated on its own and
; the two <4 x half> results are interleaved (%src1 in even lanes, %src2 in odd
; lanes). The expected code for both runs is the same as for shuffle_trunc1,
; which shuffles before truncating.
define arm_aapcs_vfpcc <8 x half> @shuffle_trunc5(<4 x float> %src1, <4 x float> %src2) {
; CHECK-MVE-LABEL: shuffle_trunc5:
; CHECK-MVE:       @ %bb.0: @ %entry
; CHECK-MVE-NEXT:    vcvtb.f16.f32 s0, s0
; CHECK-MVE-NEXT:    vcvtb.f16.f32 s1, s1
; CHECK-MVE-NEXT:    vcvtb.f16.f32 s2, s2
; CHECK-MVE-NEXT:    vcvtb.f16.f32 s3, s3
; CHECK-MVE-NEXT:    vcvtt.f16.f32 s0, s4
; CHECK-MVE-NEXT:    vcvtt.f16.f32 s1, s5
; CHECK-MVE-NEXT:    vcvtt.f16.f32 s2, s6
; CHECK-MVE-NEXT:    vcvtt.f16.f32 s3, s7
; CHECK-MVE-NEXT:    bx lr
;
; CHECK-MVEFP-LABEL: shuffle_trunc5:
; CHECK-MVEFP:       @ %bb.0: @ %entry
; CHECK-MVEFP-NEXT:    vcvtb.f16.f32 q0, q0
; CHECK-MVEFP-NEXT:    vcvtt.f16.f32 q0, q1
; CHECK-MVEFP-NEXT:    bx lr
entry:
  %out1 = fptrunc <4 x float> %src1 to <4 x half>
  %out2 = fptrunc <4 x float> %src2 to <4 x half>
  %s = shufflevector <4 x half> %out1, <4 x half> %out2, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
  ret <8 x half> %s
}

; Truncate-then-interleave with the order swapped: %src2 results go to the even
; lanes and %src1 results to the odd ones. The expected code for both runs is
; the same as for shuffle_trunc2, which shuffles before truncating.
define arm_aapcs_vfpcc <8 x half> @shuffle_trunc6(<4 x float> %src1, <4 x float> %src2) {
; CHECK-MVE-LABEL: shuffle_trunc6:
; CHECK-MVE:       @ %bb.0: @ %entry
; CHECK-MVE-NEXT:    vmov q2, q0
; CHECK-MVE-NEXT:    vcvtb.f16.f32 s0, s4
; CHECK-MVE-NEXT:    vcvtb.f16.f32 s1, s5
; CHECK-MVE-NEXT:    vcvtb.f16.f32 s2, s6
; CHECK-MVE-NEXT:    vcvtb.f16.f32 s3, s7
; CHECK-MVE-NEXT:    vcvtt.f16.f32 s0, s8
; CHECK-MVE-NEXT:    vcvtt.f16.f32 s1, s9
; CHECK-MVE-NEXT:    vcvtt.f16.f32 s2, s10
; CHECK-MVE-NEXT:    vcvtt.f16.f32 s3, s11
; CHECK-MVE-NEXT:    bx lr
;
; CHECK-MVEFP-LABEL: shuffle_trunc6:
; CHECK-MVEFP:       @ %bb.0: @ %entry
; CHECK-MVEFP-NEXT:    vcvtb.f16.f32 q1, q1
; CHECK-MVEFP-NEXT:    vcvtt.f16.f32 q1, q0
; CHECK-MVEFP-NEXT:    vmov q0, q1
; CHECK-MVEFP-NEXT:    bx lr
entry:
  %out1 = fptrunc <4 x float> %src1 to <4 x half>
  %out2 = fptrunc <4 x float> %src2 to <4 x half>
  %s = shufflevector <4 x half> %out1, <4 x half> %out2, <8 x i32> <i32 4, i32 0, i32 5, i32 1, i32 6, i32 2, i32 7, i32 3>
  ret <8 x half> %s
}

; Double-width truncate-then-interleave: two <8 x float> inputs are truncated
; separately, then interleaved (%src1 in even lanes, %src2 in odd lanes). The
; expected code for both runs is the same as for shuffle_trunc3.
define arm_aapcs_vfpcc <16 x half> @shuffle_trunc7(<8 x float> %src1, <8 x float> %src2) {
; CHECK-MVE-LABEL: shuffle_trunc7:
; CHECK-MVE:       @ %bb.0: @ %entry
; CHECK-MVE-NEXT:    vcvtb.f16.f32 s0, s0
; CHECK-MVE-NEXT:    vcvtb.f16.f32 s1, s1
; CHECK-MVE-NEXT:    vcvtb.f16.f32 s2, s2
; CHECK-MVE-NEXT:    vcvtb.f16.f32 s3, s3
; CHECK-MVE-NEXT:    vcvtb.f16.f32 s4, s4
; CHECK-MVE-NEXT:    vcvtb.f16.f32 s5, s5
; CHECK-MVE-NEXT:    vcvtb.f16.f32 s6, s6
; CHECK-MVE-NEXT:    vcvtb.f16.f32 s7, s7
; CHECK-MVE-NEXT:    vcvtt.f16.f32 s0, s8
; CHECK-MVE-NEXT:    vcvtt.f16.f32 s1, s9
; CHECK-MVE-NEXT:    vcvtt.f16.f32 s2, s10
; CHECK-MVE-NEXT:    vcvtt.f16.f32 s3, s11
; CHECK-MVE-NEXT:    vcvtt.f16.f32 s4, s12
; CHECK-MVE-NEXT:    vcvtt.f16.f32 s5, s13
; CHECK-MVE-NEXT:    vcvtt.f16.f32 s6, s14
; CHECK-MVE-NEXT:    vcvtt.f16.f32 s7, s15
; CHECK-MVE-NEXT:    bx lr
;
; CHECK-MVEFP-LABEL: shuffle_trunc7:
; CHECK-MVEFP:       @ %bb.0: @ %entry
; CHECK-MVEFP-NEXT:    vcvtb.f16.f32 q0, q0
; CHECK-MVEFP-NEXT:    vcvtb.f16.f32 q1, q1
; CHECK-MVEFP-NEXT:    vcvtt.f16.f32 q0, q2
; CHECK-MVEFP-NEXT:    vcvtt.f16.f32 q1, q3
; CHECK-MVEFP-NEXT:    bx lr
entry:
  %out1 = fptrunc <8 x float> %src1 to <8 x half>
  %out2 = fptrunc <8 x float> %src2 to <8 x half>
  %s = shufflevector <8 x half> %out1, <8 x half> %out2, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
  ret <16 x half> %s
}

; Double-width truncate-then-interleave with the order swapped: %src2 results
; go to the even lanes and %src1 results to the odd ones. The expected code for
; both runs is the same as for shuffle_trunc4, including the d8/d9 push and pop
; in the +mve,+fullfp16 run.
define arm_aapcs_vfpcc <16 x half> @shuffle_trunc8(<8 x float> %src1, <8 x float> %src2) {
; CHECK-MVE-LABEL: shuffle_trunc8:
; CHECK-MVE:       @ %bb.0: @ %entry
; CHECK-MVE-NEXT:    .vsave {d8, d9}
; CHECK-MVE-NEXT:    vpush {d8, d9}
; CHECK-MVE-NEXT:    vmov q4, q0
; CHECK-MVE-NEXT:    vcvtb.f16.f32 s0, s8
; CHECK-MVE-NEXT:    vcvtb.f16.f32 s1, s9
; CHECK-MVE-NEXT:    vcvtb.f16.f32 s2, s10
; CHECK-MVE-NEXT:    vcvtb.f16.f32 s3, s11
; CHECK-MVE-NEXT:    vcvtb.f16.f32 s8, s12
; CHECK-MVE-NEXT:    vcvtb.f16.f32 s9, s13
; CHECK-MVE-NEXT:    vcvtb.f16.f32 s10, s14
; CHECK-MVE-NEXT:    vcvtb.f16.f32 s11, s15
; CHECK-MVE-NEXT:    vcvtt.f16.f32 s8, s4
; CHECK-MVE-NEXT:    vcvtt.f16.f32 s9, s5
; CHECK-MVE-NEXT:    vcvtt.f16.f32 s10, s6
; CHECK-MVE-NEXT:    vcvtt.f16.f32 s11, s7
; CHECK-MVE-NEXT:    vcvtt.f16.f32 s0, s16
; CHECK-MVE-NEXT:    vcvtt.f16.f32 s1, s17
; CHECK-MVE-NEXT:    vcvtt.f16.f32 s2, s18
; CHECK-MVE-NEXT:    vcvtt.f16.f32 s3, s19
; CHECK-MVE-NEXT:    vmov q1, q2
; CHECK-MVE-NEXT:    vpop {d8, d9}
; CHECK-MVE-NEXT:    bx lr
;
; CHECK-MVEFP-LABEL: shuffle_trunc8:
; CHECK-MVEFP:       @ %bb.0: @ %entry
; CHECK-MVEFP-NEXT:    vcvtb.f16.f32 q2, q2
; CHECK-MVEFP-NEXT:    vcvtb.f16.f32 q3, q3
; CHECK-MVEFP-NEXT:    vcvtt.f16.f32 q2, q0
; CHECK-MVEFP-NEXT:    vcvtt.f16.f32 q3, q1
; CHECK-MVEFP-NEXT:    vmov q0, q2
; CHECK-MVEFP-NEXT:    vmov q1, q3
; CHECK-MVEFP-NEXT:    bx lr
entry:
  %out1 = fptrunc <8 x float> %src1 to <8 x half>
  %out2 = fptrunc <8 x float> %src2 to <8 x half>
  %s = shufflevector <8 x half> %out1, <8 x half> %out2, <16 x i32> <i32 8, i32 0, i32 9, i32 1, i32 10, i32 2, i32 11, i32 3, i32 12, i32 4, i32 13, i32 5, i32 14, i32 6, i32 15, i32 7>
  ret <16 x half> %s
}




; Load <4 x half> (align 4) and fpext to <4 x float>. The +mve,+fullfp16 run
; expects an integer ldrd, two vmov.32 lane inserts and four scalar
; conversions; the +mve.fp run expects a widening vldrh.u32 followed by one
; vector vcvtb.
define arm_aapcs_vfpcc <4 x float> @load_ext_4(ptr %src) {
; CHECK-MVE-LABEL: load_ext_4:
; CHECK-MVE:       @ %bb.0: @ %entry
; CHECK-MVE-NEXT:    ldrd r0, r1, [r0]
; CHECK-MVE-NEXT:    vmov.32 q0[0], r0
; CHECK-MVE-NEXT:    vmov.32 q0[1], r1
; CHECK-MVE-NEXT:    vcvtt.f32.f16 s3, s1
; CHECK-MVE-NEXT:    vcvtb.f32.f16 s2, s1
; CHECK-MVE-NEXT:    vcvtt.f32.f16 s1, s0
; CHECK-MVE-NEXT:    vcvtb.f32.f16 s0, s0
; CHECK-MVE-NEXT:    bx lr
;
; CHECK-MVEFP-LABEL: load_ext_4:
; CHECK-MVEFP:       @ %bb.0: @ %entry
; CHECK-MVEFP-NEXT:    vldrh.u32 q0, [r0]
; CHECK-MVEFP-NEXT:    vcvtb.f32.f16 q0, q0
; CHECK-MVEFP-NEXT:    bx lr
entry:
  %wide.load = load <4 x half>, ptr %src, align 4
  %e = fpext <4 x half> %wide.load to <4 x float>
  ret <4 x float> %e
}

; Load <8 x half> (align 4) and fpext to <8 x float>. The +mve,+fullfp16 run
; expects one vldrw.u32 into q2 followed by eight scalar conversions; the
; +mve.fp run expects two widening vldrh.u32 loads (offsets 0 and 8), each
; followed by a vector vcvtb.
define arm_aapcs_vfpcc <8 x float> @load_ext_8(ptr %src) {
; CHECK-MVE-LABEL: load_ext_8:
; CHECK-MVE:       @ %bb.0: @ %entry
; CHECK-MVE-NEXT:    vldrw.u32 q2, [r0]
; CHECK-MVE-NEXT:    vcvtt.f32.f16 s3, s9
; CHECK-MVE-NEXT:    vcvtb.f32.f16 s2, s9
; CHECK-MVE-NEXT:    vcvtt.f32.f16 s1, s8
; CHECK-MVE-NEXT:    vcvtb.f32.f16 s0, s8
; CHECK-MVE-NEXT:    vcvtt.f32.f16 s7, s11
; CHECK-MVE-NEXT:    vcvtb.f32.f16 s6, s11
; CHECK-MVE-NEXT:    vcvtt.f32.f16 s5, s10
; CHECK-MVE-NEXT:    vcvtb.f32.f16 s4, s10
; CHECK-MVE-NEXT:    bx lr
;
; CHECK-MVEFP-LABEL: load_ext_8:
; CHECK-MVEFP:       @ %bb.0: @ %entry
; CHECK-MVEFP-NEXT:    vldrh.u32 q0, [r0]
; CHECK-MVEFP-NEXT:    vldrh.u32 q1, [r0, #8]
; CHECK-MVEFP-NEXT:    vcvtb.f32.f16 q0, q0
; CHECK-MVEFP-NEXT:    vcvtb.f32.f16 q1, q1
; CHECK-MVEFP-NEXT:    bx lr
entry:
  %wide.load = load <8 x half>, ptr %src, align 4
  %e = fpext <8 x half> %wide.load to <8 x float>
  ret <8 x float> %e
}

; Load <16 x half> (align 4) and fpext to <16 x float>. The +mve,+fullfp16 run
; expects two vldrw.u32 loads (the first post-incrementing r0 by 16), sixteen
; scalar conversions, and d8/d9 pushed and popped because q4 is used. The
; +mve.fp run expects four widening vldrh.u32 loads at offsets 0, 8, 16 and 24,
; each followed by a vector vcvtb.
define arm_aapcs_vfpcc <16 x float> @load_ext_16(ptr %src) {
; CHECK-MVE-LABEL: load_ext_16:
; CHECK-MVE:       @ %bb.0: @ %entry
; CHECK-MVE-NEXT:    .vsave {d8, d9}
; CHECK-MVE-NEXT:    vpush {d8, d9}
; CHECK-MVE-NEXT:    vldrw.u32 q2, [r0], #16
; CHECK-MVE-NEXT:    vldrw.u32 q4, [r0]
; CHECK-MVE-NEXT:    vcvtt.f32.f16 s3, s9
; CHECK-MVE-NEXT:    vcvtb.f32.f16 s2, s9
; CHECK-MVE-NEXT:    vcvtt.f32.f16 s1, s8
; CHECK-MVE-NEXT:    vcvtb.f32.f16 s0, s8
; CHECK-MVE-NEXT:    vcvtt.f32.f16 s7, s11
; CHECK-MVE-NEXT:    vcvtb.f32.f16 s6, s11
; CHECK-MVE-NEXT:    vcvtt.f32.f16 s5, s10
; CHECK-MVE-NEXT:    vcvtb.f32.f16 s4, s10
; CHECK-MVE-NEXT:    vcvtt.f32.f16 s11, s17
; CHECK-MVE-NEXT:    vcvtb.f32.f16 s10, s17
; CHECK-MVE-NEXT:    vcvtt.f32.f16 s9, s16
; CHECK-MVE-NEXT:    vcvtb.f32.f16 s8, s16
; CHECK-MVE-NEXT:    vcvtt.f32.f16 s15, s19
; CHECK-MVE-NEXT:    vcvtb.f32.f16 s14, s19
; CHECK-MVE-NEXT:    vcvtt.f32.f16 s13, s18
; CHECK-MVE-NEXT:    vcvtb.f32.f16 s12, s18
; CHECK-MVE-NEXT:    vpop {d8, d9}
; CHECK-MVE-NEXT:    bx lr
;
; CHECK-MVEFP-LABEL: load_ext_16:
; CHECK-MVEFP:       @ %bb.0: @ %entry
; CHECK-MVEFP-NEXT:    vldrh.u32 q0, [r0]
; CHECK-MVEFP-NEXT:    vldrh.u32 q1, [r0, #8]
; CHECK-MVEFP-NEXT:    vldrh.u32 q2, [r0, #16]
; CHECK-MVEFP-NEXT:    vldrh.u32 q3, [r0, #24]
; CHECK-MVEFP-NEXT:    vcvtb.f32.f16 q0, q0
; CHECK-MVEFP-NEXT:    vcvtb.f32.f16 q1, q1
; CHECK-MVEFP-NEXT:    vcvtb.f32.f16 q2, q2
; CHECK-MVEFP-NEXT:    vcvtb.f32.f16 q3, q3
; CHECK-MVEFP-NEXT:    bx lr
entry:
  %wide.load = load <16 x half>, ptr %src, align 4
  %e = fpext <16 x half> %wide.load to <16 x float>
  ret <16 x float> %e
}

; Load <8 x half>, keep the even lanes (0, 2, 4, 6) and fpext them to
; <4 x float>. Both runs expect a single vldrw.u32; the +mve,+fullfp16 run then
; expects four scalar vcvtb conversions and the +mve.fp run one vector vcvtb.
; The shuffle mask never selects from the second operand, so it is poison.
define arm_aapcs_vfpcc <4 x float> @load_shuffleext_8(ptr %src) {
; CHECK-MVE-LABEL: load_shuffleext_8:
; CHECK-MVE:       @ %bb.0: @ %entry
; CHECK-MVE-NEXT:    vldrw.u32 q0, [r0]
; CHECK-MVE-NEXT:    vcvtb.f32.f16 s3, s3
; CHECK-MVE-NEXT:    vcvtb.f32.f16 s2, s2
; CHECK-MVE-NEXT:    vcvtb.f32.f16 s1, s1
; CHECK-MVE-NEXT:    vcvtb.f32.f16 s0, s0
; CHECK-MVE-NEXT:    bx lr
;
; CHECK-MVEFP-LABEL: load_shuffleext_8:
; CHECK-MVEFP:       @ %bb.0: @ %entry
; CHECK-MVEFP-NEXT:    vldrw.u32 q0, [r0]
; CHECK-MVEFP-NEXT:    vcvtb.f32.f16 q0, q0
; CHECK-MVEFP-NEXT:    bx lr
entry:
  %wide.load = load <8 x half>, ptr %src, align 4
  %sh = shufflevector <8 x half> %wide.load, <8 x half> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %e = fpext <4 x half> %sh to <4 x float>
  ret <4 x float> %e
}

; Load <16 x half>, keep the even lanes (0, 2, ..., 14) and fpext them to
; <8 x float>. Both run lines share one expectation: a vld20.16/vld21.16 pair
; into {q2, q3} followed by eight scalar conversions that read s8-s11.
; The shuffle mask never selects from the second operand, so it is poison.
define arm_aapcs_vfpcc <8 x float> @load_shuffleext_16(ptr %src) {
; CHECK-LABEL: load_shuffleext_16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vld20.16 {q2, q3}, [r0]
; CHECK-NEXT:    vld21.16 {q2, q3}, [r0]
; CHECK-NEXT:    vcvtt.f32.f16 s3, s9
; CHECK-NEXT:    vcvtb.f32.f16 s2, s9
; CHECK-NEXT:    vcvtt.f32.f16 s1, s8
; CHECK-NEXT:    vcvtb.f32.f16 s0, s8
; CHECK-NEXT:    vcvtt.f32.f16 s7, s11
; CHECK-NEXT:    vcvtb.f32.f16 s6, s11
; CHECK-NEXT:    vcvtt.f32.f16 s5, s10
; CHECK-NEXT:    vcvtb.f32.f16 s4, s10
; CHECK-NEXT:    bx lr
entry:
  %wide.load = load <16 x half>, ptr %src, align 4
  %sh = shufflevector <16 x half> %wide.load, <16 x half> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
  %e = fpext <8 x half> %sh to <8 x float>
  ret <8 x float> %e
}




; fptrunc <4 x float> -> <4 x half> and store through %src (the destination
; pointer, despite its name). The +mve,+fullfp16 run expects four scalar
; conversions, a vmov of d0 to r1/r2 and an integer strd; the +mve.fp run
; expects one vector vcvtb followed by a narrowing vstrh.32.
define arm_aapcs_vfpcc void @store_trunc_4(ptr %src, <4 x float> %val) {
; CHECK-MVE-LABEL: store_trunc_4:
; CHECK-MVE:       @ %bb.0: @ %entry
; CHECK-MVE-NEXT:    vcvtb.f16.f32 s0, s0
; CHECK-MVE-NEXT:    vcvtt.f16.f32 s0, s1
; CHECK-MVE-NEXT:    vcvtb.f16.f32 s1, s2
; CHECK-MVE-NEXT:    vcvtt.f16.f32 s1, s3
; CHECK-MVE-NEXT:    vmov r1, r2, d0
; CHECK-MVE-NEXT:    strd r1, r2, [r0]
; CHECK-MVE-NEXT:    bx lr
;
; CHECK-MVEFP-LABEL: store_trunc_4:
; CHECK-MVEFP:       @ %bb.0: @ %entry
; CHECK-MVEFP-NEXT:    vcvtb.f16.f32 q0, q0
; CHECK-MVEFP-NEXT:    vstrh.32 q0, [r0]
; CHECK-MVEFP-NEXT:    bx lr
entry:
  %e = fptrunc <4 x float> %val to <4 x half>
  store <4 x half> %e, ptr %src, align 4
  ret void
}

; fptrunc <8 x float> -> <8 x half> and store through %src (the destination
; pointer). The +mve,+fullfp16 run expects eight scalar conversions and one
; vstrw.32 of q0; the +mve.fp run expects two vector vcvtb conversions and two
; narrowing vstrh.32 stores at offsets 8 and 0.
define arm_aapcs_vfpcc void @store_trunc_8(ptr %src, <8 x float> %val) {
; CHECK-MVE-LABEL: store_trunc_8:
; CHECK-MVE:       @ %bb.0: @ %entry
; CHECK-MVE-NEXT:    vcvtb.f16.f32 s0, s0
; CHECK-MVE-NEXT:    vcvtt.f16.f32 s0, s1
; CHECK-MVE-NEXT:    vcvtb.f16.f32 s1, s2
; CHECK-MVE-NEXT:    vcvtb.f16.f32 s2, s4
; CHECK-MVE-NEXT:    vcvtt.f16.f32 s1, s3
; CHECK-MVE-NEXT:    vcvtb.f16.f32 s3, s6
; CHECK-MVE-NEXT:    vcvtt.f16.f32 s2, s5
; CHECK-MVE-NEXT:    vcvtt.f16.f32 s3, s7
; CHECK-MVE-NEXT:    vstrw.32 q0, [r0]
; CHECK-MVE-NEXT:    bx lr
;
; CHECK-MVEFP-LABEL: store_trunc_8:
; CHECK-MVEFP:       @ %bb.0: @ %entry
; CHECK-MVEFP-NEXT:    vcvtb.f16.f32 q1, q1
; CHECK-MVEFP-NEXT:    vcvtb.f16.f32 q0, q0
; CHECK-MVEFP-NEXT:    vstrh.32 q1, [r0, #8]
; CHECK-MVEFP-NEXT:    vstrh.32 q0, [r0]
; CHECK-MVEFP-NEXT:    bx lr
entry:
  %e = fptrunc <8 x float> %val to <8 x half>
  store <8 x half> %e, ptr %src, align 4
  ret void
}

; fptrunc <16 x float> -> <16 x half> and store through %src (the destination
; pointer). The +mve,+fullfp16 run expects two groups of eight scalar
; conversions, each followed by a store of q0 (the first is a vstrb.8 that
; post-increments r0 by 16). The +mve.fp run expects four vector vcvtb
; conversions and four narrowing vstrh.32 stores at offsets 24, 16, 8 and 0.
define arm_aapcs_vfpcc void @store_trunc_16(ptr %src, <16 x float> %val) {
; CHECK-MVE-LABEL: store_trunc_16:
; CHECK-MVE:       @ %bb.0: @ %entry
; CHECK-MVE-NEXT:    vcvtb.f16.f32 s0, s0
; CHECK-MVE-NEXT:    vcvtt.f16.f32 s0, s1
; CHECK-MVE-NEXT:    vcvtb.f16.f32 s1, s2
; CHECK-MVE-NEXT:    vcvtb.f16.f32 s2, s4
; CHECK-MVE-NEXT:    vcvtt.f16.f32 s1, s3
; CHECK-MVE-NEXT:    vcvtb.f16.f32 s3, s6
; CHECK-MVE-NEXT:    vcvtt.f16.f32 s2, s5
; CHECK-MVE-NEXT:    vcvtt.f16.f32 s3, s7
; CHECK-MVE-NEXT:    vstrb.8 q0, [r0], #16
; CHECK-MVE-NEXT:    vcvtb.f16.f32 s0, s8
; CHECK-MVE-NEXT:    vcvtb.f16.f32 s1, s10
; CHECK-MVE-NEXT:    vcvtb.f16.f32 s2, s12
; CHECK-MVE-NEXT:    vcvtb.f16.f32 s3, s14
; CHECK-MVE-NEXT:    vcvtt.f16.f32 s0, s9
; CHECK-MVE-NEXT:    vcvtt.f16.f32 s1, s11
; CHECK-MVE-NEXT:    vcvtt.f16.f32 s2, s13
; CHECK-MVE-NEXT:    vcvtt.f16.f32 s3, s15
; CHECK-MVE-NEXT:    vstrw.32 q0, [r0]
; CHECK-MVE-NEXT:    bx lr
;
; CHECK-MVEFP-LABEL: store_trunc_16:
; CHECK-MVEFP:       @ %bb.0: @ %entry
; CHECK-MVEFP-NEXT:    vcvtb.f16.f32 q3, q3
; CHECK-MVEFP-NEXT:    vcvtb.f16.f32 q2, q2
; CHECK-MVEFP-NEXT:    vcvtb.f16.f32 q1, q1
; CHECK-MVEFP-NEXT:    vcvtb.f16.f32 q0, q0
; CHECK-MVEFP-NEXT:    vstrh.32 q3, [r0, #24]
; CHECK-MVEFP-NEXT:    vstrh.32 q2, [r0, #16]
; CHECK-MVEFP-NEXT:    vstrh.32 q1, [r0, #8]
; CHECK-MVEFP-NEXT:    vstrh.32 q0, [r0]
; CHECK-MVEFP-NEXT:    bx lr
entry:
  %e = fptrunc <16 x float> %val to <16 x half>
  store <16 x half> %e, ptr %src, align 4
  ret void
}

; Interleave %val1 (even lanes) with %val2 (odd lanes), fptrunc to <8 x half>
; and store through %src (the destination pointer). The +mve,+fullfp16 run
; expects eight scalar conversions; the +mve.fp run expects a vector
; vcvtb/vcvtt pair. Both end with one vstrw.32 of q0.
define arm_aapcs_vfpcc void @store_shuffletrunc_8(ptr %src, <4 x float> %val1, <4 x float> %val2) {
; CHECK-MVE-LABEL: store_shuffletrunc_8:
; CHECK-MVE:       @ %bb.0: @ %entry
; CHECK-MVE-NEXT:    vcvtb.f16.f32 s0, s0
; CHECK-MVE-NEXT:    vcvtb.f16.f32 s1, s1
; CHECK-MVE-NEXT:    vcvtb.f16.f32 s2, s2
; CHECK-MVE-NEXT:    vcvtb.f16.f32 s3, s3
; CHECK-MVE-NEXT:    vcvtt.f16.f32 s0, s4
; CHECK-MVE-NEXT:    vcvtt.f16.f32 s1, s5
; CHECK-MVE-NEXT:    vcvtt.f16.f32 s2, s6
; CHECK-MVE-NEXT:    vcvtt.f16.f32 s3, s7
; CHECK-MVE-NEXT:    vstrw.32 q0, [r0]
; CHECK-MVE-NEXT:    bx lr
;
; CHECK-MVEFP-LABEL: store_shuffletrunc_8:
; CHECK-MVEFP:       @ %bb.0: @ %entry
; CHECK-MVEFP-NEXT:    vcvtb.f16.f32 q0, q0
; CHECK-MVEFP-NEXT:    vcvtt.f16.f32 q0, q1
; CHECK-MVEFP-NEXT:    vstrw.32 q0, [r0]
; CHECK-MVEFP-NEXT:    bx lr
entry:
  %strided.vec = shufflevector <4 x float> %val1, <4 x float> %val2, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
  %out = fptrunc <8 x float> %strided.vec to <8 x half>
  store <8 x half> %out, ptr %src, align 4
  ret void
}

; Double-width version of store_shuffletrunc_8: interleave two <8 x float>
; inputs (%val1 in even lanes, %val2 in odd lanes), fptrunc to <16 x half> and
; store through %src (the destination pointer). The +mve,+fullfp16 run expects
; two groups of eight scalar conversions, each followed by a store of q0 (the
; first a vstrb.8 that post-increments r0 by 16). The +mve.fp run expects two
; vector vcvtb/vcvtt pairs and two vstrw.32 stores at offsets 16 and 0.
define arm_aapcs_vfpcc void @store_shuffletrunc_16(ptr %src, <8 x float> %val1, <8 x float> %val2) {
; CHECK-MVE-LABEL: store_shuffletrunc_16:
; CHECK-MVE:       @ %bb.0: @ %entry
; CHECK-MVE-NEXT:    vcvtb.f16.f32 s0, s0
; CHECK-MVE-NEXT:    vcvtb.f16.f32 s1, s1
; CHECK-MVE-NEXT:    vcvtb.f16.f32 s2, s2
; CHECK-MVE-NEXT:    vcvtb.f16.f32 s3, s3
; CHECK-MVE-NEXT:    vcvtt.f16.f32 s0, s8
; CHECK-MVE-NEXT:    vcvtt.f16.f32 s1, s9
; CHECK-MVE-NEXT:    vcvtt.f16.f32 s2, s10
; CHECK-MVE-NEXT:    vcvtt.f16.f32 s3, s11
; CHECK-MVE-NEXT:    vstrb.8 q0, [r0], #16
; CHECK-MVE-NEXT:    vcvtb.f16.f32 s0, s4
; CHECK-MVE-NEXT:    vcvtb.f16.f32 s1, s5
; CHECK-MVE-NEXT:    vcvtb.f16.f32 s2, s6
; CHECK-MVE-NEXT:    vcvtb.f16.f32 s3, s7
; CHECK-MVE-NEXT:    vcvtt.f16.f32 s0, s12
; CHECK-MVE-NEXT:    vcvtt.f16.f32 s1, s13
; CHECK-MVE-NEXT:    vcvtt.f16.f32 s2, s14
; CHECK-MVE-NEXT:    vcvtt.f16.f32 s3, s15
; CHECK-MVE-NEXT:    vstrw.32 q0, [r0]
; CHECK-MVE-NEXT:    bx lr
;
; CHECK-MVEFP-LABEL: store_shuffletrunc_16:
; CHECK-MVEFP:       @ %bb.0: @ %entry
; CHECK-MVEFP-NEXT:    vcvtb.f16.f32 q1, q1
; CHECK-MVEFP-NEXT:    vcvtb.f16.f32 q0, q0
; CHECK-MVEFP-NEXT:    vcvtt.f16.f32 q1, q3
; CHECK-MVEFP-NEXT:    vcvtt.f16.f32 q0, q2
; CHECK-MVEFP-NEXT:    vstrw.32 q1, [r0, #16]
; CHECK-MVEFP-NEXT:    vstrw.32 q0, [r0]
; CHECK-MVEFP-NEXT:    bx lr
entry:
  %strided.vec = shufflevector <8 x float> %val1, <8 x float> %val2, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
  %out = fptrunc <16 x float> %strided.vec to <16 x half>
  store <16 x half> %out, ptr %src, align 4
  ret void
}
