xref: /llvm-project/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-extend-trunc.ll (revision cc82f1290a1e2157a6c0530d78d8cc84d2b8553d)
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -aarch64-sve-vector-bits-min=256  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
; RUN: llc -aarch64-sve-vector-bits-min=512  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512

target triple = "aarch64-unknown-linux-gnu"

; Tests lowering of fixed-length vector fpext/fptrunc for AArch64 SVE.
;
; FCVT H -> S
;

; Don't use SVE for 64-bit vectors.
define void @fcvt_v2f16_v2f32(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: fcvt_v2f16_v2f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr s0, [x0]
; CHECK-NEXT:    fcvtl v0.4s, v0.4h
; CHECK-NEXT:    str d0, [x1]
; CHECK-NEXT:    ret
  %op1 = load <2 x half>, ptr %a
  %res = fpext <2 x half> %op1 to <2 x float>
  store <2 x float> %res, ptr %b
  ret void
}

; Don't use SVE for 128-bit vectors.
define void @fcvt_v4f16_v4f32(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: fcvt_v4f16_v4f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    fcvtl v0.4s, v0.4h
; CHECK-NEXT:    str q0, [x1]
; CHECK-NEXT:    ret
  %op1 = load <4 x half>, ptr %a
  %res = fpext <4 x half> %op1 to <4 x float>
  store <4 x float> %res, ptr %b
  ret void
}

define void @fcvt_v8f16_v8f32(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: fcvt_v8f16_v8f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl8
; CHECK-NEXT:    ld1h { z0.s }, p0/z, [x0]
; CHECK-NEXT:    fcvt z0.s, p0/m, z0.h
; CHECK-NEXT:    st1w { z0.s }, p0, [x1]
; CHECK-NEXT:    ret
  %op1 = load <8 x half>, ptr %a
  %res = fpext <8 x half> %op1 to <8 x float>
  store <8 x float> %res, ptr %b
  ret void
}

; No vscale_range attribute, so codegen differs between the 256-bit and
; 512-bit minimum vector length RUN lines.
define void @fcvt_v16f16_v16f32(ptr %a, ptr %b) #0 {
; VBITS_GE_256-LABEL: fcvt_v16f16_v16f32:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
; VBITS_GE_256-NEXT:    ld1h { z0.s }, p0/z, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT:    ld1h { z1.s }, p0/z, [x0]
; VBITS_GE_256-NEXT:    fcvt z0.s, p0/m, z0.h
; VBITS_GE_256-NEXT:    fcvt z1.s, p0/m, z1.h
; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x1, x8, lsl #2]
; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x1]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: fcvt_v16f16_v16f32:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
; VBITS_GE_512-NEXT:    ld1h { z0.s }, p0/z, [x0]
; VBITS_GE_512-NEXT:    fcvt z0.s, p0/m, z0.h
; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x1]
; VBITS_GE_512-NEXT:    ret
  %op1 = load <16 x half>, ptr %a
  %res = fpext <16 x half> %op1 to <16 x float>
  store <16 x float> %res, ptr %b
  ret void
}

define void @fcvt_v32f16_v32f32(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: fcvt_v32f16_v32f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl32
; CHECK-NEXT:    ld1h { z0.s }, p0/z, [x0]
; CHECK-NEXT:    fcvt z0.s, p0/m, z0.h
; CHECK-NEXT:    st1w { z0.s }, p0, [x1]
; CHECK-NEXT:    ret
  %op1 = load <32 x half>, ptr %a
  %res = fpext <32 x half> %op1 to <32 x float>
  store <32 x float> %res, ptr %b
  ret void
}

define void @fcvt_v64f16_v64f32(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: fcvt_v64f16_v64f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl64
; CHECK-NEXT:    ld1h { z0.s }, p0/z, [x0]
; CHECK-NEXT:    fcvt z0.s, p0/m, z0.h
; CHECK-NEXT:    st1w { z0.s }, p0, [x1]
; CHECK-NEXT:    ret
  %op1 = load <64 x half>, ptr %a
  %res = fpext <64 x half> %op1 to <64 x float>
  store <64 x float> %res, ptr %b
  ret void
}

;
; FCVT H -> D
;

; Don't use SVE for 64-bit vectors.
define void @fcvt_v1f16_v1f64(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: fcvt_v1f16_v1f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr h0, [x0]
; CHECK-NEXT:    fcvt d0, h0
; CHECK-NEXT:    str d0, [x1]
; CHECK-NEXT:    ret
  %op1 = load <1 x half>, ptr %a
  %res = fpext <1 x half> %op1 to <1 x double>
  store <1 x double> %res, ptr %b
  ret void
}

; v2f16 is not legal for NEON, so use SVE
define void @fcvt_v2f16_v2f64(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: fcvt_v2f16_v2f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr s0, [x0]
; CHECK-NEXT:    ptrue p0.d, vl4
; CHECK-NEXT:    uunpklo z0.s, z0.h
; CHECK-NEXT:    uunpklo z0.d, z0.s
; CHECK-NEXT:    fcvt z0.d, p0/m, z0.h
; CHECK-NEXT:    str q0, [x1]
; CHECK-NEXT:    ret
  %op1 = load <2 x half>, ptr %a
  %res = fpext <2 x half> %op1 to <2 x double>
  store <2 x double> %res, ptr %b
  ret void
}

define void @fcvt_v4f16_v4f64(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: fcvt_v4f16_v4f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl4
; CHECK-NEXT:    ld1h { z0.d }, p0/z, [x0]
; CHECK-NEXT:    fcvt z0.d, p0/m, z0.h
; CHECK-NEXT:    st1d { z0.d }, p0, [x1]
; CHECK-NEXT:    ret
  %op1 = load <4 x half>, ptr %a
  %res = fpext <4 x half> %op1 to <4 x double>
  store <4 x double> %res, ptr %b
  ret void
}

; No vscale_range attribute, so codegen differs between the 256-bit and
; 512-bit minimum vector length RUN lines.
define void @fcvt_v8f16_v8f64(ptr %a, ptr %b) #0 {
; VBITS_GE_256-LABEL: fcvt_v8f16_v8f64:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
; VBITS_GE_256-NEXT:    ld1h { z0.d }, p0/z, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT:    ld1h { z1.d }, p0/z, [x0]
; VBITS_GE_256-NEXT:    fcvt z0.d, p0/m, z0.h
; VBITS_GE_256-NEXT:    fcvt z1.d, p0/m, z1.h
; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x1, x8, lsl #3]
; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x1]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: fcvt_v8f16_v8f64:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
; VBITS_GE_512-NEXT:    ld1h { z0.d }, p0/z, [x0]
; VBITS_GE_512-NEXT:    fcvt z0.d, p0/m, z0.h
; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x1]
; VBITS_GE_512-NEXT:    ret
  %op1 = load <8 x half>, ptr %a
  %res = fpext <8 x half> %op1 to <8 x double>
  store <8 x double> %res, ptr %b
  ret void
}

define void @fcvt_v16f16_v16f64(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: fcvt_v16f16_v16f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl16
; CHECK-NEXT:    ld1h { z0.d }, p0/z, [x0]
; CHECK-NEXT:    fcvt z0.d, p0/m, z0.h
; CHECK-NEXT:    st1d { z0.d }, p0, [x1]
; CHECK-NEXT:    ret
  %op1 = load <16 x half>, ptr %a
  %res = fpext <16 x half> %op1 to <16 x double>
  store <16 x double> %res, ptr %b
  ret void
}

define void @fcvt_v32f16_v32f64(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: fcvt_v32f16_v32f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl32
; CHECK-NEXT:    ld1h { z0.d }, p0/z, [x0]
; CHECK-NEXT:    fcvt z0.d, p0/m, z0.h
; CHECK-NEXT:    st1d { z0.d }, p0, [x1]
; CHECK-NEXT:    ret
  %op1 = load <32 x half>, ptr %a
  %res = fpext <32 x half> %op1 to <32 x double>
  store <32 x double> %res, ptr %b
  ret void
}

;
; FCVT S -> D
;

; Don't use SVE for 64-bit vectors.
define void @fcvt_v1f32_v1f64(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: fcvt_v1f32_v1f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr s0, [x0]
; CHECK-NEXT:    fcvtl v0.2d, v0.2s
; CHECK-NEXT:    str d0, [x1]
; CHECK-NEXT:    ret
  %op1 = load <1 x float>, ptr %a
  %res = fpext <1 x float> %op1 to <1 x double>
  store <1 x double> %res, ptr %b
  ret void
}

; Don't use SVE for 128-bit vectors.
define void @fcvt_v2f32_v2f64(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: fcvt_v2f32_v2f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    fcvtl v0.2d, v0.2s
; CHECK-NEXT:    str q0, [x1]
; CHECK-NEXT:    ret
  %op1 = load <2 x float>, ptr %a
  %res = fpext <2 x float> %op1 to <2 x double>
  store <2 x double> %res, ptr %b
  ret void
}

define void @fcvt_v4f32_v4f64(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: fcvt_v4f32_v4f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl4
; CHECK-NEXT:    ld1w { z0.d }, p0/z, [x0]
; CHECK-NEXT:    fcvt z0.d, p0/m, z0.s
; CHECK-NEXT:    st1d { z0.d }, p0, [x1]
; CHECK-NEXT:    ret
  %op1 = load <4 x float>, ptr %a
  %res = fpext <4 x float> %op1 to <4 x double>
  store <4 x double> %res, ptr %b
  ret void
}

; No vscale_range attribute, so codegen differs between the 256-bit and
; 512-bit minimum vector length RUN lines.
define void @fcvt_v8f32_v8f64(ptr %a, ptr %b) #0 {
; VBITS_GE_256-LABEL: fcvt_v8f32_v8f64:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
; VBITS_GE_256-NEXT:    ld1w { z0.d }, p0/z, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT:    ld1w { z1.d }, p0/z, [x0]
; VBITS_GE_256-NEXT:    fcvt z0.d, p0/m, z0.s
; VBITS_GE_256-NEXT:    fcvt z1.d, p0/m, z1.s
; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x1, x8, lsl #3]
; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x1]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: fcvt_v8f32_v8f64:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
; VBITS_GE_512-NEXT:    ld1w { z0.d }, p0/z, [x0]
; VBITS_GE_512-NEXT:    fcvt z0.d, p0/m, z0.s
; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x1]
; VBITS_GE_512-NEXT:    ret
  %op1 = load <8 x float>, ptr %a
  %res = fpext <8 x float> %op1 to <8 x double>
  store <8 x double> %res, ptr %b
  ret void
}

define void @fcvt_v16f32_v16f64(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: fcvt_v16f32_v16f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl16
; CHECK-NEXT:    ld1w { z0.d }, p0/z, [x0]
; CHECK-NEXT:    fcvt z0.d, p0/m, z0.s
; CHECK-NEXT:    st1d { z0.d }, p0, [x1]
; CHECK-NEXT:    ret
  %op1 = load <16 x float>, ptr %a
  %res = fpext <16 x float> %op1 to <16 x double>
  store <16 x double> %res, ptr %b
  ret void
}

define void @fcvt_v32f32_v32f64(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: fcvt_v32f32_v32f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl32
; CHECK-NEXT:    ld1w { z0.d }, p0/z, [x0]
; CHECK-NEXT:    fcvt z0.d, p0/m, z0.s
; CHECK-NEXT:    st1d { z0.d }, p0, [x1]
; CHECK-NEXT:    ret
  %op1 = load <32 x float>, ptr %a
  %res = fpext <32 x float> %op1 to <32 x double>
  store <32 x double> %res, ptr %b
  ret void
}

;
; FCVT S -> H
;

; Don't use SVE for 64-bit vectors.
define void @fcvt_v2f32_v2f16(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: fcvt_v2f32_v2f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    fcvtn v0.4h, v0.4s
; CHECK-NEXT:    str s0, [x1]
; CHECK-NEXT:    ret
  %op1 = load <2 x float>, ptr %a
  %res = fptrunc <2 x float> %op1 to <2 x half>
  store <2 x half> %res, ptr %b
  ret void
}

; Don't use SVE for 128-bit vectors.
define void @fcvt_v4f32_v4f16(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: fcvt_v4f32_v4f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    fcvtn v0.4h, v0.4s
; CHECK-NEXT:    str d0, [x1]
; CHECK-NEXT:    ret
  %op1 = load <4 x float>, ptr %a
  %res = fptrunc <4 x float> %op1 to <4 x half>
  store <4 x half> %res, ptr %b
  ret void
}

define void @fcvt_v8f32_v8f16(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: fcvt_v8f32_v8f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl8
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    fcvt z0.h, p0/m, z0.s
; CHECK-NEXT:    st1h { z0.s }, p0, [x1]
; CHECK-NEXT:    ret
  %op1 = load <8 x float>, ptr %a
  %res = fptrunc <8 x float> %op1 to <8 x half>
  store <8 x half> %res, ptr %b
  ret void
}

; No vscale_range attribute, so codegen differs between the 256-bit and
; 512-bit minimum vector length RUN lines.
define void @fcvt_v16f32_v16f16(ptr %a, ptr %b) #0 {
; VBITS_GE_256-LABEL: fcvt_v16f32_v16f16:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
; VBITS_GE_256-NEXT:    fcvt z0.h, p0/m, z0.s
; VBITS_GE_256-NEXT:    fcvt z1.h, p0/m, z1.s
; VBITS_GE_256-NEXT:    st1h { z0.s }, p0, [x1, x8, lsl #1]
; VBITS_GE_256-NEXT:    st1h { z1.s }, p0, [x1]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: fcvt_v16f32_v16f16:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_512-NEXT:    fcvt z0.h, p0/m, z0.s
; VBITS_GE_512-NEXT:    st1h { z0.s }, p0, [x1]
; VBITS_GE_512-NEXT:    ret
  %op1 = load <16 x float>, ptr %a
  %res = fptrunc <16 x float> %op1 to <16 x half>
  store <16 x half> %res, ptr %b
  ret void
}

define void @fcvt_v32f32_v32f16(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: fcvt_v32f32_v32f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl32
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    fcvt z0.h, p0/m, z0.s
; CHECK-NEXT:    st1h { z0.s }, p0, [x1]
; CHECK-NEXT:    ret
  %op1 = load <32 x float>, ptr %a
  %res = fptrunc <32 x float> %op1 to <32 x half>
  store <32 x half> %res, ptr %b
  ret void
}

define void @fcvt_v64f32_v64f16(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: fcvt_v64f32_v64f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl64
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    fcvt z0.h, p0/m, z0.s
; CHECK-NEXT:    st1h { z0.s }, p0, [x1]
; CHECK-NEXT:    ret
  %op1 = load <64 x float>, ptr %a
  %res = fptrunc <64 x float> %op1 to <64 x half>
  store <64 x half> %res, ptr %b
  ret void
}

;
; FCVT D -> H
;

; Don't use SVE for 64-bit vectors.
define void @fcvt_v1f64_v1f16(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: fcvt_v1f64_v1f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    fcvt h0, d0
; CHECK-NEXT:    str h0, [x1]
; CHECK-NEXT:    ret
  %op1 = load <1 x double>, ptr %a
  %res = fptrunc <1 x double> %op1 to <1 x half>
  store <1 x half> %res, ptr %b
  ret void
}

; v2f16 is not legal for NEON, so use SVE
define void @fcvt_v2f64_v2f16(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: fcvt_v2f64_v2f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    fcvt z0.h, p0/m, z0.d
; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
; CHECK-NEXT:    str s0, [x1]
; CHECK-NEXT:    ret
  %op1 = load <2 x double>, ptr %a
  %res = fptrunc <2 x double> %op1 to <2 x half>
  store <2 x half> %res, ptr %b
  ret void
}

define void @fcvt_v4f64_v4f16(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: fcvt_v4f64_v4f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl4
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    fcvt z0.h, p0/m, z0.d
; CHECK-NEXT:    st1h { z0.d }, p0, [x1]
; CHECK-NEXT:    ret
  %op1 = load <4 x double>, ptr %a
  %res = fptrunc <4 x double> %op1 to <4 x half>
  store <4 x half> %res, ptr %b
  ret void
}

; No vscale_range attribute, so codegen differs between the 256-bit and
; 512-bit minimum vector length RUN lines.
define void @fcvt_v8f64_v8f16(ptr %a, ptr %b) #0 {
; VBITS_GE_256-LABEL: fcvt_v8f64_v8f16:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ptrue p0.d
; VBITS_GE_256-NEXT:    fcvt z0.h, p0/m, z0.d
; VBITS_GE_256-NEXT:    fcvt z1.h, p0/m, z1.d
; VBITS_GE_256-NEXT:    uzp1 z0.s, z0.s, z0.s
; VBITS_GE_256-NEXT:    uzp1 z1.s, z1.s, z1.s
; VBITS_GE_256-NEXT:    uzp1 z0.h, z0.h, z0.h
; VBITS_GE_256-NEXT:    uzp1 z1.h, z1.h, z1.h
; VBITS_GE_256-NEXT:    mov v1.d[1], v0.d[0]
; VBITS_GE_256-NEXT:    str q1, [x1]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: fcvt_v8f64_v8f16:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_512-NEXT:    fcvt z0.h, p0/m, z0.d
; VBITS_GE_512-NEXT:    st1h { z0.d }, p0, [x1]
; VBITS_GE_512-NEXT:    ret
  %op1 = load <8 x double>, ptr %a
  %res = fptrunc <8 x double> %op1 to <8 x half>
  store <8 x half> %res, ptr %b
  ret void
}

define void @fcvt_v16f64_v16f16(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: fcvt_v16f64_v16f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl16
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    fcvt z0.h, p0/m, z0.d
; CHECK-NEXT:    st1h { z0.d }, p0, [x1]
; CHECK-NEXT:    ret
  %op1 = load <16 x double>, ptr %a
  %res = fptrunc <16 x double> %op1 to <16 x half>
  store <16 x half> %res, ptr %b
  ret void
}

define void @fcvt_v32f64_v32f16(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: fcvt_v32f64_v32f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl32
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    fcvt z0.h, p0/m, z0.d
; CHECK-NEXT:    st1h { z0.d }, p0, [x1]
; CHECK-NEXT:    ret
  %op1 = load <32 x double>, ptr %a
  %res = fptrunc <32 x double> %op1 to <32 x half>
  store <32 x half> %res, ptr %b
  ret void
}

;
; FCVT D -> S
;

; Don't use SVE for 64-bit vectors.
define void @fcvt_v1f64_v1f32(<1 x double> %op1, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: fcvt_v1f64_v1f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT:    fcvtn v0.2s, v0.2d
; CHECK-NEXT:    str s0, [x0]
; CHECK-NEXT:    ret
  %res = fptrunc <1 x double> %op1 to <1 x float>
  store <1 x float> %res, ptr %b
  ret void
}

; Don't use SVE for 128-bit vectors.
define void @fcvt_v2f64_v2f32(<2 x double> %op1, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: fcvt_v2f64_v2f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    fcvtn v0.2s, v0.2d
; CHECK-NEXT:    str d0, [x0]
; CHECK-NEXT:    ret
  %res = fptrunc <2 x double> %op1 to <2 x float>
  store <2 x float> %res, ptr %b
  ret void
}

define void @fcvt_v4f64_v4f32(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: fcvt_v4f64_v4f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl4
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    fcvt z0.s, p0/m, z0.d
; CHECK-NEXT:    st1w { z0.d }, p0, [x1]
; CHECK-NEXT:    ret
  %op1 = load <4 x double>, ptr %a
  %res = fptrunc <4 x double> %op1 to <4 x float>
  store <4 x float> %res, ptr %b
  ret void
}

; No vscale_range attribute, so codegen differs between the 256-bit and
; 512-bit minimum vector length RUN lines.
define void @fcvt_v8f64_v8f32(ptr %a, ptr %b) #0 {
; VBITS_GE_256-LABEL: fcvt_v8f64_v8f32:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
; VBITS_GE_256-NEXT:    fcvt z0.s, p0/m, z0.d
; VBITS_GE_256-NEXT:    fcvt z1.s, p0/m, z1.d
; VBITS_GE_256-NEXT:    st1w { z0.d }, p0, [x1, x8, lsl #2]
; VBITS_GE_256-NEXT:    st1w { z1.d }, p0, [x1]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: fcvt_v8f64_v8f32:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_512-NEXT:    fcvt z0.s, p0/m, z0.d
; VBITS_GE_512-NEXT:    st1w { z0.d }, p0, [x1]
; VBITS_GE_512-NEXT:    ret
  %op1 = load <8 x double>, ptr %a
  %res = fptrunc <8 x double> %op1 to <8 x float>
  store <8 x float> %res, ptr %b
  ret void
}

define void @fcvt_v16f64_v16f32(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: fcvt_v16f64_v16f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl16
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    fcvt z0.s, p0/m, z0.d
; CHECK-NEXT:    st1w { z0.d }, p0, [x1]
; CHECK-NEXT:    ret
  %op1 = load <16 x double>, ptr %a
  %res = fptrunc <16 x double> %op1 to <16 x float>
  store <16 x float> %res, ptr %b
  ret void
}

define void @fcvt_v32f64_v32f32(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: fcvt_v32f64_v32f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl32
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    fcvt z0.s, p0/m, z0.d
; CHECK-NEXT:    st1w { z0.d }, p0, [x1]
; CHECK-NEXT:    ret
  %op1 = load <32 x double>, ptr %a
  %res = fptrunc <32 x double> %op1 to <32 x float>
  store <32 x float> %res, ptr %b
  ret void
}

attributes #0 = { "target-features"="+sve" }
