; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -aarch64-sve-vector-bits-min=256  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
; RUN: llc -aarch64-sve-vector-bits-min=512  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
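; NOTE: The -aarch64-sve-vector-bits-min=2048 run shares the VBITS_GE_512
; checks: once a single SVE register is wide enough to hold the largest
; fixed-length type used here, the generated code no longer changes with the
; minimum vector length.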

target triple = "aarch64-unknown-linux-gnu"

;
; truncate i16 -> i8
;

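; NOTE: Truncation is lowered to SVE UZP1, which concatenates the even-indexed
; elements of its two sources; with both operands the same register this keeps
; the low half of every element. Each halving of the element size is one UZP1
; step, so e.g. i64 -> i8 takes three.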
define <16 x i8> @trunc_v16i16_v16i8(ptr %in) vscale_range(2,0) #0 {
; CHECK-LABEL: trunc_v16i16_v16i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl16
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    uzp1 z0.b, z0.b, z0.b
; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT:    ret
  %a = load <16 x i16>, ptr %in
  %b = trunc <16 x i16> %a to <16 x i8>
  ret <16 x i8> %b
}

; NOTE: Extra 'add' is to prevent the truncate being combined with the store.
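; NOTE: With VBITS_GE_256 the v32i16 input spans two registers, so each half
; is truncated separately and the halves are rejoined with SPLICE.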
define void @trunc_v32i16_v32i8(ptr %in, ptr %out) #0 {
; VBITS_GE_256-LABEL: trunc_v32i16_v32i8:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ptrue p0.b, vl16
; VBITS_GE_256-NEXT:    uzp1 z0.b, z0.b, z0.b
; VBITS_GE_256-NEXT:    uzp1 z1.b, z1.b, z1.b
; VBITS_GE_256-NEXT:    splice z1.b, p0, z1.b, z0.b
; VBITS_GE_256-NEXT:    ptrue p0.b, vl32
; VBITS_GE_256-NEXT:    add z0.b, z1.b, z1.b
; VBITS_GE_256-NEXT:    st1b { z0.b }, p0, [x1]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: trunc_v32i16_v32i8:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ptrue p0.b, vl32
; VBITS_GE_512-NEXT:    uzp1 z0.b, z0.b, z0.b
; VBITS_GE_512-NEXT:    add z0.b, z0.b, z0.b
; VBITS_GE_512-NEXT:    st1b { z0.b }, p0, [x1]
; VBITS_GE_512-NEXT:    ret
  %a = load <32 x i16>, ptr %in
  %b = trunc <32 x i16> %a to <32 x i8>
  %c = add <32 x i8> %b, %b
  store <32 x i8> %c, ptr %out
  ret void
}

; NOTE: Extra 'add' is to prevent the truncate being combined with the store.
define void @trunc_v64i16_v64i8(ptr %in, ptr %out) vscale_range(8,0) #0 {
; CHECK-LABEL: trunc_v64i16_v64i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl64
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    ptrue p0.b, vl64
; CHECK-NEXT:    uzp1 z0.b, z0.b, z0.b
; CHECK-NEXT:    add z0.b, z0.b, z0.b
; CHECK-NEXT:    st1b { z0.b }, p0, [x1]
; CHECK-NEXT:    ret
  %a = load <64 x i16>, ptr %in
  %b = trunc <64 x i16> %a to <64 x i8>
  %c = add <64 x i8> %b, %b
  store <64 x i8> %c, ptr %out
  ret void
}

; NOTE: Extra 'add' is to prevent the truncate being combined with the store.
define void @trunc_v128i16_v128i8(ptr %in, ptr %out) vscale_range(16,0) #0 {
; CHECK-LABEL: trunc_v128i16_v128i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl128
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    ptrue p0.b, vl128
; CHECK-NEXT:    uzp1 z0.b, z0.b, z0.b
; CHECK-NEXT:    add z0.b, z0.b, z0.b
; CHECK-NEXT:    st1b { z0.b }, p0, [x1]
; CHECK-NEXT:    ret
  %a = load <128 x i16>, ptr %in
  %b = trunc <128 x i16> %a to <128 x i8>
  %c = add <128 x i8> %b, %b
  store <128 x i8> %c, ptr %out
  ret void
}

;
; truncate i32 -> i8
;

define <8 x i8> @trunc_v8i32_v8i8(ptr %in) vscale_range(2,0) #0 {
; CHECK-LABEL: trunc_v8i32_v8i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl8
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
; CHECK-NEXT:    uzp1 z0.b, z0.b, z0.b
; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT:    ret
  %a = load <8 x i32>, ptr %in
  %b = trunc <8 x i32> %a to <8 x i8>
  ret <8 x i8> %b
}

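; NOTE: With VBITS_GE_256 each truncated half fits in 64 bits, so the halves
; are combined with a NEON 'mov v0.d[1], v2.d[0]' rather than with SPLICE.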
define <16 x i8> @trunc_v16i32_v16i8(ptr %in) #0 {
; VBITS_GE_256-LABEL: trunc_v16i32_v16i8:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
; VBITS_GE_256-NEXT:    uzp1 z0.h, z0.h, z0.h
; VBITS_GE_256-NEXT:    uzp1 z1.h, z1.h, z1.h
; VBITS_GE_256-NEXT:    uzp1 z2.b, z0.b, z0.b
; VBITS_GE_256-NEXT:    uzp1 z0.b, z1.b, z1.b
; VBITS_GE_256-NEXT:    mov v0.d[1], v2.d[0]
; VBITS_GE_256-NEXT:    // kill: def $q0 killed $q0 killed $z0
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: trunc_v16i32_v16i8:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_512-NEXT:    uzp1 z0.h, z0.h, z0.h
; VBITS_GE_512-NEXT:    uzp1 z0.b, z0.b, z0.b
; VBITS_GE_512-NEXT:    // kill: def $q0 killed $q0 killed $z0
; VBITS_GE_512-NEXT:    ret
  %a = load <16 x i32>, ptr %in
  %b = trunc <16 x i32> %a to <16 x i8>
  ret <16 x i8> %b
}

; NOTE: Extra 'add' is to prevent the truncate being combined with the store.
define void @trunc_v32i32_v32i8(ptr %in, ptr %out) vscale_range(8,0) #0 {
; CHECK-LABEL: trunc_v32i32_v32i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl32
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ptrue p0.b, vl32
; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
; CHECK-NEXT:    uzp1 z0.b, z0.b, z0.b
; CHECK-NEXT:    add z0.b, z0.b, z0.b
; CHECK-NEXT:    st1b { z0.b }, p0, [x1]
; CHECK-NEXT:    ret
  %a = load <32 x i32>, ptr %in
  %b = trunc <32 x i32> %a to <32 x i8>
  %c = add <32 x i8> %b, %b
  store <32 x i8> %c, ptr %out
  ret void
}

; NOTE: Extra 'add' is to prevent the truncate being combined with the store.
define void @trunc_v64i32_v64i8(ptr %in, ptr %out) vscale_range(16,0) #0 {
; CHECK-LABEL: trunc_v64i32_v64i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl64
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ptrue p0.b, vl64
; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
; CHECK-NEXT:    uzp1 z0.b, z0.b, z0.b
; CHECK-NEXT:    add z0.b, z0.b, z0.b
; CHECK-NEXT:    st1b { z0.b }, p0, [x1]
; CHECK-NEXT:    ret
  %a = load <64 x i32>, ptr %in
  %b = trunc <64 x i32> %a to <64 x i8>
  %c = add <64 x i8> %b, %b
  store <64 x i8> %c, ptr %out
  ret void
}

;
; truncate i32 -> i16
;

define <8 x i16> @trunc_v8i32_v8i16(ptr %in) vscale_range(2,0) #0 {
; CHECK-LABEL: trunc_v8i32_v8i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl8
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT:    ret
  %a = load <8 x i32>, ptr %in
  %b = trunc <8 x i32> %a to <8 x i16>
  ret <8 x i16> %b
}

; NOTE: Extra 'add' is to prevent the truncate being combined with the store.
define void @trunc_v16i32_v16i16(ptr %in, ptr %out) #0 {
; VBITS_GE_256-LABEL: trunc_v16i32_v16i16:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ptrue p0.h, vl8
; VBITS_GE_256-NEXT:    uzp1 z0.h, z0.h, z0.h
; VBITS_GE_256-NEXT:    uzp1 z1.h, z1.h, z1.h
; VBITS_GE_256-NEXT:    splice z1.h, p0, z1.h, z0.h
; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
; VBITS_GE_256-NEXT:    add z0.h, z1.h, z1.h
; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x1]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: trunc_v16i32_v16i16:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ptrue p0.h, vl16
; VBITS_GE_512-NEXT:    uzp1 z0.h, z0.h, z0.h
; VBITS_GE_512-NEXT:    add z0.h, z0.h, z0.h
; VBITS_GE_512-NEXT:    st1h { z0.h }, p0, [x1]
; VBITS_GE_512-NEXT:    ret
  %a = load <16 x i32>, ptr %in
  %b = trunc <16 x i32> %a to <16 x i16>
  %c = add <16 x i16> %b, %b
  store <16 x i16> %c, ptr %out
  ret void
}

; NOTE: Extra 'add' is to prevent the truncate being combined with the store.
define void @trunc_v32i32_v32i16(ptr %in, ptr %out) vscale_range(8,0) #0 {
; CHECK-LABEL: trunc_v32i32_v32i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl32
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ptrue p0.h, vl32
; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
; CHECK-NEXT:    add z0.h, z0.h, z0.h
; CHECK-NEXT:    st1h { z0.h }, p0, [x1]
; CHECK-NEXT:    ret
  %a = load <32 x i32>, ptr %in
  %b = trunc <32 x i32> %a to <32 x i16>
  %c = add <32 x i16> %b, %b
  store <32 x i16> %c, ptr %out
  ret void
}

; NOTE: Extra 'add' is to prevent the truncate being combined with the store.
define void @trunc_v64i32_v64i16(ptr %in, ptr %out) vscale_range(16,0) #0 {
; CHECK-LABEL: trunc_v64i32_v64i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl64
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ptrue p0.h, vl64
; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
; CHECK-NEXT:    add z0.h, z0.h, z0.h
; CHECK-NEXT:    st1h { z0.h }, p0, [x1]
; CHECK-NEXT:    ret
  %a = load <64 x i32>, ptr %in
  %b = trunc <64 x i32> %a to <64 x i16>
  %c = add <64 x i16> %b, %b
  store <64 x i16> %c, ptr %out
  ret void
}

;
; truncate i64 -> i8
;

; NOTE: v4i8 is not legal so result i8 elements are held within i16 containers.
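; NOTE: Hence the narrowing below stops at the .h step and the result is
; returned in d0.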
define <4 x i8> @trunc_v4i64_v4i8(ptr %in) vscale_range(2,0) #0 {
; CHECK-LABEL: trunc_v4i64_v4i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl4
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT:    ret
  %a = load <4 x i64>, ptr %in
  %b = trunc <4 x i64> %a to <4 x i8>
  ret <4 x i8> %b
}

define <8 x i8> @trunc_v8i64_v8i8(ptr %in) #0 {
; VBITS_GE_256-LABEL: trunc_v8i64_v8i8:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ptrue p0.s, vl4
; VBITS_GE_256-NEXT:    uzp1 z0.s, z0.s, z0.s
; VBITS_GE_256-NEXT:    uzp1 z1.s, z1.s, z1.s
; VBITS_GE_256-NEXT:    splice z1.s, p0, z1.s, z0.s
; VBITS_GE_256-NEXT:    uzp1 z0.h, z1.h, z1.h
; VBITS_GE_256-NEXT:    uzp1 z0.b, z0.b, z0.b
; VBITS_GE_256-NEXT:    // kill: def $d0 killed $d0 killed $z0
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: trunc_v8i64_v8i8:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_512-NEXT:    uzp1 z0.s, z0.s, z0.s
; VBITS_GE_512-NEXT:    uzp1 z0.h, z0.h, z0.h
; VBITS_GE_512-NEXT:    uzp1 z0.b, z0.b, z0.b
; VBITS_GE_512-NEXT:    // kill: def $d0 killed $d0 killed $z0
; VBITS_GE_512-NEXT:    ret
  %a = load <8 x i64>, ptr %in
  %b = trunc <8 x i64> %a to <8 x i8>
  ret <8 x i8> %b
}

define <16 x i8> @trunc_v16i64_v16i8(ptr %in) vscale_range(8,0) #0 {
; CHECK-LABEL: trunc_v16i64_v16i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl16
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
; CHECK-NEXT:    uzp1 z0.b, z0.b, z0.b
; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT:    ret
  %a = load <16 x i64>, ptr %in
  %b = trunc <16 x i64> %a to <16 x i8>
  ret <16 x i8> %b
}

; NOTE: Extra 'add' is to prevent the truncate being combined with the store.
define void @trunc_v32i64_v32i8(ptr %in, ptr %out) vscale_range(16,0) #0 {
; CHECK-LABEL: trunc_v32i64_v32i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl32
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    ptrue p0.b, vl32
; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
; CHECK-NEXT:    uzp1 z0.b, z0.b, z0.b
; CHECK-NEXT:    add z0.b, z0.b, z0.b
; CHECK-NEXT:    st1b { z0.b }, p0, [x1]
; CHECK-NEXT:    ret
  %a = load <32 x i64>, ptr %in
  %b = trunc <32 x i64> %a to <32 x i8>
  %c = add <32 x i8> %b, %b
  store <32 x i8> %c, ptr %out
  ret void
}

;
; truncate i64 -> i16
;

define <4 x i16> @trunc_v4i64_v4i16(ptr %in) vscale_range(2,0) #0 {
; CHECK-LABEL: trunc_v4i64_v4i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl4
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT:    ret
  %a = load <4 x i64>, ptr %in
  %b = trunc <4 x i64> %a to <4 x i16>
  ret <4 x i16> %b
}

define <8 x i16> @trunc_v8i64_v8i16(ptr %in) #0 {
; VBITS_GE_256-LABEL: trunc_v8i64_v8i16:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
; VBITS_GE_256-NEXT:    uzp1 z0.s, z0.s, z0.s
; VBITS_GE_256-NEXT:    uzp1 z1.s, z1.s, z1.s
; VBITS_GE_256-NEXT:    uzp1 z2.h, z0.h, z0.h
; VBITS_GE_256-NEXT:    uzp1 z0.h, z1.h, z1.h
; VBITS_GE_256-NEXT:    mov v0.d[1], v2.d[0]
; VBITS_GE_256-NEXT:    // kill: def $q0 killed $q0 killed $z0
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: trunc_v8i64_v8i16:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_512-NEXT:    uzp1 z0.s, z0.s, z0.s
; VBITS_GE_512-NEXT:    uzp1 z0.h, z0.h, z0.h
; VBITS_GE_512-NEXT:    // kill: def $q0 killed $q0 killed $z0
; VBITS_GE_512-NEXT:    ret
  %a = load <8 x i64>, ptr %in
  %b = trunc <8 x i64> %a to <8 x i16>
  ret <8 x i16> %b
}

; NOTE: Extra 'add' is to prevent the truncate being combined with the store.
define void @trunc_v16i64_v16i16(ptr %in, ptr %out) vscale_range(8,0) #0 {
; CHECK-LABEL: trunc_v16i64_v16i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl16
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    ptrue p0.h, vl16
; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
; CHECK-NEXT:    add z0.h, z0.h, z0.h
; CHECK-NEXT:    st1h { z0.h }, p0, [x1]
; CHECK-NEXT:    ret
  %a = load <16 x i64>, ptr %in
  %b = trunc <16 x i64> %a to <16 x i16>
  %c = add <16 x i16> %b, %b
  store <16 x i16> %c, ptr %out
  ret void
}

; NOTE: Extra 'add' is to prevent the truncate being combined with the store.
define void @trunc_v32i64_v32i16(ptr %in, ptr %out) vscale_range(16,0) #0 {
; CHECK-LABEL: trunc_v32i64_v32i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl32
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    ptrue p0.h, vl32
; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
; CHECK-NEXT:    add z0.h, z0.h, z0.h
; CHECK-NEXT:    st1h { z0.h }, p0, [x1]
; CHECK-NEXT:    ret
  %a = load <32 x i64>, ptr %in
  %b = trunc <32 x i64> %a to <32 x i16>
  %c = add <32 x i16> %b, %b
  store <32 x i16> %c, ptr %out
  ret void
}

;
; truncate i64 -> i32
;

define <4 x i32> @trunc_v4i64_v4i32(ptr %in) vscale_range(2,0) #0 {
; CHECK-LABEL: trunc_v4i64_v4i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl4
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT:    ret
  %a = load <4 x i64>, ptr %in
  %b = trunc <4 x i64> %a to <4 x i32>
  ret <4 x i32> %b
}

; NOTE: Extra 'add' is to prevent the truncate being combined with the store.
define void @trunc_v8i64_v8i32(ptr %in, ptr %out) #0 {
; VBITS_GE_256-LABEL: trunc_v8i64_v8i32:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ptrue p0.s, vl4
; VBITS_GE_256-NEXT:    uzp1 z0.s, z0.s, z0.s
; VBITS_GE_256-NEXT:    uzp1 z1.s, z1.s, z1.s
; VBITS_GE_256-NEXT:    splice z1.s, p0, z1.s, z0.s
; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
; VBITS_GE_256-NEXT:    add z0.s, z1.s, z1.s
; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x1]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: trunc_v8i64_v8i32:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ptrue p0.s, vl8
; VBITS_GE_512-NEXT:    uzp1 z0.s, z0.s, z0.s
; VBITS_GE_512-NEXT:    add z0.s, z0.s, z0.s
; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x1]
; VBITS_GE_512-NEXT:    ret
  %a = load <8 x i64>, ptr %in
  %b = trunc <8 x i64> %a to <8 x i32>
  %c = add <8 x i32> %b, %b
  store <8 x i32> %c, ptr %out
  ret void
}

; NOTE: Extra 'add' is to prevent the truncate being combined with the store.
define void @trunc_v16i64_v16i32(ptr %in, ptr %out) vscale_range(8,0) #0 {
; CHECK-LABEL: trunc_v16i64_v16i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl16
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    ptrue p0.s, vl16
; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
; CHECK-NEXT:    add z0.s, z0.s, z0.s
; CHECK-NEXT:    st1w { z0.s }, p0, [x1]
; CHECK-NEXT:    ret
  %a = load <16 x i64>, ptr %in
  %b = trunc <16 x i64> %a to <16 x i32>
  %c = add <16 x i32> %b, %b
  store <16 x i32> %c, ptr %out
  ret void
}

; NOTE: Extra 'add' is to prevent the truncate being combined with the store.
define void @trunc_v32i64_v32i32(ptr %in, ptr %out) vscale_range(16,0) #0 {
; CHECK-LABEL: trunc_v32i64_v32i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl32
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    ptrue p0.s, vl32
; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
; CHECK-NEXT:    add z0.s, z0.s, z0.s
; CHECK-NEXT:    st1w { z0.s }, p0, [x1]
; CHECK-NEXT:    ret
  %a = load <32 x i64>, ptr %in
  %b = trunc <32 x i64> %a to <32 x i32>
  %c = add <32 x i32> %b, %b
  store <32 x i32> %c, ptr %out
  ret void
}

attributes #0 = { nounwind "target-features"="+sve" }