xref: /llvm-project/llvm/test/CodeGen/AArch64/sve-fixed-length-int-extends.ll (revision cc82f1290a1e2157a6c0530d78d8cc84d2b8553d)
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -aarch64-sve-vector-bits-min=256  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
; RUN: llc -aarch64-sve-vector-bits-min=512  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512

; Tests lowering of fixed-length vector sign/zero extends to SVE unpack
; (sunpklo/uunpklo) sequences for various minimum SVE vector lengths.
target triple = "aarch64-unknown-linux-gnu"

;
; sext i1 -> i32
;

; NOTE: Covers the scenario where a SIGN_EXTEND_INREG is required, whose inreg
; type's element type is not byte based and thus cannot be lowered directly to
; an SVE instruction.
define void @sext_v8i1_v8i32(<8 x i1> %a, ptr %out) vscale_range(2,0) #0 {
; CHECK-LABEL: sext_v8i1_v8i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
; CHECK-NEXT:    ptrue p0.s, vl8
; CHECK-NEXT:    uunpklo z0.h, z0.b
; CHECK-NEXT:    uunpklo z0.s, z0.h
; CHECK-NEXT:    lsl z0.s, z0.s, #31
; CHECK-NEXT:    asr z0.s, z0.s, #31
; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
; CHECK-NEXT:    ret
  %b = sext <8 x i1> %a to <8 x i32>
  store <8 x i32> %b, ptr %out
  ret void
}

;
; sext i3 -> i64
;

; NOTE: Covers the scenario where a SIGN_EXTEND_INREG is required, whose inreg
; type's element type is not power-of-2 based and thus cannot be lowered
; directly to an SVE instruction.
define void @sext_v4i3_v4i64(<4 x i3> %a, ptr %out) vscale_range(2,0) #0 {
; CHECK-LABEL: sext_v4i3_v4i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
; CHECK-NEXT:    ptrue p0.d, vl4
; CHECK-NEXT:    uunpklo z0.s, z0.h
; CHECK-NEXT:    uunpklo z0.d, z0.s
; CHECK-NEXT:    lsl z0.d, z0.d, #61
; CHECK-NEXT:    asr z0.d, z0.d, #61
; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
; CHECK-NEXT:    ret
  %b = sext <4 x i3> %a to <4 x i64>
  store <4 x i64> %b, ptr %out
  ret void
}

;
; sext i8 -> i16
;

define void @sext_v16i8_v16i16(<16 x i8> %a, ptr %out) vscale_range(2,0) #0 {
; CHECK-LABEL: sext_v16i8_v16i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT:    ptrue p0.h, vl16
; CHECK-NEXT:    sunpklo z0.h, z0.b
; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
; CHECK-NEXT:    ret
  %b = sext <16 x i8> %a to <16 x i16>
  store <16 x i16>%b, ptr %out
  ret void
}

; NOTE: Extra 'add' is to prevent the extend being combined with the load.
define void @sext_v32i8_v32i16(ptr %in, ptr %out) #0 {
; VBITS_GE_256-LABEL: sext_v32i8_v32i16:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.b, vl32
; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
; VBITS_GE_256-NEXT:    ld1b { z0.b }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
; VBITS_GE_256-NEXT:    add z0.b, z0.b, z0.b
; VBITS_GE_256-NEXT:    sunpklo z1.h, z0.b
; VBITS_GE_256-NEXT:    ext z0.b, z0.b, z0.b, #16
; VBITS_GE_256-NEXT:    sunpklo z0.h, z0.b
; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x1]
; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x1, x8, lsl #1]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: sext_v32i8_v32i16:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.b, vl32
; VBITS_GE_512-NEXT:    ld1b { z0.b }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
; VBITS_GE_512-NEXT:    add z0.b, z0.b, z0.b
; VBITS_GE_512-NEXT:    sunpklo z0.h, z0.b
; VBITS_GE_512-NEXT:    st1h { z0.h }, p0, [x1]
; VBITS_GE_512-NEXT:    ret
  %a = load <32 x i8>, ptr %in
  %b = add <32 x i8> %a, %a
  %c = sext <32 x i8> %b to <32 x i16>
  store <32 x i16> %c, ptr %out
  ret void
}

define void @sext_v64i8_v64i16(ptr %in, ptr %out) vscale_range(8,0) #0 {
; CHECK-LABEL: sext_v64i8_v64i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl64
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT:    ptrue p0.h, vl64
; CHECK-NEXT:    add z0.b, z0.b, z0.b
; CHECK-NEXT:    sunpklo z0.h, z0.b
; CHECK-NEXT:    st1h { z0.h }, p0, [x1]
; CHECK-NEXT:    ret
  %a = load <64 x i8>, ptr %in
  %b = add <64 x i8> %a, %a
  %c = sext <64 x i8> %b to <64 x i16>
  store <64 x i16> %c, ptr %out
  ret void
}

define void @sext_v128i8_v128i16(ptr %in, ptr %out) vscale_range(16,0) #0 {
; CHECK-LABEL: sext_v128i8_v128i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl128
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT:    ptrue p0.h, vl128
; CHECK-NEXT:    add z0.b, z0.b, z0.b
; CHECK-NEXT:    sunpklo z0.h, z0.b
; CHECK-NEXT:    st1h { z0.h }, p0, [x1]
; CHECK-NEXT:    ret
  %a = load <128 x i8>, ptr %in
  %b = add <128 x i8> %a, %a
  %c = sext <128 x i8> %b to <128 x i16>
  store <128 x i16> %c, ptr %out
  ret void
}

;
; sext i8 -> i32
;

define void @sext_v8i8_v8i32(<8 x i8> %a, ptr %out) vscale_range(2,0) #0 {
; CHECK-LABEL: sext_v8i8_v8i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
; CHECK-NEXT:    ptrue p0.s, vl8
; CHECK-NEXT:    sunpklo z0.h, z0.b
; CHECK-NEXT:    sunpklo z0.s, z0.h
; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
; CHECK-NEXT:    ret
  %b = sext <8 x i8> %a to <8 x i32>
  store <8 x i32>%b, ptr %out
  ret void
}

define void @sext_v16i8_v16i32(<16 x i8> %a, ptr %out) #0 {
; VBITS_GE_256-LABEL: sext_v16i8_v16i32:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    // kill: def $q0 killed $q0 def $z0
; VBITS_GE_256-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
; VBITS_GE_256-NEXT:    sunpklo z0.h, z0.b
; VBITS_GE_256-NEXT:    sunpklo z1.h, z1.b
; VBITS_GE_256-NEXT:    sunpklo z0.s, z0.h
; VBITS_GE_256-NEXT:    sunpklo z1.s, z1.h
; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0]
; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: sext_v16i8_v16i32:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    // kill: def $q0 killed $q0 def $z0
; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
; VBITS_GE_512-NEXT:    sunpklo z0.h, z0.b
; VBITS_GE_512-NEXT:    sunpklo z0.s, z0.h
; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %b = sext <16 x i8> %a to <16 x i32>
  store <16 x i32> %b, ptr %out
  ret void
}

define void @sext_v32i8_v32i32(ptr %in, ptr %out) vscale_range(8,0) #0 {
; CHECK-LABEL: sext_v32i8_v32i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl32
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT:    ptrue p0.s, vl32
; CHECK-NEXT:    add z0.b, z0.b, z0.b
; CHECK-NEXT:    sunpklo z0.h, z0.b
; CHECK-NEXT:    sunpklo z0.s, z0.h
; CHECK-NEXT:    st1w { z0.s }, p0, [x1]
; CHECK-NEXT:    ret
  %a = load <32 x i8>, ptr %in
  %b = add <32 x i8> %a, %a
  %c = sext <32 x i8> %b to <32 x i32>
  store <32 x i32> %c, ptr %out
  ret void
}

define void @sext_v64i8_v64i32(ptr %in, ptr %out) vscale_range(16,0) #0 {
; CHECK-LABEL: sext_v64i8_v64i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl64
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT:    ptrue p0.s, vl64
; CHECK-NEXT:    add z0.b, z0.b, z0.b
; CHECK-NEXT:    sunpklo z0.h, z0.b
; CHECK-NEXT:    sunpklo z0.s, z0.h
; CHECK-NEXT:    st1w { z0.s }, p0, [x1]
; CHECK-NEXT:    ret
  %a = load <64 x i8>, ptr %in
  %b = add <64 x i8> %a, %a
  %c = sext <64 x i8> %b to <64 x i32>
  store <64 x i32> %c, ptr %out
  ret void
}

;
; sext i8 -> i64
;

; NOTE: v4i8 is an unpacked typed stored within a v4i16 container. The sign
; extend is a two step process where the container is any_extend'd with the
; result feeding an inreg sign extend.
define void @sext_v4i8_v4i64(<4 x i8> %a, ptr %out) vscale_range(2,0) #0 {
; CHECK-LABEL: sext_v4i8_v4i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
; CHECK-NEXT:    ptrue p0.d, vl4
; CHECK-NEXT:    uunpklo z0.s, z0.h
; CHECK-NEXT:    uunpklo z0.d, z0.s
; CHECK-NEXT:    sxtb z0.d, p0/m, z0.d
; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
; CHECK-NEXT:    ret
  %b = sext <4 x i8> %a to <4 x i64>
  store <4 x i64>%b, ptr %out
  ret void
}

define void @sext_v8i8_v8i64(<8 x i8> %a, ptr %out) #0 {
; VBITS_GE_256-LABEL: sext_v8i8_v8i64:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    sshll v0.8h, v0.8b, #0
; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
; VBITS_GE_256-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
; VBITS_GE_256-NEXT:    sunpklo z0.s, z0.h
; VBITS_GE_256-NEXT:    sunpklo z1.s, z1.h
; VBITS_GE_256-NEXT:    sunpklo z0.d, z0.s
; VBITS_GE_256-NEXT:    sunpklo z1.d, z1.s
; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0]
; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: sext_v8i8_v8i64:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    // kill: def $d0 killed $d0 def $z0
; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
; VBITS_GE_512-NEXT:    sunpklo z0.h, z0.b
; VBITS_GE_512-NEXT:    sunpklo z0.s, z0.h
; VBITS_GE_512-NEXT:    sunpklo z0.d, z0.s
; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %b = sext <8 x i8> %a to <8 x i64>
  store <8 x i64>%b, ptr %out
  ret void
}

define void @sext_v16i8_v16i64(<16 x i8> %a, ptr %out) vscale_range(8,0) #0 {
; CHECK-LABEL: sext_v16i8_v16i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT:    ptrue p0.d, vl16
; CHECK-NEXT:    sunpklo z0.h, z0.b
; CHECK-NEXT:    sunpklo z0.s, z0.h
; CHECK-NEXT:    sunpklo z0.d, z0.s
; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
; CHECK-NEXT:    ret
  %b = sext <16 x i8> %a to <16 x i64>
  store <16 x i64> %b, ptr %out
  ret void
}

define void @sext_v32i8_v32i64(ptr %in, ptr %out) vscale_range(16,0) #0 {
; CHECK-LABEL: sext_v32i8_v32i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl32
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT:    ptrue p0.d, vl32
; CHECK-NEXT:    add z0.b, z0.b, z0.b
; CHECK-NEXT:    sunpklo z0.h, z0.b
; CHECK-NEXT:    sunpklo z0.s, z0.h
; CHECK-NEXT:    sunpklo z0.d, z0.s
; CHECK-NEXT:    st1d { z0.d }, p0, [x1]
; CHECK-NEXT:    ret
  %a = load <32 x i8>, ptr %in
  %b = add <32 x i8> %a, %a
  %c = sext <32 x i8> %b to <32 x i64>
  store <32 x i64> %c, ptr %out
  ret void
}

;
; sext i16 -> i32
;

define void @sext_v8i16_v8i32(<8 x i16> %a, ptr %out) vscale_range(2,0) #0 {
; CHECK-LABEL: sext_v8i16_v8i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT:    ptrue p0.s, vl8
; CHECK-NEXT:    sunpklo z0.s, z0.h
; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
; CHECK-NEXT:    ret
  %b = sext <8 x i16> %a to <8 x i32>
  store <8 x i32>%b, ptr %out
  ret void
}

define void @sext_v16i16_v16i32(ptr %in, ptr %out) #0 {
; VBITS_GE_256-LABEL: sext_v16i16_v16i32:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
; VBITS_GE_256-NEXT:    add z0.h, z0.h, z0.h
; VBITS_GE_256-NEXT:    sunpklo z1.s, z0.h
; VBITS_GE_256-NEXT:    ext z0.b, z0.b, z0.b, #16
; VBITS_GE_256-NEXT:    sunpklo z0.s, z0.h
; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x1]
; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x1, x8, lsl #2]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: sext_v16i16_v16i32:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.h, vl16
; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
; VBITS_GE_512-NEXT:    add z0.h, z0.h, z0.h
; VBITS_GE_512-NEXT:    sunpklo z0.s, z0.h
; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x1]
; VBITS_GE_512-NEXT:    ret
  %a = load <16 x i16>, ptr %in
  %b = add <16 x i16> %a, %a
  %c = sext <16 x i16> %b to <16 x i32>
  store <16 x i32> %c, ptr %out
  ret void
}

define void @sext_v32i16_v32i32(ptr %in, ptr %out) vscale_range(8,0) #0 {
; CHECK-LABEL: sext_v32i16_v32i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl32
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    ptrue p0.s, vl32
; CHECK-NEXT:    add z0.h, z0.h, z0.h
; CHECK-NEXT:    sunpklo z0.s, z0.h
; CHECK-NEXT:    st1w { z0.s }, p0, [x1]
; CHECK-NEXT:    ret
  %a = load <32 x i16>, ptr %in
  %b = add <32 x i16> %a, %a
  %c = sext <32 x i16> %b to <32 x i32>
  store <32 x i32> %c, ptr %out
  ret void
}

define void @sext_v64i16_v64i32(ptr %in, ptr %out) vscale_range(16,0) #0 {
; CHECK-LABEL: sext_v64i16_v64i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl64
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    ptrue p0.s, vl64
; CHECK-NEXT:    add z0.h, z0.h, z0.h
; CHECK-NEXT:    sunpklo z0.s, z0.h
; CHECK-NEXT:    st1w { z0.s }, p0, [x1]
; CHECK-NEXT:    ret
  %a = load <64 x i16>, ptr %in
  %b = add <64 x i16> %a, %a
  %c = sext <64 x i16> %b to <64 x i32>
  store <64 x i32> %c, ptr %out
  ret void
}

;
; sext i16 -> i64
;

define void @sext_v4i16_v4i64(<4 x i16> %a, ptr %out) vscale_range(2,0) #0 {
; CHECK-LABEL: sext_v4i16_v4i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
; CHECK-NEXT:    ptrue p0.d, vl4
; CHECK-NEXT:    sunpklo z0.s, z0.h
; CHECK-NEXT:    sunpklo z0.d, z0.s
; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
; CHECK-NEXT:    ret
  %b = sext <4 x i16> %a to <4 x i64>
  store <4 x i64>%b, ptr %out
  ret void
}

define void @sext_v8i16_v8i64(<8 x i16> %a, ptr %out) #0 {
; VBITS_GE_256-LABEL: sext_v8i16_v8i64:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    // kill: def $q0 killed $q0 def $z0
; VBITS_GE_256-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
; VBITS_GE_256-NEXT:    sunpklo z0.s, z0.h
; VBITS_GE_256-NEXT:    sunpklo z1.s, z1.h
; VBITS_GE_256-NEXT:    sunpklo z0.d, z0.s
; VBITS_GE_256-NEXT:    sunpklo z1.d, z1.s
; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0]
; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: sext_v8i16_v8i64:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    // kill: def $q0 killed $q0 def $z0
; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
; VBITS_GE_512-NEXT:    sunpklo z0.s, z0.h
; VBITS_GE_512-NEXT:    sunpklo z0.d, z0.s
; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %b = sext <8 x i16> %a to <8 x i64>
  store <8 x i64>%b, ptr %out
  ret void
}

define void @sext_v16i16_v16i64(ptr %in, ptr %out) vscale_range(8,0) #0 {
; CHECK-LABEL: sext_v16i16_v16i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl16
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    ptrue p0.d, vl16
; CHECK-NEXT:    add z0.h, z0.h, z0.h
; CHECK-NEXT:    sunpklo z0.s, z0.h
; CHECK-NEXT:    sunpklo z0.d, z0.s
; CHECK-NEXT:    st1d { z0.d }, p0, [x1]
; CHECK-NEXT:    ret
  %a = load <16 x i16>, ptr %in
  %b = add <16 x i16> %a, %a
  %c = sext <16 x i16> %b to <16 x i64>
  store <16 x i64> %c, ptr %out
  ret void
}

define void @sext_v32i16_v32i64(ptr %in, ptr %out) vscale_range(16,0) #0 {
; CHECK-LABEL: sext_v32i16_v32i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl32
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    ptrue p0.d, vl32
; CHECK-NEXT:    add z0.h, z0.h, z0.h
; CHECK-NEXT:    sunpklo z0.s, z0.h
; CHECK-NEXT:    sunpklo z0.d, z0.s
; CHECK-NEXT:    st1d { z0.d }, p0, [x1]
; CHECK-NEXT:    ret
  %a = load <32 x i16>, ptr %in
  %b = add <32 x i16> %a, %a
  %c = sext <32 x i16> %b to <32 x i64>
  store <32 x i64> %c, ptr %out
  ret void
}

;
; sext i32 -> i64
;

define void @sext_v4i32_v4i64(<4 x i32> %a, ptr %out) vscale_range(2,0) #0 {
; CHECK-LABEL: sext_v4i32_v4i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT:    ptrue p0.d, vl4
; CHECK-NEXT:    sunpklo z0.d, z0.s
; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
; CHECK-NEXT:    ret
  %b = sext <4 x i32> %a to <4 x i64>
  store <4 x i64>%b, ptr %out
  ret void
}

define void @sext_v8i32_v8i64(ptr %in, ptr %out) #0 {
; VBITS_GE_256-LABEL: sext_v8i32_v8i64:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
; VBITS_GE_256-NEXT:    add z0.s, z0.s, z0.s
; VBITS_GE_256-NEXT:    sunpklo z1.d, z0.s
; VBITS_GE_256-NEXT:    ext z0.b, z0.b, z0.b, #16
; VBITS_GE_256-NEXT:    sunpklo z0.d, z0.s
; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x1]
; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x1, x8, lsl #3]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: sext_v8i32_v8i64:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.s, vl8
; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
; VBITS_GE_512-NEXT:    add z0.s, z0.s, z0.s
; VBITS_GE_512-NEXT:    sunpklo z0.d, z0.s
; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x1]
; VBITS_GE_512-NEXT:    ret
  %a = load <8 x i32>, ptr %in
  %b = add <8 x i32> %a, %a
  %c = sext <8 x i32> %b to <8 x i64>
  store <8 x i64> %c, ptr %out
  ret void
}

define void @sext_v16i32_v16i64(ptr %in, ptr %out) vscale_range(8,0) #0 {
; CHECK-LABEL: sext_v16i32_v16i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl16
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ptrue p0.d, vl16
; CHECK-NEXT:    add z0.s, z0.s, z0.s
; CHECK-NEXT:    sunpklo z0.d, z0.s
; CHECK-NEXT:    st1d { z0.d }, p0, [x1]
; CHECK-NEXT:    ret
  %a = load <16 x i32>, ptr %in
  %b = add <16 x i32> %a, %a
  %c = sext <16 x i32> %b to <16 x i64>
  store <16 x i64> %c, ptr %out
  ret void
}

define void @sext_v32i32_v32i64(ptr %in, ptr %out) vscale_range(16,0) #0 {
; CHECK-LABEL: sext_v32i32_v32i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl32
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ptrue p0.d, vl32
; CHECK-NEXT:    add z0.s, z0.s, z0.s
; CHECK-NEXT:    sunpklo z0.d, z0.s
; CHECK-NEXT:    st1d { z0.d }, p0, [x1]
; CHECK-NEXT:    ret
  %a = load <32 x i32>, ptr %in
  %b = add <32 x i32> %a, %a
  %c = sext <32 x i32> %b to <32 x i64>
  store <32 x i64> %c, ptr %out
  ret void
}

;
; zext i8 -> i16
;

define void @zext_v16i8_v16i16(<16 x i8> %a, ptr %out) vscale_range(2,0) #0 {
; CHECK-LABEL: zext_v16i8_v16i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT:    ptrue p0.h, vl16
; CHECK-NEXT:    uunpklo z0.h, z0.b
; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
; CHECK-NEXT:    ret
  %b = zext <16 x i8> %a to <16 x i16>
  store <16 x i16>%b, ptr %out
  ret void
}

; NOTE: Extra 'add' is to prevent the extend being combined with the load.
define void @zext_v32i8_v32i16(ptr %in, ptr %out) #0 {
; VBITS_GE_256-LABEL: zext_v32i8_v32i16:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.b, vl32
; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
; VBITS_GE_256-NEXT:    ld1b { z0.b }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
; VBITS_GE_256-NEXT:    add z0.b, z0.b, z0.b
; VBITS_GE_256-NEXT:    uunpklo z1.h, z0.b
; VBITS_GE_256-NEXT:    ext z0.b, z0.b, z0.b, #16
; VBITS_GE_256-NEXT:    uunpklo z0.h, z0.b
; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x1]
; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x1, x8, lsl #1]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: zext_v32i8_v32i16:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.b, vl32
; VBITS_GE_512-NEXT:    ld1b { z0.b }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
; VBITS_GE_512-NEXT:    add z0.b, z0.b, z0.b
; VBITS_GE_512-NEXT:    uunpklo z0.h, z0.b
; VBITS_GE_512-NEXT:    st1h { z0.h }, p0, [x1]
; VBITS_GE_512-NEXT:    ret
  %a = load <32 x i8>, ptr %in
  %b = add <32 x i8> %a, %a
  %c = zext <32 x i8> %b to <32 x i16>
  store <32 x i16> %c, ptr %out
  ret void
}

define void @zext_v64i8_v64i16(ptr %in, ptr %out) vscale_range(8,0) #0 {
; CHECK-LABEL: zext_v64i8_v64i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl64
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT:    ptrue p0.h, vl64
; CHECK-NEXT:    add z0.b, z0.b, z0.b
; CHECK-NEXT:    uunpklo z0.h, z0.b
; CHECK-NEXT:    st1h { z0.h }, p0, [x1]
; CHECK-NEXT:    ret
  %a = load <64 x i8>, ptr %in
  %b = add <64 x i8> %a, %a
  %c = zext <64 x i8> %b to <64 x i16>
  store <64 x i16> %c, ptr %out
  ret void
}

define void @zext_v128i8_v128i16(ptr %in, ptr %out) vscale_range(16,0) #0 {
; CHECK-LABEL: zext_v128i8_v128i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl128
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT:    ptrue p0.h, vl128
; CHECK-NEXT:    add z0.b, z0.b, z0.b
; CHECK-NEXT:    uunpklo z0.h, z0.b
; CHECK-NEXT:    st1h { z0.h }, p0, [x1]
; CHECK-NEXT:    ret
  %a = load <128 x i8>, ptr %in
  %b = add <128 x i8> %a, %a
  %c = zext <128 x i8> %b to <128 x i16>
  store <128 x i16> %c, ptr %out
  ret void
}

;
; zext i8 -> i32
;

define void @zext_v8i8_v8i32(<8 x i8> %a, ptr %out) vscale_range(2,0) #0 {
; CHECK-LABEL: zext_v8i8_v8i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
; CHECK-NEXT:    ptrue p0.s, vl8
; CHECK-NEXT:    uunpklo z0.h, z0.b
; CHECK-NEXT:    uunpklo z0.s, z0.h
; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
; CHECK-NEXT:    ret
  %b = zext <8 x i8> %a to <8 x i32>
  store <8 x i32>%b, ptr %out
  ret void
}

define void @zext_v16i8_v16i32(<16 x i8> %a, ptr %out) #0 {
; VBITS_GE_256-LABEL: zext_v16i8_v16i32:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    // kill: def $q0 killed $q0 def $z0
; VBITS_GE_256-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
; VBITS_GE_256-NEXT:    uunpklo z0.h, z0.b
; VBITS_GE_256-NEXT:    uunpklo z1.h, z1.b
; VBITS_GE_256-NEXT:    uunpklo z0.s, z0.h
; VBITS_GE_256-NEXT:    uunpklo z1.s, z1.h
; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0]
; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: zext_v16i8_v16i32:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    // kill: def $q0 killed $q0 def $z0
; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
; VBITS_GE_512-NEXT:    uunpklo z0.h, z0.b
; VBITS_GE_512-NEXT:    uunpklo z0.s, z0.h
; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %b = zext <16 x i8> %a to <16 x i32>
  store <16 x i32> %b, ptr %out
  ret void
}

define void @zext_v32i8_v32i32(ptr %in, ptr %out) vscale_range(8,0) #0 {
; CHECK-LABEL: zext_v32i8_v32i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl32
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT:    ptrue p0.s, vl32
; CHECK-NEXT:    add z0.b, z0.b, z0.b
; CHECK-NEXT:    uunpklo z0.h, z0.b
; CHECK-NEXT:    uunpklo z0.s, z0.h
; CHECK-NEXT:    st1w { z0.s }, p0, [x1]
; CHECK-NEXT:    ret
  %a = load <32 x i8>, ptr %in
  %b = add <32 x i8> %a, %a
  %c = zext <32 x i8> %b to <32 x i32>
  store <32 x i32> %c, ptr %out
  ret void
}

define void @zext_v64i8_v64i32(ptr %in, ptr %out) vscale_range(16,0) #0 {
; CHECK-LABEL: zext_v64i8_v64i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl64
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT:    ptrue p0.s, vl64
; CHECK-NEXT:    add z0.b, z0.b, z0.b
; CHECK-NEXT:    uunpklo z0.h, z0.b
; CHECK-NEXT:    uunpklo z0.s, z0.h
; CHECK-NEXT:    st1w { z0.s }, p0, [x1]
; CHECK-NEXT:    ret
  %a = load <64 x i8>, ptr %in
  %b = add <64 x i8> %a, %a
  %c = zext <64 x i8> %b to <64 x i32>
  store <64 x i32> %c, ptr %out
  ret void
}

;
; zext i8 -> i64
;

; NOTE: v4i8 is an unpacked typed stored within a v4i16 container. The zero
; extend is a two step process where the container is zero_extend_inreg'd with
; the result feeding a normal zero extend from halfs to doublewords.
define void @zext_v4i8_v4i64(<4 x i8> %a, ptr %out) vscale_range(2,0) #0 {
; CHECK-LABEL: zext_v4i8_v4i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
; CHECK-NEXT:    ptrue p0.d, vl4
; CHECK-NEXT:    bic v0.4h, #255, lsl #8
; CHECK-NEXT:    uunpklo z0.s, z0.h
; CHECK-NEXT:    uunpklo z0.d, z0.s
; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
; CHECK-NEXT:    ret
  %b = zext <4 x i8> %a to <4 x i64>
  store <4 x i64>%b, ptr %out
  ret void
}

define void @zext_v8i8_v8i64(<8 x i8> %a, ptr %out) #0 {
; VBITS_GE_256-LABEL: zext_v8i8_v8i64:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ushll v0.8h, v0.8b, #0
; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
; VBITS_GE_256-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
; VBITS_GE_256-NEXT:    uunpklo z0.s, z0.h
; VBITS_GE_256-NEXT:    uunpklo z1.s, z1.h
; VBITS_GE_256-NEXT:    uunpklo z0.d, z0.s
; VBITS_GE_256-NEXT:    uunpklo z1.d, z1.s
; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0]
; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: zext_v8i8_v8i64:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    // kill: def $d0 killed $d0 def $z0
; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
; VBITS_GE_512-NEXT:    uunpklo z0.h, z0.b
; VBITS_GE_512-NEXT:    uunpklo z0.s, z0.h
; VBITS_GE_512-NEXT:    uunpklo z0.d, z0.s
; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %b = zext <8 x i8> %a to <8 x i64>
  store <8 x i64>%b, ptr %out
  ret void
}

define void @zext_v16i8_v16i64(<16 x i8> %a, ptr %out) vscale_range(8,0) #0 {
; CHECK-LABEL: zext_v16i8_v16i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT:    ptrue p0.d, vl16
; CHECK-NEXT:    uunpklo z0.h, z0.b
; CHECK-NEXT:    uunpklo z0.s, z0.h
; CHECK-NEXT:    uunpklo z0.d, z0.s
; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
; CHECK-NEXT:    ret
  %b = zext <16 x i8> %a to <16 x i64>
  store <16 x i64> %b, ptr %out
  ret void
}

define void @zext_v32i8_v32i64(ptr %in, ptr %out) vscale_range(16,0) #0 {
; CHECK-LABEL: zext_v32i8_v32i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl32
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT:    ptrue p0.d, vl32
; CHECK-NEXT:    add z0.b, z0.b, z0.b
; CHECK-NEXT:    uunpklo z0.h, z0.b
; CHECK-NEXT:    uunpklo z0.s, z0.h
; CHECK-NEXT:    uunpklo z0.d, z0.s
; CHECK-NEXT:    st1d { z0.d }, p0, [x1]
; CHECK-NEXT:    ret
  %a = load <32 x i8>, ptr %in
  %b = add <32 x i8> %a, %a
  %c = zext <32 x i8> %b to <32 x i64>
  store <32 x i64> %c, ptr %out
  ret void
}

;
; zext i16 -> i32
;

define void @zext_v8i16_v8i32(<8 x i16> %a, ptr %out) vscale_range(2,0) #0 {
; CHECK-LABEL: zext_v8i16_v8i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT:    ptrue p0.s, vl8
; CHECK-NEXT:    uunpklo z0.s, z0.h
; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
; CHECK-NEXT:    ret
  %b = zext <8 x i16> %a to <8 x i32>
  store <8 x i32>%b, ptr %out
  ret void
}

define void @zext_v16i16_v16i32(ptr %in, ptr %out) #0 {
; VBITS_GE_256-LABEL: zext_v16i16_v16i32:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
; VBITS_GE_256-NEXT:    add z0.h, z0.h, z0.h
; VBITS_GE_256-NEXT:    uunpklo z1.s, z0.h
; VBITS_GE_256-NEXT:    ext z0.b, z0.b, z0.b, #16
; VBITS_GE_256-NEXT:    uunpklo z0.s, z0.h
; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x1]
; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x1, x8, lsl #2]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: zext_v16i16_v16i32:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.h, vl16
; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
; VBITS_GE_512-NEXT:    add z0.h, z0.h, z0.h
; VBITS_GE_512-NEXT:    uunpklo z0.s, z0.h
; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x1]
; VBITS_GE_512-NEXT:    ret
  %a = load <16 x i16>, ptr %in
  %b = add <16 x i16> %a, %a
  %c = zext <16 x i16> %b to <16 x i32>
  store <16 x i32> %c, ptr %out
  ret void
}

define void @zext_v32i16_v32i32(ptr %in, ptr %out) vscale_range(8,0) #0 {
; CHECK-LABEL: zext_v32i16_v32i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl32
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    ptrue p0.s, vl32
; CHECK-NEXT:    add z0.h, z0.h, z0.h
; CHECK-NEXT:    uunpklo z0.s, z0.h
; CHECK-NEXT:    st1w { z0.s }, p0, [x1]
; CHECK-NEXT:    ret
  %a = load <32 x i16>, ptr %in
  %b = add <32 x i16> %a, %a
  %c = zext <32 x i16> %b to <32 x i32>
  store <32 x i32> %c, ptr %out
  ret void
}

define void @zext_v64i16_v64i32(ptr %in, ptr %out) vscale_range(16,0) #0 {
; CHECK-LABEL: zext_v64i16_v64i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl64
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    ptrue p0.s, vl64
; CHECK-NEXT:    add z0.h, z0.h, z0.h
; CHECK-NEXT:    uunpklo z0.s, z0.h
; CHECK-NEXT:    st1w { z0.s }, p0, [x1]
; CHECK-NEXT:    ret
  %a = load <64 x i16>, ptr %in
  %b = add <64 x i16> %a, %a
  %c = zext <64 x i16> %b to <64 x i32>
  store <64 x i32> %c, ptr %out
  ret void
}
881
882;
883; zext i16 -> i64
884;
885
886define void @zext_v4i16_v4i64(<4 x i16> %a, ptr %out) vscale_range(2,0) #0 {
887; CHECK-LABEL: zext_v4i16_v4i64:
888; CHECK:       // %bb.0:
889; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
890; CHECK-NEXT:    ptrue p0.d, vl4
891; CHECK-NEXT:    uunpklo z0.s, z0.h
892; CHECK-NEXT:    uunpklo z0.d, z0.s
893; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
894; CHECK-NEXT:    ret
895  %b = zext <4 x i16> %a to <4 x i64>
896  store <4 x i64>%b, ptr %out
897  ret void
898}
899
; zext <8 x i16> -> <8 x i64>. No vscale_range, so output splits per prefix:
; with 256-bit registers the result needs two halves (NEON ext to isolate the
; high lanes, then two unpack chains and two stores); with >=512-bit registers
; a single unpack chain and store suffice.
define void @zext_v8i16_v8i64(<8 x i16> %a, ptr %out) #0 {
; VBITS_GE_256-LABEL: zext_v8i16_v8i64:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    // kill: def $q0 killed $q0 def $z0
; VBITS_GE_256-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
; VBITS_GE_256-NEXT:    uunpklo z0.s, z0.h
; VBITS_GE_256-NEXT:    uunpklo z1.s, z1.h
; VBITS_GE_256-NEXT:    uunpklo z0.d, z0.s
; VBITS_GE_256-NEXT:    uunpklo z1.d, z1.s
; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0]
; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: zext_v8i16_v8i64:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    // kill: def $q0 killed $q0 def $z0
; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
; VBITS_GE_512-NEXT:    uunpklo z0.s, z0.h
; VBITS_GE_512-NEXT:    uunpklo z0.d, z0.s
; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %b = zext <8 x i16> %a to <8 x i64>
  store <8 x i64>%b, ptr %out
  ret void
}
927
; zext <16 x i16> -> <16 x i64> with 1024-bit-minimum registers; the i16->i64
; extend is lowered as two chained uunpklo steps (h->s, then s->d).
define void @zext_v16i16_v16i64(ptr %in, ptr %out) vscale_range(8,0) #0 {
; CHECK-LABEL: zext_v16i16_v16i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl16
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    ptrue p0.d, vl16
; CHECK-NEXT:    add z0.h, z0.h, z0.h
; CHECK-NEXT:    uunpklo z0.s, z0.h
; CHECK-NEXT:    uunpklo z0.d, z0.s
; CHECK-NEXT:    st1d { z0.d }, p0, [x1]
; CHECK-NEXT:    ret
  %a = load <16 x i16>, ptr %in
  %b = add <16 x i16> %a, %a
  %c = zext <16 x i16> %b to <16 x i64>
  store <16 x i64> %c, ptr %out
  ret void
}
945
; zext <32 x i16> -> <32 x i64> at maximum SVE width (2048-bit registers);
; same two-step uunpklo chain as the v16 case, just with vl32 predicates.
define void @zext_v32i16_v32i64(ptr %in, ptr %out) vscale_range(16,0) #0 {
; CHECK-LABEL: zext_v32i16_v32i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl32
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    ptrue p0.d, vl32
; CHECK-NEXT:    add z0.h, z0.h, z0.h
; CHECK-NEXT:    uunpklo z0.s, z0.h
; CHECK-NEXT:    uunpklo z0.d, z0.s
; CHECK-NEXT:    st1d { z0.d }, p0, [x1]
; CHECK-NEXT:    ret
  %a = load <32 x i16>, ptr %in
  %b = add <32 x i16> %a, %a
  %c = zext <32 x i16> %b to <32 x i64>
  store <32 x i64> %c, ptr %out
  ret void
}
963
964;
965; zext i32 -> i64
966;
967
; zext <4 x i32> -> <4 x i64> from a NEON register argument (q0); a single
; element-size step, so one uunpklo is enough.
define void @zext_v4i32_v4i64(<4 x i32> %a, ptr %out) vscale_range(2,0) #0 {
; CHECK-LABEL: zext_v4i32_v4i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT:    ptrue p0.d, vl4
; CHECK-NEXT:    uunpklo z0.d, z0.s
; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
; CHECK-NEXT:    ret
  %b = zext <4 x i32> %a to <4 x i64>
  store <4 x i64>%b, ptr %out
  ret void
}
980
; zext <8 x i32> -> <8 x i64>. No vscale_range: with 256-bit registers the
; result spans two registers (SVE ext isolates the high half before the second
; unpack, then two stores); with >=512-bit registers one unpack/store suffices.
define void @zext_v8i32_v8i64(ptr %in, ptr %out) #0 {
; VBITS_GE_256-LABEL: zext_v8i32_v8i64:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
; VBITS_GE_256-NEXT:    add z0.s, z0.s, z0.s
; VBITS_GE_256-NEXT:    uunpklo z1.d, z0.s
; VBITS_GE_256-NEXT:    ext z0.b, z0.b, z0.b, #16
; VBITS_GE_256-NEXT:    uunpklo z0.d, z0.s
; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x1]
; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x1, x8, lsl #3]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: zext_v8i32_v8i64:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.s, vl8
; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
; VBITS_GE_512-NEXT:    add z0.s, z0.s, z0.s
; VBITS_GE_512-NEXT:    uunpklo z0.d, z0.s
; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x1]
; VBITS_GE_512-NEXT:    ret
  %a = load <8 x i32>, ptr %in
  %b = add <8 x i32> %a, %a
  %c = zext <8 x i32> %b to <8 x i64>
  store <8 x i64> %c, ptr %out
  ret void
}
1011
; zext <16 x i32> -> <16 x i64> with 1024-bit-minimum registers; single
; uunpklo step since only one element-size doubling is needed.
define void @zext_v16i32_v16i64(ptr %in, ptr %out) vscale_range(8,0) #0 {
; CHECK-LABEL: zext_v16i32_v16i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl16
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ptrue p0.d, vl16
; CHECK-NEXT:    add z0.s, z0.s, z0.s
; CHECK-NEXT:    uunpklo z0.d, z0.s
; CHECK-NEXT:    st1d { z0.d }, p0, [x1]
; CHECK-NEXT:    ret
  %a = load <16 x i32>, ptr %in
  %b = add <16 x i32> %a, %a
  %c = zext <16 x i32> %b to <16 x i64>
  store <16 x i64> %c, ptr %out
  ret void
}
1028
; zext <32 x i32> -> <32 x i64> at maximum SVE width (2048-bit registers);
; same single-uunpklo lowering with vl32 predicates.
define void @zext_v32i32_v32i64(ptr %in, ptr %out) vscale_range(16,0) #0 {
; CHECK-LABEL: zext_v32i32_v32i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl32
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ptrue p0.d, vl32
; CHECK-NEXT:    add z0.s, z0.s, z0.s
; CHECK-NEXT:    uunpklo z0.d, z0.s
; CHECK-NEXT:    st1d { z0.d }, p0, [x1]
; CHECK-NEXT:    ret
  %a = load <32 x i32>, ptr %in
  %b = add <32 x i32> %a, %a
  %c = zext <32 x i32> %b to <32 x i64>
  store <32 x i64> %c, ptr %out
  ret void
}
1045
; Shared attribute group: every test function enables SVE; nounwind keeps
; unwind-related codegen out of the checked output.
attributes #0 = { nounwind "target-features"="+sve" }
1047