; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -aarch64-sve-vector-bits-min=256  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
; RUN: llc -aarch64-sve-vector-bits-min=512  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512

target triple = "aarch64-unknown-linux-gnu"

; i8

; Don't use SVE for 64-bit vectors.
define <4 x i8> @extract_subvector_v8i8(<8 x i8> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: extract_subvector_v8i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    zip2 v0.8b, v0.8b, v0.8b
; CHECK-NEXT:    ret
  %ret = call <4 x i8> @llvm.vector.extract.v4i8.v8i8(<8 x i8> %op, i64 4)
  ret <4 x i8> %ret
}

; Don't use SVE for 128-bit vectors.
define <8 x i8> @extract_subvector_v16i8(<16 x i8> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: extract_subvector_v16i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
; CHECK-NEXT:    ret
  %ret = call <8 x i8> @llvm.vector.extract.v8i8.v16i8(<16 x i8> %op, i64 8)
  ret <8 x i8> %ret
}

define void @extract_subvector_v32i8(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: extract_subvector_v32i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl32
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #16
; CHECK-NEXT:    str q0, [x1]
; CHECK-NEXT:    ret
  %op = load <32 x i8>, ptr %a
  %ret = call <16 x i8> @llvm.vector.extract.v16i8.v32i8(<32 x i8> %op, i64 16)
  store <16 x i8> %ret, ptr %b
  ret void
}

define void @extract_subvector_v64i8(ptr %a, ptr %b) #0 {
; VBITS_GE_256-LABEL: extract_subvector_v64i8:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.b, vl32
; VBITS_GE_256-NEXT:    mov w8, #32 // =0x20
; VBITS_GE_256-NEXT:    ld1b { z0.b }, p0/z, [x0, x8]
; VBITS_GE_256-NEXT:    st1b { z0.b }, p0, [x1]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: extract_subvector_v64i8:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.b, vl64
; VBITS_GE_512-NEXT:    ld1b { z0.b }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ptrue p0.b, vl32
; VBITS_GE_512-NEXT:    ext z0.b, z0.b, z0.b, #32
; VBITS_GE_512-NEXT:    st1b { z0.b }, p0, [x1]
; VBITS_GE_512-NEXT:    ret
  %op = load <64 x i8>, ptr %a
  %ret = call <32 x i8> @llvm.vector.extract.v32i8.v64i8(<64 x i8> %op, i64 32)
  store <32 x i8> %ret, ptr %b
  ret void
}

define void @extract_subvector_v128i8(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: extract_subvector_v128i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl128
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT:    ptrue p0.b, vl64
; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #64
; CHECK-NEXT:    st1b { z0.b }, p0, [x1]
; CHECK-NEXT:    ret
  %op = load <128 x i8>, ptr %a
  %ret = call <64 x i8> @llvm.vector.extract.v64i8.v128i8(<128 x i8> %op, i64 64)
  store <64 x i8> %ret, ptr %b
  ret void
}

define void @extract_subvector_v256i8(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: extract_subvector_v256i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl256
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT:    ptrue p0.b, vl128
; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #128
; CHECK-NEXT:    st1b { z0.b }, p0, [x1]
; CHECK-NEXT:    ret
  %op = load <256 x i8>, ptr %a
  %ret = call <128 x i8> @llvm.vector.extract.v128i8.v256i8(<256 x i8> %op, i64 128)
  store <128 x i8> %ret, ptr %b
  ret void
}

; i16

; Don't use SVE for 64-bit vectors.
define <2 x i16> @extract_subvector_v4i16(<4 x i16> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: extract_subvector_v4i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ushll v0.4s, v0.4h, #0
; CHECK-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
; CHECK-NEXT:    ret
  %ret = call <2 x i16> @llvm.vector.extract.v2i16.v4i16(<4 x i16> %op, i64 2)
  ret <2 x i16> %ret
}

; Don't use SVE for 128-bit vectors.
define <4 x i16> @extract_subvector_v8i16(<8 x i16> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: extract_subvector_v8i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
; CHECK-NEXT:    ret
  %ret = call <4 x i16> @llvm.vector.extract.v4i16.v8i16(<8 x i16> %op, i64 4)
  ret <4 x i16> %ret
}

define void @extract_subvector_v16i16(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: extract_subvector_v16i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl16
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #16
; CHECK-NEXT:    str q0, [x1]
; CHECK-NEXT:    ret
  %op = load <16 x i16>, ptr %a
  %ret = call <8 x i16> @llvm.vector.extract.v8i16.v16i16(<16 x i16> %op, i64 8)
  store <8 x i16> %ret, ptr %b
  ret void
}

define void @extract_subvector_v32i16(ptr %a, ptr %b) #0 {
; VBITS_GE_256-LABEL: extract_subvector_v32i16:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x1]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: extract_subvector_v32i16:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ptrue p0.h, vl16
; VBITS_GE_512-NEXT:    ext z0.b, z0.b, z0.b, #32
; VBITS_GE_512-NEXT:    st1h { z0.h }, p0, [x1]
; VBITS_GE_512-NEXT:    ret
  %op = load <32 x i16>, ptr %a
  %ret = call <16 x i16> @llvm.vector.extract.v16i16.v32i16(<32 x i16> %op, i64 16)
  store <16 x i16> %ret, ptr %b
  ret void
}

define void @extract_subvector_v64i16(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: extract_subvector_v64i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl64
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    ptrue p0.h, vl32
; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #64
; CHECK-NEXT:    st1h { z0.h }, p0, [x1]
; CHECK-NEXT:    ret
  %op = load <64 x i16>, ptr %a
  %ret = call <32 x i16> @llvm.vector.extract.v32i16.v64i16(<64 x i16> %op, i64 32)
  store <32 x i16> %ret, ptr %b
  ret void
}

define void @extract_subvector_v128i16(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: extract_subvector_v128i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl128
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    ptrue p0.h, vl64
; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #128
; CHECK-NEXT:    st1h { z0.h }, p0, [x1]
; CHECK-NEXT:    ret
  %op = load <128 x i16>, ptr %a
  %ret = call <64 x i16> @llvm.vector.extract.v64i16.v128i16(<128 x i16> %op, i64 64)
  store <64 x i16> %ret, ptr %b
  ret void
}

; i32

; Don't use SVE for 64-bit vectors.
define <1 x i32> @extract_subvector_v2i32(<2 x i32> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: extract_subvector_v2i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT:    dup v0.2s, v0.s[1]
; CHECK-NEXT:    ret
  %ret = call <1 x i32> @llvm.vector.extract.v1i32.v2i32(<2 x i32> %op, i64 1)
  ret <1 x i32> %ret
}

; Don't use SVE for 128-bit vectors.
define <2 x i32> @extract_subvector_v4i32(<4 x i32> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: extract_subvector_v4i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
; CHECK-NEXT:    ret
  %ret = call <2 x i32> @llvm.vector.extract.v2i32.v4i32(<4 x i32> %op, i64 2)
  ret <2 x i32> %ret
}

define void @extract_subvector_v8i32(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: extract_subvector_v8i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl8
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #16
; CHECK-NEXT:    str q0, [x1]
; CHECK-NEXT:    ret
  %op = load <8 x i32>, ptr %a
  %ret = call <4 x i32> @llvm.vector.extract.v4i32.v8i32(<8 x i32> %op, i64 4)
  store <4 x i32> %ret, ptr %b
  ret void
}

define void @extract_subvector_v16i32(ptr %a, ptr %b) #0 {
; VBITS_GE_256-LABEL: extract_subvector_v16i32:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x1]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: extract_subvector_v16i32:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ptrue p0.s, vl8
; VBITS_GE_512-NEXT:    ext z0.b, z0.b, z0.b, #32
; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x1]
; VBITS_GE_512-NEXT:    ret
  %op = load <16 x i32>, ptr %a
  %ret = call <8 x i32> @llvm.vector.extract.v8i32.v16i32(<16 x i32> %op, i64 8)
  store <8 x i32> %ret, ptr %b
  ret void
}

define void @extract_subvector_v32i32(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: extract_subvector_v32i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl32
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ptrue p0.s, vl16
; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #64
; CHECK-NEXT:    st1w { z0.s }, p0, [x1]
; CHECK-NEXT:    ret
  %op = load <32 x i32>, ptr %a
  %ret = call <16 x i32> @llvm.vector.extract.v16i32.v32i32(<32 x i32> %op, i64 16)
  store <16 x i32> %ret, ptr %b
  ret void
}

define void @extract_subvector_v64i32(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: extract_subvector_v64i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl64
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ptrue p0.s, vl32
; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #128
; CHECK-NEXT:    st1w { z0.s }, p0, [x1]
; CHECK-NEXT:    ret
  %op = load <64 x i32>, ptr %a
  %ret = call <32 x i32> @llvm.vector.extract.v32i32.v64i32(<64 x i32> %op, i64 32)
  store <32 x i32> %ret, ptr %b
  ret void
}

; i64

; Don't use SVE for 128-bit vectors.
define <1 x i64> @extract_subvector_v2i64(<2 x i64> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: extract_subvector_v2i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
; CHECK-NEXT:    ret
  %ret = call <1 x i64> @llvm.vector.extract.v1i64.v2i64(<2 x i64> %op, i64 1)
  ret <1 x i64> %ret
}

define void @extract_subvector_v4i64(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: extract_subvector_v4i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl4
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #16
; CHECK-NEXT:    str q0, [x1]
; CHECK-NEXT:    ret
  %op = load <4 x i64>, ptr %a
  %ret = call <2 x i64> @llvm.vector.extract.v2i64.v4i64(<4 x i64> %op, i64 2)
  store <2 x i64> %ret, ptr %b
  ret void
}

define void @extract_subvector_v8i64(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: extract_subvector_v8i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl4
; CHECK-NEXT:    mov x8, #4 // =0x4
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
; CHECK-NEXT:    st1d { z0.d }, p0, [x1]
; CHECK-NEXT:    ret
  %op = load <8 x i64>, ptr %a
  %ret = call <4 x i64> @llvm.vector.extract.v4i64.v8i64(<8 x i64> %op, i64 4)
  store <4 x i64> %ret, ptr %b
  ret void
}

define void @extract_subvector_v16i64(ptr %a, ptr %b) #0 {
; VBITS_GE_256-LABEL: extract_subvector_v16i64:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
; VBITS_GE_256-NEXT:    mov x8, #12 // =0xc
; VBITS_GE_256-NEXT:    mov x9, #8 // =0x8
; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0, x9, lsl #3]
; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x1, x8, lsl #3]
; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x1]
; VBITS_GE_256-NEXT:    ret
  %op = load <16 x i64>, ptr %a
  %ret = call <8 x i64> @llvm.vector.extract.v8i64.v16i64(<16 x i64> %op, i64 8)
  store <8 x i64> %ret, ptr %b
  ret void
}

define void @extract_subvector_v32i64(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: extract_subvector_v32i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl16
; CHECK-NEXT:    mov x8, #16 // =0x10
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
; CHECK-NEXT:    st1d { z0.d }, p0, [x1]
; CHECK-NEXT:    ret
  %op = load <32 x i64>, ptr %a
  %ret = call <16 x i64> @llvm.vector.extract.v16i64.v32i64(<32 x i64> %op, i64 16)
  store <16 x i64> %ret, ptr %b
  ret void
}

; f16

; Don't use SVE for 64-bit vectors.
define <2 x half> @extract_subvector_v4f16(<4 x half> %op) vscale_range(16,0) #0 {
; CHECK-LABEL: extract_subvector_v4f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT:    dup v0.2s, v0.s[1]
; CHECK-NEXT:    ret
  %ret = call <2 x half> @llvm.vector.extract.v2f16.v4f16(<4 x half> %op, i64 2)
  ret <2 x half> %ret
}

; Don't use SVE for 128-bit vectors.
define <4 x half> @extract_subvector_v8f16(<8 x half> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: extract_subvector_v8f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
; CHECK-NEXT:    ret
  %ret = call <4 x half> @llvm.vector.extract.v4f16.v8f16(<8 x half> %op, i64 4)
  ret <4 x half> %ret
}

define void @extract_subvector_v16f16(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: extract_subvector_v16f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl16
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #16
; CHECK-NEXT:    str q0, [x1]
; CHECK-NEXT:    ret
  %op = load <16 x half>, ptr %a
  %ret = call <8 x half> @llvm.vector.extract.v8f16.v16f16(<16 x half> %op, i64 8)
  store <8 x half> %ret, ptr %b
  ret void
}

define void @extract_subvector_v32f16(ptr %a, ptr %b) #0 {
; VBITS_GE_256-LABEL: extract_subvector_v32f16:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x1]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: extract_subvector_v32f16:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ptrue p0.h, vl16
; VBITS_GE_512-NEXT:    ext z0.b, z0.b, z0.b, #32
; VBITS_GE_512-NEXT:    st1h { z0.h }, p0, [x1]
; VBITS_GE_512-NEXT:    ret
  %op = load <32 x half>, ptr %a
  %ret = call <16 x half> @llvm.vector.extract.v16f16.v32f16(<32 x half> %op, i64 16)
  store <16 x half> %ret, ptr %b
  ret void
}

define void @extract_subvector_v64f16(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: extract_subvector_v64f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl64
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    ptrue p0.h, vl32
; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #64
; CHECK-NEXT:    st1h { z0.h }, p0, [x1]
; CHECK-NEXT:    ret
  %op = load <64 x half>, ptr %a
  %ret = call <32 x half> @llvm.vector.extract.v32f16.v64f16(<64 x half> %op, i64 32)
  store <32 x half> %ret, ptr %b
  ret void
}

define void @extract_subvector_v128f16(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: extract_subvector_v128f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl128
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    ptrue p0.h, vl64
; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #128
; CHECK-NEXT:    st1h { z0.h }, p0, [x1]
; CHECK-NEXT:    ret
  %op = load <128 x half>, ptr %a
  %ret = call <64 x half> @llvm.vector.extract.v64f16.v128f16(<128 x half> %op, i64 64)
  store <64 x half> %ret, ptr %b
  ret void
}

; f32

; Don't use SVE for 64-bit vectors.
define <1 x float> @extract_subvector_v2f32(<2 x float> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: extract_subvector_v2f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT:    dup v0.2s, v0.s[1]
; CHECK-NEXT:    ret
  %ret = call <1 x float> @llvm.vector.extract.v1f32.v2f32(<2 x float> %op, i64 1)
  ret <1 x float> %ret
}

; Don't use SVE for 128-bit vectors.
define <2 x float> @extract_subvector_v4f32(<4 x float> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: extract_subvector_v4f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
; CHECK-NEXT:    ret
  %ret = call <2 x float> @llvm.vector.extract.v2f32.v4f32(<4 x float> %op, i64 2)
  ret <2 x float> %ret
}

define void @extract_subvector_v8f32(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: extract_subvector_v8f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl8
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #16
; CHECK-NEXT:    str q0, [x1]
; CHECK-NEXT:    ret
  %op = load <8 x float>, ptr %a
  %ret = call <4 x float> @llvm.vector.extract.v4f32.v8f32(<8 x float> %op, i64 4)
  store <4 x float> %ret, ptr %b
  ret void
}

define void @extract_subvector_v16f32(ptr %a, ptr %b) #0 {
; VBITS_GE_256-LABEL: extract_subvector_v16f32:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x1]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: extract_subvector_v16f32:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ptrue p0.s, vl8
; VBITS_GE_512-NEXT:    ext z0.b, z0.b, z0.b, #32
; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x1]
; VBITS_GE_512-NEXT:    ret
  %op = load <16 x float>, ptr %a
  %ret = call <8 x float> @llvm.vector.extract.v8f32.v16f32(<16 x float> %op, i64 8)
  store <8 x float> %ret, ptr %b
  ret void
}

define void @extract_subvector_v32f32(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: extract_subvector_v32f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl32
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ptrue p0.s, vl16
; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #64
; CHECK-NEXT:    st1w { z0.s }, p0, [x1]
; CHECK-NEXT:    ret
  %op = load <32 x float>, ptr %a
  %ret = call <16 x float> @llvm.vector.extract.v16f32.v32f32(<32 x float> %op, i64 16)
  store <16 x float> %ret, ptr %b
  ret void
}

define void @extract_subvector_v64f32(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: extract_subvector_v64f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl64
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ptrue p0.s, vl32
; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #128
; CHECK-NEXT:    st1w { z0.s }, p0, [x1]
; CHECK-NEXT:    ret
  %op = load <64 x float>, ptr %a
  %ret = call <32 x float> @llvm.vector.extract.v32f32.v64f32(<64 x float> %op, i64 32)
  store <32 x float> %ret, ptr %b
  ret void
}

; f64

; Don't use SVE for 128-bit vectors.
define <1 x double> @extract_subvector_v2f64(<2 x double> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: extract_subvector_v2f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
; CHECK-NEXT:    ret
  %ret = call <1 x double> @llvm.vector.extract.v1f64.v2f64(<2 x double> %op, i64 1)
  ret <1 x double> %ret
}

define void @extract_subvector_v4f64(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: extract_subvector_v4f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl4
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #16
; CHECK-NEXT:    str q0, [x1]
; CHECK-NEXT:    ret
  %op = load <4 x double>, ptr %a
  %ret = call <2 x double> @llvm.vector.extract.v2f64.v4f64(<4 x double> %op, i64 2)
  store <2 x double> %ret, ptr %b
  ret void
}

define void @extract_subvector_v8f64(ptr %a, ptr %b) #0 {
; VBITS_GE_256-LABEL: extract_subvector_v8f64:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x1]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: extract_subvector_v8f64:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ptrue p0.d, vl4
; VBITS_GE_512-NEXT:    ext z0.b, z0.b, z0.b, #32
; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x1]
; VBITS_GE_512-NEXT:    ret
  %op = load <8 x double>, ptr %a
  %ret = call <4 x double> @llvm.vector.extract.v4f64.v8f64(<8 x double> %op, i64 4)
  store <4 x double> %ret, ptr %b
  ret void
}

define void @extract_subvector_v16f64(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: extract_subvector_v16f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl16
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    ptrue p0.d, vl8
; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #64
; CHECK-NEXT:    st1d { z0.d }, p0, [x1]
; CHECK-NEXT:    ret
  %op = load <16 x double>, ptr %a
  %ret = call <8 x double> @llvm.vector.extract.v8f64.v16f64(<16 x double> %op, i64 8)
  store <8 x double> %ret, ptr %b
  ret void
}

define void @extract_subvector_v32f64(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: extract_subvector_v32f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl32
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    ptrue p0.d, vl16
; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #128
; CHECK-NEXT:    st1d { z0.d }, p0, [x1]
; CHECK-NEXT:    ret
  %op = load <32 x double>, ptr %a
  %ret = call <16 x double> @llvm.vector.extract.v16f64.v32f64(<32 x double> %op, i64 16)
  store <16 x double> %ret, ptr %b
  ret void
}

; Test for infinite loop due to fold:
; extract_subvector(insert_subvector(x,y,c1),c2) --> extract_subvector(y,c2-c1)
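; For example, with illustrative indices not taken from this test, c1=4 and
; c2=6 give extract_subvector(insert_subvector(x,y,4),6) --> extract_subvector(y,2).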
define void @extract_subvector_legalization_v8i32() vscale_range(2,2) #0 {
; CHECK-LABEL: extract_subvector_legalization_v8i32:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    ptrue p0.s
; CHECK-NEXT:    adrp x8, .LCPI40_0
; CHECK-NEXT:    add x8, x8, :lo12:.LCPI40_0
; CHECK-NEXT:    movi v2.2d, #0000000000000000
; CHECK-NEXT:    ptrue p1.d
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x8]
; CHECK-NEXT:    mov z1.d, z0.d
; CHECK-NEXT:    ext z1.b, z1.b, z0.b, #16
; CHECK-NEXT:    cmeq v0.4s, v0.4s, v2.4s
; CHECK-NEXT:    cmeq v1.4s, v1.4s, v2.4s
; CHECK-NEXT:    sunpklo z0.d, z0.s
; CHECK-NEXT:    sunpklo z1.d, z1.s
; CHECK-NEXT:    cmpne p0.d, p1/z, z1.d, #0
; CHECK-NEXT:    cmpne p1.d, p1/z, z0.d, #0
; CHECK-NEXT:  .LBB40_1: // %body
; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    st1d { z0.d }, p1, [x8]
; CHECK-NEXT:    st1d { z0.d }, p0, [x8]
; CHECK-NEXT:    b .LBB40_1
entry:
  %splat = shufflevector <8 x i32> poison, <8 x i32> poison, <8 x i32> zeroinitializer
  br label %body
body:
  %0 = icmp eq <8 x i32> zeroinitializer, %splat
  tail call void @llvm.masked.store.v8f64.p0(<8 x double> poison, ptr poison, i32 8, <8 x i1> %0)
  br label %body
}
declare void @llvm.masked.store.v8f64.p0(<8 x double>, ptr nocapture, i32 immarg, <8 x i1>)

declare <4 x i8> @llvm.vector.extract.v4i8.v8i8(<8 x i8>, i64)
declare <8 x i8> @llvm.vector.extract.v8i8.v16i8(<16 x i8>, i64)
declare <16 x i8> @llvm.vector.extract.v16i8.v32i8(<32 x i8>, i64)
declare <32 x i8> @llvm.vector.extract.v32i8.v64i8(<64 x i8>, i64)
declare <64 x i8> @llvm.vector.extract.v64i8.v128i8(<128 x i8>, i64)
declare <128 x i8> @llvm.vector.extract.v128i8.v256i8(<256 x i8>, i64)

declare <2 x i16> @llvm.vector.extract.v2i16.v4i16(<4 x i16>, i64)
declare <4 x i16> @llvm.vector.extract.v4i16.v8i16(<8 x i16>, i64)
declare <8 x i16> @llvm.vector.extract.v8i16.v16i16(<16 x i16>, i64)
declare <16 x i16> @llvm.vector.extract.v16i16.v32i16(<32 x i16>, i64)
declare <32 x i16> @llvm.vector.extract.v32i16.v64i16(<64 x i16>, i64)
declare <64 x i16> @llvm.vector.extract.v64i16.v128i16(<128 x i16>, i64)

declare <1 x i32> @llvm.vector.extract.v1i32.v2i32(<2 x i32>, i64)
declare <2 x i32> @llvm.vector.extract.v2i32.v4i32(<4 x i32>, i64)
declare <4 x i32> @llvm.vector.extract.v4i32.v8i32(<8 x i32>, i64)
declare <8 x i32> @llvm.vector.extract.v8i32.v16i32(<16 x i32>, i64)
declare <16 x i32> @llvm.vector.extract.v16i32.v32i32(<32 x i32>, i64)
declare <32 x i32> @llvm.vector.extract.v32i32.v64i32(<64 x i32>, i64)

declare <1 x i64> @llvm.vector.extract.v1i64.v2i64(<2 x i64>, i64)
declare <2 x i64> @llvm.vector.extract.v2i64.v4i64(<4 x i64>, i64)
declare <4 x i64> @llvm.vector.extract.v4i64.v8i64(<8 x i64>, i64)
declare <8 x i64> @llvm.vector.extract.v8i64.v16i64(<16 x i64>, i64)
declare <16 x i64> @llvm.vector.extract.v16i64.v32i64(<32 x i64>, i64)

declare <2 x half> @llvm.vector.extract.v2f16.v4f16(<4 x half>, i64)
declare <4 x half> @llvm.vector.extract.v4f16.v8f16(<8 x half>, i64)
declare <8 x half> @llvm.vector.extract.v8f16.v16f16(<16 x half>, i64)
declare <16 x half> @llvm.vector.extract.v16f16.v32f16(<32 x half>, i64)
declare <32 x half> @llvm.vector.extract.v32f16.v64f16(<64 x half>, i64)
declare <64 x half> @llvm.vector.extract.v64f16.v128f16(<128 x half>, i64)

declare <1 x float> @llvm.vector.extract.v1f32.v2f32(<2 x float>, i64)
declare <2 x float> @llvm.vector.extract.v2f32.v4f32(<4 x float>, i64)
declare <4 x float> @llvm.vector.extract.v4f32.v8f32(<8 x float>, i64)
declare <8 x float> @llvm.vector.extract.v8f32.v16f32(<16 x float>, i64)
declare <16 x float> @llvm.vector.extract.v16f32.v32f32(<32 x float>, i64)
declare <32 x float> @llvm.vector.extract.v32f32.v64f32(<64 x float>, i64)

declare <1 x double> @llvm.vector.extract.v1f64.v2f64(<2 x double>, i64)
declare <2 x double> @llvm.vector.extract.v2f64.v4f64(<4 x double>, i64)
declare <4 x double> @llvm.vector.extract.v4f64.v8f64(<8 x double>, i64)
declare <8 x double> @llvm.vector.extract.v8f64.v16f64(<16 x double>, i64)
declare <16 x double> @llvm.vector.extract.v16f64.v32f64(<32 x double>, i64)

attributes #0 = { "target-features"="+sve" }