; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -aarch64-sve-vector-bits-min=256  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
; RUN: llc -aarch64-sve-vector-bits-min=512  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512

target triple = "aarch64-unknown-linux-gnu"

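; Each RUN line fixes the minimum SVE register width via -aarch64-sve-vector-bits-min:
; VBITS_GE_256 covers the 256-bit codegen and VBITS_GE_512 the 512-bit codegen. The
; 2048-bit run reuses the VBITS_GE_512 checks because the fixed-length predicates
; (vl32, vl64, ...) make the expected output identical for any wider register.
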
; Don't use SVE for 64-bit vectors.
define <8 x i8> @select_v8i8(<8 x i8> %op1, <8 x i8> %op2, i1 %mask) vscale_range(2,0) #0 {
; CHECK-LABEL: select_v8i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    tst w0, #0x1
; CHECK-NEXT:    csetm w8, ne
; CHECK-NEXT:    dup v2.8b, w8
; CHECK-NEXT:    bif v0.8b, v1.8b, v2.8b
; CHECK-NEXT:    ret
  %sel = select i1 %mask, <8 x i8> %op1, <8 x i8> %op2
  ret <8 x i8> %sel
}

; Don't use SVE for 128-bit vectors.
define <16 x i8> @select_v16i8(<16 x i8> %op1, <16 x i8> %op2, i1 %mask) vscale_range(2,0) #0 {
; CHECK-LABEL: select_v16i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    tst w0, #0x1
; CHECK-NEXT:    csetm w8, ne
; CHECK-NEXT:    dup v2.16b, w8
; CHECK-NEXT:    bif v0.16b, v1.16b, v2.16b
; CHECK-NEXT:    ret
  %sel = select i1 %mask, <16 x i8> %op1, <16 x i8> %op2
  ret <16 x i8> %sel
}
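
; For the NEON-sized cases above, the low bit of the i1 mask is tested (tst),
; materialised as an all-ones or all-zeros lane mask (csetm + dup), and applied
; with a bitwise insert-if-false (bif); no SVE predication is involved.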

define void @select_v32i8(ptr %a, ptr %b, i1 %mask) vscale_range(2,0) #0 {
; CHECK-LABEL: select_v32i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov z0.b, w2
; CHECK-NEXT:    ptrue p0.b
; CHECK-NEXT:    ptrue p1.b, vl32
; CHECK-NEXT:    cmpne p0.b, p0/z, z0.b, #0
; CHECK-NEXT:    ld1b { z0.b }, p1/z, [x0]
; CHECK-NEXT:    ld1b { z1.b }, p1/z, [x1]
; CHECK-NEXT:    sel z0.b, p0, z0.b, z1.b
; CHECK-NEXT:    st1b { z0.b }, p1, [x0]
; CHECK-NEXT:    ret
  %op1 = load volatile <32 x i8>, ptr %a
  %op2 = load volatile <32 x i8>, ptr %b
  %sel = select i1 %mask, <32 x i8> %op1, <32 x i8> %op2
  store <32 x i8> %sel, ptr %a
  ret void
}

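; With only a 256-bit minimum, the 512-bit data below is split across two SVE
; registers addressed at [x0] and [x0, x8]; with a 512-bit minimum a single
; predicated select suffices.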
define void @select_v64i8(ptr %a, ptr %b, i1 %mask) #0 {
; VBITS_GE_256-LABEL: select_v64i8:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    mov z0.b, w2
; VBITS_GE_256-NEXT:    ptrue p0.b
; VBITS_GE_256-NEXT:    mov w8, #32 // =0x20
; VBITS_GE_256-NEXT:    ptrue p1.b, vl32
; VBITS_GE_256-NEXT:    cmpne p0.b, p0/z, z0.b, #0
; VBITS_GE_256-NEXT:    ld1b { z0.b }, p1/z, [x0, x8]
; VBITS_GE_256-NEXT:    ld1b { z1.b }, p1/z, [x0]
; VBITS_GE_256-NEXT:    ld1b { z2.b }, p1/z, [x1, x8]
; VBITS_GE_256-NEXT:    ld1b { z3.b }, p1/z, [x1]
; VBITS_GE_256-NEXT:    sel z0.b, p0, z0.b, z2.b
; VBITS_GE_256-NEXT:    sel z1.b, p0, z1.b, z3.b
; VBITS_GE_256-NEXT:    st1b { z0.b }, p1, [x0, x8]
; VBITS_GE_256-NEXT:    st1b { z1.b }, p1, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: select_v64i8:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    mov z0.b, w2
; VBITS_GE_512-NEXT:    ptrue p0.b
; VBITS_GE_512-NEXT:    ptrue p1.b, vl64
; VBITS_GE_512-NEXT:    cmpne p0.b, p0/z, z0.b, #0
; VBITS_GE_512-NEXT:    ld1b { z0.b }, p1/z, [x0]
; VBITS_GE_512-NEXT:    ld1b { z1.b }, p1/z, [x1]
; VBITS_GE_512-NEXT:    sel z0.b, p0, z0.b, z1.b
; VBITS_GE_512-NEXT:    st1b { z0.b }, p1, [x0]
; VBITS_GE_512-NEXT:    ret
  %op1 = load volatile <64 x i8>, ptr %a
  %op2 = load volatile <64 x i8>, ptr %b
  %sel = select i1 %mask, <64 x i8> %op1, <64 x i8> %op2
  store <64 x i8> %sel, ptr %a
  ret void
}

define void @select_v128i8(ptr %a, ptr %b, i1 %mask) vscale_range(8,0) #0 {
; CHECK-LABEL: select_v128i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov z0.b, w2
; CHECK-NEXT:    ptrue p0.b
; CHECK-NEXT:    ptrue p1.b, vl128
; CHECK-NEXT:    cmpne p0.b, p0/z, z0.b, #0
; CHECK-NEXT:    ld1b { z0.b }, p1/z, [x0]
; CHECK-NEXT:    ld1b { z1.b }, p1/z, [x1]
; CHECK-NEXT:    sel z0.b, p0, z0.b, z1.b
; CHECK-NEXT:    st1b { z0.b }, p1, [x0]
; CHECK-NEXT:    ret
  %op1 = load volatile <128 x i8>, ptr %a
  %op2 = load volatile <128 x i8>, ptr %b
  %sel = select i1 %mask, <128 x i8> %op1, <128 x i8> %op2
  store <128 x i8> %sel, ptr %a
  ret void
}

define void @select_v256i8(ptr %a, ptr %b, i1 %mask) vscale_range(16,0) #0 {
; CHECK-LABEL: select_v256i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov z0.b, w2
; CHECK-NEXT:    ptrue p0.b
; CHECK-NEXT:    ptrue p1.b, vl256
; CHECK-NEXT:    cmpne p0.b, p0/z, z0.b, #0
; CHECK-NEXT:    ld1b { z0.b }, p1/z, [x0]
; CHECK-NEXT:    ld1b { z1.b }, p1/z, [x1]
; CHECK-NEXT:    sel z0.b, p0, z0.b, z1.b
; CHECK-NEXT:    st1b { z0.b }, p1, [x0]
; CHECK-NEXT:    ret
  %op1 = load volatile <256 x i8>, ptr %a
  %op2 = load volatile <256 x i8>, ptr %b
  %sel = select i1 %mask, <256 x i8> %op1, <256 x i8> %op2
  store <256 x i8> %sel, ptr %a
  ret void
}

; Don't use SVE for 64-bit vectors.
define <4 x i16> @select_v4i16(<4 x i16> %op1, <4 x i16> %op2, i1 %mask) vscale_range(2,0) #0 {
; CHECK-LABEL: select_v4i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    tst w0, #0x1
; CHECK-NEXT:    csetm w8, ne
; CHECK-NEXT:    dup v2.4h, w8
; CHECK-NEXT:    bif v0.8b, v1.8b, v2.8b
; CHECK-NEXT:    ret
  %sel = select i1 %mask, <4 x i16> %op1, <4 x i16> %op2
  ret <4 x i16> %sel
}

; Don't use SVE for 128-bit vectors.
define <8 x i16> @select_v8i16(<8 x i16> %op1, <8 x i16> %op2, i1 %mask) vscale_range(2,0) #0 {
; CHECK-LABEL: select_v8i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    tst w0, #0x1
; CHECK-NEXT:    csetm w8, ne
; CHECK-NEXT:    dup v2.8h, w8
; CHECK-NEXT:    bif v0.16b, v1.16b, v2.16b
; CHECK-NEXT:    ret
  %sel = select i1 %mask, <8 x i16> %op1, <8 x i16> %op2
  ret <8 x i16> %sel
}

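; For i16 elements the splatted i1 is first masked to its low bit in the vector
; (and z0.h, z0.h, #0x1) before the cmpne produces the select predicate.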
define void @select_v16i16(ptr %a, ptr %b, i1 %mask) vscale_range(2,0) #0 {
; CHECK-LABEL: select_v16i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov z0.h, w2
; CHECK-NEXT:    ptrue p0.h
; CHECK-NEXT:    ptrue p1.h, vl16
; CHECK-NEXT:    and z0.h, z0.h, #0x1
; CHECK-NEXT:    cmpne p0.h, p0/z, z0.h, #0
; CHECK-NEXT:    ld1h { z0.h }, p1/z, [x0]
; CHECK-NEXT:    ld1h { z1.h }, p1/z, [x1]
; CHECK-NEXT:    sel z0.h, p0, z0.h, z1.h
; CHECK-NEXT:    st1h { z0.h }, p1, [x0]
; CHECK-NEXT:    ret
  %op1 = load volatile <16 x i16>, ptr %a
  %op2 = load volatile <16 x i16>, ptr %b
  %sel = select i1 %mask, <16 x i16> %op1, <16 x i16> %op2
  store <16 x i16> %sel, ptr %a
  ret void
}

define void @select_v32i16(ptr %a, ptr %b, i1 %mask) #0 {
; VBITS_GE_256-LABEL: select_v32i16:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    mov z0.h, w2
; VBITS_GE_256-NEXT:    ptrue p0.h
; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
; VBITS_GE_256-NEXT:    ptrue p1.h, vl16
; VBITS_GE_256-NEXT:    and z0.h, z0.h, #0x1
; VBITS_GE_256-NEXT:    cmpne p0.h, p0/z, z0.h, #0
; VBITS_GE_256-NEXT:    ld1h { z0.h }, p1/z, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT:    ld1h { z1.h }, p1/z, [x0]
; VBITS_GE_256-NEXT:    ld1h { z2.h }, p1/z, [x1, x8, lsl #1]
; VBITS_GE_256-NEXT:    ld1h { z3.h }, p1/z, [x1]
; VBITS_GE_256-NEXT:    sel z0.h, p0, z0.h, z2.h
; VBITS_GE_256-NEXT:    sel z1.h, p0, z1.h, z3.h
; VBITS_GE_256-NEXT:    st1h { z0.h }, p1, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT:    st1h { z1.h }, p1, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: select_v32i16:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    mov z0.h, w2
; VBITS_GE_512-NEXT:    ptrue p0.h
; VBITS_GE_512-NEXT:    ptrue p1.h, vl32
; VBITS_GE_512-NEXT:    and z0.h, z0.h, #0x1
; VBITS_GE_512-NEXT:    cmpne p0.h, p0/z, z0.h, #0
; VBITS_GE_512-NEXT:    ld1h { z0.h }, p1/z, [x0]
; VBITS_GE_512-NEXT:    ld1h { z1.h }, p1/z, [x1]
; VBITS_GE_512-NEXT:    sel z0.h, p0, z0.h, z1.h
; VBITS_GE_512-NEXT:    st1h { z0.h }, p1, [x0]
; VBITS_GE_512-NEXT:    ret
  %op1 = load volatile <32 x i16>, ptr %a
  %op2 = load volatile <32 x i16>, ptr %b
  %sel = select i1 %mask, <32 x i16> %op1, <32 x i16> %op2
  store <32 x i16> %sel, ptr %a
  ret void
}

define void @select_v64i16(ptr %a, ptr %b, i1 %mask) vscale_range(8,0) #0 {
; CHECK-LABEL: select_v64i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov z0.h, w2
; CHECK-NEXT:    ptrue p0.h
; CHECK-NEXT:    ptrue p1.h, vl64
; CHECK-NEXT:    and z0.h, z0.h, #0x1
; CHECK-NEXT:    cmpne p0.h, p0/z, z0.h, #0
; CHECK-NEXT:    ld1h { z0.h }, p1/z, [x0]
; CHECK-NEXT:    ld1h { z1.h }, p1/z, [x1]
; CHECK-NEXT:    sel z0.h, p0, z0.h, z1.h
; CHECK-NEXT:    st1h { z0.h }, p1, [x0]
; CHECK-NEXT:    ret
  %op1 = load volatile <64 x i16>, ptr %a
  %op2 = load volatile <64 x i16>, ptr %b
  %sel = select i1 %mask, <64 x i16> %op1, <64 x i16> %op2
  store <64 x i16> %sel, ptr %a
  ret void
}

define void @select_v128i16(ptr %a, ptr %b, i1 %mask) vscale_range(16,0) #0 {
; CHECK-LABEL: select_v128i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov z0.h, w2
; CHECK-NEXT:    ptrue p0.h
; CHECK-NEXT:    ptrue p1.h, vl128
; CHECK-NEXT:    and z0.h, z0.h, #0x1
; CHECK-NEXT:    cmpne p0.h, p0/z, z0.h, #0
; CHECK-NEXT:    ld1h { z0.h }, p1/z, [x0]
; CHECK-NEXT:    ld1h { z1.h }, p1/z, [x1]
; CHECK-NEXT:    sel z0.h, p0, z0.h, z1.h
; CHECK-NEXT:    st1h { z0.h }, p1, [x0]
; CHECK-NEXT:    ret
  %op1 = load volatile <128 x i16>, ptr %a
  %op2 = load volatile <128 x i16>, ptr %b
  %sel = select i1 %mask, <128 x i16> %op1, <128 x i16> %op2
  store <128 x i16> %sel, ptr %a
  ret void
}

; Don't use SVE for 64-bit vectors.
define <2 x i32> @select_v2i32(<2 x i32> %op1, <2 x i32> %op2, i1 %mask) vscale_range(2,0) #0 {
; CHECK-LABEL: select_v2i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    tst w0, #0x1
; CHECK-NEXT:    csetm w8, ne
; CHECK-NEXT:    dup v2.2s, w8
; CHECK-NEXT:    bif v0.8b, v1.8b, v2.8b
; CHECK-NEXT:    ret
  %sel = select i1 %mask, <2 x i32> %op1, <2 x i32> %op2
  ret <2 x i32> %sel
}

; Don't use SVE for 128-bit vectors.
define <4 x i32> @select_v4i32(<4 x i32> %op1, <4 x i32> %op2, i1 %mask) vscale_range(2,0) #0 {
; CHECK-LABEL: select_v4i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    tst w0, #0x1
; CHECK-NEXT:    csetm w8, ne
; CHECK-NEXT:    dup v2.4s, w8
; CHECK-NEXT:    bif v0.16b, v1.16b, v2.16b
; CHECK-NEXT:    ret
  %sel = select i1 %mask, <4 x i32> %op1, <4 x i32> %op2
  ret <4 x i32> %sel
}

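; For i32 elements the mask is instead cleared on the scalar side
; (and w8, w2, #0x1) before being splatted into the vector.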
define void @select_v8i32(ptr %a, ptr %b, i1 %mask) vscale_range(2,0) #0 {
; CHECK-LABEL: select_v8i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    and w8, w2, #0x1
; CHECK-NEXT:    ptrue p0.s
; CHECK-NEXT:    mov z0.s, w8
; CHECK-NEXT:    ptrue p1.s, vl8
; CHECK-NEXT:    cmpne p0.s, p0/z, z0.s, #0
; CHECK-NEXT:    ld1w { z0.s }, p1/z, [x0]
; CHECK-NEXT:    ld1w { z1.s }, p1/z, [x1]
; CHECK-NEXT:    sel z0.s, p0, z0.s, z1.s
; CHECK-NEXT:    st1w { z0.s }, p1, [x0]
; CHECK-NEXT:    ret
  %op1 = load volatile <8 x i32>, ptr %a
  %op2 = load volatile <8 x i32>, ptr %b
  %sel = select i1 %mask, <8 x i32> %op1, <8 x i32> %op2
  store <8 x i32> %sel, ptr %a
  ret void
}

define void @select_v16i32(ptr %a, ptr %b, i1 %mask) #0 {
; VBITS_GE_256-LABEL: select_v16i32:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    and w8, w2, #0x1
; VBITS_GE_256-NEXT:    ptrue p0.s
; VBITS_GE_256-NEXT:    mov z0.s, w8
; VBITS_GE_256-NEXT:    ptrue p1.s, vl8
; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
; VBITS_GE_256-NEXT:    cmpne p0.s, p0/z, z0.s, #0
; VBITS_GE_256-NEXT:    ld1w { z0.s }, p1/z, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT:    ld1w { z1.s }, p1/z, [x0]
; VBITS_GE_256-NEXT:    ld1w { z2.s }, p1/z, [x1, x8, lsl #2]
; VBITS_GE_256-NEXT:    ld1w { z3.s }, p1/z, [x1]
; VBITS_GE_256-NEXT:    sel z0.s, p0, z0.s, z2.s
; VBITS_GE_256-NEXT:    sel z1.s, p0, z1.s, z3.s
; VBITS_GE_256-NEXT:    st1w { z0.s }, p1, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT:    st1w { z1.s }, p1, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: select_v16i32:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    and w8, w2, #0x1
; VBITS_GE_512-NEXT:    ptrue p0.s
; VBITS_GE_512-NEXT:    mov z0.s, w8
; VBITS_GE_512-NEXT:    ptrue p1.s, vl16
; VBITS_GE_512-NEXT:    cmpne p0.s, p0/z, z0.s, #0
; VBITS_GE_512-NEXT:    ld1w { z0.s }, p1/z, [x0]
; VBITS_GE_512-NEXT:    ld1w { z1.s }, p1/z, [x1]
; VBITS_GE_512-NEXT:    sel z0.s, p0, z0.s, z1.s
; VBITS_GE_512-NEXT:    st1w { z0.s }, p1, [x0]
; VBITS_GE_512-NEXT:    ret
  %op1 = load volatile <16 x i32>, ptr %a
  %op2 = load volatile <16 x i32>, ptr %b
  %sel = select i1 %mask, <16 x i32> %op1, <16 x i32> %op2
  store <16 x i32> %sel, ptr %a
  ret void
}

define void @select_v32i32(ptr %a, ptr %b, i1 %mask) vscale_range(8,0) #0 {
; CHECK-LABEL: select_v32i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    and w8, w2, #0x1
; CHECK-NEXT:    ptrue p0.s
; CHECK-NEXT:    mov z0.s, w8
; CHECK-NEXT:    ptrue p1.s, vl32
; CHECK-NEXT:    cmpne p0.s, p0/z, z0.s, #0
; CHECK-NEXT:    ld1w { z0.s }, p1/z, [x0]
; CHECK-NEXT:    ld1w { z1.s }, p1/z, [x1]
; CHECK-NEXT:    sel z0.s, p0, z0.s, z1.s
; CHECK-NEXT:    st1w { z0.s }, p1, [x0]
; CHECK-NEXT:    ret
  %op1 = load volatile <32 x i32>, ptr %a
  %op2 = load volatile <32 x i32>, ptr %b
  %sel = select i1 %mask, <32 x i32> %op1, <32 x i32> %op2
  store <32 x i32> %sel, ptr %a
  ret void
}

define void @select_v64i32(ptr %a, ptr %b, i1 %mask) vscale_range(16,0) #0 {
; CHECK-LABEL: select_v64i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    and w8, w2, #0x1
; CHECK-NEXT:    ptrue p0.s
; CHECK-NEXT:    mov z0.s, w8
; CHECK-NEXT:    ptrue p1.s, vl64
; CHECK-NEXT:    cmpne p0.s, p0/z, z0.s, #0
; CHECK-NEXT:    ld1w { z0.s }, p1/z, [x0]
; CHECK-NEXT:    ld1w { z1.s }, p1/z, [x1]
; CHECK-NEXT:    sel z0.s, p0, z0.s, z1.s
; CHECK-NEXT:    st1w { z0.s }, p1, [x0]
; CHECK-NEXT:    ret
  %op1 = load volatile <64 x i32>, ptr %a
  %op2 = load volatile <64 x i32>, ptr %b
  %sel = select i1 %mask, <64 x i32> %op1, <64 x i32> %op2
  store <64 x i32> %sel, ptr %a
  ret void
}

; Don't use SVE for 64-bit vectors.
define <1 x i64> @select_v1i64(<1 x i64> %op1, <1 x i64> %op2, i1 %mask) vscale_range(2,0) #0 {
; CHECK-LABEL: select_v1i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    tst w0, #0x1
; CHECK-NEXT:    csetm x8, ne
; CHECK-NEXT:    fmov d2, x8
; CHECK-NEXT:    bif v0.8b, v1.8b, v2.8b
; CHECK-NEXT:    ret
  %sel = select i1 %mask, <1 x i64> %op1, <1 x i64> %op2
  ret <1 x i64> %sel
}

; Don't use SVE for 128-bit vectors.
define <2 x i64> @select_v2i64(<2 x i64> %op1, <2 x i64> %op2, i1 %mask) vscale_range(2,0) #0 {
; CHECK-LABEL: select_v2i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    tst w0, #0x1
; CHECK-NEXT:    csetm x8, ne
; CHECK-NEXT:    dup v2.2d, x8
; CHECK-NEXT:    bif v0.16b, v1.16b, v2.16b
; CHECK-NEXT:    ret
  %sel = select i1 %mask, <2 x i64> %op1, <2 x i64> %op2
  ret <2 x i64> %sel
}

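; For i64 elements the i1 mask arrives in w2 but is consumed as x2 (hence the
; "kill" annotation below), with a 64-bit and clearing the undefined upper bits.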
define void @select_v4i64(ptr %a, ptr %b, i1 %mask) vscale_range(2,0) #0 {
; CHECK-LABEL: select_v4i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $w2 killed $w2 def $x2
; CHECK-NEXT:    and x8, x2, #0x1
; CHECK-NEXT:    ptrue p0.d
; CHECK-NEXT:    mov z0.d, x8
; CHECK-NEXT:    ptrue p1.d, vl4
; CHECK-NEXT:    cmpne p0.d, p0/z, z0.d, #0
; CHECK-NEXT:    ld1d { z0.d }, p1/z, [x0]
; CHECK-NEXT:    ld1d { z1.d }, p1/z, [x1]
; CHECK-NEXT:    sel z0.d, p0, z0.d, z1.d
; CHECK-NEXT:    st1d { z0.d }, p1, [x0]
; CHECK-NEXT:    ret
  %op1 = load volatile <4 x i64>, ptr %a
  %op2 = load volatile <4 x i64>, ptr %b
  %sel = select i1 %mask, <4 x i64> %op1, <4 x i64> %op2
  store <4 x i64> %sel, ptr %a
  ret void
}

define void @select_v8i64(ptr %a, ptr %b, i1 %mask) #0 {
; VBITS_GE_256-LABEL: select_v8i64:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    // kill: def $w2 killed $w2 def $x2
; VBITS_GE_256-NEXT:    and x8, x2, #0x1
; VBITS_GE_256-NEXT:    ptrue p0.d
; VBITS_GE_256-NEXT:    mov z0.d, x8
; VBITS_GE_256-NEXT:    ptrue p1.d, vl4
; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
; VBITS_GE_256-NEXT:    cmpne p0.d, p0/z, z0.d, #0
; VBITS_GE_256-NEXT:    ld1d { z0.d }, p1/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT:    ld1d { z1.d }, p1/z, [x0]
; VBITS_GE_256-NEXT:    ld1d { z2.d }, p1/z, [x1, x8, lsl #3]
; VBITS_GE_256-NEXT:    ld1d { z3.d }, p1/z, [x1]
; VBITS_GE_256-NEXT:    sel z0.d, p0, z0.d, z2.d
; VBITS_GE_256-NEXT:    sel z1.d, p0, z1.d, z3.d
; VBITS_GE_256-NEXT:    st1d { z0.d }, p1, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT:    st1d { z1.d }, p1, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: select_v8i64:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    // kill: def $w2 killed $w2 def $x2
; VBITS_GE_512-NEXT:    and x8, x2, #0x1
; VBITS_GE_512-NEXT:    ptrue p0.d
; VBITS_GE_512-NEXT:    mov z0.d, x8
; VBITS_GE_512-NEXT:    ptrue p1.d, vl8
; VBITS_GE_512-NEXT:    cmpne p0.d, p0/z, z0.d, #0
; VBITS_GE_512-NEXT:    ld1d { z0.d }, p1/z, [x0]
; VBITS_GE_512-NEXT:    ld1d { z1.d }, p1/z, [x1]
; VBITS_GE_512-NEXT:    sel z0.d, p0, z0.d, z1.d
; VBITS_GE_512-NEXT:    st1d { z0.d }, p1, [x0]
; VBITS_GE_512-NEXT:    ret
  %op1 = load volatile <8 x i64>, ptr %a
  %op2 = load volatile <8 x i64>, ptr %b
  %sel = select i1 %mask, <8 x i64> %op1, <8 x i64> %op2
  store <8 x i64> %sel, ptr %a
  ret void
}

define void @select_v16i64(ptr %a, ptr %b, i1 %mask) vscale_range(8,0) #0 {
; CHECK-LABEL: select_v16i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $w2 killed $w2 def $x2
; CHECK-NEXT:    and x8, x2, #0x1
; CHECK-NEXT:    ptrue p0.d
; CHECK-NEXT:    mov z0.d, x8
; CHECK-NEXT:    ptrue p1.d, vl16
; CHECK-NEXT:    cmpne p0.d, p0/z, z0.d, #0
; CHECK-NEXT:    ld1d { z0.d }, p1/z, [x0]
; CHECK-NEXT:    ld1d { z1.d }, p1/z, [x1]
; CHECK-NEXT:    sel z0.d, p0, z0.d, z1.d
; CHECK-NEXT:    st1d { z0.d }, p1, [x0]
; CHECK-NEXT:    ret
  %op1 = load volatile <16 x i64>, ptr %a
  %op2 = load volatile <16 x i64>, ptr %b
  %sel = select i1 %mask, <16 x i64> %op1, <16 x i64> %op2
  store <16 x i64> %sel, ptr %a
  ret void
}

define void @select_v32i64(ptr %a, ptr %b, i1 %mask) vscale_range(16,0) #0 {
; CHECK-LABEL: select_v32i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $w2 killed $w2 def $x2
; CHECK-NEXT:    and x8, x2, #0x1
; CHECK-NEXT:    ptrue p0.d
; CHECK-NEXT:    mov z0.d, x8
; CHECK-NEXT:    ptrue p1.d, vl32
; CHECK-NEXT:    cmpne p0.d, p0/z, z0.d, #0
; CHECK-NEXT:    ld1d { z0.d }, p1/z, [x0]
; CHECK-NEXT:    ld1d { z1.d }, p1/z, [x1]
; CHECK-NEXT:    sel z0.d, p0, z0.d, z1.d
; CHECK-NEXT:    st1d { z0.d }, p1, [x0]
; CHECK-NEXT:    ret
  %op1 = load volatile <32 x i64>, ptr %a
  %op2 = load volatile <32 x i64>, ptr %b
  %sel = select i1 %mask, <32 x i64> %op1, <32 x i64> %op2
  store <32 x i64> %sel, ptr %a
  ret void
}

attributes #0 = { "target-features"="+sve" }