; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -aarch64-sve-vector-bits-min=256  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
; RUN: llc -aarch64-sve-vector-bits-min=512  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512

target triple = "aarch64-unknown-linux-gnu"
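
; Functions matched by the common CHECK prefix produce the same code under all
; three RUN configurations; only the functions without a vscale_range
; attribute (select_v32f16, select_v16f32 and select_v8f64) are checked
; separately under the VBITS_GE_256 and VBITS_GE_512 prefixes.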

; Don't use SVE for 64-bit vectors.
define <4 x half> @select_v4f16(<4 x half> %op1, <4 x half> %op2, <4 x i1> %mask) vscale_range(2,0) #0 {
; CHECK-LABEL: select_v4f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    shl v2.4h, v2.4h, #15
; CHECK-NEXT:    cmlt v2.4h, v2.4h, #0
; CHECK-NEXT:    bif v0.8b, v1.8b, v2.8b
; CHECK-NEXT:    ret
  %sel = select <4 x i1> %mask, <4 x half> %op1, <4 x half> %op2
  ret <4 x half> %sel
}

; Don't use SVE for 128-bit vectors.
define <8 x half> @select_v8f16(<8 x half> %op1, <8 x half> %op2, <8 x i1> %mask) vscale_range(2,0) #0 {
; CHECK-LABEL: select_v8f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ushll v2.8h, v2.8b, #0
; CHECK-NEXT:    shl v2.8h, v2.8h, #15
; CHECK-NEXT:    cmlt v2.8h, v2.8h, #0
; CHECK-NEXT:    bif v0.16b, v1.16b, v2.16b
; CHECK-NEXT:    ret
  %sel = select <8 x i1> %mask, <8 x half> %op1, <8 x half> %op2
  ret <8 x half> %sel
}

define void @select_v16f16(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: select_v16f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl16
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
; CHECK-NEXT:    fcmeq p1.h, p0/z, z0.h, z1.h
; CHECK-NEXT:    sel z0.h, p1, z0.h, z1.h
; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <16 x half>, ptr %a
  %op2 = load <16 x half>, ptr %b
  %mask = fcmp oeq <16 x half> %op1, %op2
  %sel = select <16 x i1> %mask, <16 x half> %op1, <16 x half> %op2
  store <16 x half> %sel, ptr %a
  ret void
}

define void @select_v32f16(ptr %a, ptr %b) #0 {
; VBITS_GE_256-LABEL: select_v32f16:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x1, x8, lsl #1]
; VBITS_GE_256-NEXT:    ld1h { z2.h }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ld1h { z3.h }, p0/z, [x1]
; VBITS_GE_256-NEXT:    fcmeq p1.h, p0/z, z0.h, z1.h
; VBITS_GE_256-NEXT:    fcmeq p2.h, p0/z, z2.h, z3.h
; VBITS_GE_256-NEXT:    sel z0.h, p1, z0.h, z1.h
; VBITS_GE_256-NEXT:    sel z1.h, p2, z2.h, z3.h
; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: select_v32f16:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ld1h { z1.h }, p0/z, [x1]
; VBITS_GE_512-NEXT:    fcmeq p1.h, p0/z, z0.h, z1.h
; VBITS_GE_512-NEXT:    sel z0.h, p1, z0.h, z1.h
; VBITS_GE_512-NEXT:    st1h { z0.h }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %op1 = load <32 x half>, ptr %a
  %op2 = load <32 x half>, ptr %b
  %mask = fcmp oeq <32 x half> %op1, %op2
  %sel = select <32 x i1> %mask, <32 x half> %op1, <32 x half> %op2
  store <32 x half> %sel, ptr %a
  ret void
}

define void @select_v64f16(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: select_v64f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl64
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
; CHECK-NEXT:    fcmeq p1.h, p0/z, z0.h, z1.h
; CHECK-NEXT:    sel z0.h, p1, z0.h, z1.h
; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <64 x half>, ptr %a
  %op2 = load <64 x half>, ptr %b
  %mask = fcmp oeq <64 x half> %op1, %op2
  %sel = select <64 x i1> %mask, <64 x half> %op1, <64 x half> %op2
  store <64 x half> %sel, ptr %a
  ret void
}

define void @select_v128f16(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: select_v128f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl128
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
; CHECK-NEXT:    fcmeq p1.h, p0/z, z0.h, z1.h
; CHECK-NEXT:    sel z0.h, p1, z0.h, z1.h
; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <128 x half>, ptr %a
  %op2 = load <128 x half>, ptr %b
  %mask = fcmp oeq <128 x half> %op1, %op2
  %sel = select <128 x i1> %mask, <128 x half> %op1, <128 x half> %op2
  store <128 x half> %sel, ptr %a
  ret void
}

; Don't use SVE for 64-bit vectors.
define <2 x float> @select_v2f32(<2 x float> %op1, <2 x float> %op2, <2 x i1> %mask) vscale_range(2,0) #0 {
; CHECK-LABEL: select_v2f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    shl v2.2s, v2.2s, #31
; CHECK-NEXT:    cmlt v2.2s, v2.2s, #0
; CHECK-NEXT:    bif v0.8b, v1.8b, v2.8b
; CHECK-NEXT:    ret
  %sel = select <2 x i1> %mask, <2 x float> %op1, <2 x float> %op2
  ret <2 x float> %sel
}

; Don't use SVE for 128-bit vectors.
define <4 x float> @select_v4f32(<4 x float> %op1, <4 x float> %op2, <4 x i1> %mask) vscale_range(2,0) #0 {
; CHECK-LABEL: select_v4f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ushll v2.4s, v2.4h, #0
; CHECK-NEXT:    shl v2.4s, v2.4s, #31
; CHECK-NEXT:    cmlt v2.4s, v2.4s, #0
; CHECK-NEXT:    bif v0.16b, v1.16b, v2.16b
; CHECK-NEXT:    ret
  %sel = select <4 x i1> %mask, <4 x float> %op1, <4 x float> %op2
  ret <4 x float> %sel
}

define void @select_v8f32(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: select_v8f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl8
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT:    fcmeq p1.s, p0/z, z0.s, z1.s
; CHECK-NEXT:    sel z0.s, p1, z0.s, z1.s
; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <8 x float>, ptr %a
  %op2 = load <8 x float>, ptr %b
  %mask = fcmp oeq <8 x float> %op1, %op2
  %sel = select <8 x i1> %mask, <8 x float> %op1, <8 x float> %op2
  store <8 x float> %sel, ptr %a
  ret void
}

define void @select_v16f32(ptr %a, ptr %b) #0 {
; VBITS_GE_256-LABEL: select_v16f32:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x1, x8, lsl #2]
; VBITS_GE_256-NEXT:    ld1w { z2.s }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ld1w { z3.s }, p0/z, [x1]
; VBITS_GE_256-NEXT:    fcmeq p1.s, p0/z, z0.s, z1.s
; VBITS_GE_256-NEXT:    fcmeq p2.s, p0/z, z2.s, z3.s
; VBITS_GE_256-NEXT:    sel z0.s, p1, z0.s, z1.s
; VBITS_GE_256-NEXT:    sel z1.s, p2, z2.s, z3.s
; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: select_v16f32:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ld1w { z1.s }, p0/z, [x1]
; VBITS_GE_512-NEXT:    fcmeq p1.s, p0/z, z0.s, z1.s
; VBITS_GE_512-NEXT:    sel z0.s, p1, z0.s, z1.s
; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %op1 = load <16 x float>, ptr %a
  %op2 = load <16 x float>, ptr %b
  %mask = fcmp oeq <16 x float> %op1, %op2
  %sel = select <16 x i1> %mask, <16 x float> %op1, <16 x float> %op2
  store <16 x float> %sel, ptr %a
  ret void
}

define void @select_v32f32(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: select_v32f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl32
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT:    fcmeq p1.s, p0/z, z0.s, z1.s
; CHECK-NEXT:    sel z0.s, p1, z0.s, z1.s
; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <32 x float>, ptr %a
  %op2 = load <32 x float>, ptr %b
  %mask = fcmp oeq <32 x float> %op1, %op2
  %sel = select <32 x i1> %mask, <32 x float> %op1, <32 x float> %op2
  store <32 x float> %sel, ptr %a
  ret void
}

define void @select_v64f32(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: select_v64f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl64
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT:    fcmeq p1.s, p0/z, z0.s, z1.s
; CHECK-NEXT:    sel z0.s, p1, z0.s, z1.s
; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <64 x float>, ptr %a
  %op2 = load <64 x float>, ptr %b
  %mask = fcmp oeq <64 x float> %op1, %op2
  %sel = select <64 x i1> %mask, <64 x float> %op1, <64 x float> %op2
  store <64 x float> %sel, ptr %a
  ret void
}

; Don't use SVE for 64-bit vectors.
define <1 x double> @select_v1f64(<1 x double> %op1, <1 x double> %op2, <1 x i1> %mask) vscale_range(2,0) #0 {
; CHECK-LABEL: select_v1f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    tst w0, #0x1
; CHECK-NEXT:    csetm x8, ne
; CHECK-NEXT:    fmov d2, x8
; CHECK-NEXT:    bif v0.8b, v1.8b, v2.8b
; CHECK-NEXT:    ret
  %sel = select <1 x i1> %mask, <1 x double> %op1, <1 x double> %op2
  ret <1 x double> %sel
}

; Don't use SVE for 128-bit vectors.
define <2 x double> @select_v2f64(<2 x double> %op1, <2 x double> %op2, <2 x i1> %mask) vscale_range(2,0) #0 {
; CHECK-LABEL: select_v2f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ushll v2.2d, v2.2s, #0
; CHECK-NEXT:    shl v2.2d, v2.2d, #63
; CHECK-NEXT:    cmlt v2.2d, v2.2d, #0
; CHECK-NEXT:    bif v0.16b, v1.16b, v2.16b
; CHECK-NEXT:    ret
  %sel = select <2 x i1> %mask, <2 x double> %op1, <2 x double> %op2
  ret <2 x double> %sel
}

define void @select_v4f64(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: select_v4f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl4
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT:    fcmeq p1.d, p0/z, z0.d, z1.d
; CHECK-NEXT:    sel z0.d, p1, z0.d, z1.d
; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <4 x double>, ptr %a
  %op2 = load <4 x double>, ptr %b
  %mask = fcmp oeq <4 x double> %op1, %op2
  %sel = select <4 x i1> %mask, <4 x double> %op1, <4 x double> %op2
  store <4 x double> %sel, ptr %a
  ret void
}

define void @select_v8f64(ptr %a, ptr %b) #0 {
; VBITS_GE_256-LABEL: select_v8f64:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x1, x8, lsl #3]
; VBITS_GE_256-NEXT:    ld1d { z2.d }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ld1d { z3.d }, p0/z, [x1]
; VBITS_GE_256-NEXT:    fcmeq p1.d, p0/z, z0.d, z1.d
; VBITS_GE_256-NEXT:    fcmeq p2.d, p0/z, z2.d, z3.d
; VBITS_GE_256-NEXT:    sel z0.d, p1, z0.d, z1.d
; VBITS_GE_256-NEXT:    sel z1.d, p2, z2.d, z3.d
; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: select_v8f64:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ld1d { z1.d }, p0/z, [x1]
; VBITS_GE_512-NEXT:    fcmeq p1.d, p0/z, z0.d, z1.d
; VBITS_GE_512-NEXT:    sel z0.d, p1, z0.d, z1.d
; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %op1 = load <8 x double>, ptr %a
  %op2 = load <8 x double>, ptr %b
  %mask = fcmp oeq <8 x double> %op1, %op2
  %sel = select <8 x i1> %mask, <8 x double> %op1, <8 x double> %op2
  store <8 x double> %sel, ptr %a
  ret void
}

define void @select_v16f64(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: select_v16f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl16
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT:    fcmeq p1.d, p0/z, z0.d, z1.d
; CHECK-NEXT:    sel z0.d, p1, z0.d, z1.d
; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <16 x double>, ptr %a
  %op2 = load <16 x double>, ptr %b
  %mask = fcmp oeq <16 x double> %op1, %op2
  %sel = select <16 x i1> %mask, <16 x double> %op1, <16 x double> %op2
  store <16 x double> %sel, ptr %a
  ret void
}

define void @select_v32f64(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: select_v32f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl32
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT:    fcmeq p1.d, p0/z, z0.d, z1.d
; CHECK-NEXT:    sel z0.d, p1, z0.d, z1.d
; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <32 x double>, ptr %a
  %op2 = load <32 x double>, ptr %b
  %mask = fcmp oeq <32 x double> %op1, %op2
  %sel = select <32 x i1> %mask, <32 x double> %op1, <32 x double> %op2
  store <32 x double> %sel, ptr %a
  ret void
}

attributes #0 = { "target-features"="+sve" }