; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -aarch64-sve-vector-bits-min=256  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256,CHECK_NO_EXTEND_ROUND
; RUN: llc -aarch64-sve-vector-bits-min=512  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,CHECK_NO_EXTEND_ROUND
; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,CHECK_NO_EXTEND_ROUND
; RUN: llc -aarch64-sve-vector-bits-min=256  --combiner-vector-fcopysign-extend-round < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256,CHECK_EXTEND_ROUND
; RUN: llc -aarch64-sve-vector-bits-min=512  --combiner-vector-fcopysign-extend-round < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,CHECK_EXTEND_ROUND
; RUN: llc -aarch64-sve-vector-bits-min=2048 --combiner-vector-fcopysign-extend-round < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,CHECK_EXTEND_ROUND
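
; Lowering of @llvm.copysign for fixed-length vectors: a bitwise select
; against a constant mask that clears the sign bit, taking the magnitude
; from the first operand and the sign from the second. Vectors that fit in
; a 128-bit register use NEON MVNI+BSL; wider fixed-length vectors use the
; SVE BSL with an all-ones-except-sign-bit splat.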


target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"

target triple = "aarch64-unknown-linux-gnu"

;============ f16

define void @test_copysign_v4f16_v4f16(ptr %ap, ptr %bp) vscale_range(2,0) #0 {
; CHECK-LABEL: test_copysign_v4f16_v4f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mvni v0.4h, #128, lsl #8
; CHECK-NEXT:    ldr d1, [x0]
; CHECK-NEXT:    ldr d2, [x1]
; CHECK-NEXT:    bsl v0.8b, v1.8b, v2.8b
; CHECK-NEXT:    str d0, [x0]
; CHECK-NEXT:    ret
  %a = load <4 x half>, ptr %ap
  %b = load <4 x half>, ptr %bp
  %r = call <4 x half> @llvm.copysign.v4f16(<4 x half> %a, <4 x half> %b)
  store <4 x half> %r, ptr %ap
  ret void
}

define void @test_copysign_v8f16_v8f16(ptr %ap, ptr %bp) vscale_range(2,0) #0 {
; CHECK-LABEL: test_copysign_v8f16_v8f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mvni v0.8h, #128, lsl #8
; CHECK-NEXT:    ldr q1, [x0]
; CHECK-NEXT:    ldr q2, [x1]
; CHECK-NEXT:    bsl v0.16b, v1.16b, v2.16b
; CHECK-NEXT:    str q0, [x0]
; CHECK-NEXT:    ret
  %a = load <8 x half>, ptr %ap
  %b = load <8 x half>, ptr %bp
  %r = call <8 x half> @llvm.copysign.v8f16(<8 x half> %a, <8 x half> %b)
  store <8 x half> %r, ptr %ap
  ret void
}

define void @test_copysign_v16f16_v16f16(ptr %ap, ptr %bp) vscale_range(2,0) #0 {
; CHECK-LABEL: test_copysign_v16f16_v16f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl16
; CHECK-NEXT:    mov z0.h, #32767 // =0x7fff
; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x0]
; CHECK-NEXT:    ld1h { z2.h }, p0/z, [x1]
; CHECK-NEXT:    bsl z1.d, z1.d, z2.d, z0.d
; CHECK-NEXT:    st1h { z1.h }, p0, [x0]
; CHECK-NEXT:    ret
  %a = load <16 x half>, ptr %ap
  %b = load <16 x half>, ptr %bp
  %r = call <16 x half> @llvm.copysign.v16f16(<16 x half> %a, <16 x half> %b)
  store <16 x half> %r, ptr %ap
  ret void
}

define void @test_copysign_v32f16_v32f16(ptr %ap, ptr %bp) #0 {
; VBITS_GE_256-LABEL: test_copysign_v32f16_v32f16:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
; VBITS_GE_256-NEXT:    mov z0.h, #32767 // =0x7fff
; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT:    ld1h { z2.h }, p0/z, [x1, x8, lsl #1]
; VBITS_GE_256-NEXT:    ld1h { z3.h }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ld1h { z4.h }, p0/z, [x1]
; VBITS_GE_256-NEXT:    bsl z1.d, z1.d, z2.d, z0.d
; VBITS_GE_256-NEXT:    bsl z3.d, z3.d, z4.d, z0.d
; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT:    st1h { z3.h }, p0, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: test_copysign_v32f16_v32f16:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
; VBITS_GE_512-NEXT:    mov z0.h, #32767 // =0x7fff
; VBITS_GE_512-NEXT:    ld1h { z1.h }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ld1h { z2.h }, p0/z, [x1]
; VBITS_GE_512-NEXT:    bsl z1.d, z1.d, z2.d, z0.d
; VBITS_GE_512-NEXT:    st1h { z1.h }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %a = load <32 x half>, ptr %ap
  %b = load <32 x half>, ptr %bp
  %r = call <32 x half> @llvm.copysign.v32f16(<32 x half> %a, <32 x half> %b)
  store <32 x half> %r, ptr %ap
  ret void
}

define void @test_copysign_v64f16_v64f16(ptr %ap, ptr %bp) vscale_range(8,0) #0 {
; CHECK-LABEL: test_copysign_v64f16_v64f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl64
; CHECK-NEXT:    mov z0.h, #32767 // =0x7fff
; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x0]
; CHECK-NEXT:    ld1h { z2.h }, p0/z, [x1]
; CHECK-NEXT:    bsl z1.d, z1.d, z2.d, z0.d
; CHECK-NEXT:    st1h { z1.h }, p0, [x0]
; CHECK-NEXT:    ret
  %a = load <64 x half>, ptr %ap
  %b = load <64 x half>, ptr %bp
  %r = call <64 x half> @llvm.copysign.v64f16(<64 x half> %a, <64 x half> %b)
  store <64 x half> %r, ptr %ap
  ret void
}

define void @test_copysign_v128f16_v128f16(ptr %ap, ptr %bp) vscale_range(16,0) #0 {
; CHECK-LABEL: test_copysign_v128f16_v128f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl128
; CHECK-NEXT:    mov z0.h, #32767 // =0x7fff
; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x0]
; CHECK-NEXT:    ld1h { z2.h }, p0/z, [x1]
; CHECK-NEXT:    bsl z1.d, z1.d, z2.d, z0.d
; CHECK-NEXT:    st1h { z1.h }, p0, [x0]
; CHECK-NEXT:    ret
  %a = load <128 x half>, ptr %ap
  %b = load <128 x half>, ptr %bp
  %r = call <128 x half> @llvm.copysign.v128f16(<128 x half> %a, <128 x half> %b)
  store <128 x half> %r, ptr %ap
  ret void
}

;============ f32

define void @test_copysign_v2f32_v2f32(ptr %ap, ptr %bp) vscale_range(2,0) #0 {
; CHECK-LABEL: test_copysign_v2f32_v2f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mvni v0.2s, #128, lsl #24
; CHECK-NEXT:    ldr d1, [x0]
; CHECK-NEXT:    ldr d2, [x1]
; CHECK-NEXT:    bsl v0.8b, v1.8b, v2.8b
; CHECK-NEXT:    str d0, [x0]
; CHECK-NEXT:    ret
  %a = load <2 x float>, ptr %ap
  %b = load <2 x float>, ptr %bp
  %r = call <2 x float> @llvm.copysign.v2f32(<2 x float> %a, <2 x float> %b)
  store <2 x float> %r, ptr %ap
  ret void
}

define void @test_copysign_v4f32_v4f32(ptr %ap, ptr %bp) vscale_range(2,0) #0 {
; CHECK-LABEL: test_copysign_v4f32_v4f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mvni v0.4s, #128, lsl #24
; CHECK-NEXT:    ldr q1, [x0]
; CHECK-NEXT:    ldr q2, [x1]
; CHECK-NEXT:    bsl v0.16b, v1.16b, v2.16b
; CHECK-NEXT:    str q0, [x0]
; CHECK-NEXT:    ret
  %a = load <4 x float>, ptr %ap
  %b = load <4 x float>, ptr %bp
  %r = call <4 x float> @llvm.copysign.v4f32(<4 x float> %a, <4 x float> %b)
  store <4 x float> %r, ptr %ap
  ret void
}

define void @test_copysign_v8f32_v8f32(ptr %ap, ptr %bp) vscale_range(2,0) #0 {
; CHECK-LABEL: test_copysign_v8f32_v8f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl8
; CHECK-NEXT:    mov z0.s, #0x7fffffff
; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x0]
; CHECK-NEXT:    ld1w { z2.s }, p0/z, [x1]
; CHECK-NEXT:    bsl z1.d, z1.d, z2.d, z0.d
; CHECK-NEXT:    st1w { z1.s }, p0, [x0]
; CHECK-NEXT:    ret
  %a = load <8 x float>, ptr %ap
  %b = load <8 x float>, ptr %bp
  %r = call <8 x float> @llvm.copysign.v8f32(<8 x float> %a, <8 x float> %b)
  store <8 x float> %r, ptr %ap
  ret void
}

define void @test_copysign_v16f32_v16f32(ptr %ap, ptr %bp) #0 {
; VBITS_GE_256-LABEL: test_copysign_v16f32_v16f32:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
; VBITS_GE_256-NEXT:    mov z0.s, #0x7fffffff
; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT:    ld1w { z2.s }, p0/z, [x1, x8, lsl #2]
; VBITS_GE_256-NEXT:    ld1w { z3.s }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ld1w { z4.s }, p0/z, [x1]
; VBITS_GE_256-NEXT:    bsl z1.d, z1.d, z2.d, z0.d
; VBITS_GE_256-NEXT:    bsl z3.d, z3.d, z4.d, z0.d
; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT:    st1w { z3.s }, p0, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: test_copysign_v16f32_v16f32:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
; VBITS_GE_512-NEXT:    mov z0.s, #0x7fffffff
; VBITS_GE_512-NEXT:    ld1w { z1.s }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ld1w { z2.s }, p0/z, [x1]
; VBITS_GE_512-NEXT:    bsl z1.d, z1.d, z2.d, z0.d
; VBITS_GE_512-NEXT:    st1w { z1.s }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %a = load <16 x float>, ptr %ap
  %b = load <16 x float>, ptr %bp
  %r = call <16 x float> @llvm.copysign.v16f32(<16 x float> %a, <16 x float> %b)
  store <16 x float> %r, ptr %ap
  ret void
}

define void @test_copysign_v32f32_v32f32(ptr %ap, ptr %bp) vscale_range(8,0) #0 {
; CHECK-LABEL: test_copysign_v32f32_v32f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl32
; CHECK-NEXT:    mov z0.s, #0x7fffffff
; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x0]
; CHECK-NEXT:    ld1w { z2.s }, p0/z, [x1]
; CHECK-NEXT:    bsl z1.d, z1.d, z2.d, z0.d
; CHECK-NEXT:    st1w { z1.s }, p0, [x0]
; CHECK-NEXT:    ret
  %a = load <32 x float>, ptr %ap
  %b = load <32 x float>, ptr %bp
  %r = call <32 x float> @llvm.copysign.v32f32(<32 x float> %a, <32 x float> %b)
  store <32 x float> %r, ptr %ap
  ret void
}

define void @test_copysign_v64f32_v64f32(ptr %ap, ptr %bp) vscale_range(16,0) #0 {
; CHECK-LABEL: test_copysign_v64f32_v64f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl64
; CHECK-NEXT:    mov z0.s, #0x7fffffff
; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x0]
; CHECK-NEXT:    ld1w { z2.s }, p0/z, [x1]
; CHECK-NEXT:    bsl z1.d, z1.d, z2.d, z0.d
; CHECK-NEXT:    st1w { z1.s }, p0, [x0]
; CHECK-NEXT:    ret
  %a = load <64 x float>, ptr %ap
  %b = load <64 x float>, ptr %bp
  %r = call <64 x float> @llvm.copysign.v64f32(<64 x float> %a, <64 x float> %b)
  store <64 x float> %r, ptr %ap
  ret void
}

;============ f64

define void @test_copysign_v2f64_v2f64(ptr %ap, ptr %bp) vscale_range(2,0) #0 {
; CHECK-LABEL: test_copysign_v2f64_v2f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    movi v0.2d, #0xffffffffffffffff
; CHECK-NEXT:    ldr q1, [x0]
; CHECK-NEXT:    ldr q2, [x1]
; CHECK-NEXT:    fneg v0.2d, v0.2d
; CHECK-NEXT:    bsl v0.16b, v1.16b, v2.16b
; CHECK-NEXT:    str q0, [x0]
; CHECK-NEXT:    ret
  %a = load <2 x double>, ptr %ap
  %b = load <2 x double>, ptr %bp
  %r = call <2 x double> @llvm.copysign.v2f64(<2 x double> %a, <2 x double> %b)
  store <2 x double> %r, ptr %ap
  ret void
}

define void @test_copysign_v4f64_v4f64(ptr %ap, ptr %bp) vscale_range(2,0) #0 {
; CHECK-LABEL: test_copysign_v4f64_v4f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl4
; CHECK-NEXT:    mov z0.d, #0x7fffffffffffffff
; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x0]
; CHECK-NEXT:    ld1d { z2.d }, p0/z, [x1]
; CHECK-NEXT:    bsl z1.d, z1.d, z2.d, z0.d
; CHECK-NEXT:    st1d { z1.d }, p0, [x0]
; CHECK-NEXT:    ret
  %a = load <4 x double>, ptr %ap
  %b = load <4 x double>, ptr %bp
  %r = call <4 x double> @llvm.copysign.v4f64(<4 x double> %a, <4 x double> %b)
  store <4 x double> %r, ptr %ap
  ret void
}

define void @test_copysign_v8f64_v8f64(ptr %ap, ptr %bp) #0 {
; VBITS_GE_256-LABEL: test_copysign_v8f64_v8f64:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
; VBITS_GE_256-NEXT:    mov z0.d, #0x7fffffffffffffff
; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT:    ld1d { z2.d }, p0/z, [x1, x8, lsl #3]
; VBITS_GE_256-NEXT:    ld1d { z3.d }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ld1d { z4.d }, p0/z, [x1]
; VBITS_GE_256-NEXT:    bsl z1.d, z1.d, z2.d, z0.d
; VBITS_GE_256-NEXT:    bsl z3.d, z3.d, z4.d, z0.d
; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT:    st1d { z3.d }, p0, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: test_copysign_v8f64_v8f64:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
; VBITS_GE_512-NEXT:    mov z0.d, #0x7fffffffffffffff
; VBITS_GE_512-NEXT:    ld1d { z1.d }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ld1d { z2.d }, p0/z, [x1]
; VBITS_GE_512-NEXT:    bsl z1.d, z1.d, z2.d, z0.d
; VBITS_GE_512-NEXT:    st1d { z1.d }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %a = load <8 x double>, ptr %ap
  %b = load <8 x double>, ptr %bp
  %r = call <8 x double> @llvm.copysign.v8f64(<8 x double> %a, <8 x double> %b)
  store <8 x double> %r, ptr %ap
  ret void
}

define void @test_copysign_v16f64_v16f64(ptr %ap, ptr %bp) vscale_range(8,0) #0 {
; CHECK-LABEL: test_copysign_v16f64_v16f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl16
; CHECK-NEXT:    mov z0.d, #0x7fffffffffffffff
; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x0]
; CHECK-NEXT:    ld1d { z2.d }, p0/z, [x1]
; CHECK-NEXT:    bsl z1.d, z1.d, z2.d, z0.d
; CHECK-NEXT:    st1d { z1.d }, p0, [x0]
; CHECK-NEXT:    ret
  %a = load <16 x double>, ptr %ap
  %b = load <16 x double>, ptr %bp
  %r = call <16 x double> @llvm.copysign.v16f64(<16 x double> %a, <16 x double> %b)
  store <16 x double> %r, ptr %ap
  ret void
}

define void @test_copysign_v32f64_v32f64(ptr %ap, ptr %bp) vscale_range(16,0) #0 {
; CHECK-LABEL: test_copysign_v32f64_v32f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl32
; CHECK-NEXT:    mov z0.d, #0x7fffffffffffffff
; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x0]
; CHECK-NEXT:    ld1d { z2.d }, p0/z, [x1]
; CHECK-NEXT:    bsl z1.d, z1.d, z2.d, z0.d
; CHECK-NEXT:    st1d { z1.d }, p0, [x0]
; CHECK-NEXT:    ret
  %a = load <32 x double>, ptr %ap
  %b = load <32 x double>, ptr %bp
  %r = call <32 x double> @llvm.copysign.v32f64(<32 x double> %a, <32 x double> %b)
  store <32 x double> %r, ptr %ap
  ret void
}

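; The tests below combine the copysign with an fptrunc or fpext of the sign
; operand; test_copysign_v4f64_v4f32 is the case where the generated code
; depends on the --combiner-vector-fcopysign-extend-round flag.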
;============ v2f32

define void @test_copysign_v2f32_v2f64(ptr %ap, ptr %bp) vscale_range(2,0) #0 {
; CHECK-LABEL: test_copysign_v2f32_v2f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x1]
; CHECK-NEXT:    mvni v1.2s, #128, lsl #24
; CHECK-NEXT:    ldr d2, [x0]
; CHECK-NEXT:    fcvtn v0.2s, v0.2d
; CHECK-NEXT:    bit v0.8b, v2.8b, v1.8b
; CHECK-NEXT:    str d0, [x0]
; CHECK-NEXT:    ret
  %a = load <2 x float>, ptr %ap
  %b = load <2 x double>, ptr %bp
  %tmp0 = fptrunc <2 x double> %b to <2 x float>
  %r = call <2 x float> @llvm.copysign.v2f32(<2 x float> %a, <2 x float> %tmp0)
  store <2 x float> %r, ptr %ap
  ret void
}

;============ v4f32

; SplitVecOp #1
define void @test_copysign_v4f32_v4f64(ptr %ap, ptr %bp) vscale_range(2,0) #0 {
; CHECK-LABEL: test_copysign_v4f32_v4f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl4
; CHECK-NEXT:    mvni v1.4s, #128, lsl #24
; CHECK-NEXT:    ldr q2, [x0]
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x1]
; CHECK-NEXT:    ptrue p0.d
; CHECK-NEXT:    fcvt z0.s, p0/m, z0.d
; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
; CHECK-NEXT:    bit v0.16b, v2.16b, v1.16b
; CHECK-NEXT:    str q0, [x0]
; CHECK-NEXT:    ret
  %a = load <4 x float>, ptr %ap
  %b = load <4 x double>, ptr %bp
  %tmp0 = fptrunc <4 x double> %b to <4 x float>
  %r = call <4 x float> @llvm.copysign.v4f32(<4 x float> %a, <4 x float> %tmp0)
  store <4 x float> %r, ptr %ap
  ret void
}

;============ v2f64

define void @test_copysign_v2f64_v2f32(ptr %ap, ptr %bp) vscale_range(2,0) #0 {
; CHECK-LABEL: test_copysign_v2f64_v2f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    movi v0.2d, #0xffffffffffffffff
; CHECK-NEXT:    ldr d1, [x1]
; CHECK-NEXT:    ldr q2, [x0]
; CHECK-NEXT:    fcvtl v1.2d, v1.2s
; CHECK-NEXT:    fneg v0.2d, v0.2d
; CHECK-NEXT:    bsl v0.16b, v2.16b, v1.16b
; CHECK-NEXT:    str q0, [x0]
; CHECK-NEXT:    ret
  %a = load <2 x double>, ptr %ap
  %b = load <2 x float>, ptr %bp
  %tmp0 = fpext <2 x float> %b to <2 x double>
  %r = call <2 x double> @llvm.copysign.v2f64(<2 x double> %a, <2 x double> %tmp0)
  store <2 x double> %r, ptr %ap
  ret void
}

;============ v4f64

; SplitVecRes mismatched
define void @test_copysign_v4f64_v4f32(ptr %ap, ptr %bp) vscale_range(2,0) #0 {
; CHECK_NO_EXTEND_ROUND-LABEL: test_copysign_v4f64_v4f32:
; CHECK_NO_EXTEND_ROUND:       // %bb.0:
; CHECK_NO_EXTEND_ROUND-NEXT:    ptrue p0.d, vl4
; CHECK_NO_EXTEND_ROUND-NEXT:    mov z1.d, #0x7fffffffffffffff
; CHECK_NO_EXTEND_ROUND-NEXT:    ld1w { z0.d }, p0/z, [x1]
; CHECK_NO_EXTEND_ROUND-NEXT:    ld1d { z2.d }, p0/z, [x0]
; CHECK_NO_EXTEND_ROUND-NEXT:    fcvt z0.d, p0/m, z0.s
; CHECK_NO_EXTEND_ROUND-NEXT:    bsl z2.d, z2.d, z0.d, z1.d
; CHECK_NO_EXTEND_ROUND-NEXT:    st1d { z2.d }, p0, [x0]
; CHECK_NO_EXTEND_ROUND-NEXT:    ret
;
; CHECK_EXTEND_ROUND-LABEL: test_copysign_v4f64_v4f32:
; CHECK_EXTEND_ROUND:       // %bb.0:
; CHECK_EXTEND_ROUND-NEXT:    ldr q0, [x1]
; CHECK_EXTEND_ROUND-NEXT:    ptrue p0.d, vl4
; CHECK_EXTEND_ROUND-NEXT:    mov z1.d, #0x7fffffffffffffff
; CHECK_EXTEND_ROUND-NEXT:    uunpklo z0.d, z0.s
; CHECK_EXTEND_ROUND-NEXT:    ld1d { z2.d }, p0/z, [x0]
; CHECK_EXTEND_ROUND-NEXT:    fcvt z0.d, p0/m, z0.s
; CHECK_EXTEND_ROUND-NEXT:    bsl z2.d, z2.d, z0.d, z1.d
; CHECK_EXTEND_ROUND-NEXT:    st1d { z2.d }, p0, [x0]
; CHECK_EXTEND_ROUND-NEXT:    ret
  %a = load <4 x double>, ptr %ap
  %b = load <4 x float>, ptr %bp
  %tmp0 = fpext <4 x float> %b to <4 x double>
  %r = call <4 x double> @llvm.copysign.v4f64(<4 x double> %a, <4 x double> %tmp0)
  store <4 x double> %r, ptr %ap
  ret void
}

;============ v4f16

define void @test_copysign_v4f16_v4f32(ptr %ap, ptr %bp) vscale_range(2,0) #0 {
; CHECK-LABEL: test_copysign_v4f16_v4f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x1]
; CHECK-NEXT:    mvni v1.4h, #128, lsl #8
; CHECK-NEXT:    ldr d2, [x0]
; CHECK-NEXT:    fcvtn v0.4h, v0.4s
; CHECK-NEXT:    bit v0.8b, v2.8b, v1.8b
; CHECK-NEXT:    str d0, [x0]
; CHECK-NEXT:    ret
  %a = load <4 x half>, ptr %ap
  %b = load <4 x float>, ptr %bp
  %tmp0 = fptrunc <4 x float> %b to <4 x half>
  %r = call <4 x half> @llvm.copysign.v4f16(<4 x half> %a, <4 x half> %tmp0)
  store <4 x half> %r, ptr %ap
  ret void
}

define void @test_copysign_v4f16_v4f64(ptr %ap, ptr %bp) vscale_range(2,0) #0 {
; CHECK-LABEL: test_copysign_v4f16_v4f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl4
; CHECK-NEXT:    mvni v1.4h, #128, lsl #8
; CHECK-NEXT:    ldr d2, [x0]
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x1]
; CHECK-NEXT:    ptrue p0.d
; CHECK-NEXT:    fcvt z0.h, p0/m, z0.d
; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
; CHECK-NEXT:    bit v0.8b, v2.8b, v1.8b
; CHECK-NEXT:    str d0, [x0]
; CHECK-NEXT:    ret
  %a = load <4 x half>, ptr %ap
  %b = load <4 x double>, ptr %bp
  %tmp0 = fptrunc <4 x double> %b to <4 x half>
  %r = call <4 x half> @llvm.copysign.v4f16(<4 x half> %a, <4 x half> %tmp0)
  store <4 x half> %r, ptr %ap
  ret void
}

declare <4 x half> @llvm.copysign.v4f16(<4 x half> %a, <4 x half> %b) #0

;============ v8f16

define void @test_copysign_v8f16_v8f32(ptr %ap, ptr %bp) vscale_range(2,0) #0 {
; CHECK-LABEL: test_copysign_v8f16_v8f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl8
; CHECK-NEXT:    mvni v1.8h, #128, lsl #8
; CHECK-NEXT:    ldr q2, [x0]
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x1]
; CHECK-NEXT:    ptrue p0.s
; CHECK-NEXT:    fcvt z0.h, p0/m, z0.s
; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
; CHECK-NEXT:    bit v0.16b, v2.16b, v1.16b
; CHECK-NEXT:    str q0, [x0]
; CHECK-NEXT:    ret
  %a = load <8 x half>, ptr %ap
  %b = load <8 x float>, ptr %bp
  %tmp0 = fptrunc <8 x float> %b to <8 x half>
  %r = call <8 x half> @llvm.copysign.v8f16(<8 x half> %a, <8 x half> %tmp0)
  store <8 x half> %r, ptr %ap
  ret void
}

declare <8 x half> @llvm.copysign.v8f16(<8 x half> %a, <8 x half> %b) #0
declare <16 x half> @llvm.copysign.v16f16(<16 x half> %a, <16 x half> %b) #0
declare <32 x half> @llvm.copysign.v32f16(<32 x half> %a, <32 x half> %b) #0
declare <64 x half> @llvm.copysign.v64f16(<64 x half> %a, <64 x half> %b) #0
declare <128 x half> @llvm.copysign.v128f16(<128 x half> %a, <128 x half> %b) #0

declare <2 x float> @llvm.copysign.v2f32(<2 x float> %a, <2 x float> %b) #0
declare <4 x float> @llvm.copysign.v4f32(<4 x float> %a, <4 x float> %b) #0
declare <8 x float> @llvm.copysign.v8f32(<8 x float> %a, <8 x float> %b) #0
declare <16 x float> @llvm.copysign.v16f32(<16 x float> %a, <16 x float> %b) #0
declare <32 x float> @llvm.copysign.v32f32(<32 x float> %a, <32 x float> %b) #0
declare <64 x float> @llvm.copysign.v64f32(<64 x float> %a, <64 x float> %b) #0

declare <2 x double> @llvm.copysign.v2f64(<2 x double> %a, <2 x double> %b) #0
declare <4 x double> @llvm.copysign.v4f64(<4 x double> %a, <4 x double> %b) #0
declare <8 x double> @llvm.copysign.v8f64(<8 x double> %a, <8 x double> %b) #0
declare <16 x double> @llvm.copysign.v16f64(<16 x double> %a, <16 x double> %b) #0
declare <32 x double> @llvm.copysign.v32f64(<32 x double> %a, <32 x double> %b) #0

attributes #0 = { "target-features"="+sve2" }