; xref: /llvm-project/llvm/test/CodeGen/AArch64/sve-fixed-length-fcopysign.ll (revision cc82f1290a1e2157a6c0530d78d8cc84d2b8553d)
; Fixed-length vector lowering tests for llvm.copysign.* on AArch64 with SVE.
; Each RUN line pins the minimum SVE vector width; the 256-bit runs use the
; VBITS_GE_256 prefix and the 512/2048-bit runs share VBITS_GE_512. The
; --combiner-vector-fcopysign-extend-round runs select the CHECK_EXTEND_ROUND
; prefix instead of CHECK_NO_EXTEND_ROUND.
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -aarch64-sve-vector-bits-min=256  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256,CHECK_NO_EXTEND_ROUND
; RUN: llc -aarch64-sve-vector-bits-min=512  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,CHECK_NO_EXTEND_ROUND
; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,CHECK_NO_EXTEND_ROUND
; RUN: llc -aarch64-sve-vector-bits-min=256  --combiner-vector-fcopysign-extend-round < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256,CHECK_EXTEND_ROUND
; RUN: llc -aarch64-sve-vector-bits-min=512  --combiner-vector-fcopysign-extend-round < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,CHECK_EXTEND_ROUND
; RUN: llc -aarch64-sve-vector-bits-min=2048 --combiner-vector-fcopysign-extend-round < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,CHECK_EXTEND_ROUND

target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"

target triple = "aarch64-unknown-linux-gnu"

;============ f16

; <4 x half> fits a NEON D register: expect mvni sign-mask + bsl, no SVE ops.
define void @test_copysign_v4f16_v4f16(ptr %ap, ptr %bp) vscale_range(2,0) #0 {
; CHECK-LABEL: test_copysign_v4f16_v4f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mvni v0.4h, #128, lsl #8
; CHECK-NEXT:    ldr d1, [x0]
; CHECK-NEXT:    ldr d2, [x1]
; CHECK-NEXT:    bsl v0.8b, v1.8b, v2.8b
; CHECK-NEXT:    str d0, [x0]
; CHECK-NEXT:    ret
  %a = load <4 x half>, ptr %ap
  %b = load <4 x half>, ptr %bp
  %r = call <4 x half> @llvm.copysign.v4f16(<4 x half> %a, <4 x half> %b)
  store <4 x half> %r, ptr %ap
  ret void
}

; <8 x half> fits a NEON Q register: same mvni + bsl pattern at 128 bits.
define void @test_copysign_v8f16_v8f16(ptr %ap, ptr %bp) vscale_range(2,0) #0 {
; CHECK-LABEL: test_copysign_v8f16_v8f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mvni v0.8h, #128, lsl #8
; CHECK-NEXT:    ldr q1, [x0]
; CHECK-NEXT:    ldr q2, [x1]
; CHECK-NEXT:    bsl v0.16b, v1.16b, v2.16b
; CHECK-NEXT:    str q0, [x0]
; CHECK-NEXT:    ret
  %a = load <8 x half>, ptr %ap
  %b = load <8 x half>, ptr %bp
  %r = call <8 x half> @llvm.copysign.v8f16(<8 x half> %a, <8 x half> %b)
  store <8 x half> %r, ptr %ap
  ret void
}

; 256-bit vector: lowered via SVE predicated loads and the bitwise
; copysign expansion (sign bits of %b OR magnitude bits of %a).
define void @test_copysign_v16f16_v16f16(ptr %ap, ptr %bp) vscale_range(2,0) #0 {
; CHECK-LABEL: test_copysign_v16f16_v16f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl16
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
; CHECK-NEXT:    and z1.h, z1.h, #0x8000
; CHECK-NEXT:    and z0.h, z0.h, #0x7fff
; CHECK-NEXT:    orr z0.d, z0.d, z1.d
; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
; CHECK-NEXT:    ret
  %a = load <16 x half>, ptr %ap
  %b = load <16 x half>, ptr %bp
  %r = call <16 x half> @llvm.copysign.v16f16(<16 x half> %a, <16 x half> %b)
  store <16 x half> %r, ptr %ap
  ret void
}

; 512-bit vector, no vscale_range: at 256-bit SVE the operation is split
; into two halves addressed via x8; at >=512 bits it is a single vl32 op.
define void @test_copysign_v32f16_v32f16(ptr %ap, ptr %bp) #0 {
; VBITS_GE_256-LABEL: test_copysign_v32f16_v32f16:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x1, x8, lsl #1]
; VBITS_GE_256-NEXT:    ld1h { z2.h }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ld1h { z3.h }, p0/z, [x1]
; VBITS_GE_256-NEXT:    and z1.h, z1.h, #0x8000
; VBITS_GE_256-NEXT:    and z0.h, z0.h, #0x7fff
; VBITS_GE_256-NEXT:    and z2.h, z2.h, #0x7fff
; VBITS_GE_256-NEXT:    and z3.h, z3.h, #0x8000
; VBITS_GE_256-NEXT:    orr z0.d, z0.d, z1.d
; VBITS_GE_256-NEXT:    orr z1.d, z2.d, z3.d
; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: test_copysign_v32f16_v32f16:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ld1h { z1.h }, p0/z, [x1]
; VBITS_GE_512-NEXT:    and z1.h, z1.h, #0x8000
; VBITS_GE_512-NEXT:    and z0.h, z0.h, #0x7fff
; VBITS_GE_512-NEXT:    orr z0.d, z0.d, z1.d
; VBITS_GE_512-NEXT:    st1h { z0.h }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %a = load <32 x half>, ptr %ap
  %b = load <32 x half>, ptr %bp
  %r = call <32 x half> @llvm.copysign.v32f16(<32 x half> %a, <32 x half> %b)
  store <32 x half> %r, ptr %ap
  ret void
}

; 1024-bit vector with vscale_range(8,0): single vl64 predicated sequence.
define void @test_copysign_v64f16_v64f16(ptr %ap, ptr %bp) vscale_range(8,0) #0 {
; CHECK-LABEL: test_copysign_v64f16_v64f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl64
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
; CHECK-NEXT:    and z1.h, z1.h, #0x8000
; CHECK-NEXT:    and z0.h, z0.h, #0x7fff
; CHECK-NEXT:    orr z0.d, z0.d, z1.d
; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
; CHECK-NEXT:    ret
  %a = load <64 x half>, ptr %ap
  %b = load <64 x half>, ptr %bp
  %r = call <64 x half> @llvm.copysign.v64f16(<64 x half> %a, <64 x half> %b)
  store <64 x half> %r, ptr %ap
  ret void
}

; 2048-bit vector with vscale_range(16,0): single vl128 predicated sequence.
define void @test_copysign_v128f16_v128f16(ptr %ap, ptr %bp) vscale_range(16,0) #0 {
; CHECK-LABEL: test_copysign_v128f16_v128f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl128
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
; CHECK-NEXT:    and z1.h, z1.h, #0x8000
; CHECK-NEXT:    and z0.h, z0.h, #0x7fff
; CHECK-NEXT:    orr z0.d, z0.d, z1.d
; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
; CHECK-NEXT:    ret
  %a = load <128 x half>, ptr %ap
  %b = load <128 x half>, ptr %bp
  %r = call <128 x half> @llvm.copysign.v128f16(<128 x half> %a, <128 x half> %b)
  store <128 x half> %r, ptr %ap
  ret void
}

;============ f32

; <2 x float> fits a NEON D register: mvni sign mask (bit 31) + bsl.
define void @test_copysign_v2f32_v2f32(ptr %ap, ptr %bp) vscale_range(2,0) #0 {
; CHECK-LABEL: test_copysign_v2f32_v2f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mvni v0.2s, #128, lsl #24
; CHECK-NEXT:    ldr d1, [x0]
; CHECK-NEXT:    ldr d2, [x1]
; CHECK-NEXT:    bsl v0.8b, v1.8b, v2.8b
; CHECK-NEXT:    str d0, [x0]
; CHECK-NEXT:    ret
  %a = load <2 x float>, ptr %ap
  %b = load <2 x float>, ptr %bp
  %r = call <2 x float> @llvm.copysign.v2f32(<2 x float> %a, <2 x float> %b)
  store <2 x float> %r, ptr %ap
  ret void
}

; <4 x float> fits a NEON Q register: mvni sign mask + bsl at 128 bits.
define void @test_copysign_v4f32_v4f32(ptr %ap, ptr %bp) vscale_range(2,0) #0 {
; CHECK-LABEL: test_copysign_v4f32_v4f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mvni v0.4s, #128, lsl #24
; CHECK-NEXT:    ldr q1, [x0]
; CHECK-NEXT:    ldr q2, [x1]
; CHECK-NEXT:    bsl v0.16b, v1.16b, v2.16b
; CHECK-NEXT:    str q0, [x0]
; CHECK-NEXT:    ret
  %a = load <4 x float>, ptr %ap
  %b = load <4 x float>, ptr %bp
  %r = call <4 x float> @llvm.copysign.v4f32(<4 x float> %a, <4 x float> %b)
  store <4 x float> %r, ptr %ap
  ret void
}

; 256-bit f32 vector: SVE vl8 loads, sign/magnitude masks, orr, store.
define void @test_copysign_v8f32_v8f32(ptr %ap, ptr %bp) vscale_range(2,0) #0 {
; CHECK-LABEL: test_copysign_v8f32_v8f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl8
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT:    and z1.s, z1.s, #0x80000000
; CHECK-NEXT:    and z0.s, z0.s, #0x7fffffff
; CHECK-NEXT:    orr z0.d, z0.d, z1.d
; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
; CHECK-NEXT:    ret
  %a = load <8 x float>, ptr %ap
  %b = load <8 x float>, ptr %bp
  %r = call <8 x float> @llvm.copysign.v8f32(<8 x float> %a, <8 x float> %b)
  store <8 x float> %r, ptr %ap
  ret void
}

; 512-bit f32 vector: split into two vl8 halves at 256-bit SVE,
; single vl16 sequence at >=512 bits.
define void @test_copysign_v16f32_v16f32(ptr %ap, ptr %bp) #0 {
; VBITS_GE_256-LABEL: test_copysign_v16f32_v16f32:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x1, x8, lsl #2]
; VBITS_GE_256-NEXT:    ld1w { z2.s }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ld1w { z3.s }, p0/z, [x1]
; VBITS_GE_256-NEXT:    and z1.s, z1.s, #0x80000000
; VBITS_GE_256-NEXT:    and z0.s, z0.s, #0x7fffffff
; VBITS_GE_256-NEXT:    and z2.s, z2.s, #0x7fffffff
; VBITS_GE_256-NEXT:    and z3.s, z3.s, #0x80000000
; VBITS_GE_256-NEXT:    orr z0.d, z0.d, z1.d
; VBITS_GE_256-NEXT:    orr z1.d, z2.d, z3.d
; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: test_copysign_v16f32_v16f32:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ld1w { z1.s }, p0/z, [x1]
; VBITS_GE_512-NEXT:    and z1.s, z1.s, #0x80000000
; VBITS_GE_512-NEXT:    and z0.s, z0.s, #0x7fffffff
; VBITS_GE_512-NEXT:    orr z0.d, z0.d, z1.d
; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %a = load <16 x float>, ptr %ap
  %b = load <16 x float>, ptr %bp
  %r = call <16 x float> @llvm.copysign.v16f32(<16 x float> %a, <16 x float> %b)
  store <16 x float> %r, ptr %ap
  ret void
}

; 1024-bit f32 vector with vscale_range(8,0): single vl32 sequence.
define void @test_copysign_v32f32_v32f32(ptr %ap, ptr %bp) vscale_range(8,0) #0 {
; CHECK-LABEL: test_copysign_v32f32_v32f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl32
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT:    and z1.s, z1.s, #0x80000000
; CHECK-NEXT:    and z0.s, z0.s, #0x7fffffff
; CHECK-NEXT:    orr z0.d, z0.d, z1.d
; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
; CHECK-NEXT:    ret
  %a = load <32 x float>, ptr %ap
  %b = load <32 x float>, ptr %bp
  %r = call <32 x float> @llvm.copysign.v32f32(<32 x float> %a, <32 x float> %b)
  store <32 x float> %r, ptr %ap
  ret void
}

; 2048-bit f32 vector with vscale_range(16,0): single vl64 sequence.
define void @test_copysign_v64f32_v64f32(ptr %ap, ptr %bp) vscale_range(16,0) #0 {
; CHECK-LABEL: test_copysign_v64f32_v64f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl64
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT:    and z1.s, z1.s, #0x80000000
; CHECK-NEXT:    and z0.s, z0.s, #0x7fffffff
; CHECK-NEXT:    orr z0.d, z0.d, z1.d
; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
; CHECK-NEXT:    ret
  %a = load <64 x float>, ptr %ap
  %b = load <64 x float>, ptr %bp
  %r = call <64 x float> @llvm.copysign.v64f32(<64 x float> %a, <64 x float> %b)
  store <64 x float> %r, ptr %ap
  ret void
}

;============ f64

; <2 x double>: NEON path builds the 0x7ff... mask via movi(-1) + fneg
; (no 64-bit mvni immediate form), then selects with bsl.
define void @test_copysign_v2f64_v2f64(ptr %ap, ptr %bp) vscale_range(2,0) #0 {
; CHECK-LABEL: test_copysign_v2f64_v2f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    movi v0.2d, #0xffffffffffffffff
; CHECK-NEXT:    ldr q1, [x0]
; CHECK-NEXT:    ldr q2, [x1]
; CHECK-NEXT:    fneg v0.2d, v0.2d
; CHECK-NEXT:    bsl v0.16b, v1.16b, v2.16b
; CHECK-NEXT:    str q0, [x0]
; CHECK-NEXT:    ret
  %a = load <2 x double>, ptr %ap
  %b = load <2 x double>, ptr %bp
  %r = call <2 x double> @llvm.copysign.v2f64(<2 x double> %a, <2 x double> %b)
  store <2 x double> %r, ptr %ap
  ret void
}

; 256-bit f64 vector: SVE vl4 loads with 64-bit sign/magnitude masks.
define void @test_copysign_v4f64_v4f64(ptr %ap, ptr %bp) vscale_range(2,0) #0 {
; CHECK-LABEL: test_copysign_v4f64_v4f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl4
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT:    and z1.d, z1.d, #0x8000000000000000
; CHECK-NEXT:    and z0.d, z0.d, #0x7fffffffffffffff
; CHECK-NEXT:    orr z0.d, z0.d, z1.d
; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
; CHECK-NEXT:    ret
  %a = load <4 x double>, ptr %ap
  %b = load <4 x double>, ptr %bp
  %r = call <4 x double> @llvm.copysign.v4f64(<4 x double> %a, <4 x double> %b)
  store <4 x double> %r, ptr %ap
  ret void
}

; 512-bit f64 vector: split into two vl4 halves at 256-bit SVE,
; single vl8 sequence at >=512 bits.
define void @test_copysign_v8f64_v8f64(ptr %ap, ptr %bp) #0 {
; VBITS_GE_256-LABEL: test_copysign_v8f64_v8f64:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x1, x8, lsl #3]
; VBITS_GE_256-NEXT:    ld1d { z2.d }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ld1d { z3.d }, p0/z, [x1]
; VBITS_GE_256-NEXT:    and z1.d, z1.d, #0x8000000000000000
; VBITS_GE_256-NEXT:    and z0.d, z0.d, #0x7fffffffffffffff
; VBITS_GE_256-NEXT:    and z2.d, z2.d, #0x7fffffffffffffff
; VBITS_GE_256-NEXT:    and z3.d, z3.d, #0x8000000000000000
; VBITS_GE_256-NEXT:    orr z0.d, z0.d, z1.d
; VBITS_GE_256-NEXT:    orr z1.d, z2.d, z3.d
; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: test_copysign_v8f64_v8f64:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ld1d { z1.d }, p0/z, [x1]
; VBITS_GE_512-NEXT:    and z1.d, z1.d, #0x8000000000000000
; VBITS_GE_512-NEXT:    and z0.d, z0.d, #0x7fffffffffffffff
; VBITS_GE_512-NEXT:    orr z0.d, z0.d, z1.d
; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %a = load <8 x double>, ptr %ap
  %b = load <8 x double>, ptr %bp
  %r = call <8 x double> @llvm.copysign.v8f64(<8 x double> %a, <8 x double> %b)
  store <8 x double> %r, ptr %ap
  ret void
}

; 1024-bit f64 vector with vscale_range(8,0): single vl16 sequence.
define void @test_copysign_v16f64_v16f64(ptr %ap, ptr %bp) vscale_range(8,0) #0 {
; CHECK-LABEL: test_copysign_v16f64_v16f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl16
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT:    and z1.d, z1.d, #0x8000000000000000
; CHECK-NEXT:    and z0.d, z0.d, #0x7fffffffffffffff
; CHECK-NEXT:    orr z0.d, z0.d, z1.d
; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
; CHECK-NEXT:    ret
  %a = load <16 x double>, ptr %ap
  %b = load <16 x double>, ptr %bp
  %r = call <16 x double> @llvm.copysign.v16f64(<16 x double> %a, <16 x double> %b)
  store <16 x double> %r, ptr %ap
  ret void
}

; 2048-bit f64 vector with vscale_range(16,0): single vl32 sequence.
define void @test_copysign_v32f64_v32f64(ptr %ap, ptr %bp) vscale_range(16,0) #0 {
; CHECK-LABEL: test_copysign_v32f64_v32f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl32
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT:    and z1.d, z1.d, #0x8000000000000000
; CHECK-NEXT:    and z0.d, z0.d, #0x7fffffffffffffff
; CHECK-NEXT:    orr z0.d, z0.d, z1.d
; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
; CHECK-NEXT:    ret
  %a = load <32 x double>, ptr %ap
  %b = load <32 x double>, ptr %bp
  %r = call <32 x double> @llvm.copysign.v32f64(<32 x double> %a, <32 x double> %b)
  store <32 x double> %r, ptr %ap
  ret void
}

;============ v2f32

; Mixed types: sign source is <2 x double> truncated (fcvtn) to <2 x float>
; before the NEON bit-select.
define void @test_copysign_v2f32_v2f64(ptr %ap, ptr %bp) vscale_range(2,0) #0 {
; CHECK-LABEL: test_copysign_v2f32_v2f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x1]
; CHECK-NEXT:    mvni v1.2s, #128, lsl #24
; CHECK-NEXT:    ldr d2, [x0]
; CHECK-NEXT:    fcvtn v0.2s, v0.2d
; CHECK-NEXT:    bit v0.8b, v2.8b, v1.8b
; CHECK-NEXT:    str d0, [x0]
; CHECK-NEXT:    ret
  %a = load <2 x float>, ptr %ap
  %b = load <2 x double>, ptr %bp
  %tmp0 = fptrunc <2 x double> %b to <2 x float>
  %r = call <2 x float> @llvm.copysign.v2f32(<2 x float> %a, <2 x float> %tmp0)
  store <2 x float> %r, ptr %ap
  ret void
}

;============ v4f32

; SplitVecOp #1
; Mixed types, SplitVecOp case: <4 x double> sign source is truncated with
; SVE fcvt + uzp1 narrowing before the NEON bit-select.
define void @test_copysign_v4f32_v4f64(ptr %ap, ptr %bp) vscale_range(2,0) #0 {
; CHECK-LABEL: test_copysign_v4f32_v4f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl4
; CHECK-NEXT:    mvni v1.4s, #128, lsl #24
; CHECK-NEXT:    ldr q2, [x0]
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x1]
; CHECK-NEXT:    ptrue p0.d
; CHECK-NEXT:    fcvt z0.s, p0/m, z0.d
; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
; CHECK-NEXT:    bit v0.16b, v2.16b, v1.16b
; CHECK-NEXT:    str q0, [x0]
; CHECK-NEXT:    ret
  %a = load <4 x float>, ptr %ap
  %b = load <4 x double>, ptr %bp
  %tmp0 = fptrunc <4 x double> %b to <4 x float>
  %r = call <4 x float> @llvm.copysign.v4f32(<4 x float> %a, <4 x float> %tmp0)
  store <4 x float> %r, ptr %ap
  ret void
}

;============ v2f64

; Mixed types: <2 x float> sign source is extended (fcvtl) to <2 x double>
; before the NEON bit-select with the movi+fneg mask.
define void @test_copysign_v2f64_v2f32(ptr %ap, ptr %bp) vscale_range(2,0) #0 {
; CHECK-LABEL: test_copysign_v2f64_v2f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    movi v0.2d, #0xffffffffffffffff
; CHECK-NEXT:    ldr d1, [x1]
; CHECK-NEXT:    ldr q2, [x0]
; CHECK-NEXT:    fcvtl v1.2d, v1.2s
; CHECK-NEXT:    fneg v0.2d, v0.2d
; CHECK-NEXT:    bsl v0.16b, v2.16b, v1.16b
; CHECK-NEXT:    str q0, [x0]
; CHECK-NEXT:    ret
  %a = load <2 x double>, ptr %ap
  %b = load < 2 x float>, ptr %bp
  %tmp0 = fpext <2 x float> %b to <2 x double>
  %r = call <2 x double> @llvm.copysign.v2f64(<2 x double> %a, <2 x double> %tmp0)
  store <2 x double> %r, ptr %ap
  ret void
}

;============ v4f64

; SplitVecRes mismatched
; SplitVecRes mismatched-type case: only here do the two combiner modes
; diverge — without extend-round the extend is folded into a predicated
; ld1w, with it the input is loaded whole and widened via uunpklo.
define void @test_copysign_v4f64_v4f32(ptr %ap, ptr %bp) vscale_range(2,0) #0 {
; CHECK_NO_EXTEND_ROUND-LABEL: test_copysign_v4f64_v4f32:
; CHECK_NO_EXTEND_ROUND:       // %bb.0:
; CHECK_NO_EXTEND_ROUND-NEXT:    ptrue p0.d, vl4
; CHECK_NO_EXTEND_ROUND-NEXT:    ld1w { z0.d }, p0/z, [x1]
; CHECK_NO_EXTEND_ROUND-NEXT:    ld1d { z1.d }, p0/z, [x0]
; CHECK_NO_EXTEND_ROUND-NEXT:    fcvt z0.d, p0/m, z0.s
; CHECK_NO_EXTEND_ROUND-NEXT:    and z1.d, z1.d, #0x7fffffffffffffff
; CHECK_NO_EXTEND_ROUND-NEXT:    and z0.d, z0.d, #0x8000000000000000
; CHECK_NO_EXTEND_ROUND-NEXT:    orr z0.d, z1.d, z0.d
; CHECK_NO_EXTEND_ROUND-NEXT:    st1d { z0.d }, p0, [x0]
; CHECK_NO_EXTEND_ROUND-NEXT:    ret
;
; CHECK_EXTEND_ROUND-LABEL: test_copysign_v4f64_v4f32:
; CHECK_EXTEND_ROUND:       // %bb.0:
; CHECK_EXTEND_ROUND-NEXT:    ldr q0, [x1]
; CHECK_EXTEND_ROUND-NEXT:    ptrue p0.d, vl4
; CHECK_EXTEND_ROUND-NEXT:    uunpklo z0.d, z0.s
; CHECK_EXTEND_ROUND-NEXT:    ld1d { z1.d }, p0/z, [x0]
; CHECK_EXTEND_ROUND-NEXT:    and z1.d, z1.d, #0x7fffffffffffffff
; CHECK_EXTEND_ROUND-NEXT:    fcvt z0.d, p0/m, z0.s
; CHECK_EXTEND_ROUND-NEXT:    and z0.d, z0.d, #0x8000000000000000
; CHECK_EXTEND_ROUND-NEXT:    orr z0.d, z1.d, z0.d
; CHECK_EXTEND_ROUND-NEXT:    st1d { z0.d }, p0, [x0]
; CHECK_EXTEND_ROUND-NEXT:    ret
  %a = load <4 x double>, ptr %ap
  %b = load <4 x float>, ptr %bp
  %tmp0 = fpext <4 x float> %b to <4 x double>
  %r = call <4 x double> @llvm.copysign.v4f64(<4 x double> %a, <4 x double> %tmp0)
  store <4 x double> %r, ptr %ap
  ret void
}

;============ v4f16

; Mixed types: <4 x float> sign source is truncated (fcvtn) to <4 x half>
; before the NEON bit-select.
define void @test_copysign_v4f16_v4f32(ptr %ap, ptr %bp) vscale_range(2,0) #0 {
; CHECK-LABEL: test_copysign_v4f16_v4f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x1]
; CHECK-NEXT:    mvni v1.4h, #128, lsl #8
; CHECK-NEXT:    ldr d2, [x0]
; CHECK-NEXT:    fcvtn v0.4h, v0.4s
; CHECK-NEXT:    bit v0.8b, v2.8b, v1.8b
; CHECK-NEXT:    str d0, [x0]
; CHECK-NEXT:    ret
  %a = load <4 x half>, ptr %ap
  %b = load <4 x float>, ptr %bp
  %tmp0 = fptrunc <4 x float> %b to <4 x half>
  %r = call <4 x half> @llvm.copysign.v4f16(<4 x half> %a, <4 x half> %tmp0)
  store <4 x half> %r, ptr %ap
  ret void
}

; Mixed types: <4 x double> sign source is narrowed two steps
; (SVE fcvt d->h, then uzp1 .s and .h) before the NEON bit-select.
define void @test_copysign_v4f16_v4f64(ptr %ap, ptr %bp) vscale_range(2,0) #0 {
; CHECK-LABEL: test_copysign_v4f16_v4f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl4
; CHECK-NEXT:    mvni v1.4h, #128, lsl #8
; CHECK-NEXT:    ldr d2, [x0]
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x1]
; CHECK-NEXT:    ptrue p0.d
; CHECK-NEXT:    fcvt z0.h, p0/m, z0.d
; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
; CHECK-NEXT:    bit v0.8b, v2.8b, v1.8b
; CHECK-NEXT:    str d0, [x0]
; CHECK-NEXT:    ret
  %a = load <4 x half>, ptr %ap
  %b = load <4 x double>, ptr %bp
  %tmp0 = fptrunc <4 x double> %b to <4 x half>
  %r = call <4 x half> @llvm.copysign.v4f16(<4 x half> %a, <4 x half> %tmp0)
  store <4 x half> %r, ptr %ap
  ret void
}

; Intrinsic declaration used by the v4f16 tests above.
declare <4 x half> @llvm.copysign.v4f16(<4 x half> %a, <4 x half> %b) #0

;============ v8f16


; Mixed types: <8 x float> sign source is narrowed with SVE fcvt + uzp1
; before the NEON bit-select.
define void @test_copysign_v8f16_v8f32(ptr %ap, ptr %bp) vscale_range(2,0) #0 {
; CHECK-LABEL: test_copysign_v8f16_v8f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl8
; CHECK-NEXT:    mvni v1.8h, #128, lsl #8
; CHECK-NEXT:    ldr q2, [x0]
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x1]
; CHECK-NEXT:    ptrue p0.s
; CHECK-NEXT:    fcvt z0.h, p0/m, z0.s
; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
; CHECK-NEXT:    bit v0.16b, v2.16b, v1.16b
; CHECK-NEXT:    str q0, [x0]
; CHECK-NEXT:    ret
  %a = load <8 x half>, ptr %ap
  %b = load <8 x float>, ptr %bp
  %tmp0 = fptrunc <8 x float> %b to <8 x half>
  %r = call <8 x half> @llvm.copysign.v8f16(<8 x half> %a, <8 x half> %tmp0)
  store <8 x half> %r, ptr %ap
  ret void
}

; Remaining intrinsic declarations, grouped by element type, and the shared
; function attributes (+sve is required by every RUN configuration).
declare <8 x half> @llvm.copysign.v8f16(<8 x half> %a, <8 x half> %b) #0
declare <16 x half> @llvm.copysign.v16f16(<16 x half> %a, <16 x half> %b) #0
declare <32 x half> @llvm.copysign.v32f16(<32 x half> %a, <32 x half> %b) #0
declare <64 x half> @llvm.copysign.v64f16(<64 x half> %a, <64 x half> %b) #0
declare <128 x half> @llvm.copysign.v128f16(<128 x half> %a, <128 x half> %b) #0

declare <2 x float> @llvm.copysign.v2f32(<2 x float> %a, <2 x float> %b) #0
declare <4 x float> @llvm.copysign.v4f32(<4 x float> %a, <4 x float> %b) #0
declare <8 x float> @llvm.copysign.v8f32(<8 x float> %a, <8 x float> %b) #0
declare <16 x float> @llvm.copysign.v16f32(<16 x float> %a, <16 x float> %b) #0
declare <32 x float> @llvm.copysign.v32f32(<32 x float> %a, <32 x float> %b) #0
declare <64 x float> @llvm.copysign.v64f32(<64 x float> %a, <64 x float> %b) #0

declare <2 x double> @llvm.copysign.v2f64(<2 x double> %a, <2 x double> %b) #0
declare <4 x double> @llvm.copysign.v4f64(<4 x double> %a, <4 x double> %b) #0
declare <8 x double> @llvm.copysign.v8f64(<8 x double> %a, <8 x double> %b) #0
declare <16 x double> @llvm.copysign.v16f64(<16 x double> %a, <16 x double> %b) #0
declare <32 x double> @llvm.copysign.v32f64(<32 x double> %a, <32 x double> %b) #0

attributes #0 = { "target-features"="+sve" }
