xref: /llvm-project/llvm/test/CodeGen/AMDGPU/fcopysign.f64.ll (revision 6548b6354d1d990e1c98736f5e7c3de876bedc8e)
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefixes=SIVI,SI %s
; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=SIVI,VI %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11 %s
; Declarations of the copysign intrinsic for the scalar double type and the
; 2-, 3- and 4-element double vectors exercised by the kernels in this file.
declare double @llvm.copysign.f64(double, double) #0
declare <2 x double> @llvm.copysign.v2f64(<2 x double>, <2 x double>) #0
declare <3 x double> @llvm.copysign.v3f64(<3 x double>, <3 x double>) #0
declare <4 x double> @llvm.copysign.v4f64(<4 x double>, <4 x double>) #0
; Scalar f64 copysign where both the magnitude and the sign are kernel
; arguments. The expected code only combines the two high dwords, using
; v_bfi_b32 with the mask 0x7fffffff (produced by s_brev_b32 -2 on SI and VI,
; used as a literal on GFX11); the low dword of %mag is copied through as-is.
define amdgpu_kernel void @s_test_copysign_f64(ptr addrspace(1) %out, [8 x i32], double %mag, [8 x i32], double %sign) {
; SI-LABEL: s_test_copysign_f64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x13
; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x1d
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_brev_b32 s4, -2
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    v_mov_b32_e32 v0, s7
; SI-NEXT:    v_mov_b32_e32 v1, s5
; SI-NEXT:    v_bfi_b32 v1, s4, v0, v1
; SI-NEXT:    v_mov_b32_e32 v0, s6
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: s_test_copysign_f64:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x4c
; VI-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x74
; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x24
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_brev_b32 s2, -2
; VI-NEXT:    v_mov_b32_e32 v0, s1
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_mov_b32_e32 v2, s4
; VI-NEXT:    v_bfi_b32 v1, s2, v0, v1
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v3, s5
; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT:    s_endpgm
;
; GFX11-LABEL: s_test_copysign_f64:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_clause 0x2
; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x74
; GFX11-NEXT:    s_load_b64 s[2:3], s[4:5], 0x4c
; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x24
; GFX11-NEXT:    v_mov_b32_e32 v2, 0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    v_mov_b32_e32 v0, s1
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_bfi_b32 v1, 0x7fffffff, s3, v0
; GFX11-NEXT:    v_mov_b32_e32 v0, s2
; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
; GFX11-NEXT:    s_endpgm
  %result = call double @llvm.copysign.f64(double %mag, double %sign)
  store double %result, ptr addrspace(1) %out, align 8
  ret void
}
; f64 copysign of a kernel-argument magnitude with the constant sign 0.0.
; The expected code just clears bit 31 of the high dword: s_and_b32 with
; 0x7fffffff on SI, s_bitset0_b32 ..., 31 on VI and GFX11.
define amdgpu_kernel void @s_test_copysign_f64_0(ptr addrspace(1) %out, [8 x i32], double %mag) {
; SI-LABEL: s_test_copysign_f64_0:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x13
; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_and_b32 s4, s7, 0x7fffffff
; SI-NEXT:    v_mov_b32_e32 v0, s6
; SI-NEXT:    v_mov_b32_e32 v1, s4
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: s_test_copysign_f64_0:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x4c
; VI-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_bitset0_b32 s1, 31
; VI-NEXT:    v_mov_b32_e32 v2, s2
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_mov_b32_e32 v3, s3
; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT:    s_endpgm
;
; GFX11-LABEL: s_test_copysign_f64_0:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x4c
; GFX11-NEXT:    s_load_b64 s[2:3], s[4:5], 0x24
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_bitset0_b32 s1, 31
; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1
; GFX11-NEXT:    v_mov_b32_e32 v0, s0
; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT:    s_endpgm
  %result = call double @llvm.copysign.f64(double %mag, double 0.0)
  store double %result, ptr addrspace(1) %out, align 8
  ret void
}
; f64 copysign of a kernel-argument magnitude with the constant sign 1.0.
; Only the (positive) sign of the constant matters, so the expected code
; clears bit 31 of the high dword: s_and_b32 with 0x7fffffff on SI,
; s_bitset0_b32 ..., 31 on VI and GFX11.
define amdgpu_kernel void @s_test_copysign_f64_1(ptr addrspace(1) %out, [8 x i32], double %mag) {
; SI-LABEL: s_test_copysign_f64_1:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x13
; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_and_b32 s4, s7, 0x7fffffff
; SI-NEXT:    v_mov_b32_e32 v0, s6
; SI-NEXT:    v_mov_b32_e32 v1, s4
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: s_test_copysign_f64_1:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x4c
; VI-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_bitset0_b32 s1, 31
; VI-NEXT:    v_mov_b32_e32 v2, s2
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_mov_b32_e32 v3, s3
; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT:    s_endpgm
;
; GFX11-LABEL: s_test_copysign_f64_1:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x4c
; GFX11-NEXT:    s_load_b64 s[2:3], s[4:5], 0x24
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_bitset0_b32 s1, 31
; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1
; GFX11-NEXT:    v_mov_b32_e32 v0, s0
; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT:    s_endpgm
  %result = call double @llvm.copysign.f64(double %mag, double 1.0)
  store double %result, ptr addrspace(1) %out, align 8
  ret void
}
; f64 copysign of a kernel-argument magnitude with the constant sign 10.0.
; The expected code is the same as for any other positive constant sign:
; bit 31 of the high dword is cleared (s_and_b32 with 0x7fffffff on SI,
; s_bitset0_b32 ..., 31 on VI and GFX11).
define amdgpu_kernel void @s_test_copysign_f64_10(ptr addrspace(1) %out, [8 x i32], double %mag) {
; SI-LABEL: s_test_copysign_f64_10:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x13
; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_and_b32 s4, s7, 0x7fffffff
; SI-NEXT:    v_mov_b32_e32 v0, s6
; SI-NEXT:    v_mov_b32_e32 v1, s4
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: s_test_copysign_f64_10:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x4c
; VI-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_bitset0_b32 s1, 31
; VI-NEXT:    v_mov_b32_e32 v2, s2
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_mov_b32_e32 v3, s3
; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT:    s_endpgm
;
; GFX11-LABEL: s_test_copysign_f64_10:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x4c
; GFX11-NEXT:    s_load_b64 s[2:3], s[4:5], 0x24
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_bitset0_b32 s1, 31
; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1
; GFX11-NEXT:    v_mov_b32_e32 v0, s0
; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT:    s_endpgm
  %result = call double @llvm.copysign.f64(double %mag, double 10.0)
  store double %result, ptr addrspace(1) %out, align 8
  ret void
}
; f64 copysign of a kernel-argument magnitude with the constant sign -1.0.
; The expected code sets bit 31 of the high dword: s_or_b32 with 0x80000000
; on SI, s_bitset1_b32 ..., 31 on VI and GFX11.
define amdgpu_kernel void @s_test_copysign_f64_neg1(ptr addrspace(1) %out, [8 x i32], double %mag) {
; SI-LABEL: s_test_copysign_f64_neg1:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x13
; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_or_b32 s4, s7, 0x80000000
; SI-NEXT:    v_mov_b32_e32 v0, s6
; SI-NEXT:    v_mov_b32_e32 v1, s4
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: s_test_copysign_f64_neg1:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x4c
; VI-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_bitset1_b32 s1, 31
; VI-NEXT:    v_mov_b32_e32 v2, s2
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_mov_b32_e32 v3, s3
; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT:    s_endpgm
;
; GFX11-LABEL: s_test_copysign_f64_neg1:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x4c
; GFX11-NEXT:    s_load_b64 s[2:3], s[4:5], 0x24
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_bitset1_b32 s1, 31
; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1
; GFX11-NEXT:    v_mov_b32_e32 v0, s0
; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT:    s_endpgm
  %result = call double @llvm.copysign.f64(double %mag, double -1.0)
  store double %result, ptr addrspace(1) %out, align 8
  ret void
}
; f64 copysign of a kernel-argument magnitude with the constant sign -10.0.
; The expected code is the same as for any other negative constant sign:
; bit 31 of the high dword is set (s_or_b32 with 0x80000000 on SI,
; s_bitset1_b32 ..., 31 on VI and GFX11).
define amdgpu_kernel void @s_test_copysign_f64_neg10(ptr addrspace(1) %out, [8 x i32], double %mag) {
; SI-LABEL: s_test_copysign_f64_neg10:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x13
; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_or_b32 s4, s7, 0x80000000
; SI-NEXT:    v_mov_b32_e32 v0, s6
; SI-NEXT:    v_mov_b32_e32 v1, s4
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: s_test_copysign_f64_neg10:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x4c
; VI-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_bitset1_b32 s1, 31
; VI-NEXT:    v_mov_b32_e32 v2, s2
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_mov_b32_e32 v3, s3
; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT:    s_endpgm
;
; GFX11-LABEL: s_test_copysign_f64_neg10:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x4c
; GFX11-NEXT:    s_load_b64 s[2:3], s[4:5], 0x24
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_bitset1_b32 s1, 31
; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1
; GFX11-NEXT:    v_mov_b32_e32 v0, s0
; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT:    s_endpgm
  %result = call double @llvm.copysign.f64(double %mag, double -10.0)
  store double %result, ptr addrspace(1) %out, align 8
  ret void
}
; f64 copysign whose sign operand is a float kernel argument extended to
; double. The expected code contains no conversion instruction: the loaded
; 32-bit float is used directly as the sign source of v_bfi_b32, combined
; with the high dword of %mag under the mask 0x7fffffff.
define amdgpu_kernel void @s_test_copysign_f64_f32(ptr addrspace(1) %out, [8 x i32], double %mag, [8 x i32], float %sign) {
; SI-LABEL: s_test_copysign_f64_f32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x13
; SI-NEXT:    s_load_dword s4, s[4:5], 0x1d
; SI-NEXT:    s_brev_b32 s5, -2
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s7
; SI-NEXT:    v_mov_b32_e32 v1, s4
; SI-NEXT:    v_bfi_b32 v1, s5, v0, v1
; SI-NEXT:    v_mov_b32_e32 v0, s6
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: s_test_copysign_f64_f32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x4c
; VI-NEXT:    s_load_dword s6, s[4:5], 0x74
; VI-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
; VI-NEXT:    s_brev_b32 s4, -2
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s1
; VI-NEXT:    v_mov_b32_e32 v1, s6
; VI-NEXT:    v_mov_b32_e32 v2, s2
; VI-NEXT:    v_bfi_b32 v1, s4, v0, v1
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v3, s3
; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT:    s_endpgm
;
; GFX11-LABEL: s_test_copysign_f64_f32:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_clause 0x2
; GFX11-NEXT:    s_load_b32 s6, s[4:5], 0x74
; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x4c
; GFX11-NEXT:    s_load_b64 s[2:3], s[4:5], 0x24
; GFX11-NEXT:    v_mov_b32_e32 v2, 0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    v_mov_b32_e32 v0, s6
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_bfi_b32 v1, 0x7fffffff, s1, v0
; GFX11-NEXT:    v_mov_b32_e32 v0, s0
; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT:    s_endpgm
  %sign.ext = fpext float %sign to double
  %result = call double @llvm.copysign.f64(double %mag, double %sign.ext)
  store double %result, ptr addrspace(1) %out, align 8
  ret void
}
; f64 copysign whose sign operand is a half kernel argument extended to
; double. On SI the expected code converts the half with v_cvt_f32_f16 before
; the v_bfi_b32; on VI and GFX11 it instead shifts the loaded value left by 16
; (v_lshlrev_b32) and feeds that to v_bfi_b32 with the mask 0x7fffffff.
define amdgpu_kernel void @s_test_copysign_f64_f16(ptr addrspace(1) %out, [8 x i32], double %mag, [8 x i32], half %sign) {
; SI-LABEL: s_test_copysign_f64_f16:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dword s2, s[4:5], 0x1d
; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x13
; SI-NEXT:    s_brev_b32 s6, -2
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_cvt_f32_f16_e32 v0, s2
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    v_mov_b32_e32 v1, s5
; SI-NEXT:    v_bfi_b32 v1, s6, v1, v0
; SI-NEXT:    v_mov_b32_e32 v0, s4
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: s_test_copysign_f64_f16:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dword s6, s[4:5], 0x74
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x4c
; VI-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
; VI-NEXT:    s_brev_b32 s4, -2
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_lshlrev_b32_e64 v0, 16, s6
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_mov_b32_e32 v2, s2
; VI-NEXT:    v_bfi_b32 v1, s4, v1, v0
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v3, s3
; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT:    s_endpgm
;
; GFX11-LABEL: s_test_copysign_f64_f16:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_clause 0x2
; GFX11-NEXT:    s_load_b32 s6, s[4:5], 0x74
; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x4c
; GFX11-NEXT:    s_load_b64 s[2:3], s[4:5], 0x24
; GFX11-NEXT:    v_mov_b32_e32 v2, 0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    v_lshlrev_b32_e64 v0, 16, s6
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_bfi_b32 v1, 0x7fffffff, s1, v0
; GFX11-NEXT:    v_mov_b32_e32 v0, s0
; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT:    s_endpgm
  %sign.ext = fpext half %sign to double
  %result = call double @llvm.copysign.f64(double %mag, double %sign.ext)
  store double %result, ptr addrspace(1) %out, align 8
  ret void
}
; f64 copysign with the constant magnitude 0.0 and a kernel-argument sign.
; The expected result is a zero low dword and a high dword equal to the
; sign's high dword masked with 0x80000000 (s_and_b32). Note the store is
; only 4-byte aligned in this test.
define amdgpu_kernel void @s_test_copysign_f64_0_mag(ptr addrspace(1) %out, double %sign) {
; SI-LABEL: s_test_copysign_f64_0_mag:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    v_mov_b32_e32 v0, 0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_and_b32 s0, s3, 0x80000000
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    v_mov_b32_e32 v1, s0
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: s_test_copysign_f64_0_mag:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT:    v_mov_b32_e32 v2, 0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    s_and_b32 s0, s3, 0x80000000
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_mov_b32_e32 v3, s0
; VI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT:    s_endpgm
;
; GFX11-LABEL: s_test_copysign_f64_0_mag:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_and_b32 s2, s3, 0x80000000
; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX11-NEXT:    global_store_b64 v0, v[0:1], s[0:1]
; GFX11-NEXT:    s_endpgm
  %result = call double @llvm.copysign.f64(double 0.0, double %sign)
  store double %result, ptr addrspace(1) %out, align 4
  ret void
}
; f64 copysign with the constant magnitude 1.0 and a kernel-argument sign.
; The expected code isolates the sign bit of the sign's high dword
; (s_and_b32 with 0x80000000) and ORs in 0x3ff00000, the high dword of 1.0;
; the low dword of the result is zero.
define amdgpu_kernel void @s_test_copysign_f64_1_mag(ptr addrspace(1) %out, double %sign) {
; SI-LABEL: s_test_copysign_f64_1_mag:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    v_mov_b32_e32 v0, 0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_and_b32 s0, s3, 0x80000000
; SI-NEXT:    s_or_b32 s0, s0, 0x3ff00000
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    v_mov_b32_e32 v1, s0
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: s_test_copysign_f64_1_mag:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT:    v_mov_b32_e32 v2, 0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    s_and_b32 s0, s3, 0x80000000
; VI-NEXT:    s_or_b32 s0, s0, 0x3ff00000
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_mov_b32_e32 v3, s0
; VI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT:    s_endpgm
;
; GFX11-LABEL: s_test_copysign_f64_1_mag:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_and_b32 s2, s3, 0x80000000
; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT:    s_or_b32 s2, s2, 0x3ff00000
; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX11-NEXT:    global_store_b64 v0, v[0:1], s[0:1]
; GFX11-NEXT:    s_endpgm
  %result = call double @llvm.copysign.f64(double 1.0, double %sign)
  store double %result, ptr addrspace(1) %out, align 4
  ret void
}
; f64 copysign with the constant magnitude 10.0 and a kernel-argument sign.
; The expected code isolates the sign bit of the sign's high dword
; (s_and_b32 with 0x80000000) and ORs in 0x40240000, the high dword of 10.0;
; the low dword of the result is zero.
define amdgpu_kernel void @s_test_copysign_f64_10_mag(ptr addrspace(1) %out, double %sign) {
; SI-LABEL: s_test_copysign_f64_10_mag:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    v_mov_b32_e32 v0, 0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_and_b32 s0, s3, 0x80000000
; SI-NEXT:    s_or_b32 s0, s0, 0x40240000
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    v_mov_b32_e32 v1, s0
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: s_test_copysign_f64_10_mag:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT:    v_mov_b32_e32 v2, 0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    s_and_b32 s0, s3, 0x80000000
; VI-NEXT:    s_or_b32 s0, s0, 0x40240000
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_mov_b32_e32 v3, s0
; VI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT:    s_endpgm
;
; GFX11-LABEL: s_test_copysign_f64_10_mag:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_and_b32 s2, s3, 0x80000000
; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT:    s_or_b32 s2, s2, 0x40240000
; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX11-NEXT:    global_store_b64 v0, v[0:1], s[0:1]
; GFX11-NEXT:    s_endpgm
  %result = call double @llvm.copysign.f64(double 10.0, double %sign)
  store double %result, ptr addrspace(1) %out, align 4
  ret void
}
; f64 copysign with the constant magnitude -1.0 and a kernel-argument sign.
; The sign of the constant is discarded, so the expected code matches the
; 1.0-magnitude case: s_and_b32 with 0x80000000 on the sign's high dword,
; then s_or_b32 with 0x3ff00000; the low dword of the result is zero.
define amdgpu_kernel void @s_test_copysign_f64_neg1_mag(ptr addrspace(1) %out, double %sign) {
; SI-LABEL: s_test_copysign_f64_neg1_mag:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    v_mov_b32_e32 v0, 0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_and_b32 s0, s3, 0x80000000
; SI-NEXT:    s_or_b32 s0, s0, 0x3ff00000
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    v_mov_b32_e32 v1, s0
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: s_test_copysign_f64_neg1_mag:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT:    v_mov_b32_e32 v2, 0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    s_and_b32 s0, s3, 0x80000000
; VI-NEXT:    s_or_b32 s0, s0, 0x3ff00000
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_mov_b32_e32 v3, s0
; VI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT:    s_endpgm
;
; GFX11-LABEL: s_test_copysign_f64_neg1_mag:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_and_b32 s2, s3, 0x80000000
; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT:    s_or_b32 s2, s2, 0x3ff00000
; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX11-NEXT:    global_store_b64 v0, v[0:1], s[0:1]
; GFX11-NEXT:    s_endpgm
  %result = call double @llvm.copysign.f64(double -1.0, double %sign)
  store double %result, ptr addrspace(1) %out, align 4
  ret void
}
; f64 copysign with the constant magnitude -10.0 and a kernel-argument sign.
; The sign of the constant is discarded, so the expected code matches the
; 10.0-magnitude case: s_and_b32 with 0x80000000 on the sign's high dword,
; then s_or_b32 with 0x40240000; the low dword of the result is zero.
define amdgpu_kernel void @s_test_copysign_f64_neg10_mag(ptr addrspace(1) %out, double %sign) {
; SI-LABEL: s_test_copysign_f64_neg10_mag:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    v_mov_b32_e32 v0, 0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_and_b32 s0, s3, 0x80000000
; SI-NEXT:    s_or_b32 s0, s0, 0x40240000
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    v_mov_b32_e32 v1, s0
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: s_test_copysign_f64_neg10_mag:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT:    v_mov_b32_e32 v2, 0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    s_and_b32 s0, s3, 0x80000000
; VI-NEXT:    s_or_b32 s0, s0, 0x40240000
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_mov_b32_e32 v3, s0
; VI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT:    s_endpgm
;
; GFX11-LABEL: s_test_copysign_f64_neg10_mag:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_and_b32 s2, s3, 0x80000000
; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT:    s_or_b32 s2, s2, 0x40240000
; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX11-NEXT:    global_store_b64 v0, v[0:1], s[0:1]
; GFX11-NEXT:    s_endpgm
  %result = call double @llvm.copysign.f64(double -10.0, double %sign)
  store double %result, ptr addrspace(1) %out, align 4
  ret void
}
; <2 x double> copysign with both operands passed as kernel arguments. The
; expected code issues one v_bfi_b32 (mask 0x7fffffff) per element on the
; high dwords, copies the magnitude's low dwords through unchanged, and
; writes the result with a single 128-bit store.
define amdgpu_kernel void @s_test_copysign_v2f64(ptr addrspace(1) %out, <2 x double> %mag, <2 x double> %sign) {
; SI-LABEL: s_test_copysign_v2f64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0xd
; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT:    s_brev_b32 s6, -2
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s11
; SI-NEXT:    v_mov_b32_e32 v1, s15
; SI-NEXT:    v_bfi_b32 v3, s6, v0, v1
; SI-NEXT:    v_mov_b32_e32 v0, s9
; SI-NEXT:    v_mov_b32_e32 v1, s13
; SI-NEXT:    v_bfi_b32 v1, s6, v0, v1
; SI-NEXT:    v_mov_b32_e32 v0, s8
; SI-NEXT:    v_mov_b32_e32 v2, s10
; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: s_test_copysign_v2f64:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x34
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-NEXT:    s_brev_b32 s2, -2
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s11
; VI-NEXT:    v_mov_b32_e32 v1, s15
; VI-NEXT:    v_mov_b32_e32 v2, s9
; VI-NEXT:    v_bfi_b32 v3, s2, v0, v1
; VI-NEXT:    v_mov_b32_e32 v0, s13
; VI-NEXT:    v_mov_b32_e32 v5, s1
; VI-NEXT:    v_bfi_b32 v1, s2, v2, v0
; VI-NEXT:    v_mov_b32_e32 v0, s8
; VI-NEXT:    v_mov_b32_e32 v2, s10
; VI-NEXT:    v_mov_b32_e32 v4, s0
; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT:    s_endpgm
;
; GFX11-LABEL: s_test_copysign_v2f64:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    s_load_b256 s[8:15], s[4:5], 0x34
; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s15
; GFX11-NEXT:    v_mov_b32_e32 v2, s13
; GFX11-NEXT:    v_mov_b32_e32 v0, s8
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-NEXT:    v_bfi_b32 v3, 0x7fffffff, s11, v1
; GFX11-NEXT:    v_bfi_b32 v1, 0x7fffffff, s9, v2
; GFX11-NEXT:    v_mov_b32_e32 v2, s10
; GFX11-NEXT:    global_store_b128 v4, v[0:3], s[0:1]
; GFX11-NEXT:    s_endpgm
  %result = call <2 x double> @llvm.copysign.v2f64(<2 x double> %mag, <2 x double> %sign)
  store <2 x double> %result, ptr addrspace(1) %out, align 16
  ret void
}
; <3 x double> copysign with both operands passed as kernel arguments. The
; expected code issues three v_bfi_b32 instructions (mask 0x7fffffff), one
; per element's high dword, and writes the result as a 64-bit store at
; offset 16 (third element) plus a 128-bit store at offset 0 (first two).
define amdgpu_kernel void @s_test_copysign_v3f64(ptr addrspace(1) %out, <3 x double> %mag, <3 x double> %sign) {
; SI-LABEL: s_test_copysign_v3f64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0x11
; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT:    s_brev_b32 s6, -2
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s11
; SI-NEXT:    v_mov_b32_e32 v1, s19
; SI-NEXT:    v_bfi_b32 v3, s6, v0, v1
; SI-NEXT:    v_mov_b32_e32 v0, s9
; SI-NEXT:    v_mov_b32_e32 v1, s17
; SI-NEXT:    v_bfi_b32 v1, s6, v0, v1
; SI-NEXT:    v_mov_b32_e32 v0, s13
; SI-NEXT:    v_mov_b32_e32 v2, s21
; SI-NEXT:    v_bfi_b32 v5, s6, v0, v2
; SI-NEXT:    v_mov_b32_e32 v4, s12
; SI-NEXT:    v_mov_b32_e32 v0, s8
; SI-NEXT:    v_mov_b32_e32 v2, s10
; SI-NEXT:    buffer_store_dwordx2 v[4:5], off, s[0:3], 0 offset:16
; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: s_test_copysign_v3f64:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0x44
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-NEXT:    s_brev_b32 s2, -2
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s11
; VI-NEXT:    v_mov_b32_e32 v1, s19
; VI-NEXT:    v_mov_b32_e32 v2, s9
; VI-NEXT:    v_bfi_b32 v3, s2, v0, v1
; VI-NEXT:    v_mov_b32_e32 v0, s17
; VI-NEXT:    v_bfi_b32 v1, s2, v2, v0
; VI-NEXT:    v_mov_b32_e32 v0, s13
; VI-NEXT:    v_mov_b32_e32 v2, s21
; VI-NEXT:    v_bfi_b32 v5, s2, v0, v2
; VI-NEXT:    s_add_u32 s2, s0, 16
; VI-NEXT:    s_addc_u32 s3, s1, 0
; VI-NEXT:    v_mov_b32_e32 v7, s3
; VI-NEXT:    v_mov_b32_e32 v4, s12
; VI-NEXT:    v_mov_b32_e32 v6, s2
; VI-NEXT:    flat_store_dwordx2 v[6:7], v[4:5]
; VI-NEXT:    v_mov_b32_e32 v5, s1
; VI-NEXT:    v_mov_b32_e32 v0, s8
; VI-NEXT:    v_mov_b32_e32 v2, s10
; VI-NEXT:    v_mov_b32_e32 v4, s0
; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT:    s_endpgm
;
; GFX11-LABEL: s_test_copysign_v3f64:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    s_load_b512 s[8:23], s[4:5], 0x44
; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    v_dual_mov_b32 v6, 0 :: v_dual_mov_b32 v1, s19
; GFX11-NEXT:    v_dual_mov_b32 v5, s21 :: v_dual_mov_b32 v0, s8
; GFX11-NEXT:    v_dual_mov_b32 v7, s17 :: v_dual_mov_b32 v4, s12
; GFX11-NEXT:    v_mov_b32_e32 v2, s10
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
; GFX11-NEXT:    v_bfi_b32 v5, 0x7fffffff, s13, v5
; GFX11-NEXT:    v_bfi_b32 v3, 0x7fffffff, s11, v1
; GFX11-NEXT:    v_bfi_b32 v1, 0x7fffffff, s9, v7
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    global_store_b64 v6, v[4:5], s[0:1] offset:16
; GFX11-NEXT:    global_store_b128 v6, v[0:3], s[0:1]
; GFX11-NEXT:    s_endpgm
  %result = call <3 x double> @llvm.copysign.v3f64(<3 x double> %mag, <3 x double> %sign)
  store <3 x double> %result, ptr addrspace(1) %out, align 32
  ret void
}
; Kernel test for copysign on <4 x double>, with %mag and %sign passed as
; kernel arguments and the result stored to %out (align 32).
; For all three check prefixes the expected output contains four v_bfi_b32
; instructions (one per element) with a 0x7fffffff mask; the tahiti and tonga
; checks materialize that mask with "s_brev_b32 <sgpr>, -2", while the gfx1100
; checks use the literal directly. The remaining result dwords are plain
; v_mov copies of loaded scalars.
741define amdgpu_kernel void @s_test_copysign_v4f64(ptr addrspace(1) %out, <4 x double> %mag, <4 x double> %sign) {
742; SI-LABEL: s_test_copysign_v4f64:
743; SI:       ; %bb.0:
744; SI-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0x11
745; SI-NEXT:    s_brev_b32 s6, -2
746; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
747; SI-NEXT:    s_mov_b32 s3, 0xf000
748; SI-NEXT:    s_mov_b32 s2, -1
749; SI-NEXT:    s_waitcnt lgkmcnt(0)
750; SI-NEXT:    v_mov_b32_e32 v0, s11
751; SI-NEXT:    v_mov_b32_e32 v1, s19
752; SI-NEXT:    v_bfi_b32 v3, s6, v0, v1
753; SI-NEXT:    v_mov_b32_e32 v0, s9
754; SI-NEXT:    v_mov_b32_e32 v1, s17
755; SI-NEXT:    v_bfi_b32 v1, s6, v0, v1
756; SI-NEXT:    v_mov_b32_e32 v0, s15
757; SI-NEXT:    v_mov_b32_e32 v2, s23
758; SI-NEXT:    v_bfi_b32 v7, s6, v0, v2
759; SI-NEXT:    v_mov_b32_e32 v0, s13
760; SI-NEXT:    v_mov_b32_e32 v2, s21
761; SI-NEXT:    v_bfi_b32 v5, s6, v0, v2
762; SI-NEXT:    v_mov_b32_e32 v4, s12
763; SI-NEXT:    v_mov_b32_e32 v6, s14
764; SI-NEXT:    v_mov_b32_e32 v0, s8
765; SI-NEXT:    v_mov_b32_e32 v2, s10
766; SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
767; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
768; SI-NEXT:    s_endpgm
769;
770; VI-LABEL: s_test_copysign_v4f64:
771; VI:       ; %bb.0:
772; VI-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0x44
773; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
774; VI-NEXT:    s_brev_b32 s2, -2
775; VI-NEXT:    s_waitcnt lgkmcnt(0)
776; VI-NEXT:    v_mov_b32_e32 v0, s11
777; VI-NEXT:    v_mov_b32_e32 v1, s19
778; VI-NEXT:    v_mov_b32_e32 v2, s9
779; VI-NEXT:    v_bfi_b32 v3, s2, v0, v1
780; VI-NEXT:    v_mov_b32_e32 v0, s17
781; VI-NEXT:    v_bfi_b32 v1, s2, v2, v0
782; VI-NEXT:    v_mov_b32_e32 v0, s15
783; VI-NEXT:    v_mov_b32_e32 v2, s23
784; VI-NEXT:    v_bfi_b32 v7, s2, v0, v2
785; VI-NEXT:    v_mov_b32_e32 v0, s13
786; VI-NEXT:    v_mov_b32_e32 v2, s21
787; VI-NEXT:    v_bfi_b32 v5, s2, v0, v2
788; VI-NEXT:    s_add_u32 s2, s0, 16
789; VI-NEXT:    s_addc_u32 s3, s1, 0
790; VI-NEXT:    v_mov_b32_e32 v9, s3
791; VI-NEXT:    v_mov_b32_e32 v4, s12
792; VI-NEXT:    v_mov_b32_e32 v6, s14
793; VI-NEXT:    v_mov_b32_e32 v8, s2
794; VI-NEXT:    flat_store_dwordx4 v[8:9], v[4:7]
795; VI-NEXT:    v_mov_b32_e32 v0, s8
796; VI-NEXT:    v_mov_b32_e32 v5, s1
797; VI-NEXT:    v_mov_b32_e32 v2, s10
798; VI-NEXT:    v_mov_b32_e32 v4, s0
799; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
800; VI-NEXT:    s_endpgm
801;
802; GFX11-LABEL: s_test_copysign_v4f64:
803; GFX11:       ; %bb.0:
804; GFX11-NEXT:    s_clause 0x1
805; GFX11-NEXT:    s_load_b512 s[8:23], s[4:5], 0x44
806; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
807; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
808; GFX11-NEXT:    v_dual_mov_b32 v8, 0 :: v_dual_mov_b32 v1, s19
809; GFX11-NEXT:    v_dual_mov_b32 v3, s23 :: v_dual_mov_b32 v2, s14
810; GFX11-NEXT:    v_dual_mov_b32 v9, s21 :: v_dual_mov_b32 v4, s8
811; GFX11-NEXT:    v_dual_mov_b32 v5, s17 :: v_dual_mov_b32 v0, s12
812; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
813; GFX11-NEXT:    v_bfi_b32 v7, 0x7fffffff, s11, v1
814; GFX11-NEXT:    v_bfi_b32 v3, 0x7fffffff, s15, v3
815; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
816; GFX11-NEXT:    v_bfi_b32 v1, 0x7fffffff, s13, v9
817; GFX11-NEXT:    v_mov_b32_e32 v6, s10
818; GFX11-NEXT:    v_bfi_b32 v5, 0x7fffffff, s9, v5
819; GFX11-NEXT:    s_clause 0x1
820; GFX11-NEXT:    global_store_b128 v8, v[0:3], s[0:1] offset:16
821; GFX11-NEXT:    global_store_b128 v8, v[4:7], s[0:1]
822; GFX11-NEXT:    s_endpgm
823  %result = call <4 x double> @llvm.copysign.v4f64(<4 x double> %mag, <4 x double> %sign)
824  store <4 x double> %result, ptr addrspace(1) %out, align 32
825  ret void
826}
827
; Non-kernel test for scalar double copysign with both operands variable; the
; result is returned. %out is not referenced by the body, and the [8 x i32]
; arguments are unused padding between the operands.
; Expected output: the low result dword is a copy of v10 and the high dword is
; a v_bfi_b32 of v11 and v21 under a 0x7fffffff mask (built with s_brev_b32 -2
; in the shared tahiti/tonga checks).
828define double @v_test_copysign_f64(ptr addrspace(1) %out, [8 x i32], double %mag, [8 x i32], double %sign) {
829; SIVI-LABEL: v_test_copysign_f64:
830; SIVI:       ; %bb.0:
831; SIVI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
832; SIVI-NEXT:    s_brev_b32 s4, -2
833; SIVI-NEXT:    v_mov_b32_e32 v0, v10
834; SIVI-NEXT:    v_bfi_b32 v1, s4, v11, v21
835; SIVI-NEXT:    s_setpc_b64 s[30:31]
836;
837; GFX11-LABEL: v_test_copysign_f64:
838; GFX11:       ; %bb.0:
839; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
840; GFX11-NEXT:    v_mov_b32_e32 v0, v10
841; GFX11-NEXT:    v_bfi_b32 v1, 0x7fffffff, v11, v21
842; GFX11-NEXT:    s_setpc_b64 s[30:31]
843  %result = call double @llvm.copysign.f64(double %mag, double %sign)
844  ret double %result
845}
846
; copysign of %mag with the constant sign operand 0.0.
; Expected output on all targets: no v_bfi_b32; the high dword (v11) is ANDed
; with 0x7fffffff and the low dword (v10) is copied unchanged.
847define double @v_test_copysign_f64_0(ptr addrspace(1) %out, [8 x i32], double %mag) {
848; SIVI-LABEL: v_test_copysign_f64_0:
849; SIVI:       ; %bb.0:
850; SIVI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
851; SIVI-NEXT:    v_mov_b32_e32 v0, v10
852; SIVI-NEXT:    v_and_b32_e32 v1, 0x7fffffff, v11
853; SIVI-NEXT:    s_setpc_b64 s[30:31]
854;
855; GFX11-LABEL: v_test_copysign_f64_0:
856; GFX11:       ; %bb.0:
857; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
858; GFX11-NEXT:    v_dual_mov_b32 v0, v10 :: v_dual_and_b32 v1, 0x7fffffff, v11
859; GFX11-NEXT:    s_setpc_b64 s[30:31]
860  %result = call double @llvm.copysign.f64(double %mag, double 0.0)
861  ret double %result
862}
863
; copysign of %mag with the constant sign operand 1.0.
; Expected output matches the 0.0 case in this file: the high dword (v11) is
; ANDed with 0x7fffffff and the low dword (v10) is copied unchanged.
864define double @v_test_copysign_f64_1(ptr addrspace(1) %out, [8 x i32], double %mag) {
865; SIVI-LABEL: v_test_copysign_f64_1:
866; SIVI:       ; %bb.0:
867; SIVI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
868; SIVI-NEXT:    v_mov_b32_e32 v0, v10
869; SIVI-NEXT:    v_and_b32_e32 v1, 0x7fffffff, v11
870; SIVI-NEXT:    s_setpc_b64 s[30:31]
871;
872; GFX11-LABEL: v_test_copysign_f64_1:
873; GFX11:       ; %bb.0:
874; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
875; GFX11-NEXT:    v_dual_mov_b32 v0, v10 :: v_dual_and_b32 v1, 0x7fffffff, v11
876; GFX11-NEXT:    s_setpc_b64 s[30:31]
877  %result = call double @llvm.copysign.f64(double %mag, double 1.0)
878  ret double %result
879}
880
; copysign of %mag with the constant sign operand 10.0.
; Expected output: the high dword (v11) is ANDed with 0x7fffffff and the low
; dword (v10) is copied unchanged; no v_bfi_b32 is expected.
881define double @v_test_copysign_f64_10(ptr addrspace(1) %out, [8 x i32], double %mag) {
882; SIVI-LABEL: v_test_copysign_f64_10:
883; SIVI:       ; %bb.0:
884; SIVI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
885; SIVI-NEXT:    v_mov_b32_e32 v0, v10
886; SIVI-NEXT:    v_and_b32_e32 v1, 0x7fffffff, v11
887; SIVI-NEXT:    s_setpc_b64 s[30:31]
888;
889; GFX11-LABEL: v_test_copysign_f64_10:
890; GFX11:       ; %bb.0:
891; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
892; GFX11-NEXT:    v_dual_mov_b32 v0, v10 :: v_dual_and_b32 v1, 0x7fffffff, v11
893; GFX11-NEXT:    s_setpc_b64 s[30:31]
894  %result = call double @llvm.copysign.f64(double %mag, double 10.0)
895  ret double %result
896}
897
; copysign of %mag with the constant sign operand -1.0.
; Expected output on all targets: the high dword (v11) is ORed with 0x80000000
; and the low dword (v10) is copied unchanged; no v_bfi_b32 is expected.
898define double @v_test_copysign_f64_neg1(ptr addrspace(1) %out, [8 x i32], double %mag) {
899; SIVI-LABEL: v_test_copysign_f64_neg1:
900; SIVI:       ; %bb.0:
901; SIVI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
902; SIVI-NEXT:    v_mov_b32_e32 v0, v10
903; SIVI-NEXT:    v_or_b32_e32 v1, 0x80000000, v11
904; SIVI-NEXT:    s_setpc_b64 s[30:31]
905;
906; GFX11-LABEL: v_test_copysign_f64_neg1:
907; GFX11:       ; %bb.0:
908; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
909; GFX11-NEXT:    v_mov_b32_e32 v0, v10
910; GFX11-NEXT:    v_or_b32_e32 v1, 0x80000000, v11
911; GFX11-NEXT:    s_setpc_b64 s[30:31]
912  %result = call double @llvm.copysign.f64(double %mag, double -1.0)
913  ret double %result
914}
915
; copysign of %mag with the constant sign operand -10.0.
; Expected output matches the -1.0 case in this file: the high dword (v11) is
; ORed with 0x80000000 and the low dword (v10) is copied unchanged.
916define double @v_test_copysign_f64_neg10(ptr addrspace(1) %out, [8 x i32], double %mag) {
917; SIVI-LABEL: v_test_copysign_f64_neg10:
918; SIVI:       ; %bb.0:
919; SIVI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
920; SIVI-NEXT:    v_mov_b32_e32 v0, v10
921; SIVI-NEXT:    v_or_b32_e32 v1, 0x80000000, v11
922; SIVI-NEXT:    s_setpc_b64 s[30:31]
923;
924; GFX11-LABEL: v_test_copysign_f64_neg10:
925; GFX11:       ; %bb.0:
926; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
927; GFX11-NEXT:    v_mov_b32_e32 v0, v10
928; GFX11-NEXT:    v_or_b32_e32 v1, 0x80000000, v11
929; GFX11-NEXT:    s_setpc_b64 s[30:31]
930  %result = call double @llvm.copysign.f64(double %mag, double -10.0)
931  ret double %result
932}
933
; copysign of a double magnitude where the sign operand is a float extended to
; double with fpext.
; Expected output: no conversion instruction appears; the float argument
; register (v20) is fed directly to v_bfi_b32 together with the high dword of
; %mag (v11), and the low dword (v10) is copied unchanged.
934define double @v_test_copysign_f64_f32(ptr addrspace(1) %out, [8 x i32], double %mag, [8 x i32], float %sign) {
935; SIVI-LABEL: v_test_copysign_f64_f32:
936; SIVI:       ; %bb.0:
937; SIVI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
938; SIVI-NEXT:    s_brev_b32 s4, -2
939; SIVI-NEXT:    v_mov_b32_e32 v0, v10
940; SIVI-NEXT:    v_bfi_b32 v1, s4, v11, v20
941; SIVI-NEXT:    s_setpc_b64 s[30:31]
942;
943; GFX11-LABEL: v_test_copysign_f64_f32:
944; GFX11:       ; %bb.0:
945; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
946; GFX11-NEXT:    v_mov_b32_e32 v0, v10
947; GFX11-NEXT:    v_bfi_b32 v1, 0x7fffffff, v11, v20
948; GFX11-NEXT:    s_setpc_b64 s[30:31]
949  %sign.ext = fpext float %sign to double
950  %result = call double @llvm.copysign.f64(double %mag, double %sign.ext)
951  ret double %result
952}
953
; copysign of a double magnitude where the sign operand is a half extended to
; double with fpext. This test has separate tahiti and tonga check blocks
; because the expected code differs between them.
; Expected output: the tahiti checks pass v20 straight to v_bfi_b32, while the
; tonga and gfx1100 checks first shift v20 left by 16 and use the shifted
; value as the sign source. The low dword (v10) is copied unchanged everywhere.
954define double @v_test_copysign_f64_f16(ptr addrspace(1) %out, [8 x i32], double %mag, [8 x i32], half %sign) {
955; SI-LABEL: v_test_copysign_f64_f16:
956; SI:       ; %bb.0:
957; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
958; SI-NEXT:    s_brev_b32 s4, -2
959; SI-NEXT:    v_mov_b32_e32 v0, v10
960; SI-NEXT:    v_bfi_b32 v1, s4, v11, v20
961; SI-NEXT:    s_setpc_b64 s[30:31]
962;
963; VI-LABEL: v_test_copysign_f64_f16:
964; VI:       ; %bb.0:
965; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
966; VI-NEXT:    v_lshlrev_b32_e32 v1, 16, v20
967; VI-NEXT:    s_brev_b32 s4, -2
968; VI-NEXT:    v_mov_b32_e32 v0, v10
969; VI-NEXT:    v_bfi_b32 v1, s4, v11, v1
970; VI-NEXT:    s_setpc_b64 s[30:31]
971;
972; GFX11-LABEL: v_test_copysign_f64_f16:
973; GFX11:       ; %bb.0:
974; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
975; GFX11-NEXT:    v_dual_mov_b32 v0, v10 :: v_dual_lshlrev_b32 v1, 16, v20
976; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
977; GFX11-NEXT:    v_bfi_b32 v1, 0x7fffffff, v11, v1
978; GFX11-NEXT:    s_setpc_b64 s[30:31]
979  %sign.ext = fpext half %sign to double
980  %result = call double @llvm.copysign.f64(double %mag, double %sign.ext)
981  ret double %result
982}
983
; Non-kernel test for <2 x double> copysign; the vector result is returned.
; %out is not referenced by the body.
; Expected output: two v_bfi_b32 instructions (one per element) combining the
; odd-numbered input registers v3/v7 and v5/v9, plus v_mov copies that move
; v2 and v4 down into the even result registers v0 and v2.
984define <2 x double> @v_test_copysign_v2f64(ptr addrspace(1) %out, <2 x double> %mag, <2 x double> %sign) {
985; SIVI-LABEL: v_test_copysign_v2f64:
986; SIVI:       ; %bb.0:
987; SIVI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
988; SIVI-NEXT:    s_brev_b32 s4, -2
989; SIVI-NEXT:    v_mov_b32_e32 v0, v2
990; SIVI-NEXT:    v_bfi_b32 v1, s4, v3, v7
991; SIVI-NEXT:    v_bfi_b32 v3, s4, v5, v9
992; SIVI-NEXT:    v_mov_b32_e32 v2, v4
993; SIVI-NEXT:    s_setpc_b64 s[30:31]
994;
995; GFX11-LABEL: v_test_copysign_v2f64:
996; GFX11:       ; %bb.0:
997; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
998; GFX11-NEXT:    v_mov_b32_e32 v0, v2
999; GFX11-NEXT:    v_bfi_b32 v1, 0x7fffffff, v3, v7
1000; GFX11-NEXT:    v_bfi_b32 v3, 0x7fffffff, v5, v9
1001; GFX11-NEXT:    v_mov_b32_e32 v2, v4
1002; GFX11-NEXT:    s_setpc_b64 s[30:31]
1003  %result = call <2 x double> @llvm.copysign.v2f64(<2 x double> %mag, <2 x double> %sign)
1004  ret <2 x double> %result
1005}
1006
; Non-kernel test for <3 x double> copysign; the vector result is returned.
; %out is not referenced by the body.
; Expected output: three v_bfi_b32 instructions (one per element) writing the
; odd result registers v1, v3 and v5, plus v_mov copies that move v2, v4 and
; v6 down into the even result registers v0, v2 and v4.
1007define <3 x double> @v_test_copysign_v3f64(ptr addrspace(1) %out, <3 x double> %mag, <3 x double> %sign) {
1008; SIVI-LABEL: v_test_copysign_v3f64:
1009; SIVI:       ; %bb.0:
1010; SIVI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1011; SIVI-NEXT:    s_brev_b32 s4, -2
1012; SIVI-NEXT:    v_mov_b32_e32 v0, v2
1013; SIVI-NEXT:    v_bfi_b32 v1, s4, v3, v9
1014; SIVI-NEXT:    v_bfi_b32 v3, s4, v5, v11
1015; SIVI-NEXT:    v_bfi_b32 v5, s4, v7, v13
1016; SIVI-NEXT:    v_mov_b32_e32 v2, v4
1017; SIVI-NEXT:    v_mov_b32_e32 v4, v6
1018; SIVI-NEXT:    s_setpc_b64 s[30:31]
1019;
1020; GFX11-LABEL: v_test_copysign_v3f64:
1021; GFX11:       ; %bb.0:
1022; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1023; GFX11-NEXT:    v_mov_b32_e32 v0, v2
1024; GFX11-NEXT:    v_bfi_b32 v1, 0x7fffffff, v3, v9
1025; GFX11-NEXT:    v_bfi_b32 v3, 0x7fffffff, v5, v11
1026; GFX11-NEXT:    v_bfi_b32 v5, 0x7fffffff, v7, v13
1027; GFX11-NEXT:    v_mov_b32_e32 v2, v4
1028; GFX11-NEXT:    v_mov_b32_e32 v4, v6
1029; GFX11-NEXT:    s_setpc_b64 s[30:31]
1030  %result = call <3 x double> @llvm.copysign.v3f64(<3 x double> %mag, <3 x double> %sign)
1031  ret <3 x double> %result
1032}
1033
; Non-kernel test for <4 x double> copysign; the vector result is returned.
; %out is not referenced by the body.
; Expected output: four v_bfi_b32 instructions (one per element) writing the
; odd result registers v1, v3, v5 and v7, plus v_mov copies that move v2, v4,
; v6 and v8 down into the even result registers v0, v2, v4 and v6.
1034define <4 x double> @v_test_copysign_v4f64(ptr addrspace(1) %out, <4 x double> %mag, <4 x double> %sign) {
1035; SIVI-LABEL: v_test_copysign_v4f64:
1036; SIVI:       ; %bb.0:
1037; SIVI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1038; SIVI-NEXT:    s_brev_b32 s4, -2
1039; SIVI-NEXT:    v_mov_b32_e32 v0, v2
1040; SIVI-NEXT:    v_bfi_b32 v1, s4, v3, v11
1041; SIVI-NEXT:    v_bfi_b32 v3, s4, v5, v13
1042; SIVI-NEXT:    v_bfi_b32 v5, s4, v7, v15
1043; SIVI-NEXT:    v_bfi_b32 v7, s4, v9, v17
1044; SIVI-NEXT:    v_mov_b32_e32 v2, v4
1045; SIVI-NEXT:    v_mov_b32_e32 v4, v6
1046; SIVI-NEXT:    v_mov_b32_e32 v6, v8
1047; SIVI-NEXT:    s_setpc_b64 s[30:31]
1048;
1049; GFX11-LABEL: v_test_copysign_v4f64:
1050; GFX11:       ; %bb.0:
1051; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1052; GFX11-NEXT:    v_mov_b32_e32 v0, v2
1053; GFX11-NEXT:    v_bfi_b32 v1, 0x7fffffff, v3, v11
1054; GFX11-NEXT:    v_bfi_b32 v3, 0x7fffffff, v5, v13
1055; GFX11-NEXT:    v_bfi_b32 v5, 0x7fffffff, v7, v15
1056; GFX11-NEXT:    v_bfi_b32 v7, 0x7fffffff, v9, v17
1057; GFX11-NEXT:    v_mov_b32_e32 v2, v4
1058; GFX11-NEXT:    v_mov_b32_e32 v4, v6
1059; GFX11-NEXT:    v_mov_b32_e32 v6, v8
1060; GFX11-NEXT:    s_setpc_b64 s[30:31]
1061  %result = call <4 x double> @llvm.copysign.v4f64(<4 x double> %mag, <4 x double> %sign)
1062  ret <4 x double> %result
1063}
1064
; Attribute group #0 is the one attached to the llvm.copysign.* intrinsic
; declarations at the top of this file.
1065attributes #0 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
1066