xref: /llvm-project/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll (revision 6548b6354d1d990e1c98736f5e7c3de876bedc8e)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -enable-var-scope --check-prefixes=SI %s
3; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -enable-var-scope --check-prefixes=VI %s
4; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -enable-var-scope --check-prefixes=GFX9 %s
5; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -enable-var-scope --check-prefixes=GFX11 %s
6
; Declarations of the copysign intrinsics (scalar half/float/double and
; 2-, 3-, 4-element half vectors) and of the workitem-id intrinsic.
; Attribute group #0 is referenced here but defined outside this view.
7declare half @llvm.copysign.f16(half, half) #0
8declare float @llvm.copysign.f32(float, float) #0
9declare double @llvm.copysign.f64(double, double) #0
10declare <2 x half> @llvm.copysign.v2f16(<2 x half>, <2 x half>) #0
11declare <3 x half> @llvm.copysign.v3f16(<3 x half>, <3 x half>) #0
12declare <4 x half> @llvm.copysign.v4f16(<4 x half>, <4 x half>) #0
13declare i32 @llvm.amdgcn.workitem.id.x() #0
14
; Kernel: copysign of two half kernel arguments (%mag, %sign); the result is
; stored to %arg_out. The SI checks expect both halves to be converted to f32
; and combined by v_bfi_b32 with the mask produced by s_brev_b32 -2. The VI,
; GFX9 and GFX11 checks expect v_bfi_b32 with a 0x7fff mask and no conversion.
15define amdgpu_kernel void @s_copysign_f16(ptr addrspace(1) %arg_out, half %mag, half %sign) {
16; SI-LABEL: s_copysign_f16:
17; SI:       ; %bb.0:
18; SI-NEXT:    s_load_dword s0, s[4:5], 0xb
19; SI-NEXT:    s_brev_b32 s2, -2
20; SI-NEXT:    s_mov_b32 s3, 0xf000
21; SI-NEXT:    s_waitcnt lgkmcnt(0)
22; SI-NEXT:    v_cvt_f32_f16_e32 v0, s0
23; SI-NEXT:    s_lshr_b32 s0, s0, 16
24; SI-NEXT:    v_cvt_f32_f16_e32 v1, s0
25; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
26; SI-NEXT:    v_bfi_b32 v0, s2, v0, v1
27; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
28; SI-NEXT:    s_mov_b32 s2, -1
29; SI-NEXT:    s_waitcnt lgkmcnt(0)
30; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
31; SI-NEXT:    s_endpgm
32;
33; VI-LABEL: s_copysign_f16:
34; VI:       ; %bb.0:
35; VI-NEXT:    s_load_dword s2, s[4:5], 0x2c
36; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
37; VI-NEXT:    s_movk_i32 s3, 0x7fff
38; VI-NEXT:    s_waitcnt lgkmcnt(0)
39; VI-NEXT:    s_lshr_b32 s4, s2, 16
40; VI-NEXT:    v_mov_b32_e32 v0, s2
41; VI-NEXT:    v_mov_b32_e32 v1, s4
42; VI-NEXT:    v_bfi_b32 v2, s3, v0, v1
43; VI-NEXT:    v_mov_b32_e32 v0, s0
44; VI-NEXT:    v_mov_b32_e32 v1, s1
45; VI-NEXT:    flat_store_short v[0:1], v2
46; VI-NEXT:    s_endpgm
47;
48; GFX9-LABEL: s_copysign_f16:
49; GFX9:       ; %bb.0:
50; GFX9-NEXT:    s_load_dword s2, s[4:5], 0x2c
51; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
52; GFX9-NEXT:    s_movk_i32 s3, 0x7fff
53; GFX9-NEXT:    v_mov_b32_e32 v0, 0
54; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
55; GFX9-NEXT:    s_lshr_b32 s4, s2, 16
56; GFX9-NEXT:    v_mov_b32_e32 v1, s2
57; GFX9-NEXT:    v_mov_b32_e32 v2, s4
58; GFX9-NEXT:    v_bfi_b32 v1, s3, v1, v2
59; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
60; GFX9-NEXT:    s_endpgm
61;
62; GFX11-LABEL: s_copysign_f16:
63; GFX11:       ; %bb.0:
64; GFX11-NEXT:    s_clause 0x1
65; GFX11-NEXT:    s_load_b32 s2, s[4:5], 0x2c
66; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
67; GFX11-NEXT:    v_mov_b32_e32 v1, 0
68; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
69; GFX11-NEXT:    s_lshr_b32 s3, s2, 16
70; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
71; GFX11-NEXT:    v_mov_b32_e32 v0, s3
72; GFX11-NEXT:    v_bfi_b32 v0, 0x7fff, s2, v0
73; GFX11-NEXT:    global_store_b16 v1, v0, s[0:1]
74; GFX11-NEXT:    s_endpgm
75  %out = call half @llvm.copysign.f16(half %mag, half %sign)
76  store half %out, ptr addrspace(1) %arg_out
77  ret void
78}
79
; Kernel: copysign(%mag, 0.0) stored to %out. With this constant sign operand
; every check set expects a single scalar s_and_b32 with 0x7fff on the loaded
; argument instead of a v_bfi_b32.
80define amdgpu_kernel void @s_test_copysign_f16_0(ptr addrspace(1) %out, half %mag) {
81; SI-LABEL: s_test_copysign_f16_0:
82; SI:       ; %bb.0:
83; SI-NEXT:    s_load_dword s6, s[4:5], 0xb
84; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
85; SI-NEXT:    s_mov_b32 s3, 0xf000
86; SI-NEXT:    s_mov_b32 s2, -1
87; SI-NEXT:    s_waitcnt lgkmcnt(0)
88; SI-NEXT:    s_and_b32 s4, s6, 0x7fff
89; SI-NEXT:    v_mov_b32_e32 v0, s4
90; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
91; SI-NEXT:    s_endpgm
92;
93; VI-LABEL: s_test_copysign_f16_0:
94; VI:       ; %bb.0:
95; VI-NEXT:    s_load_dword s2, s[4:5], 0x2c
96; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
97; VI-NEXT:    s_waitcnt lgkmcnt(0)
98; VI-NEXT:    s_and_b32 s2, s2, 0x7fff
99; VI-NEXT:    v_mov_b32_e32 v0, s0
100; VI-NEXT:    v_mov_b32_e32 v1, s1
101; VI-NEXT:    v_mov_b32_e32 v2, s2
102; VI-NEXT:    flat_store_short v[0:1], v2
103; VI-NEXT:    s_endpgm
104;
105; GFX9-LABEL: s_test_copysign_f16_0:
106; GFX9:       ; %bb.0:
107; GFX9-NEXT:    s_load_dword s2, s[4:5], 0x2c
108; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
109; GFX9-NEXT:    v_mov_b32_e32 v0, 0
110; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
111; GFX9-NEXT:    s_and_b32 s2, s2, 0x7fff
112; GFX9-NEXT:    v_mov_b32_e32 v1, s2
113; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
114; GFX9-NEXT:    s_endpgm
115;
116; GFX11-LABEL: s_test_copysign_f16_0:
117; GFX11:       ; %bb.0:
118; GFX11-NEXT:    s_clause 0x1
119; GFX11-NEXT:    s_load_b32 s2, s[4:5], 0x2c
120; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
121; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
122; GFX11-NEXT:    s_and_b32 s2, s2, 0x7fff
123; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
124; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
125; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
126; GFX11-NEXT:    s_endpgm
127  %result = call half @llvm.copysign.f16(half %mag, half 0.0)
128  store half %result, ptr addrspace(1) %out, align 4
129  ret void
130}
131
; Kernel: copysign(%mag, 1.0) stored to %out. With this constant sign operand
; every check set expects a single scalar s_and_b32 with 0x7fff on the loaded
; argument.
132define amdgpu_kernel void @s_test_copysign_f16_1(ptr addrspace(1) %out, half %mag) {
133; SI-LABEL: s_test_copysign_f16_1:
134; SI:       ; %bb.0:
135; SI-NEXT:    s_load_dword s6, s[4:5], 0xb
136; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
137; SI-NEXT:    s_mov_b32 s3, 0xf000
138; SI-NEXT:    s_mov_b32 s2, -1
139; SI-NEXT:    s_waitcnt lgkmcnt(0)
140; SI-NEXT:    s_and_b32 s4, s6, 0x7fff
141; SI-NEXT:    v_mov_b32_e32 v0, s4
142; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
143; SI-NEXT:    s_endpgm
144;
145; VI-LABEL: s_test_copysign_f16_1:
146; VI:       ; %bb.0:
147; VI-NEXT:    s_load_dword s2, s[4:5], 0x2c
148; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
149; VI-NEXT:    s_waitcnt lgkmcnt(0)
150; VI-NEXT:    s_and_b32 s2, s2, 0x7fff
151; VI-NEXT:    v_mov_b32_e32 v0, s0
152; VI-NEXT:    v_mov_b32_e32 v1, s1
153; VI-NEXT:    v_mov_b32_e32 v2, s2
154; VI-NEXT:    flat_store_short v[0:1], v2
155; VI-NEXT:    s_endpgm
156;
157; GFX9-LABEL: s_test_copysign_f16_1:
158; GFX9:       ; %bb.0:
159; GFX9-NEXT:    s_load_dword s2, s[4:5], 0x2c
160; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
161; GFX9-NEXT:    v_mov_b32_e32 v0, 0
162; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
163; GFX9-NEXT:    s_and_b32 s2, s2, 0x7fff
164; GFX9-NEXT:    v_mov_b32_e32 v1, s2
165; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
166; GFX9-NEXT:    s_endpgm
167;
168; GFX11-LABEL: s_test_copysign_f16_1:
169; GFX11:       ; %bb.0:
170; GFX11-NEXT:    s_clause 0x1
171; GFX11-NEXT:    s_load_b32 s2, s[4:5], 0x2c
172; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
173; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
174; GFX11-NEXT:    s_and_b32 s2, s2, 0x7fff
175; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
176; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
177; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
178; GFX11-NEXT:    s_endpgm
179  %result = call half @llvm.copysign.f16(half %mag, half 1.0)
180  store half %result, ptr addrspace(1) %out, align 4
181  ret void
182}
183
; Kernel: copysign(%mag, 10.0) stored to %out. With this constant sign operand
; every check set expects a single scalar s_and_b32 with 0x7fff on the loaded
; argument.
184define amdgpu_kernel void @s_test_copysign_f16_10.0(ptr addrspace(1) %out, half %mag) {
185; SI-LABEL: s_test_copysign_f16_10.0:
186; SI:       ; %bb.0:
187; SI-NEXT:    s_load_dword s6, s[4:5], 0xb
188; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
189; SI-NEXT:    s_mov_b32 s3, 0xf000
190; SI-NEXT:    s_mov_b32 s2, -1
191; SI-NEXT:    s_waitcnt lgkmcnt(0)
192; SI-NEXT:    s_and_b32 s4, s6, 0x7fff
193; SI-NEXT:    v_mov_b32_e32 v0, s4
194; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
195; SI-NEXT:    s_endpgm
196;
197; VI-LABEL: s_test_copysign_f16_10.0:
198; VI:       ; %bb.0:
199; VI-NEXT:    s_load_dword s2, s[4:5], 0x2c
200; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
201; VI-NEXT:    s_waitcnt lgkmcnt(0)
202; VI-NEXT:    s_and_b32 s2, s2, 0x7fff
203; VI-NEXT:    v_mov_b32_e32 v0, s0
204; VI-NEXT:    v_mov_b32_e32 v1, s1
205; VI-NEXT:    v_mov_b32_e32 v2, s2
206; VI-NEXT:    flat_store_short v[0:1], v2
207; VI-NEXT:    s_endpgm
208;
209; GFX9-LABEL: s_test_copysign_f16_10.0:
210; GFX9:       ; %bb.0:
211; GFX9-NEXT:    s_load_dword s2, s[4:5], 0x2c
212; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
213; GFX9-NEXT:    v_mov_b32_e32 v0, 0
214; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
215; GFX9-NEXT:    s_and_b32 s2, s2, 0x7fff
216; GFX9-NEXT:    v_mov_b32_e32 v1, s2
217; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
218; GFX9-NEXT:    s_endpgm
219;
220; GFX11-LABEL: s_test_copysign_f16_10.0:
221; GFX11:       ; %bb.0:
222; GFX11-NEXT:    s_clause 0x1
223; GFX11-NEXT:    s_load_b32 s2, s[4:5], 0x2c
224; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
225; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
226; GFX11-NEXT:    s_and_b32 s2, s2, 0x7fff
227; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
228; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
229; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
230; GFX11-NEXT:    s_endpgm
231  %result = call half @llvm.copysign.f16(half %mag, half 10.0)
232  store half %result, ptr addrspace(1) %out, align 4
233  ret void
234}
235
; Kernel: copysign(%mag, -1.0) stored to %out. With this negative constant
; sign operand the SI checks expect s_or_b32 with 0x8000, while the VI, GFX9
; and GFX11 checks expect s_bitset1_b32 on bit 15.
236define amdgpu_kernel void @s_test_copysign_f16_neg1(ptr addrspace(1) %out, half %mag) {
237; SI-LABEL: s_test_copysign_f16_neg1:
238; SI:       ; %bb.0:
239; SI-NEXT:    s_load_dword s6, s[4:5], 0xb
240; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
241; SI-NEXT:    s_mov_b32 s3, 0xf000
242; SI-NEXT:    s_mov_b32 s2, -1
243; SI-NEXT:    s_waitcnt lgkmcnt(0)
244; SI-NEXT:    s_or_b32 s4, s6, 0x8000
245; SI-NEXT:    v_mov_b32_e32 v0, s4
246; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
247; SI-NEXT:    s_endpgm
248;
249; VI-LABEL: s_test_copysign_f16_neg1:
250; VI:       ; %bb.0:
251; VI-NEXT:    s_load_dword s2, s[4:5], 0x2c
252; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
253; VI-NEXT:    s_waitcnt lgkmcnt(0)
254; VI-NEXT:    s_bitset1_b32 s2, 15
255; VI-NEXT:    v_mov_b32_e32 v0, s0
256; VI-NEXT:    v_mov_b32_e32 v1, s1
257; VI-NEXT:    v_mov_b32_e32 v2, s2
258; VI-NEXT:    flat_store_short v[0:1], v2
259; VI-NEXT:    s_endpgm
260;
261; GFX9-LABEL: s_test_copysign_f16_neg1:
262; GFX9:       ; %bb.0:
263; GFX9-NEXT:    s_load_dword s2, s[4:5], 0x2c
264; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
265; GFX9-NEXT:    v_mov_b32_e32 v0, 0
266; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
267; GFX9-NEXT:    s_bitset1_b32 s2, 15
268; GFX9-NEXT:    v_mov_b32_e32 v1, s2
269; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
270; GFX9-NEXT:    s_endpgm
271;
272; GFX11-LABEL: s_test_copysign_f16_neg1:
273; GFX11:       ; %bb.0:
274; GFX11-NEXT:    s_clause 0x1
275; GFX11-NEXT:    s_load_b32 s2, s[4:5], 0x2c
276; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
277; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
278; GFX11-NEXT:    s_bitset1_b32 s2, 15
279; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
280; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
281; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
282; GFX11-NEXT:    s_endpgm
283  %result = call half @llvm.copysign.f16(half %mag, half -1.0)
284  store half %result, ptr addrspace(1) %out, align 4
285  ret void
286}
287
; Kernel: copysign(%mag, -10.0) stored to %out. With this negative constant
; sign operand the SI checks expect s_or_b32 with 0x8000, while the VI, GFX9
; and GFX11 checks expect s_bitset1_b32 on bit 15.
288define amdgpu_kernel void @s_test_copysign_f16_neg10(ptr addrspace(1) %out, half %mag) {
289; SI-LABEL: s_test_copysign_f16_neg10:
290; SI:       ; %bb.0:
291; SI-NEXT:    s_load_dword s6, s[4:5], 0xb
292; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
293; SI-NEXT:    s_mov_b32 s3, 0xf000
294; SI-NEXT:    s_mov_b32 s2, -1
295; SI-NEXT:    s_waitcnt lgkmcnt(0)
296; SI-NEXT:    s_or_b32 s4, s6, 0x8000
297; SI-NEXT:    v_mov_b32_e32 v0, s4
298; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
299; SI-NEXT:    s_endpgm
300;
301; VI-LABEL: s_test_copysign_f16_neg10:
302; VI:       ; %bb.0:
303; VI-NEXT:    s_load_dword s2, s[4:5], 0x2c
304; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
305; VI-NEXT:    s_waitcnt lgkmcnt(0)
306; VI-NEXT:    s_bitset1_b32 s2, 15
307; VI-NEXT:    v_mov_b32_e32 v0, s0
308; VI-NEXT:    v_mov_b32_e32 v1, s1
309; VI-NEXT:    v_mov_b32_e32 v2, s2
310; VI-NEXT:    flat_store_short v[0:1], v2
311; VI-NEXT:    s_endpgm
312;
313; GFX9-LABEL: s_test_copysign_f16_neg10:
314; GFX9:       ; %bb.0:
315; GFX9-NEXT:    s_load_dword s2, s[4:5], 0x2c
316; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
317; GFX9-NEXT:    v_mov_b32_e32 v0, 0
318; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
319; GFX9-NEXT:    s_bitset1_b32 s2, 15
320; GFX9-NEXT:    v_mov_b32_e32 v1, s2
321; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
322; GFX9-NEXT:    s_endpgm
323;
324; GFX11-LABEL: s_test_copysign_f16_neg10:
325; GFX11:       ; %bb.0:
326; GFX11-NEXT:    s_clause 0x1
327; GFX11-NEXT:    s_load_b32 s2, s[4:5], 0x2c
328; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
329; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
330; GFX11-NEXT:    s_bitset1_b32 s2, 15
331; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
332; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
333; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
334; GFX11-NEXT:    s_endpgm
335  %result = call half @llvm.copysign.f16(half %mag, half -10.0)
336  store half %result, ptr addrspace(1) %out, align 4
337  ret void
338}
339
; Kernel: copysign(0.0, %sign) stored to %out, i.e. constant magnitude and
; variable sign. The SI checks expect %sign converted to f32 and a v_bfi_b32
; with inline constant 0 as the magnitude operand. The VI, GFX9 and GFX11
; checks expect only s_and_b32 with 0x8000.
340define amdgpu_kernel void @s_test_copysign_f16_0_mag(ptr addrspace(1) %out, half %sign) {
341; SI-LABEL: s_test_copysign_f16_0_mag:
342; SI:       ; %bb.0:
343; SI-NEXT:    s_load_dword s0, s[4:5], 0xb
344; SI-NEXT:    s_brev_b32 s2, -2
345; SI-NEXT:    s_mov_b32 s3, 0xf000
346; SI-NEXT:    s_waitcnt lgkmcnt(0)
347; SI-NEXT:    v_cvt_f32_f16_e32 v0, s0
348; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
349; SI-NEXT:    v_bfi_b32 v0, s2, 0, v0
350; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
351; SI-NEXT:    s_mov_b32 s2, -1
352; SI-NEXT:    s_waitcnt lgkmcnt(0)
353; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
354; SI-NEXT:    s_endpgm
355;
356; VI-LABEL: s_test_copysign_f16_0_mag:
357; VI:       ; %bb.0:
358; VI-NEXT:    s_load_dword s2, s[4:5], 0x2c
359; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
360; VI-NEXT:    s_waitcnt lgkmcnt(0)
361; VI-NEXT:    s_and_b32 s2, s2, 0x8000
362; VI-NEXT:    v_mov_b32_e32 v0, s0
363; VI-NEXT:    v_mov_b32_e32 v1, s1
364; VI-NEXT:    v_mov_b32_e32 v2, s2
365; VI-NEXT:    flat_store_short v[0:1], v2
366; VI-NEXT:    s_endpgm
367;
368; GFX9-LABEL: s_test_copysign_f16_0_mag:
369; GFX9:       ; %bb.0:
370; GFX9-NEXT:    s_load_dword s2, s[4:5], 0x2c
371; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
372; GFX9-NEXT:    v_mov_b32_e32 v0, 0
373; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
374; GFX9-NEXT:    s_and_b32 s2, s2, 0x8000
375; GFX9-NEXT:    v_mov_b32_e32 v1, s2
376; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
377; GFX9-NEXT:    s_endpgm
378;
379; GFX11-LABEL: s_test_copysign_f16_0_mag:
380; GFX11:       ; %bb.0:
381; GFX11-NEXT:    s_clause 0x1
382; GFX11-NEXT:    s_load_b32 s2, s[4:5], 0x2c
383; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
384; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
385; GFX11-NEXT:    s_and_b32 s2, s2, 0x8000
386; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
387; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
388; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
389; GFX11-NEXT:    s_endpgm
390  %result = call half @llvm.copysign.f16(half 0.0, half %sign)
391  store half %result, ptr addrspace(1) %out, align 4
392  ret void
393}
394
395
; Kernel: copysign(1.0, %sign) stored to %out, i.e. constant magnitude and
; variable sign. The SI checks expect %sign converted to f32 and a v_bfi_b32
; with inline constant 1.0 as the magnitude operand. The VI, GFX9 and GFX11
; checks expect s_and_b32 with 0x8000 followed by s_or_b32 with 0x3c00.
396define amdgpu_kernel void @s_test_copysign_f16_1_mag(ptr addrspace(1) %out, half %sign) {
397; SI-LABEL: s_test_copysign_f16_1_mag:
398; SI:       ; %bb.0:
399; SI-NEXT:    s_load_dword s0, s[4:5], 0xb
400; SI-NEXT:    s_brev_b32 s2, -2
401; SI-NEXT:    s_mov_b32 s3, 0xf000
402; SI-NEXT:    s_waitcnt lgkmcnt(0)
403; SI-NEXT:    v_cvt_f32_f16_e32 v0, s0
404; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
405; SI-NEXT:    v_bfi_b32 v0, s2, 1.0, v0
406; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
407; SI-NEXT:    s_mov_b32 s2, -1
408; SI-NEXT:    s_waitcnt lgkmcnt(0)
409; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
410; SI-NEXT:    s_endpgm
411;
412; VI-LABEL: s_test_copysign_f16_1_mag:
413; VI:       ; %bb.0:
414; VI-NEXT:    s_load_dword s2, s[4:5], 0x2c
415; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
416; VI-NEXT:    s_waitcnt lgkmcnt(0)
417; VI-NEXT:    s_and_b32 s2, s2, 0x8000
418; VI-NEXT:    s_or_b32 s2, s2, 0x3c00
419; VI-NEXT:    v_mov_b32_e32 v0, s0
420; VI-NEXT:    v_mov_b32_e32 v1, s1
421; VI-NEXT:    v_mov_b32_e32 v2, s2
422; VI-NEXT:    flat_store_short v[0:1], v2
423; VI-NEXT:    s_endpgm
424;
425; GFX9-LABEL: s_test_copysign_f16_1_mag:
426; GFX9:       ; %bb.0:
427; GFX9-NEXT:    s_load_dword s2, s[4:5], 0x2c
428; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
429; GFX9-NEXT:    v_mov_b32_e32 v0, 0
430; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
431; GFX9-NEXT:    s_and_b32 s2, s2, 0x8000
432; GFX9-NEXT:    s_or_b32 s2, s2, 0x3c00
433; GFX9-NEXT:    v_mov_b32_e32 v1, s2
434; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
435; GFX9-NEXT:    s_endpgm
436;
437; GFX11-LABEL: s_test_copysign_f16_1_mag:
438; GFX11:       ; %bb.0:
439; GFX11-NEXT:    s_clause 0x1
440; GFX11-NEXT:    s_load_b32 s2, s[4:5], 0x2c
441; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
442; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
443; GFX11-NEXT:    s_and_b32 s2, s2, 0x8000
444; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
445; GFX11-NEXT:    s_or_b32 s2, s2, 0x3c00
446; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
447; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
448; GFX11-NEXT:    s_endpgm
449  %result = call half @llvm.copysign.f16(half 1.0, half %sign)
450  store half %result, ptr addrspace(1) %out, align 4
451  ret void
452}
453
; Kernel: copysign(10.0, %sign) stored to %out, i.e. constant magnitude and
; variable sign. The SI checks expect the literal 0x41200000 to be moved into
; v1 and used as the magnitude operand of v_bfi_b32. The VI, GFX9 and GFX11
; checks expect s_and_b32 with 0x8000 followed by s_or_b32 with 0x4900.
454define amdgpu_kernel void @s_test_copysign_f16_10_mag(ptr addrspace(1) %out, half %sign) {
455; SI-LABEL: s_test_copysign_f16_10_mag:
456; SI:       ; %bb.0:
457; SI-NEXT:    s_load_dword s0, s[4:5], 0xb
458; SI-NEXT:    s_brev_b32 s2, -2
459; SI-NEXT:    v_mov_b32_e32 v1, 0x41200000
460; SI-NEXT:    s_mov_b32 s3, 0xf000
461; SI-NEXT:    s_waitcnt lgkmcnt(0)
462; SI-NEXT:    v_cvt_f32_f16_e32 v0, s0
463; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
464; SI-NEXT:    v_bfi_b32 v0, s2, v1, v0
465; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
466; SI-NEXT:    s_mov_b32 s2, -1
467; SI-NEXT:    s_waitcnt lgkmcnt(0)
468; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
469; SI-NEXT:    s_endpgm
470;
471; VI-LABEL: s_test_copysign_f16_10_mag:
472; VI:       ; %bb.0:
473; VI-NEXT:    s_load_dword s2, s[4:5], 0x2c
474; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
475; VI-NEXT:    s_waitcnt lgkmcnt(0)
476; VI-NEXT:    s_and_b32 s2, s2, 0x8000
477; VI-NEXT:    s_or_b32 s2, s2, 0x4900
478; VI-NEXT:    v_mov_b32_e32 v0, s0
479; VI-NEXT:    v_mov_b32_e32 v1, s1
480; VI-NEXT:    v_mov_b32_e32 v2, s2
481; VI-NEXT:    flat_store_short v[0:1], v2
482; VI-NEXT:    s_endpgm
483;
484; GFX9-LABEL: s_test_copysign_f16_10_mag:
485; GFX9:       ; %bb.0:
486; GFX9-NEXT:    s_load_dword s2, s[4:5], 0x2c
487; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
488; GFX9-NEXT:    v_mov_b32_e32 v0, 0
489; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
490; GFX9-NEXT:    s_and_b32 s2, s2, 0x8000
491; GFX9-NEXT:    s_or_b32 s2, s2, 0x4900
492; GFX9-NEXT:    v_mov_b32_e32 v1, s2
493; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
494; GFX9-NEXT:    s_endpgm
495;
496; GFX11-LABEL: s_test_copysign_f16_10_mag:
497; GFX11:       ; %bb.0:
498; GFX11-NEXT:    s_clause 0x1
499; GFX11-NEXT:    s_load_b32 s2, s[4:5], 0x2c
500; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
501; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
502; GFX11-NEXT:    s_and_b32 s2, s2, 0x8000
503; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
504; GFX11-NEXT:    s_or_b32 s2, s2, 0x4900
505; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
506; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
507; GFX11-NEXT:    s_endpgm
508  %result = call half @llvm.copysign.f16(half 10.0, half %sign)
509  store half %result, ptr addrspace(1) %out, align 4
510  ret void
511}
512
; Kernel: copysign(-1.0, %sign) stored to %out, i.e. negative constant
; magnitude and variable sign. The SI checks expect a v_bfi_b32 with inline
; constant -1.0 as the magnitude operand. The VI, GFX9 and GFX11 checks expect
; s_and_b32 with 0x8000 followed by s_or_b32 with 0x3c00 (bit 15 clear in the
; OR-ed constant, so the magnitude constant's own sign is not kept).
513define amdgpu_kernel void @s_test_copysign_f16_neg1_mag(ptr addrspace(1) %out, half %sign) {
514; SI-LABEL: s_test_copysign_f16_neg1_mag:
515; SI:       ; %bb.0:
516; SI-NEXT:    s_load_dword s0, s[4:5], 0xb
517; SI-NEXT:    s_brev_b32 s2, -2
518; SI-NEXT:    s_mov_b32 s3, 0xf000
519; SI-NEXT:    s_waitcnt lgkmcnt(0)
520; SI-NEXT:    v_cvt_f32_f16_e32 v0, s0
521; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
522; SI-NEXT:    v_bfi_b32 v0, s2, -1.0, v0
523; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
524; SI-NEXT:    s_mov_b32 s2, -1
525; SI-NEXT:    s_waitcnt lgkmcnt(0)
526; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
527; SI-NEXT:    s_endpgm
528;
529; VI-LABEL: s_test_copysign_f16_neg1_mag:
530; VI:       ; %bb.0:
531; VI-NEXT:    s_load_dword s2, s[4:5], 0x2c
532; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
533; VI-NEXT:    s_waitcnt lgkmcnt(0)
534; VI-NEXT:    s_and_b32 s2, s2, 0x8000
535; VI-NEXT:    s_or_b32 s2, s2, 0x3c00
536; VI-NEXT:    v_mov_b32_e32 v0, s0
537; VI-NEXT:    v_mov_b32_e32 v1, s1
538; VI-NEXT:    v_mov_b32_e32 v2, s2
539; VI-NEXT:    flat_store_short v[0:1], v2
540; VI-NEXT:    s_endpgm
541;
542; GFX9-LABEL: s_test_copysign_f16_neg1_mag:
543; GFX9:       ; %bb.0:
544; GFX9-NEXT:    s_load_dword s2, s[4:5], 0x2c
545; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
546; GFX9-NEXT:    v_mov_b32_e32 v0, 0
547; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
548; GFX9-NEXT:    s_and_b32 s2, s2, 0x8000
549; GFX9-NEXT:    s_or_b32 s2, s2, 0x3c00
550; GFX9-NEXT:    v_mov_b32_e32 v1, s2
551; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
552; GFX9-NEXT:    s_endpgm
553;
554; GFX11-LABEL: s_test_copysign_f16_neg1_mag:
555; GFX11:       ; %bb.0:
556; GFX11-NEXT:    s_clause 0x1
557; GFX11-NEXT:    s_load_b32 s2, s[4:5], 0x2c
558; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
559; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
560; GFX11-NEXT:    s_and_b32 s2, s2, 0x8000
561; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
562; GFX11-NEXT:    s_or_b32 s2, s2, 0x3c00
563; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
564; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
565; GFX11-NEXT:    s_endpgm
566  %result = call half @llvm.copysign.f16(half -1.0, half %sign)
567  store half %result, ptr addrspace(1) %out, align 4
568  ret void
569}
570
; Kernel: copysign(-10.0, %sign) stored to %out, i.e. negative constant
; magnitude and variable sign. The SI checks expect the literal 0xc1200000 to
; be moved into v1 and used as the magnitude operand of v_bfi_b32. The VI,
; GFX9 and GFX11 checks expect s_and_b32 with 0x8000 followed by s_or_b32 with
; 0x4900 (bit 15 clear in the OR-ed constant).
571define amdgpu_kernel void @s_test_copysign_f16_neg10_mag(ptr addrspace(1) %out, half %sign) {
572; SI-LABEL: s_test_copysign_f16_neg10_mag:
573; SI:       ; %bb.0:
574; SI-NEXT:    s_load_dword s0, s[4:5], 0xb
575; SI-NEXT:    s_brev_b32 s2, -2
576; SI-NEXT:    v_mov_b32_e32 v1, 0xc1200000
577; SI-NEXT:    s_mov_b32 s3, 0xf000
578; SI-NEXT:    s_waitcnt lgkmcnt(0)
579; SI-NEXT:    v_cvt_f32_f16_e32 v0, s0
580; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
581; SI-NEXT:    v_bfi_b32 v0, s2, v1, v0
582; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
583; SI-NEXT:    s_mov_b32 s2, -1
584; SI-NEXT:    s_waitcnt lgkmcnt(0)
585; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
586; SI-NEXT:    s_endpgm
587;
588; VI-LABEL: s_test_copysign_f16_neg10_mag:
589; VI:       ; %bb.0:
590; VI-NEXT:    s_load_dword s2, s[4:5], 0x2c
591; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
592; VI-NEXT:    s_waitcnt lgkmcnt(0)
593; VI-NEXT:    s_and_b32 s2, s2, 0x8000
594; VI-NEXT:    s_or_b32 s2, s2, 0x4900
595; VI-NEXT:    v_mov_b32_e32 v0, s0
596; VI-NEXT:    v_mov_b32_e32 v1, s1
597; VI-NEXT:    v_mov_b32_e32 v2, s2
598; VI-NEXT:    flat_store_short v[0:1], v2
599; VI-NEXT:    s_endpgm
600;
601; GFX9-LABEL: s_test_copysign_f16_neg10_mag:
602; GFX9:       ; %bb.0:
603; GFX9-NEXT:    s_load_dword s2, s[4:5], 0x2c
604; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
605; GFX9-NEXT:    v_mov_b32_e32 v0, 0
606; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
607; GFX9-NEXT:    s_and_b32 s2, s2, 0x8000
608; GFX9-NEXT:    s_or_b32 s2, s2, 0x4900
609; GFX9-NEXT:    v_mov_b32_e32 v1, s2
610; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
611; GFX9-NEXT:    s_endpgm
612;
613; GFX11-LABEL: s_test_copysign_f16_neg10_mag:
614; GFX11:       ; %bb.0:
615; GFX11-NEXT:    s_clause 0x1
616; GFX11-NEXT:    s_load_b32 s2, s[4:5], 0x2c
617; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
618; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
619; GFX11-NEXT:    s_and_b32 s2, s2, 0x8000
620; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
621; GFX11-NEXT:    s_or_b32 s2, s2, 0x4900
622; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
623; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
624; GFX11-NEXT:    s_endpgm
625  %result = call half @llvm.copysign.f16(half -10.0, half %sign)
626  store half %result, ptr addrspace(1) %out, align 4
627  ret void
628}
629
; Non-kernel function returning copysign(%mag, %sign) for two half arguments.
; The SI checks expect v0 to pass through v_cvt_f16_f32 and v_cvt_f32_f16
; before a v_bfi_b32 with v1 and the mask from s_brev_b32 -2. The VI and GFX9
; checks expect s_movk_i32 0x7fff plus v_bfi_b32; the GFX11 checks expect
; v_bfi_b32 with 0x7fff as a literal operand.
630define half @v_copysign_f16(half %mag, half %sign) {
631; SI-LABEL: v_copysign_f16:
632; SI:       ; %bb.0:
633; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
634; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
635; SI-NEXT:    s_brev_b32 s4, -2
636; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
637; SI-NEXT:    v_bfi_b32 v0, s4, v0, v1
638; SI-NEXT:    s_setpc_b64 s[30:31]
639;
640; VI-LABEL: v_copysign_f16:
641; VI:       ; %bb.0:
642; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
643; VI-NEXT:    s_movk_i32 s4, 0x7fff
644; VI-NEXT:    v_bfi_b32 v0, s4, v0, v1
645; VI-NEXT:    s_setpc_b64 s[30:31]
646;
647; GFX9-LABEL: v_copysign_f16:
648; GFX9:       ; %bb.0:
649; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
650; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
651; GFX9-NEXT:    v_bfi_b32 v0, s4, v0, v1
652; GFX9-NEXT:    s_setpc_b64 s[30:31]
653;
654; GFX11-LABEL: v_copysign_f16:
655; GFX11:       ; %bb.0:
656; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
657; GFX11-NEXT:    v_bfi_b32 v0, 0x7fff, v0, v1
658; GFX11-NEXT:    s_setpc_b64 s[30:31]
659  %result = call half @llvm.copysign.f16(half %mag, half %sign)
660  ret half %result
661}
662
; Non-kernel function returning copysign(%mag, 0.0). The SI checks expect
; v_cvt_f16_f32 followed by v_cvt_f32_f16 with the |v0| source modifier. The
; VI, GFX9 and GFX11 checks expect a single v_and_b32 with 0x7fff.
663define half @v_test_copysign_f16_0(half %mag) {
664; SI-LABEL: v_test_copysign_f16_0:
665; SI:       ; %bb.0:
666; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
667; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
668; SI-NEXT:    v_cvt_f32_f16_e64 v0, |v0|
669; SI-NEXT:    s_setpc_b64 s[30:31]
670;
671; VI-LABEL: v_test_copysign_f16_0:
672; VI:       ; %bb.0:
673; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
674; VI-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
675; VI-NEXT:    s_setpc_b64 s[30:31]
676;
677; GFX9-LABEL: v_test_copysign_f16_0:
678; GFX9:       ; %bb.0:
679; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
680; GFX9-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
681; GFX9-NEXT:    s_setpc_b64 s[30:31]
682;
683; GFX11-LABEL: v_test_copysign_f16_0:
684; GFX11:       ; %bb.0:
685; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
686; GFX11-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
687; GFX11-NEXT:    s_setpc_b64 s[30:31]
688  %result = call half @llvm.copysign.f16(half %mag, half 0.0)
689  ret half %result
690}
691
; Non-kernel function returning copysign(%mag, 1.0). The SI checks expect
; v_cvt_f16_f32 followed by v_cvt_f32_f16 with the |v0| source modifier. The
; VI, GFX9 and GFX11 checks expect a single v_and_b32 with 0x7fff.
692define half @v_test_copysign_f16_1(half %mag) {
693; SI-LABEL: v_test_copysign_f16_1:
694; SI:       ; %bb.0:
695; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
696; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
697; SI-NEXT:    v_cvt_f32_f16_e64 v0, |v0|
698; SI-NEXT:    s_setpc_b64 s[30:31]
699;
700; VI-LABEL: v_test_copysign_f16_1:
701; VI:       ; %bb.0:
702; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
703; VI-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
704; VI-NEXT:    s_setpc_b64 s[30:31]
705;
706; GFX9-LABEL: v_test_copysign_f16_1:
707; GFX9:       ; %bb.0:
708; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
709; GFX9-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
710; GFX9-NEXT:    s_setpc_b64 s[30:31]
711;
712; GFX11-LABEL: v_test_copysign_f16_1:
713; GFX11:       ; %bb.0:
714; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
715; GFX11-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
716; GFX11-NEXT:    s_setpc_b64 s[30:31]
717  %result = call half @llvm.copysign.f16(half %mag, half 1.0)
718  ret half %result
719}
720
; Non-kernel function returning copysign(%mag, 10.0). The SI checks expect
; v_cvt_f16_f32 followed by v_cvt_f32_f16 with the |v0| source modifier. The
; VI, GFX9 and GFX11 checks expect a single v_and_b32 with 0x7fff.
721define half @v_test_copysign_f16_10(half %mag) {
722; SI-LABEL: v_test_copysign_f16_10:
723; SI:       ; %bb.0:
724; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
725; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
726; SI-NEXT:    v_cvt_f32_f16_e64 v0, |v0|
727; SI-NEXT:    s_setpc_b64 s[30:31]
728;
729; VI-LABEL: v_test_copysign_f16_10:
730; VI:       ; %bb.0:
731; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
732; VI-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
733; VI-NEXT:    s_setpc_b64 s[30:31]
734;
735; GFX9-LABEL: v_test_copysign_f16_10:
736; GFX9:       ; %bb.0:
737; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
738; GFX9-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
739; GFX9-NEXT:    s_setpc_b64 s[30:31]
740;
741; GFX11-LABEL: v_test_copysign_f16_10:
742; GFX11:       ; %bb.0:
743; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
744; GFX11-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
745; GFX11-NEXT:    s_setpc_b64 s[30:31]
746  %result = call half @llvm.copysign.f16(half %mag, half 10.0)
747  ret half %result
748}
749
; Non-kernel function returning copysign(%mag, -1.0). The SI checks expect
; v_cvt_f16_f32 followed by v_cvt_f32_f16 with the -|v0| source modifier. The
; VI, GFX9 and GFX11 checks expect a single v_or_b32 with 0x8000.
750define half @v_test_copysign_f16_neg1(half %mag) {
751; SI-LABEL: v_test_copysign_f16_neg1:
752; SI:       ; %bb.0:
753; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
754; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
755; SI-NEXT:    v_cvt_f32_f16_e64 v0, -|v0|
756; SI-NEXT:    s_setpc_b64 s[30:31]
757;
758; VI-LABEL: v_test_copysign_f16_neg1:
759; VI:       ; %bb.0:
760; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
761; VI-NEXT:    v_or_b32_e32 v0, 0x8000, v0
762; VI-NEXT:    s_setpc_b64 s[30:31]
763;
764; GFX9-LABEL: v_test_copysign_f16_neg1:
765; GFX9:       ; %bb.0:
766; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
767; GFX9-NEXT:    v_or_b32_e32 v0, 0x8000, v0
768; GFX9-NEXT:    s_setpc_b64 s[30:31]
769;
770; GFX11-LABEL: v_test_copysign_f16_neg1:
771; GFX11:       ; %bb.0:
772; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
773; GFX11-NEXT:    v_or_b32_e32 v0, 0x8000, v0
774; GFX11-NEXT:    s_setpc_b64 s[30:31]
775  %result = call half @llvm.copysign.f16(half %mag, half -1.0)
776  ret half %result
777}
778
; Non-kernel function returning copysign(%mag, -10.0). The SI checks expect
; v_cvt_f16_f32 followed by v_cvt_f32_f16 with the -|v0| source modifier. The
; VI, GFX9 and GFX11 checks expect a single v_or_b32 with 0x8000.
779define half @v_test_copysign_f16_neg10(half %mag) {
780; SI-LABEL: v_test_copysign_f16_neg10:
781; SI:       ; %bb.0:
782; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
783; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
784; SI-NEXT:    v_cvt_f32_f16_e64 v0, -|v0|
785; SI-NEXT:    s_setpc_b64 s[30:31]
786;
787; VI-LABEL: v_test_copysign_f16_neg10:
788; VI:       ; %bb.0:
789; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
790; VI-NEXT:    v_or_b32_e32 v0, 0x8000, v0
791; VI-NEXT:    s_setpc_b64 s[30:31]
792;
793; GFX9-LABEL: v_test_copysign_f16_neg10:
794; GFX9:       ; %bb.0:
795; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
796; GFX9-NEXT:    v_or_b32_e32 v0, 0x8000, v0
797; GFX9-NEXT:    s_setpc_b64 s[30:31]
798;
799; GFX11-LABEL: v_test_copysign_f16_neg10:
800; GFX11:       ; %bb.0:
801; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
802; GFX11-NEXT:    v_or_b32_e32 v0, 0x8000, v0
803; GFX11-NEXT:    s_setpc_b64 s[30:31]
804  %result = call half @llvm.copysign.f16(half %mag, half -10.0)
805  ret half %result
806}
807
; Kernel: f32 copysign whose magnitude comes from a half value.
; Each workitem loads a half magnitude and a float sign at index %tid, extends
; the magnitude to float, and calls llvm.copysign.f32. The result is stored to
; %arg_out without a workitem offset.
; The assertions expect every target to convert the magnitude with
; v_cvt_f32_f16 and merge the sign with one v_bfi_b32.
808define amdgpu_kernel void @v_copysign_out_f32_mag_f16_sign_f32(ptr addrspace(1) %arg_out, ptr addrspace(1) %arg_mag, ptr addrspace(1) %arg_sign) {
809; SI-LABEL: v_copysign_out_f32_mag_f16_sign_f32:
810; SI:       ; %bb.0:
811; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
812; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
813; SI-NEXT:    s_mov_b32 s11, 0xf000
814; SI-NEXT:    s_mov_b32 s14, 0
815; SI-NEXT:    s_mov_b32 s15, s11
816; SI-NEXT:    s_waitcnt lgkmcnt(0)
817; SI-NEXT:    s_mov_b64 s[12:13], s[2:3]
818; SI-NEXT:    v_lshlrev_b32_e32 v1, 1, v0
819; SI-NEXT:    v_mov_b32_e32 v2, 0
820; SI-NEXT:    buffer_load_ushort v3, v[1:2], s[12:15], 0 addr64
821; SI-NEXT:    s_mov_b64 s[6:7], s[14:15]
822; SI-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
823; SI-NEXT:    buffer_load_dword v0, v[1:2], s[4:7], 0 addr64
824; SI-NEXT:    s_mov_b32 s8, s0
825; SI-NEXT:    s_brev_b32 s0, -2
826; SI-NEXT:    s_mov_b32 s10, -1
827; SI-NEXT:    s_mov_b32 s9, s1
828; SI-NEXT:    s_waitcnt vmcnt(1)
829; SI-NEXT:    v_cvt_f32_f16_e32 v1, v3
830; SI-NEXT:    s_waitcnt vmcnt(0)
831; SI-NEXT:    v_bfi_b32 v0, s0, v1, v0
832; SI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
833; SI-NEXT:    s_endpgm
834;
835; VI-LABEL: v_copysign_out_f32_mag_f16_sign_f32:
836; VI:       ; %bb.0:
837; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
838; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
839; VI-NEXT:    v_lshlrev_b32_e32 v1, 1, v0
840; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
841; VI-NEXT:    s_waitcnt lgkmcnt(0)
842; VI-NEXT:    v_mov_b32_e32 v2, s3
843; VI-NEXT:    v_add_u32_e32 v1, vcc, s2, v1
844; VI-NEXT:    v_addc_u32_e32 v2, vcc, 0, v2, vcc
845; VI-NEXT:    flat_load_ushort v2, v[1:2]
846; VI-NEXT:    v_mov_b32_e32 v1, s5
847; VI-NEXT:    v_add_u32_e32 v0, vcc, s4, v0
848; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
849; VI-NEXT:    flat_load_dword v3, v[0:1]
850; VI-NEXT:    v_mov_b32_e32 v0, s0
851; VI-NEXT:    s_brev_b32 s0, -2
852; VI-NEXT:    v_mov_b32_e32 v1, s1
853; VI-NEXT:    s_waitcnt vmcnt(1)
854; VI-NEXT:    v_cvt_f32_f16_e32 v2, v2
855; VI-NEXT:    s_waitcnt vmcnt(0)
856; VI-NEXT:    v_bfi_b32 v2, s0, v2, v3
857; VI-NEXT:    flat_store_dword v[0:1], v2
858; VI-NEXT:    s_endpgm
859;
860; GFX9-LABEL: v_copysign_out_f32_mag_f16_sign_f32:
861; GFX9:       ; %bb.0:
862; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
863; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
864; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 1, v0
865; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
866; GFX9-NEXT:    v_mov_b32_e32 v2, 0
867; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
868; GFX9-NEXT:    global_load_ushort v1, v1, s[2:3]
869; GFX9-NEXT:    s_brev_b32 s2, -2
870; GFX9-NEXT:    global_load_dword v0, v0, s[6:7]
871; GFX9-NEXT:    s_waitcnt vmcnt(1)
872; GFX9-NEXT:    v_cvt_f32_f16_e32 v1, v1
873; GFX9-NEXT:    s_waitcnt vmcnt(0)
874; GFX9-NEXT:    v_bfi_b32 v0, s2, v1, v0
875; GFX9-NEXT:    global_store_dword v2, v0, s[0:1]
876; GFX9-NEXT:    s_endpgm
877;
878; GFX11-LABEL: v_copysign_out_f32_mag_f16_sign_f32:
879; GFX11:       ; %bb.0:
880; GFX11-NEXT:    s_clause 0x1
881; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
882; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
883; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
884; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
885; GFX11-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v1, 1, v0
886; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
887; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
888; GFX11-NEXT:    global_load_u16 v1, v1, s[2:3]
889; GFX11-NEXT:    global_load_b32 v0, v0, s[4:5]
890; GFX11-NEXT:    s_waitcnt vmcnt(1)
891; GFX11-NEXT:    v_cvt_f32_f16_e32 v1, v1
892; GFX11-NEXT:    s_waitcnt vmcnt(0)
893; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
894; GFX11-NEXT:    v_bfi_b32 v0, 0x7fffffff, v1, v0
895; GFX11-NEXT:    global_store_b32 v2, v0, s[0:1]
896; GFX11-NEXT:    s_endpgm
897  %tid = call i32 @llvm.amdgcn.workitem.id.x()
898  %arg_mag_gep = getelementptr half, ptr addrspace(1) %arg_mag, i32 %tid
899  %mag = load half, ptr addrspace(1) %arg_mag_gep
900  %mag.ext = fpext half %mag to float
901  %arg_sign_gep = getelementptr float, ptr addrspace(1) %arg_sign, i32 %tid
902  %sign = load float, ptr addrspace(1) %arg_sign_gep
903  %out = call float @llvm.copysign.f32(float %mag.ext, float %sign)
904  store float %out, ptr addrspace(1) %arg_out
905  ret void
906}
907
; Kernel: f64 copysign whose magnitude comes from a half value.
; Each workitem loads a half magnitude and a double sign at index %tid, extends
; the magnitude to double, and calls llvm.copysign.f64. The result is stored to
; %arg_out without a workitem offset.
; The assertions expect the magnitude to be widened via v_cvt_f32_f16 and
; v_cvt_f64_f32, with a single v_bfi_b32 applied to the upper dword of the
; result register pair.
908define amdgpu_kernel void @v_copysign_out_f64_mag_f16_sign_f64(ptr addrspace(1) %arg_out, ptr addrspace(1) %arg_mag, ptr addrspace(1) %arg_sign) {
909; SI-LABEL: v_copysign_out_f64_mag_f16_sign_f64:
910; SI:       ; %bb.0:
911; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
912; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
913; SI-NEXT:    s_mov_b32 s11, 0xf000
914; SI-NEXT:    s_mov_b32 s14, 0
915; SI-NEXT:    s_mov_b32 s15, s11
916; SI-NEXT:    s_waitcnt lgkmcnt(0)
917; SI-NEXT:    s_mov_b64 s[12:13], s[2:3]
918; SI-NEXT:    v_lshlrev_b32_e32 v1, 1, v0
919; SI-NEXT:    v_mov_b32_e32 v2, 0
920; SI-NEXT:    buffer_load_ushort v3, v[1:2], s[12:15], 0 addr64
921; SI-NEXT:    s_mov_b64 s[6:7], s[14:15]
922; SI-NEXT:    v_lshlrev_b32_e32 v1, 3, v0
923; SI-NEXT:    buffer_load_dwordx2 v[0:1], v[1:2], s[4:7], 0 addr64
924; SI-NEXT:    s_mov_b32 s8, s0
925; SI-NEXT:    s_brev_b32 s0, -2
926; SI-NEXT:    s_mov_b32 s10, -1
927; SI-NEXT:    s_mov_b32 s9, s1
928; SI-NEXT:    s_waitcnt vmcnt(0)
929; SI-NEXT:    v_cvt_f32_f16_e32 v0, v3
930; SI-NEXT:    v_cvt_f64_f32_e32 v[2:3], v0
931; SI-NEXT:    v_bfi_b32 v3, s0, v3, v1
932; SI-NEXT:    buffer_store_dwordx2 v[2:3], off, s[8:11], 0
933; SI-NEXT:    s_endpgm
934;
935; VI-LABEL: v_copysign_out_f64_mag_f16_sign_f64:
936; VI:       ; %bb.0:
937; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
938; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
939; VI-NEXT:    v_lshlrev_b32_e32 v1, 1, v0
940; VI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
941; VI-NEXT:    s_waitcnt lgkmcnt(0)
942; VI-NEXT:    v_mov_b32_e32 v2, s3
943; VI-NEXT:    v_add_u32_e32 v1, vcc, s2, v1
944; VI-NEXT:    v_addc_u32_e32 v2, vcc, 0, v2, vcc
945; VI-NEXT:    flat_load_ushort v2, v[1:2]
946; VI-NEXT:    v_mov_b32_e32 v1, s5
947; VI-NEXT:    v_add_u32_e32 v0, vcc, s4, v0
948; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
949; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
950; VI-NEXT:    v_mov_b32_e32 v4, s0
951; VI-NEXT:    s_brev_b32 s0, -2
952; VI-NEXT:    v_mov_b32_e32 v5, s1
953; VI-NEXT:    s_waitcnt vmcnt(0)
954; VI-NEXT:    v_cvt_f32_f16_e32 v0, v2
955; VI-NEXT:    v_cvt_f64_f32_e32 v[2:3], v0
956; VI-NEXT:    v_bfi_b32 v3, s0, v3, v1
957; VI-NEXT:    flat_store_dwordx2 v[4:5], v[2:3]
958; VI-NEXT:    s_endpgm
959;
960; GFX9-LABEL: v_copysign_out_f64_mag_f16_sign_f64:
961; GFX9:       ; %bb.0:
962; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
963; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
964; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 1, v0
965; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
966; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
967; GFX9-NEXT:    global_load_ushort v2, v1, s[2:3]
968; GFX9-NEXT:    s_nop 0
969; GFX9-NEXT:    global_load_dwordx2 v[0:1], v0, s[6:7]
970; GFX9-NEXT:    s_brev_b32 s2, -2
971; GFX9-NEXT:    s_waitcnt vmcnt(0)
972; GFX9-NEXT:    v_cvt_f32_f16_e32 v0, v2
973; GFX9-NEXT:    v_cvt_f64_f32_e32 v[2:3], v0
974; GFX9-NEXT:    v_mov_b32_e32 v0, 0
975; GFX9-NEXT:    v_bfi_b32 v3, s2, v3, v1
976; GFX9-NEXT:    global_store_dwordx2 v0, v[2:3], s[0:1]
977; GFX9-NEXT:    s_endpgm
978;
979; GFX11-LABEL: v_copysign_out_f64_mag_f16_sign_f64:
980; GFX11:       ; %bb.0:
981; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
982; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
983; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
984; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
985; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 1, v0
986; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
987; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
988; GFX11-NEXT:    global_load_u16 v2, v1, s[2:3]
989; GFX11-NEXT:    global_load_b64 v[0:1], v0, s[4:5]
990; GFX11-NEXT:    s_waitcnt vmcnt(0)
991; GFX11-NEXT:    v_cvt_f32_f16_e32 v0, v2
992; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
993; GFX11-NEXT:    v_cvt_f64_f32_e32 v[2:3], v0
994; GFX11-NEXT:    v_mov_b32_e32 v0, 0
995; GFX11-NEXT:    v_bfi_b32 v3, 0x7fffffff, v3, v1
996; GFX11-NEXT:    global_store_b64 v0, v[2:3], s[0:1]
997; GFX11-NEXT:    s_endpgm
998  %tid = call i32 @llvm.amdgcn.workitem.id.x()
999  %arg_mag_gep = getelementptr half, ptr addrspace(1) %arg_mag, i32 %tid
1000  %mag = load half, ptr addrspace(1) %arg_mag_gep
1001  %mag.ext = fpext half %mag to double
1002  %arg_sign_gep = getelementptr double, ptr addrspace(1) %arg_sign, i32 %tid
1003  %sign = load double, ptr addrspace(1) %arg_sign_gep
1004  %out = call double @llvm.copysign.f64(double %mag.ext, double %sign)
1005  store double %out, ptr addrspace(1) %arg_out
1006  ret void
1007}
1008
; Kernel: f32 copysign whose sign comes from a half value.
; Each workitem loads a float magnitude and a half sign at index %tid, extends
; the sign to float, and calls llvm.copysign.f32. The result is stored to
; %arg_out without a workitem offset.
; The assertions expect tahiti to convert the sign with v_cvt_f32_f16, while
; tonga, gfx900 and gfx1100 shift the raw half bits left by 16 instead. All
; targets then merge with one v_bfi_b32.
1009define amdgpu_kernel void @v_copysign_out_f32_mag_f32_sign_f16(ptr addrspace(1) %arg_out, ptr addrspace(1) %arg_mag, ptr addrspace(1) %arg_sign) {
1010; SI-LABEL: v_copysign_out_f32_mag_f32_sign_f16:
1011; SI:       ; %bb.0:
1012; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
1013; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
1014; SI-NEXT:    s_mov_b32 s11, 0xf000
1015; SI-NEXT:    s_mov_b32 s14, 0
1016; SI-NEXT:    s_mov_b32 s15, s11
1017; SI-NEXT:    s_waitcnt lgkmcnt(0)
1018; SI-NEXT:    s_mov_b64 s[12:13], s[2:3]
1019; SI-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
1020; SI-NEXT:    v_mov_b32_e32 v2, 0
1021; SI-NEXT:    buffer_load_dword v3, v[1:2], s[12:15], 0 addr64
1022; SI-NEXT:    s_mov_b64 s[6:7], s[14:15]
1023; SI-NEXT:    v_lshlrev_b32_e32 v1, 1, v0
1024; SI-NEXT:    buffer_load_ushort v0, v[1:2], s[4:7], 0 addr64
1025; SI-NEXT:    s_mov_b32 s8, s0
1026; SI-NEXT:    s_brev_b32 s0, -2
1027; SI-NEXT:    s_mov_b32 s10, -1
1028; SI-NEXT:    s_mov_b32 s9, s1
1029; SI-NEXT:    s_waitcnt vmcnt(0)
1030; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
1031; SI-NEXT:    v_bfi_b32 v0, s0, v3, v0
1032; SI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
1033; SI-NEXT:    s_endpgm
1034;
1035; VI-LABEL: v_copysign_out_f32_mag_f32_sign_f16:
1036; VI:       ; %bb.0:
1037; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1038; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
1039; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
1040; VI-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
1041; VI-NEXT:    s_waitcnt lgkmcnt(0)
1042; VI-NEXT:    v_mov_b32_e32 v3, s3
1043; VI-NEXT:    v_mov_b32_e32 v1, s5
1044; VI-NEXT:    v_add_u32_e32 v0, vcc, s4, v0
1045; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1046; VI-NEXT:    flat_load_ushort v4, v[0:1]
1047; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
1048; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v3, vcc
1049; VI-NEXT:    flat_load_dword v2, v[0:1]
1050; VI-NEXT:    v_mov_b32_e32 v0, s0
1051; VI-NEXT:    s_brev_b32 s0, -2
1052; VI-NEXT:    v_mov_b32_e32 v1, s1
1053; VI-NEXT:    s_waitcnt vmcnt(1)
1054; VI-NEXT:    v_lshlrev_b32_e32 v3, 16, v4
1055; VI-NEXT:    s_waitcnt vmcnt(0)
1056; VI-NEXT:    v_bfi_b32 v2, s0, v2, v3
1057; VI-NEXT:    flat_store_dword v[0:1], v2
1058; VI-NEXT:    s_endpgm
1059;
1060; GFX9-LABEL: v_copysign_out_f32_mag_f32_sign_f16:
1061; GFX9:       ; %bb.0:
1062; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
1063; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1064; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 1, v0
1065; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1066; GFX9-NEXT:    v_mov_b32_e32 v2, 0
1067; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1068; GFX9-NEXT:    global_load_ushort v1, v1, s[6:7]
1069; GFX9-NEXT:    s_waitcnt vmcnt(0)
1070; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
1071; GFX9-NEXT:    global_load_dword v0, v0, s[2:3]
1072; GFX9-NEXT:    s_brev_b32 s2, -2
1073; GFX9-NEXT:    s_waitcnt vmcnt(0)
1074; GFX9-NEXT:    v_bfi_b32 v0, s2, v0, v1
1075; GFX9-NEXT:    global_store_dword v2, v0, s[0:1]
1076; GFX9-NEXT:    s_endpgm
1077;
1078; GFX11-LABEL: v_copysign_out_f32_mag_f32_sign_f16:
1079; GFX11:       ; %bb.0:
1080; GFX11-NEXT:    s_clause 0x1
1081; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x34
1082; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
1083; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
1084; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1085; GFX11-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v1, 1, v0
1086; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1087; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1088; GFX11-NEXT:    global_load_u16 v1, v1, s[6:7]
1089; GFX11-NEXT:    global_load_b32 v0, v0, s[2:3]
1090; GFX11-NEXT:    s_waitcnt vmcnt(1)
1091; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
1092; GFX11-NEXT:    s_waitcnt vmcnt(0)
1093; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1094; GFX11-NEXT:    v_bfi_b32 v0, 0x7fffffff, v0, v1
1095; GFX11-NEXT:    global_store_b32 v2, v0, s[0:1]
1096; GFX11-NEXT:    s_endpgm
1097  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1098  %arg_mag_gep = getelementptr float, ptr addrspace(1) %arg_mag, i32 %tid
1099  %mag = load float, ptr addrspace(1) %arg_mag_gep
1100  %arg_sign_gep = getelementptr half, ptr addrspace(1) %arg_sign, i32 %tid
1101  %sign = load half, ptr addrspace(1) %arg_sign_gep
1102  %sign.ext = fpext half %sign to float
1103  %out = call float @llvm.copysign.f32(float %mag, float %sign.ext)
1104  store float %out, ptr addrspace(1) %arg_out
1105  ret void
1106}
1107
; Kernel: f64 copysign whose sign comes from a half value.
; Each workitem loads a double magnitude and a half sign at index %tid, extends
; the sign to double, and calls llvm.copysign.f64. The result is stored to
; %arg_out without a workitem offset.
; The assertions expect no f64 conversion of the sign: tahiti converts it to
; f32, the other targets shift the half bits left by 16, and a single v_bfi_b32
; updates the upper dword (v1) of the magnitude pair.
1108define amdgpu_kernel void @v_copysign_out_f64_mag_f64_sign_f16(ptr addrspace(1) %arg_out, ptr addrspace(1) %arg_mag, ptr addrspace(1) %arg_sign) {
1109; SI-LABEL: v_copysign_out_f64_mag_f64_sign_f16:
1110; SI:       ; %bb.0:
1111; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
1112; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
1113; SI-NEXT:    s_mov_b32 s11, 0xf000
1114; SI-NEXT:    s_mov_b32 s14, 0
1115; SI-NEXT:    s_mov_b32 s15, s11
1116; SI-NEXT:    v_mov_b32_e32 v1, 0
1117; SI-NEXT:    s_mov_b64 s[6:7], s[14:15]
1118; SI-NEXT:    v_lshlrev_b32_e32 v2, 1, v0
1119; SI-NEXT:    v_mov_b32_e32 v3, v1
1120; SI-NEXT:    s_waitcnt lgkmcnt(0)
1121; SI-NEXT:    buffer_load_ushort v2, v[2:3], s[4:7], 0 addr64
1122; SI-NEXT:    s_mov_b64 s[12:13], s[2:3]
1123; SI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
1124; SI-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[12:15], 0 addr64
1125; SI-NEXT:    s_mov_b32 s8, s0
1126; SI-NEXT:    s_brev_b32 s0, -2
1127; SI-NEXT:    s_mov_b32 s10, -1
1128; SI-NEXT:    s_mov_b32 s9, s1
1129; SI-NEXT:    s_waitcnt vmcnt(1)
1130; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
1131; SI-NEXT:    s_waitcnt vmcnt(0)
1132; SI-NEXT:    v_bfi_b32 v1, s0, v1, v2
1133; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[8:11], 0
1134; SI-NEXT:    s_endpgm
1135;
1136; VI-LABEL: v_copysign_out_f64_mag_f64_sign_f16:
1137; VI:       ; %bb.0:
1138; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1139; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
1140; VI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
1141; VI-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
1142; VI-NEXT:    s_waitcnt lgkmcnt(0)
1143; VI-NEXT:    v_mov_b32_e32 v3, s3
1144; VI-NEXT:    v_mov_b32_e32 v1, s5
1145; VI-NEXT:    v_add_u32_e32 v0, vcc, s4, v0
1146; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1147; VI-NEXT:    flat_load_ushort v4, v[0:1]
1148; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
1149; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v3, vcc
1150; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
1151; VI-NEXT:    v_mov_b32_e32 v2, s0
1152; VI-NEXT:    s_brev_b32 s0, -2
1153; VI-NEXT:    v_mov_b32_e32 v3, s1
1154; VI-NEXT:    s_waitcnt vmcnt(1)
1155; VI-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
1156; VI-NEXT:    s_waitcnt vmcnt(0)
1157; VI-NEXT:    v_bfi_b32 v1, s0, v1, v4
1158; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
1159; VI-NEXT:    s_endpgm
1160;
1161; GFX9-LABEL: v_copysign_out_f64_mag_f64_sign_f16:
1162; GFX9:       ; %bb.0:
1163; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
1164; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1165; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 1, v0
1166; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
1167; GFX9-NEXT:    v_mov_b32_e32 v3, 0
1168; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1169; GFX9-NEXT:    global_load_ushort v2, v1, s[6:7]
1170; GFX9-NEXT:    s_waitcnt vmcnt(0)
1171; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
1172; GFX9-NEXT:    global_load_dwordx2 v[0:1], v0, s[2:3]
1173; GFX9-NEXT:    s_brev_b32 s2, -2
1174; GFX9-NEXT:    s_waitcnt vmcnt(0)
1175; GFX9-NEXT:    v_bfi_b32 v1, s2, v1, v2
1176; GFX9-NEXT:    global_store_dwordx2 v3, v[0:1], s[0:1]
1177; GFX9-NEXT:    s_endpgm
1178;
1179; GFX11-LABEL: v_copysign_out_f64_mag_f64_sign_f16:
1180; GFX11:       ; %bb.0:
1181; GFX11-NEXT:    s_clause 0x1
1182; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x34
1183; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
1184; GFX11-NEXT:    v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v0, 0x3ff, v0
1185; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1186; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 1, v0
1187; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
1188; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1189; GFX11-NEXT:    global_load_u16 v2, v1, s[6:7]
1190; GFX11-NEXT:    global_load_b64 v[0:1], v0, s[2:3]
1191; GFX11-NEXT:    s_waitcnt vmcnt(1)
1192; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
1193; GFX11-NEXT:    s_waitcnt vmcnt(0)
1194; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1195; GFX11-NEXT:    v_bfi_b32 v1, 0x7fffffff, v1, v2
1196; GFX11-NEXT:    global_store_b64 v3, v[0:1], s[0:1]
1197; GFX11-NEXT:    s_endpgm
1198  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1199  %arg_mag_gep = getelementptr double, ptr addrspace(1) %arg_mag, i32 %tid
1200  %mag = load double, ptr addrspace(1) %arg_mag_gep
1201  %arg_sign_gep = getelementptr half, ptr addrspace(1) %arg_sign, i32 %tid
1202  %sign = load half, ptr addrspace(1) %arg_sign_gep
1203  %sign.ext = fpext half %sign to double
1204  %out = call double @llvm.copysign.f64(double %mag, double %sign.ext)
1205  store double %out, ptr addrspace(1) %arg_out
1206  ret void
1207}
1208
; Kernel: f16 copysign whose sign comes from a float value.
; Each workitem loads a half magnitude and a float sign at index %tid,
; truncates the sign to half, and calls llvm.copysign.f16. The result is stored
; to %arg_out without a workitem offset.
; The assertions expect no fptrunc of the sign. Tahiti merges in f32 and
; converts the result back with v_cvt_f16_f32; the other targets shift the
; float bits right by 16 and merge with a 0x7fff mask.
1209define amdgpu_kernel void @v_copysign_out_f16_mag_f16_sign_f32(ptr addrspace(1) %arg_out, ptr addrspace(1) %arg_mag, ptr addrspace(1) %arg_sign) {
1210; SI-LABEL: v_copysign_out_f16_mag_f16_sign_f32:
1211; SI:       ; %bb.0:
1212; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
1213; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
1214; SI-NEXT:    s_mov_b32 s11, 0xf000
1215; SI-NEXT:    s_mov_b32 s14, 0
1216; SI-NEXT:    s_mov_b32 s15, s11
1217; SI-NEXT:    s_waitcnt lgkmcnt(0)
1218; SI-NEXT:    s_mov_b64 s[12:13], s[2:3]
1219; SI-NEXT:    v_lshlrev_b32_e32 v1, 1, v0
1220; SI-NEXT:    v_mov_b32_e32 v2, 0
1221; SI-NEXT:    buffer_load_ushort v3, v[1:2], s[12:15], 0 addr64
1222; SI-NEXT:    s_mov_b64 s[6:7], s[14:15]
1223; SI-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
1224; SI-NEXT:    buffer_load_dword v0, v[1:2], s[4:7], 0 addr64
1225; SI-NEXT:    s_brev_b32 s2, -2
1226; SI-NEXT:    s_mov_b32 s10, -1
1227; SI-NEXT:    s_mov_b32 s8, s0
1228; SI-NEXT:    s_mov_b32 s9, s1
1229; SI-NEXT:    s_waitcnt vmcnt(1)
1230; SI-NEXT:    v_cvt_f32_f16_e32 v1, v3
1231; SI-NEXT:    s_waitcnt vmcnt(0)
1232; SI-NEXT:    v_bfi_b32 v0, s2, v1, v0
1233; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
1234; SI-NEXT:    buffer_store_short v0, off, s[8:11], 0
1235; SI-NEXT:    s_endpgm
1236;
1237; VI-LABEL: v_copysign_out_f16_mag_f16_sign_f32:
1238; VI:       ; %bb.0:
1239; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1240; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
1241; VI-NEXT:    v_lshlrev_b32_e32 v2, 1, v0
1242; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1243; VI-NEXT:    s_waitcnt lgkmcnt(0)
1244; VI-NEXT:    v_mov_b32_e32 v3, s3
1245; VI-NEXT:    v_mov_b32_e32 v1, s5
1246; VI-NEXT:    v_add_u32_e32 v0, vcc, s4, v0
1247; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1248; VI-NEXT:    flat_load_dword v4, v[0:1]
1249; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
1250; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v3, vcc
1251; VI-NEXT:    flat_load_ushort v2, v[0:1]
1252; VI-NEXT:    v_mov_b32_e32 v0, s0
1253; VI-NEXT:    s_movk_i32 s0, 0x7fff
1254; VI-NEXT:    v_mov_b32_e32 v1, s1
1255; VI-NEXT:    s_waitcnt vmcnt(1)
1256; VI-NEXT:    v_lshrrev_b32_e32 v3, 16, v4
1257; VI-NEXT:    s_waitcnt vmcnt(0)
1258; VI-NEXT:    v_bfi_b32 v2, s0, v2, v3
1259; VI-NEXT:    flat_store_short v[0:1], v2
1260; VI-NEXT:    s_endpgm
1261;
1262; GFX9-LABEL: v_copysign_out_f16_mag_f16_sign_f32:
1263; GFX9:       ; %bb.0:
1264; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
1265; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1266; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
1267; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
1268; GFX9-NEXT:    v_mov_b32_e32 v2, 0
1269; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1270; GFX9-NEXT:    global_load_dword v1, v1, s[6:7]
1271; GFX9-NEXT:    s_waitcnt vmcnt(0)
1272; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
1273; GFX9-NEXT:    global_load_ushort v0, v0, s[2:3]
1274; GFX9-NEXT:    s_movk_i32 s2, 0x7fff
1275; GFX9-NEXT:    s_waitcnt vmcnt(0)
1276; GFX9-NEXT:    v_bfi_b32 v0, s2, v0, v1
1277; GFX9-NEXT:    global_store_short v2, v0, s[0:1]
1278; GFX9-NEXT:    s_endpgm
1279;
1280; GFX11-LABEL: v_copysign_out_f16_mag_f16_sign_f32:
1281; GFX11:       ; %bb.0:
1282; GFX11-NEXT:    s_clause 0x1
1283; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x34
1284; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
1285; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
1286; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1287; GFX11-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v1, 2, v0
1288; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
1289; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1290; GFX11-NEXT:    global_load_b32 v1, v1, s[6:7]
1291; GFX11-NEXT:    global_load_u16 v0, v0, s[2:3]
1292; GFX11-NEXT:    s_waitcnt vmcnt(1)
1293; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
1294; GFX11-NEXT:    s_waitcnt vmcnt(0)
1295; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1296; GFX11-NEXT:    v_bfi_b32 v0, 0x7fff, v0, v1
1297; GFX11-NEXT:    global_store_b16 v2, v0, s[0:1]
1298; GFX11-NEXT:    s_endpgm
1299  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1300  %arg_mag_gep = getelementptr half, ptr addrspace(1) %arg_mag, i32 %tid
1301  %mag = load half, ptr addrspace(1) %arg_mag_gep
1302  %arg_sign_gep = getelementptr float, ptr addrspace(1) %arg_sign, i32 %tid
1303  %sign = load float, ptr addrspace(1) %arg_sign_gep
1304  %sign.trunc = fptrunc float %sign to half
1305  %out = call half @llvm.copysign.f16(half %mag, half %sign.trunc)
1306  store half %out, ptr addrspace(1) %arg_out
1307  ret void
1308}
1309
; Kernel: f16 copysign whose sign comes from a double value.
; Each workitem loads a double sign at index %tid, truncates it to half, and
; calls llvm.copysign.f16 with a half magnitude. The result is stored to
; %arg_out without a workitem offset.
; The assertions expect no fptrunc of the sign: only the upper dword of the
; loaded double feeds the v_bfi_b32 (directly on tahiti, shifted right by 16
; on the other targets).
; NOTE(review): %arg_mag_gep is computed but unused; the magnitude is loaded
; from %arg_mag itself, so it is the same element for every workitem. The
; sibling kernels load through the GEP. This looks like a typo, but the
; assertions below were generated from the IR as written (the magnitude load
; has no per-workitem offset), so changing the load requires regenerating them
; with utils/update_llc_test_checks.py. Confirm intent before changing.
1310define amdgpu_kernel void @v_copysign_out_f16_mag_f16_sign_f64(ptr addrspace(1) %arg_out, ptr addrspace(1) %arg_mag, ptr addrspace(1) %arg_sign) {
1311; SI-LABEL: v_copysign_out_f16_mag_f16_sign_f64:
1312; SI:       ; %bb.0:
1313; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
1314; SI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
1315; SI-NEXT:    s_mov_b32 s7, 0xf000
1316; SI-NEXT:    s_mov_b32 s6, -1
1317; SI-NEXT:    s_mov_b32 s14, s6
1318; SI-NEXT:    s_waitcnt lgkmcnt(0)
1319; SI-NEXT:    s_mov_b32 s12, s2
1320; SI-NEXT:    s_mov_b32 s13, s3
1321; SI-NEXT:    s_mov_b32 s15, s7
1322; SI-NEXT:    buffer_load_ushort v2, off, s[12:15], 0
1323; SI-NEXT:    s_mov_b32 s10, 0
1324; SI-NEXT:    s_mov_b32 s11, s7
1325; SI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
1326; SI-NEXT:    v_mov_b32_e32 v1, 0
1327; SI-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[8:11], 0 addr64
1328; SI-NEXT:    s_brev_b32 s2, -2
1329; SI-NEXT:    s_mov_b32 s4, s0
1330; SI-NEXT:    s_mov_b32 s5, s1
1331; SI-NEXT:    s_waitcnt vmcnt(0)
1332; SI-NEXT:    v_cvt_f32_f16_e32 v0, v2
1333; SI-NEXT:    v_bfi_b32 v0, s2, v0, v1
1334; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
1335; SI-NEXT:    buffer_store_short v0, off, s[4:7], 0
1336; SI-NEXT:    s_endpgm
1337;
1338; VI-LABEL: v_copysign_out_f16_mag_f16_sign_f64:
1339; VI:       ; %bb.0:
1340; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1341; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
1342; VI-NEXT:    v_lshlrev_b32_e32 v1, 3, v0
1343; VI-NEXT:    s_waitcnt lgkmcnt(0)
1344; VI-NEXT:    v_mov_b32_e32 v0, s2
1345; VI-NEXT:    v_mov_b32_e32 v2, s5
1346; VI-NEXT:    v_add_u32_e32 v1, vcc, s4, v1
1347; VI-NEXT:    v_addc_u32_e32 v2, vcc, 0, v2, vcc
1348; VI-NEXT:    flat_load_dwordx2 v[1:2], v[1:2]
1349; VI-NEXT:    s_waitcnt vmcnt(0)
1350; VI-NEXT:    v_mov_b32_e32 v1, s3
1351; VI-NEXT:    flat_load_ushort v3, v[0:1]
1352; VI-NEXT:    v_mov_b32_e32 v0, s0
1353; VI-NEXT:    s_movk_i32 s0, 0x7fff
1354; VI-NEXT:    v_mov_b32_e32 v1, s1
1355; VI-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
1356; VI-NEXT:    s_waitcnt vmcnt(0)
1357; VI-NEXT:    v_bfi_b32 v2, s0, v3, v2
1358; VI-NEXT:    flat_store_short v[0:1], v2
1359; VI-NEXT:    s_endpgm
1360;
1361; GFX9-LABEL: v_copysign_out_f16_mag_f16_sign_f64:
1362; GFX9:       ; %bb.0:
1363; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
1364; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1365; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
1366; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1367; GFX9-NEXT:    global_load_dwordx2 v[0:1], v0, s[6:7]
1368; GFX9-NEXT:    s_waitcnt vmcnt(0)
1369; GFX9-NEXT:    v_mov_b32_e32 v0, 0
1370; GFX9-NEXT:    global_load_ushort v2, v0, s[2:3]
1371; GFX9-NEXT:    s_movk_i32 s2, 0x7fff
1372; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
1373; GFX9-NEXT:    s_waitcnt vmcnt(0)
1374; GFX9-NEXT:    v_bfi_b32 v1, s2, v2, v1
1375; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
1376; GFX9-NEXT:    s_endpgm
1377;
1378; GFX11-LABEL: v_copysign_out_f16_mag_f16_sign_f64:
1379; GFX11:       ; %bb.0:
1380; GFX11-NEXT:    s_clause 0x1
1381; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x34
1382; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
1383; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
1384; GFX11-NEXT:    v_mov_b32_e32 v2, 0
1385; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
1386; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
1387; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1388; GFX11-NEXT:    global_load_b64 v[0:1], v0, s[6:7]
1389; GFX11-NEXT:    global_load_u16 v0, v2, s[2:3]
1390; GFX11-NEXT:    s_waitcnt vmcnt(1)
1391; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
1392; GFX11-NEXT:    s_waitcnt vmcnt(0)
1393; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1394; GFX11-NEXT:    v_bfi_b32 v0, 0x7fff, v0, v1
1395; GFX11-NEXT:    global_store_b16 v2, v0, s[0:1]
1396; GFX11-NEXT:    s_endpgm
1397  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1398  %arg_mag_gep = getelementptr half, ptr addrspace(1) %arg_mag, i32 %tid
1399  %mag = load half, ptr addrspace(1) %arg_mag
1400  %arg_sign_gep = getelementptr double, ptr addrspace(1) %arg_sign, i32 %tid
1401  %sign = load double, ptr addrspace(1) %arg_sign_gep
1402  %sign.trunc = fptrunc double %sign to half
1403  %out = call half @llvm.copysign.f16(half %mag, half %sign.trunc)
1404  store half %out, ptr addrspace(1) %arg_out
1405  ret void
1406}
1407
; Kernel: f16 copysign whose magnitude comes from a float value.
; Each workitem loads a float magnitude and a half sign at index %tid,
; truncates the magnitude to half, and calls llvm.copysign.f16. The result is
; stored to %arg_out without a workitem offset.
; The assertions expect the magnitude truncation (v_cvt_f16_f32) to remain on
; every target. Tonga, gfx900 and gfx1100 then merge with a 0x7fff mask;
; tahiti extends both operands to f32, merges, and converts back to f16.
1408define amdgpu_kernel void @v_copysign_out_f16_mag_f32_sign_f16(ptr addrspace(1) %arg_out, ptr addrspace(1) %arg_mag, ptr addrspace(1) %arg_sign) {
1409; SI-LABEL: v_copysign_out_f16_mag_f32_sign_f16:
1410; SI:       ; %bb.0:
1411; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
1412; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
1413; SI-NEXT:    s_mov_b32 s11, 0xf000
1414; SI-NEXT:    s_mov_b32 s14, 0
1415; SI-NEXT:    s_mov_b32 s15, s11
1416; SI-NEXT:    s_waitcnt lgkmcnt(0)
1417; SI-NEXT:    s_mov_b64 s[12:13], s[2:3]
1418; SI-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
1419; SI-NEXT:    v_mov_b32_e32 v2, 0
1420; SI-NEXT:    buffer_load_dword v3, v[1:2], s[12:15], 0 addr64
1421; SI-NEXT:    s_mov_b64 s[6:7], s[14:15]
1422; SI-NEXT:    v_lshlrev_b32_e32 v1, 1, v0
1423; SI-NEXT:    buffer_load_ushort v0, v[1:2], s[4:7], 0 addr64
1424; SI-NEXT:    s_brev_b32 s2, -2
1425; SI-NEXT:    s_mov_b32 s10, -1
1426; SI-NEXT:    s_mov_b32 s8, s0
1427; SI-NEXT:    s_mov_b32 s9, s1
1428; SI-NEXT:    s_waitcnt vmcnt(1)
1429; SI-NEXT:    v_cvt_f16_f32_e32 v1, v3
1430; SI-NEXT:    s_waitcnt vmcnt(0)
1431; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
1432; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
1433; SI-NEXT:    v_bfi_b32 v0, s2, v1, v0
1434; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
1435; SI-NEXT:    buffer_store_short v0, off, s[8:11], 0
1436; SI-NEXT:    s_endpgm
1437;
1438; VI-LABEL: v_copysign_out_f16_mag_f32_sign_f16:
1439; VI:       ; %bb.0:
1440; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1441; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
1442; VI-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
1443; VI-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
1444; VI-NEXT:    s_waitcnt lgkmcnt(0)
1445; VI-NEXT:    v_mov_b32_e32 v2, s3
1446; VI-NEXT:    v_add_u32_e32 v1, vcc, s2, v1
1447; VI-NEXT:    v_addc_u32_e32 v2, vcc, 0, v2, vcc
1448; VI-NEXT:    flat_load_dword v2, v[1:2]
1449; VI-NEXT:    v_mov_b32_e32 v1, s5
1450; VI-NEXT:    v_add_u32_e32 v0, vcc, s4, v0
1451; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1452; VI-NEXT:    flat_load_ushort v3, v[0:1]
1453; VI-NEXT:    v_mov_b32_e32 v0, s0
1454; VI-NEXT:    s_movk_i32 s0, 0x7fff
1455; VI-NEXT:    v_mov_b32_e32 v1, s1
1456; VI-NEXT:    s_waitcnt vmcnt(1)
1457; VI-NEXT:    v_cvt_f16_f32_e32 v2, v2
1458; VI-NEXT:    s_waitcnt vmcnt(0)
1459; VI-NEXT:    v_bfi_b32 v2, s0, v2, v3
1460; VI-NEXT:    flat_store_short v[0:1], v2
1461; VI-NEXT:    s_endpgm
1462;
1463; GFX9-LABEL: v_copysign_out_f16_mag_f32_sign_f16:
1464; GFX9:       ; %bb.0:
1465; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1466; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
1467; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
1468; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
1469; GFX9-NEXT:    v_mov_b32_e32 v2, 0
1470; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1471; GFX9-NEXT:    global_load_dword v1, v1, s[2:3]
1472; GFX9-NEXT:    s_movk_i32 s2, 0x7fff
1473; GFX9-NEXT:    global_load_ushort v0, v0, s[6:7]
1474; GFX9-NEXT:    s_waitcnt vmcnt(1)
1475; GFX9-NEXT:    v_cvt_f16_f32_e32 v1, v1
1476; GFX9-NEXT:    s_waitcnt vmcnt(0)
1477; GFX9-NEXT:    v_bfi_b32 v0, s2, v1, v0
1478; GFX9-NEXT:    global_store_short v2, v0, s[0:1]
1479; GFX9-NEXT:    s_endpgm
1480;
1481; GFX11-LABEL: v_copysign_out_f16_mag_f32_sign_f16:
1482; GFX11:       ; %bb.0:
1483; GFX11-NEXT:    s_clause 0x1
1484; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
1485; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
1486; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
1487; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1488; GFX11-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v1, 2, v0
1489; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
1490; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1491; GFX11-NEXT:    global_load_b32 v1, v1, s[2:3]
1492; GFX11-NEXT:    global_load_u16 v0, v0, s[4:5]
1493; GFX11-NEXT:    s_waitcnt vmcnt(1)
1494; GFX11-NEXT:    v_cvt_f16_f32_e32 v1, v1
1495; GFX11-NEXT:    s_waitcnt vmcnt(0)
1496; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1497; GFX11-NEXT:    v_bfi_b32 v0, 0x7fff, v1, v0
1498; GFX11-NEXT:    global_store_b16 v2, v0, s[0:1]
1499; GFX11-NEXT:    s_endpgm
1500  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1501  %arg_mag_gep = getelementptr float, ptr addrspace(1) %arg_mag, i32 %tid
1502  %mag = load float, ptr addrspace(1) %arg_mag_gep
1503  %mag.trunc = fptrunc float %mag to half
1504  %arg_sign_gep = getelementptr half, ptr addrspace(1) %arg_sign, i32 %tid
1505  %sign = load half, ptr addrspace(1) %arg_sign_gep
1506  %out = call half @llvm.copysign.f16(half %mag.trunc, half %sign)
1507  store half %out, ptr addrspace(1) %arg_out
1508  ret void
1509}
1510
; Uniform (kernel-argument) copysign to half where the magnitude arrives as a
; double and is narrowed with fptrunc before the llvm.copysign.f16 call; the
; sign operand is already a half. The half result is stored to %arg_out.
;
; The assertion lines below are autogenerated (see the NOTE at the top of the
; file); regenerate them with utils/update_llc_test_checks.py instead of
; editing them by hand.
;
; In the expected code for every run line the final merge is a v_bfi_b32. The
; tahiti run performs it on f32 values with mask 0x7fffffff (s_brev_b32 of -2)
; and converts back with v_cvt_f16_f32; the other runs use mask 0x7fff on the
; 16-bit pattern directly.
1511define amdgpu_kernel void @s_copysign_out_f16_mag_f64_sign_f16(ptr addrspace(1) %arg_out, double %mag, half %sign) {
1512; SI-LABEL: s_copysign_out_f16_mag_f64_sign_f16:
1513; SI:       ; %bb.0:
1514; SI-NEXT:    s_load_dword s6, s[4:5], 0xd
1515; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
1516; SI-NEXT:    s_waitcnt lgkmcnt(0)
1517; SI-NEXT:    v_cvt_f32_f16_e32 v0, s6
1518; SI-NEXT:    s_lshr_b32 s4, s3, 8
1519; SI-NEXT:    s_and_b32 s5, s3, 0x1ff
1520; SI-NEXT:    s_and_b32 s6, s4, 0xffe
1521; SI-NEXT:    s_or_b32 s2, s5, s2
1522; SI-NEXT:    s_cmp_lg_u32 s2, 0
1523; SI-NEXT:    s_cselect_b64 s[4:5], -1, 0
1524; SI-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[4:5]
1525; SI-NEXT:    v_readfirstlane_b32 s2, v1
1526; SI-NEXT:    s_bfe_u32 s5, s3, 0xb0014
1527; SI-NEXT:    s_or_b32 s2, s6, s2
1528; SI-NEXT:    s_sub_i32 s6, 0x3f1, s5
1529; SI-NEXT:    v_med3_i32 v1, s6, 0, 13
1530; SI-NEXT:    s_or_b32 s4, s2, 0x1000
1531; SI-NEXT:    v_readfirstlane_b32 s6, v1
1532; SI-NEXT:    s_lshr_b32 s6, s4, s6
1533; SI-NEXT:    v_lshl_b32_e32 v1, s6, v1
1534; SI-NEXT:    v_cmp_ne_u32_e32 vcc, s4, v1
1535; SI-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
1536; SI-NEXT:    s_add_i32 s8, s5, 0xfffffc10
1537; SI-NEXT:    v_readfirstlane_b32 s4, v1
1538; SI-NEXT:    s_lshl_b32 s5, s8, 12
1539; SI-NEXT:    s_or_b32 s4, s6, s4
1540; SI-NEXT:    s_or_b32 s5, s2, s5
1541; SI-NEXT:    s_cmp_lt_i32 s8, 1
1542; SI-NEXT:    s_cselect_b32 s9, s4, s5
1543; SI-NEXT:    s_and_b32 s6, s9, 7
1544; SI-NEXT:    s_cmp_gt_i32 s6, 5
1545; SI-NEXT:    s_cselect_b64 s[4:5], -1, 0
1546; SI-NEXT:    s_cmp_eq_u32 s6, 3
1547; SI-NEXT:    s_cselect_b64 s[6:7], -1, 0
1548; SI-NEXT:    s_or_b64 s[4:5], s[6:7], s[4:5]
1549; SI-NEXT:    s_lshr_b32 s6, s9, 2
1550; SI-NEXT:    s_or_b32 s4, s4, s5
1551; SI-NEXT:    s_cmp_lg_u32 s4, 0
1552; SI-NEXT:    s_addc_u32 s4, s6, 0
1553; SI-NEXT:    s_cmp_lt_i32 s8, 31
1554; SI-NEXT:    s_cselect_b32 s6, s4, 0x7c00
1555; SI-NEXT:    s_cmp_lg_u32 s2, 0
1556; SI-NEXT:    s_cselect_b64 s[4:5], -1, 0
1557; SI-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[4:5]
1558; SI-NEXT:    v_lshlrev_b32_e32 v1, 9, v1
1559; SI-NEXT:    s_cmpk_eq_i32 s8, 0x40f
1560; SI-NEXT:    v_or_b32_e32 v1, 0x7c00, v1
1561; SI-NEXT:    v_mov_b32_e32 v2, s6
1562; SI-NEXT:    s_cselect_b64 vcc, -1, 0
1563; SI-NEXT:    s_lshr_b32 s2, s3, 16
1564; SI-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
1565; SI-NEXT:    s_and_b32 s2, s2, 0x8000
1566; SI-NEXT:    v_or_b32_e32 v1, s2, v1
1567; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
1568; SI-NEXT:    s_brev_b32 s2, -2
1569; SI-NEXT:    s_mov_b32 s3, 0xf000
1570; SI-NEXT:    v_bfi_b32 v0, s2, v1, v0
1571; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
1572; SI-NEXT:    s_mov_b32 s2, -1
1573; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
1574; SI-NEXT:    s_endpgm
1575;
1576; VI-LABEL: s_copysign_out_f16_mag_f64_sign_f16:
1577; VI:       ; %bb.0:
1578; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1579; VI-NEXT:    s_load_dword s4, s[4:5], 0x34
1580; VI-NEXT:    s_waitcnt lgkmcnt(0)
1581; VI-NEXT:    v_mov_b32_e32 v0, s0
1582; VI-NEXT:    v_mov_b32_e32 v1, s1
1583; VI-NEXT:    s_lshr_b32 s0, s3, 8
1584; VI-NEXT:    s_and_b32 s1, s3, 0x1ff
1585; VI-NEXT:    s_and_b32 s5, s0, 0xffe
1586; VI-NEXT:    s_or_b32 s0, s1, s2
1587; VI-NEXT:    s_cmp_lg_u32 s0, 0
1588; VI-NEXT:    s_cselect_b64 s[0:1], -1, 0
1589; VI-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[0:1]
1590; VI-NEXT:    s_bfe_u32 s1, s3, 0xb0014
1591; VI-NEXT:    v_readfirstlane_b32 s0, v2
1592; VI-NEXT:    s_sub_i32 s2, 0x3f1, s1
1593; VI-NEXT:    s_or_b32 s5, s5, s0
1594; VI-NEXT:    v_med3_i32 v2, s2, 0, 13
1595; VI-NEXT:    s_or_b32 s0, s5, 0x1000
1596; VI-NEXT:    v_readfirstlane_b32 s2, v2
1597; VI-NEXT:    s_lshr_b32 s2, s0, s2
1598; VI-NEXT:    v_lshlrev_b32_e64 v2, v2, s2
1599; VI-NEXT:    v_cmp_ne_u32_e32 vcc, s0, v2
1600; VI-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
1601; VI-NEXT:    s_add_i32 s6, s1, 0xfffffc10
1602; VI-NEXT:    v_readfirstlane_b32 s0, v2
1603; VI-NEXT:    s_lshl_b32 s1, s6, 12
1604; VI-NEXT:    s_or_b32 s0, s2, s0
1605; VI-NEXT:    s_or_b32 s1, s5, s1
1606; VI-NEXT:    s_cmp_lt_i32 s6, 1
1607; VI-NEXT:    s_cselect_b32 s7, s0, s1
1608; VI-NEXT:    s_and_b32 s2, s7, 7
1609; VI-NEXT:    s_cmp_gt_i32 s2, 5
1610; VI-NEXT:    s_cselect_b64 s[0:1], -1, 0
1611; VI-NEXT:    s_cmp_eq_u32 s2, 3
1612; VI-NEXT:    s_cselect_b64 s[2:3], -1, 0
1613; VI-NEXT:    s_or_b64 s[0:1], s[2:3], s[0:1]
1614; VI-NEXT:    s_lshr_b32 s2, s7, 2
1615; VI-NEXT:    s_cmp_lg_u64 s[0:1], 0
1616; VI-NEXT:    s_addc_u32 s0, s2, 0
1617; VI-NEXT:    s_cmp_lt_i32 s6, 31
1618; VI-NEXT:    s_cselect_b32 s2, s0, 0x7c00
1619; VI-NEXT:    s_cmp_lg_u32 s5, 0
1620; VI-NEXT:    s_cselect_b64 s[0:1], -1, 0
1621; VI-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[0:1]
1622; VI-NEXT:    v_lshlrev_b32_e32 v2, 9, v2
1623; VI-NEXT:    s_cmpk_eq_i32 s6, 0x40f
1624; VI-NEXT:    v_or_b32_e32 v2, 0x7c00, v2
1625; VI-NEXT:    v_mov_b32_e32 v3, s2
1626; VI-NEXT:    s_cselect_b64 vcc, -1, 0
1627; VI-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
1628; VI-NEXT:    s_movk_i32 s0, 0x7fff
1629; VI-NEXT:    v_mov_b32_e32 v3, s4
1630; VI-NEXT:    v_bfi_b32 v2, s0, v2, v3
1631; VI-NEXT:    flat_store_short v[0:1], v2
1632; VI-NEXT:    s_endpgm
1633;
1634; GFX9-LABEL: s_copysign_out_f16_mag_f64_sign_f16:
1635; GFX9:       ; %bb.0:
1636; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1637; GFX9-NEXT:    s_load_dword s6, s[4:5], 0x34
1638; GFX9-NEXT:    v_mov_b32_e32 v0, 0
1639; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1640; GFX9-NEXT:    s_lshr_b32 s4, s3, 8
1641; GFX9-NEXT:    s_and_b32 s5, s3, 0x1ff
1642; GFX9-NEXT:    s_and_b32 s7, s4, 0xffe
1643; GFX9-NEXT:    s_or_b32 s2, s5, s2
1644; GFX9-NEXT:    s_cmp_lg_u32 s2, 0
1645; GFX9-NEXT:    s_cselect_b64 s[4:5], -1, 0
1646; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[4:5]
1647; GFX9-NEXT:    s_bfe_u32 s3, s3, 0xb0014
1648; GFX9-NEXT:    v_readfirstlane_b32 s2, v1
1649; GFX9-NEXT:    s_sub_i32 s4, 0x3f1, s3
1650; GFX9-NEXT:    s_or_b32 s7, s7, s2
1651; GFX9-NEXT:    v_med3_i32 v1, s4, 0, 13
1652; GFX9-NEXT:    s_or_b32 s2, s7, 0x1000
1653; GFX9-NEXT:    v_readfirstlane_b32 s4, v1
1654; GFX9-NEXT:    s_lshr_b32 s4, s2, s4
1655; GFX9-NEXT:    v_lshlrev_b32_e64 v1, v1, s4
1656; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, s2, v1
1657; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
1658; GFX9-NEXT:    s_add_i32 s8, s3, 0xfffffc10
1659; GFX9-NEXT:    v_readfirstlane_b32 s2, v1
1660; GFX9-NEXT:    s_lshl_b32 s3, s8, 12
1661; GFX9-NEXT:    s_or_b32 s2, s4, s2
1662; GFX9-NEXT:    s_or_b32 s3, s7, s3
1663; GFX9-NEXT:    s_cmp_lt_i32 s8, 1
1664; GFX9-NEXT:    s_cselect_b32 s9, s2, s3
1665; GFX9-NEXT:    s_and_b32 s4, s9, 7
1666; GFX9-NEXT:    s_cmp_gt_i32 s4, 5
1667; GFX9-NEXT:    s_cselect_b64 s[2:3], -1, 0
1668; GFX9-NEXT:    s_cmp_eq_u32 s4, 3
1669; GFX9-NEXT:    s_cselect_b64 s[4:5], -1, 0
1670; GFX9-NEXT:    s_or_b64 s[2:3], s[4:5], s[2:3]
1671; GFX9-NEXT:    s_lshr_b32 s4, s9, 2
1672; GFX9-NEXT:    s_cmp_lg_u64 s[2:3], 0
1673; GFX9-NEXT:    s_addc_u32 s2, s4, 0
1674; GFX9-NEXT:    s_cmp_lt_i32 s8, 31
1675; GFX9-NEXT:    s_cselect_b32 s4, s2, 0x7c00
1676; GFX9-NEXT:    s_cmp_lg_u32 s7, 0
1677; GFX9-NEXT:    s_cselect_b64 s[2:3], -1, 0
1678; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[2:3]
1679; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 9, v1
1680; GFX9-NEXT:    s_cmpk_eq_i32 s8, 0x40f
1681; GFX9-NEXT:    v_or_b32_e32 v1, 0x7c00, v1
1682; GFX9-NEXT:    v_mov_b32_e32 v2, s4
1683; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
1684; GFX9-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
1685; GFX9-NEXT:    s_movk_i32 s2, 0x7fff
1686; GFX9-NEXT:    v_mov_b32_e32 v2, s6
1687; GFX9-NEXT:    v_bfi_b32 v1, s2, v1, v2
1688; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
1689; GFX9-NEXT:    s_endpgm
1690;
1691; GFX11-LABEL: s_copysign_out_f16_mag_f64_sign_f16:
1692; GFX11:       ; %bb.0:
1693; GFX11-NEXT:    s_clause 0x1
1694; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
1695; GFX11-NEXT:    s_load_b32 s4, s[4:5], 0x34
1696; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1697; GFX11-NEXT:    s_and_b32 s5, s3, 0x1ff
1698; GFX11-NEXT:    s_lshr_b32 s6, s3, 8
1699; GFX11-NEXT:    s_or_b32 s2, s5, s2
1700; GFX11-NEXT:    s_and_b32 s5, s6, 0xffe
1701; GFX11-NEXT:    s_cmp_lg_u32 s2, 0
1702; GFX11-NEXT:    s_cselect_b32 s2, -1, 0
1703; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
1704; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s2
1705; GFX11-NEXT:    s_bfe_u32 s2, s3, 0xb0014
1706; GFX11-NEXT:    s_sub_i32 s3, 0x3f1, s2
1707; GFX11-NEXT:    s_addk_i32 s2, 0xfc10
1708; GFX11-NEXT:    v_med3_i32 v1, s3, 0, 13
1709; GFX11-NEXT:    v_readfirstlane_b32 s3, v0
1710; GFX11-NEXT:    s_lshl_b32 s7, s2, 12
1711; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
1712; GFX11-NEXT:    v_readfirstlane_b32 s6, v1
1713; GFX11-NEXT:    s_or_b32 s3, s5, s3
1714; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
1715; GFX11-NEXT:    s_or_b32 s5, s3, 0x1000
1716; GFX11-NEXT:    s_or_b32 s7, s3, s7
1717; GFX11-NEXT:    s_lshr_b32 s6, s5, s6
1718; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
1719; GFX11-NEXT:    v_lshlrev_b32_e64 v0, v1, s6
1720; GFX11-NEXT:    v_mov_b32_e32 v1, 0
1721; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, s5, v0
1722; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
1723; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1724; GFX11-NEXT:    v_readfirstlane_b32 s5, v0
1725; GFX11-NEXT:    s_or_b32 s5, s6, s5
1726; GFX11-NEXT:    s_cmp_lt_i32 s2, 1
1727; GFX11-NEXT:    s_cselect_b32 s5, s5, s7
1728; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
1729; GFX11-NEXT:    s_and_b32 s6, s5, 7
1730; GFX11-NEXT:    s_cmp_gt_i32 s6, 5
1731; GFX11-NEXT:    s_cselect_b32 s7, -1, 0
1732; GFX11-NEXT:    s_cmp_eq_u32 s6, 3
1733; GFX11-NEXT:    s_cselect_b32 s6, -1, 0
1734; GFX11-NEXT:    s_lshr_b32 s5, s5, 2
1735; GFX11-NEXT:    s_or_b32 s6, s6, s7
1736; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
1737; GFX11-NEXT:    s_cmp_lg_u32 s6, 0
1738; GFX11-NEXT:    s_addc_u32 s5, s5, 0
1739; GFX11-NEXT:    s_cmp_lt_i32 s2, 31
1740; GFX11-NEXT:    s_cselect_b32 s5, s5, 0x7c00
1741; GFX11-NEXT:    s_cmp_lg_u32 s3, 0
1742; GFX11-NEXT:    s_cselect_b32 s3, -1, 0
1743; GFX11-NEXT:    s_cmpk_eq_i32 s2, 0x40f
1744; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s3
1745; GFX11-NEXT:    s_cselect_b32 vcc_lo, -1, 0
1746; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1747; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 9, v0
1748; GFX11-NEXT:    v_or_b32_e32 v0, 0x7c00, v0
1749; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1750; GFX11-NEXT:    v_cndmask_b32_e32 v0, s5, v0, vcc_lo
1751; GFX11-NEXT:    v_bfi_b32 v0, 0x7fff, v0, s4
1752; GFX11-NEXT:    global_store_b16 v1, v0, s[0:1]
1753; GFX11-NEXT:    s_endpgm
; IR under test: narrow the double magnitude to half, then combine it with the
; sign of %sign and store the half result.
1754  %mag.trunc = fptrunc double %mag to half
1755  %result = call half @llvm.copysign.f16(half %mag.trunc, half %sign)
1756  store half %result, ptr addrspace(1) %arg_out
1757  ret void
1758}
1759
; Uniform copysign on <2 x half> with both operands passed as kernel arguments.
; The expected code applies one v_bfi_b32 per 16-bit element and packs the two
; results into a single 32-bit store. The tahiti run first widens every element
; to f32 (v_cvt_f32_f16), merges with mask 0x7fffffff, and narrows the result
; back; the other runs merge with mask 0x7fff.
; The assertion lines are autogenerated; regenerate rather than hand-edit.
1760define amdgpu_kernel void @s_copysign_v2f16(ptr addrspace(1) %arg_out, <2 x half> %arg_mag, <2 x half> %arg_sign) {
1761; SI-LABEL: s_copysign_v2f16:
1762; SI:       ; %bb.0:
1763; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
1764; SI-NEXT:    s_mov_b32 s7, 0xf000
1765; SI-NEXT:    s_mov_b32 s6, -1
1766; SI-NEXT:    s_waitcnt lgkmcnt(0)
1767; SI-NEXT:    s_lshr_b32 s4, s2, 16
1768; SI-NEXT:    s_lshr_b32 s5, s3, 16
1769; SI-NEXT:    v_cvt_f32_f16_e32 v0, s4
1770; SI-NEXT:    v_cvt_f32_f16_e32 v1, s5
1771; SI-NEXT:    v_cvt_f32_f16_e32 v2, s2
1772; SI-NEXT:    v_cvt_f32_f16_e32 v3, s3
1773; SI-NEXT:    s_brev_b32 s2, -2
1774; SI-NEXT:    v_bfi_b32 v0, s2, v0, v1
1775; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
1776; SI-NEXT:    v_bfi_b32 v1, s2, v2, v3
1777; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
1778; SI-NEXT:    s_mov_b32 s4, s0
1779; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
1780; SI-NEXT:    s_mov_b32 s5, s1
1781; SI-NEXT:    v_or_b32_e32 v0, v1, v0
1782; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1783; SI-NEXT:    s_endpgm
1784;
1785; VI-LABEL: s_copysign_v2f16:
1786; VI:       ; %bb.0:
1787; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1788; VI-NEXT:    s_movk_i32 s4, 0x7fff
1789; VI-NEXT:    s_waitcnt lgkmcnt(0)
1790; VI-NEXT:    v_mov_b32_e32 v0, s2
1791; VI-NEXT:    v_mov_b32_e32 v1, s3
1792; VI-NEXT:    s_lshr_b32 s3, s3, 16
1793; VI-NEXT:    s_lshr_b32 s2, s2, 16
1794; VI-NEXT:    v_bfi_b32 v0, s4, v0, v1
1795; VI-NEXT:    v_mov_b32_e32 v1, s2
1796; VI-NEXT:    v_mov_b32_e32 v2, s3
1797; VI-NEXT:    v_bfi_b32 v1, s4, v1, v2
1798; VI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
1799; VI-NEXT:    v_or_b32_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1800; VI-NEXT:    v_mov_b32_e32 v0, s0
1801; VI-NEXT:    v_mov_b32_e32 v1, s1
1802; VI-NEXT:    flat_store_dword v[0:1], v2
1803; VI-NEXT:    s_endpgm
1804;
1805; GFX9-LABEL: s_copysign_v2f16:
1806; GFX9:       ; %bb.0:
1807; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1808; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
1809; GFX9-NEXT:    v_mov_b32_e32 v0, 0
1810; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1811; GFX9-NEXT:    v_mov_b32_e32 v1, s2
1812; GFX9-NEXT:    v_mov_b32_e32 v2, s3
1813; GFX9-NEXT:    s_lshr_b32 s3, s3, 16
1814; GFX9-NEXT:    s_lshr_b32 s2, s2, 16
1815; GFX9-NEXT:    v_bfi_b32 v1, s4, v1, v2
1816; GFX9-NEXT:    v_mov_b32_e32 v2, s2
1817; GFX9-NEXT:    v_mov_b32_e32 v3, s3
1818; GFX9-NEXT:    v_bfi_b32 v2, s4, v2, v3
1819; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff, v1
1820; GFX9-NEXT:    v_lshl_or_b32 v1, v2, 16, v1
1821; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
1822; GFX9-NEXT:    s_endpgm
1823;
1824; GFX11-LABEL: s_copysign_v2f16:
1825; GFX11:       ; %bb.0:
1826; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
1827; GFX11-NEXT:    v_mov_b32_e32 v2, 0
1828; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1829; GFX11-NEXT:    v_mov_b32_e32 v0, s3
1830; GFX11-NEXT:    s_lshr_b32 s3, s3, 16
1831; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
1832; GFX11-NEXT:    v_mov_b32_e32 v1, s3
1833; GFX11-NEXT:    v_bfi_b32 v0, 0x7fff, s2, v0
1834; GFX11-NEXT:    s_lshr_b32 s2, s2, 16
1835; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
1836; GFX11-NEXT:    v_bfi_b32 v1, 0x7fff, s2, v1
1837; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
1838; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
1839; GFX11-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
1840; GFX11-NEXT:    global_store_b32 v2, v0, s[0:1]
1841; GFX11-NEXT:    s_endpgm
; IR under test: element-wise copysign of the two vector arguments, stored as a
; whole <2 x half>.
1842  %out = call <2 x half> @llvm.copysign.v2f16(<2 x half> %arg_mag, <2 x half> %arg_sign)
1843  store <2 x half> %out, ptr addrspace(1) %arg_out
1844  ret void
1845}
1846
; Uniform copysign on <3 x half> with both operands passed as kernel arguments.
; The expected code uses three v_bfi_b32 operations and writes the result as
; two stores: a 32-bit store holding the first two elements and a 16-bit store
; at byte offset 4 for the third element.
; The assertion lines are autogenerated; regenerate rather than hand-edit.
1847define amdgpu_kernel void @s_copysign_v3f16(ptr addrspace(1) %arg_out, <3 x half> %arg_mag, <3 x half> %arg_sign) {
1848; SI-LABEL: s_copysign_v3f16:
1849; SI:       ; %bb.0:
1850; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0xb
1851; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x9
1852; SI-NEXT:    s_mov_b32 s7, 0xf000
1853; SI-NEXT:    s_waitcnt lgkmcnt(0)
1854; SI-NEXT:    s_lshr_b32 s6, s0, 16
1855; SI-NEXT:    v_cvt_f32_f16_e32 v1, s0
1856; SI-NEXT:    s_lshr_b32 s0, s2, 16
1857; SI-NEXT:    v_cvt_f32_f16_e32 v2, s6
1858; SI-NEXT:    v_cvt_f32_f16_e32 v3, s0
1859; SI-NEXT:    v_cvt_f32_f16_e32 v0, s1
1860; SI-NEXT:    v_cvt_f32_f16_e32 v4, s3
1861; SI-NEXT:    v_cvt_f32_f16_e32 v5, s2
1862; SI-NEXT:    s_brev_b32 s0, -2
1863; SI-NEXT:    v_bfi_b32 v2, s0, v2, v3
1864; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
1865; SI-NEXT:    v_bfi_b32 v1, s0, v1, v5
1866; SI-NEXT:    v_bfi_b32 v0, s0, v0, v4
1867; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
1868; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
1869; SI-NEXT:    s_mov_b32 s6, -1
1870; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
1871; SI-NEXT:    v_or_b32_e32 v1, v1, v2
1872; SI-NEXT:    buffer_store_short v0, off, s[4:7], 0 offset:4
1873; SI-NEXT:    buffer_store_dword v1, off, s[4:7], 0
1874; SI-NEXT:    s_endpgm
1875;
1876; VI-LABEL: s_copysign_v3f16:
1877; VI:       ; %bb.0:
1878; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
1879; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x24
1880; VI-NEXT:    s_movk_i32 s6, 0x7fff
1881; VI-NEXT:    s_waitcnt lgkmcnt(0)
1882; VI-NEXT:    v_mov_b32_e32 v0, s0
1883; VI-NEXT:    v_mov_b32_e32 v1, s2
1884; VI-NEXT:    s_lshr_b32 s2, s2, 16
1885; VI-NEXT:    s_lshr_b32 s0, s0, 16
1886; VI-NEXT:    v_bfi_b32 v0, s6, v0, v1
1887; VI-NEXT:    v_mov_b32_e32 v1, s0
1888; VI-NEXT:    v_mov_b32_e32 v2, s2
1889; VI-NEXT:    v_bfi_b32 v1, s6, v1, v2
1890; VI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
1891; VI-NEXT:    v_or_b32_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1892; VI-NEXT:    v_mov_b32_e32 v0, s1
1893; VI-NEXT:    v_mov_b32_e32 v1, s3
1894; VI-NEXT:    s_add_u32 s0, s4, 4
1895; VI-NEXT:    v_bfi_b32 v3, s6, v0, v1
1896; VI-NEXT:    s_addc_u32 s1, s5, 0
1897; VI-NEXT:    v_mov_b32_e32 v0, s0
1898; VI-NEXT:    v_mov_b32_e32 v1, s1
1899; VI-NEXT:    flat_store_short v[0:1], v3
1900; VI-NEXT:    v_mov_b32_e32 v0, s4
1901; VI-NEXT:    v_mov_b32_e32 v1, s5
1902; VI-NEXT:    flat_store_dword v[0:1], v2
1903; VI-NEXT:    s_endpgm
1904;
1905; GFX9-LABEL: s_copysign_v3f16:
1906; GFX9:       ; %bb.0:
1907; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
1908; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
1909; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
1910; GFX9-NEXT:    v_mov_b32_e32 v0, 0
1911; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1912; GFX9-NEXT:    v_mov_b32_e32 v1, s0
1913; GFX9-NEXT:    v_mov_b32_e32 v2, s2
1914; GFX9-NEXT:    s_lshr_b32 s2, s2, 16
1915; GFX9-NEXT:    s_lshr_b32 s0, s0, 16
1916; GFX9-NEXT:    v_bfi_b32 v1, s4, v1, v2
1917; GFX9-NEXT:    v_mov_b32_e32 v2, s0
1918; GFX9-NEXT:    v_mov_b32_e32 v3, s2
1919; GFX9-NEXT:    v_bfi_b32 v2, s4, v2, v3
1920; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff, v1
1921; GFX9-NEXT:    v_lshl_or_b32 v1, v2, 16, v1
1922; GFX9-NEXT:    v_mov_b32_e32 v2, s1
1923; GFX9-NEXT:    v_mov_b32_e32 v3, s3
1924; GFX9-NEXT:    v_bfi_b32 v2, s4, v2, v3
1925; GFX9-NEXT:    global_store_short v0, v2, s[6:7] offset:4
1926; GFX9-NEXT:    global_store_dword v0, v1, s[6:7]
1927; GFX9-NEXT:    s_endpgm
1928;
1929; GFX11-LABEL: s_copysign_v3f16:
1930; GFX11:       ; %bb.0:
1931; GFX11-NEXT:    s_clause 0x1
1932; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x2c
1933; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x24
1934; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1935; GFX11-NEXT:    v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s2
1936; GFX11-NEXT:    s_lshr_b32 s2, s2, 16
1937; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
1938; GFX11-NEXT:    v_dual_mov_b32 v2, s3 :: v_dual_mov_b32 v1, s2
1939; GFX11-NEXT:    v_bfi_b32 v0, 0x7fff, s0, v0
1940; GFX11-NEXT:    s_lshr_b32 s0, s0, 16
1941; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
1942; GFX11-NEXT:    v_bfi_b32 v2, 0x7fff, s1, v2
1943; GFX11-NEXT:    v_bfi_b32 v1, 0x7fff, s0, v1
1944; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
1945; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
1946; GFX11-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
1947; GFX11-NEXT:    s_clause 0x1
1948; GFX11-NEXT:    global_store_b16 v3, v2, s[4:5] offset:4
1949; GFX11-NEXT:    global_store_b32 v3, v0, s[4:5]
1950; GFX11-NEXT:    s_endpgm
; IR under test: element-wise copysign of the two vector arguments, stored as a
; whole <3 x half>.
1951  %out = call <3 x half> @llvm.copysign.v3f16(<3 x half> %arg_mag, <3 x half> %arg_sign)
1952  store <3 x half> %out, ptr addrspace(1) %arg_out
1953  ret void
1954}
1955
; Uniform copysign on <4 x half> with both operands passed as kernel arguments.
; The expected code uses four v_bfi_b32 operations, packs the results pairwise
; into two 32-bit registers, and writes them with one 64-bit store.
; The assertion lines are autogenerated; regenerate rather than hand-edit.
1956define amdgpu_kernel void @s_copysign_v4f16(ptr addrspace(1) %arg_out, <4 x half> %arg_mag, <4 x half> %arg_sign) {
1957; SI-LABEL: s_copysign_v4f16:
1958; SI:       ; %bb.0:
1959; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0xb
1960; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x9
1961; SI-NEXT:    s_mov_b32 s7, 0xf000
1962; SI-NEXT:    s_mov_b32 s6, -1
1963; SI-NEXT:    s_waitcnt lgkmcnt(0)
1964; SI-NEXT:    s_lshr_b32 s8, s0, 16
1965; SI-NEXT:    v_cvt_f32_f16_e32 v2, s0
1966; SI-NEXT:    s_lshr_b32 s0, s2, 16
1967; SI-NEXT:    s_lshr_b32 s9, s1, 16
1968; SI-NEXT:    v_cvt_f32_f16_e32 v4, s0
1969; SI-NEXT:    s_lshr_b32 s0, s3, 16
1970; SI-NEXT:    v_cvt_f32_f16_e32 v0, s8
1971; SI-NEXT:    v_cvt_f32_f16_e32 v1, s9
1972; SI-NEXT:    v_cvt_f32_f16_e32 v5, s0
1973; SI-NEXT:    v_cvt_f32_f16_e32 v3, s1
1974; SI-NEXT:    v_cvt_f32_f16_e32 v6, s2
1975; SI-NEXT:    v_cvt_f32_f16_e32 v7, s3
1976; SI-NEXT:    s_brev_b32 s0, -2
1977; SI-NEXT:    v_bfi_b32 v1, s0, v1, v5
1978; SI-NEXT:    v_bfi_b32 v0, s0, v0, v4
1979; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
1980; SI-NEXT:    v_bfi_b32 v3, s0, v3, v7
1981; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
1982; SI-NEXT:    v_bfi_b32 v2, s0, v2, v6
1983; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
1984; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
1985; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
1986; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
1987; SI-NEXT:    v_or_b32_e32 v1, v3, v1
1988; SI-NEXT:    v_or_b32_e32 v0, v2, v0
1989; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1990; SI-NEXT:    s_endpgm
1991;
1992; VI-LABEL: s_copysign_v4f16:
1993; VI:       ; %bb.0:
1994; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
1995; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x24
1996; VI-NEXT:    s_movk_i32 s6, 0x7fff
1997; VI-NEXT:    s_waitcnt lgkmcnt(0)
1998; VI-NEXT:    v_mov_b32_e32 v0, s1
1999; VI-NEXT:    v_mov_b32_e32 v1, s3
2000; VI-NEXT:    s_lshr_b32 s3, s3, 16
2001; VI-NEXT:    s_lshr_b32 s1, s1, 16
2002; VI-NEXT:    v_bfi_b32 v0, s6, v0, v1
2003; VI-NEXT:    v_mov_b32_e32 v1, s1
2004; VI-NEXT:    v_mov_b32_e32 v2, s3
2005; VI-NEXT:    v_bfi_b32 v1, s6, v1, v2
2006; VI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
2007; VI-NEXT:    v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2008; VI-NEXT:    v_mov_b32_e32 v0, s0
2009; VI-NEXT:    v_mov_b32_e32 v2, s2
2010; VI-NEXT:    s_lshr_b32 s1, s2, 16
2011; VI-NEXT:    s_lshr_b32 s0, s0, 16
2012; VI-NEXT:    v_bfi_b32 v0, s6, v0, v2
2013; VI-NEXT:    v_mov_b32_e32 v2, s0
2014; VI-NEXT:    v_mov_b32_e32 v3, s1
2015; VI-NEXT:    v_bfi_b32 v2, s6, v2, v3
2016; VI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
2017; VI-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2018; VI-NEXT:    v_mov_b32_e32 v2, s4
2019; VI-NEXT:    v_mov_b32_e32 v3, s5
2020; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
2021; VI-NEXT:    s_endpgm
2022;
2023; GFX9-LABEL: s_copysign_v4f16:
2024; GFX9:       ; %bb.0:
2025; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
2026; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
2027; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
2028; GFX9-NEXT:    v_mov_b32_e32 v2, 0
2029; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2030; GFX9-NEXT:    v_mov_b32_e32 v0, s1
2031; GFX9-NEXT:    v_mov_b32_e32 v1, s3
2032; GFX9-NEXT:    s_lshr_b32 s3, s3, 16
2033; GFX9-NEXT:    s_lshr_b32 s1, s1, 16
2034; GFX9-NEXT:    v_bfi_b32 v0, s4, v0, v1
2035; GFX9-NEXT:    v_mov_b32_e32 v1, s1
2036; GFX9-NEXT:    v_mov_b32_e32 v3, s3
2037; GFX9-NEXT:    v_bfi_b32 v1, s4, v1, v3
2038; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
2039; GFX9-NEXT:    v_lshl_or_b32 v1, v1, 16, v0
2040; GFX9-NEXT:    v_mov_b32_e32 v0, s0
2041; GFX9-NEXT:    v_mov_b32_e32 v3, s2
2042; GFX9-NEXT:    s_lshr_b32 s1, s2, 16
2043; GFX9-NEXT:    s_lshr_b32 s0, s0, 16
2044; GFX9-NEXT:    v_bfi_b32 v0, s4, v0, v3
2045; GFX9-NEXT:    v_mov_b32_e32 v3, s0
2046; GFX9-NEXT:    v_mov_b32_e32 v4, s1
2047; GFX9-NEXT:    v_bfi_b32 v3, s4, v3, v4
2048; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
2049; GFX9-NEXT:    v_lshl_or_b32 v0, v3, 16, v0
2050; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
2051; GFX9-NEXT:    s_endpgm
2052;
2053; GFX11-LABEL: s_copysign_v4f16:
2054; GFX11:       ; %bb.0:
2055; GFX11-NEXT:    s_clause 0x1
2056; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x2c
2057; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x24
2058; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
2059; GFX11-NEXT:    v_dual_mov_b32 v5, 0 :: v_dual_mov_b32 v0, s3
2060; GFX11-NEXT:    v_mov_b32_e32 v1, s2
2061; GFX11-NEXT:    s_lshr_b32 s3, s3, 16
2062; GFX11-NEXT:    s_lshr_b32 s2, s2, 16
2063; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
2064; GFX11-NEXT:    v_dual_mov_b32 v2, s3 :: v_dual_mov_b32 v3, s2
2065; GFX11-NEXT:    v_bfi_b32 v0, 0x7fff, s1, v0
2066; GFX11-NEXT:    v_bfi_b32 v1, 0x7fff, s0, v1
2067; GFX11-NEXT:    s_lshr_b32 s6, s1, 16
2068; GFX11-NEXT:    s_lshr_b32 s0, s0, 16
2069; GFX11-NEXT:    v_bfi_b32 v2, 0x7fff, s6, v2
2070; GFX11-NEXT:    v_bfi_b32 v3, 0x7fff, s0, v3
2071; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
2072; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff, v1
2073; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
2074; GFX11-NEXT:    v_lshl_or_b32 v1, v2, 16, v0
2075; GFX11-NEXT:    v_lshl_or_b32 v0, v3, 16, v4
2076; GFX11-NEXT:    global_store_b64 v5, v[0:1], s[4:5]
2077; GFX11-NEXT:    s_endpgm
; IR under test: element-wise copysign of the two vector arguments, stored as a
; whole <4 x half>.
2078  %out = call <4 x half> @llvm.copysign.v4f16(<4 x half> %arg_mag, <4 x half> %arg_sign)
2079  store <4 x half> %out, ptr addrspace(1) %arg_out
2080  ret void
2081}
2082
; Attribute group #0 is the one referenced by the intrinsic declarations at the
; top of this file (the llvm.copysign.* and llvm.amdgcn.workitem.id.x declares).
2083attributes #0 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
2084