; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI %s
; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX11 %s
; RUN: not llc -mtriple=r600 -mcpu=redwood < %s

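; fneg of a uniform f32 lowers to a scalar sign-bit flip (s_xor_b32 with 0x80000000) rather than a floating-point subtract.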
define amdgpu_kernel void @s_fneg_f32(ptr addrspace(1) %out, float %in) {
; SI-LABEL: s_fneg_f32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dword s6, s[4:5], 0xb
; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_xor_b32 s4, s6, 0x80000000
; SI-NEXT:    v_mov_b32_e32 v0, s4
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: s_fneg_f32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dword s2, s[4:5], 0x2c
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_xor_b32 s2, s2, 0x80000000
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_mov_b32_e32 v2, s2
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    s_endpgm
;
; GFX11-LABEL: s_fneg_f32:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    s_load_b32 s2, s[4:5], 0x2c
; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_xor_b32 s2, s2, 0x80000000
; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT:    s_endpgm
  %fneg = fsub float -0.000000e+00, %in
  store float %fneg, ptr addrspace(1) %out
  ret void
}

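; A vector fneg is negated element-wise, one sign-bit xor per component.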
define amdgpu_kernel void @s_fneg_v2f32(ptr addrspace(1) nocapture %out, <2 x float> %in) {
; SI-LABEL: s_fneg_v2f32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    s_xor_b32 s0, s3, 0x80000000
; SI-NEXT:    s_xor_b32 s1, s2, 0x80000000
; SI-NEXT:    v_mov_b32_e32 v0, s1
; SI-NEXT:    v_mov_b32_e32 v1, s0
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: s_fneg_v2f32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_xor_b32 s3, s3, 0x80000000
; VI-NEXT:    s_xor_b32 s2, s2, 0x80000000
; VI-NEXT:    v_mov_b32_e32 v3, s1
; VI-NEXT:    v_mov_b32_e32 v0, s2
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_mov_b32_e32 v2, s0
; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT:    s_endpgm
;
; GFX11-LABEL: s_fneg_v2f32:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_xor_b32 s2, s2, 0x80000000
; GFX11-NEXT:    s_xor_b32 s3, s3, 0x80000000
; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
; GFX11-NEXT:    v_mov_b32_e32 v0, s2
; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
; GFX11-NEXT:    s_endpgm
  %fneg = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %in
  store <2 x float> %fneg, ptr addrspace(1) %out
  ret void
}

define amdgpu_kernel void @s_fneg_v4f32(ptr addrspace(1) nocapture %out, <4 x float> %in) {
; SI-LABEL: s_fneg_v4f32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0xd
; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_xor_b32 s3, s3, 0x80000000
; SI-NEXT:    s_xor_b32 s2, s2, 0x80000000
; SI-NEXT:    s_xor_b32 s1, s1, 0x80000000
; SI-NEXT:    s_xor_b32 s0, s0, 0x80000000
; SI-NEXT:    v_mov_b32_e32 v0, s0
; SI-NEXT:    v_mov_b32_e32 v1, s1
; SI-NEXT:    v_mov_b32_e32 v2, s2
; SI-NEXT:    v_mov_b32_e32 v3, s3
; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: s_fneg_v4f32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x34
; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x24
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_xor_b32 s3, s3, 0x80000000
; VI-NEXT:    s_xor_b32 s2, s2, 0x80000000
; VI-NEXT:    s_xor_b32 s1, s1, 0x80000000
; VI-NEXT:    s_xor_b32 s0, s0, 0x80000000
; VI-NEXT:    v_mov_b32_e32 v4, s4
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_mov_b32_e32 v2, s2
; VI-NEXT:    v_mov_b32_e32 v3, s3
; VI-NEXT:    v_mov_b32_e32 v5, s5
; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT:    s_endpgm
;
; GFX11-LABEL: s_fneg_v4f32:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x34
; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x24
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_xor_b32 s3, s3, 0x80000000
; GFX11-NEXT:    s_xor_b32 s2, s2, 0x80000000
; GFX11-NEXT:    s_xor_b32 s0, s0, 0x80000000
; GFX11-NEXT:    s_xor_b32 s1, s1, 0x80000000
; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT:    v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s1
; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3
; GFX11-NEXT:    v_mov_b32_e32 v2, s2
; GFX11-NEXT:    global_store_b128 v4, v[0:3], s[4:5]
; GFX11-NEXT:    s_endpgm
  %fneg = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %in
  store <4 x float> %fneg, ptr addrspace(1) %out
  ret void
}

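; fsub from +0.0 is not a sign flip (it differs from fneg for a +0.0 input), so it must stay a real v_sub_f32 from 0.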
define amdgpu_kernel void @fsub0_f32(ptr addrspace(1) %out, i32 %in) {
; SI-LABEL: fsub0_f32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dword s6, s[4:5], 0xb
; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_sub_f32_e64 v0, 0, s6
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: fsub0_f32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dword s2, s[4:5], 0x2c
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_sub_f32_e64 v2, 0, s2
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    s_endpgm
;
; GFX11-LABEL: fsub0_f32:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    s_load_b32 s2, s[4:5], 0x2c
; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-NEXT:    v_mov_b32_e32 v0, 0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    v_sub_f32_e64 v1, 0, s2
; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT:    s_endpgm
  %bc = bitcast i32 %in to float
  %fsub = fsub float 0.0, %bc
  store float %fsub, ptr addrspace(1) %out
  ret void
}

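; fsub from -0.0 is a true fneg; since the operand arrives as an i32 bitcast, the negation is free and folds to a scalar xor of the sign bit.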
define amdgpu_kernel void @fneg_free_f32(ptr addrspace(1) %out, i32 %in) {
; SI-LABEL: fneg_free_f32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dword s6, s[4:5], 0xb
; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_xor_b32 s4, s6, 0x80000000
; SI-NEXT:    v_mov_b32_e32 v0, s4
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: fneg_free_f32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dword s2, s[4:5], 0x2c
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_xor_b32 s2, s2, 0x80000000
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_mov_b32_e32 v2, s2
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    s_endpgm
;
; GFX11-LABEL: fneg_free_f32:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    s_load_b32 s2, s[4:5], 0x2c
; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_xor_b32 s2, s2, 0x80000000
; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT:    s_endpgm
  %bc = bitcast i32 %in to float
  %fsub = fsub float -0.0, %bc
  store float %fsub, ptr addrspace(1) %out
  ret void
}

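; The fneg feeding the multiply folds away into a negated source operand (v_mul_f32 -s, s).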
define amdgpu_kernel void @fneg_fold_f32(ptr addrspace(1) %out, float %in) {
; SI-LABEL: fneg_fold_f32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dword s6, s[4:5], 0xb
; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_mul_f32_e64 v0, -s6, s6
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: fneg_fold_f32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dword s2, s[4:5], 0x2c
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mul_f32_e64 v2, -s2, s2
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    s_endpgm
;
; GFX11-LABEL: fneg_fold_f32:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    s_load_b32 s2, s[4:5], 0x2c
; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-NEXT:    v_mov_b32_e32 v0, 0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    v_mul_f32_e64 v1, -s2, s2
; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT:    s_endpgm
  %fsub = fsub float -0.0, %in
  %fmul = fmul float %fsub, %in
  store float %fmul, ptr addrspace(1) %out
  ret void
}

; Make sure we turn some integer operations back into fneg
define amdgpu_kernel void @bitpreserve_fneg_f32(ptr addrspace(1) %out, float %in) {
; SI-LABEL: bitpreserve_fneg_f32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dword s6, s[4:5], 0xb
; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_mul_f32_e64 v0, s6, -4.0
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: bitpreserve_fneg_f32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dword s2, s[4:5], 0x2c
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mul_f32_e64 v2, s2, -4.0
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    s_endpgm
;
; GFX11-LABEL: bitpreserve_fneg_f32:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    s_load_b32 s2, s[4:5], 0x2c
; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-NEXT:    v_mov_b32_e32 v0, 0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    v_mul_f32_e64 v1, s2, -4.0
; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT:    s_endpgm
  %in.bc = bitcast float %in to i32
  %int.abs = xor i32 %in.bc, 2147483648
  %bc = bitcast i32 %int.abs to float
  %fadd = fmul float %bc, 4.0
  store float %fadd, ptr addrspace(1) %out
  ret void
}

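; A bare integer sign-bit xor with no FP user simply remains an integer xor.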
define amdgpu_kernel void @s_fneg_i32(ptr addrspace(1) %out, i32 %in) {
; SI-LABEL: s_fneg_i32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dword s6, s[4:5], 0xb
; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_xor_b32 s4, s6, 0x80000000
; SI-NEXT:    v_mov_b32_e32 v0, s4
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: s_fneg_i32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dword s2, s[4:5], 0x2c
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_xor_b32 s2, s2, 0x80000000
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_mov_b32_e32 v2, s2
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    s_endpgm
;
; GFX11-LABEL: s_fneg_i32:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    s_load_b32 s2, s[4:5], 0x2c
; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_xor_b32 s2, s2, 0x80000000
; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT:    s_endpgm
  %fneg = xor i32 %in, -2147483648
  store i32 %fneg, ptr addrspace(1) %out
  ret void
}

define i32 @v_fneg_i32(i32 %in) {
; GCN-LABEL: v_fneg_i32:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
; GCN-NEXT:    s_setpc_b64 s[30:31]
  %fneg = xor i32 %in, -2147483648
  ret i32 %fneg
}

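; The same integer sign-bit xor, once bitcast to float and added to 2.0, becomes 2.0 - x via a folded source negation.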
define amdgpu_kernel void @s_fneg_i32_fp_use(ptr addrspace(1) %out, i32 %in) {
; SI-LABEL: s_fneg_i32_fp_use:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dword s6, s[4:5], 0xb
; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_sub_f32_e64 v0, 2.0, s6
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: s_fneg_i32_fp_use:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dword s2, s[4:5], 0x2c
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_sub_f32_e64 v2, 2.0, s2
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    s_endpgm
;
; GFX11-LABEL: s_fneg_i32_fp_use:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    s_load_b32 s2, s[4:5], 0x2c
; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-NEXT:    v_mov_b32_e32 v0, 0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    v_sub_f32_e64 v1, 2.0, s2
; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT:    s_endpgm
  %fneg = xor i32 %in, -2147483648
  %bitcast = bitcast i32 %fneg to float
  %fadd = fadd float %bitcast, 2.0
  store float %fadd, ptr addrspace(1) %out
  ret void
}

define float @v_fneg_i32_fp_use(i32 %in) {
; GCN-LABEL: v_fneg_i32_fp_use:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    v_sub_f32_e32 v0, 2.0, v0
; GCN-NEXT:    s_setpc_b64 s[30:31]
  %fneg = xor i32 %in, -2147483648
  %bitcast = bitcast i32 %fneg to float
  %fadd = fadd float %bitcast, 2.0
  ret float %fadd
}

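; For i64, only the sign bit of the high dword is flipped; the low dword passes through unchanged.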
define amdgpu_kernel void @s_fneg_i64(ptr addrspace(1) %out, i64 %in) {
; SI-LABEL: s_fneg_i64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_xor_b32 s0, s3, 0x80000000
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    v_mov_b32_e32 v0, s2
; SI-NEXT:    v_mov_b32_e32 v1, s0
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: s_fneg_i64:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    s_xor_b32 s0, s3, 0x80000000
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_mov_b32_e32 v2, s2
; VI-NEXT:    v_mov_b32_e32 v3, s0
; VI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT:    s_endpgm
;
; GFX11-LABEL: s_fneg_i64:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_xor_b32 s3, s3, 0x80000000
; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
; GFX11-NEXT:    v_mov_b32_e32 v0, s2
; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
; GFX11-NEXT:    s_endpgm
  %fneg = xor i64 %in, -9223372036854775808
  store i64 %fneg, ptr addrspace(1) %out
  ret void
}

define i64 @v_fneg_i64(i64 %in) {
; GCN-LABEL: v_fneg_i64:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
; GCN-NEXT:    s_setpc_b64 s[30:31]
  %fneg = xor i64 %in, -9223372036854775808
  ret i64 %fneg
}

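; When the negated i64 is bitcast to double and used in FP math, the xor folds into a neg source modifier on v_add_f64.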
define amdgpu_kernel void @s_fneg_i64_fp_use(ptr addrspace(1) %out, i64 %in) {
; SI-LABEL: s_fneg_i64_fp_use:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_add_f64 v[0:1], -s[2:3], 2.0
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: s_fneg_i64_fp_use:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_add_f64 v[0:1], -s[2:3], 2.0
; VI-NEXT:    v_mov_b32_e32 v2, s0
; VI-NEXT:    v_mov_b32_e32 v3, s1
; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT:    s_endpgm
;
; GFX11-LABEL: s_fneg_i64_fp_use:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT:    v_mov_b32_e32 v2, 0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    v_add_f64 v[0:1], -s[2:3], 2.0
; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
; GFX11-NEXT:    s_endpgm
  %fneg = xor i64 %in, -9223372036854775808
  %bitcast = bitcast i64 %fneg to double
  %fadd = fadd double %bitcast, 2.0
  store double %fadd, ptr addrspace(1) %out
  ret void
}

define double @v_fneg_i64_fp_use(i64 %in) {
; GCN-LABEL: v_fneg_i64_fp_use:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    v_add_f64 v[0:1], -v[0:1], 2.0
; GCN-NEXT:    s_setpc_b64 s[30:31]
  %fneg = xor i64 %in, -9223372036854775808
  %bitcast = bitcast i64 %fneg to double
  %fadd = fadd double %bitcast, 2.0
  ret double %fadd
}

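; i16 sign-bit flip: the 16-bit constant -32768 appears sign-extended as 0xffff8000 in the 32-bit xor.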
define i16 @v_fneg_i16(i16 %in) {
; GCN-LABEL: v_fneg_i16:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    v_xor_b32_e32 v0, 0xffff8000, v0
; GCN-NEXT:    s_setpc_b64 s[30:31]
  %fneg = xor i16 %in, -32768
  ret i16 %fneg
}

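; With an f16 use, SI (no native f16 arithmetic) converts through f32, while VI and GFX11 fold the negation into v_sub_f16.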
define amdgpu_kernel void @s_fneg_i16_fp_use(ptr addrspace(1) %out, i16 %in) {
; SI-LABEL: s_fneg_i16_fp_use:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dword s0, s[4:5], 0xb
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_cvt_f32_f16_e32 v0, s0
; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT:    v_sub_f32_e32 v0, 2.0, v0
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: s_fneg_i16_fp_use:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dword s2, s[4:5], 0x2c
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_sub_f16_e64 v2, 2.0, s2
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    flat_store_short v[0:1], v2
; VI-NEXT:    s_endpgm
;
; GFX11-LABEL: s_fneg_i16_fp_use:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    s_load_b32 s2, s[4:5], 0x2c
; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-NEXT:    v_mov_b32_e32 v0, 0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    v_sub_f16_e64 v1, 2.0, s2
; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
; GFX11-NEXT:    s_endpgm
  %fneg = xor i16 %in, -32768
  %bitcast = bitcast i16 %fneg to half
  %fadd = fadd half %bitcast, 2.0
  store half %fadd, ptr addrspace(1) %out
  ret void
}

define half @v_fneg_i16_fp_use(i16 %in) {
; SI-LABEL: v_fneg_i16_fp_use:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-NEXT:    v_sub_f32_e32 v0, 2.0, v0
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: v_fneg_i16_fp_use:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    v_sub_f16_e32 v0, 2.0, v0
; VI-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_fneg_i16_fp_use:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    v_sub_f16_e32 v0, 2.0, v0
; GFX11-NEXT:    s_setpc_b64 s[30:31]
  %fneg = xor i16 %in, -32768
  %bitcast = bitcast i16 %fneg to half
  %fadd = fadd half %bitcast, 2.0
  ret half %fadd
}

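; Packed <2 x i16> negation: SI and GFX11 flip both sign bits with a single 0x80008000 xor; VI splits, xors and repacks the halves.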
define amdgpu_kernel void @s_fneg_v2i16(ptr addrspace(1) %out, i32 %arg) {
; SI-LABEL: s_fneg_v2i16:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dword s6, s[4:5], 0xb
; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_xor_b32 s4, s6, 0x80008000
; SI-NEXT:    v_mov_b32_e32 v0, s4
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: s_fneg_v2i16:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dword s2, s[4:5], 0x2c
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_lshr_b32 s3, s2, 16
; VI-NEXT:    s_xor_b32 s2, s2, 0x8000
; VI-NEXT:    s_xor_b32 s3, s3, 0x8000
; VI-NEXT:    s_and_b32 s2, s2, 0xffff
; VI-NEXT:    s_lshl_b32 s3, s3, 16
; VI-NEXT:    s_or_b32 s2, s2, s3
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_mov_b32_e32 v2, s2
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    s_endpgm
;
; GFX11-LABEL: s_fneg_v2i16:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    s_load_b32 s2, s[4:5], 0x2c
; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_xor_b32 s2, s2, 0x80008000
; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT:    s_endpgm
  %in = bitcast i32 %arg to <2 x i16>
  %fneg = xor <2 x i16> %in, <i16 -32768, i16 -32768>
  store <2 x i16> %fneg, ptr addrspace(1) %out
  ret void
}

define <2 x i16> @v_fneg_v2i16(<2 x i16> %in) {
; SI-LABEL: v_fneg_v2i16:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    v_xor_b32_e32 v1, 0x8000, v1
; SI-NEXT:    v_xor_b32_e32 v0, 0x8000, v0
; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
; SI-NEXT:    v_or_b32_e32 v0, v0, v2
; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: v_fneg_v2i16:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    v_xor_b32_e32 v0, 0x80008000, v0
; VI-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_fneg_v2i16:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    v_xor_b32_e32 v0, 0x80008000, v0
; GFX11-NEXT:    s_setpc_b64 s[30:31]
  %fneg = xor <2 x i16> %in, <i16 -32768, i16 -32768>
  ret <2 x i16> %fneg
}

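; With a packed f16 use, SI scalarizes through f32, VI uses f16 SDWA ops, and GFX11 folds the negation into neg_lo/neg_hi on v_pk_add_f16.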
define amdgpu_kernel void @s_fneg_v2i16_fp_use(ptr addrspace(1) %out, i32 %arg) {
; SI-LABEL: s_fneg_v2i16_fp_use:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dword s0, s[4:5], 0xb
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_lshr_b32 s1, s0, 16
; SI-NEXT:    v_cvt_f32_f16_e32 v0, s1
; SI-NEXT:    v_cvt_f32_f16_e32 v1, s0
; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT:    v_sub_f32_e32 v0, 2.0, v0
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    v_sub_f32_e32 v1, 2.0, v1
; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
; SI-NEXT:    v_or_b32_e32 v0, v1, v0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: s_fneg_v2i16_fp_use:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dword s2, s[4:5], 0x2c
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-NEXT:    v_mov_b32_e32 v0, 0x4000
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_lshr_b32 s3, s2, 16
; VI-NEXT:    s_xor_b32 s3, s3, 0x8000
; VI-NEXT:    s_xor_b32 s2, s2, 0x8000
; VI-NEXT:    v_mov_b32_e32 v2, s3
; VI-NEXT:    v_add_f16_e64 v1, s2, 2.0
; VI-NEXT:    v_add_f16_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT:    v_or_b32_e32 v2, v1, v0
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    s_endpgm
;
; GFX11-LABEL: s_fneg_v2i16_fp_use:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    s_load_b32 s2, s[4:5], 0x2c
; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-NEXT:    v_mov_b32_e32 v0, 0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    v_pk_add_f16 v1, s2, 2.0 op_sel_hi:[1,0] neg_lo:[1,0] neg_hi:[1,0]
; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT:    s_endpgm
  %in = bitcast i32 %arg to <2 x i16>
  %fneg = xor <2 x i16> %in, <i16 -32768, i16 -32768>
  %bitcast = bitcast <2 x i16> %fneg to <2 x half>
  %fadd = fadd <2 x half> %bitcast, <half 2.0, half 2.0>
  store <2 x half> %fadd, ptr addrspace(1) %out
  ret void
}

define <2 x half> @v_fneg_v2i16_fp_use(i32 %arg) {
; SI-LABEL: v_fneg_v2i16_fp_use:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
; SI-NEXT:    v_sub_f32_e32 v0, 2.0, v0
; SI-NEXT:    v_sub_f32_e32 v1, 2.0, v1
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: v_fneg_v2i16_fp_use:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, 0x4000
; VI-NEXT:    v_sub_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; VI-NEXT:    v_sub_f16_e32 v0, 2.0, v0
; VI-NEXT:    v_or_b32_e32 v0, v0, v1
; VI-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_fneg_v2i16_fp_use:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    v_pk_add_f16 v0, v0, 2.0 op_sel_hi:[1,0] neg_lo:[1,0] neg_hi:[1,0]
; GFX11-NEXT:    s_setpc_b64 s[30:31]
  %in = bitcast i32 %arg to <2 x i16>
  %fneg = xor <2 x i16> %in, <i16 -32768, i16 -32768>
  %bitcast = bitcast <2 x i16> %fneg to <2 x half>
  %fadd = fadd <2 x half> %bitcast, <half 2.0, half 2.0>
  ret <2 x half> %fadd
}