xref: /llvm-project/llvm/test/CodeGen/AMDGPU/fneg.f16.ll (revision 6548b6354d1d990e1c98736f5e7c3de876bedc8e)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
2; RUN: llc -amdgpu-scalarize-global-loads=false -mcpu=kaveri -mtriple=amdgcn--amdhsa -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=CIVI,CI %s
3; RUN: llc -amdgpu-scalarize-global-loads=false -mcpu=tonga -mtriple=amdgcn--amdhsa -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=CIVI,GFX8 %s
4; RUN: llc -amdgpu-scalarize-global-loads=false -mcpu=gfx900 -mtriple=amdgcn--amdhsa -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
5; RUN: llc -amdgpu-scalarize-global-loads=false -mcpu=gfx1100 -mtriple=amdgcn--amdhsa -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s
6
7; FIXME: Should be able to do scalar op
; Negates a half kernel argument (`fsub half -0.0, %in`) and stores the result
; to %out. The autogenerated checks for every prefix (CI, GFX8, GFX9, GFX11)
; expect the sign flip as a scalar `s_xor_b32` with the mask 0x8000, followed
; by a 16-bit store.
8define amdgpu_kernel void @s_fneg_f16(ptr addrspace(1) %out, half %in) #0 {
9; CI-LABEL: s_fneg_f16:
10; CI:       ; %bb.0:
11; CI-NEXT:    s_load_dword s2, s[8:9], 0x2
12; CI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
13; CI-NEXT:    s_waitcnt lgkmcnt(0)
14; CI-NEXT:    s_xor_b32 s2, s2, 0x8000
15; CI-NEXT:    v_mov_b32_e32 v0, s0
16; CI-NEXT:    v_mov_b32_e32 v1, s1
17; CI-NEXT:    v_mov_b32_e32 v2, s2
18; CI-NEXT:    flat_store_short v[0:1], v2
19; CI-NEXT:    s_endpgm
20;
21; GFX8-LABEL: s_fneg_f16:
22; GFX8:       ; %bb.0:
23; GFX8-NEXT:    s_load_dword s2, s[8:9], 0x8
24; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
25; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
26; GFX8-NEXT:    s_xor_b32 s2, s2, 0x8000
27; GFX8-NEXT:    v_mov_b32_e32 v0, s0
28; GFX8-NEXT:    v_mov_b32_e32 v1, s1
29; GFX8-NEXT:    v_mov_b32_e32 v2, s2
30; GFX8-NEXT:    flat_store_short v[0:1], v2
31; GFX8-NEXT:    s_endpgm
32;
33; GFX9-LABEL: s_fneg_f16:
34; GFX9:       ; %bb.0:
35; GFX9-NEXT:    s_load_dword s2, s[8:9], 0x8
36; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
37; GFX9-NEXT:    v_mov_b32_e32 v0, 0
38; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
39; GFX9-NEXT:    s_xor_b32 s2, s2, 0x8000
40; GFX9-NEXT:    v_mov_b32_e32 v1, s2
41; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
42; GFX9-NEXT:    s_endpgm
43;
44; GFX11-LABEL: s_fneg_f16:
45; GFX11:       ; %bb.0:
46; GFX11-NEXT:    s_clause 0x1
47; GFX11-NEXT:    s_load_b32 s2, s[4:5], 0x8
48; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
49; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
50; GFX11-NEXT:    s_xor_b32 s2, s2, 0x8000
51; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
52; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
53; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
54; GFX11-NEXT:    s_endpgm
55  %fneg = fsub half -0.0, %in
56  store half %fneg, ptr addrspace(1) %out
57  ret void
58}
59
60; FIXME: Should be able to use bit operations when the type is illegal as
61; well.
; Per-workitem negation of a half loaded from memory: loads the element of %in
; indexed by the workitem id, negates it with `fsub half -0.0, %val`, and
; stores it back. The autogenerated checks expect a vector `v_xor_b32` with
; 0x8000 on every prefix.
;
; NOTE(review): %gep.out is computed from %in rather than %out, so %out is
; never used and the store goes to the same address as the load (the checks
; reuse the load address for the store). Presumably unintentional -- confirm
; before changing, since the checks would need regenerating.
62define amdgpu_kernel void @v_fneg_f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
63; CI-LABEL: v_fneg_f16:
64; CI:       ; %bb.0:
65; CI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x2
66; CI-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
67; CI-NEXT:    s_waitcnt lgkmcnt(0)
68; CI-NEXT:    v_mov_b32_e32 v1, s1
69; CI-NEXT:    v_add_i32_e32 v0, vcc, s0, v0
70; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
71; CI-NEXT:    flat_load_ushort v2, v[0:1]
72; CI-NEXT:    s_waitcnt vmcnt(0)
73; CI-NEXT:    v_xor_b32_e32 v2, 0x8000, v2
74; CI-NEXT:    flat_store_short v[0:1], v2
75; CI-NEXT:    s_endpgm
76;
77; GFX8-LABEL: v_fneg_f16:
78; GFX8:       ; %bb.0:
79; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x8
80; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
81; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
82; GFX8-NEXT:    v_mov_b32_e32 v1, s1
83; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
84; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
85; GFX8-NEXT:    flat_load_ushort v2, v[0:1]
86; GFX8-NEXT:    s_waitcnt vmcnt(0)
87; GFX8-NEXT:    v_xor_b32_e32 v2, 0x8000, v2
88; GFX8-NEXT:    flat_store_short v[0:1], v2
89; GFX8-NEXT:    s_endpgm
90;
91; GFX9-LABEL: v_fneg_f16:
92; GFX9:       ; %bb.0:
93; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x8
94; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
95; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
96; GFX9-NEXT:    global_load_ushort v1, v0, s[0:1]
97; GFX9-NEXT:    s_waitcnt vmcnt(0)
98; GFX9-NEXT:    v_xor_b32_e32 v1, 0x8000, v1
99; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
100; GFX9-NEXT:    s_endpgm
101;
102; GFX11-LABEL: v_fneg_f16:
103; GFX11:       ; %bb.0:
104; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
105; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
106; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
107; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
108; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
109; GFX11-NEXT:    global_load_u16 v1, v0, s[0:1]
110; GFX11-NEXT:    s_waitcnt vmcnt(0)
111; GFX11-NEXT:    v_xor_b32_e32 v1, 0x8000, v1
112; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
113; GFX11-NEXT:    s_endpgm
114  %tid = call i32 @llvm.amdgcn.workitem.id.x()
115  %gep.in = getelementptr inbounds half, ptr addrspace(1) %in, i32 %tid
116  %gep.out = getelementptr inbounds half, ptr addrspace(1) %in, i32 %tid
117  %val = load half, ptr addrspace(1) %gep.in, align 2
118  %fneg = fsub half -0.0, %val
119  store half %fneg, ptr addrspace(1) %gep.out
120  ret void
121}
122
; Negation of a half obtained by bitcasting an i16 kernel argument. The input
; is `bitcast i16 %in to half` followed by `fsub half -0.0, %bc`; the result is
; stored to %out. The autogenerated checks expect a scalar `s_xor_b32` with
; 0x8000 on every prefix.
123define amdgpu_kernel void @s_fneg_free_f16(ptr addrspace(1) %out, i16 %in) #0 {
124; CI-LABEL: s_fneg_free_f16:
125; CI:       ; %bb.0:
126; CI-NEXT:    s_load_dword s2, s[8:9], 0x2
127; CI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
128; CI-NEXT:    s_waitcnt lgkmcnt(0)
129; CI-NEXT:    s_xor_b32 s2, s2, 0x8000
130; CI-NEXT:    v_mov_b32_e32 v0, s0
131; CI-NEXT:    v_mov_b32_e32 v1, s1
132; CI-NEXT:    v_mov_b32_e32 v2, s2
133; CI-NEXT:    flat_store_short v[0:1], v2
134; CI-NEXT:    s_endpgm
135;
136; GFX8-LABEL: s_fneg_free_f16:
137; GFX8:       ; %bb.0:
138; GFX8-NEXT:    s_load_dword s2, s[8:9], 0x8
139; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
140; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
141; GFX8-NEXT:    s_xor_b32 s2, s2, 0x8000
142; GFX8-NEXT:    v_mov_b32_e32 v0, s0
143; GFX8-NEXT:    v_mov_b32_e32 v1, s1
144; GFX8-NEXT:    v_mov_b32_e32 v2, s2
145; GFX8-NEXT:    flat_store_short v[0:1], v2
146; GFX8-NEXT:    s_endpgm
147;
148; GFX9-LABEL: s_fneg_free_f16:
149; GFX9:       ; %bb.0:
150; GFX9-NEXT:    s_load_dword s2, s[8:9], 0x8
151; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
152; GFX9-NEXT:    v_mov_b32_e32 v0, 0
153; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
154; GFX9-NEXT:    s_xor_b32 s2, s2, 0x8000
155; GFX9-NEXT:    v_mov_b32_e32 v1, s2
156; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
157; GFX9-NEXT:    s_endpgm
158;
159; GFX11-LABEL: s_fneg_free_f16:
160; GFX11:       ; %bb.0:
161; GFX11-NEXT:    s_clause 0x1
162; GFX11-NEXT:    s_load_b32 s2, s[4:5], 0x8
163; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
164; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
165; GFX11-NEXT:    s_xor_b32 s2, s2, 0x8000
166; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
167; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
168; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
169; GFX11-NEXT:    s_endpgm
170  %bc = bitcast i16 %in to half
171  %fsub = fsub half -0.0, %bc
172  store half %fsub, ptr addrspace(1) %out
173  ret void
174}
175
; Negation whose only user is a multiply: loads a half from %in, negates it
; with `fsub half -0.0, %val`, multiplies the negated value by the original,
; and stores the product to %out.
;
; Expected output per the autogenerated checks:
;   - GFX8/GFX9/GFX11: no separate xor; the negation appears as a `-` source
;     modifier on `v_mul_f16_e64`.
;   - CI: the value is converted to f32 twice (once with a `-` modifier on
;     `v_cvt_f32_f16_e64`), multiplied with `v_mul_f32`, and converted back.
176define amdgpu_kernel void @v_fneg_fold_f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
177; CI-LABEL: v_fneg_fold_f16:
178; CI:       ; %bb.0:
179; CI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
180; CI-NEXT:    s_waitcnt lgkmcnt(0)
181; CI-NEXT:    v_mov_b32_e32 v0, s2
182; CI-NEXT:    v_mov_b32_e32 v1, s3
183; CI-NEXT:    flat_load_ushort v0, v[0:1]
184; CI-NEXT:    s_waitcnt vmcnt(0)
185; CI-NEXT:    v_cvt_f32_f16_e32 v1, v0
186; CI-NEXT:    v_cvt_f32_f16_e64 v0, -v0
187; CI-NEXT:    v_mul_f32_e32 v0, v0, v1
188; CI-NEXT:    v_cvt_f16_f32_e32 v2, v0
189; CI-NEXT:    v_mov_b32_e32 v0, s0
190; CI-NEXT:    v_mov_b32_e32 v1, s1
191; CI-NEXT:    flat_store_short v[0:1], v2
192; CI-NEXT:    s_endpgm
193;
194; GFX8-LABEL: v_fneg_fold_f16:
195; GFX8:       ; %bb.0:
196; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
197; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
198; GFX8-NEXT:    v_mov_b32_e32 v0, s2
199; GFX8-NEXT:    v_mov_b32_e32 v1, s3
200; GFX8-NEXT:    flat_load_ushort v2, v[0:1]
201; GFX8-NEXT:    v_mov_b32_e32 v0, s0
202; GFX8-NEXT:    v_mov_b32_e32 v1, s1
203; GFX8-NEXT:    s_waitcnt vmcnt(0)
204; GFX8-NEXT:    v_mul_f16_e64 v2, -v2, v2
205; GFX8-NEXT:    flat_store_short v[0:1], v2
206; GFX8-NEXT:    s_endpgm
207;
208; GFX9-LABEL: v_fneg_fold_f16:
209; GFX9:       ; %bb.0:
210; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
211; GFX9-NEXT:    v_mov_b32_e32 v0, 0
212; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
213; GFX9-NEXT:    global_load_ushort v1, v0, s[2:3]
214; GFX9-NEXT:    s_waitcnt vmcnt(0)
215; GFX9-NEXT:    v_mul_f16_e64 v1, -v1, v1
216; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
217; GFX9-NEXT:    s_endpgm
218;
219; GFX11-LABEL: v_fneg_fold_f16:
220; GFX11:       ; %bb.0:
221; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
222; GFX11-NEXT:    v_mov_b32_e32 v0, 0
223; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
224; GFX11-NEXT:    global_load_u16 v1, v0, s[2:3]
225; GFX11-NEXT:    s_waitcnt vmcnt(0)
226; GFX11-NEXT:    v_mul_f16_e64 v1, -v1, v1
227; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
228; GFX11-NEXT:    s_endpgm
229  %val = load half, ptr addrspace(1) %in
230  %fsub = fsub half -0.0, %val
231  %fmul = fmul half %fsub, %val
232  store half %fmul, ptr addrspace(1) %out
233  ret void
234}
235
; Negates a <2 x half> kernel argument (`fsub` from a <-0.0, -0.0> splat) and
; stores the result to %out. The autogenerated checks expect both lanes to be
; flipped by a single scalar `s_xor_b32` with 0x80008000 on every prefix,
; followed by a 32-bit store.
236define amdgpu_kernel void @s_fneg_v2f16(ptr addrspace(1) %out, <2 x half> %in) #0 {
237; CI-LABEL: s_fneg_v2f16:
238; CI:       ; %bb.0:
239; CI-NEXT:    s_load_dword s2, s[8:9], 0x2
240; CI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
241; CI-NEXT:    s_waitcnt lgkmcnt(0)
242; CI-NEXT:    s_xor_b32 s2, s2, 0x80008000
243; CI-NEXT:    v_mov_b32_e32 v0, s0
244; CI-NEXT:    v_mov_b32_e32 v1, s1
245; CI-NEXT:    v_mov_b32_e32 v2, s2
246; CI-NEXT:    flat_store_dword v[0:1], v2
247; CI-NEXT:    s_endpgm
248;
249; GFX8-LABEL: s_fneg_v2f16:
250; GFX8:       ; %bb.0:
251; GFX8-NEXT:    s_load_dword s2, s[8:9], 0x8
252; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
253; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
254; GFX8-NEXT:    s_xor_b32 s2, s2, 0x80008000
255; GFX8-NEXT:    v_mov_b32_e32 v0, s0
256; GFX8-NEXT:    v_mov_b32_e32 v1, s1
257; GFX8-NEXT:    v_mov_b32_e32 v2, s2
258; GFX8-NEXT:    flat_store_dword v[0:1], v2
259; GFX8-NEXT:    s_endpgm
260;
261; GFX9-LABEL: s_fneg_v2f16:
262; GFX9:       ; %bb.0:
263; GFX9-NEXT:    s_load_dword s2, s[8:9], 0x8
264; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
265; GFX9-NEXT:    v_mov_b32_e32 v0, 0
266; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
267; GFX9-NEXT:    s_xor_b32 s2, s2, 0x80008000
268; GFX9-NEXT:    v_mov_b32_e32 v1, s2
269; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
270; GFX9-NEXT:    s_endpgm
271;
272; GFX11-LABEL: s_fneg_v2f16:
273; GFX11:       ; %bb.0:
274; GFX11-NEXT:    s_clause 0x1
275; GFX11-NEXT:    s_load_b32 s2, s[4:5], 0x8
276; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
277; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
278; GFX11-NEXT:    s_xor_b32 s2, s2, 0x80008000
279; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
280; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
281; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
282; GFX11-NEXT:    s_endpgm
283  %fneg = fsub <2 x half> <half -0.0, half -0.0>, %in
284  store <2 x half> %fneg, ptr addrspace(1) %out
285  ret void
286}
287
; <2 x half> negation where the source does not come from a load or an
; argument: an i32 is produced by a side-effecting inline asm (`; def $0` with
; an "=s" output constraint), bitcast to <2 x half>, negated with `fsub` from a
; <-0.0, -0.0> splat, and stored to %out.
;
; The autogenerated checks expect a scalar `s_xor_b32` with 0x80008000 between
; the ASMEND marker and the store. CI and GFX8 share the CIVI prefix here.
288define amdgpu_kernel void @s_fneg_v2f16_nonload(ptr addrspace(1) %out) #0 {
289; CIVI-LABEL: s_fneg_v2f16_nonload:
290; CIVI:       ; %bb.0:
291; CIVI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
292; CIVI-NEXT:    ;;#ASMSTART
293; CIVI-NEXT:    ; def s2
294; CIVI-NEXT:    ;;#ASMEND
295; CIVI-NEXT:    s_xor_b32 s2, s2, 0x80008000
296; CIVI-NEXT:    v_mov_b32_e32 v2, s2
297; CIVI-NEXT:    s_waitcnt lgkmcnt(0)
298; CIVI-NEXT:    v_mov_b32_e32 v0, s0
299; CIVI-NEXT:    v_mov_b32_e32 v1, s1
300; CIVI-NEXT:    flat_store_dword v[0:1], v2
301; CIVI-NEXT:    s_endpgm
302;
303; GFX9-LABEL: s_fneg_v2f16_nonload:
304; GFX9:       ; %bb.0:
305; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
306; GFX9-NEXT:    ;;#ASMSTART
307; GFX9-NEXT:    ; def s2
308; GFX9-NEXT:    ;;#ASMEND
309; GFX9-NEXT:    s_xor_b32 s2, s2, 0x80008000
310; GFX9-NEXT:    v_mov_b32_e32 v0, 0
311; GFX9-NEXT:    v_mov_b32_e32 v1, s2
312; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
313; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
314; GFX9-NEXT:    s_endpgm
315;
316; GFX11-LABEL: s_fneg_v2f16_nonload:
317; GFX11:       ; %bb.0:
318; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
319; GFX11-NEXT:    ;;#ASMSTART
320; GFX11-NEXT:    ; def s2
321; GFX11-NEXT:    ;;#ASMEND
322; GFX11-NEXT:    s_xor_b32 s2, s2, 0x80008000
323; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
324; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
325; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
326; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
327; GFX11-NEXT:    s_endpgm
328  %in = call i32 asm sideeffect "; def $0", "=s"()
329  %in.bc = bitcast i32 %in to <2 x half>
330  %fneg = fsub <2 x half> <half -0.0, half -0.0>, %in.bc
331  store <2 x half> %fneg, ptr addrspace(1) %out
332  ret void
333}
334
; Per-workitem negation of a <2 x half> loaded from memory: loads the element
; of %in indexed by the workitem id (with `align 2`), negates it with `fsub`
; from a <-0.0, -0.0> splat, and stores it back. The autogenerated checks
; expect a vector `v_xor_b32` with 0x80008000 on every prefix.
;
; NOTE(review): %gep.out is computed from %in rather than %out, so %out is
; never used and the store goes to the same address as the load (the checks
; reuse the load address for the store). Presumably unintentional -- confirm
; before changing, since the checks would need regenerating.
335define amdgpu_kernel void @v_fneg_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
336; CI-LABEL: v_fneg_v2f16:
337; CI:       ; %bb.0:
338; CI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x2
339; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
340; CI-NEXT:    s_waitcnt lgkmcnt(0)
341; CI-NEXT:    v_mov_b32_e32 v1, s1
342; CI-NEXT:    v_add_i32_e32 v0, vcc, s0, v0
343; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
344; CI-NEXT:    flat_load_dword v2, v[0:1]
345; CI-NEXT:    s_waitcnt vmcnt(0)
346; CI-NEXT:    v_xor_b32_e32 v2, 0x80008000, v2
347; CI-NEXT:    flat_store_dword v[0:1], v2
348; CI-NEXT:    s_endpgm
349;
350; GFX8-LABEL: v_fneg_v2f16:
351; GFX8:       ; %bb.0:
352; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x8
353; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
354; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
355; GFX8-NEXT:    v_mov_b32_e32 v1, s1
356; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
357; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
358; GFX8-NEXT:    flat_load_dword v2, v[0:1]
359; GFX8-NEXT:    s_waitcnt vmcnt(0)
360; GFX8-NEXT:    v_xor_b32_e32 v2, 0x80008000, v2
361; GFX8-NEXT:    flat_store_dword v[0:1], v2
362; GFX8-NEXT:    s_endpgm
363;
364; GFX9-LABEL: v_fneg_v2f16:
365; GFX9:       ; %bb.0:
366; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x8
367; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
368; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
369; GFX9-NEXT:    global_load_dword v1, v0, s[0:1]
370; GFX9-NEXT:    s_waitcnt vmcnt(0)
371; GFX9-NEXT:    v_xor_b32_e32 v1, 0x80008000, v1
372; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
373; GFX9-NEXT:    s_endpgm
374;
375; GFX11-LABEL: v_fneg_v2f16:
376; GFX11:       ; %bb.0:
377; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
378; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
379; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
380; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
381; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
382; GFX11-NEXT:    global_load_b32 v1, v0, s[0:1]
383; GFX11-NEXT:    s_waitcnt vmcnt(0)
384; GFX11-NEXT:    v_xor_b32_e32 v1, 0x80008000, v1
385; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
386; GFX11-NEXT:    s_endpgm
387  %tid = call i32 @llvm.amdgcn.workitem.id.x()
388  %gep.in = getelementptr inbounds <2 x half>, ptr addrspace(1) %in, i32 %tid
389  %gep.out = getelementptr inbounds <2 x half>, ptr addrspace(1) %in, i32 %tid
390  %val = load <2 x half>, ptr addrspace(1) %gep.in, align 2
391  %fneg = fsub <2 x half> <half -0.0, half -0.0>, %val
392  store <2 x half> %fneg, ptr addrspace(1) %gep.out
393  ret void
394}
395
; Negation of a <2 x half> obtained by bitcasting an i32 kernel argument. The
; input is `bitcast i32 %in to <2 x half>` followed by `fsub` from a
; <-0.0, -0.0> splat; the result is stored to %out. The autogenerated checks
; expect a scalar `s_xor_b32` with 0x80008000 on every prefix.
396define amdgpu_kernel void @fneg_free_v2f16(ptr addrspace(1) %out, i32 %in) #0 {
397; CI-LABEL: fneg_free_v2f16:
398; CI:       ; %bb.0:
399; CI-NEXT:    s_load_dword s2, s[8:9], 0x2
400; CI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
401; CI-NEXT:    s_waitcnt lgkmcnt(0)
402; CI-NEXT:    s_xor_b32 s2, s2, 0x80008000
403; CI-NEXT:    v_mov_b32_e32 v0, s0
404; CI-NEXT:    v_mov_b32_e32 v1, s1
405; CI-NEXT:    v_mov_b32_e32 v2, s2
406; CI-NEXT:    flat_store_dword v[0:1], v2
407; CI-NEXT:    s_endpgm
408;
409; GFX8-LABEL: fneg_free_v2f16:
410; GFX8:       ; %bb.0:
411; GFX8-NEXT:    s_load_dword s2, s[8:9], 0x8
412; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
413; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
414; GFX8-NEXT:    s_xor_b32 s2, s2, 0x80008000
415; GFX8-NEXT:    v_mov_b32_e32 v0, s0
416; GFX8-NEXT:    v_mov_b32_e32 v1, s1
417; GFX8-NEXT:    v_mov_b32_e32 v2, s2
418; GFX8-NEXT:    flat_store_dword v[0:1], v2
419; GFX8-NEXT:    s_endpgm
420;
421; GFX9-LABEL: fneg_free_v2f16:
422; GFX9:       ; %bb.0:
423; GFX9-NEXT:    s_load_dword s2, s[8:9], 0x8
424; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
425; GFX9-NEXT:    v_mov_b32_e32 v0, 0
426; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
427; GFX9-NEXT:    s_xor_b32 s2, s2, 0x80008000
428; GFX9-NEXT:    v_mov_b32_e32 v1, s2
429; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
430; GFX9-NEXT:    s_endpgm
431;
432; GFX11-LABEL: fneg_free_v2f16:
433; GFX11:       ; %bb.0:
434; GFX11-NEXT:    s_clause 0x1
435; GFX11-NEXT:    s_load_b32 s2, s[4:5], 0x8
436; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
437; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
438; GFX11-NEXT:    s_xor_b32 s2, s2, 0x80008000
439; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
440; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
441; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
442; GFX11-NEXT:    s_endpgm
443  %bc = bitcast i32 %in to <2 x half>
444  %fsub = fsub <2 x half> <half -0.0, half -0.0>, %bc
445  store <2 x half> %fsub, ptr addrspace(1) %out
446  ret void
447}
448
; <2 x half> negation whose only user is a multiply: loads a vector from %in,
; negates it with `fsub` from a <-0.0, -0.0> splat, multiplies the negated
; vector by the original, and stores the product to %out.
;
; Expected output per the autogenerated checks:
;   - GFX9/GFX11: a single `v_pk_mul_f16` with neg_lo:[1,0] neg_hi:[1,0].
;   - GFX8: one `v_mul_f16_sdwa` (WORD_1 lanes) and one `v_mul_f16_e64`, each
;     with a `-` modifier on the first source, combined with `v_or_b32`.
;   - CI: an explicit `v_xor_b32` with 0x80008000, then per-lane conversion to
;     f32, `v_mul_f32`, conversion back, and repacking with shift + or.
449define amdgpu_kernel void @v_fneg_fold_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
450; CI-LABEL: v_fneg_fold_v2f16:
451; CI:       ; %bb.0:
452; CI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
453; CI-NEXT:    s_waitcnt lgkmcnt(0)
454; CI-NEXT:    v_mov_b32_e32 v0, s2
455; CI-NEXT:    v_mov_b32_e32 v1, s3
456; CI-NEXT:    flat_load_dword v0, v[0:1]
457; CI-NEXT:    s_waitcnt vmcnt(0)
458; CI-NEXT:    v_xor_b32_e32 v2, 0x80008000, v0
459; CI-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
460; CI-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
461; CI-NEXT:    v_cvt_f32_f16_e32 v1, v1
462; CI-NEXT:    v_cvt_f32_f16_e32 v3, v3
463; CI-NEXT:    v_cvt_f32_f16_e32 v0, v0
464; CI-NEXT:    v_cvt_f32_f16_e32 v2, v2
465; CI-NEXT:    v_mul_f32_e32 v1, v3, v1
466; CI-NEXT:    v_cvt_f16_f32_e32 v3, v1
467; CI-NEXT:    v_mul_f32_e32 v0, v2, v0
468; CI-NEXT:    v_cvt_f16_f32_e32 v2, v0
469; CI-NEXT:    v_mov_b32_e32 v0, s0
470; CI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
471; CI-NEXT:    v_mov_b32_e32 v1, s1
472; CI-NEXT:    v_or_b32_e32 v2, v2, v3
473; CI-NEXT:    flat_store_dword v[0:1], v2
474; CI-NEXT:    s_endpgm
475;
476; GFX8-LABEL: v_fneg_fold_v2f16:
477; GFX8:       ; %bb.0:
478; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
479; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
480; GFX8-NEXT:    v_mov_b32_e32 v0, s2
481; GFX8-NEXT:    v_mov_b32_e32 v1, s3
482; GFX8-NEXT:    flat_load_dword v2, v[0:1]
483; GFX8-NEXT:    v_mov_b32_e32 v0, s0
484; GFX8-NEXT:    v_mov_b32_e32 v1, s1
485; GFX8-NEXT:    s_waitcnt vmcnt(0)
486; GFX8-NEXT:    v_mul_f16_sdwa v3, -v2, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
487; GFX8-NEXT:    v_mul_f16_e64 v2, -v2, v2
488; GFX8-NEXT:    v_or_b32_e32 v2, v2, v3
489; GFX8-NEXT:    flat_store_dword v[0:1], v2
490; GFX8-NEXT:    s_endpgm
491;
492; GFX9-LABEL: v_fneg_fold_v2f16:
493; GFX9:       ; %bb.0:
494; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
495; GFX9-NEXT:    v_mov_b32_e32 v0, 0
496; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
497; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
498; GFX9-NEXT:    s_waitcnt vmcnt(0)
499; GFX9-NEXT:    v_pk_mul_f16 v1, v1, v1 neg_lo:[1,0] neg_hi:[1,0]
500; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
501; GFX9-NEXT:    s_endpgm
502;
503; GFX11-LABEL: v_fneg_fold_v2f16:
504; GFX11:       ; %bb.0:
505; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
506; GFX11-NEXT:    v_mov_b32_e32 v0, 0
507; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
508; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
509; GFX11-NEXT:    s_waitcnt vmcnt(0)
510; GFX11-NEXT:    v_pk_mul_f16 v1, v1, v1 neg_lo:[1,0] neg_hi:[1,0]
511; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
512; GFX11-NEXT:    s_endpgm
513  %val = load <2 x half>, ptr addrspace(1) %in
514  %fsub = fsub <2 x half> <half -0.0, half -0.0>, %val
515  %fmul = fmul <2 x half> %fsub, %val
516  store <2 x half> %fmul, ptr addrspace(1) %out
517  ret void
518}
519
; Negated <2 x half> whose elements are extracted and each fed to a scalar FP
; op: element 0 is multiplied by 4.0 and element 1 has 2.0 added. Both results
; are written with volatile stores to an `undef` address, so the checks show
; stores through an uninitialized v[0:1] pair.
;
; Per the autogenerated checks no xor is emitted on any prefix: the multiply
; uses the constant -4.0 instead, and the add becomes a subtract of the loaded
; element from the other operand (`2.0` on CI/GFX11, a register holding 0x4000
; on GFX8/GFX9).
520define amdgpu_kernel void @v_extract_fneg_fold_v2f16(ptr addrspace(1) %in) #0 {
521; CI-LABEL: v_extract_fneg_fold_v2f16:
522; CI:       ; %bb.0:
523; CI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
524; CI-NEXT:    s_waitcnt lgkmcnt(0)
525; CI-NEXT:    v_mov_b32_e32 v0, s0
526; CI-NEXT:    v_mov_b32_e32 v1, s1
527; CI-NEXT:    flat_load_dword v0, v[0:1]
528; CI-NEXT:    s_waitcnt vmcnt(0)
529; CI-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
530; CI-NEXT:    v_cvt_f32_f16_e32 v0, v0
531; CI-NEXT:    v_cvt_f32_f16_e32 v1, v1
532; CI-NEXT:    v_mul_f32_e32 v0, -4.0, v0
533; CI-NEXT:    v_sub_f32_e32 v1, 2.0, v1
534; CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
535; CI-NEXT:    v_cvt_f16_f32_e32 v1, v1
536; CI-NEXT:    flat_store_short v[0:1], v0
537; CI-NEXT:    s_waitcnt vmcnt(0)
538; CI-NEXT:    flat_store_short v[0:1], v1
539; CI-NEXT:    s_waitcnt vmcnt(0)
540; CI-NEXT:    s_endpgm
541;
542; GFX8-LABEL: v_extract_fneg_fold_v2f16:
543; GFX8:       ; %bb.0:
544; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
545; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
546; GFX8-NEXT:    v_mov_b32_e32 v0, s0
547; GFX8-NEXT:    v_mov_b32_e32 v1, s1
548; GFX8-NEXT:    flat_load_dword v0, v[0:1]
549; GFX8-NEXT:    v_mov_b32_e32 v1, 0x4000
550; GFX8-NEXT:    s_waitcnt vmcnt(0)
551; GFX8-NEXT:    v_mul_f16_e32 v2, -4.0, v0
552; GFX8-NEXT:    v_sub_f16_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
553; GFX8-NEXT:    flat_store_short v[0:1], v2
554; GFX8-NEXT:    s_waitcnt vmcnt(0)
555; GFX8-NEXT:    flat_store_short v[0:1], v0
556; GFX8-NEXT:    s_waitcnt vmcnt(0)
557; GFX8-NEXT:    s_endpgm
558;
559; GFX9-LABEL: v_extract_fneg_fold_v2f16:
560; GFX9:       ; %bb.0:
561; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
562; GFX9-NEXT:    v_mov_b32_e32 v0, 0
563; GFX9-NEXT:    v_mov_b32_e32 v1, 0x4000
564; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
565; GFX9-NEXT:    global_load_dword v0, v0, s[0:1]
566; GFX9-NEXT:    s_waitcnt vmcnt(0)
567; GFX9-NEXT:    v_mul_f16_e32 v2, -4.0, v0
568; GFX9-NEXT:    v_sub_f16_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
569; GFX9-NEXT:    global_store_short v[0:1], v2, off
570; GFX9-NEXT:    s_waitcnt vmcnt(0)
571; GFX9-NEXT:    global_store_short v[0:1], v0, off
572; GFX9-NEXT:    s_waitcnt vmcnt(0)
573; GFX9-NEXT:    s_endpgm
574;
575; GFX11-LABEL: v_extract_fneg_fold_v2f16:
576; GFX11:       ; %bb.0:
577; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
578; GFX11-NEXT:    v_mov_b32_e32 v0, 0
579; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
580; GFX11-NEXT:    global_load_b32 v0, v0, s[0:1]
581; GFX11-NEXT:    s_waitcnt vmcnt(0)
582; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
583; GFX11-NEXT:    v_mul_f16_e32 v0, -4.0, v0
584; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
585; GFX11-NEXT:    v_sub_f16_e32 v1, 2.0, v1
586; GFX11-NEXT:    global_store_b16 v[0:1], v0, off dlc
587; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
588; GFX11-NEXT:    global_store_b16 v[0:1], v1, off dlc
589; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
590; GFX11-NEXT:    s_endpgm
591  %val = load <2 x half>, ptr addrspace(1) %in
592  %fneg = fsub <2 x half> <half -0.0, half -0.0>, %val
593  %elt0 = extractelement <2 x half> %fneg, i32 0
594  %elt1 = extractelement <2 x half> %fneg, i32 1
595
596  %fmul0 = fmul half %elt0, 4.0
597  %fadd1 = fadd half %elt1, 2.0
598  store volatile half %fmul0, ptr addrspace(1) undef
599  store volatile half %fadd1, ptr addrspace(1) undef
600  ret void
601}
602
; Negated <2 x half> whose elements are extracted and stored directly, with no
; FP user to absorb the negation. Both elements are written with volatile
; stores to an `undef` address.
;
; Per the autogenerated checks the negation stays a single `v_xor_b32` with
; 0x80008000 on every prefix. The high element is then obtained with a 16-bit
; right shift on CI/GFX8 (shared CIVI prefix) and written with a d16_hi store
; on GFX9/GFX11.
603define amdgpu_kernel void @v_extract_fneg_no_fold_v2f16(ptr addrspace(1) %in) #0 {
604; CIVI-LABEL: v_extract_fneg_no_fold_v2f16:
605; CIVI:       ; %bb.0:
606; CIVI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
607; CIVI-NEXT:    s_waitcnt lgkmcnt(0)
608; CIVI-NEXT:    v_mov_b32_e32 v0, s0
609; CIVI-NEXT:    v_mov_b32_e32 v1, s1
610; CIVI-NEXT:    flat_load_dword v0, v[0:1]
611; CIVI-NEXT:    s_waitcnt vmcnt(0)
612; CIVI-NEXT:    v_xor_b32_e32 v0, 0x80008000, v0
613; CIVI-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
614; CIVI-NEXT:    flat_store_short v[0:1], v0
615; CIVI-NEXT:    s_waitcnt vmcnt(0)
616; CIVI-NEXT:    flat_store_short v[0:1], v1
617; CIVI-NEXT:    s_waitcnt vmcnt(0)
618; CIVI-NEXT:    s_endpgm
619;
620; GFX9-LABEL: v_extract_fneg_no_fold_v2f16:
621; GFX9:       ; %bb.0:
622; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
623; GFX9-NEXT:    v_mov_b32_e32 v0, 0
624; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
625; GFX9-NEXT:    global_load_dword v0, v0, s[0:1]
626; GFX9-NEXT:    s_waitcnt vmcnt(0)
627; GFX9-NEXT:    v_xor_b32_e32 v0, 0x80008000, v0
628; GFX9-NEXT:    global_store_short v[0:1], v0, off
629; GFX9-NEXT:    s_waitcnt vmcnt(0)
630; GFX9-NEXT:    global_store_short_d16_hi v[0:1], v0, off
631; GFX9-NEXT:    s_waitcnt vmcnt(0)
632; GFX9-NEXT:    s_endpgm
633;
634; GFX11-LABEL: v_extract_fneg_no_fold_v2f16:
635; GFX11:       ; %bb.0:
636; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
637; GFX11-NEXT:    v_mov_b32_e32 v0, 0
638; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
639; GFX11-NEXT:    global_load_b32 v0, v0, s[0:1]
640; GFX11-NEXT:    s_waitcnt vmcnt(0)
641; GFX11-NEXT:    v_xor_b32_e32 v0, 0x80008000, v0
642; GFX11-NEXT:    global_store_b16 v[0:1], v0, off dlc
643; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
644; GFX11-NEXT:    global_store_d16_hi_b16 v[0:1], v0, off dlc
645; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
646; GFX11-NEXT:    s_endpgm
647  %val = load <2 x half>, ptr addrspace(1) %in
648  %fneg = fsub <2 x half> <half -0.0, half -0.0>, %val
649  %elt0 = extractelement <2 x half> %fneg, i32 0
650  %elt1 = extractelement <2 x half> %fneg, i32 1
651  store volatile half %elt0, ptr addrspace(1) undef
652  store volatile half %elt1, ptr addrspace(1) undef
653  ret void
654}
655
; Intrinsic used by the v_fneg_f16 and v_fneg_v2f16 kernels to obtain the
; per-workitem index.
656declare i32 @llvm.amdgcn.workitem.id.x() #1
657
; Attribute groups: #0 is applied to every kernel in this file, #1 to the
; workitem-id intrinsic declaration.
658attributes #0 = { nounwind }
659attributes #1 = { nounwind readnone }
660