xref: /llvm-project/llvm/test/CodeGen/AMDGPU/fpext.f16.ll (revision 6548b6354d1d990e1c98736f5e7c3de876bedc8e)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
2; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope --check-prefixes=SI %s
3; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope --check-prefixes=GFX89,VI %s
4; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope --check-prefixes=GFX89,GFX9 %s
5; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope --check-prefixes=GFX11-TRUE16 %s
6; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope --check-prefixes=GFX11-FAKE16 %s
7
; Kernel test: load one half from %a, widen it to float, store it to %r.
; The autogenerated checks expect a single v_cvt_f32_f16 on every run line;
; the GFX11 true16 run expects the 16-bit source spelled as v0.l.
; NOTE(review): attribute group #0 is defined outside this excerpt.
8define amdgpu_kernel void @fpext_f16_to_f32(
9; SI-LABEL: fpext_f16_to_f32:
10; SI:       ; %bb.0: ; %entry
11; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
12; SI-NEXT:    s_mov_b32 s7, 0xf000
13; SI-NEXT:    s_mov_b32 s6, -1
14; SI-NEXT:    s_mov_b32 s10, s6
15; SI-NEXT:    s_mov_b32 s11, s7
16; SI-NEXT:    s_waitcnt lgkmcnt(0)
17; SI-NEXT:    s_mov_b32 s8, s2
18; SI-NEXT:    s_mov_b32 s9, s3
19; SI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
20; SI-NEXT:    s_mov_b32 s4, s0
21; SI-NEXT:    s_mov_b32 s5, s1
22; SI-NEXT:    s_waitcnt vmcnt(0)
23; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
24; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
25; SI-NEXT:    s_endpgm
26;
27; GFX89-LABEL: fpext_f16_to_f32:
28; GFX89:       ; %bb.0: ; %entry
29; GFX89-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
30; GFX89-NEXT:    s_mov_b32 s7, 0xf000
31; GFX89-NEXT:    s_mov_b32 s6, -1
32; GFX89-NEXT:    s_mov_b32 s10, s6
33; GFX89-NEXT:    s_mov_b32 s11, s7
34; GFX89-NEXT:    s_waitcnt lgkmcnt(0)
35; GFX89-NEXT:    s_mov_b32 s8, s2
36; GFX89-NEXT:    s_mov_b32 s9, s3
37; GFX89-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
38; GFX89-NEXT:    s_mov_b32 s4, s0
39; GFX89-NEXT:    s_mov_b32 s5, s1
40; GFX89-NEXT:    s_waitcnt vmcnt(0)
41; GFX89-NEXT:    v_cvt_f32_f16_e32 v0, v0
42; GFX89-NEXT:    buffer_store_dword v0, off, s[4:7], 0
43; GFX89-NEXT:    s_endpgm
44;
45; GFX11-TRUE16-LABEL: fpext_f16_to_f32:
46; GFX11-TRUE16:       ; %bb.0: ; %entry
47; GFX11-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
48; GFX11-TRUE16-NEXT:    s_mov_b32 s6, -1
49; GFX11-TRUE16-NEXT:    s_mov_b32 s7, 0x31016000
50; GFX11-TRUE16-NEXT:    s_mov_b32 s10, s6
51; GFX11-TRUE16-NEXT:    s_mov_b32 s11, s7
52; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
53; GFX11-TRUE16-NEXT:    s_mov_b32 s8, s2
54; GFX11-TRUE16-NEXT:    s_mov_b32 s9, s3
55; GFX11-TRUE16-NEXT:    s_mov_b32 s4, s0
56; GFX11-TRUE16-NEXT:    buffer_load_u16 v0, off, s[8:11], 0
57; GFX11-TRUE16-NEXT:    s_mov_b32 s5, s1
58; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
59; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e32 v0, v0.l
60; GFX11-TRUE16-NEXT:    buffer_store_b32 v0, off, s[4:7], 0
61; GFX11-TRUE16-NEXT:    s_endpgm
62;
63; GFX11-FAKE16-LABEL: fpext_f16_to_f32:
64; GFX11-FAKE16:       ; %bb.0: ; %entry
65; GFX11-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
66; GFX11-FAKE16-NEXT:    s_mov_b32 s6, -1
67; GFX11-FAKE16-NEXT:    s_mov_b32 s7, 0x31016000
68; GFX11-FAKE16-NEXT:    s_mov_b32 s10, s6
69; GFX11-FAKE16-NEXT:    s_mov_b32 s11, s7
70; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
71; GFX11-FAKE16-NEXT:    s_mov_b32 s8, s2
72; GFX11-FAKE16-NEXT:    s_mov_b32 s9, s3
73; GFX11-FAKE16-NEXT:    s_mov_b32 s4, s0
74; GFX11-FAKE16-NEXT:    buffer_load_u16 v0, off, s[8:11], 0
75; GFX11-FAKE16-NEXT:    s_mov_b32 s5, s1
76; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
77; GFX11-FAKE16-NEXT:    v_cvt_f32_f16_e32 v0, v0
78; GFX11-FAKE16-NEXT:    buffer_store_b32 v0, off, s[4:7], 0
79; GFX11-FAKE16-NEXT:    s_endpgm
80    ptr addrspace(1) %r,
81    ptr addrspace(1) %a) #0 {
82entry:
83  %a.val = load half, ptr addrspace(1) %a
  ; The conversion under test (half -> float).
84  %r.val = fpext half %a.val to float
85  store float %r.val, ptr addrspace(1) %r
86  ret void
87}
88
; Kernel test: load one half from %a, widen it to double, store it to %r.
; The autogenerated checks expect a two-step conversion on every run line,
; v_cvt_f32_f16 followed by v_cvt_f64_f32, and a 64-bit store of v[0:1].
; NOTE(review): attribute group #0 is defined outside this excerpt.
89define amdgpu_kernel void @fpext_f16_to_f64(
90; SI-LABEL: fpext_f16_to_f64:
91; SI:       ; %bb.0: ; %entry
92; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
93; SI-NEXT:    s_mov_b32 s7, 0xf000
94; SI-NEXT:    s_mov_b32 s6, -1
95; SI-NEXT:    s_mov_b32 s10, s6
96; SI-NEXT:    s_mov_b32 s11, s7
97; SI-NEXT:    s_waitcnt lgkmcnt(0)
98; SI-NEXT:    s_mov_b32 s8, s2
99; SI-NEXT:    s_mov_b32 s9, s3
100; SI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
101; SI-NEXT:    s_mov_b32 s4, s0
102; SI-NEXT:    s_mov_b32 s5, s1
103; SI-NEXT:    s_waitcnt vmcnt(0)
104; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
105; SI-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
106; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
107; SI-NEXT:    s_endpgm
108;
109; GFX89-LABEL: fpext_f16_to_f64:
110; GFX89:       ; %bb.0: ; %entry
111; GFX89-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
112; GFX89-NEXT:    s_mov_b32 s7, 0xf000
113; GFX89-NEXT:    s_mov_b32 s6, -1
114; GFX89-NEXT:    s_mov_b32 s10, s6
115; GFX89-NEXT:    s_mov_b32 s11, s7
116; GFX89-NEXT:    s_waitcnt lgkmcnt(0)
117; GFX89-NEXT:    s_mov_b32 s8, s2
118; GFX89-NEXT:    s_mov_b32 s9, s3
119; GFX89-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
120; GFX89-NEXT:    s_mov_b32 s4, s0
121; GFX89-NEXT:    s_mov_b32 s5, s1
122; GFX89-NEXT:    s_waitcnt vmcnt(0)
123; GFX89-NEXT:    v_cvt_f32_f16_e32 v0, v0
124; GFX89-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
125; GFX89-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
126; GFX89-NEXT:    s_endpgm
127;
128; GFX11-TRUE16-LABEL: fpext_f16_to_f64:
129; GFX11-TRUE16:       ; %bb.0: ; %entry
130; GFX11-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
131; GFX11-TRUE16-NEXT:    s_mov_b32 s6, -1
132; GFX11-TRUE16-NEXT:    s_mov_b32 s7, 0x31016000
133; GFX11-TRUE16-NEXT:    s_mov_b32 s10, s6
134; GFX11-TRUE16-NEXT:    s_mov_b32 s11, s7
135; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
136; GFX11-TRUE16-NEXT:    s_mov_b32 s8, s2
137; GFX11-TRUE16-NEXT:    s_mov_b32 s9, s3
138; GFX11-TRUE16-NEXT:    s_mov_b32 s4, s0
139; GFX11-TRUE16-NEXT:    buffer_load_u16 v0, off, s[8:11], 0
140; GFX11-TRUE16-NEXT:    s_mov_b32 s5, s1
141; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
142; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e32 v0, v0.l
143; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
144; GFX11-TRUE16-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
145; GFX11-TRUE16-NEXT:    buffer_store_b64 v[0:1], off, s[4:7], 0
146; GFX11-TRUE16-NEXT:    s_endpgm
147;
148; GFX11-FAKE16-LABEL: fpext_f16_to_f64:
149; GFX11-FAKE16:       ; %bb.0: ; %entry
150; GFX11-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
151; GFX11-FAKE16-NEXT:    s_mov_b32 s6, -1
152; GFX11-FAKE16-NEXT:    s_mov_b32 s7, 0x31016000
153; GFX11-FAKE16-NEXT:    s_mov_b32 s10, s6
154; GFX11-FAKE16-NEXT:    s_mov_b32 s11, s7
155; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
156; GFX11-FAKE16-NEXT:    s_mov_b32 s8, s2
157; GFX11-FAKE16-NEXT:    s_mov_b32 s9, s3
158; GFX11-FAKE16-NEXT:    s_mov_b32 s4, s0
159; GFX11-FAKE16-NEXT:    buffer_load_u16 v0, off, s[8:11], 0
160; GFX11-FAKE16-NEXT:    s_mov_b32 s5, s1
161; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
162; GFX11-FAKE16-NEXT:    v_cvt_f32_f16_e32 v0, v0
163; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
164; GFX11-FAKE16-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
165; GFX11-FAKE16-NEXT:    buffer_store_b64 v[0:1], off, s[4:7], 0
166; GFX11-FAKE16-NEXT:    s_endpgm
167    ptr addrspace(1) %r,
168    ptr addrspace(1) %a) #0 {
169entry:
170  %a.val = load half, ptr addrspace(1) %a
  ; The conversion under test (half -> double in one IR instruction).
171  %r.val = fpext half %a.val to double
172  store double %r.val, ptr addrspace(1) %r
173  ret void
174}
175
; Kernel test: load a <2 x half> from %a, widen both lanes to float, store
; the <2 x float> to %r.
; Expected code per the checks below: one 32-bit load, then one
; v_cvt_f32_f16 per lane. The tahiti and gfx1100 runs expect the high half
; to be extracted with v_lshrrev_b32 by 16; the fiji/gfx900 runs expect an
; SDWA convert with src0_sel:WORD_1 instead of a separate shift.
; NOTE(review): attribute group #0 is defined outside this excerpt.
176define amdgpu_kernel void @fpext_v2f16_to_v2f32(
177; SI-LABEL: fpext_v2f16_to_v2f32:
178; SI:       ; %bb.0: ; %entry
179; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
180; SI-NEXT:    s_mov_b32 s7, 0xf000
181; SI-NEXT:    s_mov_b32 s6, -1
182; SI-NEXT:    s_mov_b32 s10, s6
183; SI-NEXT:    s_mov_b32 s11, s7
184; SI-NEXT:    s_waitcnt lgkmcnt(0)
185; SI-NEXT:    s_mov_b32 s8, s2
186; SI-NEXT:    s_mov_b32 s9, s3
187; SI-NEXT:    buffer_load_dword v1, off, s[8:11], 0
188; SI-NEXT:    s_mov_b32 s4, s0
189; SI-NEXT:    s_mov_b32 s5, s1
190; SI-NEXT:    s_waitcnt vmcnt(0)
191; SI-NEXT:    v_cvt_f32_f16_e32 v0, v1
192; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
193; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
194; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
195; SI-NEXT:    s_endpgm
196;
197; GFX89-LABEL: fpext_v2f16_to_v2f32:
198; GFX89:       ; %bb.0: ; %entry
199; GFX89-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
200; GFX89-NEXT:    s_mov_b32 s7, 0xf000
201; GFX89-NEXT:    s_mov_b32 s6, -1
202; GFX89-NEXT:    s_mov_b32 s10, s6
203; GFX89-NEXT:    s_mov_b32 s11, s7
204; GFX89-NEXT:    s_waitcnt lgkmcnt(0)
205; GFX89-NEXT:    s_mov_b32 s8, s2
206; GFX89-NEXT:    s_mov_b32 s9, s3
207; GFX89-NEXT:    buffer_load_dword v1, off, s[8:11], 0
208; GFX89-NEXT:    s_mov_b32 s4, s0
209; GFX89-NEXT:    s_mov_b32 s5, s1
210; GFX89-NEXT:    s_waitcnt vmcnt(0)
211; GFX89-NEXT:    v_cvt_f32_f16_e32 v0, v1
212; GFX89-NEXT:    v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
213; GFX89-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
214; GFX89-NEXT:    s_endpgm
215;
216; GFX11-TRUE16-LABEL: fpext_v2f16_to_v2f32:
217; GFX11-TRUE16:       ; %bb.0: ; %entry
218; GFX11-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
219; GFX11-TRUE16-NEXT:    s_mov_b32 s6, -1
220; GFX11-TRUE16-NEXT:    s_mov_b32 s7, 0x31016000
221; GFX11-TRUE16-NEXT:    s_mov_b32 s10, s6
222; GFX11-TRUE16-NEXT:    s_mov_b32 s11, s7
223; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
224; GFX11-TRUE16-NEXT:    s_mov_b32 s8, s2
225; GFX11-TRUE16-NEXT:    s_mov_b32 s9, s3
226; GFX11-TRUE16-NEXT:    s_mov_b32 s4, s0
227; GFX11-TRUE16-NEXT:    buffer_load_b32 v0, off, s[8:11], 0
228; GFX11-TRUE16-NEXT:    s_mov_b32 s5, s1
229; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
230; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
231; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e32 v0, v0.l
232; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
233; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e32 v1, v1.l
234; GFX11-TRUE16-NEXT:    buffer_store_b64 v[0:1], off, s[4:7], 0
235; GFX11-TRUE16-NEXT:    s_endpgm
236;
237; GFX11-FAKE16-LABEL: fpext_v2f16_to_v2f32:
238; GFX11-FAKE16:       ; %bb.0: ; %entry
239; GFX11-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
240; GFX11-FAKE16-NEXT:    s_mov_b32 s6, -1
241; GFX11-FAKE16-NEXT:    s_mov_b32 s7, 0x31016000
242; GFX11-FAKE16-NEXT:    s_mov_b32 s10, s6
243; GFX11-FAKE16-NEXT:    s_mov_b32 s11, s7
244; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
245; GFX11-FAKE16-NEXT:    s_mov_b32 s8, s2
246; GFX11-FAKE16-NEXT:    s_mov_b32 s9, s3
247; GFX11-FAKE16-NEXT:    s_mov_b32 s4, s0
248; GFX11-FAKE16-NEXT:    buffer_load_b32 v0, off, s[8:11], 0
249; GFX11-FAKE16-NEXT:    s_mov_b32 s5, s1
250; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
251; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
252; GFX11-FAKE16-NEXT:    v_cvt_f32_f16_e32 v0, v0
253; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
254; GFX11-FAKE16-NEXT:    v_cvt_f32_f16_e32 v1, v1
255; GFX11-FAKE16-NEXT:    buffer_store_b64 v[0:1], off, s[4:7], 0
256; GFX11-FAKE16-NEXT:    s_endpgm
257    ptr addrspace(1) %r,
258    ptr addrspace(1) %a) #0 {
259entry:
260  %a.val = load <2 x half>, ptr addrspace(1) %a
  ; The conversion under test (both lanes widened by one vector fpext).
261  %r.val = fpext <2 x half> %a.val to <2 x float>
262  store <2 x float> %r.val, ptr addrspace(1) %r
263  ret void
264}
265
; Kernel test: load a <2 x half> from %a, widen both lanes to double, store
; the <2 x double> to %r.
; Expected code per the checks below: each lane goes half -> float
; (v_cvt_f32_f16) and then float -> double (v_cvt_f64_f32), followed by one
; 128-bit store of v[0:3]. The fiji/gfx900 runs expect the high lane to be
; read with an SDWA src0_sel:WORD_1 convert; the other runs expect a
; v_lshrrev_b32 by 16 first.
; Unlike the kernels above, this one carries no attribute group.
266define amdgpu_kernel void @fpext_v2f16_to_v2f64(
267; SI-LABEL: fpext_v2f16_to_v2f64:
268; SI:       ; %bb.0: ; %entry
269; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
270; SI-NEXT:    s_mov_b32 s7, 0xf000
271; SI-NEXT:    s_mov_b32 s6, -1
272; SI-NEXT:    s_mov_b32 s10, s6
273; SI-NEXT:    s_mov_b32 s11, s7
274; SI-NEXT:    s_waitcnt lgkmcnt(0)
275; SI-NEXT:    s_mov_b32 s8, s2
276; SI-NEXT:    s_mov_b32 s9, s3
277; SI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
278; SI-NEXT:    s_mov_b32 s4, s0
279; SI-NEXT:    s_mov_b32 s5, s1
280; SI-NEXT:    s_waitcnt vmcnt(0)
281; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
282; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
283; SI-NEXT:    v_cvt_f32_f16_e32 v2, v1
284; SI-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
285; SI-NEXT:    v_cvt_f64_f32_e32 v[2:3], v2
286; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
287; SI-NEXT:    s_endpgm
288;
289; GFX89-LABEL: fpext_v2f16_to_v2f64:
290; GFX89:       ; %bb.0: ; %entry
291; GFX89-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
292; GFX89-NEXT:    s_mov_b32 s7, 0xf000
293; GFX89-NEXT:    s_mov_b32 s6, -1
294; GFX89-NEXT:    s_mov_b32 s10, s6
295; GFX89-NEXT:    s_mov_b32 s11, s7
296; GFX89-NEXT:    s_waitcnt lgkmcnt(0)
297; GFX89-NEXT:    s_mov_b32 s8, s2
298; GFX89-NEXT:    s_mov_b32 s9, s3
299; GFX89-NEXT:    buffer_load_dword v0, off, s[8:11], 0
300; GFX89-NEXT:    s_mov_b32 s4, s0
301; GFX89-NEXT:    s_mov_b32 s5, s1
302; GFX89-NEXT:    s_waitcnt vmcnt(0)
303; GFX89-NEXT:    v_cvt_f32_f16_e32 v1, v0
304; GFX89-NEXT:    v_cvt_f32_f16_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
305; GFX89-NEXT:    v_cvt_f64_f32_e32 v[0:1], v1
306; GFX89-NEXT:    v_cvt_f64_f32_e32 v[2:3], v2
307; GFX89-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
308; GFX89-NEXT:    s_endpgm
309;
310; GFX11-TRUE16-LABEL: fpext_v2f16_to_v2f64:
311; GFX11-TRUE16:       ; %bb.0: ; %entry
312; GFX11-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
313; GFX11-TRUE16-NEXT:    s_mov_b32 s6, -1
314; GFX11-TRUE16-NEXT:    s_mov_b32 s7, 0x31016000
315; GFX11-TRUE16-NEXT:    s_mov_b32 s10, s6
316; GFX11-TRUE16-NEXT:    s_mov_b32 s11, s7
317; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
318; GFX11-TRUE16-NEXT:    s_mov_b32 s8, s2
319; GFX11-TRUE16-NEXT:    s_mov_b32 s9, s3
320; GFX11-TRUE16-NEXT:    s_mov_b32 s4, s0
321; GFX11-TRUE16-NEXT:    buffer_load_b32 v0, off, s[8:11], 0
322; GFX11-TRUE16-NEXT:    s_mov_b32 s5, s1
323; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
324; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
325; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e32 v0, v0.l
326; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
327; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e32 v2, v1.l
328; GFX11-TRUE16-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
329; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
330; GFX11-TRUE16-NEXT:    v_cvt_f64_f32_e32 v[2:3], v2
331; GFX11-TRUE16-NEXT:    buffer_store_b128 v[0:3], off, s[4:7], 0
332; GFX11-TRUE16-NEXT:    s_endpgm
333;
334; GFX11-FAKE16-LABEL: fpext_v2f16_to_v2f64:
335; GFX11-FAKE16:       ; %bb.0: ; %entry
336; GFX11-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
337; GFX11-FAKE16-NEXT:    s_mov_b32 s6, -1
338; GFX11-FAKE16-NEXT:    s_mov_b32 s7, 0x31016000
339; GFX11-FAKE16-NEXT:    s_mov_b32 s10, s6
340; GFX11-FAKE16-NEXT:    s_mov_b32 s11, s7
341; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
342; GFX11-FAKE16-NEXT:    s_mov_b32 s8, s2
343; GFX11-FAKE16-NEXT:    s_mov_b32 s9, s3
344; GFX11-FAKE16-NEXT:    s_mov_b32 s4, s0
345; GFX11-FAKE16-NEXT:    buffer_load_b32 v0, off, s[8:11], 0
346; GFX11-FAKE16-NEXT:    s_mov_b32 s5, s1
347; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
348; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
349; GFX11-FAKE16-NEXT:    v_cvt_f32_f16_e32 v0, v0
350; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
351; GFX11-FAKE16-NEXT:    v_cvt_f32_f16_e32 v2, v1
352; GFX11-FAKE16-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
353; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
354; GFX11-FAKE16-NEXT:    v_cvt_f64_f32_e32 v[2:3], v2
355; GFX11-FAKE16-NEXT:    buffer_store_b128 v[0:3], off, s[4:7], 0
356; GFX11-FAKE16-NEXT:    s_endpgm
357    ptr addrspace(1) %r,
358    ptr addrspace(1) %a) {
359entry:
360  %a.val = load <2 x half>, ptr addrspace(1) %a
  ; The conversion under test (both lanes widened by one vector fpext).
361  %r.val = fpext <2 x half> %a.val to <2 x double>
362  store <2 x double> %r.val, ptr addrspace(1) %r
363  ret void
364}
365
; Kernel test with the half coming from a kernel argument instead of memory:
; the i32 argument %a is truncated to i16, reinterpreted as half, widened to
; float and stored to %r.
; Expected code per the checks below: the tahiti, fiji/gfx900 and gfx1100
; fake16 runs expect v_cvt_f32_f16 to read the scalar register s2 directly;
; the gfx1100 true16 run expects a v_mov_b16 into v0.l before the convert.
; NOTE(review): despite the "fneg" in the name, the IR below contains no
; negation and none of the expected converts carries a neg modifier; confirm
; whether an fneg was intended before regenerating the checks.
366define amdgpu_kernel void @s_fneg_fpext_f16_to_f32(ptr addrspace(1) %r, i32 %a) {
367; SI-LABEL: s_fneg_fpext_f16_to_f32:
368; SI:       ; %bb.0: ; %entry
369; SI-NEXT:    s_load_dword s2, s[4:5], 0xb
370; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
371; SI-NEXT:    s_mov_b32 s3, 0xf000
372; SI-NEXT:    s_waitcnt lgkmcnt(0)
373; SI-NEXT:    v_cvt_f32_f16_e32 v0, s2
374; SI-NEXT:    s_mov_b32 s2, -1
375; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
376; SI-NEXT:    s_endpgm
377;
378; GFX89-LABEL: s_fneg_fpext_f16_to_f32:
379; GFX89:       ; %bb.0: ; %entry
380; GFX89-NEXT:    s_load_dword s2, s[4:5], 0x2c
381; GFX89-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
382; GFX89-NEXT:    s_mov_b32 s3, 0xf000
383; GFX89-NEXT:    s_waitcnt lgkmcnt(0)
384; GFX89-NEXT:    v_cvt_f32_f16_e32 v0, s2
385; GFX89-NEXT:    s_mov_b32 s2, -1
386; GFX89-NEXT:    buffer_store_dword v0, off, s[0:3], 0
387; GFX89-NEXT:    s_endpgm
388;
389; GFX11-TRUE16-LABEL: s_fneg_fpext_f16_to_f32:
390; GFX11-TRUE16:       ; %bb.0: ; %entry
391; GFX11-TRUE16-NEXT:    s_clause 0x1
392; GFX11-TRUE16-NEXT:    s_load_b32 s2, s[4:5], 0x2c
393; GFX11-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
394; GFX11-TRUE16-NEXT:    s_mov_b32 s3, 0x31016000
395; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
396; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, s2
397; GFX11-TRUE16-NEXT:    s_mov_b32 s2, -1
398; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
399; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e32 v0, v0.l
400; GFX11-TRUE16-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
401; GFX11-TRUE16-NEXT:    s_endpgm
402;
403; GFX11-FAKE16-LABEL: s_fneg_fpext_f16_to_f32:
404; GFX11-FAKE16:       ; %bb.0: ; %entry
405; GFX11-FAKE16-NEXT:    s_clause 0x1
406; GFX11-FAKE16-NEXT:    s_load_b32 s2, s[4:5], 0x2c
407; GFX11-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
408; GFX11-FAKE16-NEXT:    s_mov_b32 s3, 0x31016000
409; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
410; GFX11-FAKE16-NEXT:    v_cvt_f32_f16_e32 v0, s2
411; GFX11-FAKE16-NEXT:    s_mov_b32 s2, -1
412; GFX11-FAKE16-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
413; GFX11-FAKE16-NEXT:    s_endpgm
414entry:
  ; Build the half from the low 16 bits of the i32 kernel argument.
415  %a.trunc = trunc i32 %a to i16
416  %a.val = bitcast i16 %a.trunc to half
417  %r.val = fpext half %a.val to float
418  store float %r.val, ptr addrspace(1) %r
419  ret void
420}
421
; Kernel test: load a half from %a, negate it, widen the negated value to
; float and store it to %r.
; Expected code per the checks below: on every run line the negation is
; carried as a source modifier on the convert (v_cvt_f32_f16_e64 v0, -v0, or
; -v0.l for the gfx1100 true16 run); no separate negate instruction appears.
422define amdgpu_kernel void @fneg_fpext_f16_to_f32(
423; SI-LABEL: fneg_fpext_f16_to_f32:
424; SI:       ; %bb.0: ; %entry
425; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
426; SI-NEXT:    s_mov_b32 s7, 0xf000
427; SI-NEXT:    s_mov_b32 s6, -1
428; SI-NEXT:    s_mov_b32 s10, s6
429; SI-NEXT:    s_mov_b32 s11, s7
430; SI-NEXT:    s_waitcnt lgkmcnt(0)
431; SI-NEXT:    s_mov_b32 s8, s2
432; SI-NEXT:    s_mov_b32 s9, s3
433; SI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
434; SI-NEXT:    s_mov_b32 s4, s0
435; SI-NEXT:    s_mov_b32 s5, s1
436; SI-NEXT:    s_waitcnt vmcnt(0)
437; SI-NEXT:    v_cvt_f32_f16_e64 v0, -v0
438; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
439; SI-NEXT:    s_endpgm
440;
441; GFX89-LABEL: fneg_fpext_f16_to_f32:
442; GFX89:       ; %bb.0: ; %entry
443; GFX89-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
444; GFX89-NEXT:    s_mov_b32 s7, 0xf000
445; GFX89-NEXT:    s_mov_b32 s6, -1
446; GFX89-NEXT:    s_mov_b32 s10, s6
447; GFX89-NEXT:    s_mov_b32 s11, s7
448; GFX89-NEXT:    s_waitcnt lgkmcnt(0)
449; GFX89-NEXT:    s_mov_b32 s8, s2
450; GFX89-NEXT:    s_mov_b32 s9, s3
451; GFX89-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
452; GFX89-NEXT:    s_mov_b32 s4, s0
453; GFX89-NEXT:    s_mov_b32 s5, s1
454; GFX89-NEXT:    s_waitcnt vmcnt(0)
455; GFX89-NEXT:    v_cvt_f32_f16_e64 v0, -v0
456; GFX89-NEXT:    buffer_store_dword v0, off, s[4:7], 0
457; GFX89-NEXT:    s_endpgm
458;
459; GFX11-TRUE16-LABEL: fneg_fpext_f16_to_f32:
460; GFX11-TRUE16:       ; %bb.0: ; %entry
461; GFX11-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
462; GFX11-TRUE16-NEXT:    s_mov_b32 s6, -1
463; GFX11-TRUE16-NEXT:    s_mov_b32 s7, 0x31016000
464; GFX11-TRUE16-NEXT:    s_mov_b32 s10, s6
465; GFX11-TRUE16-NEXT:    s_mov_b32 s11, s7
466; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
467; GFX11-TRUE16-NEXT:    s_mov_b32 s8, s2
468; GFX11-TRUE16-NEXT:    s_mov_b32 s9, s3
469; GFX11-TRUE16-NEXT:    s_mov_b32 s4, s0
470; GFX11-TRUE16-NEXT:    buffer_load_u16 v0, off, s[8:11], 0
471; GFX11-TRUE16-NEXT:    s_mov_b32 s5, s1
472; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
473; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e64 v0, -v0.l
474; GFX11-TRUE16-NEXT:    buffer_store_b32 v0, off, s[4:7], 0
475; GFX11-TRUE16-NEXT:    s_endpgm
476;
477; GFX11-FAKE16-LABEL: fneg_fpext_f16_to_f32:
478; GFX11-FAKE16:       ; %bb.0: ; %entry
479; GFX11-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
480; GFX11-FAKE16-NEXT:    s_mov_b32 s6, -1
481; GFX11-FAKE16-NEXT:    s_mov_b32 s7, 0x31016000
482; GFX11-FAKE16-NEXT:    s_mov_b32 s10, s6
483; GFX11-FAKE16-NEXT:    s_mov_b32 s11, s7
484; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
485; GFX11-FAKE16-NEXT:    s_mov_b32 s8, s2
486; GFX11-FAKE16-NEXT:    s_mov_b32 s9, s3
487; GFX11-FAKE16-NEXT:    s_mov_b32 s4, s0
488; GFX11-FAKE16-NEXT:    buffer_load_u16 v0, off, s[8:11], 0
489; GFX11-FAKE16-NEXT:    s_mov_b32 s5, s1
490; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
491; GFX11-FAKE16-NEXT:    v_cvt_f32_f16_e64 v0, -v0
492; GFX11-FAKE16-NEXT:    buffer_store_b32 v0, off, s[4:7], 0
493; GFX11-FAKE16-NEXT:    s_endpgm
494    ptr addrspace(1) %r,
495    ptr addrspace(1) %a) {
496entry:
497  %a.val = load half, ptr addrspace(1) %a
  ; Negation is written as a subtraction from -0.0 (no fneg instruction).
498  %a.neg = fsub half -0.0, %a.val
499  %r.val = fpext half %a.neg to float
500  store float %r.val, ptr addrspace(1) %r
501  ret void
502}
503
; Kernel test: load a half from %a, take its absolute value via
; llvm.fabs.f16, widen the result to float and store it to %r.
; Expected code per the checks below: on every run line the absolute value
; is carried as a source modifier on the convert (v_cvt_f32_f16_e64 v0, |v0|,
; or |v0.l| for the gfx1100 true16 run); no separate instruction appears.
; NOTE(review): the declaration of @llvm.fabs.f16 is outside this excerpt.
504define amdgpu_kernel void @fabs_fpext_f16_to_f32(
505; SI-LABEL: fabs_fpext_f16_to_f32:
506; SI:       ; %bb.0: ; %entry
507; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
508; SI-NEXT:    s_mov_b32 s7, 0xf000
509; SI-NEXT:    s_mov_b32 s6, -1
510; SI-NEXT:    s_mov_b32 s10, s6
511; SI-NEXT:    s_mov_b32 s11, s7
512; SI-NEXT:    s_waitcnt lgkmcnt(0)
513; SI-NEXT:    s_mov_b32 s8, s2
514; SI-NEXT:    s_mov_b32 s9, s3
515; SI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
516; SI-NEXT:    s_mov_b32 s4, s0
517; SI-NEXT:    s_mov_b32 s5, s1
518; SI-NEXT:    s_waitcnt vmcnt(0)
519; SI-NEXT:    v_cvt_f32_f16_e64 v0, |v0|
520; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
521; SI-NEXT:    s_endpgm
522;
523; GFX89-LABEL: fabs_fpext_f16_to_f32:
524; GFX89:       ; %bb.0: ; %entry
525; GFX89-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
526; GFX89-NEXT:    s_mov_b32 s7, 0xf000
527; GFX89-NEXT:    s_mov_b32 s6, -1
528; GFX89-NEXT:    s_mov_b32 s10, s6
529; GFX89-NEXT:    s_mov_b32 s11, s7
530; GFX89-NEXT:    s_waitcnt lgkmcnt(0)
531; GFX89-NEXT:    s_mov_b32 s8, s2
532; GFX89-NEXT:    s_mov_b32 s9, s3
533; GFX89-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
534; GFX89-NEXT:    s_mov_b32 s4, s0
535; GFX89-NEXT:    s_mov_b32 s5, s1
536; GFX89-NEXT:    s_waitcnt vmcnt(0)
537; GFX89-NEXT:    v_cvt_f32_f16_e64 v0, |v0|
538; GFX89-NEXT:    buffer_store_dword v0, off, s[4:7], 0
539; GFX89-NEXT:    s_endpgm
540;
541; GFX11-TRUE16-LABEL: fabs_fpext_f16_to_f32:
542; GFX11-TRUE16:       ; %bb.0: ; %entry
543; GFX11-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
544; GFX11-TRUE16-NEXT:    s_mov_b32 s6, -1
545; GFX11-TRUE16-NEXT:    s_mov_b32 s7, 0x31016000
546; GFX11-TRUE16-NEXT:    s_mov_b32 s10, s6
547; GFX11-TRUE16-NEXT:    s_mov_b32 s11, s7
548; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
549; GFX11-TRUE16-NEXT:    s_mov_b32 s8, s2
550; GFX11-TRUE16-NEXT:    s_mov_b32 s9, s3
551; GFX11-TRUE16-NEXT:    s_mov_b32 s4, s0
552; GFX11-TRUE16-NEXT:    buffer_load_u16 v0, off, s[8:11], 0
553; GFX11-TRUE16-NEXT:    s_mov_b32 s5, s1
554; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
555; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e64 v0, |v0.l|
556; GFX11-TRUE16-NEXT:    buffer_store_b32 v0, off, s[4:7], 0
557; GFX11-TRUE16-NEXT:    s_endpgm
558;
559; GFX11-FAKE16-LABEL: fabs_fpext_f16_to_f32:
560; GFX11-FAKE16:       ; %bb.0: ; %entry
561; GFX11-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
562; GFX11-FAKE16-NEXT:    s_mov_b32 s6, -1
563; GFX11-FAKE16-NEXT:    s_mov_b32 s7, 0x31016000
564; GFX11-FAKE16-NEXT:    s_mov_b32 s10, s6
565; GFX11-FAKE16-NEXT:    s_mov_b32 s11, s7
566; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
567; GFX11-FAKE16-NEXT:    s_mov_b32 s8, s2
568; GFX11-FAKE16-NEXT:    s_mov_b32 s9, s3
569; GFX11-FAKE16-NEXT:    s_mov_b32 s4, s0
570; GFX11-FAKE16-NEXT:    buffer_load_u16 v0, off, s[8:11], 0
571; GFX11-FAKE16-NEXT:    s_mov_b32 s5, s1
572; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
573; GFX11-FAKE16-NEXT:    v_cvt_f32_f16_e64 v0, |v0|
574; GFX11-FAKE16-NEXT:    buffer_store_b32 v0, off, s[4:7], 0
575; GFX11-FAKE16-NEXT:    s_endpgm
576    ptr addrspace(1) %r,
577    ptr addrspace(1) %a) {
578entry:
579  %a.val = load half, ptr addrspace(1) %a
  ; Absolute value is applied to the half before it is widened.
580  %a.fabs = call half @llvm.fabs.f16(half %a.val)
581  %r.val = fpext half %a.fabs to float
582  store float %r.val, ptr addrspace(1) %r
583  ret void
584}
585
; Kernel test: load a half from %a, compute -|x| on it, widen the result to
; float and store it to %r.
; Expected code per the checks below: on every run line both operations are
; carried as combined source modifiers on the convert
; (v_cvt_f32_f16_e64 v0, -|v0|, or -|v0.l| for the gfx1100 true16 run).
; NOTE(review): the declaration of @llvm.fabs.f16 is outside this excerpt.
586define amdgpu_kernel void @fneg_fabs_fpext_f16_to_f32(
587; SI-LABEL: fneg_fabs_fpext_f16_to_f32:
588; SI:       ; %bb.0: ; %entry
589; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
590; SI-NEXT:    s_mov_b32 s7, 0xf000
591; SI-NEXT:    s_mov_b32 s6, -1
592; SI-NEXT:    s_mov_b32 s10, s6
593; SI-NEXT:    s_mov_b32 s11, s7
594; SI-NEXT:    s_waitcnt lgkmcnt(0)
595; SI-NEXT:    s_mov_b32 s8, s2
596; SI-NEXT:    s_mov_b32 s9, s3
597; SI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
598; SI-NEXT:    s_mov_b32 s4, s0
599; SI-NEXT:    s_mov_b32 s5, s1
600; SI-NEXT:    s_waitcnt vmcnt(0)
601; SI-NEXT:    v_cvt_f32_f16_e64 v0, -|v0|
602; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
603; SI-NEXT:    s_endpgm
604;
605; GFX89-LABEL: fneg_fabs_fpext_f16_to_f32:
606; GFX89:       ; %bb.0: ; %entry
607; GFX89-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
608; GFX89-NEXT:    s_mov_b32 s7, 0xf000
609; GFX89-NEXT:    s_mov_b32 s6, -1
610; GFX89-NEXT:    s_mov_b32 s10, s6
611; GFX89-NEXT:    s_mov_b32 s11, s7
612; GFX89-NEXT:    s_waitcnt lgkmcnt(0)
613; GFX89-NEXT:    s_mov_b32 s8, s2
614; GFX89-NEXT:    s_mov_b32 s9, s3
615; GFX89-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
616; GFX89-NEXT:    s_mov_b32 s4, s0
617; GFX89-NEXT:    s_mov_b32 s5, s1
618; GFX89-NEXT:    s_waitcnt vmcnt(0)
619; GFX89-NEXT:    v_cvt_f32_f16_e64 v0, -|v0|
620; GFX89-NEXT:    buffer_store_dword v0, off, s[4:7], 0
621; GFX89-NEXT:    s_endpgm
622;
623; GFX11-TRUE16-LABEL: fneg_fabs_fpext_f16_to_f32:
624; GFX11-TRUE16:       ; %bb.0: ; %entry
625; GFX11-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
626; GFX11-TRUE16-NEXT:    s_mov_b32 s6, -1
627; GFX11-TRUE16-NEXT:    s_mov_b32 s7, 0x31016000
628; GFX11-TRUE16-NEXT:    s_mov_b32 s10, s6
629; GFX11-TRUE16-NEXT:    s_mov_b32 s11, s7
630; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
631; GFX11-TRUE16-NEXT:    s_mov_b32 s8, s2
632; GFX11-TRUE16-NEXT:    s_mov_b32 s9, s3
633; GFX11-TRUE16-NEXT:    s_mov_b32 s4, s0
634; GFX11-TRUE16-NEXT:    buffer_load_u16 v0, off, s[8:11], 0
635; GFX11-TRUE16-NEXT:    s_mov_b32 s5, s1
636; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
637; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e64 v0, -|v0.l|
638; GFX11-TRUE16-NEXT:    buffer_store_b32 v0, off, s[4:7], 0
639; GFX11-TRUE16-NEXT:    s_endpgm
640;
641; GFX11-FAKE16-LABEL: fneg_fabs_fpext_f16_to_f32:
642; GFX11-FAKE16:       ; %bb.0: ; %entry
643; GFX11-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
644; GFX11-FAKE16-NEXT:    s_mov_b32 s6, -1
645; GFX11-FAKE16-NEXT:    s_mov_b32 s7, 0x31016000
646; GFX11-FAKE16-NEXT:    s_mov_b32 s10, s6
647; GFX11-FAKE16-NEXT:    s_mov_b32 s11, s7
648; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
649; GFX11-FAKE16-NEXT:    s_mov_b32 s8, s2
650; GFX11-FAKE16-NEXT:    s_mov_b32 s9, s3
651; GFX11-FAKE16-NEXT:    s_mov_b32 s4, s0
652; GFX11-FAKE16-NEXT:    buffer_load_u16 v0, off, s[8:11], 0
653; GFX11-FAKE16-NEXT:    s_mov_b32 s5, s1
654; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
655; GFX11-FAKE16-NEXT:    v_cvt_f32_f16_e64 v0, -|v0|
656; GFX11-FAKE16-NEXT:    buffer_store_b32 v0, off, s[4:7], 0
657; GFX11-FAKE16-NEXT:    s_endpgm
658    ptr addrspace(1) %r,
659    ptr addrspace(1) %a) {
660entry:
661  %a.val = load half, ptr addrspace(1) %a
  ; Absolute value first, then negation written as a subtraction from -0.0.
662  %a.fabs = call half @llvm.fabs.f16(half %a.val)
663  %a.fneg.fabs = fsub half -0.0, %a.fabs
664  %r.val = fpext half %a.fneg.fabs to float
665  store float %r.val, ptr addrspace(1) %r
666  ret void
667}
668
669; FIXME: Using the source modifier here only wastes code size
670
; Kernel test where the negated half has two uses: it is widened to float
; (stored volatile to %r) and also stored volatile as a half.
; Expected code per the checks below:
;  - tahiti run       - the sign bit is flipped once with v_xor_b32 0x8000
;                       and the plain convert reads the xor result;
;  - fiji/gfx900 and
;    gfx1100 fake16   - the convert uses a -v0 source modifier and a
;                       separate v_xor_b32 produces the half that is stored;
;  - gfx1100 true16   - a v_mov_b16 copy feeds the convert with -v0.l while
;                       v_xor_b32 produces the half that is stored.
; Both stores are expected to use the s[4:7] descriptor, each followed by a
; wait before the next instruction.
671define amdgpu_kernel void @fneg_multi_use_fpext_f16_to_f32(
672; SI-LABEL: fneg_multi_use_fpext_f16_to_f32:
673; SI:       ; %bb.0: ; %entry
674; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
675; SI-NEXT:    s_mov_b32 s7, 0xf000
676; SI-NEXT:    s_mov_b32 s6, -1
677; SI-NEXT:    s_mov_b32 s10, s6
678; SI-NEXT:    s_mov_b32 s11, s7
679; SI-NEXT:    s_waitcnt lgkmcnt(0)
680; SI-NEXT:    s_mov_b32 s8, s2
681; SI-NEXT:    s_mov_b32 s9, s3
682; SI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
683; SI-NEXT:    s_mov_b32 s4, s0
684; SI-NEXT:    s_mov_b32 s5, s1
685; SI-NEXT:    s_waitcnt vmcnt(0)
686; SI-NEXT:    v_xor_b32_e32 v0, 0x8000, v0
687; SI-NEXT:    v_cvt_f32_f16_e32 v1, v0
688; SI-NEXT:    buffer_store_dword v1, off, s[4:7], 0
689; SI-NEXT:    s_waitcnt vmcnt(0)
690; SI-NEXT:    buffer_store_short v0, off, s[4:7], 0
691; SI-NEXT:    s_waitcnt vmcnt(0)
692; SI-NEXT:    s_endpgm
693;
694; GFX89-LABEL: fneg_multi_use_fpext_f16_to_f32:
695; GFX89:       ; %bb.0: ; %entry
696; GFX89-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
697; GFX89-NEXT:    s_mov_b32 s7, 0xf000
698; GFX89-NEXT:    s_mov_b32 s6, -1
699; GFX89-NEXT:    s_mov_b32 s10, s6
700; GFX89-NEXT:    s_mov_b32 s11, s7
701; GFX89-NEXT:    s_waitcnt lgkmcnt(0)
702; GFX89-NEXT:    s_mov_b32 s8, s2
703; GFX89-NEXT:    s_mov_b32 s9, s3
704; GFX89-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
705; GFX89-NEXT:    s_mov_b32 s4, s0
706; GFX89-NEXT:    s_mov_b32 s5, s1
707; GFX89-NEXT:    s_waitcnt vmcnt(0)
708; GFX89-NEXT:    v_cvt_f32_f16_e64 v1, -v0
709; GFX89-NEXT:    v_xor_b32_e32 v0, 0x8000, v0
710; GFX89-NEXT:    buffer_store_dword v1, off, s[4:7], 0
711; GFX89-NEXT:    s_waitcnt vmcnt(0)
712; GFX89-NEXT:    buffer_store_short v0, off, s[4:7], 0
713; GFX89-NEXT:    s_waitcnt vmcnt(0)
714; GFX89-NEXT:    s_endpgm
715;
716; GFX11-TRUE16-LABEL: fneg_multi_use_fpext_f16_to_f32:
717; GFX11-TRUE16:       ; %bb.0: ; %entry
718; GFX11-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
719; GFX11-TRUE16-NEXT:    s_mov_b32 s6, -1
720; GFX11-TRUE16-NEXT:    s_mov_b32 s7, 0x31016000
721; GFX11-TRUE16-NEXT:    s_mov_b32 s10, s6
722; GFX11-TRUE16-NEXT:    s_mov_b32 s11, s7
723; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
724; GFX11-TRUE16-NEXT:    s_mov_b32 s8, s2
725; GFX11-TRUE16-NEXT:    s_mov_b32 s9, s3
726; GFX11-TRUE16-NEXT:    s_mov_b32 s4, s0
727; GFX11-TRUE16-NEXT:    buffer_load_u16 v1, off, s[8:11], 0
728; GFX11-TRUE16-NEXT:    s_mov_b32 s5, s1
729; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
730; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v1.l
731; GFX11-TRUE16-NEXT:    v_xor_b32_e32 v1, 0x8000, v1
732; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
733; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e64 v0, -v0.l
734; GFX11-TRUE16-NEXT:    buffer_store_b32 v0, off, s[4:7], 0 dlc
735; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
736; GFX11-TRUE16-NEXT:    buffer_store_b16 v1, off, s[4:7], 0 dlc
737; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
738; GFX11-TRUE16-NEXT:    s_endpgm
739;
740; GFX11-FAKE16-LABEL: fneg_multi_use_fpext_f16_to_f32:
741; GFX11-FAKE16:       ; %bb.0: ; %entry
742; GFX11-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
743; GFX11-FAKE16-NEXT:    s_mov_b32 s6, -1
744; GFX11-FAKE16-NEXT:    s_mov_b32 s7, 0x31016000
745; GFX11-FAKE16-NEXT:    s_mov_b32 s10, s6
746; GFX11-FAKE16-NEXT:    s_mov_b32 s11, s7
747; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
748; GFX11-FAKE16-NEXT:    s_mov_b32 s8, s2
749; GFX11-FAKE16-NEXT:    s_mov_b32 s9, s3
750; GFX11-FAKE16-NEXT:    s_mov_b32 s4, s0
751; GFX11-FAKE16-NEXT:    buffer_load_u16 v0, off, s[8:11], 0
752; GFX11-FAKE16-NEXT:    s_mov_b32 s5, s1
753; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
754; GFX11-FAKE16-NEXT:    v_cvt_f32_f16_e64 v1, -v0
755; GFX11-FAKE16-NEXT:    v_xor_b32_e32 v0, 0x8000, v0
756; GFX11-FAKE16-NEXT:    buffer_store_b32 v1, off, s[4:7], 0 dlc
757; GFX11-FAKE16-NEXT:    s_waitcnt_vscnt null, 0x0
758; GFX11-FAKE16-NEXT:    buffer_store_b16 v0, off, s[4:7], 0 dlc
759; GFX11-FAKE16-NEXT:    s_waitcnt_vscnt null, 0x0
760; GFX11-FAKE16-NEXT:    s_endpgm
761    ptr addrspace(1) %r,
762    ptr addrspace(1) %a) {
763entry:
764  %a.val = load half, ptr addrspace(1) %a
  ; Negation is written as a subtraction from -0.0; %a.neg has two users.
765  %a.neg = fsub half -0.0, %a.val
766  %r.val = fpext half %a.neg to float
  ; First use: the widened value, stored volatile to %r.
767  store volatile float %r.val, ptr addrspace(1) %r
  ; Second use: the negated half itself; the destination pointer is undef.
768  store volatile half %a.neg, ptr addrspace(1) undef
769  ret void
770}
771
; Negated half value with two users: the fpext to float and an f16 fmul with
; the un-negated input. The extended value is stored (volatile) through %r and
; the product is stored (volatile) through an undef pointer.
; Expected code, per the assertions below: the SI checks convert both the plain
; and the negated input to f32 (neg as a source modifier on the second
; conversion), multiply in f32 and convert the product back to f16; the other
; check prefixes apply the neg source modifier directly on both the conversion
; and the f16 multiply.
772define amdgpu_kernel void @fneg_multi_foldable_use_fpext_f16_to_f32(
773; SI-LABEL: fneg_multi_foldable_use_fpext_f16_to_f32:
774; SI:       ; %bb.0: ; %entry
775; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
776; SI-NEXT:    s_mov_b32 s7, 0xf000
777; SI-NEXT:    s_mov_b32 s6, -1
778; SI-NEXT:    s_mov_b32 s10, s6
779; SI-NEXT:    s_mov_b32 s11, s7
780; SI-NEXT:    s_waitcnt lgkmcnt(0)
781; SI-NEXT:    s_mov_b32 s8, s2
782; SI-NEXT:    s_mov_b32 s9, s3
783; SI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
784; SI-NEXT:    s_mov_b32 s4, s0
785; SI-NEXT:    s_mov_b32 s5, s1
786; SI-NEXT:    s_waitcnt vmcnt(0)
787; SI-NEXT:    v_cvt_f32_f16_e32 v1, v0
788; SI-NEXT:    v_cvt_f32_f16_e64 v0, -v0
789; SI-NEXT:    v_mul_f32_e32 v1, v0, v1
790; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
791; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
792; SI-NEXT:    s_waitcnt vmcnt(0)
793; SI-NEXT:    buffer_store_short v1, off, s[4:7], 0
794; SI-NEXT:    s_waitcnt vmcnt(0)
795; SI-NEXT:    s_endpgm
796;
797; GFX89-LABEL: fneg_multi_foldable_use_fpext_f16_to_f32:
798; GFX89:       ; %bb.0: ; %entry
799; GFX89-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
800; GFX89-NEXT:    s_mov_b32 s7, 0xf000
801; GFX89-NEXT:    s_mov_b32 s6, -1
802; GFX89-NEXT:    s_mov_b32 s10, s6
803; GFX89-NEXT:    s_mov_b32 s11, s7
804; GFX89-NEXT:    s_waitcnt lgkmcnt(0)
805; GFX89-NEXT:    s_mov_b32 s8, s2
806; GFX89-NEXT:    s_mov_b32 s9, s3
807; GFX89-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
808; GFX89-NEXT:    s_mov_b32 s4, s0
809; GFX89-NEXT:    s_mov_b32 s5, s1
810; GFX89-NEXT:    s_waitcnt vmcnt(0)
811; GFX89-NEXT:    v_cvt_f32_f16_e64 v1, -v0
812; GFX89-NEXT:    v_mul_f16_e64 v0, -v0, v0
813; GFX89-NEXT:    buffer_store_dword v1, off, s[4:7], 0
814; GFX89-NEXT:    s_waitcnt vmcnt(0)
815; GFX89-NEXT:    buffer_store_short v0, off, s[4:7], 0
816; GFX89-NEXT:    s_waitcnt vmcnt(0)
817; GFX89-NEXT:    s_endpgm
818;
819; GFX11-TRUE16-LABEL: fneg_multi_foldable_use_fpext_f16_to_f32:
820; GFX11-TRUE16:       ; %bb.0: ; %entry
821; GFX11-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
822; GFX11-TRUE16-NEXT:    s_mov_b32 s6, -1
823; GFX11-TRUE16-NEXT:    s_mov_b32 s7, 0x31016000
824; GFX11-TRUE16-NEXT:    s_mov_b32 s10, s6
825; GFX11-TRUE16-NEXT:    s_mov_b32 s11, s7
826; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
827; GFX11-TRUE16-NEXT:    s_mov_b32 s8, s2
828; GFX11-TRUE16-NEXT:    s_mov_b32 s9, s3
829; GFX11-TRUE16-NEXT:    s_mov_b32 s4, s0
830; GFX11-TRUE16-NEXT:    buffer_load_u16 v0, off, s[8:11], 0
831; GFX11-TRUE16-NEXT:    s_mov_b32 s5, s1
832; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
833; GFX11-TRUE16-NEXT:    v_mul_f16_e64 v0.h, -v0.l, v0.l
834; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e64 v1, -v0.l
835; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
836; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v0.h
837; GFX11-TRUE16-NEXT:    buffer_store_b32 v1, off, s[4:7], 0 dlc
838; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
839; GFX11-TRUE16-NEXT:    buffer_store_b16 v0, off, s[4:7], 0 dlc
840; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
841; GFX11-TRUE16-NEXT:    s_endpgm
842;
843; GFX11-FAKE16-LABEL: fneg_multi_foldable_use_fpext_f16_to_f32:
844; GFX11-FAKE16:       ; %bb.0: ; %entry
845; GFX11-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
846; GFX11-FAKE16-NEXT:    s_mov_b32 s6, -1
847; GFX11-FAKE16-NEXT:    s_mov_b32 s7, 0x31016000
848; GFX11-FAKE16-NEXT:    s_mov_b32 s10, s6
849; GFX11-FAKE16-NEXT:    s_mov_b32 s11, s7
850; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
851; GFX11-FAKE16-NEXT:    s_mov_b32 s8, s2
852; GFX11-FAKE16-NEXT:    s_mov_b32 s9, s3
853; GFX11-FAKE16-NEXT:    s_mov_b32 s4, s0
854; GFX11-FAKE16-NEXT:    buffer_load_u16 v0, off, s[8:11], 0
855; GFX11-FAKE16-NEXT:    s_mov_b32 s5, s1
856; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
857; GFX11-FAKE16-NEXT:    v_cvt_f32_f16_e64 v1, -v0
858; GFX11-FAKE16-NEXT:    v_mul_f16_e64 v0, -v0, v0
859; GFX11-FAKE16-NEXT:    buffer_store_b32 v1, off, s[4:7], 0 dlc
860; GFX11-FAKE16-NEXT:    s_waitcnt_vscnt null, 0x0
861; GFX11-FAKE16-NEXT:    buffer_store_b16 v0, off, s[4:7], 0 dlc
862; GFX11-FAKE16-NEXT:    s_waitcnt_vscnt null, 0x0
863; GFX11-FAKE16-NEXT:    s_endpgm
864    ptr addrspace(1) %r,
865    ptr addrspace(1) %a) {
866entry:
867  %a.val = load half, ptr addrspace(1) %a
868  %a.neg = fsub half -0.0, %a.val
869  %r.val = fpext half %a.neg to float
870  %mul = fmul half %a.neg, %a.val
871  store volatile float %r.val, ptr addrspace(1) %r
872  store volatile half %mul, ptr addrspace(1) undef
873  ret void
874}
875
; fabs of a loaded half with two users: the fpext to float (stored volatile
; through %r) and a direct volatile store of the fabs'd half through an undef
; pointer, so the fabs result itself must be materialized.
; Expected code, per the assertions below: the SI checks clear the sign bit
; with an AND of 0x7fff first and convert the masked value; the other check
; prefixes convert with an |abs| source modifier and emit a separate AND of
; 0x7fff for the stored half.
876define amdgpu_kernel void @fabs_multi_use_fpext_f16_to_f32(
877; SI-LABEL: fabs_multi_use_fpext_f16_to_f32:
878; SI:       ; %bb.0: ; %entry
879; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
880; SI-NEXT:    s_mov_b32 s7, 0xf000
881; SI-NEXT:    s_mov_b32 s6, -1
882; SI-NEXT:    s_mov_b32 s10, s6
883; SI-NEXT:    s_mov_b32 s11, s7
884; SI-NEXT:    s_waitcnt lgkmcnt(0)
885; SI-NEXT:    s_mov_b32 s8, s2
886; SI-NEXT:    s_mov_b32 s9, s3
887; SI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
888; SI-NEXT:    s_mov_b32 s4, s0
889; SI-NEXT:    s_mov_b32 s5, s1
890; SI-NEXT:    s_waitcnt vmcnt(0)
891; SI-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
892; SI-NEXT:    v_cvt_f32_f16_e32 v1, v0
893; SI-NEXT:    buffer_store_dword v1, off, s[4:7], 0
894; SI-NEXT:    s_waitcnt vmcnt(0)
895; SI-NEXT:    buffer_store_short v0, off, s[4:7], 0
896; SI-NEXT:    s_waitcnt vmcnt(0)
897; SI-NEXT:    s_endpgm
898;
899; GFX89-LABEL: fabs_multi_use_fpext_f16_to_f32:
900; GFX89:       ; %bb.0: ; %entry
901; GFX89-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
902; GFX89-NEXT:    s_mov_b32 s7, 0xf000
903; GFX89-NEXT:    s_mov_b32 s6, -1
904; GFX89-NEXT:    s_mov_b32 s10, s6
905; GFX89-NEXT:    s_mov_b32 s11, s7
906; GFX89-NEXT:    s_waitcnt lgkmcnt(0)
907; GFX89-NEXT:    s_mov_b32 s8, s2
908; GFX89-NEXT:    s_mov_b32 s9, s3
909; GFX89-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
910; GFX89-NEXT:    s_mov_b32 s4, s0
911; GFX89-NEXT:    s_mov_b32 s5, s1
912; GFX89-NEXT:    s_waitcnt vmcnt(0)
913; GFX89-NEXT:    v_cvt_f32_f16_e64 v1, |v0|
914; GFX89-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
915; GFX89-NEXT:    buffer_store_dword v1, off, s[4:7], 0
916; GFX89-NEXT:    s_waitcnt vmcnt(0)
917; GFX89-NEXT:    buffer_store_short v0, off, s[4:7], 0
918; GFX89-NEXT:    s_waitcnt vmcnt(0)
919; GFX89-NEXT:    s_endpgm
920;
921; GFX11-TRUE16-LABEL: fabs_multi_use_fpext_f16_to_f32:
922; GFX11-TRUE16:       ; %bb.0: ; %entry
923; GFX11-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
924; GFX11-TRUE16-NEXT:    s_mov_b32 s6, -1
925; GFX11-TRUE16-NEXT:    s_mov_b32 s7, 0x31016000
926; GFX11-TRUE16-NEXT:    s_mov_b32 s10, s6
927; GFX11-TRUE16-NEXT:    s_mov_b32 s11, s7
928; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
929; GFX11-TRUE16-NEXT:    s_mov_b32 s8, s2
930; GFX11-TRUE16-NEXT:    s_mov_b32 s9, s3
931; GFX11-TRUE16-NEXT:    s_mov_b32 s4, s0
932; GFX11-TRUE16-NEXT:    buffer_load_u16 v1, off, s[8:11], 0
933; GFX11-TRUE16-NEXT:    s_mov_b32 s5, s1
934; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
935; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v1.l
936; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0x7fff, v1
937; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
938; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e64 v0, |v0.l|
939; GFX11-TRUE16-NEXT:    buffer_store_b32 v0, off, s[4:7], 0 dlc
940; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
941; GFX11-TRUE16-NEXT:    buffer_store_b16 v1, off, s[4:7], 0 dlc
942; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
943; GFX11-TRUE16-NEXT:    s_endpgm
944;
945; GFX11-FAKE16-LABEL: fabs_multi_use_fpext_f16_to_f32:
946; GFX11-FAKE16:       ; %bb.0: ; %entry
947; GFX11-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
948; GFX11-FAKE16-NEXT:    s_mov_b32 s6, -1
949; GFX11-FAKE16-NEXT:    s_mov_b32 s7, 0x31016000
950; GFX11-FAKE16-NEXT:    s_mov_b32 s10, s6
951; GFX11-FAKE16-NEXT:    s_mov_b32 s11, s7
952; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
953; GFX11-FAKE16-NEXT:    s_mov_b32 s8, s2
954; GFX11-FAKE16-NEXT:    s_mov_b32 s9, s3
955; GFX11-FAKE16-NEXT:    s_mov_b32 s4, s0
956; GFX11-FAKE16-NEXT:    buffer_load_u16 v0, off, s[8:11], 0
957; GFX11-FAKE16-NEXT:    s_mov_b32 s5, s1
958; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
959; GFX11-FAKE16-NEXT:    v_cvt_f32_f16_e64 v1, |v0|
960; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
961; GFX11-FAKE16-NEXT:    buffer_store_b32 v1, off, s[4:7], 0 dlc
962; GFX11-FAKE16-NEXT:    s_waitcnt_vscnt null, 0x0
963; GFX11-FAKE16-NEXT:    buffer_store_b16 v0, off, s[4:7], 0 dlc
964; GFX11-FAKE16-NEXT:    s_waitcnt_vscnt null, 0x0
965; GFX11-FAKE16-NEXT:    s_endpgm
966    ptr addrspace(1) %r,
967    ptr addrspace(1) %a) {
968entry:
969  %a.val = load half, ptr addrspace(1) %a
970  %a.fabs = call half @llvm.fabs.f16(half %a.val)
971  %r.val = fpext half %a.fabs to float
972  store volatile float %r.val, ptr addrspace(1) %r
973  store volatile half %a.fabs, ptr addrspace(1) undef
974  ret void
975}
976
; fabs of a loaded half with two users: the fpext to float and an f16 fmul
; with the original input. The extended value is stored (volatile) through %r
; and the product is stored (volatile) through an undef pointer.
; Expected code, per the assertions below: the SI checks convert the input to
; f32 once, multiply in f32 with an |abs| source modifier, convert the product
; back to f16, and clear the f32 sign bit (AND 0x7fffffff) for the stored
; float; the other check prefixes use the |abs| source modifier on both the
; conversion and the f16 multiply.
977define amdgpu_kernel void @fabs_multi_foldable_use_fpext_f16_to_f32(
978; SI-LABEL: fabs_multi_foldable_use_fpext_f16_to_f32:
979; SI:       ; %bb.0: ; %entry
980; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
981; SI-NEXT:    s_mov_b32 s7, 0xf000
982; SI-NEXT:    s_mov_b32 s6, -1
983; SI-NEXT:    s_mov_b32 s10, s6
984; SI-NEXT:    s_mov_b32 s11, s7
985; SI-NEXT:    s_waitcnt lgkmcnt(0)
986; SI-NEXT:    s_mov_b32 s8, s2
987; SI-NEXT:    s_mov_b32 s9, s3
988; SI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
989; SI-NEXT:    s_mov_b32 s4, s0
990; SI-NEXT:    s_mov_b32 s5, s1
991; SI-NEXT:    s_waitcnt vmcnt(0)
992; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
993; SI-NEXT:    v_mul_f32_e64 v1, |v0|, v0
994; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
995; SI-NEXT:    v_and_b32_e32 v0, 0x7fffffff, v0
996; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
997; SI-NEXT:    s_waitcnt vmcnt(0)
998; SI-NEXT:    buffer_store_short v1, off, s[4:7], 0
999; SI-NEXT:    s_waitcnt vmcnt(0)
1000; SI-NEXT:    s_endpgm
1001;
1002; GFX89-LABEL: fabs_multi_foldable_use_fpext_f16_to_f32:
1003; GFX89:       ; %bb.0: ; %entry
1004; GFX89-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1005; GFX89-NEXT:    s_mov_b32 s7, 0xf000
1006; GFX89-NEXT:    s_mov_b32 s6, -1
1007; GFX89-NEXT:    s_mov_b32 s10, s6
1008; GFX89-NEXT:    s_mov_b32 s11, s7
1009; GFX89-NEXT:    s_waitcnt lgkmcnt(0)
1010; GFX89-NEXT:    s_mov_b32 s8, s2
1011; GFX89-NEXT:    s_mov_b32 s9, s3
1012; GFX89-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
1013; GFX89-NEXT:    s_mov_b32 s4, s0
1014; GFX89-NEXT:    s_mov_b32 s5, s1
1015; GFX89-NEXT:    s_waitcnt vmcnt(0)
1016; GFX89-NEXT:    v_cvt_f32_f16_e64 v1, |v0|
1017; GFX89-NEXT:    v_mul_f16_e64 v0, |v0|, v0
1018; GFX89-NEXT:    buffer_store_dword v1, off, s[4:7], 0
1019; GFX89-NEXT:    s_waitcnt vmcnt(0)
1020; GFX89-NEXT:    buffer_store_short v0, off, s[4:7], 0
1021; GFX89-NEXT:    s_waitcnt vmcnt(0)
1022; GFX89-NEXT:    s_endpgm
1023;
1024; GFX11-TRUE16-LABEL: fabs_multi_foldable_use_fpext_f16_to_f32:
1025; GFX11-TRUE16:       ; %bb.0: ; %entry
1026; GFX11-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
1027; GFX11-TRUE16-NEXT:    s_mov_b32 s6, -1
1028; GFX11-TRUE16-NEXT:    s_mov_b32 s7, 0x31016000
1029; GFX11-TRUE16-NEXT:    s_mov_b32 s10, s6
1030; GFX11-TRUE16-NEXT:    s_mov_b32 s11, s7
1031; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
1032; GFX11-TRUE16-NEXT:    s_mov_b32 s8, s2
1033; GFX11-TRUE16-NEXT:    s_mov_b32 s9, s3
1034; GFX11-TRUE16-NEXT:    s_mov_b32 s4, s0
1035; GFX11-TRUE16-NEXT:    buffer_load_u16 v0, off, s[8:11], 0
1036; GFX11-TRUE16-NEXT:    s_mov_b32 s5, s1
1037; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
1038; GFX11-TRUE16-NEXT:    v_mul_f16_e64 v0.h, |v0.l|, v0.l
1039; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e64 v1, |v0.l|
1040; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
1041; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v0.h
1042; GFX11-TRUE16-NEXT:    buffer_store_b32 v1, off, s[4:7], 0 dlc
1043; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
1044; GFX11-TRUE16-NEXT:    buffer_store_b16 v0, off, s[4:7], 0 dlc
1045; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
1046; GFX11-TRUE16-NEXT:    s_endpgm
1047;
1048; GFX11-FAKE16-LABEL: fabs_multi_foldable_use_fpext_f16_to_f32:
1049; GFX11-FAKE16:       ; %bb.0: ; %entry
1050; GFX11-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
1051; GFX11-FAKE16-NEXT:    s_mov_b32 s6, -1
1052; GFX11-FAKE16-NEXT:    s_mov_b32 s7, 0x31016000
1053; GFX11-FAKE16-NEXT:    s_mov_b32 s10, s6
1054; GFX11-FAKE16-NEXT:    s_mov_b32 s11, s7
1055; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
1056; GFX11-FAKE16-NEXT:    s_mov_b32 s8, s2
1057; GFX11-FAKE16-NEXT:    s_mov_b32 s9, s3
1058; GFX11-FAKE16-NEXT:    s_mov_b32 s4, s0
1059; GFX11-FAKE16-NEXT:    buffer_load_u16 v0, off, s[8:11], 0
1060; GFX11-FAKE16-NEXT:    s_mov_b32 s5, s1
1061; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
1062; GFX11-FAKE16-NEXT:    v_cvt_f32_f16_e64 v1, |v0|
1063; GFX11-FAKE16-NEXT:    v_mul_f16_e64 v0, |v0|, v0
1064; GFX11-FAKE16-NEXT:    buffer_store_b32 v1, off, s[4:7], 0 dlc
1065; GFX11-FAKE16-NEXT:    s_waitcnt_vscnt null, 0x0
1066; GFX11-FAKE16-NEXT:    buffer_store_b16 v0, off, s[4:7], 0 dlc
1067; GFX11-FAKE16-NEXT:    s_waitcnt_vscnt null, 0x0
1068; GFX11-FAKE16-NEXT:    s_endpgm
1069    ptr addrspace(1) %r,
1070    ptr addrspace(1) %a) {
1071entry:
1072  %a.val = load half, ptr addrspace(1) %a
1073  %a.fabs = call half @llvm.fabs.f16(half %a.val)
1074  %r.val = fpext half %a.fabs to float
1075  %mul = fmul half %a.fabs, %a.val
1076  store volatile float %r.val, ptr addrspace(1) %r
1077  store volatile half %mul, ptr addrspace(1) undef
1078  ret void
1079}
1080
; fneg(fabs(x)) of a loaded half with two users: the fpext to float (stored
; volatile through %r) and a direct volatile store of the negated-abs half
; through an undef pointer, so that value itself must be materialized.
; Expected code, per the assertions below: the SI checks set the f16 sign bit
; with an OR of 0x8000 first and convert the result; the other check prefixes
; convert with a -|abs| source modifier and emit a separate OR of 0x8000 for
; the stored half.
1081define amdgpu_kernel void @fabs_fneg_multi_use_fpext_f16_to_f32(
1082; SI-LABEL: fabs_fneg_multi_use_fpext_f16_to_f32:
1083; SI:       ; %bb.0: ; %entry
1084; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
1085; SI-NEXT:    s_mov_b32 s7, 0xf000
1086; SI-NEXT:    s_mov_b32 s6, -1
1087; SI-NEXT:    s_mov_b32 s10, s6
1088; SI-NEXT:    s_mov_b32 s11, s7
1089; SI-NEXT:    s_waitcnt lgkmcnt(0)
1090; SI-NEXT:    s_mov_b32 s8, s2
1091; SI-NEXT:    s_mov_b32 s9, s3
1092; SI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
1093; SI-NEXT:    s_mov_b32 s4, s0
1094; SI-NEXT:    s_mov_b32 s5, s1
1095; SI-NEXT:    s_waitcnt vmcnt(0)
1096; SI-NEXT:    v_or_b32_e32 v0, 0x8000, v0
1097; SI-NEXT:    v_cvt_f32_f16_e32 v1, v0
1098; SI-NEXT:    buffer_store_dword v1, off, s[4:7], 0
1099; SI-NEXT:    s_waitcnt vmcnt(0)
1100; SI-NEXT:    buffer_store_short v0, off, s[4:7], 0
1101; SI-NEXT:    s_waitcnt vmcnt(0)
1102; SI-NEXT:    s_endpgm
1103;
1104; GFX89-LABEL: fabs_fneg_multi_use_fpext_f16_to_f32:
1105; GFX89:       ; %bb.0: ; %entry
1106; GFX89-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1107; GFX89-NEXT:    s_mov_b32 s7, 0xf000
1108; GFX89-NEXT:    s_mov_b32 s6, -1
1109; GFX89-NEXT:    s_mov_b32 s10, s6
1110; GFX89-NEXT:    s_mov_b32 s11, s7
1111; GFX89-NEXT:    s_waitcnt lgkmcnt(0)
1112; GFX89-NEXT:    s_mov_b32 s8, s2
1113; GFX89-NEXT:    s_mov_b32 s9, s3
1114; GFX89-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
1115; GFX89-NEXT:    s_mov_b32 s4, s0
1116; GFX89-NEXT:    s_mov_b32 s5, s1
1117; GFX89-NEXT:    s_waitcnt vmcnt(0)
1118; GFX89-NEXT:    v_cvt_f32_f16_e64 v1, -|v0|
1119; GFX89-NEXT:    v_or_b32_e32 v0, 0x8000, v0
1120; GFX89-NEXT:    buffer_store_dword v1, off, s[4:7], 0
1121; GFX89-NEXT:    s_waitcnt vmcnt(0)
1122; GFX89-NEXT:    buffer_store_short v0, off, s[4:7], 0
1123; GFX89-NEXT:    s_waitcnt vmcnt(0)
1124; GFX89-NEXT:    s_endpgm
1125;
1126; GFX11-TRUE16-LABEL: fabs_fneg_multi_use_fpext_f16_to_f32:
1127; GFX11-TRUE16:       ; %bb.0: ; %entry
1128; GFX11-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
1129; GFX11-TRUE16-NEXT:    s_mov_b32 s6, -1
1130; GFX11-TRUE16-NEXT:    s_mov_b32 s7, 0x31016000
1131; GFX11-TRUE16-NEXT:    s_mov_b32 s10, s6
1132; GFX11-TRUE16-NEXT:    s_mov_b32 s11, s7
1133; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
1134; GFX11-TRUE16-NEXT:    s_mov_b32 s8, s2
1135; GFX11-TRUE16-NEXT:    s_mov_b32 s9, s3
1136; GFX11-TRUE16-NEXT:    s_mov_b32 s4, s0
1137; GFX11-TRUE16-NEXT:    buffer_load_u16 v1, off, s[8:11], 0
1138; GFX11-TRUE16-NEXT:    s_mov_b32 s5, s1
1139; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
1140; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v1.l
1141; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, 0x8000, v1
1142; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
1143; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e64 v0, -|v0.l|
1144; GFX11-TRUE16-NEXT:    buffer_store_b32 v0, off, s[4:7], 0 dlc
1145; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
1146; GFX11-TRUE16-NEXT:    buffer_store_b16 v1, off, s[4:7], 0 dlc
1147; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
1148; GFX11-TRUE16-NEXT:    s_endpgm
1149;
1150; GFX11-FAKE16-LABEL: fabs_fneg_multi_use_fpext_f16_to_f32:
1151; GFX11-FAKE16:       ; %bb.0: ; %entry
1152; GFX11-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
1153; GFX11-FAKE16-NEXT:    s_mov_b32 s6, -1
1154; GFX11-FAKE16-NEXT:    s_mov_b32 s7, 0x31016000
1155; GFX11-FAKE16-NEXT:    s_mov_b32 s10, s6
1156; GFX11-FAKE16-NEXT:    s_mov_b32 s11, s7
1157; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
1158; GFX11-FAKE16-NEXT:    s_mov_b32 s8, s2
1159; GFX11-FAKE16-NEXT:    s_mov_b32 s9, s3
1160; GFX11-FAKE16-NEXT:    s_mov_b32 s4, s0
1161; GFX11-FAKE16-NEXT:    buffer_load_u16 v0, off, s[8:11], 0
1162; GFX11-FAKE16-NEXT:    s_mov_b32 s5, s1
1163; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
1164; GFX11-FAKE16-NEXT:    v_cvt_f32_f16_e64 v1, -|v0|
1165; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, 0x8000, v0
1166; GFX11-FAKE16-NEXT:    buffer_store_b32 v1, off, s[4:7], 0 dlc
1167; GFX11-FAKE16-NEXT:    s_waitcnt_vscnt null, 0x0
1168; GFX11-FAKE16-NEXT:    buffer_store_b16 v0, off, s[4:7], 0 dlc
1169; GFX11-FAKE16-NEXT:    s_waitcnt_vscnt null, 0x0
1170; GFX11-FAKE16-NEXT:    s_endpgm
1171    ptr addrspace(1) %r,
1172    ptr addrspace(1) %a) {
1173entry:
1174  %a.val = load half, ptr addrspace(1) %a
1175  %a.fabs = call half @llvm.fabs.f16(half %a.val)
1176  %a.fneg.fabs = fsub half -0.0, %a.fabs
1177  %r.val = fpext half %a.fneg.fabs to float
1178  store volatile float %r.val, ptr addrspace(1) %r
1179  store volatile half %a.fneg.fabs, ptr addrspace(1) undef
1180  ret void
1181}
1182
; fneg(fabs(x)) of a loaded half with two users: the fpext to float and an f16
; fmul with the original input. The extended value is stored (volatile)
; through %r and the product is stored (volatile) through an undef pointer.
; Expected code, per the assertions below: the SI checks convert the input to
; f32 once, multiply in f32 with a -|abs| source modifier, convert the product
; back to f16, and set the f32 sign bit (OR 0x80000000) for the stored float;
; the other check prefixes use the -|abs| source modifier on both the
; conversion and the f16 multiply.
1183define amdgpu_kernel void @fabs_fneg_multi_foldable_use_fpext_f16_to_f32(
1184; SI-LABEL: fabs_fneg_multi_foldable_use_fpext_f16_to_f32:
1185; SI:       ; %bb.0: ; %entry
1186; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
1187; SI-NEXT:    s_mov_b32 s7, 0xf000
1188; SI-NEXT:    s_mov_b32 s6, -1
1189; SI-NEXT:    s_mov_b32 s10, s6
1190; SI-NEXT:    s_mov_b32 s11, s7
1191; SI-NEXT:    s_waitcnt lgkmcnt(0)
1192; SI-NEXT:    s_mov_b32 s8, s2
1193; SI-NEXT:    s_mov_b32 s9, s3
1194; SI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
1195; SI-NEXT:    s_mov_b32 s4, s0
1196; SI-NEXT:    s_mov_b32 s5, s1
1197; SI-NEXT:    s_waitcnt vmcnt(0)
1198; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
1199; SI-NEXT:    v_mul_f32_e64 v1, -|v0|, v0
1200; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
1201; SI-NEXT:    v_or_b32_e32 v0, 0x80000000, v0
1202; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1203; SI-NEXT:    s_waitcnt vmcnt(0)
1204; SI-NEXT:    buffer_store_short v1, off, s[4:7], 0
1205; SI-NEXT:    s_waitcnt vmcnt(0)
1206; SI-NEXT:    s_endpgm
1207;
1208; GFX89-LABEL: fabs_fneg_multi_foldable_use_fpext_f16_to_f32:
1209; GFX89:       ; %bb.0: ; %entry
1210; GFX89-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1211; GFX89-NEXT:    s_mov_b32 s7, 0xf000
1212; GFX89-NEXT:    s_mov_b32 s6, -1
1213; GFX89-NEXT:    s_mov_b32 s10, s6
1214; GFX89-NEXT:    s_mov_b32 s11, s7
1215; GFX89-NEXT:    s_waitcnt lgkmcnt(0)
1216; GFX89-NEXT:    s_mov_b32 s8, s2
1217; GFX89-NEXT:    s_mov_b32 s9, s3
1218; GFX89-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
1219; GFX89-NEXT:    s_mov_b32 s4, s0
1220; GFX89-NEXT:    s_mov_b32 s5, s1
1221; GFX89-NEXT:    s_waitcnt vmcnt(0)
1222; GFX89-NEXT:    v_cvt_f32_f16_e64 v1, -|v0|
1223; GFX89-NEXT:    v_mul_f16_e64 v0, -|v0|, v0
1224; GFX89-NEXT:    buffer_store_dword v1, off, s[4:7], 0
1225; GFX89-NEXT:    s_waitcnt vmcnt(0)
1226; GFX89-NEXT:    buffer_store_short v0, off, s[4:7], 0
1227; GFX89-NEXT:    s_waitcnt vmcnt(0)
1228; GFX89-NEXT:    s_endpgm
1229;
1230; GFX11-TRUE16-LABEL: fabs_fneg_multi_foldable_use_fpext_f16_to_f32:
1231; GFX11-TRUE16:       ; %bb.0: ; %entry
1232; GFX11-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
1233; GFX11-TRUE16-NEXT:    s_mov_b32 s6, -1
1234; GFX11-TRUE16-NEXT:    s_mov_b32 s7, 0x31016000
1235; GFX11-TRUE16-NEXT:    s_mov_b32 s10, s6
1236; GFX11-TRUE16-NEXT:    s_mov_b32 s11, s7
1237; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
1238; GFX11-TRUE16-NEXT:    s_mov_b32 s8, s2
1239; GFX11-TRUE16-NEXT:    s_mov_b32 s9, s3
1240; GFX11-TRUE16-NEXT:    s_mov_b32 s4, s0
1241; GFX11-TRUE16-NEXT:    buffer_load_u16 v0, off, s[8:11], 0
1242; GFX11-TRUE16-NEXT:    s_mov_b32 s5, s1
1243; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
1244; GFX11-TRUE16-NEXT:    v_mul_f16_e64 v0.h, -|v0.l|, v0.l
1245; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e64 v1, -|v0.l|
1246; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
1247; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v0.h
1248; GFX11-TRUE16-NEXT:    buffer_store_b32 v1, off, s[4:7], 0 dlc
1249; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
1250; GFX11-TRUE16-NEXT:    buffer_store_b16 v0, off, s[4:7], 0 dlc
1251; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
1252; GFX11-TRUE16-NEXT:    s_endpgm
1253;
1254; GFX11-FAKE16-LABEL: fabs_fneg_multi_foldable_use_fpext_f16_to_f32:
1255; GFX11-FAKE16:       ; %bb.0: ; %entry
1256; GFX11-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
1257; GFX11-FAKE16-NEXT:    s_mov_b32 s6, -1
1258; GFX11-FAKE16-NEXT:    s_mov_b32 s7, 0x31016000
1259; GFX11-FAKE16-NEXT:    s_mov_b32 s10, s6
1260; GFX11-FAKE16-NEXT:    s_mov_b32 s11, s7
1261; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
1262; GFX11-FAKE16-NEXT:    s_mov_b32 s8, s2
1263; GFX11-FAKE16-NEXT:    s_mov_b32 s9, s3
1264; GFX11-FAKE16-NEXT:    s_mov_b32 s4, s0
1265; GFX11-FAKE16-NEXT:    buffer_load_u16 v0, off, s[8:11], 0
1266; GFX11-FAKE16-NEXT:    s_mov_b32 s5, s1
1267; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
1268; GFX11-FAKE16-NEXT:    v_cvt_f32_f16_e64 v1, -|v0|
1269; GFX11-FAKE16-NEXT:    v_mul_f16_e64 v0, -|v0|, v0
1270; GFX11-FAKE16-NEXT:    buffer_store_b32 v1, off, s[4:7], 0 dlc
1271; GFX11-FAKE16-NEXT:    s_waitcnt_vscnt null, 0x0
1272; GFX11-FAKE16-NEXT:    buffer_store_b16 v0, off, s[4:7], 0 dlc
1273; GFX11-FAKE16-NEXT:    s_waitcnt_vscnt null, 0x0
1274; GFX11-FAKE16-NEXT:    s_endpgm
1275    ptr addrspace(1) %r,
1276    ptr addrspace(1) %a) {
1277entry:
1278  %a.val = load half, ptr addrspace(1) %a
1279  %a.fabs = call half @llvm.fabs.f16(half %a.val)
1280  %a.fneg.fabs = fsub half -0.0, %a.fabs
1281  %r.val = fpext half %a.fneg.fabs to float
1282  %mul = fmul half %a.fneg.fabs, %a.val
1283  store volatile float %r.val, ptr addrspace(1) %r
1284  store volatile half %mul, ptr addrspace(1) undef
1285  ret void
1286}
1287
; Declaration of the half-precision fabs intrinsic called by the fabs tests in
; this file. It references attribute group #1, defined just below as
; nounwind readnone.
1288declare half @llvm.fabs.f16(half) #1
1289
1290attributes #1 = { nounwind readnone }
1291;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
1292; GFX9: {{.*}}
1293; VI: {{.*}}
1294