xref: /llvm-project/llvm/test/CodeGen/AMDGPU/fpext-free.ll (revision 9e9907f1cfa424366fba58d9520f9305b537cec9)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
2; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -denormal-fp-math-f32=preserve-sign -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-F32FLUSH %s
3; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -denormal-fp-math-f32=ieee -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-F32DENORM %s
4; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -denormal-fp-math-f32=preserve-sign -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX89,GFX9-F32FLUSH %s
5; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -denormal-fp-math-f32=ieee -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX89,GFX9-F32DENORM %s
6; RUN: llc -mtriple=amdgcn -mcpu=gfx803 -denormal-fp-math-f32=preserve-sign -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX89 %s
7; RUN: llc -mtriple=amdgcn -mcpu=gfx803 -denormal-fp-math-f32=ieee -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX89 %s
8
9;  fold (fadd (fpext (fmul x, y)), z) -> (fma (fpext x), (fpext y), z)
; Adds an f32 value to the fpext of an f16 product; the fmul carries no
; fast-math flags. Expected code: gfx900 with flushed f32 denormals emits a
; single v_mad_mix_f32, while gfx1100 and gfx900 with IEEE f32 denormals emit
; separate v_mul_f16, v_cvt_f32_f16 and v_add_f32. No assertions are recorded
; for the gfx803 runs in this function.
10define float @fadd_fpext_fmul_f16_to_f32(half %x, half %y, float %z) #0 {
11; GFX11-LABEL: fadd_fpext_fmul_f16_to_f32:
12; GFX11:       ; %bb.0: ; %entry
13; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
14; GFX11-NEXT:    v_mul_f16_e32 v0, v0, v1
15; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
16; GFX11-NEXT:    v_cvt_f32_f16_e32 v0, v0
17; GFX11-NEXT:    v_add_f32_e32 v0, v0, v2
18; GFX11-NEXT:    s_setpc_b64 s[30:31]
19;
20; GFX9-F32FLUSH-LABEL: fadd_fpext_fmul_f16_to_f32:
21; GFX9-F32FLUSH:       ; %bb.0: ; %entry
22; GFX9-F32FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
23; GFX9-F32FLUSH-NEXT:    v_mad_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,0]
24; GFX9-F32FLUSH-NEXT:    s_setpc_b64 s[30:31]
25;
26; GFX9-F32DENORM-LABEL: fadd_fpext_fmul_f16_to_f32:
27; GFX9-F32DENORM:       ; %bb.0: ; %entry
28; GFX9-F32DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
29; GFX9-F32DENORM-NEXT:    v_mul_f16_e32 v0, v0, v1
30; GFX9-F32DENORM-NEXT:    v_cvt_f32_f16_e32 v0, v0
31; GFX9-F32DENORM-NEXT:    v_add_f32_e32 v0, v0, v2
32; GFX9-F32DENORM-NEXT:    s_setpc_b64 s[30:31]
33entry:
34  %mul = fmul half %x, %y
35  %mul.ext = fpext half %mul to float
36  %add = fadd float %mul.ext, %z
37  ret float %add
38}
39
40; f16->f64 is not free.
; Same fmul/fpext/fadd shape, but the f16 product is extended to double.
; Every checked target is expected to multiply in f16, convert f16 -> f32 ->
; f64 with two v_cvt instructions, and finish with v_add_f64; no fused
; multiply-add appears in the expected output.
41define double @fadd_fpext_fmul_f16_to_f64(half %x, half %y, double %z) #0 {
42; GFX11-LABEL: fadd_fpext_fmul_f16_to_f64:
43; GFX11:       ; %bb.0: ; %entry
44; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
45; GFX11-NEXT:    v_mul_f16_e32 v0, v0, v1
46; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
47; GFX11-NEXT:    v_cvt_f32_f16_e32 v0, v0
48; GFX11-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
49; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
50; GFX11-NEXT:    v_add_f64 v[0:1], v[0:1], v[2:3]
51; GFX11-NEXT:    s_setpc_b64 s[30:31]
52;
53; GFX89-LABEL: fadd_fpext_fmul_f16_to_f64:
54; GFX89:       ; %bb.0: ; %entry
55; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
56; GFX89-NEXT:    v_mul_f16_e32 v0, v0, v1
57; GFX89-NEXT:    v_cvt_f32_f16_e32 v0, v0
58; GFX89-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
59; GFX89-NEXT:    v_add_f64 v[0:1], v[0:1], v[2:3]
60; GFX89-NEXT:    s_setpc_b64 s[30:31]
61entry:
62  %mul = fmul half %x, %y
63  %mul.ext = fpext half %mul to double
64  %add = fadd double %mul.ext, %z
65  ret double %add
66}
67
68; f32->f64 is not free.
; An f32 product extended to double and added to a double. Every checked
; target is expected to emit v_mul_f32, v_cvt_f64_f32 and v_add_f64 as three
; separate instructions.
69define double @fadd_fpext_fmul_f32_to_f64(float %x, float %y, double %z) #0 {
70; GFX11-LABEL: fadd_fpext_fmul_f32_to_f64:
71; GFX11:       ; %bb.0: ; %entry
72; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
73; GFX11-NEXT:    v_mul_f32_e32 v0, v0, v1
74; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
75; GFX11-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
76; GFX11-NEXT:    v_add_f64 v[0:1], v[0:1], v[2:3]
77; GFX11-NEXT:    s_setpc_b64 s[30:31]
78;
79; GFX89-LABEL: fadd_fpext_fmul_f32_to_f64:
80; GFX89:       ; %bb.0: ; %entry
81; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
82; GFX89-NEXT:    v_mul_f32_e32 v0, v0, v1
83; GFX89-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
84; GFX89-NEXT:    v_add_f64 v[0:1], v[0:1], v[2:3]
85; GFX89-NEXT:    s_setpc_b64 s[30:31]
86entry:
87  %mul = fmul float %x, %y
88  %mul.ext = fpext float %mul to double
89  %add = fadd double %mul.ext, %z
90  ret double %add
91}
92
93; fold (fadd x, (fpext (fmul y, z))) -> (fma (fpext y), (fpext z), x)
; Operand-swapped variant: the extended f16 product is the second fadd
; operand. gfx900 with flushed f32 denormals still expects a single
; v_mad_mix_f32; the unfused expectations (gfx1100, and gfx900 with IEEE f32
; denormals) list v2 as the first v_add_f32 source.
94define float @fadd_fpext_fmul_f16_to_f32_commute(half %x, half %y, float %z) #0 {
95; GFX11-LABEL: fadd_fpext_fmul_f16_to_f32_commute:
96; GFX11:       ; %bb.0: ; %entry
97; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
98; GFX11-NEXT:    v_mul_f16_e32 v0, v0, v1
99; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
100; GFX11-NEXT:    v_cvt_f32_f16_e32 v0, v0
101; GFX11-NEXT:    v_add_f32_e32 v0, v2, v0
102; GFX11-NEXT:    s_setpc_b64 s[30:31]
103;
104; GFX9-F32FLUSH-LABEL: fadd_fpext_fmul_f16_to_f32_commute:
105; GFX9-F32FLUSH:       ; %bb.0: ; %entry
106; GFX9-F32FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
107; GFX9-F32FLUSH-NEXT:    v_mad_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,0]
108; GFX9-F32FLUSH-NEXT:    s_setpc_b64 s[30:31]
109;
110; GFX9-F32DENORM-LABEL: fadd_fpext_fmul_f16_to_f32_commute:
111; GFX9-F32DENORM:       ; %bb.0: ; %entry
112; GFX9-F32DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
113; GFX9-F32DENORM-NEXT:    v_mul_f16_e32 v0, v0, v1
114; GFX9-F32DENORM-NEXT:    v_cvt_f32_f16_e32 v0, v0
115; GFX9-F32DENORM-NEXT:    v_add_f32_e32 v0, v2, v0
116; GFX9-F32DENORM-NEXT:    s_setpc_b64 s[30:31]
117entry:
118  %mul = fmul half %x, %y
119  %mul.ext = fpext half %mul to float
120  %add = fadd float %z, %mul.ext
121  ret float %add
122}
123
124; fold (fadd (fma x, y, (fpext (fmul u, v))), z)
125;   -> (fma x, y, (fma (fpext u), (fpext v), z))
; llvm.fmuladd.f32(x, y, fpext(u * v)) followed by an fadd with z.
; Expected code:
;  - gfx900, flushed f32 denormals - v_mad_mix_f32 followed by v_mac_f32 and
;    a v_mov_b32 of the result into v0.
;  - gfx1100 - v_mul_f16, then v_fma_mix_f32 with no separate v_cvt_f32_f16,
;    then v_add_f32.
;  - gfx900, IEEE f32 denormals - v_mul_f16, v_cvt_f32_f16, v_fma_f32,
;    v_add_f32.
126define float @fadd_muladd_fpext_fmul_f16_to_f32(float %x, float %y, half %u, half %v, float %z) #0 {
127; GFX11-LABEL: fadd_muladd_fpext_fmul_f16_to_f32:
128; GFX11:       ; %bb.0: ; %entry
129; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
130; GFX11-NEXT:    v_mul_f16_e32 v2, v2, v3
131; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
132; GFX11-NEXT:    v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[0,0,1]
133; GFX11-NEXT:    v_add_f32_e32 v0, v0, v4
134; GFX11-NEXT:    s_setpc_b64 s[30:31]
135;
136; GFX9-F32FLUSH-LABEL: fadd_muladd_fpext_fmul_f16_to_f32:
137; GFX9-F32FLUSH:       ; %bb.0: ; %entry
138; GFX9-F32FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
139; GFX9-F32FLUSH-NEXT:    v_mad_mix_f32 v2, v2, v3, v4 op_sel_hi:[1,1,0]
140; GFX9-F32FLUSH-NEXT:    v_mac_f32_e32 v2, v0, v1
141; GFX9-F32FLUSH-NEXT:    v_mov_b32_e32 v0, v2
142; GFX9-F32FLUSH-NEXT:    s_setpc_b64 s[30:31]
143;
144; GFX9-F32DENORM-LABEL: fadd_muladd_fpext_fmul_f16_to_f32:
145; GFX9-F32DENORM:       ; %bb.0: ; %entry
146; GFX9-F32DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
147; GFX9-F32DENORM-NEXT:    v_mul_f16_e32 v2, v2, v3
148; GFX9-F32DENORM-NEXT:    v_cvt_f32_f16_e32 v2, v2
149; GFX9-F32DENORM-NEXT:    v_fma_f32 v0, v0, v1, v2
150; GFX9-F32DENORM-NEXT:    v_add_f32_e32 v0, v0, v4
151; GFX9-F32DENORM-NEXT:    s_setpc_b64 s[30:31]
152entry:
153  %mul = fmul half %u, %v
154  %mul.ext = fpext half %mul to float
155  %fma = call float @llvm.fmuladd.f32(float %x, float %y, float %mul.ext)
156  %add = fadd float %fma, %z
157  ret float %add
158}
159
160; fold (fadd x, (fma y, z, (fpext (fmul u, v))))
161;   -> (fma y, z, (fma (fpext u), (fpext v), x))
; Operand-swapped variant of the fmuladd + fadd test: z is the first fadd
; operand and the llvm.fmuladd.f32 result is the second. gfx900 with flushed
; f32 denormals expects v_mad_mix_f32, v_mac_f32 and v_mov_b32; the other
; checked outputs end in a v_add_f32 whose first source is v4.
162define float @fadd_muladd_fpext_fmul_f16_to_f32_commute(float %x, float %y, half %u, half %v, float %z) #0 {
163; GFX11-LABEL: fadd_muladd_fpext_fmul_f16_to_f32_commute:
164; GFX11:       ; %bb.0: ; %entry
165; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
166; GFX11-NEXT:    v_mul_f16_e32 v2, v2, v3
167; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
168; GFX11-NEXT:    v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[0,0,1]
169; GFX11-NEXT:    v_add_f32_e32 v0, v4, v0
170; GFX11-NEXT:    s_setpc_b64 s[30:31]
171;
172; GFX9-F32FLUSH-LABEL: fadd_muladd_fpext_fmul_f16_to_f32_commute:
173; GFX9-F32FLUSH:       ; %bb.0: ; %entry
174; GFX9-F32FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
175; GFX9-F32FLUSH-NEXT:    v_mad_mix_f32 v2, v2, v3, v4 op_sel_hi:[1,1,0]
176; GFX9-F32FLUSH-NEXT:    v_mac_f32_e32 v2, v0, v1
177; GFX9-F32FLUSH-NEXT:    v_mov_b32_e32 v0, v2
178; GFX9-F32FLUSH-NEXT:    s_setpc_b64 s[30:31]
179;
180; GFX9-F32DENORM-LABEL: fadd_muladd_fpext_fmul_f16_to_f32_commute:
181; GFX9-F32DENORM:       ; %bb.0: ; %entry
182; GFX9-F32DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
183; GFX9-F32DENORM-NEXT:    v_mul_f16_e32 v2, v2, v3
184; GFX9-F32DENORM-NEXT:    v_cvt_f32_f16_e32 v2, v2
185; GFX9-F32DENORM-NEXT:    v_fma_f32 v0, v0, v1, v2
186; GFX9-F32DENORM-NEXT:    v_add_f32_e32 v0, v4, v0
187; GFX9-F32DENORM-NEXT:    s_setpc_b64 s[30:31]
188entry:
189  %mul = fmul half %u, %v
190  %mul.ext = fpext half %mul to float
191  %fma = call float @llvm.fmuladd.f32(float %x, float %y, float %mul.ext)
192  %add = fadd float %z, %fma
193  ret float %add
194}
195
; Multiply-add written as a separate `fmul contract` and `fadd contract`
; instead of an intrinsic call; its third term is the fpext of an f16 product
; that has no fast-math flags, and the final fadd with z has none either.
; The expected instruction sequences are the same as those recorded for
; fadd_muladd_fpext_fmul_f16_to_f32.
196define float @fadd_fmad_fpext_fmul_f16_to_f32(float %x, float %y, half %u, half %v, float %z) #0 {
197; GFX11-LABEL: fadd_fmad_fpext_fmul_f16_to_f32:
198; GFX11:       ; %bb.0: ; %entry
199; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
200; GFX11-NEXT:    v_mul_f16_e32 v2, v2, v3
201; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
202; GFX11-NEXT:    v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[0,0,1]
203; GFX11-NEXT:    v_add_f32_e32 v0, v0, v4
204; GFX11-NEXT:    s_setpc_b64 s[30:31]
205;
206; GFX9-F32FLUSH-LABEL: fadd_fmad_fpext_fmul_f16_to_f32:
207; GFX9-F32FLUSH:       ; %bb.0: ; %entry
208; GFX9-F32FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
209; GFX9-F32FLUSH-NEXT:    v_mad_mix_f32 v2, v2, v3, v4 op_sel_hi:[1,1,0]
210; GFX9-F32FLUSH-NEXT:    v_mac_f32_e32 v2, v0, v1
211; GFX9-F32FLUSH-NEXT:    v_mov_b32_e32 v0, v2
212; GFX9-F32FLUSH-NEXT:    s_setpc_b64 s[30:31]
213;
214; GFX9-F32DENORM-LABEL: fadd_fmad_fpext_fmul_f16_to_f32:
215; GFX9-F32DENORM:       ; %bb.0: ; %entry
216; GFX9-F32DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
217; GFX9-F32DENORM-NEXT:    v_mul_f16_e32 v2, v2, v3
218; GFX9-F32DENORM-NEXT:    v_cvt_f32_f16_e32 v2, v2
219; GFX9-F32DENORM-NEXT:    v_fma_f32 v0, v0, v1, v2
220; GFX9-F32DENORM-NEXT:    v_add_f32_e32 v0, v0, v4
221; GFX9-F32DENORM-NEXT:    s_setpc_b64 s[30:31]
222entry:
223  %mul = fmul half %u, %v
224  %mul.ext = fpext half %mul to float
225  %mul1 = fmul contract float %x, %y
226  %fmad = fadd contract float %mul1, %mul.ext
227  %add = fadd float %fmad, %z
228  ret float %add
229}
230
231; fold (fadd (fma x, y, (fpext (fmul u, v))), z)
232;   -> (fma x, y, (fma (fpext u), (fpext v), z))
; Uses the llvm.fma.f32 intrinsic (rather than llvm.fmuladd) with the fpext
; of an f16 `fmul contract` as its addend, then adds z with a plain fadd.
; gfx900 with flushed f32 denormals expects v_mad_mix_f32, v_mac_f32 and
; v_mov_b32; gfx1100 expects v_mul_f16, v_fma_mix_f32, v_add_f32; gfx900 with
; IEEE f32 denormals expects v_mul_f16, v_cvt_f32_f16, v_fma_f32, v_add_f32.
233define float @fadd_fma_fpext_fmul_f16_to_f32(float %x, float %y, half %u, half %v, float %z) #0 {
234; GFX11-LABEL: fadd_fma_fpext_fmul_f16_to_f32:
235; GFX11:       ; %bb.0: ; %entry
236; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
237; GFX11-NEXT:    v_mul_f16_e32 v2, v2, v3
238; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
239; GFX11-NEXT:    v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[0,0,1]
240; GFX11-NEXT:    v_add_f32_e32 v0, v0, v4
241; GFX11-NEXT:    s_setpc_b64 s[30:31]
242;
243; GFX9-F32FLUSH-LABEL: fadd_fma_fpext_fmul_f16_to_f32:
244; GFX9-F32FLUSH:       ; %bb.0: ; %entry
245; GFX9-F32FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
246; GFX9-F32FLUSH-NEXT:    v_mad_mix_f32 v2, v2, v3, v4 op_sel_hi:[1,1,0]
247; GFX9-F32FLUSH-NEXT:    v_mac_f32_e32 v2, v0, v1
248; GFX9-F32FLUSH-NEXT:    v_mov_b32_e32 v0, v2
249; GFX9-F32FLUSH-NEXT:    s_setpc_b64 s[30:31]
250;
251; GFX9-F32DENORM-LABEL: fadd_fma_fpext_fmul_f16_to_f32:
252; GFX9-F32DENORM:       ; %bb.0: ; %entry
253; GFX9-F32DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
254; GFX9-F32DENORM-NEXT:    v_mul_f16_e32 v2, v2, v3
255; GFX9-F32DENORM-NEXT:    v_cvt_f32_f16_e32 v2, v2
256; GFX9-F32DENORM-NEXT:    v_fma_f32 v0, v0, v1, v2
257; GFX9-F32DENORM-NEXT:    v_add_f32_e32 v0, v0, v4
258; GFX9-F32DENORM-NEXT:    s_setpc_b64 s[30:31]
259entry:
260  %mul = fmul contract half %u, %v
261  %mul.ext = fpext half %mul to float
262  %fma = call float @llvm.fma.f32(float %x, float %y, float %mul.ext)
263  %add = fadd float %fma, %z
264  ret float %add
265}
266
; Operand-swapped variant of the llvm.fma.f32 + fadd test: z is the first
; fadd operand and the fma result is the second. gfx900 with flushed f32
; denormals expects v_mad_mix_f32, v_mac_f32 and v_mov_b32; the other checked
; outputs end in a v_add_f32 whose first source is v4.
267define float @fadd_fma_fpext_fmul_f16_to_f32_commute(float %x, float %y, half %u, half %v, float %z) #0 {
268; GFX11-LABEL: fadd_fma_fpext_fmul_f16_to_f32_commute:
269; GFX11:       ; %bb.0: ; %entry
270; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
271; GFX11-NEXT:    v_mul_f16_e32 v2, v2, v3
272; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
273; GFX11-NEXT:    v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[0,0,1]
274; GFX11-NEXT:    v_add_f32_e32 v0, v4, v0
275; GFX11-NEXT:    s_setpc_b64 s[30:31]
276;
277; GFX9-F32FLUSH-LABEL: fadd_fma_fpext_fmul_f16_to_f32_commute:
278; GFX9-F32FLUSH:       ; %bb.0: ; %entry
279; GFX9-F32FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
280; GFX9-F32FLUSH-NEXT:    v_mad_mix_f32 v2, v2, v3, v4 op_sel_hi:[1,1,0]
281; GFX9-F32FLUSH-NEXT:    v_mac_f32_e32 v2, v0, v1
282; GFX9-F32FLUSH-NEXT:    v_mov_b32_e32 v0, v2
283; GFX9-F32FLUSH-NEXT:    s_setpc_b64 s[30:31]
284;
285; GFX9-F32DENORM-LABEL: fadd_fma_fpext_fmul_f16_to_f32_commute:
286; GFX9-F32DENORM:       ; %bb.0: ; %entry
287; GFX9-F32DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
288; GFX9-F32DENORM-NEXT:    v_mul_f16_e32 v2, v2, v3
289; GFX9-F32DENORM-NEXT:    v_cvt_f32_f16_e32 v2, v2
290; GFX9-F32DENORM-NEXT:    v_fma_f32 v0, v0, v1, v2
291; GFX9-F32DENORM-NEXT:    v_add_f32_e32 v0, v4, v0
292; GFX9-F32DENORM-NEXT:    s_setpc_b64 s[30:31]
293entry:
294  %mul = fmul contract half %u, %v
295  %mul.ext = fpext half %mul to float
296  %fma = call float @llvm.fma.f32(float %x, float %y, float %mul.ext)
297  %add = fadd float %z, %fma
298  ret float %add
299}
300
301; fold (fadd x, (fpext (fma y, z, (fmul u, v))))
302;   -> (fma (fpext y), (fpext z), (fma (fpext u), (fpext v), x))
; The multiply-add is done in f16 (llvm.fmuladd.f16 with an `fmul contract`
; addend), its result is extended to f32 and then added to x.
; Expected code:
;  - gfx900, flushed f32 denormals - two chained v_mad_mix_f32 instructions.
;  - gfx1100 - v_mul_f16, v_fmac_f16, v_cvt_f32_f16, v_add_f32.
;  - gfx900, IEEE f32 denormals - v_mul_f16, v_fma_f16, v_cvt_f32_f16,
;    v_add_f32.
303define float @fadd_fpext_fmuladd_f16_to_f32(float %x, half %y, half %z, half %u, half %v) #0 {
304; GFX11-LABEL: fadd_fpext_fmuladd_f16_to_f32:
305; GFX11:       ; %bb.0: ; %entry
306; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
307; GFX11-NEXT:    v_mul_f16_e32 v3, v3, v4
308; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
309; GFX11-NEXT:    v_fmac_f16_e32 v3, v1, v2
310; GFX11-NEXT:    v_cvt_f32_f16_e32 v1, v3
311; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
312; GFX11-NEXT:    v_add_f32_e32 v0, v0, v1
313; GFX11-NEXT:    s_setpc_b64 s[30:31]
314;
315; GFX9-F32FLUSH-LABEL: fadd_fpext_fmuladd_f16_to_f32:
316; GFX9-F32FLUSH:       ; %bb.0: ; %entry
317; GFX9-F32FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
318; GFX9-F32FLUSH-NEXT:    v_mad_mix_f32 v0, v3, v4, v0 op_sel_hi:[1,1,0]
319; GFX9-F32FLUSH-NEXT:    v_mad_mix_f32 v0, v1, v2, v0 op_sel_hi:[1,1,0]
320; GFX9-F32FLUSH-NEXT:    s_setpc_b64 s[30:31]
321;
322; GFX9-F32DENORM-LABEL: fadd_fpext_fmuladd_f16_to_f32:
323; GFX9-F32DENORM:       ; %bb.0: ; %entry
324; GFX9-F32DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
325; GFX9-F32DENORM-NEXT:    v_mul_f16_e32 v3, v3, v4
326; GFX9-F32DENORM-NEXT:    v_fma_f16 v1, v1, v2, v3
327; GFX9-F32DENORM-NEXT:    v_cvt_f32_f16_e32 v1, v1
328; GFX9-F32DENORM-NEXT:    v_add_f32_e32 v0, v0, v1
329; GFX9-F32DENORM-NEXT:    s_setpc_b64 s[30:31]
330entry:
331  %mul = fmul contract half %u, %v
332  %fma = call half @llvm.fmuladd.f16(half %y, half %z, half %mul)
333  %ext.fma = fpext half %fma to float
334  %add = fadd float %x, %ext.fma
335  ret float %add
336}
337
; Same shape as the f16 fmuladd-then-fpext test, but calling llvm.fma.f16.
; gfx900 with flushed f32 denormals expects two chained v_mad_mix_f32
; instructions; gfx1100 expects v_mul_f16, v_fmac_f16, v_cvt_f32_f16,
; v_add_f32; gfx900 with IEEE f32 denormals expects v_mul_f16, v_fma_f16,
; v_cvt_f32_f16, v_add_f32.
338define float @fadd_fpext_fma_f16_to_f32(float %x, half %y, half %z, half %u, half %v) #0 {
339; GFX11-LABEL: fadd_fpext_fma_f16_to_f32:
340; GFX11:       ; %bb.0: ; %entry
341; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
342; GFX11-NEXT:    v_mul_f16_e32 v3, v3, v4
343; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
344; GFX11-NEXT:    v_fmac_f16_e32 v3, v1, v2
345; GFX11-NEXT:    v_cvt_f32_f16_e32 v1, v3
346; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
347; GFX11-NEXT:    v_add_f32_e32 v0, v0, v1
348; GFX11-NEXT:    s_setpc_b64 s[30:31]
349;
350; GFX9-F32FLUSH-LABEL: fadd_fpext_fma_f16_to_f32:
351; GFX9-F32FLUSH:       ; %bb.0: ; %entry
352; GFX9-F32FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
353; GFX9-F32FLUSH-NEXT:    v_mad_mix_f32 v0, v3, v4, v0 op_sel_hi:[1,1,0]
354; GFX9-F32FLUSH-NEXT:    v_mad_mix_f32 v0, v1, v2, v0 op_sel_hi:[1,1,0]
355; GFX9-F32FLUSH-NEXT:    s_setpc_b64 s[30:31]
356;
357; GFX9-F32DENORM-LABEL: fadd_fpext_fma_f16_to_f32:
358; GFX9-F32DENORM:       ; %bb.0: ; %entry
359; GFX9-F32DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
360; GFX9-F32DENORM-NEXT:    v_mul_f16_e32 v3, v3, v4
361; GFX9-F32DENORM-NEXT:    v_fma_f16 v1, v1, v2, v3
362; GFX9-F32DENORM-NEXT:    v_cvt_f32_f16_e32 v1, v1
363; GFX9-F32DENORM-NEXT:    v_add_f32_e32 v0, v0, v1
364; GFX9-F32DENORM-NEXT:    s_setpc_b64 s[30:31]
365entry:
366  %mul = fmul contract half %u, %v
367  %fma = call half @llvm.fma.f16(half %y, half %z, half %mul)
368  %ext.fma = fpext half %fma to float
369  %add = fadd float %x, %ext.fma
370  ret float %add
371}
372
; Operand-swapped variant of the llvm.fma.f16-then-fpext test: the extended
; fma result is the first fadd operand and x is the second. gfx900 with
; flushed f32 denormals expects two chained v_mad_mix_f32 instructions; the
; other checked outputs end in `v_add_f32 v0, v1, v0`.
373define float @fadd_fpext_fma_f16_to_f32_commute(float %x, half %y, half %z, half %u, half %v) #0 {
374; GFX11-LABEL: fadd_fpext_fma_f16_to_f32_commute:
375; GFX11:       ; %bb.0: ; %entry
376; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
377; GFX11-NEXT:    v_mul_f16_e32 v3, v3, v4
378; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
379; GFX11-NEXT:    v_fmac_f16_e32 v3, v1, v2
380; GFX11-NEXT:    v_cvt_f32_f16_e32 v1, v3
381; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
382; GFX11-NEXT:    v_add_f32_e32 v0, v1, v0
383; GFX11-NEXT:    s_setpc_b64 s[30:31]
384;
385; GFX9-F32FLUSH-LABEL: fadd_fpext_fma_f16_to_f32_commute:
386; GFX9-F32FLUSH:       ; %bb.0: ; %entry
387; GFX9-F32FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
388; GFX9-F32FLUSH-NEXT:    v_mad_mix_f32 v0, v3, v4, v0 op_sel_hi:[1,1,0]
389; GFX9-F32FLUSH-NEXT:    v_mad_mix_f32 v0, v1, v2, v0 op_sel_hi:[1,1,0]
390; GFX9-F32FLUSH-NEXT:    s_setpc_b64 s[30:31]
391;
392; GFX9-F32DENORM-LABEL: fadd_fpext_fma_f16_to_f32_commute:
393; GFX9-F32DENORM:       ; %bb.0: ; %entry
394; GFX9-F32DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
395; GFX9-F32DENORM-NEXT:    v_mul_f16_e32 v3, v3, v4
396; GFX9-F32DENORM-NEXT:    v_fma_f16 v1, v1, v2, v3
397; GFX9-F32DENORM-NEXT:    v_cvt_f32_f16_e32 v1, v1
398; GFX9-F32DENORM-NEXT:    v_add_f32_e32 v0, v1, v0
399; GFX9-F32DENORM-NEXT:    s_setpc_b64 s[30:31]
400entry:
401  %mul = fmul contract half %u, %v
402  %fma = call half @llvm.fma.f16(half %y, half %z, half %mul)
403  %ext.fma = fpext half %fma to float
404  %add = fadd float %ext.fma, %x
405  ret float %add
406}
407
408; fold (fsub (fpext (fmul x, y)), z)
409;   -> (fma (fpext x), (fpext y), (fneg z))
; Subtracts z from the fpext of an f16 product; neither instruction carries
; fast-math flags. gfx900 with flushed f32 denormals expects a single
; v_mad_mix_f32 with a negated addend; gfx1100 and gfx900 with IEEE f32
; denormals expect v_mul_f16, v_cvt_f32_f16 and v_sub_f32.
410define float @fsub_fpext_fmul_f16_to_f32(half %x, half %y, float %z) #0 {
411; GFX11-LABEL: fsub_fpext_fmul_f16_to_f32:
412; GFX11:       ; %bb.0: ; %entry
413; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
414; GFX11-NEXT:    v_mul_f16_e32 v0, v0, v1
415; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
416; GFX11-NEXT:    v_cvt_f32_f16_e32 v0, v0
417; GFX11-NEXT:    v_sub_f32_e32 v0, v0, v2
418; GFX11-NEXT:    s_setpc_b64 s[30:31]
419;
420; GFX9-F32FLUSH-LABEL: fsub_fpext_fmul_f16_to_f32:
421; GFX9-F32FLUSH:       ; %bb.0: ; %entry
422; GFX9-F32FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
423; GFX9-F32FLUSH-NEXT:    v_mad_mix_f32 v0, v0, v1, -v2 op_sel_hi:[1,1,0]
424; GFX9-F32FLUSH-NEXT:    s_setpc_b64 s[30:31]
425;
426; GFX9-F32DENORM-LABEL: fsub_fpext_fmul_f16_to_f32:
427; GFX9-F32DENORM:       ; %bb.0: ; %entry
428; GFX9-F32DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
429; GFX9-F32DENORM-NEXT:    v_mul_f16_e32 v0, v0, v1
430; GFX9-F32DENORM-NEXT:    v_cvt_f32_f16_e32 v0, v0
431; GFX9-F32DENORM-NEXT:    v_sub_f32_e32 v0, v0, v2
432; GFX9-F32DENORM-NEXT:    s_setpc_b64 s[30:31]
433entry:
434  %mul = fmul half %x, %y
435  %mul.ext = fpext half %mul to float
436  %add = fsub float %mul.ext, %z
437  ret float %add
438}
439
440; fold (fsub x, (fpext (fmul y, z)))
441;   -> (fma (fneg (fpext y)), (fpext z), x)
; Subtracts the fpext of an f16 product from x; both the fmul and the fsub
; carry the contract flag. With flushed f32 denormals a single mixed-precision
; instruction with a negated first multiplicand is expected (v_fma_mix_f32 on
; gfx1100, v_mad_mix_f32 on gfx900). With IEEE f32 denormals both targets
; expect v_mul_f16, v_cvt_f32_f16 and v_sub_f32.
442define float @fsub_fpext_fmul_f16_to_f32_commute(float %x, half %y, half %z) #0 {
443; GFX11-F32FLUSH-LABEL: fsub_fpext_fmul_f16_to_f32_commute:
444; GFX11-F32FLUSH:       ; %bb.0: ; %entry
445; GFX11-F32FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
446; GFX11-F32FLUSH-NEXT:    v_fma_mix_f32 v0, -v1, v2, v0 op_sel_hi:[1,1,0]
447; GFX11-F32FLUSH-NEXT:    s_setpc_b64 s[30:31]
448;
449; GFX11-F32DENORM-LABEL: fsub_fpext_fmul_f16_to_f32_commute:
450; GFX11-F32DENORM:       ; %bb.0: ; %entry
451; GFX11-F32DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
452; GFX11-F32DENORM-NEXT:    v_mul_f16_e32 v1, v1, v2
453; GFX11-F32DENORM-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
454; GFX11-F32DENORM-NEXT:    v_cvt_f32_f16_e32 v1, v1
455; GFX11-F32DENORM-NEXT:    v_sub_f32_e32 v0, v0, v1
456; GFX11-F32DENORM-NEXT:    s_setpc_b64 s[30:31]
457;
458; GFX9-F32FLUSH-LABEL: fsub_fpext_fmul_f16_to_f32_commute:
459; GFX9-F32FLUSH:       ; %bb.0: ; %entry
460; GFX9-F32FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
461; GFX9-F32FLUSH-NEXT:    v_mad_mix_f32 v0, -v1, v2, v0 op_sel_hi:[1,1,0]
462; GFX9-F32FLUSH-NEXT:    s_setpc_b64 s[30:31]
463;
464; GFX9-F32DENORM-LABEL: fsub_fpext_fmul_f16_to_f32_commute:
465; GFX9-F32DENORM:       ; %bb.0: ; %entry
466; GFX9-F32DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
467; GFX9-F32DENORM-NEXT:    v_mul_f16_e32 v1, v1, v2
468; GFX9-F32DENORM-NEXT:    v_cvt_f32_f16_e32 v1, v1
469; GFX9-F32DENORM-NEXT:    v_sub_f32_e32 v0, v0, v1
470; GFX9-F32DENORM-NEXT:    s_setpc_b64 s[30:31]
471entry:
472  %mul = fmul contract half %y, %z
473  %mul.ext = fpext half %mul to float
474  %add = fsub contract float %x, %mul.ext
475  ret float %add
476}
477
478; fold (fsub (fpext (fneg (fmul x, y))), z)
479;   -> (fneg (fma (fpext x), (fpext y), z))
; The f16 product is negated before the fpext, then z is subtracted.
; gfx900 with flushed f32 denormals expects one v_mad_mix_f32 with a negated
; multiplicand and a negated addend; gfx1100 and gfx900 with IEEE f32
; denormals expect v_mul_f16_e64 with a negated source, v_cvt_f32_f16 and
; v_sub_f32.
480define float @fsub_fpext_fneg_fmul_f16_to_f32(half %x, half %y, float %z) #0 {
481; GFX11-LABEL: fsub_fpext_fneg_fmul_f16_to_f32:
482; GFX11:       ; %bb.0: ; %entry
483; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
484; GFX11-NEXT:    v_mul_f16_e64 v0, v0, -v1
485; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
486; GFX11-NEXT:    v_cvt_f32_f16_e32 v0, v0
487; GFX11-NEXT:    v_sub_f32_e32 v0, v0, v2
488; GFX11-NEXT:    s_setpc_b64 s[30:31]
489;
490; GFX9-F32FLUSH-LABEL: fsub_fpext_fneg_fmul_f16_to_f32:
491; GFX9-F32FLUSH:       ; %bb.0: ; %entry
492; GFX9-F32FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
493; GFX9-F32FLUSH-NEXT:    v_mad_mix_f32 v0, v0, -v1, -v2 op_sel_hi:[1,1,0]
494; GFX9-F32FLUSH-NEXT:    s_setpc_b64 s[30:31]
495;
496; GFX9-F32DENORM-LABEL: fsub_fpext_fneg_fmul_f16_to_f32:
497; GFX9-F32DENORM:       ; %bb.0: ; %entry
498; GFX9-F32DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
499; GFX9-F32DENORM-NEXT:    v_mul_f16_e64 v0, v0, -v1
500; GFX9-F32DENORM-NEXT:    v_cvt_f32_f16_e32 v0, v0
501; GFX9-F32DENORM-NEXT:    v_sub_f32_e32 v0, v0, v2
502; GFX9-F32DENORM-NEXT:    s_setpc_b64 s[30:31]
503entry:
504  %mul = fmul half %x, %y
  ; The negation is spelled as a subtraction from -0.0 rather than with fneg.
505  %neg.mul = fsub half -0.0, %mul
506  %neg.mul.ext = fpext half %neg.mul to float
507  %add = fsub float %neg.mul.ext, %z
508  ret float %add
509}
510
511; fold (fsub (fneg (fpext (fmul x, y))), z)
512;   -> (fneg (fma (fpext x), (fpext y), z))
; The f16 product is extended to f32 first and negated afterwards with fneg,
; then z is subtracted. The expected instruction sequences are the same as
; those recorded for fsub_fpext_fneg_fmul_f16_to_f32: one v_mad_mix_f32 on
; gfx900 with flushed f32 denormals, otherwise v_mul_f16_e64 with a negated
; source, v_cvt_f32_f16 and v_sub_f32.
513define float @fsub_fneg_fpext_fmul_f16_to_f32(half %x, half %y, float %z) #0 {
514; GFX11-LABEL: fsub_fneg_fpext_fmul_f16_to_f32:
515; GFX11:       ; %bb.0: ; %entry
516; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
517; GFX11-NEXT:    v_mul_f16_e64 v0, v0, -v1
518; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
519; GFX11-NEXT:    v_cvt_f32_f16_e32 v0, v0
520; GFX11-NEXT:    v_sub_f32_e32 v0, v0, v2
521; GFX11-NEXT:    s_setpc_b64 s[30:31]
522;
523; GFX9-F32FLUSH-LABEL: fsub_fneg_fpext_fmul_f16_to_f32:
524; GFX9-F32FLUSH:       ; %bb.0: ; %entry
525; GFX9-F32FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
526; GFX9-F32FLUSH-NEXT:    v_mad_mix_f32 v0, v0, -v1, -v2 op_sel_hi:[1,1,0]
527; GFX9-F32FLUSH-NEXT:    s_setpc_b64 s[30:31]
528;
529; GFX9-F32DENORM-LABEL: fsub_fneg_fpext_fmul_f16_to_f32:
530; GFX9-F32DENORM:       ; %bb.0: ; %entry
531; GFX9-F32DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
532; GFX9-F32DENORM-NEXT:    v_mul_f16_e64 v0, v0, -v1
533; GFX9-F32DENORM-NEXT:    v_cvt_f32_f16_e32 v0, v0
534; GFX9-F32DENORM-NEXT:    v_sub_f32_e32 v0, v0, v2
535; GFX9-F32DENORM-NEXT:    s_setpc_b64 s[30:31]
536entry:
537  %mul = fmul half %x, %y
538  %mul.ext = fpext half %mul to float
539  %neg.mul.ext = fneg float %mul.ext
540  %add = fsub float %neg.mul.ext, %z
541  ret float %add
542}
543
544; fold (fsub (fmad x, y, (fpext (fmul u, v))), z)
545;    -> (fmad x, y, (fmad (fpext u), (fpext v), (fneg z)))
; llvm.fmuladd.f32(x, y, fpext(u * v)) minus z; the f16 fmul and the final
; fsub both carry the reassoc flag.
; Expected code:
;  - gfx900, flushed f32 denormals - v_mad_mix_f32 with a negated addend,
;    then v_mac_f32 and a v_mov_b32 of the result into v0.
;  - gfx1100 - v_mul_f16, v_fma_mix_f32, v_sub_f32.
;  - gfx900, IEEE f32 denormals - v_mul_f16, v_cvt_f32_f16, v_fma_f32,
;    v_sub_f32.
546define float @fsub_muladd_fpext_mul_f16_to_f32(float %x, float %y, float %z, half %u, half %v) #0 {
547; GFX11-LABEL: fsub_muladd_fpext_mul_f16_to_f32:
548; GFX11:       ; %bb.0: ; %entry
549; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
550; GFX11-NEXT:    v_mul_f16_e32 v3, v3, v4
551; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
552; GFX11-NEXT:    v_fma_mix_f32 v0, v0, v1, v3 op_sel_hi:[0,0,1]
553; GFX11-NEXT:    v_sub_f32_e32 v0, v0, v2
554; GFX11-NEXT:    s_setpc_b64 s[30:31]
555;
556; GFX9-F32FLUSH-LABEL: fsub_muladd_fpext_mul_f16_to_f32:
557; GFX9-F32FLUSH:       ; %bb.0: ; %entry
558; GFX9-F32FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
559; GFX9-F32FLUSH-NEXT:    v_mad_mix_f32 v2, v3, v4, -v2 op_sel_hi:[1,1,0]
560; GFX9-F32FLUSH-NEXT:    v_mac_f32_e32 v2, v0, v1
561; GFX9-F32FLUSH-NEXT:    v_mov_b32_e32 v0, v2
562; GFX9-F32FLUSH-NEXT:    s_setpc_b64 s[30:31]
563;
564; GFX9-F32DENORM-LABEL: fsub_muladd_fpext_mul_f16_to_f32:
565; GFX9-F32DENORM:       ; %bb.0: ; %entry
566; GFX9-F32DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
567; GFX9-F32DENORM-NEXT:    v_mul_f16_e32 v3, v3, v4
568; GFX9-F32DENORM-NEXT:    v_cvt_f32_f16_e32 v3, v3
569; GFX9-F32DENORM-NEXT:    v_fma_f32 v0, v0, v1, v3
570; GFX9-F32DENORM-NEXT:    v_sub_f32_e32 v0, v0, v2
571; GFX9-F32DENORM-NEXT:    s_setpc_b64 s[30:31]
572entry:
573  %mul = fmul reassoc half %u, %v
574  %mul.ext = fpext half %mul to float
575  %fma = call float @llvm.fmuladd.f32(float %x, float %y, float %mul.ext)
576  %add = fsub reassoc float %fma, %z
577  ret float %add
578}
579
580;  fold (fsub (fpext (fmad x, y, (fmul u, v))), z)
581;    -> (fmad (fpext x), (fpext y),
582;            (fmad (fpext u), (fpext v), (fneg z)))
; The multiply-add is done in f16 with llvm.fmuladd.f16, extended to f32, and
; z is subtracted; no instruction carries fast-math flags. No mixed-precision
; instruction appears in the expected output: gfx1100 expects v_mul_f16,
; v_fmac_f16, v_cvt_f32_f16, v_sub_f32, and every gfx900 and gfx803 run
; expects v_mul_f16, v_fma_f16, v_cvt_f32_f16, v_sub_f32.
583define float @fsub_fpext_muladd_mul_f16_to_f32(half %x, half %y, float %z, half %u, half %v) #0 {
584; GFX11-LABEL: fsub_fpext_muladd_mul_f16_to_f32:
585; GFX11:       ; %bb.0: ; %entry
586; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
587; GFX11-NEXT:    v_mul_f16_e32 v3, v3, v4
588; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
589; GFX11-NEXT:    v_fmac_f16_e32 v3, v0, v1
590; GFX11-NEXT:    v_cvt_f32_f16_e32 v0, v3
591; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
592; GFX11-NEXT:    v_sub_f32_e32 v0, v0, v2
593; GFX11-NEXT:    s_setpc_b64 s[30:31]
594;
595; GFX89-LABEL: fsub_fpext_muladd_mul_f16_to_f32:
596; GFX89:       ; %bb.0: ; %entry
597; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
598; GFX89-NEXT:    v_mul_f16_e32 v3, v3, v4
599; GFX89-NEXT:    v_fma_f16 v0, v0, v1, v3
600; GFX89-NEXT:    v_cvt_f32_f16_e32 v0, v0
601; GFX89-NEXT:    v_sub_f32_e32 v0, v0, v2
602; GFX89-NEXT:    s_setpc_b64 s[30:31]
603entry:
604  %mul = fmul half %u, %v
605  %fma = call half @llvm.fmuladd.f16(half %x, half %y, half %mul)
606  %fma.ext = fpext half %fma to float
607  %add = fsub float %fma.ext, %z
608  ret float %add
609}
610
611; fold (fsub x, (fmad y, z, (fpext (fmul u, v))))
612;   -> (fmad (fneg y), z, (fmad (fneg (fpext u)), (fpext v), x))
; x minus llvm.fmuladd.f32(y, z, fpext(u * v)); the f16 fmul and the final
; fsub both carry the reassoc flag.
; Expected code:
;  - gfx900, flushed f32 denormals - v_mad_mix_f32 followed by v_mad_f32,
;    each with a negated first multiplicand.
;  - gfx1100 - v_mul_f16, v_fma_mix_f32, v_sub_f32.
;  - gfx900, IEEE f32 denormals - v_mul_f16, v_cvt_f32_f16, v_fma_f32,
;    v_sub_f32.
613define float @fsub_muladd_fpext_mul_f16_to_f32_commute(float %x, float %y, float %z, half %u, half %v) #0 {
614; GFX11-LABEL: fsub_muladd_fpext_mul_f16_to_f32_commute:
615; GFX11:       ; %bb.0: ; %entry
616; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
617; GFX11-NEXT:    v_mul_f16_e32 v3, v3, v4
618; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
619; GFX11-NEXT:    v_fma_mix_f32 v1, v1, v2, v3 op_sel_hi:[0,0,1]
620; GFX11-NEXT:    v_sub_f32_e32 v0, v0, v1
621; GFX11-NEXT:    s_setpc_b64 s[30:31]
622;
623; GFX9-F32FLUSH-LABEL: fsub_muladd_fpext_mul_f16_to_f32_commute:
624; GFX9-F32FLUSH:       ; %bb.0: ; %entry
625; GFX9-F32FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
626; GFX9-F32FLUSH-NEXT:    v_mad_mix_f32 v0, -v3, v4, v0 op_sel_hi:[1,1,0]
627; GFX9-F32FLUSH-NEXT:    v_mad_f32 v0, -v1, v2, v0
628; GFX9-F32FLUSH-NEXT:    s_setpc_b64 s[30:31]
629;
630; GFX9-F32DENORM-LABEL: fsub_muladd_fpext_mul_f16_to_f32_commute:
631; GFX9-F32DENORM:       ; %bb.0: ; %entry
632; GFX9-F32DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
633; GFX9-F32DENORM-NEXT:    v_mul_f16_e32 v3, v3, v4
634; GFX9-F32DENORM-NEXT:    v_cvt_f32_f16_e32 v3, v3
635; GFX9-F32DENORM-NEXT:    v_fma_f32 v1, v1, v2, v3
636; GFX9-F32DENORM-NEXT:    v_sub_f32_e32 v0, v0, v1
637; GFX9-F32DENORM-NEXT:    s_setpc_b64 s[30:31]
638entry:
639  %mul = fmul reassoc half %u, %v
640  %mul.ext = fpext half %mul to float
641  %fma = call float @llvm.fmuladd.f32(float %y, float %z, float %mul.ext)
642  %add = fsub reassoc float %x, %fma
643  ret float %add
644}
645
646; fold (fsub x, (fpext (fma y, z, (fmul u, v))))
647;    -> (fma (fneg (fpext y)), (fpext z),
648;            (fma (fneg (fpext u)), (fpext v), x))
; x minus the fpext of llvm.fmuladd.f16(y, z, u * v); no instruction carries
; fast-math flags. No mixed-precision instruction appears in the expected
; output: gfx1100 expects v_mul_f16, v_fmac_f16, v_cvt_f32_f16, v_sub_f32, and
; every gfx900 and gfx803 run expects v_mul_f16, v_fma_f16, v_cvt_f32_f16,
; v_sub_f32.
649define float @fsub_fpext_muladd_mul_f16_to_f32_commute(float %x, half %y, half %z, half %u, half %v) #0 {
650; GFX11-LABEL: fsub_fpext_muladd_mul_f16_to_f32_commute:
651; GFX11:       ; %bb.0: ; %entry
652; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
653; GFX11-NEXT:    v_mul_f16_e32 v3, v3, v4
654; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
655; GFX11-NEXT:    v_fmac_f16_e32 v3, v1, v2
656; GFX11-NEXT:    v_cvt_f32_f16_e32 v1, v3
657; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
658; GFX11-NEXT:    v_sub_f32_e32 v0, v0, v1
659; GFX11-NEXT:    s_setpc_b64 s[30:31]
660;
661; GFX89-LABEL: fsub_fpext_muladd_mul_f16_to_f32_commute:
662; GFX89:       ; %bb.0: ; %entry
663; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
664; GFX89-NEXT:    v_mul_f16_e32 v3, v3, v4
665; GFX89-NEXT:    v_fma_f16 v1, v1, v2, v3
666; GFX89-NEXT:    v_cvt_f32_f16_e32 v1, v1
667; GFX89-NEXT:    v_sub_f32_e32 v0, v0, v1
668; GFX89-NEXT:    s_setpc_b64 s[30:31]
669entry:
670  %mul = fmul half %u, %v
671  %fma = call half @llvm.fmuladd.f16(half %y, half %z, half %mul)
672  %fma.ext = fpext half %fma to float
673  %add = fsub float %x, %fma.ext
674  ret float %add
675}
676
; Declarations of the f32 and f16 fmuladd/fma intrinsics called by the test
; functions in this file.
677declare float @llvm.fmuladd.f32(float, float, float) #0
678declare float @llvm.fma.f32(float, float, float) #0
679declare half @llvm.fmuladd.f16(half, half, half) #0
680declare half @llvm.fma.f16(half, half, half) #0
681
; Attribute group #0 is referenced by every function definition and intrinsic
; declaration visible in this file.
682attributes #0 = { nounwind readnone speculatable }
683