xref: /llvm-project/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-fma-mul.ll (revision 9e9907f1cfa424366fba58d9520f9305b537cec9)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 -fp-contract=fast < %s | FileCheck -check-prefix=GFX9-CONTRACT %s
3; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 --denormal-fp-math=preserve-sign < %s | FileCheck -check-prefix=GFX9-DENORM %s
4; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 -fp-contract=fast < %s | FileCheck -check-prefix=GFX10-CONTRACT %s
5; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 --denormal-fp-math=preserve-sign < %s | FileCheck -check-prefix=GFX10-DENORM %s
6; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -fp-contract=fast < %s | FileCheck -check-prefix=GFX11-CONTRACT %s
7; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 --denormal-fp-math=preserve-sign < %s | FileCheck -check-prefix=GFX11-DENORM %s
8
9; fadd (fma a, b, (fmul c, d)), e --> fma a, b, (fma c, d, e)
10; fadd e, (fma a, b, (fmul c, d)) --> fma a, b, (fma c, d, e)
11
; Scalar f32, fmuladd result on the left of the final add:
;   z = fmuladd(a, b, c * d) + e, with every operation marked 'fast'.
; All six run lines expect exactly two fused multiply-add style instructions
; and no standalone multiply or add. The first computes v2 * v3 + v4
; (presumably %c, %d, %e in argument order); the second accumulates v0 * v1
; onto that result. The gfx900 preserve-sign run selects v_mad_f32 and
; v_mac_f32; the remaining runs select v_fma_f32 and v_fma_f32/v_fmac_f32.
12define float @test_f32_add_mul(float %a, float %b, float %c, float %d, float %e) {
13; GFX9-CONTRACT-LABEL: test_f32_add_mul:
14; GFX9-CONTRACT:       ; %bb.0: ; %.entry
15; GFX9-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
16; GFX9-CONTRACT-NEXT:    v_fma_f32 v2, v2, v3, v4
17; GFX9-CONTRACT-NEXT:    v_fma_f32 v0, v0, v1, v2
18; GFX9-CONTRACT-NEXT:    s_setpc_b64 s[30:31]
19;
20; GFX9-DENORM-LABEL: test_f32_add_mul:
21; GFX9-DENORM:       ; %bb.0: ; %.entry
22; GFX9-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
23; GFX9-DENORM-NEXT:    v_mad_f32 v2, v2, v3, v4
24; GFX9-DENORM-NEXT:    v_mac_f32_e32 v2, v0, v1
25; GFX9-DENORM-NEXT:    v_mov_b32_e32 v0, v2
26; GFX9-DENORM-NEXT:    s_setpc_b64 s[30:31]
27;
28; GFX10-CONTRACT-LABEL: test_f32_add_mul:
29; GFX10-CONTRACT:       ; %bb.0: ; %.entry
30; GFX10-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
31; GFX10-CONTRACT-NEXT:    v_fma_f32 v2, v2, v3, v4
32; GFX10-CONTRACT-NEXT:    v_fmac_f32_e32 v2, v0, v1
33; GFX10-CONTRACT-NEXT:    v_mov_b32_e32 v0, v2
34; GFX10-CONTRACT-NEXT:    s_setpc_b64 s[30:31]
35;
36; GFX10-DENORM-LABEL: test_f32_add_mul:
37; GFX10-DENORM:       ; %bb.0: ; %.entry
38; GFX10-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
39; GFX10-DENORM-NEXT:    v_fma_f32 v2, v2, v3, v4
40; GFX10-DENORM-NEXT:    v_fmac_f32_e32 v2, v0, v1
41; GFX10-DENORM-NEXT:    v_mov_b32_e32 v0, v2
42; GFX10-DENORM-NEXT:    s_setpc_b64 s[30:31]
43;
44; GFX11-CONTRACT-LABEL: test_f32_add_mul:
45; GFX11-CONTRACT:       ; %bb.0: ; %.entry
46; GFX11-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
47; GFX11-CONTRACT-NEXT:    v_fma_f32 v2, v2, v3, v4
48; GFX11-CONTRACT-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
49; GFX11-CONTRACT-NEXT:    v_fmac_f32_e32 v2, v0, v1
50; GFX11-CONTRACT-NEXT:    v_mov_b32_e32 v0, v2
51; GFX11-CONTRACT-NEXT:    s_setpc_b64 s[30:31]
52;
53; GFX11-DENORM-LABEL: test_f32_add_mul:
54; GFX11-DENORM:       ; %bb.0: ; %.entry
55; GFX11-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
56; GFX11-DENORM-NEXT:    v_fma_f32 v2, v2, v3, v4
57; GFX11-DENORM-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
58; GFX11-DENORM-NEXT:    v_fmac_f32_e32 v2, v0, v1
59; GFX11-DENORM-NEXT:    v_mov_b32_e32 v0, v2
60; GFX11-DENORM-NEXT:    s_setpc_b64 s[30:31]
61.entry:
62  %x = fmul fast float %c, %d ; product that becomes the fmuladd addend
63  %y = call fast float @llvm.fmuladd.f32(float %a, float %b, float %x)
64  %z = fadd fast float %y, %e ; fmuladd result is the left operand
65  ret float %z
66}
67
; Scalar f32, fmuladd result on the right of the final add:
;   z = e + fmuladd(a, b, c * d), with every operation marked 'fast'.
; The expected instruction sequences are the same as for the left-operand
; variant of this test: two fused multiply-add style instructions per run,
; the first computing v2 * v3 + v4 and the second accumulating v0 * v1.
68define float @test_f32_add_mul_rhs(float %a, float %b, float %c, float %d, float %e) {
69; GFX9-CONTRACT-LABEL: test_f32_add_mul_rhs:
70; GFX9-CONTRACT:       ; %bb.0: ; %.entry
71; GFX9-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
72; GFX9-CONTRACT-NEXT:    v_fma_f32 v2, v2, v3, v4
73; GFX9-CONTRACT-NEXT:    v_fma_f32 v0, v0, v1, v2
74; GFX9-CONTRACT-NEXT:    s_setpc_b64 s[30:31]
75;
76; GFX9-DENORM-LABEL: test_f32_add_mul_rhs:
77; GFX9-DENORM:       ; %bb.0: ; %.entry
78; GFX9-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
79; GFX9-DENORM-NEXT:    v_mad_f32 v2, v2, v3, v4
80; GFX9-DENORM-NEXT:    v_mac_f32_e32 v2, v0, v1
81; GFX9-DENORM-NEXT:    v_mov_b32_e32 v0, v2
82; GFX9-DENORM-NEXT:    s_setpc_b64 s[30:31]
83;
84; GFX10-CONTRACT-LABEL: test_f32_add_mul_rhs:
85; GFX10-CONTRACT:       ; %bb.0: ; %.entry
86; GFX10-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
87; GFX10-CONTRACT-NEXT:    v_fma_f32 v2, v2, v3, v4
88; GFX10-CONTRACT-NEXT:    v_fmac_f32_e32 v2, v0, v1
89; GFX10-CONTRACT-NEXT:    v_mov_b32_e32 v0, v2
90; GFX10-CONTRACT-NEXT:    s_setpc_b64 s[30:31]
91;
92; GFX10-DENORM-LABEL: test_f32_add_mul_rhs:
93; GFX10-DENORM:       ; %bb.0: ; %.entry
94; GFX10-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
95; GFX10-DENORM-NEXT:    v_fma_f32 v2, v2, v3, v4
96; GFX10-DENORM-NEXT:    v_fmac_f32_e32 v2, v0, v1
97; GFX10-DENORM-NEXT:    v_mov_b32_e32 v0, v2
98; GFX10-DENORM-NEXT:    s_setpc_b64 s[30:31]
99;
100; GFX11-CONTRACT-LABEL: test_f32_add_mul_rhs:
101; GFX11-CONTRACT:       ; %bb.0: ; %.entry
102; GFX11-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
103; GFX11-CONTRACT-NEXT:    v_fma_f32 v2, v2, v3, v4
104; GFX11-CONTRACT-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
105; GFX11-CONTRACT-NEXT:    v_fmac_f32_e32 v2, v0, v1
106; GFX11-CONTRACT-NEXT:    v_mov_b32_e32 v0, v2
107; GFX11-CONTRACT-NEXT:    s_setpc_b64 s[30:31]
108;
109; GFX11-DENORM-LABEL: test_f32_add_mul_rhs:
110; GFX11-DENORM:       ; %bb.0: ; %.entry
111; GFX11-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
112; GFX11-DENORM-NEXT:    v_fma_f32 v2, v2, v3, v4
113; GFX11-DENORM-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
114; GFX11-DENORM-NEXT:    v_fmac_f32_e32 v2, v0, v1
115; GFX11-DENORM-NEXT:    v_mov_b32_e32 v0, v2
116; GFX11-DENORM-NEXT:    s_setpc_b64 s[30:31]
117.entry:
118  %x = fmul fast float %c, %d ; product that becomes the fmuladd addend
119  %y = call fast float @llvm.fmuladd.f32(float %a, float %b, float %x)
120  %z = fadd fast float %e, %y ; fmuladd result is the right operand
121  ret float %z
122}
123
; Scalar f16, fmuladd result on the left of the final add:
;   z = fmuladd(a, b, c * d) + e, with every operation marked 'fast'.
; Expected output differs by configuration:
;   * all three -fp-contract=fast runs: two fused ops (v_fma_f16, then
;     v_fma_f16 or v_fmac_f16);
;   * gfx900 with preserve-sign: v_mad_legacy_f16 followed by v_mac_f16;
;   * gfx1010 and gfx1100 with preserve-sign: no fused instruction at all,
;     just two v_mul_f16 and two v_add_f16.
; NOTE(review): the reason the last two runs stay unfused is not visible in
; this file; confirm against the target's f16 mad/fma legality rules.
124define half @test_half_add_mul(half %a, half %b, half %c, half %d, half %e) {
125; GFX9-CONTRACT-LABEL: test_half_add_mul:
126; GFX9-CONTRACT:       ; %bb.0: ; %.entry
127; GFX9-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
128; GFX9-CONTRACT-NEXT:    v_fma_f16 v2, v2, v3, v4
129; GFX9-CONTRACT-NEXT:    v_fma_f16 v0, v0, v1, v2
130; GFX9-CONTRACT-NEXT:    s_setpc_b64 s[30:31]
131;
132; GFX9-DENORM-LABEL: test_half_add_mul:
133; GFX9-DENORM:       ; %bb.0: ; %.entry
134; GFX9-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
135; GFX9-DENORM-NEXT:    v_mad_legacy_f16 v2, v2, v3, v4
136; GFX9-DENORM-NEXT:    v_mac_f16_e32 v2, v0, v1
137; GFX9-DENORM-NEXT:    v_mov_b32_e32 v0, v2
138; GFX9-DENORM-NEXT:    s_setpc_b64 s[30:31]
139;
140; GFX10-CONTRACT-LABEL: test_half_add_mul:
141; GFX10-CONTRACT:       ; %bb.0: ; %.entry
142; GFX10-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
143; GFX10-CONTRACT-NEXT:    v_fma_f16 v2, v2, v3, v4
144; GFX10-CONTRACT-NEXT:    v_fmac_f16_e32 v2, v0, v1
145; GFX10-CONTRACT-NEXT:    v_mov_b32_e32 v0, v2
146; GFX10-CONTRACT-NEXT:    s_setpc_b64 s[30:31]
147;
148; GFX10-DENORM-LABEL: test_half_add_mul:
149; GFX10-DENORM:       ; %bb.0: ; %.entry
150; GFX10-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
151; GFX10-DENORM-NEXT:    v_mul_f16_e32 v2, v2, v3
152; GFX10-DENORM-NEXT:    v_mul_f16_e32 v0, v0, v1
153; GFX10-DENORM-NEXT:    v_add_f16_e32 v0, v0, v2
154; GFX10-DENORM-NEXT:    v_add_f16_e32 v0, v0, v4
155; GFX10-DENORM-NEXT:    s_setpc_b64 s[30:31]
156;
157; GFX11-CONTRACT-LABEL: test_half_add_mul:
158; GFX11-CONTRACT:       ; %bb.0: ; %.entry
159; GFX11-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
160; GFX11-CONTRACT-NEXT:    v_fma_f16 v2, v2, v3, v4
161; GFX11-CONTRACT-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
162; GFX11-CONTRACT-NEXT:    v_fmac_f16_e32 v2, v0, v1
163; GFX11-CONTRACT-NEXT:    v_mov_b32_e32 v0, v2
164; GFX11-CONTRACT-NEXT:    s_setpc_b64 s[30:31]
165;
166; GFX11-DENORM-LABEL: test_half_add_mul:
167; GFX11-DENORM:       ; %bb.0: ; %.entry
168; GFX11-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
169; GFX11-DENORM-NEXT:    v_mul_f16_e32 v2, v2, v3
170; GFX11-DENORM-NEXT:    v_mul_f16_e32 v0, v0, v1
171; GFX11-DENORM-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
172; GFX11-DENORM-NEXT:    v_add_f16_e32 v0, v0, v2
173; GFX11-DENORM-NEXT:    v_add_f16_e32 v0, v0, v4
174; GFX11-DENORM-NEXT:    s_setpc_b64 s[30:31]
175.entry:
176  %x = fmul fast half %c, %d ; product that becomes the fmuladd addend
177  %y = call fast half @llvm.fmuladd.f16(half %a, half %b, half %x)
178  %z = fadd fast half %y, %e ; fmuladd result is the left operand
179  ret half %z
180}
181
; Scalar f16, fmuladd result on the right of the final add:
;   z = e + fmuladd(a, b, c * d), with every operation marked 'fast'.
; Expected output differs by configuration:
;   * all three -fp-contract=fast runs: two fused ops (v_fma_f16, then
;     v_fma_f16 or v_fmac_f16);
;   * gfx900 with preserve-sign: v_mad_legacy_f16 followed by v_mac_f16;
;   * gfx1010 and gfx1100 with preserve-sign: no fused instruction, only two
;     v_mul_f16 and two v_add_f16, and the final add keeps the swapped
;     operand order from the IR (v4 first).
; NOTE(review): the reason the last two runs stay unfused is not visible in
; this file; confirm against the target's f16 mad/fma legality rules.
182define half @test_half_add_mul_rhs(half %a, half %b, half %c, half %d, half %e) {
183; GFX9-CONTRACT-LABEL: test_half_add_mul_rhs:
184; GFX9-CONTRACT:       ; %bb.0: ; %.entry
185; GFX9-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
186; GFX9-CONTRACT-NEXT:    v_fma_f16 v2, v2, v3, v4
187; GFX9-CONTRACT-NEXT:    v_fma_f16 v0, v0, v1, v2
188; GFX9-CONTRACT-NEXT:    s_setpc_b64 s[30:31]
189;
190; GFX9-DENORM-LABEL: test_half_add_mul_rhs:
191; GFX9-DENORM:       ; %bb.0: ; %.entry
192; GFX9-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
193; GFX9-DENORM-NEXT:    v_mad_legacy_f16 v2, v2, v3, v4
194; GFX9-DENORM-NEXT:    v_mac_f16_e32 v2, v0, v1
195; GFX9-DENORM-NEXT:    v_mov_b32_e32 v0, v2
196; GFX9-DENORM-NEXT:    s_setpc_b64 s[30:31]
197;
198; GFX10-CONTRACT-LABEL: test_half_add_mul_rhs:
199; GFX10-CONTRACT:       ; %bb.0: ; %.entry
200; GFX10-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
201; GFX10-CONTRACT-NEXT:    v_fma_f16 v2, v2, v3, v4
202; GFX10-CONTRACT-NEXT:    v_fmac_f16_e32 v2, v0, v1
203; GFX10-CONTRACT-NEXT:    v_mov_b32_e32 v0, v2
204; GFX10-CONTRACT-NEXT:    s_setpc_b64 s[30:31]
205;
206; GFX10-DENORM-LABEL: test_half_add_mul_rhs:
207; GFX10-DENORM:       ; %bb.0: ; %.entry
208; GFX10-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
209; GFX10-DENORM-NEXT:    v_mul_f16_e32 v2, v2, v3
210; GFX10-DENORM-NEXT:    v_mul_f16_e32 v0, v0, v1
211; GFX10-DENORM-NEXT:    v_add_f16_e32 v0, v0, v2
212; GFX10-DENORM-NEXT:    v_add_f16_e32 v0, v4, v0
213; GFX10-DENORM-NEXT:    s_setpc_b64 s[30:31]
214;
215; GFX11-CONTRACT-LABEL: test_half_add_mul_rhs:
216; GFX11-CONTRACT:       ; %bb.0: ; %.entry
217; GFX11-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
218; GFX11-CONTRACT-NEXT:    v_fma_f16 v2, v2, v3, v4
219; GFX11-CONTRACT-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
220; GFX11-CONTRACT-NEXT:    v_fmac_f16_e32 v2, v0, v1
221; GFX11-CONTRACT-NEXT:    v_mov_b32_e32 v0, v2
222; GFX11-CONTRACT-NEXT:    s_setpc_b64 s[30:31]
223;
224; GFX11-DENORM-LABEL: test_half_add_mul_rhs:
225; GFX11-DENORM:       ; %bb.0: ; %.entry
226; GFX11-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
227; GFX11-DENORM-NEXT:    v_mul_f16_e32 v2, v2, v3
228; GFX11-DENORM-NEXT:    v_mul_f16_e32 v0, v0, v1
229; GFX11-DENORM-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
230; GFX11-DENORM-NEXT:    v_add_f16_e32 v0, v0, v2
231; GFX11-DENORM-NEXT:    v_add_f16_e32 v0, v4, v0
232; GFX11-DENORM-NEXT:    s_setpc_b64 s[30:31]
233.entry:
234  %x = fmul fast half %c, %d ; product that becomes the fmuladd addend
235  %y = call fast half @llvm.fmuladd.f16(half %a, half %b, half %x)
236  %z = fadd fast half %e, %y ; fmuladd result is the right operand
237  ret half %z
238}
239
; Scalar f64, fmuladd result on the left of the final add:
;   z = fmuladd(a, b, c * d) + e, with every operation marked 'fast'.
; All six run lines expect the same pair of instructions: a v_fma_f64 that
; computes v[4:5] * v[6:7] + v[8:9], then a v_fma_f64 that adds
; v[0:1] * v[2:3] to it. The gfx1100 runs additionally expect an
; s_delay_alu between the two.
240define double @test_double_add_mul(double %a, double %b, double %c, double %d, double %e) {
241; GFX9-CONTRACT-LABEL: test_double_add_mul:
242; GFX9-CONTRACT:       ; %bb.0: ; %.entry
243; GFX9-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
244; GFX9-CONTRACT-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[8:9]
245; GFX9-CONTRACT-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
246; GFX9-CONTRACT-NEXT:    s_setpc_b64 s[30:31]
247;
248; GFX9-DENORM-LABEL: test_double_add_mul:
249; GFX9-DENORM:       ; %bb.0: ; %.entry
250; GFX9-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
251; GFX9-DENORM-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[8:9]
252; GFX9-DENORM-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
253; GFX9-DENORM-NEXT:    s_setpc_b64 s[30:31]
254;
255; GFX10-CONTRACT-LABEL: test_double_add_mul:
256; GFX10-CONTRACT:       ; %bb.0: ; %.entry
257; GFX10-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
258; GFX10-CONTRACT-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[8:9]
259; GFX10-CONTRACT-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
260; GFX10-CONTRACT-NEXT:    s_setpc_b64 s[30:31]
261;
262; GFX10-DENORM-LABEL: test_double_add_mul:
263; GFX10-DENORM:       ; %bb.0: ; %.entry
264; GFX10-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
265; GFX10-DENORM-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[8:9]
266; GFX10-DENORM-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
267; GFX10-DENORM-NEXT:    s_setpc_b64 s[30:31]
268;
269; GFX11-CONTRACT-LABEL: test_double_add_mul:
270; GFX11-CONTRACT:       ; %bb.0: ; %.entry
271; GFX11-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
272; GFX11-CONTRACT-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[8:9]
273; GFX11-CONTRACT-NEXT:    s_delay_alu instid0(VALU_DEP_1)
274; GFX11-CONTRACT-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
275; GFX11-CONTRACT-NEXT:    s_setpc_b64 s[30:31]
276;
277; GFX11-DENORM-LABEL: test_double_add_mul:
278; GFX11-DENORM:       ; %bb.0: ; %.entry
279; GFX11-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
280; GFX11-DENORM-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[8:9]
281; GFX11-DENORM-NEXT:    s_delay_alu instid0(VALU_DEP_1)
282; GFX11-DENORM-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
283; GFX11-DENORM-NEXT:    s_setpc_b64 s[30:31]
284.entry:
285  %x = fmul fast double %c, %d ; product that becomes the fmuladd addend
286  %y = call fast double @llvm.fmuladd.f64(double %a, double %b, double %x)
287  %z = fadd fast double %y, %e ; fmuladd result is the left operand
288  ret double %z
289}
290
; Scalar f64, fmuladd result on the right of the final add:
;   z = e + fmuladd(a, b, c * d), with every operation marked 'fast'.
; All six run lines expect the same pair of v_fma_f64 instructions as the
; left-operand variant: v[4:5] * v[6:7] + v[8:9] first, then
; v[0:1] * v[2:3] added to that result. The gfx1100 runs additionally expect
; an s_delay_alu between the two.
291define double @test_double_add_mul_rhs(double %a, double %b, double %c, double %d, double %e) {
292; GFX9-CONTRACT-LABEL: test_double_add_mul_rhs:
293; GFX9-CONTRACT:       ; %bb.0: ; %.entry
294; GFX9-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
295; GFX9-CONTRACT-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[8:9]
296; GFX9-CONTRACT-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
297; GFX9-CONTRACT-NEXT:    s_setpc_b64 s[30:31]
298;
299; GFX9-DENORM-LABEL: test_double_add_mul_rhs:
300; GFX9-DENORM:       ; %bb.0: ; %.entry
301; GFX9-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
302; GFX9-DENORM-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[8:9]
303; GFX9-DENORM-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
304; GFX9-DENORM-NEXT:    s_setpc_b64 s[30:31]
305;
306; GFX10-CONTRACT-LABEL: test_double_add_mul_rhs:
307; GFX10-CONTRACT:       ; %bb.0: ; %.entry
308; GFX10-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
309; GFX10-CONTRACT-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[8:9]
310; GFX10-CONTRACT-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
311; GFX10-CONTRACT-NEXT:    s_setpc_b64 s[30:31]
312;
313; GFX10-DENORM-LABEL: test_double_add_mul_rhs:
314; GFX10-DENORM:       ; %bb.0: ; %.entry
315; GFX10-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
316; GFX10-DENORM-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[8:9]
317; GFX10-DENORM-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
318; GFX10-DENORM-NEXT:    s_setpc_b64 s[30:31]
319;
320; GFX11-CONTRACT-LABEL: test_double_add_mul_rhs:
321; GFX11-CONTRACT:       ; %bb.0: ; %.entry
322; GFX11-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
323; GFX11-CONTRACT-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[8:9]
324; GFX11-CONTRACT-NEXT:    s_delay_alu instid0(VALU_DEP_1)
325; GFX11-CONTRACT-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
326; GFX11-CONTRACT-NEXT:    s_setpc_b64 s[30:31]
327;
328; GFX11-DENORM-LABEL: test_double_add_mul_rhs:
329; GFX11-DENORM:       ; %bb.0: ; %.entry
330; GFX11-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
331; GFX11-DENORM-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[8:9]
332; GFX11-DENORM-NEXT:    s_delay_alu instid0(VALU_DEP_1)
333; GFX11-DENORM-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
334; GFX11-DENORM-NEXT:    s_setpc_b64 s[30:31]
335.entry:
336  %x = fmul fast double %c, %d ; product that becomes the fmuladd addend
337  %y = call fast double @llvm.fmuladd.f64(double %a, double %b, double %x)
338  %z = fadd fast double %e, %y ; fmuladd result is the right operand
339  ret double %z
340}
341
; <4 x float>, fmuladd result on the left of the final add:
;   z = fmuladd(a, b, c * d) + e, with every operation marked 'fast'.
; Every run expects the vector to be handled one 32-bit lane at a time, with
; two fused multiply-add style operations per lane and no standalone
; multiply or add:
;   * gfx900 -fp-contract=fast: eight v_fma_f32;
;   * gfx900 preserve-sign: four v_mad_f32/v_mac_f32 pairs plus four moves
;     into v0..v3;
;   * both gfx1010 runs: four v_fma_f32, four v_fmac_f32, four moves;
;   * both gfx1100 runs: four v_fma_f32, then the accumulate and move steps
;     paired up as v_dual_fmac_f32 and v_dual_mov_b32.
342define <4 x float> @test_v4f32_add_mul(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x float> %d, <4 x float> %e) {
343; GFX9-CONTRACT-LABEL: test_v4f32_add_mul:
344; GFX9-CONTRACT:       ; %bb.0: ; %.entry
345; GFX9-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
346; GFX9-CONTRACT-NEXT:    v_fma_f32 v8, v8, v12, v16
347; GFX9-CONTRACT-NEXT:    v_fma_f32 v9, v9, v13, v17
348; GFX9-CONTRACT-NEXT:    v_fma_f32 v10, v10, v14, v18
349; GFX9-CONTRACT-NEXT:    v_fma_f32 v11, v11, v15, v19
350; GFX9-CONTRACT-NEXT:    v_fma_f32 v0, v0, v4, v8
351; GFX9-CONTRACT-NEXT:    v_fma_f32 v1, v1, v5, v9
352; GFX9-CONTRACT-NEXT:    v_fma_f32 v2, v2, v6, v10
353; GFX9-CONTRACT-NEXT:    v_fma_f32 v3, v3, v7, v11
354; GFX9-CONTRACT-NEXT:    s_setpc_b64 s[30:31]
355;
356; GFX9-DENORM-LABEL: test_v4f32_add_mul:
357; GFX9-DENORM:       ; %bb.0: ; %.entry
358; GFX9-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
359; GFX9-DENORM-NEXT:    v_mad_f32 v8, v8, v12, v16
360; GFX9-DENORM-NEXT:    v_mac_f32_e32 v8, v0, v4
361; GFX9-DENORM-NEXT:    v_mad_f32 v4, v9, v13, v17
362; GFX9-DENORM-NEXT:    v_mac_f32_e32 v4, v1, v5
363; GFX9-DENORM-NEXT:    v_mad_f32 v5, v10, v14, v18
364; GFX9-DENORM-NEXT:    v_mac_f32_e32 v5, v2, v6
365; GFX9-DENORM-NEXT:    v_mad_f32 v6, v11, v15, v19
366; GFX9-DENORM-NEXT:    v_mac_f32_e32 v6, v3, v7
367; GFX9-DENORM-NEXT:    v_mov_b32_e32 v0, v8
368; GFX9-DENORM-NEXT:    v_mov_b32_e32 v1, v4
369; GFX9-DENORM-NEXT:    v_mov_b32_e32 v2, v5
370; GFX9-DENORM-NEXT:    v_mov_b32_e32 v3, v6
371; GFX9-DENORM-NEXT:    s_setpc_b64 s[30:31]
372;
373; GFX10-CONTRACT-LABEL: test_v4f32_add_mul:
374; GFX10-CONTRACT:       ; %bb.0: ; %.entry
375; GFX10-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
376; GFX10-CONTRACT-NEXT:    v_fma_f32 v8, v8, v12, v16
377; GFX10-CONTRACT-NEXT:    v_fma_f32 v9, v9, v13, v17
378; GFX10-CONTRACT-NEXT:    v_fma_f32 v10, v10, v14, v18
379; GFX10-CONTRACT-NEXT:    v_fma_f32 v11, v11, v15, v19
380; GFX10-CONTRACT-NEXT:    v_fmac_f32_e32 v8, v0, v4
381; GFX10-CONTRACT-NEXT:    v_fmac_f32_e32 v9, v1, v5
382; GFX10-CONTRACT-NEXT:    v_fmac_f32_e32 v10, v2, v6
383; GFX10-CONTRACT-NEXT:    v_fmac_f32_e32 v11, v3, v7
384; GFX10-CONTRACT-NEXT:    v_mov_b32_e32 v0, v8
385; GFX10-CONTRACT-NEXT:    v_mov_b32_e32 v1, v9
386; GFX10-CONTRACT-NEXT:    v_mov_b32_e32 v2, v10
387; GFX10-CONTRACT-NEXT:    v_mov_b32_e32 v3, v11
388; GFX10-CONTRACT-NEXT:    s_setpc_b64 s[30:31]
389;
390; GFX10-DENORM-LABEL: test_v4f32_add_mul:
391; GFX10-DENORM:       ; %bb.0: ; %.entry
392; GFX10-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
393; GFX10-DENORM-NEXT:    v_fma_f32 v8, v8, v12, v16
394; GFX10-DENORM-NEXT:    v_fma_f32 v9, v9, v13, v17
395; GFX10-DENORM-NEXT:    v_fma_f32 v10, v10, v14, v18
396; GFX10-DENORM-NEXT:    v_fma_f32 v11, v11, v15, v19
397; GFX10-DENORM-NEXT:    v_fmac_f32_e32 v8, v0, v4
398; GFX10-DENORM-NEXT:    v_fmac_f32_e32 v9, v1, v5
399; GFX10-DENORM-NEXT:    v_fmac_f32_e32 v10, v2, v6
400; GFX10-DENORM-NEXT:    v_fmac_f32_e32 v11, v3, v7
401; GFX10-DENORM-NEXT:    v_mov_b32_e32 v0, v8
402; GFX10-DENORM-NEXT:    v_mov_b32_e32 v1, v9
403; GFX10-DENORM-NEXT:    v_mov_b32_e32 v2, v10
404; GFX10-DENORM-NEXT:    v_mov_b32_e32 v3, v11
405; GFX10-DENORM-NEXT:    s_setpc_b64 s[30:31]
406;
407; GFX11-CONTRACT-LABEL: test_v4f32_add_mul:
408; GFX11-CONTRACT:       ; %bb.0: ; %.entry
409; GFX11-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
410; GFX11-CONTRACT-NEXT:    v_fma_f32 v8, v8, v12, v16
411; GFX11-CONTRACT-NEXT:    v_fma_f32 v9, v9, v13, v17
412; GFX11-CONTRACT-NEXT:    v_fma_f32 v10, v10, v14, v18
413; GFX11-CONTRACT-NEXT:    v_fma_f32 v11, v11, v15, v19
414; GFX11-CONTRACT-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
415; GFX11-CONTRACT-NEXT:    v_dual_fmac_f32 v8, v0, v4 :: v_dual_fmac_f32 v9, v1, v5
416; GFX11-CONTRACT-NEXT:    v_dual_fmac_f32 v10, v2, v6 :: v_dual_fmac_f32 v11, v3, v7
417; GFX11-CONTRACT-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
418; GFX11-CONTRACT-NEXT:    v_dual_mov_b32 v0, v8 :: v_dual_mov_b32 v1, v9
419; GFX11-CONTRACT-NEXT:    v_dual_mov_b32 v2, v10 :: v_dual_mov_b32 v3, v11
420; GFX11-CONTRACT-NEXT:    s_setpc_b64 s[30:31]
421;
422; GFX11-DENORM-LABEL: test_v4f32_add_mul:
423; GFX11-DENORM:       ; %bb.0: ; %.entry
424; GFX11-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
425; GFX11-DENORM-NEXT:    v_fma_f32 v8, v8, v12, v16
426; GFX11-DENORM-NEXT:    v_fma_f32 v9, v9, v13, v17
427; GFX11-DENORM-NEXT:    v_fma_f32 v10, v10, v14, v18
428; GFX11-DENORM-NEXT:    v_fma_f32 v11, v11, v15, v19
429; GFX11-DENORM-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
430; GFX11-DENORM-NEXT:    v_dual_fmac_f32 v8, v0, v4 :: v_dual_fmac_f32 v9, v1, v5
431; GFX11-DENORM-NEXT:    v_dual_fmac_f32 v10, v2, v6 :: v_dual_fmac_f32 v11, v3, v7
432; GFX11-DENORM-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
433; GFX11-DENORM-NEXT:    v_dual_mov_b32 v0, v8 :: v_dual_mov_b32 v1, v9
434; GFX11-DENORM-NEXT:    v_dual_mov_b32 v2, v10 :: v_dual_mov_b32 v3, v11
435; GFX11-DENORM-NEXT:    s_setpc_b64 s[30:31]
436.entry:
437  %x = fmul fast <4 x float> %c, %d ; product that becomes the fmuladd addend
438  %y = call fast <4 x float> @llvm.fmuladd.v4f32(<4 x float> %a, <4 x float> %b, <4 x float> %x)
439  %z = fadd fast <4 x float> %y, %e ; fmuladd result is the left operand
440  ret <4 x float> %z
441}
442
; <4 x float>, fmuladd result on the right of the final add:
;   z = e + fmuladd(a, b, c * d), with every operation marked 'fast'.
; The expected instruction sequences match the left-operand <4 x float>
; variant: each 32-bit lane gets two fused multiply-add style operations
; (v_fma_f32/v_fmac_f32, or v_mad_f32/v_mac_f32 on gfx900 with
; preserve-sign), and no standalone multiply or add appears in any run.
443define <4 x float> @test_v4f32_add_mul_rhs(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x float> %d, <4 x float> %e) {
444; GFX9-CONTRACT-LABEL: test_v4f32_add_mul_rhs:
445; GFX9-CONTRACT:       ; %bb.0: ; %.entry
446; GFX9-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
447; GFX9-CONTRACT-NEXT:    v_fma_f32 v8, v8, v12, v16
448; GFX9-CONTRACT-NEXT:    v_fma_f32 v9, v9, v13, v17
449; GFX9-CONTRACT-NEXT:    v_fma_f32 v10, v10, v14, v18
450; GFX9-CONTRACT-NEXT:    v_fma_f32 v11, v11, v15, v19
451; GFX9-CONTRACT-NEXT:    v_fma_f32 v0, v0, v4, v8
452; GFX9-CONTRACT-NEXT:    v_fma_f32 v1, v1, v5, v9
453; GFX9-CONTRACT-NEXT:    v_fma_f32 v2, v2, v6, v10
454; GFX9-CONTRACT-NEXT:    v_fma_f32 v3, v3, v7, v11
455; GFX9-CONTRACT-NEXT:    s_setpc_b64 s[30:31]
456;
457; GFX9-DENORM-LABEL: test_v4f32_add_mul_rhs:
458; GFX9-DENORM:       ; %bb.0: ; %.entry
459; GFX9-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
460; GFX9-DENORM-NEXT:    v_mad_f32 v8, v8, v12, v16
461; GFX9-DENORM-NEXT:    v_mac_f32_e32 v8, v0, v4
462; GFX9-DENORM-NEXT:    v_mad_f32 v4, v9, v13, v17
463; GFX9-DENORM-NEXT:    v_mac_f32_e32 v4, v1, v5
464; GFX9-DENORM-NEXT:    v_mad_f32 v5, v10, v14, v18
465; GFX9-DENORM-NEXT:    v_mac_f32_e32 v5, v2, v6
466; GFX9-DENORM-NEXT:    v_mad_f32 v6, v11, v15, v19
467; GFX9-DENORM-NEXT:    v_mac_f32_e32 v6, v3, v7
468; GFX9-DENORM-NEXT:    v_mov_b32_e32 v0, v8
469; GFX9-DENORM-NEXT:    v_mov_b32_e32 v1, v4
470; GFX9-DENORM-NEXT:    v_mov_b32_e32 v2, v5
471; GFX9-DENORM-NEXT:    v_mov_b32_e32 v3, v6
472; GFX9-DENORM-NEXT:    s_setpc_b64 s[30:31]
473;
474; GFX10-CONTRACT-LABEL: test_v4f32_add_mul_rhs:
475; GFX10-CONTRACT:       ; %bb.0: ; %.entry
476; GFX10-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
477; GFX10-CONTRACT-NEXT:    v_fma_f32 v8, v8, v12, v16
478; GFX10-CONTRACT-NEXT:    v_fma_f32 v9, v9, v13, v17
479; GFX10-CONTRACT-NEXT:    v_fma_f32 v10, v10, v14, v18
480; GFX10-CONTRACT-NEXT:    v_fma_f32 v11, v11, v15, v19
481; GFX10-CONTRACT-NEXT:    v_fmac_f32_e32 v8, v0, v4
482; GFX10-CONTRACT-NEXT:    v_fmac_f32_e32 v9, v1, v5
483; GFX10-CONTRACT-NEXT:    v_fmac_f32_e32 v10, v2, v6
484; GFX10-CONTRACT-NEXT:    v_fmac_f32_e32 v11, v3, v7
485; GFX10-CONTRACT-NEXT:    v_mov_b32_e32 v0, v8
486; GFX10-CONTRACT-NEXT:    v_mov_b32_e32 v1, v9
487; GFX10-CONTRACT-NEXT:    v_mov_b32_e32 v2, v10
488; GFX10-CONTRACT-NEXT:    v_mov_b32_e32 v3, v11
489; GFX10-CONTRACT-NEXT:    s_setpc_b64 s[30:31]
490;
491; GFX10-DENORM-LABEL: test_v4f32_add_mul_rhs:
492; GFX10-DENORM:       ; %bb.0: ; %.entry
493; GFX10-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
494; GFX10-DENORM-NEXT:    v_fma_f32 v8, v8, v12, v16
495; GFX10-DENORM-NEXT:    v_fma_f32 v9, v9, v13, v17
496; GFX10-DENORM-NEXT:    v_fma_f32 v10, v10, v14, v18
497; GFX10-DENORM-NEXT:    v_fma_f32 v11, v11, v15, v19
498; GFX10-DENORM-NEXT:    v_fmac_f32_e32 v8, v0, v4
499; GFX10-DENORM-NEXT:    v_fmac_f32_e32 v9, v1, v5
500; GFX10-DENORM-NEXT:    v_fmac_f32_e32 v10, v2, v6
501; GFX10-DENORM-NEXT:    v_fmac_f32_e32 v11, v3, v7
502; GFX10-DENORM-NEXT:    v_mov_b32_e32 v0, v8
503; GFX10-DENORM-NEXT:    v_mov_b32_e32 v1, v9
504; GFX10-DENORM-NEXT:    v_mov_b32_e32 v2, v10
505; GFX10-DENORM-NEXT:    v_mov_b32_e32 v3, v11
506; GFX10-DENORM-NEXT:    s_setpc_b64 s[30:31]
507;
508; GFX11-CONTRACT-LABEL: test_v4f32_add_mul_rhs:
509; GFX11-CONTRACT:       ; %bb.0: ; %.entry
510; GFX11-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
511; GFX11-CONTRACT-NEXT:    v_fma_f32 v8, v8, v12, v16
512; GFX11-CONTRACT-NEXT:    v_fma_f32 v9, v9, v13, v17
513; GFX11-CONTRACT-NEXT:    v_fma_f32 v10, v10, v14, v18
514; GFX11-CONTRACT-NEXT:    v_fma_f32 v11, v11, v15, v19
515; GFX11-CONTRACT-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
516; GFX11-CONTRACT-NEXT:    v_dual_fmac_f32 v8, v0, v4 :: v_dual_fmac_f32 v9, v1, v5
517; GFX11-CONTRACT-NEXT:    v_dual_fmac_f32 v10, v2, v6 :: v_dual_fmac_f32 v11, v3, v7
518; GFX11-CONTRACT-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
519; GFX11-CONTRACT-NEXT:    v_dual_mov_b32 v0, v8 :: v_dual_mov_b32 v1, v9
520; GFX11-CONTRACT-NEXT:    v_dual_mov_b32 v2, v10 :: v_dual_mov_b32 v3, v11
521; GFX11-CONTRACT-NEXT:    s_setpc_b64 s[30:31]
522;
523; GFX11-DENORM-LABEL: test_v4f32_add_mul_rhs:
524; GFX11-DENORM:       ; %bb.0: ; %.entry
525; GFX11-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
526; GFX11-DENORM-NEXT:    v_fma_f32 v8, v8, v12, v16
527; GFX11-DENORM-NEXT:    v_fma_f32 v9, v9, v13, v17
528; GFX11-DENORM-NEXT:    v_fma_f32 v10, v10, v14, v18
529; GFX11-DENORM-NEXT:    v_fma_f32 v11, v11, v15, v19
530; GFX11-DENORM-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
531; GFX11-DENORM-NEXT:    v_dual_fmac_f32 v8, v0, v4 :: v_dual_fmac_f32 v9, v1, v5
532; GFX11-DENORM-NEXT:    v_dual_fmac_f32 v10, v2, v6 :: v_dual_fmac_f32 v11, v3, v7
533; GFX11-DENORM-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
534; GFX11-DENORM-NEXT:    v_dual_mov_b32 v0, v8 :: v_dual_mov_b32 v1, v9
535; GFX11-DENORM-NEXT:    v_dual_mov_b32 v2, v10 :: v_dual_mov_b32 v3, v11
536; GFX11-DENORM-NEXT:    s_setpc_b64 s[30:31]
537.entry:
538  %x = fmul fast <4 x float> %c, %d ; product that becomes the fmuladd addend
539  %y = call fast <4 x float> @llvm.fmuladd.v4f32(<4 x float> %a, <4 x float> %b, <4 x float> %x)
540  %z = fadd fast <4 x float> %e, %y ; fmuladd result is the right operand
541  ret <4 x float> %z
542}
543
; <4 x half>, fmuladd result on the left of the final add (despite the
; scalar-looking name, every value in this function is a <4 x half>):
;   z = fmuladd(a, b, c * d) + e, with every operation marked 'fast'.
; Expected output differs by configuration:
;   * all three -fp-contract=fast runs: four v_pk_fma_f16, two that compute
;     v4 * v6 + v8 and v5 * v7 + v9, then two that accumulate v0 * v2 and
;     v1 * v3 onto those results;
;   * all three preserve-sign runs: no fused instruction, only four
;     v_pk_mul_f16 followed by four v_pk_add_f16.
; NOTE(review): the reason the preserve-sign runs stay unfused is not
; visible in this file; confirm against the target's packed-f16 legality
; rules.
544define <4 x half> @test_f16_add_mul(<4 x half> %a, <4 x half> %b, <4 x half> %c, <4 x half> %d, <4 x half> %e) {
545; GFX9-CONTRACT-LABEL: test_f16_add_mul:
546; GFX9-CONTRACT:       ; %bb.0: ; %.entry
547; GFX9-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
548; GFX9-CONTRACT-NEXT:    v_pk_fma_f16 v4, v4, v6, v8
549; GFX9-CONTRACT-NEXT:    v_pk_fma_f16 v5, v5, v7, v9
550; GFX9-CONTRACT-NEXT:    v_pk_fma_f16 v0, v0, v2, v4
551; GFX9-CONTRACT-NEXT:    v_pk_fma_f16 v1, v1, v3, v5
552; GFX9-CONTRACT-NEXT:    s_setpc_b64 s[30:31]
553;
554; GFX9-DENORM-LABEL: test_f16_add_mul:
555; GFX9-DENORM:       ; %bb.0: ; %.entry
556; GFX9-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
557; GFX9-DENORM-NEXT:    v_pk_mul_f16 v4, v4, v6
558; GFX9-DENORM-NEXT:    v_pk_mul_f16 v5, v5, v7
559; GFX9-DENORM-NEXT:    v_pk_mul_f16 v0, v0, v2
560; GFX9-DENORM-NEXT:    v_pk_mul_f16 v1, v1, v3
561; GFX9-DENORM-NEXT:    v_pk_add_f16 v0, v0, v4
562; GFX9-DENORM-NEXT:    v_pk_add_f16 v1, v1, v5
563; GFX9-DENORM-NEXT:    v_pk_add_f16 v0, v0, v8
564; GFX9-DENORM-NEXT:    v_pk_add_f16 v1, v1, v9
565; GFX9-DENORM-NEXT:    s_setpc_b64 s[30:31]
566;
567; GFX10-CONTRACT-LABEL: test_f16_add_mul:
568; GFX10-CONTRACT:       ; %bb.0: ; %.entry
569; GFX10-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
570; GFX10-CONTRACT-NEXT:    v_pk_fma_f16 v4, v4, v6, v8
571; GFX10-CONTRACT-NEXT:    v_pk_fma_f16 v5, v5, v7, v9
572; GFX10-CONTRACT-NEXT:    v_pk_fma_f16 v0, v0, v2, v4
573; GFX10-CONTRACT-NEXT:    v_pk_fma_f16 v1, v1, v3, v5
574; GFX10-CONTRACT-NEXT:    s_setpc_b64 s[30:31]
575;
576; GFX10-DENORM-LABEL: test_f16_add_mul:
577; GFX10-DENORM:       ; %bb.0: ; %.entry
578; GFX10-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
579; GFX10-DENORM-NEXT:    v_pk_mul_f16 v4, v4, v6
580; GFX10-DENORM-NEXT:    v_pk_mul_f16 v5, v5, v7
581; GFX10-DENORM-NEXT:    v_pk_mul_f16 v0, v0, v2
582; GFX10-DENORM-NEXT:    v_pk_mul_f16 v1, v1, v3
583; GFX10-DENORM-NEXT:    v_pk_add_f16 v0, v0, v4
584; GFX10-DENORM-NEXT:    v_pk_add_f16 v1, v1, v5
585; GFX10-DENORM-NEXT:    v_pk_add_f16 v0, v0, v8
586; GFX10-DENORM-NEXT:    v_pk_add_f16 v1, v1, v9
587; GFX10-DENORM-NEXT:    s_setpc_b64 s[30:31]
588;
589; GFX11-CONTRACT-LABEL: test_f16_add_mul:
590; GFX11-CONTRACT:       ; %bb.0: ; %.entry
591; GFX11-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
592; GFX11-CONTRACT-NEXT:    v_pk_fma_f16 v4, v4, v6, v8
593; GFX11-CONTRACT-NEXT:    v_pk_fma_f16 v5, v5, v7, v9
594; GFX11-CONTRACT-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
595; GFX11-CONTRACT-NEXT:    v_pk_fma_f16 v0, v0, v2, v4
596; GFX11-CONTRACT-NEXT:    v_pk_fma_f16 v1, v1, v3, v5
597; GFX11-CONTRACT-NEXT:    s_setpc_b64 s[30:31]
598;
599; GFX11-DENORM-LABEL: test_f16_add_mul:
600; GFX11-DENORM:       ; %bb.0: ; %.entry
601; GFX11-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
602; GFX11-DENORM-NEXT:    v_pk_mul_f16 v4, v4, v6
603; GFX11-DENORM-NEXT:    v_pk_mul_f16 v5, v5, v7
604; GFX11-DENORM-NEXT:    v_pk_mul_f16 v0, v0, v2
605; GFX11-DENORM-NEXT:    v_pk_mul_f16 v1, v1, v3
606; GFX11-DENORM-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
607; GFX11-DENORM-NEXT:    v_pk_add_f16 v0, v0, v4
608; GFX11-DENORM-NEXT:    v_pk_add_f16 v1, v1, v5
609; GFX11-DENORM-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
610; GFX11-DENORM-NEXT:    v_pk_add_f16 v0, v0, v8
611; GFX11-DENORM-NEXT:    v_pk_add_f16 v1, v1, v9
612; GFX11-DENORM-NEXT:    s_setpc_b64 s[30:31]
613.entry:
614  %x = fmul fast <4 x half> %c, %d ; product that becomes the fmuladd addend
615  %y = call fast <4 x half> @llvm.fmuladd.v4f16(<4 x half> %a, <4 x half> %b, <4 x half> %x)
616  %z = fadd fast <4 x half> %y, %e ; fmuladd result is the left operand
617  ret <4 x half> %z
618}
619
; <4 x half> variant with the operands of the final fadd commuted, i.e.
;   fadd e, (fmuladd a, b, (fmul c, d))
; In the expected output below, the prefixes of the -fp-contract=fast runs
; expect two chained v_pk_fma_f16 per result register (c*d+e first, then a*b
; plus that result). The prefixes of the preserve-sign denormal runs expect
; the unfused sequence of v_pk_mul_f16 / v_pk_add_f16, with %e (v8, v9) as the
; first source operand of the last add.
620define <4 x half> @test_f16_add_mul_rhs(<4 x half> %a, <4 x half> %b, <4 x half> %c, <4 x half> %d, <4 x half> %e) {
621; GFX9-CONTRACT-LABEL: test_f16_add_mul_rhs:
622; GFX9-CONTRACT:       ; %bb.0: ; %.entry
623; GFX9-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
624; GFX9-CONTRACT-NEXT:    v_pk_fma_f16 v4, v4, v6, v8
625; GFX9-CONTRACT-NEXT:    v_pk_fma_f16 v5, v5, v7, v9
626; GFX9-CONTRACT-NEXT:    v_pk_fma_f16 v0, v0, v2, v4
627; GFX9-CONTRACT-NEXT:    v_pk_fma_f16 v1, v1, v3, v5
628; GFX9-CONTRACT-NEXT:    s_setpc_b64 s[30:31]
629;
630; GFX9-DENORM-LABEL: test_f16_add_mul_rhs:
631; GFX9-DENORM:       ; %bb.0: ; %.entry
632; GFX9-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
633; GFX9-DENORM-NEXT:    v_pk_mul_f16 v4, v4, v6
634; GFX9-DENORM-NEXT:    v_pk_mul_f16 v5, v5, v7
635; GFX9-DENORM-NEXT:    v_pk_mul_f16 v0, v0, v2
636; GFX9-DENORM-NEXT:    v_pk_mul_f16 v1, v1, v3
637; GFX9-DENORM-NEXT:    v_pk_add_f16 v0, v0, v4
638; GFX9-DENORM-NEXT:    v_pk_add_f16 v1, v1, v5
639; GFX9-DENORM-NEXT:    v_pk_add_f16 v0, v8, v0
640; GFX9-DENORM-NEXT:    v_pk_add_f16 v1, v9, v1
641; GFX9-DENORM-NEXT:    s_setpc_b64 s[30:31]
642;
643; GFX10-CONTRACT-LABEL: test_f16_add_mul_rhs:
644; GFX10-CONTRACT:       ; %bb.0: ; %.entry
645; GFX10-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
646; GFX10-CONTRACT-NEXT:    v_pk_fma_f16 v4, v4, v6, v8
647; GFX10-CONTRACT-NEXT:    v_pk_fma_f16 v5, v5, v7, v9
648; GFX10-CONTRACT-NEXT:    v_pk_fma_f16 v0, v0, v2, v4
649; GFX10-CONTRACT-NEXT:    v_pk_fma_f16 v1, v1, v3, v5
650; GFX10-CONTRACT-NEXT:    s_setpc_b64 s[30:31]
651;
652; GFX10-DENORM-LABEL: test_f16_add_mul_rhs:
653; GFX10-DENORM:       ; %bb.0: ; %.entry
654; GFX10-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
655; GFX10-DENORM-NEXT:    v_pk_mul_f16 v4, v4, v6
656; GFX10-DENORM-NEXT:    v_pk_mul_f16 v5, v5, v7
657; GFX10-DENORM-NEXT:    v_pk_mul_f16 v0, v0, v2
658; GFX10-DENORM-NEXT:    v_pk_mul_f16 v1, v1, v3
659; GFX10-DENORM-NEXT:    v_pk_add_f16 v0, v0, v4
660; GFX10-DENORM-NEXT:    v_pk_add_f16 v1, v1, v5
661; GFX10-DENORM-NEXT:    v_pk_add_f16 v0, v8, v0
662; GFX10-DENORM-NEXT:    v_pk_add_f16 v1, v9, v1
663; GFX10-DENORM-NEXT:    s_setpc_b64 s[30:31]
664;
665; GFX11-CONTRACT-LABEL: test_f16_add_mul_rhs:
666; GFX11-CONTRACT:       ; %bb.0: ; %.entry
667; GFX11-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
668; GFX11-CONTRACT-NEXT:    v_pk_fma_f16 v4, v4, v6, v8
669; GFX11-CONTRACT-NEXT:    v_pk_fma_f16 v5, v5, v7, v9
670; GFX11-CONTRACT-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
671; GFX11-CONTRACT-NEXT:    v_pk_fma_f16 v0, v0, v2, v4
672; GFX11-CONTRACT-NEXT:    v_pk_fma_f16 v1, v1, v3, v5
673; GFX11-CONTRACT-NEXT:    s_setpc_b64 s[30:31]
674;
675; GFX11-DENORM-LABEL: test_f16_add_mul_rhs:
676; GFX11-DENORM:       ; %bb.0: ; %.entry
677; GFX11-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
678; GFX11-DENORM-NEXT:    v_pk_mul_f16 v4, v4, v6
679; GFX11-DENORM-NEXT:    v_pk_mul_f16 v5, v5, v7
680; GFX11-DENORM-NEXT:    v_pk_mul_f16 v0, v0, v2
681; GFX11-DENORM-NEXT:    v_pk_mul_f16 v1, v1, v3
682; GFX11-DENORM-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
683; GFX11-DENORM-NEXT:    v_pk_add_f16 v0, v0, v4
684; GFX11-DENORM-NEXT:    v_pk_add_f16 v1, v1, v5
685; GFX11-DENORM-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
686; GFX11-DENORM-NEXT:    v_pk_add_f16 v0, v8, v0
687; GFX11-DENORM-NEXT:    v_pk_add_f16 v1, v9, v1
688; GFX11-DENORM-NEXT:    s_setpc_b64 s[30:31]
689.entry:
690  %x = fmul fast <4 x half> %c, %d
691  %y = call fast <4 x half> @llvm.fmuladd.v4f16(<4 x half> %a, <4 x half> %b, <4 x half> %x)
692  %z = fadd fast <4 x half> %e, %y
693  ret <4 x half> %z
694}
695
; <4 x double> variant of
;   fadd (fmuladd a, b, (fmul c, d)), e
; Every prefix below expects the fused form: per element, one v_fma_f64
; computing c*d plus an addend loaded from the stack, followed by one
; v_fma_f64 computing a*b plus that result.
; The expected code loads v31 (high half of v[30:31]) from s32 offset 0 and
; the eight addend dwords from s32 offsets 4..32 (presumably %e, which does
; not fit in the argument VGPRs). The gfx900 expectations interleave these
; loads with the arithmetic; the gfx1010/gfx1100 ones issue all nine loads in
; one s_clause and then wait on vmcnt in steps.
696define <4 x double> @test_f64_add_mul(<4 x double> %a, <4 x double> %b, <4 x double> %c, <4 x double> %d, <4 x double> %e) {
697; GFX9-CONTRACT-LABEL: test_f64_add_mul:
698; GFX9-CONTRACT:       ; %bb.0: ; %.entry
699; GFX9-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
700; GFX9-CONTRACT-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:4
701; GFX9-CONTRACT-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:8
702; GFX9-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
703; GFX9-CONTRACT-NEXT:    v_fma_f64 v[16:17], v[16:17], v[24:25], v[31:32]
704; GFX9-CONTRACT-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:12
705; GFX9-CONTRACT-NEXT:    buffer_load_dword v25, off, s[0:3], s32 offset:16
706; GFX9-CONTRACT-NEXT:    v_fma_f64 v[0:1], v[0:1], v[8:9], v[16:17]
707; GFX9-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
708; GFX9-CONTRACT-NEXT:    v_fma_f64 v[18:19], v[18:19], v[26:27], v[24:25]
709; GFX9-CONTRACT-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:20
710; GFX9-CONTRACT-NEXT:    buffer_load_dword v25, off, s[0:3], s32 offset:24
711; GFX9-CONTRACT-NEXT:    v_fma_f64 v[2:3], v[2:3], v[10:11], v[18:19]
712; GFX9-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
713; GFX9-CONTRACT-NEXT:    v_fma_f64 v[20:21], v[20:21], v[28:29], v[24:25]
714; GFX9-CONTRACT-NEXT:    buffer_load_dword v31, off, s[0:3], s32
715; GFX9-CONTRACT-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:28
716; GFX9-CONTRACT-NEXT:    buffer_load_dword v25, off, s[0:3], s32 offset:32
717; GFX9-CONTRACT-NEXT:    v_fma_f64 v[4:5], v[4:5], v[12:13], v[20:21]
718; GFX9-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
719; GFX9-CONTRACT-NEXT:    v_fma_f64 v[22:23], v[22:23], v[30:31], v[24:25]
720; GFX9-CONTRACT-NEXT:    v_fma_f64 v[6:7], v[6:7], v[14:15], v[22:23]
721; GFX9-CONTRACT-NEXT:    s_setpc_b64 s[30:31]
722;
723; GFX9-DENORM-LABEL: test_f64_add_mul:
724; GFX9-DENORM:       ; %bb.0: ; %.entry
725; GFX9-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
726; GFX9-DENORM-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:4
727; GFX9-DENORM-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:8
728; GFX9-DENORM-NEXT:    s_waitcnt vmcnt(0)
729; GFX9-DENORM-NEXT:    v_fma_f64 v[16:17], v[16:17], v[24:25], v[31:32]
730; GFX9-DENORM-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:12
731; GFX9-DENORM-NEXT:    buffer_load_dword v25, off, s[0:3], s32 offset:16
732; GFX9-DENORM-NEXT:    v_fma_f64 v[0:1], v[0:1], v[8:9], v[16:17]
733; GFX9-DENORM-NEXT:    s_waitcnt vmcnt(0)
734; GFX9-DENORM-NEXT:    v_fma_f64 v[18:19], v[18:19], v[26:27], v[24:25]
735; GFX9-DENORM-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:20
736; GFX9-DENORM-NEXT:    buffer_load_dword v25, off, s[0:3], s32 offset:24
737; GFX9-DENORM-NEXT:    v_fma_f64 v[2:3], v[2:3], v[10:11], v[18:19]
738; GFX9-DENORM-NEXT:    s_waitcnt vmcnt(0)
739; GFX9-DENORM-NEXT:    v_fma_f64 v[20:21], v[20:21], v[28:29], v[24:25]
740; GFX9-DENORM-NEXT:    buffer_load_dword v31, off, s[0:3], s32
741; GFX9-DENORM-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:28
742; GFX9-DENORM-NEXT:    buffer_load_dword v25, off, s[0:3], s32 offset:32
743; GFX9-DENORM-NEXT:    v_fma_f64 v[4:5], v[4:5], v[12:13], v[20:21]
744; GFX9-DENORM-NEXT:    s_waitcnt vmcnt(0)
745; GFX9-DENORM-NEXT:    v_fma_f64 v[22:23], v[22:23], v[30:31], v[24:25]
746; GFX9-DENORM-NEXT:    v_fma_f64 v[6:7], v[6:7], v[14:15], v[22:23]
747; GFX9-DENORM-NEXT:    s_setpc_b64 s[30:31]
748;
749; GFX10-CONTRACT-LABEL: test_f64_add_mul:
750; GFX10-CONTRACT:       ; %bb.0: ; %.entry
751; GFX10-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
752; GFX10-CONTRACT-NEXT:    s_clause 0x8
753; GFX10-CONTRACT-NEXT:    buffer_load_dword v31, off, s[0:3], s32
754; GFX10-CONTRACT-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:4
755; GFX10-CONTRACT-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:8
756; GFX10-CONTRACT-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:12
757; GFX10-CONTRACT-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:16
758; GFX10-CONTRACT-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:20
759; GFX10-CONTRACT-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:24
760; GFX10-CONTRACT-NEXT:    buffer_load_dword v38, off, s[0:3], s32 offset:28
761; GFX10-CONTRACT-NEXT:    buffer_load_dword v39, off, s[0:3], s32 offset:32
762; GFX10-CONTRACT-NEXT:    s_waitcnt vmcnt(6)
763; GFX10-CONTRACT-NEXT:    v_fma_f64 v[16:17], v[16:17], v[24:25], v[32:33]
764; GFX10-CONTRACT-NEXT:    s_waitcnt vmcnt(4)
765; GFX10-CONTRACT-NEXT:    v_fma_f64 v[18:19], v[18:19], v[26:27], v[34:35]
766; GFX10-CONTRACT-NEXT:    s_waitcnt vmcnt(2)
767; GFX10-CONTRACT-NEXT:    v_fma_f64 v[20:21], v[20:21], v[28:29], v[36:37]
768; GFX10-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
769; GFX10-CONTRACT-NEXT:    v_fma_f64 v[22:23], v[22:23], v[30:31], v[38:39]
770; GFX10-CONTRACT-NEXT:    v_fma_f64 v[0:1], v[0:1], v[8:9], v[16:17]
771; GFX10-CONTRACT-NEXT:    v_fma_f64 v[2:3], v[2:3], v[10:11], v[18:19]
772; GFX10-CONTRACT-NEXT:    v_fma_f64 v[4:5], v[4:5], v[12:13], v[20:21]
773; GFX10-CONTRACT-NEXT:    v_fma_f64 v[6:7], v[6:7], v[14:15], v[22:23]
774; GFX10-CONTRACT-NEXT:    s_setpc_b64 s[30:31]
775;
776; GFX10-DENORM-LABEL: test_f64_add_mul:
777; GFX10-DENORM:       ; %bb.0: ; %.entry
778; GFX10-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
779; GFX10-DENORM-NEXT:    s_clause 0x8
780; GFX10-DENORM-NEXT:    buffer_load_dword v31, off, s[0:3], s32
781; GFX10-DENORM-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:4
782; GFX10-DENORM-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:8
783; GFX10-DENORM-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:12
784; GFX10-DENORM-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:16
785; GFX10-DENORM-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:20
786; GFX10-DENORM-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:24
787; GFX10-DENORM-NEXT:    buffer_load_dword v38, off, s[0:3], s32 offset:28
788; GFX10-DENORM-NEXT:    buffer_load_dword v39, off, s[0:3], s32 offset:32
789; GFX10-DENORM-NEXT:    s_waitcnt vmcnt(6)
790; GFX10-DENORM-NEXT:    v_fma_f64 v[16:17], v[16:17], v[24:25], v[32:33]
791; GFX10-DENORM-NEXT:    s_waitcnt vmcnt(4)
792; GFX10-DENORM-NEXT:    v_fma_f64 v[18:19], v[18:19], v[26:27], v[34:35]
793; GFX10-DENORM-NEXT:    s_waitcnt vmcnt(2)
794; GFX10-DENORM-NEXT:    v_fma_f64 v[20:21], v[20:21], v[28:29], v[36:37]
795; GFX10-DENORM-NEXT:    s_waitcnt vmcnt(0)
796; GFX10-DENORM-NEXT:    v_fma_f64 v[22:23], v[22:23], v[30:31], v[38:39]
797; GFX10-DENORM-NEXT:    v_fma_f64 v[0:1], v[0:1], v[8:9], v[16:17]
798; GFX10-DENORM-NEXT:    v_fma_f64 v[2:3], v[2:3], v[10:11], v[18:19]
799; GFX10-DENORM-NEXT:    v_fma_f64 v[4:5], v[4:5], v[12:13], v[20:21]
800; GFX10-DENORM-NEXT:    v_fma_f64 v[6:7], v[6:7], v[14:15], v[22:23]
801; GFX10-DENORM-NEXT:    s_setpc_b64 s[30:31]
802;
803; GFX11-CONTRACT-LABEL: test_f64_add_mul:
804; GFX11-CONTRACT:       ; %bb.0: ; %.entry
805; GFX11-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
806; GFX11-CONTRACT-NEXT:    s_clause 0x8
807; GFX11-CONTRACT-NEXT:    scratch_load_b32 v31, off, s32
808; GFX11-CONTRACT-NEXT:    scratch_load_b32 v32, off, s32 offset:4
809; GFX11-CONTRACT-NEXT:    scratch_load_b32 v33, off, s32 offset:8
810; GFX11-CONTRACT-NEXT:    scratch_load_b32 v34, off, s32 offset:12
811; GFX11-CONTRACT-NEXT:    scratch_load_b32 v35, off, s32 offset:16
812; GFX11-CONTRACT-NEXT:    scratch_load_b32 v36, off, s32 offset:20
813; GFX11-CONTRACT-NEXT:    scratch_load_b32 v37, off, s32 offset:24
814; GFX11-CONTRACT-NEXT:    scratch_load_b32 v38, off, s32 offset:28
815; GFX11-CONTRACT-NEXT:    scratch_load_b32 v39, off, s32 offset:32
816; GFX11-CONTRACT-NEXT:    s_waitcnt vmcnt(6)
817; GFX11-CONTRACT-NEXT:    v_fma_f64 v[16:17], v[16:17], v[24:25], v[32:33]
818; GFX11-CONTRACT-NEXT:    s_waitcnt vmcnt(4)
819; GFX11-CONTRACT-NEXT:    v_fma_f64 v[18:19], v[18:19], v[26:27], v[34:35]
820; GFX11-CONTRACT-NEXT:    s_waitcnt vmcnt(2)
821; GFX11-CONTRACT-NEXT:    v_fma_f64 v[20:21], v[20:21], v[28:29], v[36:37]
822; GFX11-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
823; GFX11-CONTRACT-NEXT:    v_fma_f64 v[22:23], v[22:23], v[30:31], v[38:39]
824; GFX11-CONTRACT-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
825; GFX11-CONTRACT-NEXT:    v_fma_f64 v[0:1], v[0:1], v[8:9], v[16:17]
826; GFX11-CONTRACT-NEXT:    v_fma_f64 v[2:3], v[2:3], v[10:11], v[18:19]
827; GFX11-CONTRACT-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
828; GFX11-CONTRACT-NEXT:    v_fma_f64 v[4:5], v[4:5], v[12:13], v[20:21]
829; GFX11-CONTRACT-NEXT:    v_fma_f64 v[6:7], v[6:7], v[14:15], v[22:23]
830; GFX11-CONTRACT-NEXT:    s_setpc_b64 s[30:31]
831;
832; GFX11-DENORM-LABEL: test_f64_add_mul:
833; GFX11-DENORM:       ; %bb.0: ; %.entry
834; GFX11-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
835; GFX11-DENORM-NEXT:    s_clause 0x8
836; GFX11-DENORM-NEXT:    scratch_load_b32 v31, off, s32
837; GFX11-DENORM-NEXT:    scratch_load_b32 v32, off, s32 offset:4
838; GFX11-DENORM-NEXT:    scratch_load_b32 v33, off, s32 offset:8
839; GFX11-DENORM-NEXT:    scratch_load_b32 v34, off, s32 offset:12
840; GFX11-DENORM-NEXT:    scratch_load_b32 v35, off, s32 offset:16
841; GFX11-DENORM-NEXT:    scratch_load_b32 v36, off, s32 offset:20
842; GFX11-DENORM-NEXT:    scratch_load_b32 v37, off, s32 offset:24
843; GFX11-DENORM-NEXT:    scratch_load_b32 v38, off, s32 offset:28
844; GFX11-DENORM-NEXT:    scratch_load_b32 v39, off, s32 offset:32
845; GFX11-DENORM-NEXT:    s_waitcnt vmcnt(6)
846; GFX11-DENORM-NEXT:    v_fma_f64 v[16:17], v[16:17], v[24:25], v[32:33]
847; GFX11-DENORM-NEXT:    s_waitcnt vmcnt(4)
848; GFX11-DENORM-NEXT:    v_fma_f64 v[18:19], v[18:19], v[26:27], v[34:35]
849; GFX11-DENORM-NEXT:    s_waitcnt vmcnt(2)
850; GFX11-DENORM-NEXT:    v_fma_f64 v[20:21], v[20:21], v[28:29], v[36:37]
851; GFX11-DENORM-NEXT:    s_waitcnt vmcnt(0)
852; GFX11-DENORM-NEXT:    v_fma_f64 v[22:23], v[22:23], v[30:31], v[38:39]
853; GFX11-DENORM-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
854; GFX11-DENORM-NEXT:    v_fma_f64 v[0:1], v[0:1], v[8:9], v[16:17]
855; GFX11-DENORM-NEXT:    v_fma_f64 v[2:3], v[2:3], v[10:11], v[18:19]
856; GFX11-DENORM-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
857; GFX11-DENORM-NEXT:    v_fma_f64 v[4:5], v[4:5], v[12:13], v[20:21]
858; GFX11-DENORM-NEXT:    v_fma_f64 v[6:7], v[6:7], v[14:15], v[22:23]
859; GFX11-DENORM-NEXT:    s_setpc_b64 s[30:31]
860.entry:
861  %x = fmul fast <4 x double> %c, %d
862  %y = call fast <4 x double> @llvm.fmuladd.v4f64(<4 x double> %a, <4 x double> %b, <4 x double> %x)
863  %z = fadd fast <4 x double> %y, %e
864  ret <4 x double> %z
865}
866
; <4 x double> variant with the operands of the final fadd commuted, i.e.
;   fadd e, (fmuladd a, b, (fmul c, d))
; Every prefix below still expects the fused form: per element, one v_fma_f64
; computing c*d plus an addend loaded from the stack, followed by one
; v_fma_f64 computing a*b plus that result.
; The expected code loads v31 (high half of v[30:31]) from s32 offset 0 and
; the eight addend dwords from s32 offsets 4..32 (presumably %e, which does
; not fit in the argument VGPRs).
867define <4 x double> @test_f64_add_mul_rhs(<4 x double> %a, <4 x double> %b, <4 x double> %c, <4 x double> %d, <4 x double> %e) {
868; GFX9-CONTRACT-LABEL: test_f64_add_mul_rhs:
869; GFX9-CONTRACT:       ; %bb.0: ; %.entry
870; GFX9-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
871; GFX9-CONTRACT-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:4
872; GFX9-CONTRACT-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:8
873; GFX9-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
874; GFX9-CONTRACT-NEXT:    v_fma_f64 v[16:17], v[16:17], v[24:25], v[31:32]
875; GFX9-CONTRACT-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:12
876; GFX9-CONTRACT-NEXT:    buffer_load_dword v25, off, s[0:3], s32 offset:16
877; GFX9-CONTRACT-NEXT:    v_fma_f64 v[0:1], v[0:1], v[8:9], v[16:17]
878; GFX9-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
879; GFX9-CONTRACT-NEXT:    v_fma_f64 v[18:19], v[18:19], v[26:27], v[24:25]
880; GFX9-CONTRACT-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:20
881; GFX9-CONTRACT-NEXT:    buffer_load_dword v25, off, s[0:3], s32 offset:24
882; GFX9-CONTRACT-NEXT:    v_fma_f64 v[2:3], v[2:3], v[10:11], v[18:19]
883; GFX9-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
884; GFX9-CONTRACT-NEXT:    v_fma_f64 v[20:21], v[20:21], v[28:29], v[24:25]
885; GFX9-CONTRACT-NEXT:    buffer_load_dword v31, off, s[0:3], s32
886; GFX9-CONTRACT-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:28
887; GFX9-CONTRACT-NEXT:    buffer_load_dword v25, off, s[0:3], s32 offset:32
888; GFX9-CONTRACT-NEXT:    v_fma_f64 v[4:5], v[4:5], v[12:13], v[20:21]
889; GFX9-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
890; GFX9-CONTRACT-NEXT:    v_fma_f64 v[22:23], v[22:23], v[30:31], v[24:25]
891; GFX9-CONTRACT-NEXT:    v_fma_f64 v[6:7], v[6:7], v[14:15], v[22:23]
892; GFX9-CONTRACT-NEXT:    s_setpc_b64 s[30:31]
893;
894; GFX9-DENORM-LABEL: test_f64_add_mul_rhs:
895; GFX9-DENORM:       ; %bb.0: ; %.entry
896; GFX9-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
897; GFX9-DENORM-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:4
898; GFX9-DENORM-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:8
899; GFX9-DENORM-NEXT:    s_waitcnt vmcnt(0)
900; GFX9-DENORM-NEXT:    v_fma_f64 v[16:17], v[16:17], v[24:25], v[31:32]
901; GFX9-DENORM-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:12
902; GFX9-DENORM-NEXT:    buffer_load_dword v25, off, s[0:3], s32 offset:16
903; GFX9-DENORM-NEXT:    v_fma_f64 v[0:1], v[0:1], v[8:9], v[16:17]
904; GFX9-DENORM-NEXT:    s_waitcnt vmcnt(0)
905; GFX9-DENORM-NEXT:    v_fma_f64 v[18:19], v[18:19], v[26:27], v[24:25]
906; GFX9-DENORM-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:20
907; GFX9-DENORM-NEXT:    buffer_load_dword v25, off, s[0:3], s32 offset:24
908; GFX9-DENORM-NEXT:    v_fma_f64 v[2:3], v[2:3], v[10:11], v[18:19]
909; GFX9-DENORM-NEXT:    s_waitcnt vmcnt(0)
910; GFX9-DENORM-NEXT:    v_fma_f64 v[20:21], v[20:21], v[28:29], v[24:25]
911; GFX9-DENORM-NEXT:    buffer_load_dword v31, off, s[0:3], s32
912; GFX9-DENORM-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:28
913; GFX9-DENORM-NEXT:    buffer_load_dword v25, off, s[0:3], s32 offset:32
914; GFX9-DENORM-NEXT:    v_fma_f64 v[4:5], v[4:5], v[12:13], v[20:21]
915; GFX9-DENORM-NEXT:    s_waitcnt vmcnt(0)
916; GFX9-DENORM-NEXT:    v_fma_f64 v[22:23], v[22:23], v[30:31], v[24:25]
917; GFX9-DENORM-NEXT:    v_fma_f64 v[6:7], v[6:7], v[14:15], v[22:23]
918; GFX9-DENORM-NEXT:    s_setpc_b64 s[30:31]
919;
920; GFX10-CONTRACT-LABEL: test_f64_add_mul_rhs:
921; GFX10-CONTRACT:       ; %bb.0: ; %.entry
922; GFX10-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
923; GFX10-CONTRACT-NEXT:    s_clause 0x8
924; GFX10-CONTRACT-NEXT:    buffer_load_dword v31, off, s[0:3], s32
925; GFX10-CONTRACT-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:4
926; GFX10-CONTRACT-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:8
927; GFX10-CONTRACT-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:12
928; GFX10-CONTRACT-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:16
929; GFX10-CONTRACT-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:20
930; GFX10-CONTRACT-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:24
931; GFX10-CONTRACT-NEXT:    buffer_load_dword v38, off, s[0:3], s32 offset:28
932; GFX10-CONTRACT-NEXT:    buffer_load_dword v39, off, s[0:3], s32 offset:32
933; GFX10-CONTRACT-NEXT:    s_waitcnt vmcnt(6)
934; GFX10-CONTRACT-NEXT:    v_fma_f64 v[16:17], v[16:17], v[24:25], v[32:33]
935; GFX10-CONTRACT-NEXT:    s_waitcnt vmcnt(4)
936; GFX10-CONTRACT-NEXT:    v_fma_f64 v[18:19], v[18:19], v[26:27], v[34:35]
937; GFX10-CONTRACT-NEXT:    s_waitcnt vmcnt(2)
938; GFX10-CONTRACT-NEXT:    v_fma_f64 v[20:21], v[20:21], v[28:29], v[36:37]
939; GFX10-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
940; GFX10-CONTRACT-NEXT:    v_fma_f64 v[22:23], v[22:23], v[30:31], v[38:39]
941; GFX10-CONTRACT-NEXT:    v_fma_f64 v[0:1], v[0:1], v[8:9], v[16:17]
942; GFX10-CONTRACT-NEXT:    v_fma_f64 v[2:3], v[2:3], v[10:11], v[18:19]
943; GFX10-CONTRACT-NEXT:    v_fma_f64 v[4:5], v[4:5], v[12:13], v[20:21]
944; GFX10-CONTRACT-NEXT:    v_fma_f64 v[6:7], v[6:7], v[14:15], v[22:23]
945; GFX10-CONTRACT-NEXT:    s_setpc_b64 s[30:31]
946;
947; GFX10-DENORM-LABEL: test_f64_add_mul_rhs:
948; GFX10-DENORM:       ; %bb.0: ; %.entry
949; GFX10-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
950; GFX10-DENORM-NEXT:    s_clause 0x8
951; GFX10-DENORM-NEXT:    buffer_load_dword v31, off, s[0:3], s32
952; GFX10-DENORM-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:4
953; GFX10-DENORM-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:8
954; GFX10-DENORM-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:12
955; GFX10-DENORM-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:16
956; GFX10-DENORM-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:20
957; GFX10-DENORM-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:24
958; GFX10-DENORM-NEXT:    buffer_load_dword v38, off, s[0:3], s32 offset:28
959; GFX10-DENORM-NEXT:    buffer_load_dword v39, off, s[0:3], s32 offset:32
960; GFX10-DENORM-NEXT:    s_waitcnt vmcnt(6)
961; GFX10-DENORM-NEXT:    v_fma_f64 v[16:17], v[16:17], v[24:25], v[32:33]
962; GFX10-DENORM-NEXT:    s_waitcnt vmcnt(4)
963; GFX10-DENORM-NEXT:    v_fma_f64 v[18:19], v[18:19], v[26:27], v[34:35]
964; GFX10-DENORM-NEXT:    s_waitcnt vmcnt(2)
965; GFX10-DENORM-NEXT:    v_fma_f64 v[20:21], v[20:21], v[28:29], v[36:37]
966; GFX10-DENORM-NEXT:    s_waitcnt vmcnt(0)
967; GFX10-DENORM-NEXT:    v_fma_f64 v[22:23], v[22:23], v[30:31], v[38:39]
968; GFX10-DENORM-NEXT:    v_fma_f64 v[0:1], v[0:1], v[8:9], v[16:17]
969; GFX10-DENORM-NEXT:    v_fma_f64 v[2:3], v[2:3], v[10:11], v[18:19]
970; GFX10-DENORM-NEXT:    v_fma_f64 v[4:5], v[4:5], v[12:13], v[20:21]
971; GFX10-DENORM-NEXT:    v_fma_f64 v[6:7], v[6:7], v[14:15], v[22:23]
972; GFX10-DENORM-NEXT:    s_setpc_b64 s[30:31]
973;
974; GFX11-CONTRACT-LABEL: test_f64_add_mul_rhs:
975; GFX11-CONTRACT:       ; %bb.0: ; %.entry
976; GFX11-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
977; GFX11-CONTRACT-NEXT:    s_clause 0x8
978; GFX11-CONTRACT-NEXT:    scratch_load_b32 v31, off, s32
979; GFX11-CONTRACT-NEXT:    scratch_load_b32 v32, off, s32 offset:4
980; GFX11-CONTRACT-NEXT:    scratch_load_b32 v33, off, s32 offset:8
981; GFX11-CONTRACT-NEXT:    scratch_load_b32 v34, off, s32 offset:12
982; GFX11-CONTRACT-NEXT:    scratch_load_b32 v35, off, s32 offset:16
983; GFX11-CONTRACT-NEXT:    scratch_load_b32 v36, off, s32 offset:20
984; GFX11-CONTRACT-NEXT:    scratch_load_b32 v37, off, s32 offset:24
985; GFX11-CONTRACT-NEXT:    scratch_load_b32 v38, off, s32 offset:28
986; GFX11-CONTRACT-NEXT:    scratch_load_b32 v39, off, s32 offset:32
987; GFX11-CONTRACT-NEXT:    s_waitcnt vmcnt(6)
988; GFX11-CONTRACT-NEXT:    v_fma_f64 v[16:17], v[16:17], v[24:25], v[32:33]
989; GFX11-CONTRACT-NEXT:    s_waitcnt vmcnt(4)
990; GFX11-CONTRACT-NEXT:    v_fma_f64 v[18:19], v[18:19], v[26:27], v[34:35]
991; GFX11-CONTRACT-NEXT:    s_waitcnt vmcnt(2)
992; GFX11-CONTRACT-NEXT:    v_fma_f64 v[20:21], v[20:21], v[28:29], v[36:37]
993; GFX11-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
994; GFX11-CONTRACT-NEXT:    v_fma_f64 v[22:23], v[22:23], v[30:31], v[38:39]
995; GFX11-CONTRACT-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
996; GFX11-CONTRACT-NEXT:    v_fma_f64 v[0:1], v[0:1], v[8:9], v[16:17]
997; GFX11-CONTRACT-NEXT:    v_fma_f64 v[2:3], v[2:3], v[10:11], v[18:19]
998; GFX11-CONTRACT-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
999; GFX11-CONTRACT-NEXT:    v_fma_f64 v[4:5], v[4:5], v[12:13], v[20:21]
1000; GFX11-CONTRACT-NEXT:    v_fma_f64 v[6:7], v[6:7], v[14:15], v[22:23]
1001; GFX11-CONTRACT-NEXT:    s_setpc_b64 s[30:31]
1002;
1003; GFX11-DENORM-LABEL: test_f64_add_mul_rhs:
1004; GFX11-DENORM:       ; %bb.0: ; %.entry
1005; GFX11-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1006; GFX11-DENORM-NEXT:    s_clause 0x8
1007; GFX11-DENORM-NEXT:    scratch_load_b32 v31, off, s32
1008; GFX11-DENORM-NEXT:    scratch_load_b32 v32, off, s32 offset:4
1009; GFX11-DENORM-NEXT:    scratch_load_b32 v33, off, s32 offset:8
1010; GFX11-DENORM-NEXT:    scratch_load_b32 v34, off, s32 offset:12
1011; GFX11-DENORM-NEXT:    scratch_load_b32 v35, off, s32 offset:16
1012; GFX11-DENORM-NEXT:    scratch_load_b32 v36, off, s32 offset:20
1013; GFX11-DENORM-NEXT:    scratch_load_b32 v37, off, s32 offset:24
1014; GFX11-DENORM-NEXT:    scratch_load_b32 v38, off, s32 offset:28
1015; GFX11-DENORM-NEXT:    scratch_load_b32 v39, off, s32 offset:32
1016; GFX11-DENORM-NEXT:    s_waitcnt vmcnt(6)
1017; GFX11-DENORM-NEXT:    v_fma_f64 v[16:17], v[16:17], v[24:25], v[32:33]
1018; GFX11-DENORM-NEXT:    s_waitcnt vmcnt(4)
1019; GFX11-DENORM-NEXT:    v_fma_f64 v[18:19], v[18:19], v[26:27], v[34:35]
1020; GFX11-DENORM-NEXT:    s_waitcnt vmcnt(2)
1021; GFX11-DENORM-NEXT:    v_fma_f64 v[20:21], v[20:21], v[28:29], v[36:37]
1022; GFX11-DENORM-NEXT:    s_waitcnt vmcnt(0)
1023; GFX11-DENORM-NEXT:    v_fma_f64 v[22:23], v[22:23], v[30:31], v[38:39]
1024; GFX11-DENORM-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
1025; GFX11-DENORM-NEXT:    v_fma_f64 v[0:1], v[0:1], v[8:9], v[16:17]
1026; GFX11-DENORM-NEXT:    v_fma_f64 v[2:3], v[2:3], v[10:11], v[18:19]
1027; GFX11-DENORM-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
1028; GFX11-DENORM-NEXT:    v_fma_f64 v[4:5], v[4:5], v[12:13], v[20:21]
1029; GFX11-DENORM-NEXT:    v_fma_f64 v[6:7], v[6:7], v[14:15], v[22:23]
1030; GFX11-DENORM-NEXT:    s_setpc_b64 s[30:31]
1031.entry:
1032  %x = fmul fast <4 x double> %c, %d
1033  %y = call fast <4 x double> @llvm.fmuladd.v4f64(<4 x double> %a, <4 x double> %b, <4 x double> %x)
1034  %z = fadd fast <4 x double> %e, %y
1035  ret <4 x double> %z
1036}
1037
; Declarations of the llvm.fmuladd intrinsic overloads called by the tests in
; this file: <4 x double>, <4 x float>, <4 x half>, double, float and half.
; All of them reference attribute group #0, defined on the last line as
; nounwind readnone.
1038declare <4 x double> @llvm.fmuladd.v4f64(<4 x double>, <4 x double>, <4 x double>) #0
1039declare <4 x float> @llvm.fmuladd.v4f32(<4 x float>, <4 x float>, <4 x float>) #0
1040declare <4 x half> @llvm.fmuladd.v4f16(<4 x half>, <4 x half>, <4 x half>) #0
1041declare double @llvm.fmuladd.f64(double, double, double) #0
1042declare float @llvm.fmuladd.f32(float, float, float) #0
1043declare half @llvm.fmuladd.f16(half, half, half) #0
1044attributes #0 = { nounwind readnone }
1045