xref: /llvm-project/llvm/test/CodeGen/AMDGPU/GlobalISel/fmul.v2f16.ll (revision 8f6a1a07cb85980013c70d5af6d28f5fcf75e732)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s
3; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=fiji < %s | FileCheck -check-prefix=GFX8 %s
4; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s
5; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefix=GFX10 %s
6
; Baseline <2 x half> fmul with no source modifiers.
; The GFX9/GFX10 checks expect one packed v_pk_mul_f16. The GFX8 checks expect
; the low half from v_mul_f16_e32, the high half from an SDWA multiply that
; reads and writes WORD_1, and a v_or_b32 that merges both halves into v0.
7define <2 x half> @v_fmul_v2f16(<2 x half> %a, <2 x half> %b) {
8; GFX9-LABEL: v_fmul_v2f16:
9; GFX9:       ; %bb.0:
10; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
11; GFX9-NEXT:    v_pk_mul_f16 v0, v0, v1
12; GFX9-NEXT:    s_setpc_b64 s[30:31]
13;
14; GFX8-LABEL: v_fmul_v2f16:
15; GFX8:       ; %bb.0:
16; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
17; GFX8-NEXT:    v_mul_f16_e32 v2, v0, v1
18; GFX8-NEXT:    v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
19; GFX8-NEXT:    v_or_b32_e32 v0, v2, v0
20; GFX8-NEXT:    s_setpc_b64 s[30:31]
21;
22; GFX10-LABEL: v_fmul_v2f16:
23; GFX10:       ; %bb.0:
24; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
25; GFX10-NEXT:    v_pk_mul_f16 v0, v0, v1
26; GFX10-NEXT:    s_setpc_b64 s[30:31]
27  %mul = fmul <2 x half> %a, %b
28  ret <2 x half> %mul
29}
30
; <2 x half> fmul with the left operand negated.
; GFX9/GFX10 checks: the fneg is folded into the packed multiply as
; neg_lo:[1,0] neg_hi:[1,0] (first source only).
; GFX8 checks: the fneg stays a separate v_xor_b32 of v0 with 0x80008000
; (bit 15 of each 16-bit half) ahead of the split low/high multiply.
31define <2 x half> @v_fmul_v2f16_fneg_lhs(<2 x half> %a, <2 x half> %b) {
32; GFX9-LABEL: v_fmul_v2f16_fneg_lhs:
33; GFX9:       ; %bb.0:
34; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
35; GFX9-NEXT:    v_pk_mul_f16 v0, v0, v1 neg_lo:[1,0] neg_hi:[1,0]
36; GFX9-NEXT:    s_setpc_b64 s[30:31]
37;
38; GFX8-LABEL: v_fmul_v2f16_fneg_lhs:
39; GFX8:       ; %bb.0:
40; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
41; GFX8-NEXT:    v_xor_b32_e32 v0, 0x80008000, v0
42; GFX8-NEXT:    v_mul_f16_e32 v2, v0, v1
43; GFX8-NEXT:    v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
44; GFX8-NEXT:    v_or_b32_e32 v0, v2, v0
45; GFX8-NEXT:    s_setpc_b64 s[30:31]
46;
47; GFX10-LABEL: v_fmul_v2f16_fneg_lhs:
48; GFX10:       ; %bb.0:
49; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
50; GFX10-NEXT:    v_pk_mul_f16 v0, v0, v1 neg_lo:[1,0] neg_hi:[1,0]
51; GFX10-NEXT:    s_setpc_b64 s[30:31]
52  %neg.a = fneg <2 x half> %a
53  %mul = fmul <2 x half> %neg.a, %b
54  ret <2 x half> %mul
55}
56
; <2 x half> fmul with the right operand negated.
; GFX9/GFX10 checks: the fneg is folded into the packed multiply as
; neg_lo:[0,1] neg_hi:[0,1] (second source only).
; GFX8 checks: the fneg stays a separate v_xor_b32 of v1 with 0x80008000
; (bit 15 of each 16-bit half) ahead of the split low/high multiply.
57define <2 x half> @v_fmul_v2f16_fneg_rhs(<2 x half> %a, <2 x half> %b) {
58; GFX9-LABEL: v_fmul_v2f16_fneg_rhs:
59; GFX9:       ; %bb.0:
60; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
61; GFX9-NEXT:    v_pk_mul_f16 v0, v0, v1 neg_lo:[0,1] neg_hi:[0,1]
62; GFX9-NEXT:    s_setpc_b64 s[30:31]
63;
64; GFX8-LABEL: v_fmul_v2f16_fneg_rhs:
65; GFX8:       ; %bb.0:
66; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
67; GFX8-NEXT:    v_xor_b32_e32 v1, 0x80008000, v1
68; GFX8-NEXT:    v_mul_f16_e32 v2, v0, v1
69; GFX8-NEXT:    v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
70; GFX8-NEXT:    v_or_b32_e32 v0, v2, v0
71; GFX8-NEXT:    s_setpc_b64 s[30:31]
72;
73; GFX10-LABEL: v_fmul_v2f16_fneg_rhs:
74; GFX10:       ; %bb.0:
75; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
76; GFX10-NEXT:    v_pk_mul_f16 v0, v0, v1 neg_lo:[0,1] neg_hi:[0,1]
77; GFX10-NEXT:    s_setpc_b64 s[30:31]
78  %neg.b = fneg <2 x half> %b
79  %mul = fmul <2 x half> %a, %neg.b
80  ret <2 x half> %mul
81}
82
; <2 x half> fmul with both operands negated.
; Every check prefix expects the same instruction sequence as the plain
; v_fmul_v2f16 test: no neg_lo/neg_hi modifiers and no v_xor_b32 appear, so
; the two fnegs are expected to cancel out entirely.
83define <2 x half> @v_fmul_v2f16_fneg_lhs_fneg_rhs(<2 x half> %a, <2 x half> %b) {
84; GFX9-LABEL: v_fmul_v2f16_fneg_lhs_fneg_rhs:
85; GFX9:       ; %bb.0:
86; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
87; GFX9-NEXT:    v_pk_mul_f16 v0, v0, v1
88; GFX9-NEXT:    s_setpc_b64 s[30:31]
89;
90; GFX8-LABEL: v_fmul_v2f16_fneg_lhs_fneg_rhs:
91; GFX8:       ; %bb.0:
92; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
93; GFX8-NEXT:    v_mul_f16_e32 v2, v0, v1
94; GFX8-NEXT:    v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
95; GFX8-NEXT:    v_or_b32_e32 v0, v2, v0
96; GFX8-NEXT:    s_setpc_b64 s[30:31]
97;
98; GFX10-LABEL: v_fmul_v2f16_fneg_lhs_fneg_rhs:
99; GFX10:       ; %bb.0:
100; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
101; GFX10-NEXT:    v_pk_mul_f16 v0, v0, v1
102; GFX10-NEXT:    s_setpc_b64 s[30:31]
103  %neg.a = fneg <2 x half> %a
104  %neg.b = fneg <2 x half> %b
105  %mul = fmul <2 x half> %neg.a, %neg.b
106  ret <2 x half> %mul
107}
108
; Baseline <3 x half> fmul; the checks show each operand in two VGPRs
; (v0-v1 for %a, v2-v3 for %b).
; GFX9/GFX10 checks: two packed v_pk_mul_f16, one per register pair.
; GFX8 checks: the first register is handled as low multiply + SDWA high
; multiply + v_or_b32, while the second register gets only a single
; v_mul_f16_e32 (no high-half multiply is expected for it).
109define <3 x half> @v_fmul_v3f16(<3 x half> %a, <3 x half> %b) {
110; GFX9-LABEL: v_fmul_v3f16:
111; GFX9:       ; %bb.0:
112; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
113; GFX9-NEXT:    v_pk_mul_f16 v0, v0, v2
114; GFX9-NEXT:    v_pk_mul_f16 v1, v1, v3
115; GFX9-NEXT:    s_setpc_b64 s[30:31]
116;
117; GFX8-LABEL: v_fmul_v3f16:
118; GFX8:       ; %bb.0:
119; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
120; GFX8-NEXT:    v_mul_f16_e32 v4, v0, v2
121; GFX8-NEXT:    v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
122; GFX8-NEXT:    v_mul_f16_e32 v1, v1, v3
123; GFX8-NEXT:    v_or_b32_e32 v0, v4, v0
124; GFX8-NEXT:    s_setpc_b64 s[30:31]
125;
126; GFX10-LABEL: v_fmul_v3f16:
127; GFX10:       ; %bb.0:
128; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
129; GFX10-NEXT:    v_pk_mul_f16 v0, v0, v2
130; GFX10-NEXT:    v_pk_mul_f16 v1, v1, v3
131; GFX10-NEXT:    s_setpc_b64 s[30:31]
132  %mul = fmul <3 x half> %a, %b
133  ret <3 x half> %mul
134}
135
; <3 x half> fmul with the left operand negated.
; GFX9/GFX10 checks: neg_lo:[1,0] neg_hi:[1,0] on both packed multiplies.
; GFX8 checks: both left-operand registers (v0 and v1) are XORed with the
; full 0x80008000 mask before the multiplies; the second register still gets
; only a single v_mul_f16_e32.
136define <3 x half> @v_fmul_v3f16_fneg_lhs(<3 x half> %a, <3 x half> %b) {
137; GFX9-LABEL: v_fmul_v3f16_fneg_lhs:
138; GFX9:       ; %bb.0:
139; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
140; GFX9-NEXT:    v_pk_mul_f16 v0, v0, v2 neg_lo:[1,0] neg_hi:[1,0]
141; GFX9-NEXT:    v_pk_mul_f16 v1, v1, v3 neg_lo:[1,0] neg_hi:[1,0]
142; GFX9-NEXT:    s_setpc_b64 s[30:31]
143;
144; GFX8-LABEL: v_fmul_v3f16_fneg_lhs:
145; GFX8:       ; %bb.0:
146; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
147; GFX8-NEXT:    v_xor_b32_e32 v0, 0x80008000, v0
148; GFX8-NEXT:    v_xor_b32_e32 v1, 0x80008000, v1
149; GFX8-NEXT:    v_mul_f16_e32 v4, v0, v2
150; GFX8-NEXT:    v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
151; GFX8-NEXT:    v_mul_f16_e32 v1, v1, v3
152; GFX8-NEXT:    v_or_b32_e32 v0, v4, v0
153; GFX8-NEXT:    s_setpc_b64 s[30:31]
154;
155; GFX10-LABEL: v_fmul_v3f16_fneg_lhs:
156; GFX10:       ; %bb.0:
157; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
158; GFX10-NEXT:    v_pk_mul_f16 v0, v0, v2 neg_lo:[1,0] neg_hi:[1,0]
159; GFX10-NEXT:    v_pk_mul_f16 v1, v1, v3 neg_lo:[1,0] neg_hi:[1,0]
160; GFX10-NEXT:    s_setpc_b64 s[30:31]
161  %neg.a = fneg <3 x half> %a
162  %mul = fmul <3 x half> %neg.a, %b
163  ret <3 x half> %mul
164}
165
; <3 x half> fmul with the right operand negated.
; GFX9/GFX10 checks: neg_lo:[0,1] neg_hi:[0,1] on both packed multiplies.
; GFX8 checks: both right-operand registers (v2 and v3) are XORed with the
; full 0x80008000 mask before the multiplies; the second register pair still
; gets only a single v_mul_f16_e32.
166define <3 x half> @v_fmul_v3f16_fneg_rhs(<3 x half> %a, <3 x half> %b) {
167; GFX9-LABEL: v_fmul_v3f16_fneg_rhs:
168; GFX9:       ; %bb.0:
169; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
170; GFX9-NEXT:    v_pk_mul_f16 v0, v0, v2 neg_lo:[0,1] neg_hi:[0,1]
171; GFX9-NEXT:    v_pk_mul_f16 v1, v1, v3 neg_lo:[0,1] neg_hi:[0,1]
172; GFX9-NEXT:    s_setpc_b64 s[30:31]
173;
174; GFX8-LABEL: v_fmul_v3f16_fneg_rhs:
175; GFX8:       ; %bb.0:
176; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
177; GFX8-NEXT:    v_xor_b32_e32 v2, 0x80008000, v2
178; GFX8-NEXT:    v_xor_b32_e32 v3, 0x80008000, v3
179; GFX8-NEXT:    v_mul_f16_e32 v4, v0, v2
180; GFX8-NEXT:    v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
181; GFX8-NEXT:    v_mul_f16_e32 v1, v1, v3
182; GFX8-NEXT:    v_or_b32_e32 v0, v4, v0
183; GFX8-NEXT:    s_setpc_b64 s[30:31]
184;
185; GFX10-LABEL: v_fmul_v3f16_fneg_rhs:
186; GFX10:       ; %bb.0:
187; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
188; GFX10-NEXT:    v_pk_mul_f16 v0, v0, v2 neg_lo:[0,1] neg_hi:[0,1]
189; GFX10-NEXT:    v_pk_mul_f16 v1, v1, v3 neg_lo:[0,1] neg_hi:[0,1]
190; GFX10-NEXT:    s_setpc_b64 s[30:31]
191  %neg.b = fneg <3 x half> %b
192  %mul = fmul <3 x half> %a, %neg.b
193  ret <3 x half> %mul
194}
195
; <3 x half> fmul with both operands negated.
; Every check prefix expects the same instruction sequence as the plain
; v_fmul_v3f16 test: no neg_lo/neg_hi modifiers and no v_xor_b32 appear, so
; the two fnegs are expected to cancel out entirely.
196define <3 x half> @v_fmul_v3f16_fneg_lhs_fneg_rhs(<3 x half> %a, <3 x half> %b) {
197; GFX9-LABEL: v_fmul_v3f16_fneg_lhs_fneg_rhs:
198; GFX9:       ; %bb.0:
199; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
200; GFX9-NEXT:    v_pk_mul_f16 v0, v0, v2
201; GFX9-NEXT:    v_pk_mul_f16 v1, v1, v3
202; GFX9-NEXT:    s_setpc_b64 s[30:31]
203;
204; GFX8-LABEL: v_fmul_v3f16_fneg_lhs_fneg_rhs:
205; GFX8:       ; %bb.0:
206; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
207; GFX8-NEXT:    v_mul_f16_e32 v4, v0, v2
208; GFX8-NEXT:    v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
209; GFX8-NEXT:    v_mul_f16_e32 v1, v1, v3
210; GFX8-NEXT:    v_or_b32_e32 v0, v4, v0
211; GFX8-NEXT:    s_setpc_b64 s[30:31]
212;
213; GFX10-LABEL: v_fmul_v3f16_fneg_lhs_fneg_rhs:
214; GFX10:       ; %bb.0:
215; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
216; GFX10-NEXT:    v_pk_mul_f16 v0, v0, v2
217; GFX10-NEXT:    v_pk_mul_f16 v1, v1, v3
218; GFX10-NEXT:    s_setpc_b64 s[30:31]
219  %neg.a = fneg <3 x half> %a
220  %neg.b = fneg <3 x half> %b
221  %mul = fmul <3 x half> %neg.a, %neg.b
222  ret <3 x half> %mul
223}
224
; Baseline <4 x half> fmul; the checks show each operand in two VGPRs
; (v0-v1 for %a, v2-v3 for %b).
; GFX9/GFX10 checks: two packed v_pk_mul_f16, one per register pair.
; GFX8 checks: each register pair is split into a low v_mul_f16_e32 and an
; SDWA WORD_1 multiply, then merged with v_or_b32. Once the first pair has
; consumed v2, it is reused as the temporary for the second pair's low half.
225define <4 x half> @v_fmul_v4f16(<4 x half> %a, <4 x half> %b) {
226; GFX9-LABEL: v_fmul_v4f16:
227; GFX9:       ; %bb.0:
228; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
229; GFX9-NEXT:    v_pk_mul_f16 v0, v0, v2
230; GFX9-NEXT:    v_pk_mul_f16 v1, v1, v3
231; GFX9-NEXT:    s_setpc_b64 s[30:31]
232;
233; GFX8-LABEL: v_fmul_v4f16:
234; GFX8:       ; %bb.0:
235; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
236; GFX8-NEXT:    v_mul_f16_e32 v4, v0, v2
237; GFX8-NEXT:    v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
238; GFX8-NEXT:    v_mul_f16_e32 v2, v1, v3
239; GFX8-NEXT:    v_mul_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
240; GFX8-NEXT:    v_or_b32_e32 v0, v4, v0
241; GFX8-NEXT:    v_or_b32_e32 v1, v2, v1
242; GFX8-NEXT:    s_setpc_b64 s[30:31]
243;
244; GFX10-LABEL: v_fmul_v4f16:
245; GFX10:       ; %bb.0:
246; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
247; GFX10-NEXT:    v_pk_mul_f16 v0, v0, v2
248; GFX10-NEXT:    v_pk_mul_f16 v1, v1, v3
249; GFX10-NEXT:    s_setpc_b64 s[30:31]
250  %mul = fmul <4 x half> %a, %b
251  ret <4 x half> %mul
252}
253
; <4 x half> fmul with the left operand negated.
; GFX9/GFX10 checks: neg_lo:[1,0] neg_hi:[1,0] on both packed multiplies.
; GFX8 checks: both left-operand registers (v0 and v1) are XORed with
; 0x80008000 first, followed by the same split multiply / v_or_b32 sequence
; as the plain v_fmul_v4f16 test.
254define <4 x half> @v_fmul_v4f16_fneg_lhs(<4 x half> %a, <4 x half> %b) {
255; GFX9-LABEL: v_fmul_v4f16_fneg_lhs:
256; GFX9:       ; %bb.0:
257; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
258; GFX9-NEXT:    v_pk_mul_f16 v0, v0, v2 neg_lo:[1,0] neg_hi:[1,0]
259; GFX9-NEXT:    v_pk_mul_f16 v1, v1, v3 neg_lo:[1,0] neg_hi:[1,0]
260; GFX9-NEXT:    s_setpc_b64 s[30:31]
261;
262; GFX8-LABEL: v_fmul_v4f16_fneg_lhs:
263; GFX8:       ; %bb.0:
264; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
265; GFX8-NEXT:    v_xor_b32_e32 v0, 0x80008000, v0
266; GFX8-NEXT:    v_xor_b32_e32 v1, 0x80008000, v1
267; GFX8-NEXT:    v_mul_f16_e32 v4, v0, v2
268; GFX8-NEXT:    v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
269; GFX8-NEXT:    v_mul_f16_e32 v2, v1, v3
270; GFX8-NEXT:    v_mul_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
271; GFX8-NEXT:    v_or_b32_e32 v0, v4, v0
272; GFX8-NEXT:    v_or_b32_e32 v1, v2, v1
273; GFX8-NEXT:    s_setpc_b64 s[30:31]
274;
275; GFX10-LABEL: v_fmul_v4f16_fneg_lhs:
276; GFX10:       ; %bb.0:
277; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
278; GFX10-NEXT:    v_pk_mul_f16 v0, v0, v2 neg_lo:[1,0] neg_hi:[1,0]
279; GFX10-NEXT:    v_pk_mul_f16 v1, v1, v3 neg_lo:[1,0] neg_hi:[1,0]
280; GFX10-NEXT:    s_setpc_b64 s[30:31]
281  %neg.a = fneg <4 x half> %a
282  %mul = fmul <4 x half> %neg.a, %b
283  ret <4 x half> %mul
284}
285
; <4 x half> fmul with the right operand negated.
; GFX9/GFX10 checks: neg_lo:[0,1] neg_hi:[0,1] on both packed multiplies.
; GFX8 checks: both right-operand registers (v2 and v3) are XORed with
; 0x80008000 first, followed by the same split multiply / v_or_b32 sequence
; as the plain v_fmul_v4f16 test.
286define <4 x half> @v_fmul_v4f16_fneg_rhs(<4 x half> %a, <4 x half> %b) {
287; GFX9-LABEL: v_fmul_v4f16_fneg_rhs:
288; GFX9:       ; %bb.0:
289; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
290; GFX9-NEXT:    v_pk_mul_f16 v0, v0, v2 neg_lo:[0,1] neg_hi:[0,1]
291; GFX9-NEXT:    v_pk_mul_f16 v1, v1, v3 neg_lo:[0,1] neg_hi:[0,1]
292; GFX9-NEXT:    s_setpc_b64 s[30:31]
293;
294; GFX8-LABEL: v_fmul_v4f16_fneg_rhs:
295; GFX8:       ; %bb.0:
296; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
297; GFX8-NEXT:    v_xor_b32_e32 v2, 0x80008000, v2
298; GFX8-NEXT:    v_xor_b32_e32 v3, 0x80008000, v3
299; GFX8-NEXT:    v_mul_f16_e32 v4, v0, v2
300; GFX8-NEXT:    v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
301; GFX8-NEXT:    v_mul_f16_e32 v2, v1, v3
302; GFX8-NEXT:    v_mul_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
303; GFX8-NEXT:    v_or_b32_e32 v0, v4, v0
304; GFX8-NEXT:    v_or_b32_e32 v1, v2, v1
305; GFX8-NEXT:    s_setpc_b64 s[30:31]
306;
307; GFX10-LABEL: v_fmul_v4f16_fneg_rhs:
308; GFX10:       ; %bb.0:
309; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
310; GFX10-NEXT:    v_pk_mul_f16 v0, v0, v2 neg_lo:[0,1] neg_hi:[0,1]
311; GFX10-NEXT:    v_pk_mul_f16 v1, v1, v3 neg_lo:[0,1] neg_hi:[0,1]
312; GFX10-NEXT:    s_setpc_b64 s[30:31]
313  %neg.b = fneg <4 x half> %b
314  %mul = fmul <4 x half> %a, %neg.b
315  ret <4 x half> %mul
316}
317
; <4 x half> fmul with both operands negated.
; Every check prefix expects the same instruction sequence as the plain
; v_fmul_v4f16 test: no neg_lo/neg_hi modifiers and no v_xor_b32 appear, so
; the two fnegs are expected to cancel out entirely.
318define <4 x half> @v_fmul_v4f16_fneg_lhs_fneg_rhs(<4 x half> %a, <4 x half> %b) {
319; GFX9-LABEL: v_fmul_v4f16_fneg_lhs_fneg_rhs:
320; GFX9:       ; %bb.0:
321; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
322; GFX9-NEXT:    v_pk_mul_f16 v0, v0, v2
323; GFX9-NEXT:    v_pk_mul_f16 v1, v1, v3
324; GFX9-NEXT:    s_setpc_b64 s[30:31]
325;
326; GFX8-LABEL: v_fmul_v4f16_fneg_lhs_fneg_rhs:
327; GFX8:       ; %bb.0:
328; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
329; GFX8-NEXT:    v_mul_f16_e32 v4, v0, v2
330; GFX8-NEXT:    v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
331; GFX8-NEXT:    v_mul_f16_e32 v2, v1, v3
332; GFX8-NEXT:    v_mul_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
333; GFX8-NEXT:    v_or_b32_e32 v0, v4, v0
334; GFX8-NEXT:    v_or_b32_e32 v1, v2, v1
335; GFX8-NEXT:    s_setpc_b64 s[30:31]
336;
337; GFX10-LABEL: v_fmul_v4f16_fneg_lhs_fneg_rhs:
338; GFX10:       ; %bb.0:
339; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
340; GFX10-NEXT:    v_pk_mul_f16 v0, v0, v2
341; GFX10-NEXT:    v_pk_mul_f16 v1, v1, v3
342; GFX10-NEXT:    s_setpc_b64 s[30:31]
343  %neg.a = fneg <4 x half> %a
344  %neg.b = fneg <4 x half> %b
345  %mul = fmul <4 x half> %neg.a, %neg.b
346  ret <4 x half> %mul
347}
348
; Baseline <6 x half> fmul; the checks show each operand in three VGPRs
; (v0-v2 for %a, v3-v5 for %b).
; GFX9/GFX10 checks: three packed v_pk_mul_f16, one per register pair.
; GFX8 checks: each register pair is split into a low v_mul_f16_e32 and an
; SDWA WORD_1 multiply, and the three v_or_b32 merges are emitted together
; after all six multiplies.
349define <6 x half> @v_fmul_v6f16(<6 x half> %a, <6 x half> %b) {
350; GFX9-LABEL: v_fmul_v6f16:
351; GFX9:       ; %bb.0:
352; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
353; GFX9-NEXT:    v_pk_mul_f16 v0, v0, v3
354; GFX9-NEXT:    v_pk_mul_f16 v1, v1, v4
355; GFX9-NEXT:    v_pk_mul_f16 v2, v2, v5
356; GFX9-NEXT:    s_setpc_b64 s[30:31]
357;
358; GFX8-LABEL: v_fmul_v6f16:
359; GFX8:       ; %bb.0:
360; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
361; GFX8-NEXT:    v_mul_f16_e32 v6, v0, v3
362; GFX8-NEXT:    v_mul_f16_sdwa v0, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
363; GFX8-NEXT:    v_mul_f16_e32 v3, v1, v4
364; GFX8-NEXT:    v_mul_f16_sdwa v1, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
365; GFX8-NEXT:    v_mul_f16_e32 v4, v2, v5
366; GFX8-NEXT:    v_mul_f16_sdwa v2, v2, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
367; GFX8-NEXT:    v_or_b32_e32 v0, v6, v0
368; GFX8-NEXT:    v_or_b32_e32 v1, v3, v1
369; GFX8-NEXT:    v_or_b32_e32 v2, v4, v2
370; GFX8-NEXT:    s_setpc_b64 s[30:31]
371;
372; GFX10-LABEL: v_fmul_v6f16:
373; GFX10:       ; %bb.0:
374; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
375; GFX10-NEXT:    v_pk_mul_f16 v0, v0, v3
376; GFX10-NEXT:    v_pk_mul_f16 v1, v1, v4
377; GFX10-NEXT:    v_pk_mul_f16 v2, v2, v5
378; GFX10-NEXT:    s_setpc_b64 s[30:31]
379  %mul = fmul <6 x half> %a, %b
380  ret <6 x half> %mul
381}
382
; <6 x half> fmul with the left operand negated.
; GFX9/GFX10 checks: neg_lo:[1,0] neg_hi:[1,0] on all three packed
; multiplies.
; GFX8 checks: the three left-operand registers (v0-v2) are XORed with
; 0x80008000 first, followed by the same split multiply / v_or_b32 sequence
; as the plain v_fmul_v6f16 test.
383define <6 x half> @v_fmul_v6f16_fneg_lhs(<6 x half> %a, <6 x half> %b) {
384; GFX9-LABEL: v_fmul_v6f16_fneg_lhs:
385; GFX9:       ; %bb.0:
386; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
387; GFX9-NEXT:    v_pk_mul_f16 v0, v0, v3 neg_lo:[1,0] neg_hi:[1,0]
388; GFX9-NEXT:    v_pk_mul_f16 v1, v1, v4 neg_lo:[1,0] neg_hi:[1,0]
389; GFX9-NEXT:    v_pk_mul_f16 v2, v2, v5 neg_lo:[1,0] neg_hi:[1,0]
390; GFX9-NEXT:    s_setpc_b64 s[30:31]
391;
392; GFX8-LABEL: v_fmul_v6f16_fneg_lhs:
393; GFX8:       ; %bb.0:
394; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
395; GFX8-NEXT:    v_xor_b32_e32 v0, 0x80008000, v0
396; GFX8-NEXT:    v_xor_b32_e32 v1, 0x80008000, v1
397; GFX8-NEXT:    v_xor_b32_e32 v2, 0x80008000, v2
398; GFX8-NEXT:    v_mul_f16_e32 v6, v0, v3
399; GFX8-NEXT:    v_mul_f16_sdwa v0, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
400; GFX8-NEXT:    v_mul_f16_e32 v3, v1, v4
401; GFX8-NEXT:    v_mul_f16_sdwa v1, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
402; GFX8-NEXT:    v_mul_f16_e32 v4, v2, v5
403; GFX8-NEXT:    v_mul_f16_sdwa v2, v2, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
404; GFX8-NEXT:    v_or_b32_e32 v0, v6, v0
405; GFX8-NEXT:    v_or_b32_e32 v1, v3, v1
406; GFX8-NEXT:    v_or_b32_e32 v2, v4, v2
407; GFX8-NEXT:    s_setpc_b64 s[30:31]
408;
409; GFX10-LABEL: v_fmul_v6f16_fneg_lhs:
410; GFX10:       ; %bb.0:
411; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
412; GFX10-NEXT:    v_pk_mul_f16 v0, v0, v3 neg_lo:[1,0] neg_hi:[1,0]
413; GFX10-NEXT:    v_pk_mul_f16 v1, v1, v4 neg_lo:[1,0] neg_hi:[1,0]
414; GFX10-NEXT:    v_pk_mul_f16 v2, v2, v5 neg_lo:[1,0] neg_hi:[1,0]
415; GFX10-NEXT:    s_setpc_b64 s[30:31]
416  %neg.a = fneg <6 x half> %a
417  %mul = fmul <6 x half> %neg.a, %b
418  ret <6 x half> %mul
419}
420
; <6 x half> fmul with the right operand negated.
; GFX9/GFX10 checks: neg_lo:[0,1] neg_hi:[0,1] on all three packed
; multiplies.
; GFX8 checks: the three right-operand registers (v3-v5) are XORed with
; 0x80008000 first, followed by the same split multiply / v_or_b32 sequence
; as the plain v_fmul_v6f16 test.
421define <6 x half> @v_fmul_v6f16_fneg_rhs(<6 x half> %a, <6 x half> %b) {
422; GFX9-LABEL: v_fmul_v6f16_fneg_rhs:
423; GFX9:       ; %bb.0:
424; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
425; GFX9-NEXT:    v_pk_mul_f16 v0, v0, v3 neg_lo:[0,1] neg_hi:[0,1]
426; GFX9-NEXT:    v_pk_mul_f16 v1, v1, v4 neg_lo:[0,1] neg_hi:[0,1]
427; GFX9-NEXT:    v_pk_mul_f16 v2, v2, v5 neg_lo:[0,1] neg_hi:[0,1]
428; GFX9-NEXT:    s_setpc_b64 s[30:31]
429;
430; GFX8-LABEL: v_fmul_v6f16_fneg_rhs:
431; GFX8:       ; %bb.0:
432; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
433; GFX8-NEXT:    v_xor_b32_e32 v3, 0x80008000, v3
434; GFX8-NEXT:    v_xor_b32_e32 v4, 0x80008000, v4
435; GFX8-NEXT:    v_xor_b32_e32 v5, 0x80008000, v5
436; GFX8-NEXT:    v_mul_f16_e32 v6, v0, v3
437; GFX8-NEXT:    v_mul_f16_sdwa v0, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
438; GFX8-NEXT:    v_mul_f16_e32 v3, v1, v4
439; GFX8-NEXT:    v_mul_f16_sdwa v1, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
440; GFX8-NEXT:    v_mul_f16_e32 v4, v2, v5
441; GFX8-NEXT:    v_mul_f16_sdwa v2, v2, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
442; GFX8-NEXT:    v_or_b32_e32 v0, v6, v0
443; GFX8-NEXT:    v_or_b32_e32 v1, v3, v1
444; GFX8-NEXT:    v_or_b32_e32 v2, v4, v2
445; GFX8-NEXT:    s_setpc_b64 s[30:31]
446;
447; GFX10-LABEL: v_fmul_v6f16_fneg_rhs:
448; GFX10:       ; %bb.0:
449; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
450; GFX10-NEXT:    v_pk_mul_f16 v0, v0, v3 neg_lo:[0,1] neg_hi:[0,1]
451; GFX10-NEXT:    v_pk_mul_f16 v1, v1, v4 neg_lo:[0,1] neg_hi:[0,1]
452; GFX10-NEXT:    v_pk_mul_f16 v2, v2, v5 neg_lo:[0,1] neg_hi:[0,1]
453; GFX10-NEXT:    s_setpc_b64 s[30:31]
454  %neg.b = fneg <6 x half> %b
455  %mul = fmul <6 x half> %a, %neg.b
456  ret <6 x half> %mul
457}
458
; <6 x half> fmul with both operands negated.
; Every check prefix expects the same instruction sequence as the plain
; v_fmul_v6f16 test: no neg_lo/neg_hi modifiers and no v_xor_b32 appear, so
; the two fnegs are expected to cancel out entirely.
459define <6 x half> @v_fmul_v6f16_fneg_lhs_fneg_rhs(<6 x half> %a, <6 x half> %b) {
460; GFX9-LABEL: v_fmul_v6f16_fneg_lhs_fneg_rhs:
461; GFX9:       ; %bb.0:
462; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
463; GFX9-NEXT:    v_pk_mul_f16 v0, v0, v3
464; GFX9-NEXT:    v_pk_mul_f16 v1, v1, v4
465; GFX9-NEXT:    v_pk_mul_f16 v2, v2, v5
466; GFX9-NEXT:    s_setpc_b64 s[30:31]
467;
468; GFX8-LABEL: v_fmul_v6f16_fneg_lhs_fneg_rhs:
469; GFX8:       ; %bb.0:
470; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
471; GFX8-NEXT:    v_mul_f16_e32 v6, v0, v3
472; GFX8-NEXT:    v_mul_f16_sdwa v0, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
473; GFX8-NEXT:    v_mul_f16_e32 v3, v1, v4
474; GFX8-NEXT:    v_mul_f16_sdwa v1, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
475; GFX8-NEXT:    v_mul_f16_e32 v4, v2, v5
476; GFX8-NEXT:    v_mul_f16_sdwa v2, v2, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
477; GFX8-NEXT:    v_or_b32_e32 v0, v6, v0
478; GFX8-NEXT:    v_or_b32_e32 v1, v3, v1
479; GFX8-NEXT:    v_or_b32_e32 v2, v4, v2
480; GFX8-NEXT:    s_setpc_b64 s[30:31]
481;
482; GFX10-LABEL: v_fmul_v6f16_fneg_lhs_fneg_rhs:
483; GFX10:       ; %bb.0:
484; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
485; GFX10-NEXT:    v_pk_mul_f16 v0, v0, v3
486; GFX10-NEXT:    v_pk_mul_f16 v1, v1, v4
487; GFX10-NEXT:    v_pk_mul_f16 v2, v2, v5
488; GFX10-NEXT:    s_setpc_b64 s[30:31]
489  %neg.a = fneg <6 x half> %a
490  %neg.b = fneg <6 x half> %b
491  %mul = fmul <6 x half> %neg.a, %neg.b
492  ret <6 x half> %mul
493}
494
; Baseline <8 x half> fmul; the checks show each operand in four VGPRs
; (v0-v3 for %a, v4-v7 for %b).
; GFX9/GFX10 checks: four packed v_pk_mul_f16, one per register pair.
; GFX8 checks: each register pair is split into a low v_mul_f16_e32 and an
; SDWA WORD_1 multiply, and the four v_or_b32 merges are emitted together
; after all eight multiplies.
495define <8 x half> @v_fmul_v8f16(<8 x half> %a, <8 x half> %b) {
496; GFX9-LABEL: v_fmul_v8f16:
497; GFX9:       ; %bb.0:
498; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
499; GFX9-NEXT:    v_pk_mul_f16 v0, v0, v4
500; GFX9-NEXT:    v_pk_mul_f16 v1, v1, v5
501; GFX9-NEXT:    v_pk_mul_f16 v2, v2, v6
502; GFX9-NEXT:    v_pk_mul_f16 v3, v3, v7
503; GFX9-NEXT:    s_setpc_b64 s[30:31]
504;
505; GFX8-LABEL: v_fmul_v8f16:
506; GFX8:       ; %bb.0:
507; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
508; GFX8-NEXT:    v_mul_f16_e32 v8, v0, v4
509; GFX8-NEXT:    v_mul_f16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
510; GFX8-NEXT:    v_mul_f16_e32 v4, v1, v5
511; GFX8-NEXT:    v_mul_f16_sdwa v1, v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
512; GFX8-NEXT:    v_mul_f16_e32 v5, v2, v6
513; GFX8-NEXT:    v_mul_f16_sdwa v2, v2, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
514; GFX8-NEXT:    v_mul_f16_e32 v6, v3, v7
515; GFX8-NEXT:    v_mul_f16_sdwa v3, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
516; GFX8-NEXT:    v_or_b32_e32 v0, v8, v0
517; GFX8-NEXT:    v_or_b32_e32 v1, v4, v1
518; GFX8-NEXT:    v_or_b32_e32 v2, v5, v2
519; GFX8-NEXT:    v_or_b32_e32 v3, v6, v3
520; GFX8-NEXT:    s_setpc_b64 s[30:31]
521;
522; GFX10-LABEL: v_fmul_v8f16:
523; GFX10:       ; %bb.0:
524; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
525; GFX10-NEXT:    v_pk_mul_f16 v0, v0, v4
526; GFX10-NEXT:    v_pk_mul_f16 v1, v1, v5
527; GFX10-NEXT:    v_pk_mul_f16 v2, v2, v6
528; GFX10-NEXT:    v_pk_mul_f16 v3, v3, v7
529; GFX10-NEXT:    s_setpc_b64 s[30:31]
530  %mul = fmul <8 x half> %a, %b
531  ret <8 x half> %mul
532}
533
; <8 x half> fmul with the left operand negated.
; GFX9/GFX10 checks: neg_lo:[1,0] neg_hi:[1,0] on all four packed multiplies.
; GFX8 checks: the four left-operand registers (v0-v3) are XORed with
; 0x80008000 first, followed by the same split multiply / v_or_b32 sequence
; as the plain v_fmul_v8f16 test.
534define <8 x half> @v_fmul_v8f16_fneg_lhs(<8 x half> %a, <8 x half> %b) {
535; GFX9-LABEL: v_fmul_v8f16_fneg_lhs:
536; GFX9:       ; %bb.0:
537; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
538; GFX9-NEXT:    v_pk_mul_f16 v0, v0, v4 neg_lo:[1,0] neg_hi:[1,0]
539; GFX9-NEXT:    v_pk_mul_f16 v1, v1, v5 neg_lo:[1,0] neg_hi:[1,0]
540; GFX9-NEXT:    v_pk_mul_f16 v2, v2, v6 neg_lo:[1,0] neg_hi:[1,0]
541; GFX9-NEXT:    v_pk_mul_f16 v3, v3, v7 neg_lo:[1,0] neg_hi:[1,0]
542; GFX9-NEXT:    s_setpc_b64 s[30:31]
543;
544; GFX8-LABEL: v_fmul_v8f16_fneg_lhs:
545; GFX8:       ; %bb.0:
546; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
547; GFX8-NEXT:    v_xor_b32_e32 v0, 0x80008000, v0
548; GFX8-NEXT:    v_xor_b32_e32 v1, 0x80008000, v1
549; GFX8-NEXT:    v_xor_b32_e32 v2, 0x80008000, v2
550; GFX8-NEXT:    v_xor_b32_e32 v3, 0x80008000, v3
551; GFX8-NEXT:    v_mul_f16_e32 v8, v0, v4
552; GFX8-NEXT:    v_mul_f16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
553; GFX8-NEXT:    v_mul_f16_e32 v4, v1, v5
554; GFX8-NEXT:    v_mul_f16_sdwa v1, v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
555; GFX8-NEXT:    v_mul_f16_e32 v5, v2, v6
556; GFX8-NEXT:    v_mul_f16_sdwa v2, v2, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
557; GFX8-NEXT:    v_mul_f16_e32 v6, v3, v7
558; GFX8-NEXT:    v_mul_f16_sdwa v3, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
559; GFX8-NEXT:    v_or_b32_e32 v0, v8, v0
560; GFX8-NEXT:    v_or_b32_e32 v1, v4, v1
561; GFX8-NEXT:    v_or_b32_e32 v2, v5, v2
562; GFX8-NEXT:    v_or_b32_e32 v3, v6, v3
563; GFX8-NEXT:    s_setpc_b64 s[30:31]
564;
565; GFX10-LABEL: v_fmul_v8f16_fneg_lhs:
566; GFX10:       ; %bb.0:
567; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
568; GFX10-NEXT:    v_pk_mul_f16 v0, v0, v4 neg_lo:[1,0] neg_hi:[1,0]
569; GFX10-NEXT:    v_pk_mul_f16 v1, v1, v5 neg_lo:[1,0] neg_hi:[1,0]
570; GFX10-NEXT:    v_pk_mul_f16 v2, v2, v6 neg_lo:[1,0] neg_hi:[1,0]
571; GFX10-NEXT:    v_pk_mul_f16 v3, v3, v7 neg_lo:[1,0] neg_hi:[1,0]
572; GFX10-NEXT:    s_setpc_b64 s[30:31]
573  %neg.a = fneg <8 x half> %a
574  %mul = fmul <8 x half> %neg.a, %b
575  ret <8 x half> %mul
576}
577
; <8 x half> fmul with the right operand negated.
; GFX9/GFX10 checks: neg_lo:[0,1] neg_hi:[0,1] on all four packed multiplies.
; GFX8 checks: the four right-operand registers (v4-v7) are XORed with
; 0x80008000 first, followed by the same split multiply / v_or_b32 sequence
; as the plain v_fmul_v8f16 test.
578define <8 x half> @v_fmul_v8f16_fneg_rhs(<8 x half> %a, <8 x half> %b) {
579; GFX9-LABEL: v_fmul_v8f16_fneg_rhs:
580; GFX9:       ; %bb.0:
581; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
582; GFX9-NEXT:    v_pk_mul_f16 v0, v0, v4 neg_lo:[0,1] neg_hi:[0,1]
583; GFX9-NEXT:    v_pk_mul_f16 v1, v1, v5 neg_lo:[0,1] neg_hi:[0,1]
584; GFX9-NEXT:    v_pk_mul_f16 v2, v2, v6 neg_lo:[0,1] neg_hi:[0,1]
585; GFX9-NEXT:    v_pk_mul_f16 v3, v3, v7 neg_lo:[0,1] neg_hi:[0,1]
586; GFX9-NEXT:    s_setpc_b64 s[30:31]
587;
588; GFX8-LABEL: v_fmul_v8f16_fneg_rhs:
589; GFX8:       ; %bb.0:
590; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
591; GFX8-NEXT:    v_xor_b32_e32 v4, 0x80008000, v4
592; GFX8-NEXT:    v_xor_b32_e32 v5, 0x80008000, v5
593; GFX8-NEXT:    v_xor_b32_e32 v6, 0x80008000, v6
594; GFX8-NEXT:    v_xor_b32_e32 v7, 0x80008000, v7
595; GFX8-NEXT:    v_mul_f16_e32 v8, v0, v4
596; GFX8-NEXT:    v_mul_f16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
597; GFX8-NEXT:    v_mul_f16_e32 v4, v1, v5
598; GFX8-NEXT:    v_mul_f16_sdwa v1, v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
599; GFX8-NEXT:    v_mul_f16_e32 v5, v2, v6
600; GFX8-NEXT:    v_mul_f16_sdwa v2, v2, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
601; GFX8-NEXT:    v_mul_f16_e32 v6, v3, v7
602; GFX8-NEXT:    v_mul_f16_sdwa v3, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
603; GFX8-NEXT:    v_or_b32_e32 v0, v8, v0
604; GFX8-NEXT:    v_or_b32_e32 v1, v4, v1
605; GFX8-NEXT:    v_or_b32_e32 v2, v5, v2
606; GFX8-NEXT:    v_or_b32_e32 v3, v6, v3
607; GFX8-NEXT:    s_setpc_b64 s[30:31]
608;
609; GFX10-LABEL: v_fmul_v8f16_fneg_rhs:
610; GFX10:       ; %bb.0:
611; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
612; GFX10-NEXT:    v_pk_mul_f16 v0, v0, v4 neg_lo:[0,1] neg_hi:[0,1]
613; GFX10-NEXT:    v_pk_mul_f16 v1, v1, v5 neg_lo:[0,1] neg_hi:[0,1]
614; GFX10-NEXT:    v_pk_mul_f16 v2, v2, v6 neg_lo:[0,1] neg_hi:[0,1]
615; GFX10-NEXT:    v_pk_mul_f16 v3, v3, v7 neg_lo:[0,1] neg_hi:[0,1]
616; GFX10-NEXT:    s_setpc_b64 s[30:31]
617  %neg.b = fneg <8 x half> %b
618  %mul = fmul <8 x half> %a, %neg.b
619  ret <8 x half> %mul
620}
621
; <8 x half> fmul with both operands negated.
; Every check prefix expects the same instruction sequence as the plain
; v_fmul_v8f16 test: no neg_lo/neg_hi modifiers and no v_xor_b32 appear, so
; the two fnegs are expected to cancel out entirely.
622define <8 x half> @v_fmul_v8f16_fneg_lhs_fneg_rhs(<8 x half> %a, <8 x half> %b) {
623; GFX9-LABEL: v_fmul_v8f16_fneg_lhs_fneg_rhs:
624; GFX9:       ; %bb.0:
625; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
626; GFX9-NEXT:    v_pk_mul_f16 v0, v0, v4
627; GFX9-NEXT:    v_pk_mul_f16 v1, v1, v5
628; GFX9-NEXT:    v_pk_mul_f16 v2, v2, v6
629; GFX9-NEXT:    v_pk_mul_f16 v3, v3, v7
630; GFX9-NEXT:    s_setpc_b64 s[30:31]
631;
632; GFX8-LABEL: v_fmul_v8f16_fneg_lhs_fneg_rhs:
633; GFX8:       ; %bb.0:
634; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
635; GFX8-NEXT:    v_mul_f16_e32 v8, v0, v4
636; GFX8-NEXT:    v_mul_f16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
637; GFX8-NEXT:    v_mul_f16_e32 v4, v1, v5
638; GFX8-NEXT:    v_mul_f16_sdwa v1, v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
639; GFX8-NEXT:    v_mul_f16_e32 v5, v2, v6
640; GFX8-NEXT:    v_mul_f16_sdwa v2, v2, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
641; GFX8-NEXT:    v_mul_f16_e32 v6, v3, v7
642; GFX8-NEXT:    v_mul_f16_sdwa v3, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
643; GFX8-NEXT:    v_or_b32_e32 v0, v8, v0
644; GFX8-NEXT:    v_or_b32_e32 v1, v4, v1
645; GFX8-NEXT:    v_or_b32_e32 v2, v5, v2
646; GFX8-NEXT:    v_or_b32_e32 v3, v6, v3
647; GFX8-NEXT:    s_setpc_b64 s[30:31]
648;
649; GFX10-LABEL: v_fmul_v8f16_fneg_lhs_fneg_rhs:
650; GFX10:       ; %bb.0:
651; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
652; GFX10-NEXT:    v_pk_mul_f16 v0, v0, v4
653; GFX10-NEXT:    v_pk_mul_f16 v1, v1, v5
654; GFX10-NEXT:    v_pk_mul_f16 v2, v2, v6
655; GFX10-NEXT:    v_pk_mul_f16 v3, v3, v7
656; GFX10-NEXT:    s_setpc_b64 s[30:31]
657  %neg.a = fneg <8 x half> %a
658  %neg.b = fneg <8 x half> %b
659  %mul = fmul <8 x half> %neg.a, %neg.b
660  ret <8 x half> %mul
661}
662