; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx | FileCheck %s -check-prefixes=NOFMA
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+fma | FileCheck %s -check-prefixes=FMA3,FMA3_256
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+fma,+avx512f | FileCheck %s -check-prefixes=FMA3,FMA3_512
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+fma4 | FileCheck %s -check-prefixes=FMA4

; This test checks the fusing of MUL + ADDSUB into FMADDSUB (and of the
; mirrored MUL + SUBADD pattern into FMSUBADD).
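;
; The combine looks for an FMUL whose result feeds both an FSUB and an FADD
; against the same addend, blended so that even lanes take the FSUB and odd
; lanes take the FADD. As a rough, illustrative sketch (not a CHECK'd
; function), the matched IR looks like:
;
;   %AB = fmul <2 x double> %A, %B
;   %Sub = fsub <2 x double> %AB, %C
;   %Add = fadd <2 x double> %AB, %C
;   %Addsub = shufflevector <2 x double> %Sub, <2 x double> %Add, <2 x i32> <i32 0, i32 3>
;
; FMADDSUB computes A*B - C in even lanes and A*B + C in odd lanes, so the
; sequence collapses into a single instruction when FMA or FMA4 is available;
; the NOFMA run keeps the separate VMULP*/VADDSUBP* pair.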

define <2 x double> @mul_addsub_pd128(<2 x double> %A, <2 x double> %B, <2 x double> %C) #0 {
; NOFMA-LABEL: mul_addsub_pd128:
; NOFMA:       # %bb.0: # %entry
; NOFMA-NEXT:    vmulpd %xmm1, %xmm0, %xmm0
; NOFMA-NEXT:    vaddsubpd %xmm2, %xmm0, %xmm0
; NOFMA-NEXT:    retq
;
; FMA3-LABEL: mul_addsub_pd128:
; FMA3:       # %bb.0: # %entry
; FMA3-NEXT:    vfmaddsub213pd {{.*#+}} xmm0 = (xmm1 * xmm0) +/- xmm2
; FMA3-NEXT:    retq
;
; FMA4-LABEL: mul_addsub_pd128:
; FMA4:       # %bb.0: # %entry
; FMA4-NEXT:    vfmaddsubpd {{.*#+}} xmm0 = (xmm0 * xmm1) +/- xmm2
; FMA4-NEXT:    retq
entry:
  %AB = fmul <2 x double> %A, %B
  %Sub = fsub <2 x double> %AB, %C
  %Add = fadd <2 x double> %AB, %C
  %Addsub = shufflevector <2 x double> %Sub, <2 x double> %Add, <2 x i32> <i32 0, i32 3>
  ret <2 x double> %Addsub
}

define <4 x float> @mul_addsub_ps128(<4 x float> %A, <4 x float> %B, <4 x float> %C) #0 {
; NOFMA-LABEL: mul_addsub_ps128:
; NOFMA:       # %bb.0: # %entry
; NOFMA-NEXT:    vmulps %xmm1, %xmm0, %xmm0
; NOFMA-NEXT:    vaddsubps %xmm2, %xmm0, %xmm0
; NOFMA-NEXT:    retq
;
; FMA3-LABEL: mul_addsub_ps128:
; FMA3:       # %bb.0: # %entry
; FMA3-NEXT:    vfmaddsub213ps {{.*#+}} xmm0 = (xmm1 * xmm0) +/- xmm2
; FMA3-NEXT:    retq
;
; FMA4-LABEL: mul_addsub_ps128:
; FMA4:       # %bb.0: # %entry
; FMA4-NEXT:    vfmaddsubps {{.*#+}} xmm0 = (xmm0 * xmm1) +/- xmm2
; FMA4-NEXT:    retq
entry:
  %AB = fmul <4 x float> %A, %B
  %Sub = fsub <4 x float> %AB, %C
  %Add = fadd <4 x float> %AB, %C
  %Addsub = shufflevector <4 x float> %Sub, <4 x float> %Add, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
  ret <4 x float> %Addsub
}

define <4 x double> @mul_addsub_pd256(<4 x double> %A, <4 x double> %B, <4 x double> %C) #0 {
; NOFMA-LABEL: mul_addsub_pd256:
; NOFMA:       # %bb.0: # %entry
; NOFMA-NEXT:    vmulpd %ymm1, %ymm0, %ymm0
; NOFMA-NEXT:    vaddsubpd %ymm2, %ymm0, %ymm0
; NOFMA-NEXT:    retq
;
; FMA3-LABEL: mul_addsub_pd256:
; FMA3:       # %bb.0: # %entry
; FMA3-NEXT:    vfmaddsub213pd {{.*#+}} ymm0 = (ymm1 * ymm0) +/- ymm2
; FMA3-NEXT:    retq
;
; FMA4-LABEL: mul_addsub_pd256:
; FMA4:       # %bb.0: # %entry
; FMA4-NEXT:    vfmaddsubpd {{.*#+}} ymm0 = (ymm0 * ymm1) +/- ymm2
; FMA4-NEXT:    retq
entry:
  %AB = fmul <4 x double> %A, %B
  %Sub = fsub <4 x double> %AB, %C
  %Add = fadd <4 x double> %AB, %C
  %Addsub = shufflevector <4 x double> %Sub, <4 x double> %Add, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
  ret <4 x double> %Addsub
}

define <8 x float> @mul_addsub_ps256(<8 x float> %A, <8 x float> %B, <8 x float> %C) #0 {
; NOFMA-LABEL: mul_addsub_ps256:
; NOFMA:       # %bb.0: # %entry
; NOFMA-NEXT:    vmulps %ymm1, %ymm0, %ymm0
; NOFMA-NEXT:    vaddsubps %ymm2, %ymm0, %ymm0
; NOFMA-NEXT:    retq
;
; FMA3-LABEL: mul_addsub_ps256:
; FMA3:       # %bb.0: # %entry
; FMA3-NEXT:    vfmaddsub213ps {{.*#+}} ymm0 = (ymm1 * ymm0) +/- ymm2
; FMA3-NEXT:    retq
;
; FMA4-LABEL: mul_addsub_ps256:
; FMA4:       # %bb.0: # %entry
; FMA4-NEXT:    vfmaddsubps {{.*#+}} ymm0 = (ymm0 * ymm1) +/- ymm2
; FMA4-NEXT:    retq
entry:
  %AB = fmul <8 x float> %A, %B
  %Sub = fsub <8 x float> %AB, %C
  %Add = fadd <8 x float> %AB, %C
  %Addsub = shufflevector <8 x float> %Sub, <8 x float> %Add, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
  ret <8 x float> %Addsub
}

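; For the 512-bit cases below, the FMA3_256 and FMA4 runs split the operation
; into two 256-bit halves, while the FMA3_512 run (+avx512f) keeps a single
; zmm instruction.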
define <8 x double> @mul_addsub_pd512(<8 x double> %A, <8 x double> %B, <8 x double> %C) #0 {
; NOFMA-LABEL: mul_addsub_pd512:
; NOFMA:       # %bb.0: # %entry
; NOFMA-NEXT:    vmulpd %ymm3, %ymm1, %ymm1
; NOFMA-NEXT:    vmulpd %ymm2, %ymm0, %ymm0
; NOFMA-NEXT:    vaddsubpd %ymm4, %ymm0, %ymm0
; NOFMA-NEXT:    vaddsubpd %ymm5, %ymm1, %ymm1
; NOFMA-NEXT:    retq
;
; FMA3_256-LABEL: mul_addsub_pd512:
; FMA3_256:       # %bb.0: # %entry
; FMA3_256-NEXT:    vfmaddsub213pd {{.*#+}} ymm0 = (ymm2 * ymm0) +/- ymm4
; FMA3_256-NEXT:    vfmaddsub213pd {{.*#+}} ymm1 = (ymm3 * ymm1) +/- ymm5
; FMA3_256-NEXT:    retq
;
; FMA3_512-LABEL: mul_addsub_pd512:
; FMA3_512:       # %bb.0: # %entry
; FMA3_512-NEXT:    vfmaddsub213pd {{.*#+}} zmm0 = (zmm1 * zmm0) +/- zmm2
; FMA3_512-NEXT:    retq
;
; FMA4-LABEL: mul_addsub_pd512:
; FMA4:       # %bb.0: # %entry
; FMA4-NEXT:    vfmaddsubpd {{.*#+}} ymm0 = (ymm0 * ymm2) +/- ymm4
; FMA4-NEXT:    vfmaddsubpd {{.*#+}} ymm1 = (ymm1 * ymm3) +/- ymm5
; FMA4-NEXT:    retq
entry:
  %AB = fmul <8 x double> %A, %B
  %Sub = fsub <8 x double> %AB, %C
  %Add = fadd <8 x double> %AB, %C
  %Addsub = shufflevector <8 x double> %Sub, <8 x double> %Add, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
  ret <8 x double> %Addsub
}

define <16 x float> @mul_addsub_ps512(<16 x float> %A, <16 x float> %B, <16 x float> %C) #0 {
; NOFMA-LABEL: mul_addsub_ps512:
; NOFMA:       # %bb.0: # %entry
; NOFMA-NEXT:    vmulps %ymm3, %ymm1, %ymm1
; NOFMA-NEXT:    vmulps %ymm2, %ymm0, %ymm0
; NOFMA-NEXT:    vaddsubps %ymm4, %ymm0, %ymm0
; NOFMA-NEXT:    vaddsubps %ymm5, %ymm1, %ymm1
; NOFMA-NEXT:    retq
;
; FMA3_256-LABEL: mul_addsub_ps512:
; FMA3_256:       # %bb.0: # %entry
; FMA3_256-NEXT:    vfmaddsub213ps {{.*#+}} ymm0 = (ymm2 * ymm0) +/- ymm4
; FMA3_256-NEXT:    vfmaddsub213ps {{.*#+}} ymm1 = (ymm3 * ymm1) +/- ymm5
; FMA3_256-NEXT:    retq
;
; FMA3_512-LABEL: mul_addsub_ps512:
; FMA3_512:       # %bb.0: # %entry
; FMA3_512-NEXT:    vfmaddsub213ps {{.*#+}} zmm0 = (zmm1 * zmm0) +/- zmm2
; FMA3_512-NEXT:    retq
;
; FMA4-LABEL: mul_addsub_ps512:
; FMA4:       # %bb.0: # %entry
; FMA4-NEXT:    vfmaddsubps {{.*#+}} ymm0 = (ymm0 * ymm2) +/- ymm4
; FMA4-NEXT:    vfmaddsubps {{.*#+}} ymm1 = (ymm1 * ymm3) +/- ymm5
; FMA4-NEXT:    retq
entry:
  %AB = fmul <16 x float> %A, %B
  %Sub = fsub <16 x float> %AB, %C
  %Add = fadd <16 x float> %AB, %C
  %Addsub = shufflevector <16 x float> %Sub, <16 x float> %Add, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31>
  ret <16 x float> %Addsub
}

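; The buildvector_* tests below express the same addsub blend through scalar
; extractelement/insertelement chains instead of a shufflevector, so the
; combine also has to recognize the BUILD_VECTOR form. Per lane i, roughly
; (illustrative sketch):
;
;   %Ai = extractelement <4 x float> %A, i32 i
;   %Bi = extractelement <4 x float> %B, i32 i
;   %subi = fsub float %Ai, %Bi    ; even lanes
;   %addi = fadd float %Ai, %Bi    ; odd lanes
;
; with the scalar results reassembled into the output via insertelement.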
define <4 x float> @buildvector_mul_addsub_ps128(<4 x float> %C, <4 x float> %D, <4 x float> %B) #0 {
; NOFMA-LABEL: buildvector_mul_addsub_ps128:
; NOFMA:       # %bb.0: # %bb
; NOFMA-NEXT:    vmulps %xmm1, %xmm0, %xmm0
; NOFMA-NEXT:    vaddsubps %xmm2, %xmm0, %xmm0
; NOFMA-NEXT:    retq
;
; FMA3-LABEL: buildvector_mul_addsub_ps128:
; FMA3:       # %bb.0: # %bb
; FMA3-NEXT:    vfmaddsub213ps {{.*#+}} xmm0 = (xmm1 * xmm0) +/- xmm2
; FMA3-NEXT:    retq
;
; FMA4-LABEL: buildvector_mul_addsub_ps128:
; FMA4:       # %bb.0: # %bb
; FMA4-NEXT:    vfmaddsubps {{.*#+}} xmm0 = (xmm0 * xmm1) +/- xmm2
; FMA4-NEXT:    retq
bb:
  %A = fmul <4 x float> %C, %D
  %A0 = extractelement <4 x float> %A, i32 0
  %B0 = extractelement <4 x float> %B, i32 0
  %sub0 = fsub float %A0, %B0
  %A2 = extractelement <4 x float> %A, i32 2
  %B2 = extractelement <4 x float> %B, i32 2
  %sub2 = fsub float %A2, %B2
  %A1 = extractelement <4 x float> %A, i32 1
  %B1 = extractelement <4 x float> %B, i32 1
  %add1 = fadd float %A1, %B1
  %A3 = extractelement <4 x float> %A, i32 3
  %B3 = extractelement <4 x float> %B, i32 3
  %add3 = fadd float %A3, %B3
  %vecinsert1 = insertelement <4 x float> undef, float %sub0, i32 0
  %vecinsert2 = insertelement <4 x float> %vecinsert1, float %add1, i32 1
  %vecinsert3 = insertelement <4 x float> %vecinsert2, float %sub2, i32 2
  %vecinsert4 = insertelement <4 x float> %vecinsert3, float %add3, i32 3
  ret <4 x float> %vecinsert4
}

define <2 x double> @buildvector_mul_addsub_pd128(<2 x double> %C, <2 x double> %D, <2 x double> %B) #0 {
; NOFMA-LABEL: buildvector_mul_addsub_pd128:
; NOFMA:       # %bb.0: # %bb
; NOFMA-NEXT:    vmulpd %xmm1, %xmm0, %xmm0
; NOFMA-NEXT:    vaddsubpd %xmm2, %xmm0, %xmm0
; NOFMA-NEXT:    retq
;
; FMA3-LABEL: buildvector_mul_addsub_pd128:
; FMA3:       # %bb.0: # %bb
; FMA3-NEXT:    vfmaddsub213pd {{.*#+}} xmm0 = (xmm1 * xmm0) +/- xmm2
; FMA3-NEXT:    retq
;
; FMA4-LABEL: buildvector_mul_addsub_pd128:
; FMA4:       # %bb.0: # %bb
; FMA4-NEXT:    vfmaddsubpd {{.*#+}} xmm0 = (xmm0 * xmm1) +/- xmm2
; FMA4-NEXT:    retq
bb:
  %A = fmul <2 x double> %C, %D
  %A0 = extractelement <2 x double> %A, i32 0
  %B0 = extractelement <2 x double> %B, i32 0
  %sub0 = fsub double %A0, %B0
  %A1 = extractelement <2 x double> %A, i32 1
  %B1 = extractelement <2 x double> %B, i32 1
  %add1 = fadd double %A1, %B1
  %vecinsert1 = insertelement <2 x double> undef, double %sub0, i32 0
  %vecinsert2 = insertelement <2 x double> %vecinsert1, double %add1, i32 1
  ret <2 x double> %vecinsert2
}

define <8 x float> @buildvector_mul_addsub_ps256(<8 x float> %C, <8 x float> %D, <8 x float> %B) #0 {
; NOFMA-LABEL: buildvector_mul_addsub_ps256:
; NOFMA:       # %bb.0: # %bb
; NOFMA-NEXT:    vmulps %ymm1, %ymm0, %ymm0
; NOFMA-NEXT:    vaddsubps %ymm2, %ymm0, %ymm0
; NOFMA-NEXT:    retq
;
; FMA3-LABEL: buildvector_mul_addsub_ps256:
; FMA3:       # %bb.0: # %bb
; FMA3-NEXT:    vfmaddsub213ps {{.*#+}} ymm0 = (ymm1 * ymm0) +/- ymm2
; FMA3-NEXT:    retq
;
; FMA4-LABEL: buildvector_mul_addsub_ps256:
; FMA4:       # %bb.0: # %bb
; FMA4-NEXT:    vfmaddsubps {{.*#+}} ymm0 = (ymm0 * ymm1) +/- ymm2
; FMA4-NEXT:    retq
bb:
  %A = fmul <8 x float> %C, %D
  %A0 = extractelement <8 x float> %A, i32 0
  %B0 = extractelement <8 x float> %B, i32 0
  %sub0 = fsub float %A0, %B0
  %A2 = extractelement <8 x float> %A, i32 2
  %B2 = extractelement <8 x float> %B, i32 2
  %sub2 = fsub float %A2, %B2
  %A4 = extractelement <8 x float> %A, i32 4
  %B4 = extractelement <8 x float> %B, i32 4
  %sub4 = fsub float %A4, %B4
  %A6 = extractelement <8 x float> %A, i32 6
  %B6 = extractelement <8 x float> %B, i32 6
  %sub6 = fsub float %A6, %B6
  %A1 = extractelement <8 x float> %A, i32 1
  %B1 = extractelement <8 x float> %B, i32 1
  %add1 = fadd float %A1, %B1
  %A3 = extractelement <8 x float> %A, i32 3
  %B3 = extractelement <8 x float> %B, i32 3
  %add3 = fadd float %A3, %B3
  %A5 = extractelement <8 x float> %A, i32 5
  %B5 = extractelement <8 x float> %B, i32 5
  %add5 = fadd float %A5, %B5
  %A7 = extractelement <8 x float> %A, i32 7
  %B7 = extractelement <8 x float> %B, i32 7
  %add7 = fadd float %A7, %B7
  %vecinsert1 = insertelement <8 x float> undef, float %sub0, i32 0
  %vecinsert2 = insertelement <8 x float> %vecinsert1, float %add1, i32 1
  %vecinsert3 = insertelement <8 x float> %vecinsert2, float %sub2, i32 2
  %vecinsert4 = insertelement <8 x float> %vecinsert3, float %add3, i32 3
  %vecinsert5 = insertelement <8 x float> %vecinsert4, float %sub4, i32 4
  %vecinsert6 = insertelement <8 x float> %vecinsert5, float %add5, i32 5
  %vecinsert7 = insertelement <8 x float> %vecinsert6, float %sub6, i32 6
  %vecinsert8 = insertelement <8 x float> %vecinsert7, float %add7, i32 7
  ret <8 x float> %vecinsert8
}

define <4 x double> @buildvector_mul_addsub_pd256(<4 x double> %C, <4 x double> %D, <4 x double> %B) #0 {
; NOFMA-LABEL: buildvector_mul_addsub_pd256:
; NOFMA:       # %bb.0: # %bb
; NOFMA-NEXT:    vmulpd %ymm1, %ymm0, %ymm0
; NOFMA-NEXT:    vaddsubpd %ymm2, %ymm0, %ymm0
; NOFMA-NEXT:    retq
;
; FMA3-LABEL: buildvector_mul_addsub_pd256:
; FMA3:       # %bb.0: # %bb
; FMA3-NEXT:    vfmaddsub213pd {{.*#+}} ymm0 = (ymm1 * ymm0) +/- ymm2
; FMA3-NEXT:    retq
;
; FMA4-LABEL: buildvector_mul_addsub_pd256:
; FMA4:       # %bb.0: # %bb
; FMA4-NEXT:    vfmaddsubpd {{.*#+}} ymm0 = (ymm0 * ymm1) +/- ymm2
; FMA4-NEXT:    retq
bb:
  %A = fmul <4 x double> %C, %D
  %A0 = extractelement <4 x double> %A, i32 0
  %B0 = extractelement <4 x double> %B, i32 0
  %sub0 = fsub double %A0, %B0
  %A2 = extractelement <4 x double> %A, i32 2
  %B2 = extractelement <4 x double> %B, i32 2
  %sub2 = fsub double %A2, %B2
  %A1 = extractelement <4 x double> %A, i32 1
  %B1 = extractelement <4 x double> %B, i32 1
  %add1 = fadd double %A1, %B1
  %A3 = extractelement <4 x double> %A, i32 3
  %B3 = extractelement <4 x double> %B, i32 3
  %add3 = fadd double %A3, %B3
  %vecinsert1 = insertelement <4 x double> undef, double %sub0, i32 0
  %vecinsert2 = insertelement <4 x double> %vecinsert1, double %add1, i32 1
  %vecinsert3 = insertelement <4 x double> %vecinsert2, double %sub2, i32 2
  %vecinsert4 = insertelement <4 x double> %vecinsert3, double %add3, i32 3
  ret <4 x double> %vecinsert4
}

define <16 x float> @buildvector_mul_addsub_ps512(<16 x float> %C, <16 x float> %D, <16 x float> %B) #0 {
; NOFMA-LABEL: buildvector_mul_addsub_ps512:
; NOFMA:       # %bb.0: # %bb
; NOFMA-NEXT:    vmulps %ymm3, %ymm1, %ymm1
; NOFMA-NEXT:    vmulps %ymm2, %ymm0, %ymm0
; NOFMA-NEXT:    vaddsubps %ymm4, %ymm0, %ymm0
; NOFMA-NEXT:    vaddsubps %ymm5, %ymm1, %ymm1
; NOFMA-NEXT:    retq
;
; FMA3_256-LABEL: buildvector_mul_addsub_ps512:
; FMA3_256:       # %bb.0: # %bb
; FMA3_256-NEXT:    vfmaddsub213ps {{.*#+}} ymm0 = (ymm2 * ymm0) +/- ymm4
; FMA3_256-NEXT:    vfmaddsub213ps {{.*#+}} ymm1 = (ymm3 * ymm1) +/- ymm5
; FMA3_256-NEXT:    retq
;
; FMA3_512-LABEL: buildvector_mul_addsub_ps512:
; FMA3_512:       # %bb.0: # %bb
; FMA3_512-NEXT:    vfmaddsub213ps {{.*#+}} zmm0 = (zmm1 * zmm0) +/- zmm2
; FMA3_512-NEXT:    retq
;
; FMA4-LABEL: buildvector_mul_addsub_ps512:
; FMA4:       # %bb.0: # %bb
; FMA4-NEXT:    vfmaddsubps {{.*#+}} ymm0 = (ymm0 * ymm2) +/- ymm4
; FMA4-NEXT:    vfmaddsubps {{.*#+}} ymm1 = (ymm1 * ymm3) +/- ymm5
; FMA4-NEXT:    retq
bb:
  %A = fmul <16 x float> %C, %D
  %A0 = extractelement <16 x float> %A, i32 0
  %B0 = extractelement <16 x float> %B, i32 0
  %sub0 = fsub float %A0, %B0
  %A2 = extractelement <16 x float> %A, i32 2
  %B2 = extractelement <16 x float> %B, i32 2
  %sub2 = fsub float %A2, %B2
  %A4 = extractelement <16 x float> %A, i32 4
  %B4 = extractelement <16 x float> %B, i32 4
  %sub4 = fsub float %A4, %B4
  %A6 = extractelement <16 x float> %A, i32 6
  %B6 = extractelement <16 x float> %B, i32 6
  %sub6 = fsub float %A6, %B6
  %A8 = extractelement <16 x float> %A, i32 8
  %B8 = extractelement <16 x float> %B, i32 8
  %sub8 = fsub float %A8, %B8
  %A10 = extractelement <16 x float> %A, i32 10
  %B10 = extractelement <16 x float> %B, i32 10
  %sub10 = fsub float %A10, %B10
  %A12 = extractelement <16 x float> %A, i32 12
  %B12 = extractelement <16 x float> %B, i32 12
  %sub12 = fsub float %A12, %B12
  %A14 = extractelement <16 x float> %A, i32 14
  %B14 = extractelement <16 x float> %B, i32 14
  %sub14 = fsub float %A14, %B14
  %A1 = extractelement <16 x float> %A, i32 1
  %B1 = extractelement <16 x float> %B, i32 1
  %add1 = fadd float %A1, %B1
  %A3 = extractelement <16 x float> %A, i32 3
  %B3 = extractelement <16 x float> %B, i32 3
  %add3 = fadd float %A3, %B3
  %A5 = extractelement <16 x float> %A, i32 5
  %B5 = extractelement <16 x float> %B, i32 5
  %add5 = fadd float %A5, %B5
  %A7 = extractelement <16 x float> %A, i32 7
  %B7 = extractelement <16 x float> %B, i32 7
  %add7 = fadd float %A7, %B7
  %A9 = extractelement <16 x float> %A, i32 9
  %B9 = extractelement <16 x float> %B, i32 9
  %add9 = fadd float %A9, %B9
  %A11 = extractelement <16 x float> %A, i32 11
  %B11 = extractelement <16 x float> %B, i32 11
  %add11 = fadd float %A11, %B11
  %A13 = extractelement <16 x float> %A, i32 13
  %B13 = extractelement <16 x float> %B, i32 13
  %add13 = fadd float %A13, %B13
  %A15 = extractelement <16 x float> %A, i32 15
  %B15 = extractelement <16 x float> %B, i32 15
  %add15 = fadd float %A15, %B15
  %vecinsert1 = insertelement <16 x float> undef, float %sub0, i32 0
  %vecinsert2 = insertelement <16 x float> %vecinsert1, float %add1, i32 1
  %vecinsert3 = insertelement <16 x float> %vecinsert2, float %sub2, i32 2
  %vecinsert4 = insertelement <16 x float> %vecinsert3, float %add3, i32 3
  %vecinsert5 = insertelement <16 x float> %vecinsert4, float %sub4, i32 4
  ; element 5 is undef
  %vecinsert7 = insertelement <16 x float> %vecinsert5, float %sub6, i32 6
  %vecinsert8 = insertelement <16 x float> %vecinsert7, float %add7, i32 7
  %vecinsert9 = insertelement <16 x float> %vecinsert8, float %sub8, i32 8
  %vecinsert10 = insertelement <16 x float> %vecinsert9, float %add9, i32 9
  %vecinsert11 = insertelement <16 x float> %vecinsert10, float %sub10, i32 10
  %vecinsert12 = insertelement <16 x float> %vecinsert11, float %add11, i32 11
  ; element 12 is undef
  %vecinsert14 = insertelement <16 x float> %vecinsert12, float %add13, i32 13
  %vecinsert15 = insertelement <16 x float> %vecinsert14, float %sub14, i32 14
  %vecinsert16 = insertelement <16 x float> %vecinsert15, float %add15, i32 15
  ret <16 x float> %vecinsert16
}

define <8 x double> @buildvector_mul_addsub_pd512(<8 x double> %C, <8 x double> %D, <8 x double> %B) #0 {
; NOFMA-LABEL: buildvector_mul_addsub_pd512:
; NOFMA:       # %bb.0: # %bb
; NOFMA-NEXT:    vmulpd %ymm3, %ymm1, %ymm1
; NOFMA-NEXT:    vmulpd %ymm2, %ymm0, %ymm0
; NOFMA-NEXT:    vaddsubpd %ymm4, %ymm0, %ymm0
; NOFMA-NEXT:    vaddsubpd %ymm5, %ymm1, %ymm1
; NOFMA-NEXT:    retq
;
; FMA3_256-LABEL: buildvector_mul_addsub_pd512:
; FMA3_256:       # %bb.0: # %bb
; FMA3_256-NEXT:    vfmaddsub213pd {{.*#+}} ymm0 = (ymm2 * ymm0) +/- ymm4
; FMA3_256-NEXT:    vfmaddsub213pd {{.*#+}} ymm1 = (ymm3 * ymm1) +/- ymm5
; FMA3_256-NEXT:    retq
;
; FMA3_512-LABEL: buildvector_mul_addsub_pd512:
; FMA3_512:       # %bb.0: # %bb
; FMA3_512-NEXT:    vfmaddsub213pd {{.*#+}} zmm0 = (zmm1 * zmm0) +/- zmm2
; FMA3_512-NEXT:    retq
;
; FMA4-LABEL: buildvector_mul_addsub_pd512:
; FMA4:       # %bb.0: # %bb
; FMA4-NEXT:    vfmaddsubpd {{.*#+}} ymm0 = (ymm0 * ymm2) +/- ymm4
; FMA4-NEXT:    vfmaddsubpd {{.*#+}} ymm1 = (ymm1 * ymm3) +/- ymm5
; FMA4-NEXT:    retq
bb:
  %A = fmul <8 x double> %C, %D
  %A0 = extractelement <8 x double> %A, i32 0
  %B0 = extractelement <8 x double> %B, i32 0
  %sub0 = fsub double %A0, %B0
  %A2 = extractelement <8 x double> %A, i32 2
  %B2 = extractelement <8 x double> %B, i32 2
  %sub2 = fsub double %A2, %B2
  %A4 = extractelement <8 x double> %A, i32 4
  %B4 = extractelement <8 x double> %B, i32 4
  %sub4 = fsub double %A4, %B4
  %A6 = extractelement <8 x double> %A, i32 6
  %B6 = extractelement <8 x double> %B, i32 6
  %sub6 = fsub double %A6, %B6
  %A1 = extractelement <8 x double> %A, i32 1
  %B1 = extractelement <8 x double> %B, i32 1
  %add1 = fadd double %A1, %B1
  %A3 = extractelement <8 x double> %A, i32 3
  %B3 = extractelement <8 x double> %B, i32 3
  %add3 = fadd double %A3, %B3
  %A7 = extractelement <8 x double> %A, i32 7
  %B7 = extractelement <8 x double> %B, i32 7
  %add7 = fadd double %A7, %B7
  %vecinsert1 = insertelement <8 x double> undef, double %sub0, i32 0
  %vecinsert2 = insertelement <8 x double> %vecinsert1, double %add1, i32 1
  %vecinsert3 = insertelement <8 x double> %vecinsert2, double %sub2, i32 2
  %vecinsert4 = insertelement <8 x double> %vecinsert3, double %add3, i32 3
  %vecinsert5 = insertelement <8 x double> %vecinsert4, double %sub4, i32 4
  ; element 5 is undef
  %vecinsert7 = insertelement <8 x double> %vecinsert5, double %sub6, i32 6
  %vecinsert8 = insertelement <8 x double> %vecinsert7, double %add7, i32 7
  ret <8 x double> %vecinsert8
}

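; The buildvector_mul_subadd_* tests below check the mirrored FMSUBADD
; pattern: even lanes take the FADD and odd lanes take the FSUB, matching
; VFMSUBADD's A*B + C in even lanes and A*B - C in odd lanes. Note that the
; %sub*/%add* value names are inherited from the addsub template above, so
; they are swapped relative to the operations they actually perform here.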
define <4 x float> @buildvector_mul_subadd_ps128(<4 x float> %C, <4 x float> %D, <4 x float> %B) #0 {
; NOFMA-LABEL: buildvector_mul_subadd_ps128:
; NOFMA:       # %bb.0: # %bb
; NOFMA-NEXT:    vmulps %xmm1, %xmm0, %xmm0
; NOFMA-NEXT:    vaddss %xmm2, %xmm0, %xmm1
; NOFMA-NEXT:    vshufpd {{.*#+}} xmm3 = xmm0[1,0]
; NOFMA-NEXT:    vshufpd {{.*#+}} xmm4 = xmm2[1,0]
; NOFMA-NEXT:    vaddss %xmm4, %xmm3, %xmm3
; NOFMA-NEXT:    vmovshdup {{.*#+}} xmm4 = xmm0[1,1,3,3]
; NOFMA-NEXT:    vmovshdup {{.*#+}} xmm5 = xmm2[1,1,3,3]
; NOFMA-NEXT:    vsubss %xmm5, %xmm4, %xmm4
; NOFMA-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[2,3]
; NOFMA-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0],xmm1[3]
; NOFMA-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; NOFMA-NEXT:    vshufps {{.*#+}} xmm2 = xmm2[3,3,3,3]
; NOFMA-NEXT:    vsubss %xmm2, %xmm0, %xmm0
; NOFMA-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; NOFMA-NEXT:    retq
;
; FMA3-LABEL: buildvector_mul_subadd_ps128:
; FMA3:       # %bb.0: # %bb
; FMA3-NEXT:    vfmsubadd213ps {{.*#+}} xmm0 = (xmm1 * xmm0) -/+ xmm2
; FMA3-NEXT:    retq
;
; FMA4-LABEL: buildvector_mul_subadd_ps128:
; FMA4:       # %bb.0: # %bb
; FMA4-NEXT:    vfmsubaddps {{.*#+}} xmm0 = (xmm0 * xmm1) -/+ xmm2
; FMA4-NEXT:    retq
bb:
  %A = fmul <4 x float> %C, %D
  %A0 = extractelement <4 x float> %A, i32 0
  %B0 = extractelement <4 x float> %B, i32 0
  %sub0 = fadd float %A0, %B0
  %A2 = extractelement <4 x float> %A, i32 2
  %B2 = extractelement <4 x float> %B, i32 2
  %sub2 = fadd float %A2, %B2
  %A1 = extractelement <4 x float> %A, i32 1
  %B1 = extractelement <4 x float> %B, i32 1
  %add1 = fsub float %A1, %B1
  %A3 = extractelement <4 x float> %A, i32 3
  %B3 = extractelement <4 x float> %B, i32 3
  %add3 = fsub float %A3, %B3
  %vecinsert1 = insertelement <4 x float> undef, float %sub0, i32 0
  %vecinsert2 = insertelement <4 x float> %vecinsert1, float %add1, i32 1
  %vecinsert3 = insertelement <4 x float> %vecinsert2, float %sub2, i32 2
  %vecinsert4 = insertelement <4 x float> %vecinsert3, float %add3, i32 3
  ret <4 x float> %vecinsert4
}

define <2 x double> @buildvector_mul_subadd_pd128(<2 x double> %C, <2 x double> %D, <2 x double> %B) #0 {
; NOFMA-LABEL: buildvector_mul_subadd_pd128:
; NOFMA:       # %bb.0: # %bb
; NOFMA-NEXT:    vmulpd %xmm1, %xmm0, %xmm0
; NOFMA-NEXT:    vaddsd %xmm2, %xmm0, %xmm1
; NOFMA-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1,0]
; NOFMA-NEXT:    vshufpd {{.*#+}} xmm2 = xmm2[1,0]
; NOFMA-NEXT:    vsubsd %xmm2, %xmm0, %xmm0
; NOFMA-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; NOFMA-NEXT:    retq
;
; FMA3-LABEL: buildvector_mul_subadd_pd128:
; FMA3:       # %bb.0: # %bb
; FMA3-NEXT:    vfmsubadd213pd {{.*#+}} xmm0 = (xmm1 * xmm0) -/+ xmm2
; FMA3-NEXT:    retq
;
; FMA4-LABEL: buildvector_mul_subadd_pd128:
; FMA4:       # %bb.0: # %bb
; FMA4-NEXT:    vfmsubaddpd {{.*#+}} xmm0 = (xmm0 * xmm1) -/+ xmm2
; FMA4-NEXT:    retq
bb:
  %A = fmul <2 x double> %C, %D
  %A0 = extractelement <2 x double> %A, i32 0
  %B0 = extractelement <2 x double> %B, i32 0
  %sub0 = fadd double %A0, %B0
  %A1 = extractelement <2 x double> %A, i32 1
  %B1 = extractelement <2 x double> %B, i32 1
  %add1 = fsub double %A1, %B1
  %vecinsert1 = insertelement <2 x double> undef, double %sub0, i32 0
  %vecinsert2 = insertelement <2 x double> %vecinsert1, double %add1, i32 1
  ret <2 x double> %vecinsert2
}

define <8 x float> @buildvector_mul_subadd_ps256(<8 x float> %C, <8 x float> %D, <8 x float> %B) #0 {
; NOFMA-LABEL: buildvector_mul_subadd_ps256:
; NOFMA:       # %bb.0: # %bb
; NOFMA-NEXT:    vmulps %ymm1, %ymm0, %ymm0
; NOFMA-NEXT:    vaddss %xmm2, %xmm0, %xmm1
; NOFMA-NEXT:    vshufpd {{.*#+}} xmm3 = xmm0[1,0]
; NOFMA-NEXT:    vshufpd {{.*#+}} xmm4 = xmm2[1,0]
; NOFMA-NEXT:    vaddss %xmm4, %xmm3, %xmm3
; NOFMA-NEXT:    vextractf128 $1, %ymm0, %xmm4
; NOFMA-NEXT:    vextractf128 $1, %ymm2, %xmm5
; NOFMA-NEXT:    vaddss %xmm5, %xmm4, %xmm6
; NOFMA-NEXT:    vshufpd {{.*#+}} xmm7 = xmm4[1,0]
; NOFMA-NEXT:    vshufpd {{.*#+}} xmm8 = xmm5[1,0]
; NOFMA-NEXT:    vaddss %xmm7, %xmm8, %xmm7
; NOFMA-NEXT:    vmovshdup {{.*#+}} xmm8 = xmm0[1,1,3,3]
; NOFMA-NEXT:    vmovshdup {{.*#+}} xmm9 = xmm2[1,1,3,3]
; NOFMA-NEXT:    vsubss %xmm9, %xmm8, %xmm8
; NOFMA-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[2,3]
; NOFMA-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0],xmm1[3]
; NOFMA-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; NOFMA-NEXT:    vshufps {{.*#+}} xmm2 = xmm2[3,3,3,3]
; NOFMA-NEXT:    vsubss %xmm2, %xmm0, %xmm0
; NOFMA-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; NOFMA-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm4[1,1,3,3]
; NOFMA-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm5[1,1,3,3]
; NOFMA-NEXT:    vsubss %xmm2, %xmm1, %xmm1
; NOFMA-NEXT:    vinsertps {{.*#+}} xmm1 = xmm6[0],xmm1[0],xmm6[2,3]
; NOFMA-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm7[0],xmm1[3]
; NOFMA-NEXT:    vshufps {{.*#+}} xmm2 = xmm4[3,3,3,3]
; NOFMA-NEXT:    vshufps {{.*#+}} xmm3 = xmm5[3,3,3,3]
; NOFMA-NEXT:    vsubss %xmm3, %xmm2, %xmm2
; NOFMA-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0]
; NOFMA-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; NOFMA-NEXT:    retq
;
; FMA3-LABEL: buildvector_mul_subadd_ps256:
; FMA3:       # %bb.0: # %bb
; FMA3-NEXT:    vfmsubadd213ps {{.*#+}} ymm0 = (ymm1 * ymm0) -/+ ymm2
; FMA3-NEXT:    retq
;
; FMA4-LABEL: buildvector_mul_subadd_ps256:
; FMA4:       # %bb.0: # %bb
; FMA4-NEXT:    vfmsubaddps {{.*#+}} ymm0 = (ymm0 * ymm1) -/+ ymm2
; FMA4-NEXT:    retq
bb:
  %A = fmul <8 x float> %C, %D
  %A0 = extractelement <8 x float> %A, i32 0
  %B0 = extractelement <8 x float> %B, i32 0
  %sub0 = fadd float %A0, %B0
  %A2 = extractelement <8 x float> %A, i32 2
  %B2 = extractelement <8 x float> %B, i32 2
  %sub2 = fadd float %A2, %B2
  %A4 = extractelement <8 x float> %A, i32 4
  %B4 = extractelement <8 x float> %B, i32 4
  %sub4 = fadd float %A4, %B4
  %A6 = extractelement <8 x float> %A, i32 6
  %B6 = extractelement <8 x float> %B, i32 6
  %sub6 = fadd float %A6, %B6
  %A1 = extractelement <8 x float> %A, i32 1
  %B1 = extractelement <8 x float> %B, i32 1
  %add1 = fsub float %A1, %B1
  %A3 = extractelement <8 x float> %A, i32 3
  %B3 = extractelement <8 x float> %B, i32 3
  %add3 = fsub float %A3, %B3
  %A5 = extractelement <8 x float> %A, i32 5
  %B5 = extractelement <8 x float> %B, i32 5
  %add5 = fsub float %A5, %B5
  %A7 = extractelement <8 x float> %A, i32 7
  %B7 = extractelement <8 x float> %B, i32 7
  %add7 = fsub float %A7, %B7
  %vecinsert1 = insertelement <8 x float> undef, float %sub0, i32 0
  %vecinsert2 = insertelement <8 x float> %vecinsert1, float %add1, i32 1
  %vecinsert3 = insertelement <8 x float> %vecinsert2, float %sub2, i32 2
  %vecinsert4 = insertelement <8 x float> %vecinsert3, float %add3, i32 3
  %vecinsert5 = insertelement <8 x float> %vecinsert4, float %sub4, i32 4
  %vecinsert6 = insertelement <8 x float> %vecinsert5, float %add5, i32 5
  %vecinsert7 = insertelement <8 x float> %vecinsert6, float %sub6, i32 6
  %vecinsert8 = insertelement <8 x float> %vecinsert7, float %add7, i32 7
  ret <8 x float> %vecinsert8
}

define <4 x double> @buildvector_mul_subadd_pd256(<4 x double> %C, <4 x double> %D, <4 x double> %B) #0 {
; NOFMA-LABEL: buildvector_mul_subadd_pd256:
; NOFMA:       # %bb.0: # %bb
; NOFMA-NEXT:    vmulpd %ymm1, %ymm0, %ymm0
; NOFMA-NEXT:    vaddsd %xmm2, %xmm0, %xmm1
; NOFMA-NEXT:    vextractf128 $1, %ymm0, %xmm3
; NOFMA-NEXT:    vextractf128 $1, %ymm2, %xmm4
; NOFMA-NEXT:    vaddsd %xmm4, %xmm3, %xmm5
; NOFMA-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1,0]
; NOFMA-NEXT:    vshufpd {{.*#+}} xmm2 = xmm2[1,0]
; NOFMA-NEXT:    vsubsd %xmm2, %xmm0, %xmm0
; NOFMA-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; NOFMA-NEXT:    vshufpd {{.*#+}} xmm1 = xmm3[1,0]
; NOFMA-NEXT:    vshufpd {{.*#+}} xmm2 = xmm4[1,0]
; NOFMA-NEXT:    vsubsd %xmm2, %xmm1, %xmm1
; NOFMA-NEXT:    vunpcklpd {{.*#+}} xmm1 = xmm5[0],xmm1[0]
; NOFMA-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; NOFMA-NEXT:    retq
;
; FMA3-LABEL: buildvector_mul_subadd_pd256:
; FMA3:       # %bb.0: # %bb
; FMA3-NEXT:    vfmsubadd213pd {{.*#+}} ymm0 = (ymm1 * ymm0) -/+ ymm2
; FMA3-NEXT:    retq
;
; FMA4-LABEL: buildvector_mul_subadd_pd256:
; FMA4:       # %bb.0: # %bb
; FMA4-NEXT:    vfmsubaddpd {{.*#+}} ymm0 = (ymm0 * ymm1) -/+ ymm2
; FMA4-NEXT:    retq
bb:
  %A = fmul <4 x double> %C, %D
  %A0 = extractelement <4 x double> %A, i32 0
  %B0 = extractelement <4 x double> %B, i32 0
  %sub0 = fadd double %A0, %B0
  %A2 = extractelement <4 x double> %A, i32 2
  %B2 = extractelement <4 x double> %B, i32 2
  %sub2 = fadd double %A2, %B2
  %A1 = extractelement <4 x double> %A, i32 1
  %B1 = extractelement <4 x double> %B, i32 1
  %add1 = fsub double %A1, %B1
  %A3 = extractelement <4 x double> %A, i32 3
  %B3 = extractelement <4 x double> %B, i32 3
  %add3 = fsub double %A3, %B3
  %vecinsert1 = insertelement <4 x double> undef, double %sub0, i32 0
  %vecinsert2 = insertelement <4 x double> %vecinsert1, double %add1, i32 1
  %vecinsert3 = insertelement <4 x double> %vecinsert2, double %sub2, i32 2
  %vecinsert4 = insertelement <4 x double> %vecinsert3, double %add3, i32 3
  ret <4 x double> %vecinsert4
}

define <16 x float> @buildvector_mul_subadd_ps512(<16 x float> %C, <16 x float> %D, <16 x float> %B) #0 {
; NOFMA-LABEL: buildvector_mul_subadd_ps512:
; NOFMA:       # %bb.0: # %bb
; NOFMA-NEXT:    vmulps %ymm3, %ymm1, %ymm1
; NOFMA-NEXT:    vmulps %ymm2, %ymm0, %ymm0
; NOFMA-NEXT:    vaddss %xmm4, %xmm0, %xmm2
; NOFMA-NEXT:    vshufpd {{.*#+}} xmm3 = xmm0[1,0]
; NOFMA-NEXT:    vshufpd {{.*#+}} xmm6 = xmm4[1,0]
; NOFMA-NEXT:    vaddss %xmm6, %xmm3, %xmm3
; NOFMA-NEXT:    vextractf128 $1, %ymm0, %xmm6
; NOFMA-NEXT:    vextractf128 $1, %ymm4, %xmm7
; NOFMA-NEXT:    vaddss %xmm7, %xmm6, %xmm8
; NOFMA-NEXT:    vshufpd {{.*#+}} xmm9 = xmm6[1,0]
; NOFMA-NEXT:    vshufpd {{.*#+}} xmm10 = xmm7[1,0]
; NOFMA-NEXT:    vaddss %xmm10, %xmm9, %xmm9
; NOFMA-NEXT:    vinsertps {{.*#+}} xmm8 = xmm8[0,1],xmm9[0],xmm8[3]
; NOFMA-NEXT:    vaddss %xmm5, %xmm1, %xmm9
; NOFMA-NEXT:    vshufpd {{.*#+}} xmm10 = xmm1[1,0]
; NOFMA-NEXT:    vshufpd {{.*#+}} xmm11 = xmm5[1,0]
; NOFMA-NEXT:    vaddss %xmm11, %xmm10, %xmm10
; NOFMA-NEXT:    vextractf128 $1, %ymm1, %xmm11
; NOFMA-NEXT:    vshufpd {{.*#+}} xmm12 = xmm11[1,0]
; NOFMA-NEXT:    vextractf128 $1, %ymm5, %xmm13
; NOFMA-NEXT:    vshufpd {{.*#+}} xmm14 = xmm13[1,0]
; NOFMA-NEXT:    vaddss %xmm14, %xmm12, %xmm12
; NOFMA-NEXT:    vmovshdup {{.*#+}} xmm14 = xmm0[1,1,3,3]
; NOFMA-NEXT:    vmovshdup {{.*#+}} xmm15 = xmm4[1,1,3,3]
; NOFMA-NEXT:    vsubss %xmm15, %xmm14, %xmm14
; NOFMA-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0],xmm14[0],xmm2[2,3]
; NOFMA-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3]
; NOFMA-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; NOFMA-NEXT:    vshufps {{.*#+}} xmm3 = xmm4[3,3,3,3]
; NOFMA-NEXT:    vsubss %xmm3, %xmm0, %xmm0
; NOFMA-NEXT:    vinsertps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[0]
; NOFMA-NEXT:    vshufps {{.*#+}} xmm2 = xmm6[3,3,3,3]
; NOFMA-NEXT:    vshufps {{.*#+}} xmm3 = xmm7[3,3,3,3]
; NOFMA-NEXT:    vsubss %xmm3, %xmm2, %xmm2
; NOFMA-NEXT:    vinsertps {{.*#+}} xmm2 = xmm8[0,1,2],xmm2[0]
; NOFMA-NEXT:    vmovshdup {{.*#+}} xmm3 = xmm1[1,1,3,3]
; NOFMA-NEXT:    vmovshdup {{.*#+}} xmm4 = xmm5[1,1,3,3]
; NOFMA-NEXT:    vsubss %xmm4, %xmm3, %xmm3
; NOFMA-NEXT:    vinsertps {{.*#+}} xmm3 = xmm9[0],xmm3[0],xmm9[2,3]
; NOFMA-NEXT:    vinsertps {{.*#+}} xmm3 = xmm3[0,1],xmm10[0],xmm3[3]
; NOFMA-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; NOFMA-NEXT:    vshufps {{.*#+}} xmm4 = xmm5[3,3,3,3]
; NOFMA-NEXT:    vsubss %xmm4, %xmm1, %xmm1
; NOFMA-NEXT:    vinsertps {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[0]
; NOFMA-NEXT:    vmovshdup {{.*#+}} xmm3 = xmm11[1,1,3,3]
; NOFMA-NEXT:    vmovshdup {{.*#+}} xmm4 = xmm13[1,1,3,3]
; NOFMA-NEXT:    vsubss %xmm4, %xmm3, %xmm3
; NOFMA-NEXT:    vshufps {{.*#+}} xmm3 = xmm3[0,0],xmm12[0,0]
; NOFMA-NEXT:    vshufps {{.*#+}} xmm4 = xmm11[3,3,3,3]
; NOFMA-NEXT:    vshufps {{.*#+}} xmm5 = xmm13[3,3,3,3]
; NOFMA-NEXT:    vsubss %xmm5, %xmm4, %xmm4
; NOFMA-NEXT:    vinsertps {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[0]
; NOFMA-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; NOFMA-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm1
; NOFMA-NEXT:    retq
;
; FMA3_256-LABEL: buildvector_mul_subadd_ps512:
; FMA3_256:       # %bb.0: # %bb
; FMA3_256-NEXT:    vfmsubadd213ps {{.*#+}} ymm0 = (ymm2 * ymm0) -/+ ymm4
; FMA3_256-NEXT:    vfmsubadd213ps {{.*#+}} ymm1 = (ymm3 * ymm1) -/+ ymm5
; FMA3_256-NEXT:    retq
;
; FMA3_512-LABEL: buildvector_mul_subadd_ps512:
; FMA3_512:       # %bb.0: # %bb
; FMA3_512-NEXT:    vfmsubadd213ps {{.*#+}} zmm0 = (zmm1 * zmm0) -/+ zmm2
; FMA3_512-NEXT:    retq
;
; FMA4-LABEL: buildvector_mul_subadd_ps512:
; FMA4:       # %bb.0: # %bb
; FMA4-NEXT:    vfmsubaddps {{.*#+}} ymm0 = (ymm0 * ymm2) -/+ ymm4
; FMA4-NEXT:    vfmsubaddps {{.*#+}} ymm1 = (ymm1 * ymm3) -/+ ymm5
; FMA4-NEXT:    retq
bb:
  %A = fmul <16 x float> %C, %D
  %A0 = extractelement <16 x float> %A, i32 0
  %B0 = extractelement <16 x float> %B, i32 0
  %sub0 = fadd float %A0, %B0
  %A2 = extractelement <16 x float> %A, i32 2
  %B2 = extractelement <16 x float> %B, i32 2
  %sub2 = fadd float %A2, %B2
  %A4 = extractelement <16 x float> %A, i32 4
  %B4 = extractelement <16 x float> %B, i32 4
  %sub4 = fadd float %A4, %B4
  %A6 = extractelement <16 x float> %A, i32 6
  %B6 = extractelement <16 x float> %B, i32 6
  %sub6 = fadd float %A6, %B6
  %A8 = extractelement <16 x float> %A, i32 8
  %B8 = extractelement <16 x float> %B, i32 8
  %sub8 = fadd float %A8, %B8
  %A10 = extractelement <16 x float> %A, i32 10
  %B10 = extractelement <16 x float> %B, i32 10
  %sub10 = fadd float %A10, %B10
  %A12 = extractelement <16 x float> %A, i32 12
  %B12 = extractelement <16 x float> %B, i32 12
  %sub12 = fadd float %A12, %B12
  %A14 = extractelement <16 x float> %A, i32 14
  %B14 = extractelement <16 x float> %B, i32 14
  %sub14 = fadd float %A14, %B14
  %A1 = extractelement <16 x float> %A, i32 1
  %B1 = extractelement <16 x float> %B, i32 1
  %add1 = fsub float %A1, %B1
  %A3 = extractelement <16 x float> %A, i32 3
  %B3 = extractelement <16 x float> %B, i32 3
  %add3 = fsub float %A3, %B3
  %A5 = extractelement <16 x float> %A, i32 5
  %B5 = extractelement <16 x float> %B, i32 5
  %add5 = fsub float %A5, %B5
  %A7 = extractelement <16 x float> %A, i32 7
  %B7 = extractelement <16 x float> %B, i32 7
  %add7 = fsub float %A7, %B7
  %A9 = extractelement <16 x float> %A, i32 9
  %B9 = extractelement <16 x float> %B, i32 9
  %add9 = fsub float %A9, %B9
  %A11 = extractelement <16 x float> %A, i32 11
  %B11 = extractelement <16 x float> %B, i32 11
  %add11 = fsub float %A11, %B11
  %A13 = extractelement <16 x float> %A, i32 13
  %B13 = extractelement <16 x float> %B, i32 13
  %add13 = fsub float %A13, %B13
  %A15 = extractelement <16 x float> %A, i32 15
  %B15 = extractelement <16 x float> %B, i32 15
  %add15 = fsub float %A15, %B15
  %vecinsert1 = insertelement <16 x float> undef, float %sub0, i32 0
  %vecinsert2 = insertelement <16 x float> %vecinsert1, float %add1, i32 1
  %vecinsert3 = insertelement <16 x float> %vecinsert2, float %sub2, i32 2
  %vecinsert4 = insertelement <16 x float> %vecinsert3, float %add3, i32 3
  %vecinsert5 = insertelement <16 x float> %vecinsert4, float %sub4, i32 4
  ; element 5 is undef
  %vecinsert7 = insertelement <16 x float> %vecinsert5, float %sub6, i32 6
  %vecinsert8 = insertelement <16 x float> %vecinsert7, float %add7, i32 7
  %vecinsert9 = insertelement <16 x float> %vecinsert8, float %sub8, i32 8
  %vecinsert10 = insertelement <16 x float> %vecinsert9, float %add9, i32 9
  %vecinsert11 = insertelement <16 x float> %vecinsert10, float %sub10, i32 10
  %vecinsert12 = insertelement <16 x float> %vecinsert11, float %add11, i32 11
  ; element 12 is undef
  %vecinsert14 = insertelement <16 x float> %vecinsert12, float %add13, i32 13
  %vecinsert15 = insertelement <16 x float> %vecinsert14, float %sub14, i32 14
  %vecinsert16 = insertelement <16 x float> %vecinsert15, float %add15, i32 15
  ret <16 x float> %vecinsert16
}

define <8 x double> @buildvector_mul_subadd_pd512(<8 x double> %C, <8 x double> %D, <8 x double> %B) #0 {
; NOFMA-LABEL: buildvector_mul_subadd_pd512:
; NOFMA:       # %bb.0: # %bb
; NOFMA-NEXT:    vmulpd %ymm3, %ymm1, %ymm1
; NOFMA-NEXT:    vmulpd %ymm2, %ymm0, %ymm0
; NOFMA-NEXT:    vaddsd %xmm4, %xmm0, %xmm2
; NOFMA-NEXT:    vextractf128 $1, %ymm0, %xmm3
; NOFMA-NEXT:    vextractf128 $1, %ymm4, %xmm6
; NOFMA-NEXT:    vaddsd %xmm6, %xmm3, %xmm7
; NOFMA-NEXT:    vaddsd %xmm5, %xmm1, %xmm8
; NOFMA-NEXT:    vextractf128 $1, %ymm1, %xmm1
; NOFMA-NEXT:    vextractf128 $1, %ymm5, %xmm5
; NOFMA-NEXT:    vaddsd %xmm5, %xmm1, %xmm9
; NOFMA-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1,0]
; NOFMA-NEXT:    vshufpd {{.*#+}} xmm4 = xmm4[1,0]
; NOFMA-NEXT:    vsubsd %xmm4, %xmm0, %xmm0
; NOFMA-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm2[0],xmm0[0]
; NOFMA-NEXT:    vshufpd {{.*#+}} xmm2 = xmm3[1,0]
; NOFMA-NEXT:    vshufpd {{.*#+}} xmm3 = xmm6[1,0]
; NOFMA-NEXT:    vsubsd %xmm3, %xmm2, %xmm2
; NOFMA-NEXT:    vunpcklpd {{.*#+}} xmm2 = xmm7[0],xmm2[0]
; NOFMA-NEXT:    vshufpd {{.*#+}} xmm1 = xmm1[1,0]
; NOFMA-NEXT:    vshufpd {{.*#+}} xmm3 = xmm5[1,0]
; NOFMA-NEXT:    vsubsd %xmm3, %xmm1, %xmm1
; NOFMA-NEXT:    vunpcklpd {{.*#+}} xmm1 = xmm9[0],xmm1[0]
; NOFMA-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; NOFMA-NEXT:    vinsertf128 $1, %xmm1, %ymm8, %ymm1
; NOFMA-NEXT:    retq
;
; FMA3_256-LABEL: buildvector_mul_subadd_pd512:
; FMA3_256:       # %bb.0: # %bb
; FMA3_256-NEXT:    vfmsubadd213pd {{.*#+}} ymm0 = (ymm2 * ymm0) -/+ ymm4
; FMA3_256-NEXT:    vfmsubadd213pd {{.*#+}} ymm1 = (ymm3 * ymm1) -/+ ymm5
; FMA3_256-NEXT:    retq
;
; FMA3_512-LABEL: buildvector_mul_subadd_pd512:
; FMA3_512:       # %bb.0: # %bb
; FMA3_512-NEXT:    vfmsubadd213pd {{.*#+}} zmm0 = (zmm1 * zmm0) -/+ zmm2
; FMA3_512-NEXT:    retq
;
; FMA4-LABEL: buildvector_mul_subadd_pd512:
; FMA4:       # %bb.0: # %bb
; FMA4-NEXT:    vfmsubaddpd {{.*#+}} ymm0 = (ymm0 * ymm2) -/+ ymm4
; FMA4-NEXT:    vfmsubaddpd {{.*#+}} ymm1 = (ymm1 * ymm3) -/+ ymm5
; FMA4-NEXT:    retq
bb:
  %A = fmul <8 x double> %C, %D
  %A0 = extractelement <8 x double> %A, i32 0
  %B0 = extractelement <8 x double> %B, i32 0
  %sub0 = fadd double %A0, %B0
  %A2 = extractelement <8 x double> %A, i32 2
  %B2 = extractelement <8 x double> %B, i32 2
  %sub2 = fadd double %A2, %B2
  %A4 = extractelement <8 x double> %A, i32 4
  %B4 = extractelement <8 x double> %B, i32 4
  %sub4 = fadd double %A4, %B4
  %A6 = extractelement <8 x double> %A, i32 6
  %B6 = extractelement <8 x double> %B, i32 6
  %sub6 = fadd double %A6, %B6
  %A1 = extractelement <8 x double> %A, i32 1
  %B1 = extractelement <8 x double> %B, i32 1
  %add1 = fsub double %A1, %B1
  %A3 = extractelement <8 x double> %A, i32 3
  %B3 = extractelement <8 x double> %B, i32 3
  %add3 = fsub double %A3, %B3
  %A7 = extractelement <8 x double> %A, i32 7
  %B7 = extractelement <8 x double> %B, i32 7
  %add7 = fsub double %A7, %B7
  %vecinsert1 = insertelement <8 x double> undef, double %sub0, i32 0
  %vecinsert2 = insertelement <8 x double> %vecinsert1, double %add1, i32 1
  %vecinsert3 = insertelement <8 x double> %vecinsert2, double %sub2, i32 2
  %vecinsert4 = insertelement <8 x double> %vecinsert3, double %add3, i32 3
  %vecinsert5 = insertelement <8 x double> %vecinsert4, double %sub4, i32 4
  ; element 5 is undef
  %vecinsert7 = insertelement <8 x double> %vecinsert5, double %sub6, i32 6
  %vecinsert8 = insertelement <8 x double> %vecinsert7, double %add7, i32 7
  ret <8 x double> %vecinsert8
}

attributes #0 = { nounwind "unsafe-fp-math"="true" }