; xref: /llvm-project/llvm/test/CodeGen/X86/fmsubadd-combine.ll (revision b1e6ca9d227dfdc0f01e83b62e2af1b05d3fc9a7)
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx | FileCheck %s -check-prefixes=CHECK,NOFMA
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+fma | FileCheck %s -check-prefixes=CHECK,FMA3,FMA3_256
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+fma,+avx512f | FileCheck %s -check-prefixes=CHECK,FMA3,FMA3_512
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+fma4 | FileCheck %s -check-prefixes=CHECK,FMA4

; This test checks the fusing of MUL + SUB/ADD to FMSUBADD.

define <2 x double> @mul_subadd_pd128(<2 x double> %A, <2 x double> %B, <2 x double> %C) #0 {
; NOFMA-LABEL: mul_subadd_pd128:
; NOFMA:       # %bb.0: # %entry
; NOFMA-NEXT:    vmulpd %xmm1, %xmm0, %xmm0
; NOFMA-NEXT:    vsubpd %xmm2, %xmm0, %xmm1
; NOFMA-NEXT:    vaddpd %xmm2, %xmm0, %xmm0
; NOFMA-NEXT:    vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
; NOFMA-NEXT:    retq
;
; FMA3-LABEL: mul_subadd_pd128:
; FMA3:       # %bb.0: # %entry
; FMA3-NEXT:    vfmsubadd213pd {{.*#+}} xmm0 = (xmm1 * xmm0) -/+ xmm2
; FMA3-NEXT:    retq
;
; FMA4-LABEL: mul_subadd_pd128:
; FMA4:       # %bb.0: # %entry
; FMA4-NEXT:    vfmsubaddpd {{.*#+}} xmm0 = (xmm0 * xmm1) -/+ xmm2
; FMA4-NEXT:    retq
entry:
  %AB = fmul <2 x double> %A, %B
  %Sub = fsub <2 x double> %AB, %C
  %Add = fadd <2 x double> %AB, %C
  %subadd = shufflevector <2 x double> %Add, <2 x double> %Sub, <2 x i32> <i32 0, i32 3>
  ret <2 x double> %subadd
}

define <4 x float> @mul_subadd_ps128(<4 x float> %A, <4 x float> %B, <4 x float> %C) #0 {
; NOFMA-LABEL: mul_subadd_ps128:
; NOFMA:       # %bb.0: # %entry
; NOFMA-NEXT:    vmulps %xmm1, %xmm0, %xmm0
; NOFMA-NEXT:    vsubps %xmm2, %xmm0, %xmm1
; NOFMA-NEXT:    vaddps %xmm2, %xmm0, %xmm0
; NOFMA-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
; NOFMA-NEXT:    retq
;
; FMA3-LABEL: mul_subadd_ps128:
; FMA3:       # %bb.0: # %entry
; FMA3-NEXT:    vfmsubadd213ps {{.*#+}} xmm0 = (xmm1 * xmm0) -/+ xmm2
; FMA3-NEXT:    retq
;
; FMA4-LABEL: mul_subadd_ps128:
; FMA4:       # %bb.0: # %entry
; FMA4-NEXT:    vfmsubaddps {{.*#+}} xmm0 = (xmm0 * xmm1) -/+ xmm2
; FMA4-NEXT:    retq
entry:
  %AB = fmul <4 x float> %A, %B
  %Sub = fsub <4 x float> %AB, %C
  %Add = fadd <4 x float> %AB, %C
  %subadd = shufflevector <4 x float> %Add, <4 x float> %Sub, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
  ret <4 x float> %subadd
}

define <4 x double> @mul_subadd_pd256(<4 x double> %A, <4 x double> %B, <4 x double> %C) #0 {
; NOFMA-LABEL: mul_subadd_pd256:
; NOFMA:       # %bb.0: # %entry
; NOFMA-NEXT:    vmulpd %ymm1, %ymm0, %ymm0
; NOFMA-NEXT:    vsubpd %ymm2, %ymm0, %ymm1
; NOFMA-NEXT:    vaddpd %ymm2, %ymm0, %ymm0
; NOFMA-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3]
; NOFMA-NEXT:    retq
;
; FMA3-LABEL: mul_subadd_pd256:
; FMA3:       # %bb.0: # %entry
; FMA3-NEXT:    vfmsubadd213pd {{.*#+}} ymm0 = (ymm1 * ymm0) -/+ ymm2
; FMA3-NEXT:    retq
;
; FMA4-LABEL: mul_subadd_pd256:
; FMA4:       # %bb.0: # %entry
; FMA4-NEXT:    vfmsubaddpd {{.*#+}} ymm0 = (ymm0 * ymm1) -/+ ymm2
; FMA4-NEXT:    retq
entry:
  %AB = fmul <4 x double> %A, %B
  %Sub = fsub <4 x double> %AB, %C
  %Add = fadd <4 x double> %AB, %C
  %subadd = shufflevector <4 x double> %Add, <4 x double> %Sub, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
  ret <4 x double> %subadd
}

define <8 x float> @mul_subadd_ps256(<8 x float> %A, <8 x float> %B, <8 x float> %C) #0 {
; NOFMA-LABEL: mul_subadd_ps256:
; NOFMA:       # %bb.0: # %entry
; NOFMA-NEXT:    vmulps %ymm1, %ymm0, %ymm0
; NOFMA-NEXT:    vsubps %ymm2, %ymm0, %ymm1
; NOFMA-NEXT:    vaddps %ymm2, %ymm0, %ymm0
; NOFMA-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
; NOFMA-NEXT:    retq
;
; FMA3-LABEL: mul_subadd_ps256:
; FMA3:       # %bb.0: # %entry
; FMA3-NEXT:    vfmsubadd213ps {{.*#+}} ymm0 = (ymm1 * ymm0) -/+ ymm2
; FMA3-NEXT:    retq
;
; FMA4-LABEL: mul_subadd_ps256:
; FMA4:       # %bb.0: # %entry
; FMA4-NEXT:    vfmsubaddps {{.*#+}} ymm0 = (ymm0 * ymm1) -/+ ymm2
; FMA4-NEXT:    retq
entry:
  %AB = fmul <8 x float> %A, %B
  %Sub = fsub <8 x float> %AB, %C
  %Add = fadd <8 x float> %AB, %C
  %subadd = shufflevector <8 x float> %Add, <8 x float> %Sub, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
  ret <8 x float> %subadd
}

define <8 x double> @mul_subadd_pd512(<8 x double> %A, <8 x double> %B, <8 x double> %C) #0 {
; NOFMA-LABEL: mul_subadd_pd512:
; NOFMA:       # %bb.0: # %entry
; NOFMA-NEXT:    vmulpd %ymm2, %ymm0, %ymm0
; NOFMA-NEXT:    vmulpd %ymm3, %ymm1, %ymm1
; NOFMA-NEXT:    vsubpd %ymm5, %ymm1, %ymm2
; NOFMA-NEXT:    vsubpd %ymm4, %ymm0, %ymm3
; NOFMA-NEXT:    vaddpd %ymm5, %ymm1, %ymm1
; NOFMA-NEXT:    vblendpd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3]
; NOFMA-NEXT:    vaddpd %ymm4, %ymm0, %ymm0
; NOFMA-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2],ymm3[3]
; NOFMA-NEXT:    retq
;
; FMA3_256-LABEL: mul_subadd_pd512:
; FMA3_256:       # %bb.0: # %entry
; FMA3_256-NEXT:    vfmsubadd213pd {{.*#+}} ymm0 = (ymm2 * ymm0) -/+ ymm4
; FMA3_256-NEXT:    vfmsubadd213pd {{.*#+}} ymm1 = (ymm3 * ymm1) -/+ ymm5
; FMA3_256-NEXT:    retq
;
; FMA3_512-LABEL: mul_subadd_pd512:
; FMA3_512:       # %bb.0: # %entry
; FMA3_512-NEXT:    vfmsubadd213pd {{.*#+}} zmm0 = (zmm1 * zmm0) -/+ zmm2
; FMA3_512-NEXT:    retq
;
; FMA4-LABEL: mul_subadd_pd512:
; FMA4:       # %bb.0: # %entry
; FMA4-NEXT:    vfmsubaddpd {{.*#+}} ymm0 = (ymm0 * ymm2) -/+ ymm4
; FMA4-NEXT:    vfmsubaddpd {{.*#+}} ymm1 = (ymm1 * ymm3) -/+ ymm5
; FMA4-NEXT:    retq
entry:
  %AB = fmul <8 x double> %A, %B
  %Sub = fsub <8 x double> %AB, %C
  %Add = fadd <8 x double> %AB, %C
  %subadd = shufflevector <8 x double> %Add, <8 x double> %Sub, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
  ret <8 x double> %subadd
}

define <16 x float> @mul_subadd_ps512(<16 x float> %A, <16 x float> %B, <16 x float> %C) #0 {
; NOFMA-LABEL: mul_subadd_ps512:
; NOFMA:       # %bb.0: # %entry
; NOFMA-NEXT:    vmulps %ymm2, %ymm0, %ymm0
; NOFMA-NEXT:    vmulps %ymm3, %ymm1, %ymm1
; NOFMA-NEXT:    vsubps %ymm5, %ymm1, %ymm2
; NOFMA-NEXT:    vsubps %ymm4, %ymm0, %ymm3
; NOFMA-NEXT:    vaddps %ymm5, %ymm1, %ymm1
; NOFMA-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7]
; NOFMA-NEXT:    vaddps %ymm4, %ymm0, %ymm0
; NOFMA-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2],ymm3[3],ymm0[4],ymm3[5],ymm0[6],ymm3[7]
; NOFMA-NEXT:    retq
;
; FMA3_256-LABEL: mul_subadd_ps512:
; FMA3_256:       # %bb.0: # %entry
; FMA3_256-NEXT:    vfmsubadd213ps {{.*#+}} ymm0 = (ymm2 * ymm0) -/+ ymm4
; FMA3_256-NEXT:    vfmsubadd213ps {{.*#+}} ymm1 = (ymm3 * ymm1) -/+ ymm5
; FMA3_256-NEXT:    retq
;
; FMA3_512-LABEL: mul_subadd_ps512:
; FMA3_512:       # %bb.0: # %entry
; FMA3_512-NEXT:    vfmsubadd213ps {{.*#+}} zmm0 = (zmm1 * zmm0) -/+ zmm2
; FMA3_512-NEXT:    retq
;
; FMA4-LABEL: mul_subadd_ps512:
; FMA4:       # %bb.0: # %entry
; FMA4-NEXT:    vfmsubaddps {{.*#+}} ymm0 = (ymm0 * ymm2) -/+ ymm4
; FMA4-NEXT:    vfmsubaddps {{.*#+}} ymm1 = (ymm1 * ymm3) -/+ ymm5
; FMA4-NEXT:    retq
entry:
  %AB = fmul <16 x float> %A, %B
  %Sub = fsub <16 x float> %AB, %C
  %Add = fadd <16 x float> %AB, %C
  %subadd = shufflevector <16 x float> %Add, <16 x float> %Sub, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31>
  ret <16 x float> %subadd
}

; This should not be matched to fmsubadd because the mul is on the wrong side of the fsub.
define <2 x double> @mul_subadd_bad_commute(<2 x double> %A, <2 x double> %B, <2 x double> %C) #0 {
; CHECK-LABEL: mul_subadd_bad_commute:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vmulpd %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    vsubpd %xmm0, %xmm2, %xmm1
; CHECK-NEXT:    vaddpd %xmm2, %xmm0, %xmm0
; CHECK-NEXT:    vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
; CHECK-NEXT:    retq
entry:
  %AB = fmul <2 x double> %A, %B
  %Sub = fsub <2 x double> %C, %AB
  %Add = fadd <2 x double> %AB, %C
  %subadd = shufflevector <2 x double> %Add, <2 x double> %Sub, <2 x i32> <i32 0, i32 3>
  ret <2 x double> %subadd
}

attributes #0 = { nounwind "unsafe-fp-math"="true" }