xref: /llvm-project/llvm/test/CodeGen/X86/avx512fp16-combine-vfmac-fadd.ll (revision 57e8f840b6d33475ca5f023001996ab4bc9035b4)
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown --fp-contract=fast --enable-no-signed-zeros-fp-math -mattr=avx512fp16 | FileCheck %s --check-prefixes=CHECK,NO-SZ
; RUN: llc < %s -mtriple=x86_64-unknown-unknown --fp-contract=fast -mattr=avx512fp16 | FileCheck %s --check-prefixes=CHECK,HAS-SZ

; FADD(acc, FMA(a, b, +0.0)) can be combined to FMA(a, b, acc) only if the nsz flag is set.
; 512-bit complex-conjugate FMA (vfcmaddcph) with a +0.0 addend: folded into
; acc only under no-signed-zeros (NO-SZ); otherwise a separate vaddph remains.
define dso_local <32 x half> @test1(<32 x half> %acc, <32 x half> %a, <32 x half> %b) {
; NO-SZ-LABEL: test1:
; NO-SZ:       # %bb.0: # %entry
; NO-SZ-NEXT:    vfcmaddcph %zmm2, %zmm1, %zmm0
; NO-SZ-NEXT:    retq
;
; HAS-SZ-LABEL: test1:
; HAS-SZ:       # %bb.0: # %entry
; HAS-SZ-NEXT:    vxorps %xmm3, %xmm3, %xmm3
; HAS-SZ-NEXT:    vfcmaddcph %zmm2, %zmm1, %zmm3
; HAS-SZ-NEXT:    vaddph %zmm0, %zmm3, %zmm0
; HAS-SZ-NEXT:    retq
entry:
  %0 = bitcast <32 x half> %a to <16 x float>
  %1 = bitcast <32 x half> %b to <16 x float>
  %2 = tail call <16 x float> @llvm.x86.avx512fp16.mask.vfcmadd.cph.512(<16 x float> %0, <16 x float> %1, <16 x float> zeroinitializer, i16 -1, i32 4)
  %3 = bitcast <16 x float> %2 to <32 x half>
  %add.i = fadd <32 x half> %3, %acc
  ret <32 x half> %add.i
}

; 512-bit complex FMA (vfmaddcph) with a +0.0 addend: folded into acc only
; under no-signed-zeros (NO-SZ); otherwise a separate vaddph remains.
define dso_local <32 x half> @test2(<32 x half> %acc, <32 x half> %a, <32 x half> %b) {
; NO-SZ-LABEL: test2:
; NO-SZ:       # %bb.0: # %entry
; NO-SZ-NEXT:    vfmaddcph %zmm2, %zmm1, %zmm0
; NO-SZ-NEXT:    retq
;
; HAS-SZ-LABEL: test2:
; HAS-SZ:       # %bb.0: # %entry
; HAS-SZ-NEXT:    vxorps %xmm3, %xmm3, %xmm3
; HAS-SZ-NEXT:    vfmaddcph %zmm2, %zmm1, %zmm3
; HAS-SZ-NEXT:    vaddph %zmm0, %zmm3, %zmm0
; HAS-SZ-NEXT:    retq
entry:
  %0 = bitcast <32 x half> %a to <16 x float>
  %1 = bitcast <32 x half> %b to <16 x float>
  %2 = tail call <16 x float> @llvm.x86.avx512fp16.mask.vfmadd.cph.512(<16 x float> %0, <16 x float> %1, <16 x float> zeroinitializer, i16 -1, i32 4)
  %3 = bitcast <16 x float> %2 to <32 x half>
  %add.i = fadd <32 x half> %3, %acc
  ret <32 x half> %add.i
}

; 256-bit variant of test1 (vfcmaddcph, +0.0 addend).
define dso_local <16 x half> @test3(<16 x half> %acc, <16 x half> %a, <16 x half> %b) {
; NO-SZ-LABEL: test3:
; NO-SZ:       # %bb.0: # %entry
; NO-SZ-NEXT:    vfcmaddcph %ymm2, %ymm1, %ymm0
; NO-SZ-NEXT:    retq
;
; HAS-SZ-LABEL: test3:
; HAS-SZ:       # %bb.0: # %entry
; HAS-SZ-NEXT:    vxorps %xmm3, %xmm3, %xmm3
; HAS-SZ-NEXT:    vfcmaddcph %ymm2, %ymm1, %ymm3
; HAS-SZ-NEXT:    vaddph %ymm0, %ymm3, %ymm0
; HAS-SZ-NEXT:    retq
entry:
  %0 = bitcast <16 x half> %a to <8 x float>
  %1 = bitcast <16 x half> %b to <8 x float>
  %2 = tail call <8 x float> @llvm.x86.avx512fp16.mask.vfcmadd.cph.256(<8 x float> %0, <8 x float> %1, <8 x float> zeroinitializer, i8 -1)
  %3 = bitcast <8 x float> %2 to <16 x half>
  %add.i = fadd <16 x half> %3, %acc
  ret <16 x half> %add.i
}

; 256-bit variant of test2 (vfmaddcph, +0.0 addend).
define dso_local <16 x half> @test4(<16 x half> %acc, <16 x half> %a, <16 x half> %b) {
; NO-SZ-LABEL: test4:
; NO-SZ:       # %bb.0: # %entry
; NO-SZ-NEXT:    vfmaddcph %ymm2, %ymm1, %ymm0
; NO-SZ-NEXT:    retq
;
; HAS-SZ-LABEL: test4:
; HAS-SZ:       # %bb.0: # %entry
; HAS-SZ-NEXT:    vxorps %xmm3, %xmm3, %xmm3
; HAS-SZ-NEXT:    vfmaddcph %ymm2, %ymm1, %ymm3
; HAS-SZ-NEXT:    vaddph %ymm0, %ymm3, %ymm0
; HAS-SZ-NEXT:    retq
entry:
  %0 = bitcast <16 x half> %a to <8 x float>
  %1 = bitcast <16 x half> %b to <8 x float>
  %2 = tail call <8 x float> @llvm.x86.avx512fp16.mask.vfmadd.cph.256(<8 x float> %0, <8 x float> %1, <8 x float> zeroinitializer, i8 -1)
  %3 = bitcast <8 x float> %2 to <16 x half>
  %add.i = fadd <16 x half> %3, %acc
  ret <16 x half> %add.i
}

; 128-bit variant of test1 (vfcmaddcph, +0.0 addend).
define dso_local <8 x half> @test5(<8 x half> %acc, <8 x half> %a, <8 x half> %b) {
; NO-SZ-LABEL: test5:
; NO-SZ:       # %bb.0: # %entry
; NO-SZ-NEXT:    vfcmaddcph %xmm2, %xmm1, %xmm0
; NO-SZ-NEXT:    retq
;
; HAS-SZ-LABEL: test5:
; HAS-SZ:       # %bb.0: # %entry
; HAS-SZ-NEXT:    vxorps %xmm3, %xmm3, %xmm3
; HAS-SZ-NEXT:    vfcmaddcph %xmm2, %xmm1, %xmm3
; HAS-SZ-NEXT:    vaddph %xmm0, %xmm3, %xmm0
; HAS-SZ-NEXT:    retq
entry:
  %0 = bitcast <8 x half> %a to <4 x float>
  %1 = bitcast <8 x half> %b to <4 x float>
  %2 = tail call <4 x float> @llvm.x86.avx512fp16.mask.vfcmadd.cph.128(<4 x float> %0, <4 x float> %1, <4 x float> zeroinitializer, i8 -1)
  %3 = bitcast <4 x float> %2 to <8 x half>
  %add.i = fadd <8 x half> %3, %acc
  ret <8 x half> %add.i
}

; 128-bit variant of test2 (vfmaddcph, +0.0 addend).
define dso_local <8 x half> @test6(<8 x half> %acc, <8 x half> %a, <8 x half> %b) {
; NO-SZ-LABEL: test6:
; NO-SZ:       # %bb.0: # %entry
; NO-SZ-NEXT:    vfmaddcph %xmm2, %xmm1, %xmm0
; NO-SZ-NEXT:    retq
;
; HAS-SZ-LABEL: test6:
; HAS-SZ:       # %bb.0: # %entry
; HAS-SZ-NEXT:    vxorps %xmm3, %xmm3, %xmm3
; HAS-SZ-NEXT:    vfmaddcph %xmm2, %xmm1, %xmm3
; HAS-SZ-NEXT:    vaddph %xmm0, %xmm3, %xmm0
; HAS-SZ-NEXT:    retq
entry:
  %0 = bitcast <8 x half> %a to <4 x float>
  %1 = bitcast <8 x half> %b to <4 x float>
  %2 = tail call <4 x float> @llvm.x86.avx512fp16.mask.vfmadd.cph.128(<4 x float> %0, <4 x float> %1, <4 x float> zeroinitializer, i8 -1)
  %3 = bitcast <4 x float> %2 to <8 x half>
  %add.i = fadd <8 x half> %3, %acc
  ret <8 x half> %add.i
}

; FADD(acc, FMA(a, b, -0.0)) can be combined to FMA(a, b, acc) whether or not the nsz flag is set.
; 512-bit vfcmaddcph whose addend is the bit pattern of half -0.0 pairs:
; always folded into acc, with or without nsz.
define dso_local <32 x half> @test13(<32 x half> %acc, <32 x half> %a, <32 x half> %b) {
; CHECK-LABEL: test13:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vfcmaddcph %zmm2, %zmm1, %zmm0
; CHECK-NEXT:    retq
entry:
  %0 = bitcast <32 x half> %a to <16 x float>
  %1 = bitcast <32 x half> %b to <16 x float>
  %2 = tail call <16 x float> @llvm.x86.avx512fp16.mask.vfcmadd.cph.512(<16 x float> %0, <16 x float> %1, <16 x float> <float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000>, i16 -1, i32 4)
  %3 = bitcast <16 x float> %2 to <32 x half>
  %add.i = fadd <32 x half> %3, %acc
  ret <32 x half> %add.i
}

; 512-bit vfmaddcph variant of test13: always folded into acc.
define dso_local <32 x half> @test14(<32 x half> %acc, <32 x half> %a, <32 x half> %b) {
; CHECK-LABEL: test14:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vfmaddcph %zmm2, %zmm1, %zmm0
; CHECK-NEXT:    retq
entry:
  %0 = bitcast <32 x half> %a to <16 x float>
  %1 = bitcast <32 x half> %b to <16 x float>
  %2 = tail call <16 x float> @llvm.x86.avx512fp16.mask.vfmadd.cph.512(<16 x float> %0, <16 x float> %1, <16 x float> <float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000>, i16 -1, i32 4)
  %3 = bitcast <16 x float> %2 to <32 x half>
  %add.i = fadd <32 x half> %3, %acc
  ret <32 x half> %add.i
}

; 256-bit variant of test13 (vfcmaddcph, -0.0 addend): always folded.
define dso_local <16 x half> @test15(<16 x half> %acc, <16 x half> %a, <16 x half> %b) {
; CHECK-LABEL: test15:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vfcmaddcph %ymm2, %ymm1, %ymm0
; CHECK-NEXT:    retq
entry:
  %0 = bitcast <16 x half> %a to <8 x float>
  %1 = bitcast <16 x half> %b to <8 x float>
  %2 = tail call <8 x float> @llvm.x86.avx512fp16.mask.vfcmadd.cph.256(<8 x float> %0, <8 x float> %1, <8 x float> <float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000>, i8 -1)
  %3 = bitcast <8 x float> %2 to <16 x half>
  %add.i = fadd <16 x half> %3, %acc
  ret <16 x half> %add.i
}

; 256-bit variant of test14 (vfmaddcph, -0.0 addend): always folded.
define dso_local <16 x half> @test16(<16 x half> %acc, <16 x half> %a, <16 x half> %b) {
; CHECK-LABEL: test16:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vfmaddcph %ymm2, %ymm1, %ymm0
; CHECK-NEXT:    retq
entry:
  %0 = bitcast <16 x half> %a to <8 x float>
  %1 = bitcast <16 x half> %b to <8 x float>
  %2 = tail call <8 x float> @llvm.x86.avx512fp16.mask.vfmadd.cph.256(<8 x float> %0, <8 x float> %1, <8 x float> <float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000>, i8 -1)
  %3 = bitcast <8 x float> %2 to <16 x half>
  %add.i = fadd <16 x half> %3, %acc
  ret <16 x half> %add.i
}

; 128-bit variant of test13 (vfcmaddcph, -0.0 addend): always folded.
define dso_local <8 x half> @test17(<8 x half> %acc, <8 x half> %a, <8 x half> %b) {
; CHECK-LABEL: test17:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vfcmaddcph %xmm2, %xmm1, %xmm0
; CHECK-NEXT:    retq
entry:
  %0 = bitcast <8 x half> %a to <4 x float>
  %1 = bitcast <8 x half> %b to <4 x float>
  %2 = tail call <4 x float> @llvm.x86.avx512fp16.mask.vfcmadd.cph.128(<4 x float> %0, <4 x float> %1, <4 x float> <float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000>, i8 -1)
  %3 = bitcast <4 x float> %2 to <8 x half>
  %add.i = fadd <8 x half> %3, %acc
  ret <8 x half> %add.i
}

; 128-bit variant of test14 (vfmaddcph, -0.0 addend): always folded.
define dso_local <8 x half> @test18(<8 x half> %acc, <8 x half> %a, <8 x half> %b) {
; CHECK-LABEL: test18:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vfmaddcph %xmm2, %xmm1, %xmm0
; CHECK-NEXT:    retq
entry:
  %0 = bitcast <8 x half> %a to <4 x float>
  %1 = bitcast <8 x half> %b to <4 x float>
  %2 = tail call <4 x float> @llvm.x86.avx512fp16.mask.vfmadd.cph.128(<4 x float> %0, <4 x float> %1, <4 x float> <float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000>, i8 -1)
  %3 = bitcast <4 x float> %2 to <8 x half>
  %add.i = fadd <8 x half> %3, %acc
  ret <8 x half> %add.i
}

; Declarations of the AVX512-FP16 complex-multiply-add intrinsics used above.
declare <16 x float> @llvm.x86.avx512fp16.mask.vfcmadd.cph.512(<16 x float>, <16 x float>, <16 x float>, i16, i32 immarg)
declare <16 x float> @llvm.x86.avx512fp16.mask.vfmadd.cph.512(<16 x float>, <16 x float>, <16 x float>, i16, i32 immarg)
declare <8 x float> @llvm.x86.avx512fp16.mask.vfcmadd.cph.256(<8 x float>, <8 x float>, <8 x float>, i8)
declare <8 x float> @llvm.x86.avx512fp16.mask.vfmadd.cph.256(<8 x float>, <8 x float>, <8 x float>, i8)
declare <4 x float> @llvm.x86.avx512fp16.mask.vfcmadd.cph.128(<4 x float>, <4 x float>, <4 x float>, i8)
declare <4 x float> @llvm.x86.avx512fp16.mask.vfmadd.cph.128(<4 x float>, <4 x float>, <4 x float>, i8)
