; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=avx512fp16 -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -machine-combiner-verify-pattern-order=true < %s | FileCheck %s

; Incremental updates of the instruction depths should be enough for this test
; case.
; RUN: llc -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=avx512fp16 -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -machine-combiner-inc-threshold=0 < %s | FileCheck %s

; Verify that the first two adds are independent regardless of how the inputs are
; commuted. The destination registers are used as source registers for the third add.
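;
; Roughly, the rewrite these tests expect (sketched for reassociate_adds1; "u"
; is an illustrative name, not a value in this file) is:
;
;   t0 = x0 + x1          t0 = x0 + x1
;   t1 = t0 + x2   ==>    u  = x2 + x3
;   t2 = t1 + x3          t2 = t0 + u
;
; so the first two vaddsh instructions carry no dependence on each other and
; the critical path shrinks from three dependent adds to two.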

define half @reassociate_adds1(half %x0, half %x1, half %x2, half %x3) {
; CHECK-LABEL: reassociate_adds1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vaddsh %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    vaddsh %xmm3, %xmm2, %xmm1
; CHECK-NEXT:    vaddsh %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    retq
  %t0 = fadd reassoc nsz half %x0, %x1
  %t1 = fadd reassoc nsz half %t0, %x2
  %t2 = fadd reassoc nsz half %t1, %x3
  ret half %t2
}

define half @reassociate_adds2(half %x0, half %x1, half %x2, half %x3) {
; CHECK-LABEL: reassociate_adds2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vaddsh %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    vaddsh %xmm3, %xmm2, %xmm1
; CHECK-NEXT:    vaddsh %xmm0, %xmm1, %xmm0
; CHECK-NEXT:    retq
  %t0 = fadd reassoc nsz half %x0, %x1
  %t1 = fadd reassoc nsz half %x2, %t0
  %t2 = fadd reassoc nsz half %t1, %x3
  ret half %t2
}

define half @reassociate_adds3(half %x0, half %x1, half %x2, half %x3) {
; CHECK-LABEL: reassociate_adds3:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vaddsh %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    vaddsh %xmm2, %xmm3, %xmm1
; CHECK-NEXT:    vaddsh %xmm0, %xmm1, %xmm0
; CHECK-NEXT:    retq
  %t0 = fadd reassoc nsz half %x0, %x1
  %t1 = fadd reassoc nsz half %t0, %x2
  %t2 = fadd reassoc nsz half %x3, %t1
  ret half %t2
}

define half @reassociate_adds4(half %x0, half %x1, half %x2, half %x3) {
; CHECK-LABEL: reassociate_adds4:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vaddsh %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    vaddsh %xmm2, %xmm3, %xmm1
; CHECK-NEXT:    vaddsh %xmm0, %xmm1, %xmm0
; CHECK-NEXT:    retq
  %t0 = fadd reassoc nsz half %x0, %x1
  %t1 = fadd reassoc nsz half %x2, %t0
  %t2 = fadd reassoc nsz half %x3, %t1
  ret half %t2
}

; Verify that we reassociate some of these ops. The optimal balanced tree of adds is not
; produced because that would cost more compile time.
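;
; As a sketch of what the CHECK lines below accept (value names are
; illustrative only), the eight-operand chain is only partially rebalanced:
;
;   ((((((x0 + x1) + x2) + x3) + x4) + x5) + x6) + x7
;     ==>  (((x0 + x1) + (x2 + x3)) + ((x4 + x5) + x6)) + x7
;
; which shortens the critical path without building a fully balanced tree.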

define half @reassociate_adds5(half %x0, half %x1, half %x2, half %x3, half %x4, half %x5, half %x6, half %x7) {
; CHECK-LABEL: reassociate_adds5:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vaddsh %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    vaddsh %xmm3, %xmm2, %xmm1
; CHECK-NEXT:    vaddsh %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    vaddsh %xmm5, %xmm4, %xmm1
; CHECK-NEXT:    vaddsh %xmm6, %xmm1, %xmm1
; CHECK-NEXT:    vaddsh %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    vaddsh %xmm7, %xmm0, %xmm0
; CHECK-NEXT:    retq
  %t0 = fadd reassoc nsz half %x0, %x1
  %t1 = fadd reassoc nsz half %t0, %x2
  %t2 = fadd reassoc nsz half %t1, %x3
  %t3 = fadd reassoc nsz half %t2, %x4
  %t4 = fadd reassoc nsz half %t3, %x5
  %t5 = fadd reassoc nsz half %t4, %x6
  %t6 = fadd reassoc nsz half %t5, %x7
  ret half %t6
}

; Verify that we only need two associative operations to reassociate the operands.
; Also, we should reassociate such that the result of the high-latency division
; is used by the final 'add' rather than reassociating the %x3 operand with the
; division. The latter reassociation would not improve anything.
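;
; Sketch (illustrative value names only): x3 + x2 can execute while the vdivsh
; is still in flight, and the slow division result feeds only the last add:
;
;   t0 = x0 / x1                t0 = x0 / x1
;   t1 = x2 + t0       ==>      u  = x3 + x2
;   t2 = x3 + t1                t2 = u + t0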

define half @reassociate_adds6(half %x0, half %x1, half %x2, half %x3) {
; CHECK-LABEL: reassociate_adds6:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vdivsh %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    vaddsh %xmm2, %xmm3, %xmm1
; CHECK-NEXT:    vaddsh %xmm0, %xmm1, %xmm0
; CHECK-NEXT:    retq
  %t0 = fdiv reassoc nsz half %x0, %x1
  %t1 = fadd reassoc nsz half %x2, %t0
  %t2 = fadd reassoc nsz half %x3, %t1
  ret half %t2
}

; Verify that AVX512FP16 scalar half-precision multiplies are reassociated.

define half @reassociate_muls1(half %x0, half %x1, half %x2, half %x3) {
; CHECK-LABEL: reassociate_muls1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vdivsh %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    vmulsh %xmm2, %xmm3, %xmm1
; CHECK-NEXT:    vmulsh %xmm0, %xmm1, %xmm0
; CHECK-NEXT:    retq
  %t0 = fdiv reassoc nsz half %x0, %x1
  %t1 = fmul reassoc nsz half %x2, %t0
  %t2 = fmul reassoc nsz half %x3, %t1
  ret half %t2
}

; Verify that AVX512FP16 128-bit vector half-precision adds are reassociated.

define <8 x half> @reassociate_adds_v8f16(<8 x half> %x0, <8 x half> %x1, <8 x half> %x2, <8 x half> %x3) {
; CHECK-LABEL: reassociate_adds_v8f16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vdivph %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    vaddph %xmm2, %xmm3, %xmm1
; CHECK-NEXT:    vaddph %xmm0, %xmm1, %xmm0
; CHECK-NEXT:    retq
  %t0 = fdiv reassoc nsz <8 x half> %x0, %x1
  %t1 = fadd reassoc nsz <8 x half> %x2, %t0
  %t2 = fadd reassoc nsz <8 x half> %x3, %t1
  ret <8 x half> %t2
}

; Verify that AVX512FP16 128-bit vector half-precision multiplies are reassociated.

define <8 x half> @reassociate_muls_v8f16(<8 x half> %x0, <8 x half> %x1, <8 x half> %x2, <8 x half> %x3) {
; CHECK-LABEL: reassociate_muls_v8f16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vaddph %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    vmulph %xmm2, %xmm3, %xmm1
; CHECK-NEXT:    vmulph %xmm0, %xmm1, %xmm0
; CHECK-NEXT:    retq
  %t0 = fadd reassoc nsz <8 x half> %x0, %x1
  %t1 = fmul reassoc nsz <8 x half> %x2, %t0
  %t2 = fmul reassoc nsz <8 x half> %x3, %t1
  ret <8 x half> %t2
}

; Verify that AVX512FP16 256-bit vector half-precision adds are reassociated.

define <16 x half> @reassociate_adds_v16f16(<16 x half> %x0, <16 x half> %x1, <16 x half> %x2, <16 x half> %x3) {
; CHECK-LABEL: reassociate_adds_v16f16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vdivph %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    vaddph %ymm2, %ymm3, %ymm1
; CHECK-NEXT:    vaddph %ymm0, %ymm1, %ymm0
; CHECK-NEXT:    retq
  %t0 = fdiv reassoc nsz <16 x half> %x0, %x1
  %t1 = fadd reassoc nsz <16 x half> %x2, %t0
  %t2 = fadd reassoc nsz <16 x half> %x3, %t1
  ret <16 x half> %t2
}

; Verify that AVX512FP16 256-bit vector half-precision multiplies are reassociated.

define <16 x half> @reassociate_muls_v16f16(<16 x half> %x0, <16 x half> %x1, <16 x half> %x2, <16 x half> %x3) {
; CHECK-LABEL: reassociate_muls_v16f16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vaddph %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    vmulph %ymm2, %ymm3, %ymm1
; CHECK-NEXT:    vmulph %ymm0, %ymm1, %ymm0
; CHECK-NEXT:    retq
  %t0 = fadd reassoc nsz <16 x half> %x0, %x1
  %t1 = fmul reassoc nsz <16 x half> %x2, %t0
  %t2 = fmul reassoc nsz <16 x half> %x3, %t1
  ret <16 x half> %t2
}

; Verify that AVX512FP16 512-bit vector half-precision adds are reassociated.

define <32 x half> @reassociate_adds_v32f16(<32 x half> %x0, <32 x half> %x1, <32 x half> %x2, <32 x half> %x3) {
; CHECK-LABEL: reassociate_adds_v32f16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vdivph %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    vaddph %zmm2, %zmm3, %zmm1
; CHECK-NEXT:    vaddph %zmm0, %zmm1, %zmm0
; CHECK-NEXT:    retq
  %t0 = fdiv reassoc nsz <32 x half> %x0, %x1
  %t1 = fadd reassoc nsz <32 x half> %x2, %t0
  %t2 = fadd reassoc nsz <32 x half> %x3, %t1
  ret <32 x half> %t2
}

; Verify that AVX512FP16 512-bit vector half-precision multiplies are reassociated.

define <32 x half> @reassociate_muls_v32f16(<32 x half> %x0, <32 x half> %x1, <32 x half> %x2, <32 x half> %x3) {
; CHECK-LABEL: reassociate_muls_v32f16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vaddph %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    vmulph %zmm2, %zmm3, %zmm1
; CHECK-NEXT:    vmulph %zmm0, %zmm1, %zmm0
; CHECK-NEXT:    retq
  %t0 = fadd reassoc nsz <32 x half> %x0, %x1
  %t1 = fmul reassoc nsz <32 x half> %x2, %t0
  %t2 = fmul reassoc nsz <32 x half> %x3, %t1
  ret <32 x half> %t2
}

; Verify that AVX512FP16 scalar half-precision minimum ops are reassociated.
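;
; The min/max tests below spell the operation as an fcmp plus select rather
; than a min/max intrinsic. With -enable-no-nans-fp-math and
; -enable-no-signed-zeros-fp-math on the RUN lines, that idiom lowers to
; vminsh/vminph (or vmaxsh/vmaxph) and can be reassociated just like the adds
; above, e.g. (illustrative value names, not IR from this file):
;
;   t0 = x0 / x1                   t0 = x0 / x1
;   s1 = min(x2, t0)      ==>      m  = min(x3, x2)
;   s2 = min(x3, s1)               s2 = min(m, t0)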

define half @reassociate_mins_half(half %x0, half %x1, half %x2, half %x3) {
; CHECK-LABEL: reassociate_mins_half:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vdivsh %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    vminsh %xmm2, %xmm3, %xmm1
; CHECK-NEXT:    vminsh %xmm0, %xmm1, %xmm0
; CHECK-NEXT:    retq
  %t0 = fdiv half %x0, %x1
  %cmp1 = fcmp olt half %x2, %t0
  %sel1 = select i1 %cmp1, half %x2, half %t0
  %cmp2 = fcmp olt half %x3, %sel1
  %sel2 = select i1 %cmp2, half %x3, half %sel1
  ret half %sel2
}

; Verify that AVX512FP16 scalar half-precision maximum ops are reassociated.

define half @reassociate_maxs_half(half %x0, half %x1, half %x2, half %x3) {
; CHECK-LABEL: reassociate_maxs_half:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vdivsh %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    vmaxsh %xmm2, %xmm3, %xmm1
; CHECK-NEXT:    vmaxsh %xmm0, %xmm1, %xmm0
; CHECK-NEXT:    retq
  %t0 = fdiv half %x0, %x1
  %cmp1 = fcmp ogt half %x2, %t0
  %sel1 = select i1 %cmp1, half %x2, half %t0
  %cmp2 = fcmp ogt half %x3, %sel1
  %sel2 = select i1 %cmp2, half %x3, half %sel1
  ret half %sel2
}

; Verify that AVX512FP16 128-bit vector half-precision minimum ops are reassociated.

define <8 x half> @reassociate_mins_v8f16(<8 x half> %x0, <8 x half> %x1, <8 x half> %x2, <8 x half> %x3) {
; CHECK-LABEL: reassociate_mins_v8f16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vaddph %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    vminph %xmm2, %xmm3, %xmm1
; CHECK-NEXT:    vminph %xmm0, %xmm1, %xmm0
; CHECK-NEXT:    retq
  %t0 = fadd <8 x half> %x0, %x1
  %cmp1 = fcmp olt <8 x half> %x2, %t0
  %sel1 = select <8 x i1> %cmp1, <8 x half> %x2, <8 x half> %t0
  %cmp2 = fcmp olt <8 x half> %x3, %sel1
  %sel2 = select <8 x i1> %cmp2, <8 x half> %x3, <8 x half> %sel1
  ret <8 x half> %sel2
}

; Verify that AVX512FP16 128-bit vector half-precision maximum ops are reassociated.

define <8 x half> @reassociate_maxs_v8f16(<8 x half> %x0, <8 x half> %x1, <8 x half> %x2, <8 x half> %x3) {
; CHECK-LABEL: reassociate_maxs_v8f16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vaddph %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    vmaxph %xmm2, %xmm3, %xmm1
; CHECK-NEXT:    vmaxph %xmm0, %xmm1, %xmm0
; CHECK-NEXT:    retq
  %t0 = fadd <8 x half> %x0, %x1
  %cmp1 = fcmp ogt <8 x half> %x2, %t0
  %sel1 = select <8 x i1> %cmp1, <8 x half> %x2, <8 x half> %t0
  %cmp2 = fcmp ogt <8 x half> %x3, %sel1
  %sel2 = select <8 x i1> %cmp2, <8 x half> %x3, <8 x half> %sel1
  ret <8 x half> %sel2
}

; Verify that AVX512FP16 256-bit vector half-precision minimum ops are reassociated.

define <16 x half> @reassociate_mins_v16f16(<16 x half> %x0, <16 x half> %x1, <16 x half> %x2, <16 x half> %x3) {
; CHECK-LABEL: reassociate_mins_v16f16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vaddph %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    vminph %ymm2, %ymm3, %ymm1
; CHECK-NEXT:    vminph %ymm0, %ymm1, %ymm0
; CHECK-NEXT:    retq
  %t0 = fadd <16 x half> %x0, %x1
  %cmp1 = fcmp olt <16 x half> %x2, %t0
  %sel1 = select <16 x i1> %cmp1, <16 x half> %x2, <16 x half> %t0
  %cmp2 = fcmp olt <16 x half> %x3, %sel1
  %sel2 = select <16 x i1> %cmp2, <16 x half> %x3, <16 x half> %sel1
  ret <16 x half> %sel2
}

; Verify that AVX512FP16 256-bit vector half-precision maximum ops are reassociated.

define <16 x half> @reassociate_maxs_v16f16(<16 x half> %x0, <16 x half> %x1, <16 x half> %x2, <16 x half> %x3) {
; CHECK-LABEL: reassociate_maxs_v16f16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vaddph %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    vmaxph %ymm2, %ymm3, %ymm1
; CHECK-NEXT:    vmaxph %ymm0, %ymm1, %ymm0
; CHECK-NEXT:    retq
  %t0 = fadd <16 x half> %x0, %x1
  %cmp1 = fcmp ogt <16 x half> %x2, %t0
  %sel1 = select <16 x i1> %cmp1, <16 x half> %x2, <16 x half> %t0
  %cmp2 = fcmp ogt <16 x half> %x3, %sel1
  %sel2 = select <16 x i1> %cmp2, <16 x half> %x3, <16 x half> %sel1
  ret <16 x half> %sel2
}

; Verify that AVX512FP16 512-bit vector half-precision minimum ops are reassociated.

define <32 x half> @reassociate_mins_v32f16(<32 x half> %x0, <32 x half> %x1, <32 x half> %x2, <32 x half> %x3) {
; CHECK-LABEL: reassociate_mins_v32f16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vaddph %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    vminph %zmm2, %zmm3, %zmm1
; CHECK-NEXT:    vminph %zmm0, %zmm1, %zmm0
; CHECK-NEXT:    retq
  %t0 = fadd <32 x half> %x0, %x1
  %cmp1 = fcmp olt <32 x half> %x2, %t0
  %sel1 = select <32 x i1> %cmp1, <32 x half> %x2, <32 x half> %t0
  %cmp2 = fcmp olt <32 x half> %x3, %sel1
  %sel2 = select <32 x i1> %cmp2, <32 x half> %x3, <32 x half> %sel1
  ret <32 x half> %sel2
}

; Verify that AVX512FP16 512-bit vector half-precision maximum ops are reassociated.

define <32 x half> @reassociate_maxs_v32f16(<32 x half> %x0, <32 x half> %x1, <32 x half> %x2, <32 x half> %x3) {
; CHECK-LABEL: reassociate_maxs_v32f16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vaddph %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    vmaxph %zmm2, %zmm3, %zmm1
; CHECK-NEXT:    vmaxph %zmm0, %zmm1, %zmm0
; CHECK-NEXT:    retq
  %t0 = fadd <32 x half> %x0, %x1
  %cmp1 = fcmp ogt <32 x half> %x2, %t0
  %sel1 = select <32 x i1> %cmp1, <32 x half> %x2, <32 x half> %t0
  %cmp2 = fcmp ogt <32 x half> %x3, %sel1
  %sel2 = select <32 x i1> %cmp2, <32 x half> %x3, <32 x half> %sel1
  ret <32 x half> %sel2
}