; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512fp16 --fp-contract=fast --enable-unsafe-fp-math | FileCheck %s

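; The tests below exercise the combine that folds a conjugate (an XOR of
; 0x80000000 into each 32-bit complex-pair lane, i.e. the sign bit of the
; imaginary half) feeding the complex multiply intrinsic, followed by a fast
; fadd of the accumulator, into a single conjugating FMA: vfcmaddcph.

; Conjugated LHS as the first multiplicand, 512-bit vectors.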
define dso_local <32 x half> @test1(<32 x half> %acc.coerce, <32 x half> %lhs.coerce.conj, <32 x half> %rhs.coerce) local_unnamed_addr #0 {
; CHECK-LABEL: test1:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vfcmaddcph %zmm1, %zmm2, %zmm0
; CHECK-NEXT:    retq
entry:
  %0 = bitcast <32 x half> %lhs.coerce.conj to <16 x i32>
  %xor.i.i = xor <16 x i32> %0, <i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648>
  %1 = bitcast <16 x i32> %xor.i.i to <16 x float>
  %2 = bitcast <32 x half> %rhs.coerce to <16 x float>
  %3 = tail call fast <16 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.512(<16 x float> %1, <16 x float> %2, <16 x float> zeroinitializer, i16 -1, i32 4) #2
  %4 = bitcast <16 x float> %3 to <32 x half>
  %add = fadd fast <32 x half> %4, %acc.coerce
  ret <32 x half> %add
}

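; Same combine with the conjugated value as the second multiplicand.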
define dso_local <32 x half> @test2(<32 x half> %acc.coerce, <32 x half> %lhs.coerce.conj, <32 x half> %rhs.coerce) local_unnamed_addr #0 {
; CHECK-LABEL: test2:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vfcmaddcph %zmm1, %zmm2, %zmm0
; CHECK-NEXT:    retq
entry:
  %0 = bitcast <32 x half> %lhs.coerce.conj to <16 x i32>
  %xor.i.i = xor <16 x i32> %0, <i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648>
  %1 = bitcast <16 x i32> %xor.i.i to <16 x float>
  %2 = bitcast <32 x half> %rhs.coerce to <16 x float>
  %3 = tail call fast <16 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.512(<16 x float> %2, <16 x float> %1, <16 x float> zeroinitializer, i16 -1, i32 4) #2
  %4 = bitcast <16 x float> %3 to <32 x half>
  %add = fadd fast <32 x half> %4, %acc.coerce
  ret <32 x half> %add
}

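; 256-bit variant.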
define dso_local <16 x half> @test3(<16 x half> %acc.coerce, <16 x half> %lhs.coerce.conj, <16 x half> %rhs.coerce) local_unnamed_addr #0 {
; CHECK-LABEL: test3:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vfcmaddcph %ymm1, %ymm2, %ymm0
; CHECK-NEXT:    retq
entry:
  %0 = bitcast <16 x half> %lhs.coerce.conj to <8 x i32>
  %xor.i.i = xor <8 x i32> %0, <i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648>
  %1 = bitcast <8 x i32> %xor.i.i to <8 x float>
  %2 = bitcast <16 x half> %rhs.coerce to <8 x float>
  %3 = tail call fast <8 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.256(<8 x float> %1, <8 x float> %2, <8 x float> zeroinitializer, i8 -1) #2
  %4 = bitcast <8 x float> %3 to <16 x half>
  %add = fadd fast <16 x half> %4, %acc.coerce
  ret <16 x half> %add
}

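; 128-bit variant.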
define dso_local <8 x half> @test4(<8 x half> %acc.coerce, <8 x half> %lhs.coerce.conj, <8 x half> %rhs.coerce) local_unnamed_addr #0 {
; CHECK-LABEL: test4:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vfcmaddcph %xmm1, %xmm2, %xmm0
; CHECK-NEXT:    retq
entry:
  %0 = bitcast <8 x half> %lhs.coerce.conj to <4 x i32>
  %xor.i.i = xor <4 x i32> %0, <i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648>
  %1 = bitcast <4 x i32> %xor.i.i to <4 x float>
  %2 = bitcast <8 x half> %rhs.coerce to <4 x float>
  %3 = tail call fast <4 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.128(<4 x float> %1, <4 x float> %2, <4 x float> zeroinitializer, i8 -1) #2
  %4 = bitcast <4 x float> %3 to <8 x half>
  %add = fadd fast <8 x half> %4, %acc.coerce
  ret <8 x half> %add
}

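; XOR with the sign-bit splat as the first operand (commuted xor).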
define dso_local <8 x half> @test5(<8 x half> %acc.coerce, <8 x half> %lhs.coerce.conj, <8 x half> %rhs.coerce) local_unnamed_addr #0 {
; CHECK-LABEL: test5:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vfcmaddcph %xmm1, %xmm2, %xmm0
; CHECK-NEXT:    retq
entry:
  %0 = bitcast <8 x half> %lhs.coerce.conj to <4 x i32>
  %xor.i.i = xor <4 x i32> <i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648>, %0
  %1 = bitcast <4 x i32> %xor.i.i to <4 x float>
  %2 = bitcast <8 x half> %rhs.coerce to <4 x float>
  %3 = tail call fast <4 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.128(<4 x float> %1, <4 x float> %2, <4 x float> zeroinitializer, i8 -1) #2
  %4 = bitcast <4 x float> %3 to <8 x half>
  %add = fadd fast <8 x half> %4, %acc.coerce
  ret <8 x half> %add
}

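; Negative test: XOR with 1 flips bit 0, not the sign bit, so this is not a
; conjugate; an explicit vxorps plus the non-conjugating vfmaddcph is expected.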
define dso_local <8 x half> @test6(<8 x half> %acc.coerce, <8 x half> %lhs.coerce.conj, <8 x half> %rhs.coerce) local_unnamed_addr #0 {
; CHECK-LABEL: test6:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1
; CHECK-NEXT:    vfmaddcph %xmm2, %xmm1, %xmm0
; CHECK-NEXT:    retq
entry:
  %0 = bitcast <8 x half> %lhs.coerce.conj to <4 x i32>
  %xor.i.i = xor <4 x i32> <i32 1, i32 1, i32 1, i32 1>, %0
  %1 = bitcast <4 x i32> %xor.i.i to <4 x float>
  %2 = bitcast <8 x half> %rhs.coerce to <4 x float>
  %3 = tail call fast <4 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.128(<4 x float> %1, <4 x float> %2, <4 x float> zeroinitializer, i8 -1) #2
  %4 = bitcast <4 x float> %3 to <8 x half>
  %add = fadd fast <8 x half> %4, %acc.coerce
  ret <8 x half> %add
}

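; fadd operands commuted: accumulator on the left.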
define dso_local <8 x half> @test7(<8 x half> %acc.coerce, <8 x half> %lhs.coerce.conj, <8 x half> %rhs.coerce) local_unnamed_addr #0 {
; CHECK-LABEL: test7:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vfcmaddcph %xmm1, %xmm2, %xmm0
; CHECK-NEXT:    retq
entry:
  %0 = bitcast <8 x half> %lhs.coerce.conj to <4 x i32>
  %xor.i.i = xor <4 x i32> <i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648>, %0
  %1 = bitcast <4 x i32> %xor.i.i to <4 x float>
  %2 = bitcast <8 x half> %rhs.coerce to <4 x float>
  %3 = tail call fast <4 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.128(<4 x float> %1, <4 x float> %2, <4 x float> zeroinitializer, i8 -1) #2
  %4 = bitcast <4 x float> %3 to <8 x half>
  %add = fadd fast <8 x half> %acc.coerce, %4
  ret <8 x half> %add
}

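; The conjugated operand is already <4 x float>, so there is no
; half-to-float bitcast before the XOR.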
define dso_local <8 x half> @test8(<8 x half> %acc.coerce, <4 x float> %lhs.coerce.conj, <8 x half> %rhs.coerce) local_unnamed_addr #0 {
; CHECK-LABEL: test8:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vfcmaddcph %xmm1, %xmm2, %xmm0
; CHECK-NEXT:    retq
entry:
  %0 = bitcast <4 x float> %lhs.coerce.conj to <4 x i32>
  %xor.i.i = xor <4 x i32> <i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648>, %0
  %1 = bitcast <4 x i32> %xor.i.i to <4 x float>
  %2 = bitcast <8 x half> %rhs.coerce to <4 x float>
  %3 = tail call fast <4 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.128(<4 x float> %1, <4 x float> %2, <4 x float> zeroinitializer, i8 -1) #2
  %4 = bitcast <4 x float> %3 to <8 x half>
  %add = fadd fast <8 x half> %acc.coerce, %4
  ret <8 x half> %add
}

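; XOR performed on an <8 x i64> view; 0x8000000080000000 sets bit 31 of each
; 32-bit complex-pair lane, so the combine should still recognize a conjugate.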
define dso_local <32 x half> @test9(<32 x half> %acc.coerce, <8 x i64> %lhs.coerce.conj, <32 x half> %rhs.coerce) local_unnamed_addr #0 {
; CHECK-LABEL: test9:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vfcmaddcph %zmm1, %zmm2, %zmm0
; CHECK-NEXT:    retq
entry:
  %xor1.i = xor <8 x i64> %lhs.coerce.conj, <i64 -9223372034707292160, i64 -9223372034707292160, i64 -9223372034707292160, i64 -9223372034707292160, i64 -9223372034707292160, i64 -9223372034707292160, i64 -9223372034707292160, i64 -9223372034707292160>
  %0 = bitcast <8 x i64> %xor1.i to <16 x float>
  %1 = bitcast <32 x half> %rhs.coerce to <16 x float>
  %2 = tail call fast <16 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.512(<16 x float> %0, <16 x float> %1, <16 x float> zeroinitializer, i16 -1, i32 4) #2
  %3 = bitcast <16 x float> %2 to <32 x half>
  %add = fadd fast <32 x half> %3, %acc.coerce
  ret <32 x half> %add
}

declare <16 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.512(<16 x float>, <16 x float>, <16 x float>, i16, i32 immarg)
declare <8 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.256(<8 x float>, <8 x float>, <8 x float>, i8)
declare <4 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.128(<4 x float>, <4 x float>, <4 x float>, i8)