; xref: /minix3/external/bsd/llvm/dist/llvm/test/Analysis/CostModel/X86/reduction.ll (revision f4a2713ac843a11c696ec80c0a5e3e5d80b4d338)
; RUN: opt < %s -cost-model -costmodel-reduxcost=true -analyze -mcpu=core2 -mtriple=x86_64-apple-darwin | FileCheck %s
; RUN: opt < %s -cost-model -costmodel-reduxcost=true -analyze -mcpu=corei7 -mtriple=x86_64-apple-darwin | FileCheck %s --check-prefix=SSE3
; RUN: opt < %s -cost-model -costmodel-reduxcost=true -analyze -mcpu=corei7-avx -mtriple=x86_64-apple-darwin | FileCheck %s --check-prefix=AVX
; RUN: opt < %s -cost-model -costmodel-reduxcost=true -analyze -mcpu=core-avx2 -mtriple=x86_64-apple-darwin | FileCheck %s --check-prefix=AVX2

; Sums the four lanes of %rdx with a two-level shuffle/fadd tree and returns
; lane 0 of the result. The default-prefix run expects the whole tree to be
; costed on the final extractelement.
define fastcc float @reduction_cost_float(<4 x float> %rdx) {
  ; Level 1: move lanes 2,3 onto lanes 0,1 and add to the input.
  %rdx.shuf = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
  %bin.rdx = fadd <4 x float> %rdx, %rdx.shuf
  ; Level 2: move lane 1 onto lane 0 and add to the level-1 result.
  %rdx.shuf7 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %bin.rdx8 = fadd <4 x float> %bin.rdx, %rdx.shuf7

; Check that we recognize the tree starting at the extractelement as a
; reduction.
; CHECK-LABEL: reduction_cost
; CHECK:  cost of 9 {{.*}} extractelement

  %r = extractelement <4 x float> %bin.rdx8, i32 0
  ret float %r
}

; Sums the eight i32 lanes of %rdx with a three-level shuffle/add tree and
; returns lane 0 of the result. The default-prefix run expects cost 23 on the
; final extractelement.
define fastcc i32 @reduction_cost_int(<8 x i32> %rdx) {
  ; Level 1: move lanes 4..7 onto lanes 0..3 and add to the input.
  %rdx.shuf = shufflevector <8 x i32> %rdx, <8 x i32> undef,
   <8 x i32> <i32 4, i32 5, i32 6, i32 7,
              i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx = add <8 x i32> %rdx, %rdx.shuf
  ; Level 2: move lanes 2,3 onto lanes 0,1 and add.
  %rdx.shuf.2 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef,
   <8 x i32> <i32 2, i32 3, i32 undef, i32 undef,
              i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx.2 = add <8 x i32> %bin.rdx, %rdx.shuf.2
  ; Level 3: move lane 1 onto lane 0 and add.
  %rdx.shuf.3 = shufflevector <8 x i32> %bin.rdx.2, <8 x i32> undef,
   <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef,
              i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx.3 = add <8 x i32> %bin.rdx.2, %rdx.shuf.3

; CHECK-LABEL: reduction_cost_int
; CHECK:  cost of 23 {{.*}} extractelement

  %r = extractelement <8 x i32> %bin.rdx.3, i32 0
  ret i32 %r
}

; Pairwise fadd reduction of a <4 x float>, followed by a scalar fadd of the
; extracted lane 0 with %f1. Each level adds an even-lane shuffle to an
; odd-lane shuffle of the previous value.
define fastcc float @pairwise_hadd(<4 x float> %rdx, float %f1) {
  ; Level 1: lanes (0,2) + lanes (1,3) of the input.
  %rdx.shuf.0.0 = shufflevector <4 x float> %rdx, <4 x float> undef,
        <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
  %rdx.shuf.0.1 = shufflevector <4 x float> %rdx, <4 x float> undef,
        <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
  %bin.rdx.0 = fadd <4 x float> %rdx.shuf.0.0, %rdx.shuf.0.1
  ; Level 2: lane 0 + lane 1 of the level-1 result.
  %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef,
        <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
  %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef,
        <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %bin.rdx.1 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1

; CHECK-LABEL: pairwise_hadd
; CHECK: cost of 11 {{.*}} extractelement

  %r = extractelement <4 x float> %bin.rdx.1, i32 0
  ; The reduced scalar has a further user after the extract.
  %r2 = fadd float %r, %f1
  ret float %r2
}

; Pairwise fadd reduction of a <4 x float> whose first-level fadd takes the
; odd-lane shuffle as its left operand and the even-lane shuffle as its right
; operand. The extracted lane 0 is then added to %f1.
define fastcc float @pairwise_hadd_assoc(<4 x float> %rdx, float %f1) {
  ; Level 1: lanes (1,3) + lanes (0,2) of the input (operands commuted).
  %rdx.shuf.0.0 = shufflevector <4 x float> %rdx, <4 x float> undef,
        <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
  %rdx.shuf.0.1 = shufflevector <4 x float> %rdx, <4 x float> undef,
        <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
  %bin.rdx.0 = fadd <4 x float> %rdx.shuf.0.1, %rdx.shuf.0.0
  ; Level 2: lane 0 + lane 1 of the level-1 result.
  %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef,
        <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
  %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef,
        <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %bin.rdx.1 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1

; CHECK-LABEL: pairwise_hadd_assoc
; CHECK: cost of 11 {{.*}} extractelement

  %r = extractelement <4 x float> %bin.rdx.1, i32 0
  %r2 = fadd float %r, %f1
  ret float %r2
}

; Pairwise fadd reduction of a <4 x float> in which the second level has no
; lane-0 shuffle: the level-1 result is used directly as the left operand.
; The extracted lane 0 is then added to %f1.
define fastcc float @pairwise_hadd_skip_first(<4 x float> %rdx, float %f1) {
  ; Level 1: lanes (0,2) + lanes (1,3) of the input.
  %rdx.shuf.0.0 = shufflevector <4 x float> %rdx, <4 x float> undef,
        <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
  %rdx.shuf.0.1 = shufflevector <4 x float> %rdx, <4 x float> undef,
        <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
  %bin.rdx.0 = fadd <4 x float> %rdx.shuf.0.0, %rdx.shuf.0.1
  ; Level 2: unshuffled level-1 result + its lane 1 moved onto lane 0.
  %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef,
        <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %bin.rdx.1 = fadd <4 x float> %bin.rdx.0, %rdx.shuf.1.1

; CHECK-LABEL: pairwise_hadd_skip_first
; CHECK: cost of 11 {{.*}} extractelement

  %r = extractelement <4 x float> %bin.rdx.1, i32 0
  %r2 = fadd float %r, %f1
  ret float %r2
}

; Single-level fadd reduction of a <2 x double>: lane 1 is shuffled onto
; lane 0, added to the input, and lane 0 is returned. %f1 is unused.
; NOTE(review): nothing anchors the checks below to this function's output
; (no label directive precedes them) - confirm they cannot be satisfied by an
; earlier function's extractelement line.
define fastcc double @no_pairwise_reduction2double(<2 x double> %rdx, double %f1) {
  %rdx.shuf = shufflevector <2 x double> %rdx, <2 x double> undef, <2 x i32> <i32 1, i32 undef>
  %bin.rdx = fadd <2 x double> %rdx, %rdx.shuf

; SSE3:  cost of 2 {{.*}} extractelement
; AVX:  cost of 2 {{.*}} extractelement
; AVX2:  cost of 2 {{.*}} extractelement

  %r = extractelement <2 x double> %bin.rdx, i32 0
  ret double %r
}

; Two-level fadd reduction of a <4 x float> that folds the upper lanes onto
; the lower lanes at each level and returns lane 0. %f1 is unused.
define fastcc float @no_pairwise_reduction4float(<4 x float> %rdx, float %f1) {
  ; Level 1: move lanes 2,3 onto lanes 0,1 and add.
  %rdx.shuf = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
  %bin.rdx = fadd <4 x float> %rdx, %rdx.shuf
  ; Level 2: move lane 1 onto lane 0 and add.
  %rdx.shuf7 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %bin.rdx8 = fadd <4 x float> %bin.rdx, %rdx.shuf7

; SSE3:  cost of 4 {{.*}} extractelement
; AVX:  cost of 3 {{.*}} extractelement
; AVX2:  cost of 3 {{.*}} extractelement

  %r = extractelement <4 x float> %bin.rdx8, i32 0
  ret float %r
}

; Two-level fadd reduction of a <4 x double> that folds the upper lanes onto
; the lower lanes at each level and returns lane 0. %f1 is unused.
define fastcc double @no_pairwise_reduction4double(<4 x double> %rdx, double %f1) {
  ; Level 1: move lanes 2,3 onto lanes 0,1 and add.
  %rdx.shuf = shufflevector <4 x double> %rdx, <4 x double> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
  %bin.rdx = fadd <4 x double> %rdx, %rdx.shuf
  ; Level 2: move lane 1 onto lane 0 and add.
  %rdx.shuf7 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %bin.rdx8 = fadd <4 x double> %bin.rdx, %rdx.shuf7

; AVX:  cost of 3 {{.*}} extractelement
; AVX2:  cost of 3 {{.*}} extractelement

  %r = extractelement <4 x double> %bin.rdx8, i32 0
  ret double %r
}

; Three-level fadd reduction of an <8 x float> that folds the upper lanes onto
; the lower lanes at each level and returns lane 0. %f1 is unused.
define fastcc float @no_pairwise_reduction8float(<8 x float> %rdx, float %f1) {
  ; Level 1: move lanes 4..7 onto lanes 0..3 and add.
  %rdx.shuf3 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx4 = fadd <8 x float> %rdx, %rdx.shuf3
  ; Level 2: move lanes 2,3 onto lanes 0,1 and add.
  %rdx.shuf = shufflevector <8 x float> %bin.rdx4, <8 x float> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx = fadd <8 x float> %bin.rdx4, %rdx.shuf
  ; Level 3: move lane 1 onto lane 0 and add.
  %rdx.shuf7 = shufflevector <8 x float> %bin.rdx, <8 x float> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx8 = fadd <8 x float> %bin.rdx, %rdx.shuf7

; AVX:  cost of 4 {{.*}} extractelement
; AVX2:  cost of 4 {{.*}} extractelement

  %r = extractelement <8 x float> %bin.rdx8, i32 0
  ret float %r
}

; Single-level add reduction of a <2 x i64>: lane 1 is shuffled onto lane 0,
; added to the input, and lane 0 is returned. %f1 is unused.
define fastcc i64 @no_pairwise_reduction2i64(<2 x i64> %rdx, i64 %f1) {
  %rdx.shuf = shufflevector <2 x i64> %rdx, <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
  %bin.rdx = add <2 x i64> %rdx, %rdx.shuf

; SSE3:  cost of 2 {{.*}} extractelement
; AVX:  cost of 1 {{.*}} extractelement
; AVX2:  cost of 1 {{.*}} extractelement

  %r = extractelement <2 x i64> %bin.rdx, i32 0
  ret i64 %r
}

; Two-level add reduction of a <4 x i32> that folds the upper lanes onto the
; lower lanes at each level and returns lane 0. %f1 is unused.
define fastcc i32 @no_pairwise_reduction4i32(<4 x i32> %rdx, i32 %f1) {
  ; Level 1: move lanes 2,3 onto lanes 0,1 and add.
  %rdx.shuf = shufflevector <4 x i32> %rdx, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
  %bin.rdx = add <4 x i32> %rdx, %rdx.shuf
  ; Level 2: move lane 1 onto lane 0 and add.
  %rdx.shuf7 = shufflevector <4 x i32> %bin.rdx, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %bin.rdx8 = add <4 x i32> %bin.rdx, %rdx.shuf7

; SSE3:  cost of 3 {{.*}} extractelement
; AVX:  cost of 3 {{.*}} extractelement
; AVX2:  cost of 3 {{.*}} extractelement

  %r = extractelement <4 x i32> %bin.rdx8, i32 0
  ret i32 %r
}

; Two-level add reduction of a <4 x i64> that folds the upper lanes onto the
; lower lanes at each level and returns lane 0. %f1 is unused.
define fastcc i64 @no_pairwise_reduction4i64(<4 x i64> %rdx, i64 %f1) {
  ; Level 1: move lanes 2,3 onto lanes 0,1 and add.
  %rdx.shuf = shufflevector <4 x i64> %rdx, <4 x i64> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
  %bin.rdx = add <4 x i64> %rdx, %rdx.shuf
  ; Level 2: move lane 1 onto lane 0 and add.
  %rdx.shuf7 = shufflevector <4 x i64> %bin.rdx, <4 x i64> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %bin.rdx8 = add <4 x i64> %bin.rdx, %rdx.shuf7

; AVX:  cost of 3 {{.*}} extractelement
; AVX2:  cost of 3 {{.*}} extractelement

  %r = extractelement <4 x i64> %bin.rdx8, i32 0
  ret i64 %r
}

; Three-level add reduction of an <8 x i16> that folds the upper lanes onto
; the lower lanes at each level and returns lane 0. %f1 is unused.
define fastcc i16 @no_pairwise_reduction8i16(<8 x i16> %rdx, i16 %f1) {
  ; Level 1: move lanes 4..7 onto lanes 0..3 and add.
  %rdx.shuf3 = shufflevector <8 x i16> %rdx, <8 x i16> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx4 = add <8 x i16> %rdx, %rdx.shuf3
  ; Level 2: move lanes 2,3 onto lanes 0,1 and add.
  %rdx.shuf = shufflevector <8 x i16> %bin.rdx4, <8 x i16> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx = add <8 x i16> %bin.rdx4, %rdx.shuf
  ; Level 3: move lane 1 onto lane 0 and add.
  %rdx.shuf7 = shufflevector <8 x i16> %bin.rdx, <8 x i16> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx8 = add <8 x i16> %bin.rdx, %rdx.shuf7

; SSE3:  cost of 4 {{.*}} extractelement
; AVX:  cost of 4 {{.*}} extractelement
; AVX2:  cost of 4 {{.*}} extractelement

  %r = extractelement <8 x i16> %bin.rdx8, i32 0
  ret i16 %r
}

; Three-level add reduction of an <8 x i32> that folds the upper lanes onto
; the lower lanes at each level and returns lane 0. %f1 is unused.
define fastcc i32 @no_pairwise_reduction8i32(<8 x i32> %rdx, i32 %f1) {
  ; Level 1: move lanes 4..7 onto lanes 0..3 and add.
  %rdx.shuf3 = shufflevector <8 x i32> %rdx, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx4 = add <8 x i32> %rdx, %rdx.shuf3
  ; Level 2: move lanes 2,3 onto lanes 0,1 and add.
  %rdx.shuf = shufflevector <8 x i32> %bin.rdx4, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx = add <8 x i32> %bin.rdx4, %rdx.shuf
  ; Level 3: move lane 1 onto lane 0 and add.
  %rdx.shuf7 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx8 = add <8 x i32> %bin.rdx, %rdx.shuf7

; AVX:  cost of 5 {{.*}} extractelement
; AVX2:  cost of 5 {{.*}} extractelement

  %r = extractelement <8 x i32> %bin.rdx8, i32 0
  ret i32 %r
}

; Single-level pairwise fadd reduction of a <2 x double>: a lane-0 shuffle is
; added to a lane-1 shuffle of the input and lane 0 is returned. %f1 is unused.
define fastcc double @pairwise_reduction2double(<2 x double> %rdx, double %f1) {
  %rdx.shuf.1.0 = shufflevector <2 x double> %rdx, <2 x double> undef, <2 x i32> <i32 0, i32 undef>
  %rdx.shuf.1.1 = shufflevector <2 x double> %rdx, <2 x double> undef, <2 x i32> <i32 1, i32 undef>
  %bin.rdx8 = fadd <2 x double> %rdx.shuf.1.0, %rdx.shuf.1.1

; SSE3:  cost of 2 {{.*}} extractelement
; AVX:  cost of 2 {{.*}} extractelement
; AVX2:  cost of 2 {{.*}} extractelement

  %r = extractelement <2 x double> %bin.rdx8, i32 0
  ret double %r
}

; Two-level pairwise fadd reduction of a <4 x float>: each level adds an
; even-lane shuffle to an odd-lane shuffle of the previous value, and lane 0
; of the final sum is returned. %f1 is unused.
define fastcc float @pairwise_reduction4float(<4 x float> %rdx, float %f1) {
  ; Level 1: lanes (0,2) + lanes (1,3) of the input.
  %rdx.shuf.0.0 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
  %rdx.shuf.0.1 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
  %bin.rdx = fadd <4 x float> %rdx.shuf.0.0, %rdx.shuf.0.1
  ; Level 2: lane 0 + lane 1 of the level-1 result.
  %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
  %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %bin.rdx8 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1

; SSE3:  cost of 4 {{.*}} extractelement
; AVX:  cost of 4 {{.*}} extractelement
; AVX2:  cost of 4 {{.*}} extractelement

  %r = extractelement <4 x float> %bin.rdx8, i32 0
  ret float %r
}

; Two-level pairwise fadd reduction of a <4 x double>: each level adds an
; even-lane shuffle to an odd-lane shuffle of the previous value, and lane 0
; of the final sum is returned. %f1 is unused.
define fastcc double @pairwise_reduction4double(<4 x double> %rdx, double %f1) {
  ; Level 1: lanes (0,2) + lanes (1,3) of the input.
  %rdx.shuf.0.0 = shufflevector <4 x double> %rdx, <4 x double> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
  %rdx.shuf.0.1 = shufflevector <4 x double> %rdx, <4 x double> undef, <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
  %bin.rdx = fadd <4 x double> %rdx.shuf.0.0, %rdx.shuf.0.1
  ; Level 2: lane 0 + lane 1 of the level-1 result.
  %rdx.shuf.1.0 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
  %rdx.shuf.1.1 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %bin.rdx8 = fadd <4 x double> %rdx.shuf.1.0, %rdx.shuf.1.1

; AVX:  cost of 5 {{.*}} extractelement
; AVX2:  cost of 5 {{.*}} extractelement

  %r = extractelement <4 x double> %bin.rdx8, i32 0
  ret double %r
}

; Three-level pairwise fadd reduction of an <8 x float>: each level adds an
; even-lane shuffle to an odd-lane shuffle of the previous value, and lane 0
; of the final sum is returned. %f1 is unused.
define fastcc float @pairwise_reduction8float(<8 x float> %rdx, float %f1) {
  ; Level 1: lanes (0,2,4,6) + lanes (1,3,5,7) of the input.
  %rdx.shuf.0.0 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 undef, i32 undef, i32 undef, i32 undef>
  %rdx.shuf.0.1 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx = fadd <8 x float> %rdx.shuf.0.0, %rdx.shuf.0.1
  ; Level 2: lanes (0,2) + lanes (1,3) of the level-1 result.
  %rdx.shuf.1.0 = shufflevector <8 x float> %bin.rdx, <8 x float> undef, <8 x i32> <i32 0, i32 2, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %rdx.shuf.1.1 = shufflevector <8 x float> %bin.rdx, <8 x float> undef, <8 x i32> <i32 1, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx8 = fadd <8 x float> %rdx.shuf.1.0, %rdx.shuf.1.1
  ; Level 3: lane 0 + lane 1 of the level-2 result.
  %rdx.shuf.2.0 = shufflevector <8 x float> %bin.rdx8, <8 x float> undef, <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %rdx.shuf.2.1 = shufflevector <8 x float> %bin.rdx8, <8 x float> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx9 = fadd <8 x float> %rdx.shuf.2.0, %rdx.shuf.2.1

; AVX:  cost of 7 {{.*}} extractelement
; AVX2:  cost of 7 {{.*}} extractelement

  %r = extractelement <8 x float> %bin.rdx9, i32 0
  ret float %r
}

; Single-level pairwise add reduction of a <2 x i64>: a lane-0 shuffle is
; added to a lane-1 shuffle of the input and lane 0 is returned. %f1 is unused.
define fastcc i64 @pairwise_reduction2i64(<2 x i64> %rdx, i64 %f1) {
  %rdx.shuf.1.0 = shufflevector <2 x i64> %rdx, <2 x i64> undef, <2 x i32> <i32 0, i32 undef>
  %rdx.shuf.1.1 = shufflevector <2 x i64> %rdx, <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
  %bin.rdx8 = add <2 x i64> %rdx.shuf.1.0, %rdx.shuf.1.1

; SSE3:  cost of 2 {{.*}} extractelement
; AVX:  cost of 1 {{.*}} extractelement
; AVX2:  cost of 1 {{.*}} extractelement

  %r = extractelement <2 x i64> %bin.rdx8, i32 0
  ret i64 %r
}

; Two-level pairwise add reduction of a <4 x i32>: each level adds an
; even-lane shuffle to an odd-lane shuffle of the previous value, and lane 0
; of the final sum is returned. %f1 is unused.
define fastcc i32 @pairwise_reduction4i32(<4 x i32> %rdx, i32 %f1) {
  ; Level 1: lanes (0,2) + lanes (1,3) of the input.
  %rdx.shuf.0.0 = shufflevector <4 x i32> %rdx, <4 x i32> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
  %rdx.shuf.0.1 = shufflevector <4 x i32> %rdx, <4 x i32> undef, <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
  %bin.rdx = add <4 x i32> %rdx.shuf.0.0, %rdx.shuf.0.1
  ; Level 2: lane 0 + lane 1 of the level-1 result.
  %rdx.shuf.1.0 = shufflevector <4 x i32> %bin.rdx, <4 x i32> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
  %rdx.shuf.1.1 = shufflevector <4 x i32> %bin.rdx, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %bin.rdx8 = add <4 x i32> %rdx.shuf.1.0, %rdx.shuf.1.1

; SSE3:  cost of 3 {{.*}} extractelement
; AVX:  cost of 3 {{.*}} extractelement
; AVX2:  cost of 3 {{.*}} extractelement

  %r = extractelement <4 x i32> %bin.rdx8, i32 0
  ret i32 %r
}

; Two-level pairwise add reduction of a <4 x i64>: each level adds an
; even-lane shuffle to an odd-lane shuffle of the previous value, and lane 0
; of the final sum is returned. %f1 is unused.
define fastcc i64 @pairwise_reduction4i64(<4 x i64> %rdx, i64 %f1) {
  ; Level 1: lanes (0,2) + lanes (1,3) of the input.
  %rdx.shuf.0.0 = shufflevector <4 x i64> %rdx, <4 x i64> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
  %rdx.shuf.0.1 = shufflevector <4 x i64> %rdx, <4 x i64> undef, <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
  %bin.rdx = add <4 x i64> %rdx.shuf.0.0, %rdx.shuf.0.1
  ; Level 2: lane 0 + lane 1 of the level-1 result.
  %rdx.shuf.1.0 = shufflevector <4 x i64> %bin.rdx, <4 x i64> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
  %rdx.shuf.1.1 = shufflevector <4 x i64> %bin.rdx, <4 x i64> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %bin.rdx8 = add <4 x i64> %rdx.shuf.1.0, %rdx.shuf.1.1

; AVX:  cost of 5 {{.*}} extractelement
; AVX2:  cost of 5 {{.*}} extractelement

  %r = extractelement <4 x i64> %bin.rdx8, i32 0
  ret i64 %r
}

; Three-level pairwise add reduction of an <8 x i16>: each level adds an
; even-lane shuffle to an odd-lane shuffle of the previous value, and lane 0
; of the final sum is returned. %f1 is unused.
define fastcc i16 @pairwise_reduction8i16(<8 x i16> %rdx, i16 %f1) {
  ; Level 1: lanes (0,2,4,6) + lanes (1,3,5,7) of the input.
  %rdx.shuf.0.0 = shufflevector <8 x i16> %rdx, <8 x i16> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 undef, i32 undef, i32 undef, i32 undef>
  %rdx.shuf.0.1 = shufflevector <8 x i16> %rdx, <8 x i16> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx = add <8 x i16> %rdx.shuf.0.0, %rdx.shuf.0.1
  ; Level 2: lanes (0,2) + lanes (1,3) of the level-1 result.
  %rdx.shuf.1.0 = shufflevector <8 x i16> %bin.rdx, <8 x i16> undef, <8 x i32> <i32 0, i32 2, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %rdx.shuf.1.1 = shufflevector <8 x i16> %bin.rdx, <8 x i16> undef, <8 x i32> <i32 1, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx8 = add <8 x i16> %rdx.shuf.1.0, %rdx.shuf.1.1
  ; Level 3: lane 0 + lane 1 of the level-2 result.
  %rdx.shuf.2.0 = shufflevector <8 x i16> %bin.rdx8, <8 x i16> undef, <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %rdx.shuf.2.1 = shufflevector <8 x i16> %bin.rdx8, <8 x i16> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx9 = add <8 x i16> %rdx.shuf.2.0, %rdx.shuf.2.1

; SSE3:  cost of 5 {{.*}} extractelement
; AVX:  cost of 5 {{.*}} extractelement
; AVX2:  cost of 5 {{.*}} extractelement

  %r = extractelement <8 x i16> %bin.rdx9, i32 0
  ret i16 %r
}

; Three-level pairwise add reduction of an <8 x i32>: each level adds an
; even-lane shuffle to an odd-lane shuffle of the previous value, and lane 0
; of the final sum is returned. %f1 is unused.
define fastcc i32 @pairwise_reduction8i32(<8 x i32> %rdx, i32 %f1) {
  ; Level 1: lanes (0,2,4,6) + lanes (1,3,5,7) of the input.
  %rdx.shuf.0.0 = shufflevector <8 x i32> %rdx, <8 x i32> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 undef, i32 undef, i32 undef, i32 undef>
  %rdx.shuf.0.1 = shufflevector <8 x i32> %rdx, <8 x i32> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx = add <8 x i32> %rdx.shuf.0.0, %rdx.shuf.0.1
  ; Level 2: lanes (0,2) + lanes (1,3) of the level-1 result.
  %rdx.shuf.1.0 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> <i32 0, i32 2, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %rdx.shuf.1.1 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> <i32 1, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx8 = add <8 x i32> %rdx.shuf.1.0, %rdx.shuf.1.1
  ; Level 3: lane 0 + lane 1 of the level-2 result.
  %rdx.shuf.2.0 = shufflevector <8 x i32> %bin.rdx8, <8 x i32> undef, <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %rdx.shuf.2.1 = shufflevector <8 x i32> %bin.rdx8, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx9 = add <8 x i32> %rdx.shuf.2.0, %rdx.shuf.2.1

; AVX:  cost of 5 {{.*}} extractelement
; AVX2:  cost of 5 {{.*}} extractelement

  %r = extractelement <8 x i32> %bin.rdx9, i32 0
  ret i32 %r
}