; RUN: opt < %s -cost-model -costmodel-reduxcost=true -analyze -mcpu=core2 -mtriple=x86_64-apple-darwin | FileCheck %s
; RUN: opt < %s -cost-model -costmodel-reduxcost=true -analyze -mcpu=corei7 -mtriple=x86_64-apple-darwin | FileCheck %s --check-prefix=SSE3
; RUN: opt < %s -cost-model -costmodel-reduxcost=true -analyze -mcpu=corei7-avx -mtriple=x86_64-apple-darwin | FileCheck %s --check-prefix=AVX
; RUN: opt < %s -cost-model -costmodel-reduxcost=true -analyze -mcpu=core-avx2 -mtriple=x86_64-apple-darwin | FileCheck %s --check-prefix=AVX2

; Cost-model test for vector reductions on x86. Every function builds a
; shuffle/add tree over its vector argument and extracts lane 0; the expected
; cost is checked on that final extractelement.
;
; The core2 run uses the default check prefix and only covers the first five
; functions. The corei7, corei7-avx and core-avx2 runs use their own prefixes
; and cover the no_pairwise_* and pairwise_* functions. Those directives carry
; no label, so they are matched in file order.

; Splitting reduction of <4 x float>: each step adds the vector to a shuffle
; that moves its upper lanes down.
define fastcc float @reduction_cost_float(<4 x float> %rdx) {
  %rdx.shuf = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
  %bin.rdx = fadd <4 x float> %rdx, %rdx.shuf
  %rdx.shuf7 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %bin.rdx8 = fadd <4 x float> %bin.rdx, %rdx.shuf7

; Check that we recognize the tree starting at the extractelement as a
; reduction.
; CHECK-LABEL: reduction_cost
; CHECK: cost of 9 {{.*}} extractelement

  %r = extractelement <4 x float> %bin.rdx8, i32 0
  ret float %r
}

; Splitting reduction of <8 x i32> in three shuffle/add steps.
define fastcc i32 @reduction_cost_int(<8 x i32> %rdx) {
  %rdx.shuf = shufflevector <8 x i32> %rdx, <8 x i32> undef,
   <8 x i32> <i32 4, i32 5, i32 6, i32 7,
              i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx = add <8 x i32> %rdx, %rdx.shuf
  %rdx.shuf.2 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef,
   <8 x i32> <i32 2, i32 3, i32 undef, i32 undef,
              i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx.2 = add <8 x i32> %bin.rdx, %rdx.shuf.2
  %rdx.shuf.3 = shufflevector <8 x i32> %bin.rdx.2, <8 x i32> undef,
   <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef,
              i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx.3 = add <8 x i32> %bin.rdx.2, %rdx.shuf.3

; CHECK-LABEL: reduction_cost_int
; CHECK: cost of 23 {{.*}} extractelement

  %r = extractelement <8 x i32> %bin.rdx.3, i32 0
  ret i32 %r
}

; Pairwise reduction of <4 x float>: each level adds the even-index lanes to
; the odd-index lanes. The extracted scalar has a further use (fadd with %f1).
define fastcc float @pairwise_hadd(<4 x float> %rdx, float %f1) {
  %rdx.shuf.0.0 = shufflevector <4 x float> %rdx, <4 x float> undef,
        <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
  %rdx.shuf.0.1 = shufflevector <4 x float> %rdx, <4 x float> undef,
        <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
  %bin.rdx.0 = fadd <4 x float> %rdx.shuf.0.0, %rdx.shuf.0.1
  %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef,
        <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
  %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef,
        <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %bin.rdx.1 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1

; CHECK-LABEL: pairwise_hadd
; CHECK: cost of 11 {{.*}} extractelement

  %r = extractelement <4 x float> %bin.rdx.1, i32 0
  %r2 = fadd float %r, %f1
  ret float %r2
}

; Same as pairwise_hadd, but the operands of the first fadd are swapped
; (odd-lane shuffle first).
define fastcc float @pairwise_hadd_assoc(<4 x float> %rdx, float %f1) {
  %rdx.shuf.0.0 = shufflevector <4 x float> %rdx, <4 x float> undef,
        <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
  %rdx.shuf.0.1 = shufflevector <4 x float> %rdx, <4 x float> undef,
        <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
  %bin.rdx.0 = fadd <4 x float> %rdx.shuf.0.1, %rdx.shuf.0.0
  %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef,
        <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
  %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef,
        <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %bin.rdx.1 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1

; CHECK-LABEL: pairwise_hadd_assoc
; CHECK: cost of 11 {{.*}} extractelement

  %r = extractelement <4 x float> %bin.rdx.1, i32 0
  %r2 = fadd float %r, %f1
  ret float %r2
}

; Same as pairwise_hadd, but the last level uses %bin.rdx.0 directly as its
; first operand instead of a shuffle selecting lane 0.
define fastcc float @pairwise_hadd_skip_first(<4 x float> %rdx, float %f1) {
  %rdx.shuf.0.0 = shufflevector <4 x float> %rdx, <4 x float> undef,
        <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
  %rdx.shuf.0.1 = shufflevector <4 x float> %rdx, <4 x float> undef,
        <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
  %bin.rdx.0 = fadd <4 x float> %rdx.shuf.0.0, %rdx.shuf.0.1
  %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef,
        <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %bin.rdx.1 = fadd <4 x float> %bin.rdx.0, %rdx.shuf.1.1

; CHECK-LABEL: pairwise_hadd_skip_first
; CHECK: cost of 11 {{.*}} extractelement

  %r = extractelement <4 x float> %bin.rdx.1, i32 0
  %r2 = fadd float %r, %f1
  ret float %r2
}

; ---------------------------------------------------------------------------
; Splitting (non-pairwise) reductions, one function per vector type.
; ---------------------------------------------------------------------------

define fastcc double @no_pairwise_reduction2double(<2 x double> %rdx, double %f1) {
  %rdx.shuf = shufflevector <2 x double> %rdx, <2 x double> undef, <2 x i32> <i32 1, i32 undef>
  %bin.rdx = fadd <2 x double> %rdx, %rdx.shuf

; SSE3: cost of 2 {{.*}} extractelement
; AVX: cost of 2 {{.*}} extractelement
; AVX2: cost of 2 {{.*}} extractelement

  %r = extractelement <2 x double> %bin.rdx, i32 0
  ret double %r
}

define fastcc float @no_pairwise_reduction4float(<4 x float> %rdx, float %f1) {
  %rdx.shuf = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
  %bin.rdx = fadd <4 x float> %rdx, %rdx.shuf
  %rdx.shuf7 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %bin.rdx8 = fadd <4 x float> %bin.rdx, %rdx.shuf7

; SSE3: cost of 4 {{.*}} extractelement
; AVX: cost of 3 {{.*}} extractelement
; AVX2: cost of 3 {{.*}} extractelement

  %r = extractelement <4 x float> %bin.rdx8, i32 0
  ret float %r
}

define fastcc double @no_pairwise_reduction4double(<4 x double> %rdx, double %f1) {
  %rdx.shuf = shufflevector <4 x double> %rdx, <4 x double> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
  %bin.rdx = fadd <4 x double> %rdx, %rdx.shuf
  %rdx.shuf7 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %bin.rdx8 = fadd <4 x double> %bin.rdx, %rdx.shuf7

; AVX: cost of 3 {{.*}} extractelement
; AVX2: cost of 3 {{.*}} extractelement

  %r = extractelement <4 x double> %bin.rdx8, i32 0
  ret double %r
}

define fastcc float @no_pairwise_reduction8float(<8 x float> %rdx, float %f1) {
  %rdx.shuf3 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx4 = fadd <8 x float> %rdx, %rdx.shuf3
  %rdx.shuf = shufflevector <8 x float> %bin.rdx4, <8 x float> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx = fadd <8 x float> %bin.rdx4, %rdx.shuf
  %rdx.shuf7 = shufflevector <8 x float> %bin.rdx, <8 x float> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx8 = fadd <8 x float> %bin.rdx, %rdx.shuf7

; AVX: cost of 4 {{.*}} extractelement
; AVX2: cost of 4 {{.*}} extractelement

  %r = extractelement <8 x float> %bin.rdx8, i32 0
  ret float %r
}

define fastcc i64 @no_pairwise_reduction2i64(<2 x i64> %rdx, i64 %f1) {
  %rdx.shuf = shufflevector <2 x i64> %rdx, <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
  %bin.rdx = add <2 x i64> %rdx, %rdx.shuf

; SSE3: cost of 2 {{.*}} extractelement
; AVX: cost of 1 {{.*}} extractelement
; AVX2: cost of 1 {{.*}} extractelement

  %r = extractelement <2 x i64> %bin.rdx, i32 0
  ret i64 %r
}

define fastcc i32 @no_pairwise_reduction4i32(<4 x i32> %rdx, i32 %f1) {
  %rdx.shuf = shufflevector <4 x i32> %rdx, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
  %bin.rdx = add <4 x i32> %rdx, %rdx.shuf
  %rdx.shuf7 = shufflevector <4 x i32> %bin.rdx, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %bin.rdx8 = add <4 x i32> %bin.rdx, %rdx.shuf7

; SSE3: cost of 3 {{.*}} extractelement
; AVX: cost of 3 {{.*}} extractelement
; AVX2: cost of 3 {{.*}} extractelement

  %r = extractelement <4 x i32> %bin.rdx8, i32 0
  ret i32 %r
}

define fastcc i64 @no_pairwise_reduction4i64(<4 x i64> %rdx, i64 %f1) {
  %rdx.shuf = shufflevector <4 x i64> %rdx, <4 x i64> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
  %bin.rdx = add <4 x i64> %rdx, %rdx.shuf
  %rdx.shuf7 = shufflevector <4 x i64> %bin.rdx, <4 x i64> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %bin.rdx8 = add <4 x i64> %bin.rdx, %rdx.shuf7

; AVX: cost of 3 {{.*}} extractelement
; AVX2: cost of 3 {{.*}} extractelement

  %r = extractelement <4 x i64> %bin.rdx8, i32 0
  ret i64 %r
}

define fastcc i16 @no_pairwise_reduction8i16(<8 x i16> %rdx, i16 %f1) {
  %rdx.shuf3 = shufflevector <8 x i16> %rdx, <8 x i16> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx4 = add <8 x i16> %rdx, %rdx.shuf3
  %rdx.shuf = shufflevector <8 x i16> %bin.rdx4, <8 x i16> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx = add <8 x i16> %bin.rdx4, %rdx.shuf
  %rdx.shuf7 = shufflevector <8 x i16> %bin.rdx, <8 x i16> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx8 = add <8 x i16> %bin.rdx, %rdx.shuf7

; SSE3: cost of 4 {{.*}} extractelement
; AVX: cost of 4 {{.*}} extractelement
; AVX2: cost of 4 {{.*}} extractelement

  %r = extractelement <8 x i16> %bin.rdx8, i32 0
  ret i16 %r
}

define fastcc i32 @no_pairwise_reduction8i32(<8 x i32> %rdx, i32 %f1) {
  %rdx.shuf3 = shufflevector <8 x i32> %rdx, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx4 = add <8 x i32> %rdx, %rdx.shuf3
  %rdx.shuf = shufflevector <8 x i32> %bin.rdx4, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx = add <8 x i32> %bin.rdx4, %rdx.shuf
  %rdx.shuf7 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx8 = add <8 x i32> %bin.rdx, %rdx.shuf7

; AVX: cost of 5 {{.*}} extractelement
; AVX2: cost of 5 {{.*}} extractelement

  %r = extractelement <8 x i32> %bin.rdx8, i32 0
  ret i32 %r
}

; ---------------------------------------------------------------------------
; Pairwise reductions (even-index lanes added to odd-index lanes at every
; level), one function per vector type.
; ---------------------------------------------------------------------------

define fastcc double @pairwise_reduction2double(<2 x double> %rdx, double %f1) {
  %rdx.shuf.1.0 = shufflevector <2 x double> %rdx, <2 x double> undef, <2 x i32> <i32 0, i32 undef>
  %rdx.shuf.1.1 = shufflevector <2 x double> %rdx, <2 x double> undef, <2 x i32> <i32 1, i32 undef>
  %bin.rdx8 = fadd <2 x double> %rdx.shuf.1.0, %rdx.shuf.1.1

; SSE3: cost of 2 {{.*}} extractelement
; AVX: cost of 2 {{.*}} extractelement
; AVX2: cost of 2 {{.*}} extractelement

  %r = extractelement <2 x double> %bin.rdx8, i32 0
  ret double %r
}

define fastcc float @pairwise_reduction4float(<4 x float> %rdx, float %f1) {
  %rdx.shuf.0.0 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
  %rdx.shuf.0.1 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
  %bin.rdx = fadd <4 x float> %rdx.shuf.0.0, %rdx.shuf.0.1
  %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
  %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %bin.rdx8 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1

; SSE3: cost of 4 {{.*}} extractelement
; AVX: cost of 4 {{.*}} extractelement
; AVX2: cost of 4 {{.*}} extractelement

  %r = extractelement <4 x float> %bin.rdx8, i32 0
  ret float %r
}

define fastcc double @pairwise_reduction4double(<4 x double> %rdx, double %f1) {
  %rdx.shuf.0.0 = shufflevector <4 x double> %rdx, <4 x double> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
  %rdx.shuf.0.1 = shufflevector <4 x double> %rdx, <4 x double> undef, <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
  %bin.rdx = fadd <4 x double> %rdx.shuf.0.0, %rdx.shuf.0.1
  %rdx.shuf.1.0 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
  %rdx.shuf.1.1 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %bin.rdx8 = fadd <4 x double> %rdx.shuf.1.0, %rdx.shuf.1.1

; AVX: cost of 5 {{.*}} extractelement
; AVX2: cost of 5 {{.*}} extractelement

  %r = extractelement <4 x double> %bin.rdx8, i32 0
  ret double %r
}

define fastcc float @pairwise_reduction8float(<8 x float> %rdx, float %f1) {
  %rdx.shuf.0.0 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 undef, i32 undef, i32 undef, i32 undef>
  %rdx.shuf.0.1 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx = fadd <8 x float> %rdx.shuf.0.0, %rdx.shuf.0.1
  %rdx.shuf.1.0 = shufflevector <8 x float> %bin.rdx, <8 x float> undef, <8 x i32> <i32 0, i32 2, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %rdx.shuf.1.1 = shufflevector <8 x float> %bin.rdx, <8 x float> undef, <8 x i32> <i32 1, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx8 = fadd <8 x float> %rdx.shuf.1.0, %rdx.shuf.1.1
  %rdx.shuf.2.0 = shufflevector <8 x float> %bin.rdx8, <8 x float> undef, <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %rdx.shuf.2.1 = shufflevector <8 x float> %bin.rdx8, <8 x float> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx9 = fadd <8 x float> %rdx.shuf.2.0, %rdx.shuf.2.1

; AVX: cost of 7 {{.*}} extractelement
; AVX2: cost of 7 {{.*}} extractelement

  %r = extractelement <8 x float> %bin.rdx9, i32 0
  ret float %r
}

define fastcc i64 @pairwise_reduction2i64(<2 x i64> %rdx, i64 %f1) {
  %rdx.shuf.1.0 = shufflevector <2 x i64> %rdx, <2 x i64> undef, <2 x i32> <i32 0, i32 undef>
  %rdx.shuf.1.1 = shufflevector <2 x i64> %rdx, <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
  %bin.rdx8 = add <2 x i64> %rdx.shuf.1.0, %rdx.shuf.1.1

; SSE3: cost of 2 {{.*}} extractelement
; AVX: cost of 1 {{.*}} extractelement
; AVX2: cost of 1 {{.*}} extractelement

  %r = extractelement <2 x i64> %bin.rdx8, i32 0
  ret i64 %r
}

define fastcc i32 @pairwise_reduction4i32(<4 x i32> %rdx, i32 %f1) {
  %rdx.shuf.0.0 = shufflevector <4 x i32> %rdx, <4 x i32> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
  %rdx.shuf.0.1 = shufflevector <4 x i32> %rdx, <4 x i32> undef, <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
  %bin.rdx = add <4 x i32> %rdx.shuf.0.0, %rdx.shuf.0.1
  %rdx.shuf.1.0 = shufflevector <4 x i32> %bin.rdx, <4 x i32> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
  %rdx.shuf.1.1 = shufflevector <4 x i32> %bin.rdx, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %bin.rdx8 = add <4 x i32> %rdx.shuf.1.0, %rdx.shuf.1.1

; SSE3: cost of 3 {{.*}} extractelement
; AVX: cost of 3 {{.*}} extractelement
; AVX2: cost of 3 {{.*}} extractelement

  %r = extractelement <4 x i32> %bin.rdx8, i32 0
  ret i32 %r
}

define fastcc i64 @pairwise_reduction4i64(<4 x i64> %rdx, i64 %f1) {
  %rdx.shuf.0.0 = shufflevector <4 x i64> %rdx, <4 x i64> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
  %rdx.shuf.0.1 = shufflevector <4 x i64> %rdx, <4 x i64> undef, <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
  %bin.rdx = add <4 x i64> %rdx.shuf.0.0, %rdx.shuf.0.1
  %rdx.shuf.1.0 = shufflevector <4 x i64> %bin.rdx, <4 x i64> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
  %rdx.shuf.1.1 = shufflevector <4 x i64> %bin.rdx, <4 x i64> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %bin.rdx8 = add <4 x i64> %rdx.shuf.1.0, %rdx.shuf.1.1

; AVX: cost of 5 {{.*}} extractelement
; AVX2: cost of 5 {{.*}} extractelement

  %r = extractelement <4 x i64> %bin.rdx8, i32 0
  ret i64 %r
}

define fastcc i16 @pairwise_reduction8i16(<8 x i16> %rdx, i16 %f1) {
  %rdx.shuf.0.0 = shufflevector <8 x i16> %rdx, <8 x i16> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 undef, i32 undef, i32 undef, i32 undef>
  %rdx.shuf.0.1 = shufflevector <8 x i16> %rdx, <8 x i16> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx = add <8 x i16> %rdx.shuf.0.0, %rdx.shuf.0.1
  %rdx.shuf.1.0 = shufflevector <8 x i16> %bin.rdx, <8 x i16> undef, <8 x i32> <i32 0, i32 2, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %rdx.shuf.1.1 = shufflevector <8 x i16> %bin.rdx, <8 x i16> undef, <8 x i32> <i32 1, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx8 = add <8 x i16> %rdx.shuf.1.0, %rdx.shuf.1.1
  %rdx.shuf.2.0 = shufflevector <8 x i16> %bin.rdx8, <8 x i16> undef, <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %rdx.shuf.2.1 = shufflevector <8 x i16> %bin.rdx8, <8 x i16> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx9 = add <8 x i16> %rdx.shuf.2.0, %rdx.shuf.2.1

; SSE3: cost of 5 {{.*}} extractelement
; AVX: cost of 5 {{.*}} extractelement
; AVX2: cost of 5 {{.*}} extractelement

  %r = extractelement <8 x i16> %bin.rdx9, i32 0
  ret i16 %r
}

define fastcc i32 @pairwise_reduction8i32(<8 x i32> %rdx, i32 %f1) {
  %rdx.shuf.0.0 = shufflevector <8 x i32> %rdx, <8 x i32> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 undef, i32 undef, i32 undef, i32 undef>
  %rdx.shuf.0.1 = shufflevector <8 x i32> %rdx, <8 x i32> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx = add <8 x i32> %rdx.shuf.0.0, %rdx.shuf.0.1
  %rdx.shuf.1.0 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> <i32 0, i32 2, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %rdx.shuf.1.1 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> <i32 1, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx8 = add <8 x i32> %rdx.shuf.1.0, %rdx.shuf.1.1
  %rdx.shuf.2.0 = shufflevector <8 x i32> %bin.rdx8, <8 x i32> undef, <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %rdx.shuf.2.1 = shufflevector <8 x i32> %bin.rdx8, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx9 = add <8 x i32> %rdx.shuf.2.0, %rdx.shuf.2.1

; AVX: cost of 5 {{.*}} extractelement
; AVX2: cost of 5 {{.*}} extractelement

  %r = extractelement <8 x i32> %bin.rdx9, i32 0
  ret i32 %r
}