; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3 | FileCheck %s --check-prefixes=SSSE3-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3,fast-hops | FileCheck %s --check-prefixes=SSSE3-FAST
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX-SLOW,AVX1-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx,fast-hops | FileCheck %s --check-prefixes=AVX-FAST,AVX1-FAST
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX-SLOW,AVX2-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2,fast-hops | FileCheck %s --check-prefixes=AVX-FAST,AVX2-FAST

; Vectorized Pairwise Sum Reductions
; e.g.
; inline STYPE sum(VTYPE x) {
;   return (x[0] + x[1]) + (x[2] + x[3]);
; }
;
; VTYPE sum4(VTYPE A0, VTYPE A1, VTYPE A2, VTYPE A3) {
;   return (VTYPE) { sum( A0 ), sum( A1 ), sum( A2 ), sum( A3 ) };
; }
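;
; A concrete instantiation of the pattern above, as a sketch only (it assumes
; Clang/GCC vector extensions; the v4sf typedef is illustrative and not used by
; this test):
;
;   typedef float v4sf __attribute__((vector_size(16)));
;
;   static inline float sum(v4sf x) {
;     return (x[0] + x[1]) + (x[2] + x[3]);   // pairwise association
;   }
;
;   v4sf sum4(v4sf A0, v4sf A1, v4sf A2, v4sf A3) {
;     return (v4sf) { sum( A0 ), sum( A1 ), sum( A2 ), sum( A3 ) };
;   }
;
; The pairwise association is the one that maps directly onto the haddps/phaddd
; sequences checked below.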

define <4 x float> @pair_sum_v4f32_v4f32(<4 x float> %0, <4 x float> %1, <4 x float> %2, <4 x float> %3) {
; SSSE3-SLOW-LABEL: pair_sum_v4f32_v4f32:
; SSSE3-SLOW: # %bb.0:
; SSSE3-SLOW-NEXT: haddps %xmm1, %xmm0
; SSSE3-SLOW-NEXT: haddps %xmm2, %xmm3
; SSSE3-SLOW-NEXT: haddps %xmm3, %xmm0
; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1,3,2]
; SSSE3-SLOW-NEXT: retq
;
; SSSE3-FAST-LABEL: pair_sum_v4f32_v4f32:
; SSSE3-FAST: # %bb.0:
; SSSE3-FAST-NEXT: haddps %xmm1, %xmm0
; SSSE3-FAST-NEXT: haddps %xmm3, %xmm2
; SSSE3-FAST-NEXT: haddps %xmm2, %xmm0
; SSSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: pair_sum_v4f32_v4f32:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vhaddps %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: vhaddps %xmm2, %xmm2, %xmm1
; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm0[0,2],xmm1[0,1]
; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,1]
; AVX-SLOW-NEXT: vhaddps %xmm3, %xmm3, %xmm1
; AVX-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
; AVX-SLOW-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[0]
; AVX-SLOW-NEXT: vaddps %xmm0, %xmm1, %xmm0
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: pair_sum_v4f32_v4f32:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhaddps %xmm1, %xmm0, %xmm0
; AVX-FAST-NEXT: vhaddps %xmm3, %xmm2, %xmm1
; AVX-FAST-NEXT: vhaddps %xmm1, %xmm0, %xmm0
; AVX-FAST-NEXT: retq
  %5 = shufflevector <4 x float> %0, <4 x float> poison, <2 x i32> <i32 0, i32 2>
  %6 = shufflevector <4 x float> %0, <4 x float> poison, <2 x i32> <i32 1, i32 3>
  %7 = fadd <2 x float> %5, %6
  %8 = shufflevector <2 x float> %7, <2 x float> poison, <2 x i32> <i32 1, i32 undef>
  %9 = fadd <2 x float> %7, %8
  %10 = shufflevector <4 x float> %1, <4 x float> poison, <2 x i32> <i32 0, i32 2>
  %11 = shufflevector <4 x float> %1, <4 x float> poison, <2 x i32> <i32 1, i32 3>
  %12 = fadd <2 x float> %10, %11
  %13 = shufflevector <2 x float> %12, <2 x float> poison, <2 x i32> <i32 1, i32 undef>
  %14 = fadd <2 x float> %12, %13
  %15 = shufflevector <2 x float> %9, <2 x float> %14, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
  %16 = shufflevector <4 x float> %2, <4 x float> poison, <2 x i32> <i32 0, i32 2>
  %17 = shufflevector <4 x float> %2, <4 x float> poison, <2 x i32> <i32 1, i32 3>
  %18 = fadd <2 x float> %16, %17
  %19 = shufflevector <2 x float> %18, <2 x float> poison, <2 x i32> <i32 1, i32 undef>
  %20 = fadd <2 x float> %18, %19
  %21 = shufflevector <2 x float> %20, <2 x float> poison, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
  %22 = shufflevector <4 x float> %15, <4 x float> %21, <4 x i32> <i32 0, i32 1, i32 4, i32 undef>
  %23 = shufflevector <4 x float> %3, <4 x float> poison, <2 x i32> <i32 0, i32 2>
  %24 = shufflevector <4 x float> %3, <4 x float> poison, <2 x i32> <i32 1, i32 3>
  %25 = fadd <2 x float> %23, %24
  %26 = shufflevector <2 x float> %25, <2 x float> poison, <2 x i32> <i32 1, i32 undef>
  %27 = fadd <2 x float> %25, %26
  %28 = shufflevector <2 x float> %27, <2 x float> poison, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
  %29 = shufflevector <4 x float> %22, <4 x float> %28, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
  ret <4 x float> %29
}

define <4 x i32> @pair_sum_v4i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2, <4 x i32> %3) {
; SSSE3-SLOW-LABEL: pair_sum_v4i32_v4i32:
; SSSE3-SLOW: # %bb.0:
; SSSE3-SLOW-NEXT: phaddd %xmm1, %xmm0
; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,1,3]
; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; SSSE3-SLOW-NEXT: paddd %xmm1, %xmm0
; SSSE3-SLOW-NEXT: phaddd %xmm2, %xmm3
; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
; SSSE3-SLOW-NEXT: paddd %xmm3, %xmm1
; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
; SSSE3-SLOW-NEXT: retq
;
; SSSE3-FAST-LABEL: pair_sum_v4i32_v4i32:
; SSSE3-FAST: # %bb.0:
; SSSE3-FAST-NEXT: phaddd %xmm1, %xmm0
; SSSE3-FAST-NEXT: phaddd %xmm3, %xmm2
; SSSE3-FAST-NEXT: phaddd %xmm2, %xmm0
; SSSE3-FAST-NEXT: retq
;
; AVX1-SLOW-LABEL: pair_sum_v4i32_v4i32:
; AVX1-SLOW: # %bb.0:
; AVX1-SLOW-NEXT: vphaddd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,3,1,3]
; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vphaddd %xmm2, %xmm2, %xmm1
; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,1,1]
; AVX1-SLOW-NEXT: vpaddd %xmm2, %xmm1, %xmm1
; AVX1-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX1-SLOW-NEXT: vphaddd %xmm3, %xmm3, %xmm1
; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[0,0,0,0]
; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm2, %xmm1
; AVX1-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7]
; AVX1-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: pair_sum_v4i32_v4i32:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vphaddd %xmm1, %xmm0, %xmm0
; AVX-FAST-NEXT: vphaddd %xmm3, %xmm2, %xmm1
; AVX-FAST-NEXT: vphaddd %xmm1, %xmm0, %xmm0
; AVX-FAST-NEXT: retq
;
; AVX2-SLOW-LABEL: pair_sum_v4i32_v4i32:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vphaddd %xmm1, %xmm0, %xmm0
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,3,1,3]
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX2-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX2-SLOW-NEXT: vphaddd %xmm2, %xmm2, %xmm1
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,1,1]
; AVX2-SLOW-NEXT: vpaddd %xmm2, %xmm1, %xmm1
; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-SLOW-NEXT: vphaddd %xmm3, %xmm3, %xmm1
; AVX2-SLOW-NEXT: vpbroadcastd %xmm1, %xmm2
; AVX2-SLOW-NEXT: vpaddd %xmm1, %xmm2, %xmm1
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
; AVX2-SLOW-NEXT: retq
  %5 = shufflevector <4 x i32> %0, <4 x i32> poison, <2 x i32> <i32 0, i32 2>
  %6 = shufflevector <4 x i32> %0, <4 x i32> poison, <2 x i32> <i32 1, i32 3>
  %7 = add <2 x i32> %5, %6
  %8 = shufflevector <2 x i32> %7, <2 x i32> poison, <2 x i32> <i32 1, i32 undef>
  %9 = add <2 x i32> %7, %8
  %10 = shufflevector <4 x i32> %1, <4 x i32> poison, <2 x i32> <i32 0, i32 2>
  %11 = shufflevector <4 x i32> %1, <4 x i32> poison, <2 x i32> <i32 1, i32 3>
  %12 = add <2 x i32> %10, %11
  %13 = shufflevector <2 x i32> %12, <2 x i32> poison, <2 x i32> <i32 1, i32 undef>
  %14 = add <2 x i32> %12, %13
  %15 = shufflevector <2 x i32> %9, <2 x i32> %14, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
  %16 = shufflevector <4 x i32> %2, <4 x i32> poison, <2 x i32> <i32 0, i32 2>
  %17 = shufflevector <4 x i32> %2, <4 x i32> poison, <2 x i32> <i32 1, i32 3>
  %18 = add <2 x i32> %16, %17
  %19 = shufflevector <2 x i32> %18, <2 x i32> poison, <2 x i32> <i32 1, i32 undef>
  %20 = add <2 x i32> %18, %19
  %21 = shufflevector <2 x i32> %20, <2 x i32> poison, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
  %22 = shufflevector <4 x i32> %15, <4 x i32> %21, <4 x i32> <i32 0, i32 1, i32 4, i32 undef>
  %23 = shufflevector <4 x i32> %3, <4 x i32> poison, <2 x i32> <i32 0, i32 2>
  %24 = shufflevector <4 x i32> %3, <4 x i32> poison, <2 x i32> <i32 1, i32 3>
  %25 = add <2 x i32> %23, %24
  %26 = shufflevector <2 x i32> %25, <2 x i32> poison, <2 x i32> <i32 1, i32 undef>
  %27 = add <2 x i32> %25, %26
  %28 = shufflevector <2 x i32> %27, <2 x i32> poison, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
  %29 = shufflevector <4 x i32> %22, <4 x i32> %28, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
  ret <4 x i32> %29
}

define <8 x float> @pair_sum_v8f32_v4f32(<4 x float> %0, <4 x float> %1, <4 x float> %2, <4 x float> %3, <4 x float> %4, <4 x float> %5, <4 x float> %6, <4 x float> %7) {
; SSSE3-SLOW-LABEL: pair_sum_v8f32_v4f32:
; SSSE3-SLOW: # %bb.0:
; SSSE3-SLOW-NEXT: haddps %xmm1, %xmm0
; SSSE3-SLOW-NEXT: movaps %xmm0, %xmm1
; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,3],xmm0[1,3]
; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
; SSSE3-SLOW-NEXT: addps %xmm1, %xmm0
; SSSE3-SLOW-NEXT: haddps %xmm3, %xmm2
; SSSE3-SLOW-NEXT: haddps %xmm4, %xmm5
; SSSE3-SLOW-NEXT: haddps %xmm5, %xmm2
; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1,3,2]
; SSSE3-SLOW-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; SSSE3-SLOW-NEXT: haddps %xmm7, %xmm6
; SSSE3-SLOW-NEXT: haddps %xmm6, %xmm6
; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,3],xmm6[0,1]
; SSSE3-SLOW-NEXT: movaps %xmm2, %xmm1
; SSSE3-SLOW-NEXT: retq
;
; SSSE3-FAST-LABEL: pair_sum_v8f32_v4f32:
; SSSE3-FAST: # %bb.0:
; SSSE3-FAST-NEXT: haddps %xmm1, %xmm0
; SSSE3-FAST-NEXT: haddps %xmm0, %xmm0
; SSSE3-FAST-NEXT: haddps %xmm3, %xmm2
; SSSE3-FAST-NEXT: haddps %xmm5, %xmm4
; SSSE3-FAST-NEXT: haddps %xmm4, %xmm2
; SSSE3-FAST-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; SSSE3-FAST-NEXT: haddps %xmm7, %xmm6
; SSSE3-FAST-NEXT: haddps %xmm6, %xmm4
; SSSE3-FAST-NEXT: movaps %xmm4, %xmm1
; SSSE3-FAST-NEXT: retq
;
; AVX1-SLOW-LABEL: pair_sum_v8f32_v4f32:
; AVX1-SLOW: # %bb.0:
; AVX1-SLOW-NEXT: vhaddps %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm0[1,3,1,3]
; AVX1-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX1-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vhaddps %xmm4, %xmm4, %xmm1
; AVX1-SLOW-NEXT: vhaddps %xmm5, %xmm5, %xmm4
; AVX1-SLOW-NEXT: vhaddps %xmm3, %xmm2, %xmm2
; AVX1-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm2[0,2],xmm1[0,1]
; AVX1-SLOW-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[0]
; AVX1-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm2[1,3],xmm1[1,3]
; AVX1-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3]
; AVX1-SLOW-NEXT: vaddps %xmm1, %xmm3, %xmm1
; AVX1-SLOW-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX1-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX1-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX1-SLOW-NEXT: vhaddps %xmm7, %xmm6, %xmm2
; AVX1-SLOW-NEXT: vhaddps %xmm2, %xmm2, %xmm2
; AVX1-SLOW-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-SLOW-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[2]
; AVX1-SLOW-NEXT: retq
;
; AVX1-FAST-LABEL: pair_sum_v8f32_v4f32:
; AVX1-FAST: # %bb.0:
; AVX1-FAST-NEXT: vhaddps %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT: vhaddps %xmm4, %xmm4, %xmm1
; AVX1-FAST-NEXT: vhaddps %xmm5, %xmm5, %xmm4
; AVX1-FAST-NEXT: vhaddps %xmm3, %xmm2, %xmm2
; AVX1-FAST-NEXT: vshufps {{.*#+}} xmm3 = xmm2[0,2],xmm1[0,1]
; AVX1-FAST-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[0]
; AVX1-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm2[1,3],xmm1[1,3]
; AVX1-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3]
; AVX1-FAST-NEXT: vaddps %xmm1, %xmm3, %xmm1
; AVX1-FAST-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX1-FAST-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX1-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX1-FAST-NEXT: vhaddps %xmm7, %xmm6, %xmm2
; AVX1-FAST-NEXT: vhaddps %xmm0, %xmm2, %xmm2
; AVX1-FAST-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-FAST-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[2]
; AVX1-FAST-NEXT: retq
;
; AVX2-SLOW-LABEL: pair_sum_v8f32_v4f32:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vhaddps %xmm1, %xmm0, %xmm0
; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm0[1,3,1,3]
; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX2-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX2-SLOW-NEXT: vhaddps %xmm4, %xmm4, %xmm1
; AVX2-SLOW-NEXT: vhaddps %xmm5, %xmm5, %xmm8
; AVX2-SLOW-NEXT: vhaddps %xmm3, %xmm2, %xmm2
; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm2[0,2],xmm1[0,1]
; AVX2-SLOW-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm8[0]
; AVX2-SLOW-NEXT: vhaddps %xmm4, %xmm5, %xmm3
; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,3],xmm3[3,1]
; AVX2-SLOW-NEXT: vaddps %xmm2, %xmm1, %xmm1
; AVX2-SLOW-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX2-SLOW-NEXT: vhaddps %xmm7, %xmm6, %xmm2
; AVX2-SLOW-NEXT: vhaddps %xmm2, %xmm2, %xmm2
; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[2]
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-LABEL: pair_sum_v8f32_v4f32:
; AVX2-FAST: # %bb.0:
; AVX2-FAST-NEXT: vhaddps %xmm1, %xmm0, %xmm0
; AVX2-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX2-FAST-NEXT: vhaddps %xmm4, %xmm4, %xmm1
; AVX2-FAST-NEXT: vhaddps %xmm5, %xmm5, %xmm8
; AVX2-FAST-NEXT: vhaddps %xmm3, %xmm2, %xmm2
; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm2[0,2],xmm1[0,1]
; AVX2-FAST-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm8[0]
; AVX2-FAST-NEXT: vhaddps %xmm4, %xmm5, %xmm3
; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,3],xmm3[3,1]
; AVX2-FAST-NEXT: vaddps %xmm2, %xmm1, %xmm1
; AVX2-FAST-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-FAST-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX2-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX2-FAST-NEXT: vhaddps %xmm7, %xmm6, %xmm2
; AVX2-FAST-NEXT: vhaddps %xmm0, %xmm2, %xmm2
; AVX2-FAST-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX2-FAST-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[2]
; AVX2-FAST-NEXT: retq
  %9 = shufflevector <4 x float> %0, <4 x float> poison, <2 x i32> <i32 0, i32 2>
  %10 = shufflevector <4 x float> %0, <4 x float> poison, <2 x i32> <i32 1, i32 3>
  %11 = fadd <2 x float> %9, %10
  %12 = shufflevector <2 x float> %11, <2 x float> poison, <2 x i32> <i32 1, i32 undef>
  %13 = fadd <2 x float> %11, %12
  %14 = shufflevector <4 x float> %1, <4 x float> poison, <2 x i32> <i32 0, i32 2>
  %15 = shufflevector <4 x float> %1, <4 x float> poison, <2 x i32> <i32 1, i32 3>
  %16 = fadd <2 x float> %14, %15
  %17 = shufflevector <2 x float> %16, <2 x float> poison, <2 x i32> <i32 1, i32 undef>
  %18 = fadd <2 x float> %16, %17
  %19 = shufflevector <2 x float> %13, <2 x float> %18, <8 x i32> <i32 0, i32 2, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %20 = shufflevector <4 x float> %2, <4 x float> poison, <2 x i32> <i32 0, i32 2>
  %21 = shufflevector <4 x float> %2, <4 x float> poison, <2 x i32> <i32 1, i32 3>
  %22 = fadd <2 x float> %20, %21
  %23 = shufflevector <4 x float> %3, <4 x float> poison, <2 x i32> <i32 0, i32 2>
  %24 = shufflevector <4 x float> %3, <4 x float> poison, <2 x i32> <i32 1, i32 3>
  %25 = fadd <2 x float> %23, %24
  %26 = shufflevector <4 x float> %4, <4 x float> poison, <2 x i32> <i32 0, i32 2>
  %27 = shufflevector <4 x float> %4, <4 x float> poison, <2 x i32> <i32 1, i32 3>
  %28 = fadd <2 x float> %26, %27
  %29 = shufflevector <2 x float> %28, <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  %30 = shufflevector <4 x float> %5, <4 x float> poison, <2 x i32> <i32 0, i32 2>
  %31 = shufflevector <4 x float> %5, <4 x float> poison, <2 x i32> <i32 1, i32 3>
  %32 = fadd <2 x float> %30, %31
  %33 = shufflevector <2 x float> %32, <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  %34 = shufflevector <2 x float> %22, <2 x float> %25, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
  %35 = shufflevector <4 x float> %34, <4 x float> %29, <4 x i32> <i32 0, i32 1, i32 4, i32 undef>
  %36 = shufflevector <4 x float> %35, <4 x float> %33, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
  %37 = shufflevector <2 x float> %22, <2 x float> %25, <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
  %38 = shufflevector <4 x float> %37, <4 x float> %29, <4 x i32> <i32 0, i32 1, i32 5, i32 undef>
  %39 = shufflevector <4 x float> %38, <4 x float> %33, <4 x i32> <i32 0, i32 1, i32 2, i32 5>
  %40 = fadd <4 x float> %36, %39
  %41 = shufflevector <4 x float> %40, <4 x float> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
  %42 = shufflevector <8 x float> %19, <8 x float> %41, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 10, i32 11, i32 undef, i32 undef>
  %43 = shufflevector <4 x float> %6, <4 x float> poison, <2 x i32> <i32 0, i32 2>
  %44 = shufflevector <4 x float> %6, <4 x float> poison, <2 x i32> <i32 1, i32 3>
  %45 = fadd <2 x float> %43, %44
  %46 = shufflevector <4 x float> %7, <4 x float> poison, <2 x i32> <i32 0, i32 2>
  %47 = shufflevector <4 x float> %7, <4 x float> poison, <2 x i32> <i32 1, i32 3>
  %48 = fadd <2 x float> %46, %47
  %49 = shufflevector <2 x float> %45, <2 x float> %48, <2 x i32> <i32 0, i32 2>
  %50 = shufflevector <2 x float> %45, <2 x float> %48, <2 x i32> <i32 1, i32 3>
  %51 = fadd <2 x float> %49, %50
  %52 = shufflevector <2 x float> %51, <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %53 = shufflevector <8 x float> %42, <8 x float> %52, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
  ret <8 x float> %53
}

define <8 x i32> @pair_sum_v8i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2, <4 x i32> %3, <4 x i32> %4, <4 x i32> %5, <4 x i32> %6, <4 x i32> %7) {
; SSSE3-SLOW-LABEL: pair_sum_v8i32_v4i32:
; SSSE3-SLOW: # %bb.0:
; SSSE3-SLOW-NEXT: phaddd %xmm1, %xmm0
; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,1,3]
; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; SSSE3-SLOW-NEXT: paddd %xmm1, %xmm0
; SSSE3-SLOW-NEXT: phaddd %xmm3, %xmm2
; SSSE3-SLOW-NEXT: phaddd %xmm4, %xmm5
; SSSE3-SLOW-NEXT: phaddd %xmm5, %xmm2
; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,1,3,2]
; SSSE3-SLOW-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSSE3-SLOW-NEXT: phaddd %xmm7, %xmm6
; SSSE3-SLOW-NEXT: phaddd %xmm6, %xmm6
; SSSE3-SLOW-NEXT: palignr {{.*#+}} xmm6 = xmm1[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7]
; SSSE3-SLOW-NEXT: movdqa %xmm6, %xmm1
; SSSE3-SLOW-NEXT: retq
;
; SSSE3-FAST-LABEL: pair_sum_v8i32_v4i32:
; SSSE3-FAST: # %bb.0:
; SSSE3-FAST-NEXT: phaddd %xmm1, %xmm0
; SSSE3-FAST-NEXT: phaddd %xmm0, %xmm0
; SSSE3-FAST-NEXT: phaddd %xmm3, %xmm2
; SSSE3-FAST-NEXT: phaddd %xmm5, %xmm4
; SSSE3-FAST-NEXT: phaddd %xmm4, %xmm2
; SSSE3-FAST-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; SSSE3-FAST-NEXT: phaddd %xmm6, %xmm6
; SSSE3-FAST-NEXT: phaddd %xmm7, %xmm7
; SSSE3-FAST-NEXT: phaddd %xmm7, %xmm6
; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,3],xmm6[0,2]
; SSSE3-FAST-NEXT: movaps %xmm2, %xmm1
; SSSE3-FAST-NEXT: retq
;
; AVX1-SLOW-LABEL: pair_sum_v8i32_v4i32:
; AVX1-SLOW: # %bb.0:
; AVX1-SLOW-NEXT: vphaddd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,3,1,3]
; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vphaddd %xmm4, %xmm4, %xmm1
; AVX1-SLOW-NEXT: vphaddd %xmm5, %xmm5, %xmm4
; AVX1-SLOW-NEXT: vphaddd %xmm3, %xmm2, %xmm2
; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,2,2,3]
; AVX1-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm1[0]
; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[0,0,0,0]
; AVX1-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5],xmm5[6,7]
; AVX1-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm2[1,3],xmm1[1,3]
; AVX1-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3]
; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm3, %xmm1
; AVX1-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; AVX1-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX1-SLOW-NEXT: vphaddd %xmm7, %xmm6, %xmm2
; AVX1-SLOW-NEXT: vphaddd %xmm2, %xmm2, %xmm2
; AVX1-SLOW-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-SLOW-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[2]
; AVX1-SLOW-NEXT: retq
;
; AVX1-FAST-LABEL: pair_sum_v8i32_v4i32:
; AVX1-FAST: # %bb.0:
; AVX1-FAST-NEXT: vphaddd %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT: vphaddd %xmm4, %xmm4, %xmm1
; AVX1-FAST-NEXT: vphaddd %xmm5, %xmm5, %xmm4
; AVX1-FAST-NEXT: vphaddd %xmm3, %xmm2, %xmm2
; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,2,2,3]
; AVX1-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm1[0]
; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[0,0,0,0]
; AVX1-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5],xmm5[6,7]
; AVX1-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm2[1,3],xmm1[1,3]
; AVX1-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3]
; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm3, %xmm1
; AVX1-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; AVX1-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX1-FAST-NEXT: vphaddd %xmm7, %xmm6, %xmm2
; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm2, %xmm2
; AVX1-FAST-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-FAST-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[2]
; AVX1-FAST-NEXT: retq
;
; AVX2-SLOW-LABEL: pair_sum_v8i32_v4i32:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vphaddd %xmm1, %xmm0, %xmm0
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,3,1,3]
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX2-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX2-SLOW-NEXT: vphaddd %xmm4, %xmm4, %xmm1
; AVX2-SLOW-NEXT: vphaddd %xmm5, %xmm5, %xmm4
; AVX2-SLOW-NEXT: vphaddd %xmm3, %xmm2, %xmm2
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,2,2,3]
; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm1[0]
; AVX2-SLOW-NEXT: vpbroadcastd %xmm4, %xmm5
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1,2],xmm5[3]
; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm2[1,3],xmm1[1,1]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3]
; AVX2-SLOW-NEXT: vpaddd %xmm1, %xmm3, %xmm1
; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vphaddd %xmm7, %xmm6, %xmm1
; AVX2-SLOW-NEXT: vphaddd %xmm1, %xmm1, %xmm1
; AVX2-SLOW-NEXT: vpbroadcastq %xmm1, %ymm1
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-LABEL: pair_sum_v8i32_v4i32:
; AVX2-FAST: # %bb.0:
; AVX2-FAST-NEXT: vphaddd %xmm1, %xmm0, %xmm0
; AVX2-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
; AVX2-FAST-NEXT: vphaddd %xmm4, %xmm4, %xmm1
; AVX2-FAST-NEXT: vphaddd %xmm5, %xmm5, %xmm4
; AVX2-FAST-NEXT: vphaddd %xmm3, %xmm2, %xmm2
; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,2,2,3]
; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm1[0]
; AVX2-FAST-NEXT: vpbroadcastd %xmm4, %xmm5
; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1,2],xmm5[3]
; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm2[1,3],xmm1[1,1]
; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3]
; AVX2-FAST-NEXT: vpaddd %xmm1, %xmm3, %xmm1
; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-FAST-NEXT: vphaddd %xmm7, %xmm6, %xmm1
; AVX2-FAST-NEXT: vphaddd %xmm0, %xmm1, %xmm1
; AVX2-FAST-NEXT: vpbroadcastq %xmm1, %ymm1
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX2-FAST-NEXT: retq
  %9 = shufflevector <4 x i32> %0, <4 x i32> poison, <2 x i32> <i32 0, i32 2>
  %10 = shufflevector <4 x i32> %0, <4 x i32> poison, <2 x i32> <i32 1, i32 3>
  %11 = add <2 x i32> %9, %10
  %12 = shufflevector <2 x i32> %11, <2 x i32> poison, <2 x i32> <i32 1, i32 undef>
  %13 = add <2 x i32> %11, %12
  %14 = shufflevector <4 x i32> %1, <4 x i32> poison, <2 x i32> <i32 0, i32 2>
  %15 = shufflevector <4 x i32> %1, <4 x i32> poison, <2 x i32> <i32 1, i32 3>
  %16 = add <2 x i32> %14, %15
  %17 = shufflevector <2 x i32> %16, <2 x i32> poison, <2 x i32> <i32 1, i32 undef>
  %18 = add <2 x i32> %16, %17
  %19 = shufflevector <2 x i32> %13, <2 x i32> %18, <8 x i32> <i32 0, i32 2, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %20 = shufflevector <4 x i32> %2, <4 x i32> poison, <2 x i32> <i32 0, i32 2>
  %21 = shufflevector <4 x i32> %2, <4 x i32> poison, <2 x i32> <i32 1, i32 3>
  %22 = add <2 x i32> %20, %21
  %23 = shufflevector <4 x i32> %3, <4 x i32> poison, <2 x i32> <i32 0, i32 2>
  %24 = shufflevector <4 x i32> %3, <4 x i32> poison, <2 x i32> <i32 1, i32 3>
  %25 = add <2 x i32> %23, %24
  %26 = shufflevector <4 x i32> %4, <4 x i32> poison, <2 x i32> <i32 0, i32 2>
  %27 = shufflevector <4 x i32> %4, <4 x i32> poison, <2 x i32> <i32 1, i32 3>
  %28 = add <2 x i32> %26, %27
  %29 = shufflevector <2 x i32> %28, <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  %30 = shufflevector <4 x i32> %5, <4 x i32> poison, <2 x i32> <i32 0, i32 2>
  %31 = shufflevector <4 x i32> %5, <4 x i32> poison, <2 x i32> <i32 1, i32 3>
  %32 = add <2 x i32> %30, %31
  %33 = shufflevector <2 x i32> %32, <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  %34 = shufflevector <2 x i32> %22, <2 x i32> %25, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
  %35 = shufflevector <4 x i32> %34, <4 x i32> %29, <4 x i32> <i32 0, i32 1, i32 4, i32 undef>
  %36 = shufflevector <4 x i32> %35, <4 x i32> %33, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
  %37 = shufflevector <2 x i32> %22, <2 x i32> %25, <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
  %38 = shufflevector <4 x i32> %37, <4 x i32> %29, <4 x i32> <i32 0, i32 1, i32 5, i32 undef>
  %39 = shufflevector <4 x i32> %38, <4 x i32> %33, <4 x i32> <i32 0, i32 1, i32 2, i32 5>
  %40 = add <4 x i32> %36, %39
  %41 = shufflevector <4 x i32> %40, <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
  %42 = shufflevector <8 x i32> %19, <8 x i32> %41, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 10, i32 11, i32 undef, i32 undef>
  %43 = shufflevector <4 x i32> %6, <4 x i32> poison, <2 x i32> <i32 0, i32 2>
  %44 = shufflevector <4 x i32> %6, <4 x i32> poison, <2 x i32> <i32 1, i32 3>
  %45 = add <2 x i32> %43, %44
  %46 = shufflevector <4 x i32> %7, <4 x i32> poison, <2 x i32> <i32 0, i32 2>
  %47 = shufflevector <4 x i32> %7, <4 x i32> poison, <2 x i32> <i32 1, i32 3>
  %48 = add <2 x i32> %46, %47
  %49 = shufflevector <2 x i32> %45, <2 x i32> %48, <2 x i32> <i32 0, i32 2>
  %50 = shufflevector <2 x i32> %45, <2 x i32> %48, <2 x i32> <i32 1, i32 3>
  %51 = add <2 x i32> %49, %50
  %52 = shufflevector <2 x i32> %51, <2 x i32> poison, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %53 = shufflevector <8 x i32> %42, <8 x i32> %52, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
  ret <8 x i32> %53
}

; Vectorized Sequential Sum Reductions
; e.g.
; inline STYPE sum(VTYPE x) {
;   return ((x[0] + x[1]) + x[2]) + x[3];
; }
;
; VTYPE sum4(VTYPE A0, VTYPE A1, VTYPE A2, VTYPE A3) {
;   return (VTYPE) { sum( A0 ), sum( A1 ), sum( A2 ), sum( A3 ) };
; }
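;
; A concrete instantiation of the sequential form, again only a sketch assuming
; Clang/GCC vector extensions (v4sf is illustrative, not part of this test):
;
;   typedef float v4sf __attribute__((vector_size(16)));
;
;   static inline float sum(v4sf x) {
;     return ((x[0] + x[1]) + x[2]) + x[3];   // strictly left-to-right
;   }
;
;   v4sf sum4(v4sf A0, v4sf A1, v4sf A2, v4sf A3) {
;     return (v4sf) { sum( A0 ), sum( A1 ), sum( A2 ), sum( A3 ) };
;   }
;
; Unlike the pairwise form, only the initial x[0] + x[1] step pairs adjacent
; elements, so the fast-hops runs below still need ordinary adds after a single
; horizontal add per reduction.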

define <4 x float> @sequential_sum_v4f32_v4f32(<4 x float> %0, <4 x float> %1, <4 x float> %2, <4 x float> %3) {
; SSSE3-SLOW-LABEL: sequential_sum_v4f32_v4f32:
; SSSE3-SLOW: # %bb.0:
; SSSE3-SLOW-NEXT: movaps %xmm0, %xmm5
; SSSE3-SLOW-NEXT: haddps %xmm1, %xmm5
; SSSE3-SLOW-NEXT: movaps %xmm0, %xmm4
; SSSE3-SLOW-NEXT: unpckhps {{.*#+}} xmm4 = xmm4[2],xmm1[2],xmm4[3],xmm1[3]
; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm0[3,3]
; SSSE3-SLOW-NEXT: movshdup {{.*#+}} xmm0 = xmm2[1,1,3,3]
; SSSE3-SLOW-NEXT: addps %xmm2, %xmm0
; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm0[0,1]
; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm2[2,3]
; SSSE3-SLOW-NEXT: addps %xmm5, %xmm4
; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[3,3]
; SSSE3-SLOW-NEXT: addps %xmm1, %xmm4
; SSSE3-SLOW-NEXT: movshdup {{.*#+}} xmm0 = xmm3[1,1,3,3]
; SSSE3-SLOW-NEXT: addps %xmm3, %xmm0
; SSSE3-SLOW-NEXT: movaps %xmm3, %xmm1
; SSSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1]
; SSSE3-SLOW-NEXT: addps %xmm0, %xmm1
; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3,3,3]
; SSSE3-SLOW-NEXT: addps %xmm1, %xmm3
; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3]
; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm3[2,0]
; SSSE3-SLOW-NEXT: movaps %xmm4, %xmm0
; SSSE3-SLOW-NEXT: retq
;
; SSSE3-FAST-LABEL: sequential_sum_v4f32_v4f32:
; SSSE3-FAST: # %bb.0:
; SSSE3-FAST-NEXT: movaps %xmm0, %xmm4
; SSSE3-FAST-NEXT: movaps %xmm0, %xmm5
; SSSE3-FAST-NEXT: haddps %xmm1, %xmm5
; SSSE3-FAST-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm4[3,3]
; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[3,3]
; SSSE3-FAST-NEXT: haddps %xmm2, %xmm2
; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm2[0,1]
; SSSE3-FAST-NEXT: addps %xmm5, %xmm0
; SSSE3-FAST-NEXT: addps %xmm1, %xmm0
; SSSE3-FAST-NEXT: movaps %xmm3, %xmm1
; SSSE3-FAST-NEXT: haddps %xmm3, %xmm1
; SSSE3-FAST-NEXT: movaps %xmm3, %xmm2
; SSSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1]
; SSSE3-FAST-NEXT: addps %xmm1, %xmm2
; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3,3,3]
; SSSE3-FAST-NEXT: addps %xmm2, %xmm3
; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm0[2,3]
; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,0]
; SSSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: sequential_sum_v4f32_v4f32:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vhaddps %xmm1, %xmm0, %xmm4
; AVX-SLOW-NEXT: vunpckhps {{.*#+}} xmm5 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX-SLOW-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[3],xmm1[1],zero,zero
; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
; AVX-SLOW-NEXT: vaddps %xmm2, %xmm1, %xmm1
; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm4[0,2],xmm1[0,1]
; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[3,3]
; AVX-SLOW-NEXT: vblendps {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3]
; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm4 = xmm3[1,1,3,3]
; AVX-SLOW-NEXT: vaddps %xmm3, %xmm4, %xmm4
; AVX-SLOW-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[0]
; AVX-SLOW-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[2]
; AVX-SLOW-NEXT: vaddps %xmm1, %xmm2, %xmm1
; AVX-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[3]
; AVX-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: sequential_sum_v4f32_v4f32:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhaddps %xmm1, %xmm0, %xmm4
; AVX-FAST-NEXT: vunpckhps {{.*#+}} xmm5 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX-FAST-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX-FAST-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[3],xmm1[1],zero,zero
; AVX-FAST-NEXT: vhaddps %xmm2, %xmm2, %xmm1
; AVX-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm4[0,2],xmm1[0,1]
; AVX-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[3,3]
; AVX-FAST-NEXT: vblendps {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3]
; AVX-FAST-NEXT: vhaddps %xmm3, %xmm3, %xmm4
; AVX-FAST-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[0]
; AVX-FAST-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[2]
; AVX-FAST-NEXT: vaddps %xmm1, %xmm2, %xmm1
; AVX-FAST-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[3]
; AVX-FAST-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX-FAST-NEXT: retq
  %5 = shufflevector <4 x float> %0, <4 x float> %1, <2 x i32> <i32 0, i32 4>
  %6 = shufflevector <4 x float> %0, <4 x float> %1, <2 x i32> <i32 1, i32 5>
  %7 = fadd <2 x float> %5, %6
  %8 = shufflevector <4 x float> %0, <4 x float> %1, <2 x i32> <i32 2, i32 6>
  %9 = fadd <2 x float> %8, %7
  %10 = shufflevector <4 x float> %0, <4 x float> %1, <2 x i32> <i32 3, i32 7>
  %11 = fadd <2 x float> %10, %9
  %12 = shufflevector <2 x float> %11, <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  %13 = shufflevector <4 x float> %2, <4 x float> poison, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %14 = fadd <4 x float> %13, %2
  %15 = shufflevector <4 x float> %2, <4 x float> poison, <4 x i32> <i32 2, i32 undef, i32 undef, i32 undef>
  %16 = fadd <4 x float> %15, %14
  %17 = shufflevector <4 x float> %2, <4 x float> poison, <4 x i32> <i32 3, i32 undef, i32 undef, i32 undef>
  %18 = fadd <4 x float> %17, %16
  %19 = shufflevector <4 x float> %12, <4 x float> %18, <4 x i32> <i32 0, i32 1, i32 4, i32 undef>
  %20 = shufflevector <4 x float> %3, <4 x float> poison, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %21 = fadd <4 x float> %20, %3
  %22 = shufflevector <4 x float> %3, <4 x float> poison, <4 x i32> <i32 2, i32 undef, i32 undef, i32 undef>
  %23 = fadd <4 x float> %22, %21
  %24 = shufflevector <4 x float> %3, <4 x float> poison, <4 x i32> <i32 3, i32 undef, i32 undef, i32 undef>
  %25 = fadd <4 x float> %24, %23
  %26 = shufflevector <4 x float> %19, <4 x float> %25, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
  ret <4 x float> %26
}

define <4 x i32> @sequential_sum_v4i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2, <4 x i32> %3) {
; SSSE3-SLOW-LABEL: sequential_sum_v4i32_v4i32:
; SSSE3-SLOW: # %bb.0:
; SSSE3-SLOW-NEXT: movdqa %xmm0, %xmm4
; SSSE3-SLOW-NEXT: phaddd %xmm1, %xmm4
; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
; SSSE3-SLOW-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSSE3-SLOW-NEXT: paddd %xmm0, %xmm4
; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,1,1]
; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm5 = xmm2[0,1,0,1]
; SSSE3-SLOW-NEXT: paddd %xmm1, %xmm5
; SSSE3-SLOW-NEXT: paddd %xmm2, %xmm5
; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,1,1]
; SSSE3-SLOW-NEXT: paddd %xmm3, %xmm1
; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm6 = xmm3[2,3,2,3]
; SSSE3-SLOW-NEXT: paddd %xmm1, %xmm6
; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm5[2,3]
; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm6[2,0]
; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3],xmm2[3,3]
; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,3],xmm3[2,0]
; SSSE3-SLOW-NEXT: paddd %xmm4, %xmm0
; SSSE3-SLOW-NEXT: retq
;
; SSSE3-FAST-LABEL: sequential_sum_v4i32_v4i32:
; SSSE3-FAST: # %bb.0:
; SSSE3-FAST-NEXT: movdqa %xmm0, %xmm4
; SSSE3-FAST-NEXT: phaddd %xmm1, %xmm4
; SSSE3-FAST-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
; SSSE3-FAST-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSSE3-FAST-NEXT: paddd %xmm0, %xmm4
; SSSE3-FAST-NEXT: movdqa %xmm2, %xmm1
; SSSE3-FAST-NEXT: phaddd %xmm2, %xmm1
; SSSE3-FAST-NEXT: paddd %xmm2, %xmm1
; SSSE3-FAST-NEXT: movdqa %xmm3, %xmm5
; SSSE3-FAST-NEXT: phaddd %xmm3, %xmm5
; SSSE3-FAST-NEXT: pshufd {{.*#+}} xmm6 = xmm3[2,3,2,3]
; SSSE3-FAST-NEXT: paddd %xmm5, %xmm6
; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm1[2,3]
; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm6[2,0]
; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3],xmm2[3,3]
; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,3],xmm3[2,0]
; SSSE3-FAST-NEXT: paddd %xmm4, %xmm0
; SSSE3-FAST-NEXT: retq
;
; AVX1-SLOW-LABEL: sequential_sum_v4i32_v4i32:
; AVX1-SLOW: # %bb.0:
; AVX1-SLOW-NEXT: vphaddd %xmm1, %xmm0, %xmm4
; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
; AVX1-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX1-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[1,1,1,1]
; AVX1-SLOW-NEXT: vpaddd %xmm2, %xmm1, %xmm1
; AVX1-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm4[0],xmm1[0]
; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[3,3,3,3]
; AVX1-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0]
; AVX1-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0,1,2,3],xmm2[4,5,6,7]
; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm2, %xmm1
; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm3[1,1,1,1]
; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[0,0,0,0]
; AVX1-SLOW-NEXT: vpaddd %xmm2, %xmm1, %xmm1
; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[2,2,2,2]
; AVX1-SLOW-NEXT: vpaddd %xmm2, %xmm3, %xmm2
; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm2, %xmm1
; AVX1-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7]
; AVX1-SLOW-NEXT: retq
;
; AVX1-FAST-LABEL: sequential_sum_v4i32_v4i32:
; AVX1-FAST: # %bb.0:
; AVX1-FAST-NEXT: vphaddd %xmm1, %xmm0, %xmm4
; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
; AVX1-FAST-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX1-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
; AVX1-FAST-NEXT: vphaddd %xmm2, %xmm2, %xmm1
; AVX1-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm4[0],xmm1[0]
; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[3,3,3,3]
; AVX1-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0]
; AVX1-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0,1,2,3],xmm2[4,5,6,7]
; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm2, %xmm1
; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT: vphaddd %xmm3, %xmm3, %xmm1
; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[2,2,2,2]
; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
; AVX1-FAST-NEXT: vpaddd %xmm2, %xmm3, %xmm2
; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm2, %xmm1
; AVX1-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7]
; AVX1-FAST-NEXT: retq
;
; AVX2-SLOW-LABEL: sequential_sum_v4i32_v4i32:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vphaddd %xmm1, %xmm0, %xmm4
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
; AVX2-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX2-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[1,1,1,1]
; AVX2-SLOW-NEXT: vpaddd %xmm2, %xmm1, %xmm1
; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm4[0],xmm1[0]
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[3,3,3,3]
; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0]
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3]
; AVX2-SLOW-NEXT: vpaddd %xmm1, %xmm2, %xmm1
; AVX2-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX2-SLOW-NEXT: vpbroadcastq %xmm3, %xmm1
; AVX2-SLOW-NEXT: vpbroadcastd %xmm3, %xmm2
; AVX2-SLOW-NEXT: vpaddd %xmm2, %xmm1, %xmm1
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[2,2,2,2]
; AVX2-SLOW-NEXT: vpaddd %xmm2, %xmm3, %xmm2
; AVX2-SLOW-NEXT: vpaddd %xmm1, %xmm2, %xmm1
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-LABEL: sequential_sum_v4i32_v4i32:
; AVX2-FAST: # %bb.0:
; AVX2-FAST-NEXT: vphaddd %xmm1, %xmm0, %xmm4
; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX2-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
; AVX2-FAST-NEXT: vphaddd %xmm2, %xmm2, %xmm1
; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm4[0],xmm1[0]
; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[3,3,3,3]
; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0]
; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3]
; AVX2-FAST-NEXT: vpaddd %xmm1, %xmm2, %xmm1
; AVX2-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX2-FAST-NEXT: vphaddd %xmm3, %xmm3, %xmm1
; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[2,2,2,2]
; AVX2-FAST-NEXT: vpbroadcastd %xmm1, %xmm1
; AVX2-FAST-NEXT: vpaddd %xmm2, %xmm3, %xmm2
; AVX2-FAST-NEXT: vpaddd %xmm1, %xmm2, %xmm1
; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
; AVX2-FAST-NEXT: retq
  %5 = shufflevector <4 x i32> %0, <4 x i32> %1, <2 x i32> <i32 0, i32 4>
  %6 = shufflevector <4 x i32> %0, <4 x i32> %1, <2 x i32> <i32 1, i32 5>
  %7 = add <2 x i32> %5, %6
  %8 = shufflevector <4 x i32> %0, <4 x i32> %1, <2 x i32> <i32 2, i32 6>
  %9 = add <2 x i32> %8, %7
  %10 = shufflevector <4 x i32> %0, <4 x i32> %1, <2 x i32> <i32 3, i32 7>
  %11 = add <2 x i32> %10, %9
  %12 = shufflevector <2 x i32> %11, <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  %13 = shufflevector <4 x i32> %2, <4 x i32> poison, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %14 = add <4 x i32> %13, %2
  %15 = shufflevector <4 x i32> %2, <4 x i32> poison, <4 x i32> <i32 2, i32 undef, i32 undef, i32 undef>
  %16 = add <4 x i32> %15, %14
  %17 = shufflevector <4 x i32> %2, <4 x i32> poison, <4 x i32> <i32 3, i32 undef, i32 undef, i32 undef>
  %18 = add <4 x i32> %17, %16
  %19 = shufflevector <4 x i32> %12, <4 x i32> %18, <4 x i32> <i32 0, i32 1, i32 4, i32 undef>
  %20 = shufflevector <4 x i32> %3, <4 x i32> poison, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %21 = add <4 x i32> %20, %3
  %22 = shufflevector <4 x i32> %3, <4 x i32> poison, <4 x i32> <i32 2, i32 undef, i32 undef, i32 undef>
  %23 = add <4 x i32> %22, %21
  %24 = shufflevector <4 x i32> %3, <4 x i32> poison, <4 x i32> <i32 3, i32 undef, i32 undef, i32 undef>
  %25 = add <4 x i32> %24, %23
  %26 = shufflevector <4 x i32> %19, <4 x i32> %25, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
  ret <4 x i32> %26
}

; Vectorized Reductions
; e.g.
; VTYPE sum4(VTYPE A0, VTYPE A1, VTYPE A2, VTYPE A3) {
;   return (VTYPE) { reduce( A0 ), reduce( A1 ), reduce( A2 ), reduce( A3 ) };
; }
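;
; Here reduce() is a full horizontal reduction of one input vector. A sketch of
; a scalar equivalent (illustration only; the IR below expresses the reductions
; directly via the llvm.vector.reduce.* intrinsics):
;
;   typedef int v4si __attribute__((vector_size(16)));
;
;   static inline int reduce(v4si x) {
;     int r = x[0];
;     for (int i = 1; i < 4; ++i)   // in-order accumulation
;       r += x[i];
;     return r;
;   }
;
;   v4si sum4(v4si A0, v4si A1, v4si A2, v4si A3) {
;     return (v4si) { reduce( A0 ), reduce( A1 ), reduce( A2 ), reduce( A3 ) };
;   }
;
; For the float variants, the strict in-order reduction keeps the scalar addss
; chains seen below, while the reassoc variant can reassociate into full-width
; addps/haddps.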

define <4 x float> @reduction_sum_v4f32_v4f32(<4 x float> %0, <4 x float> %1, <4 x float> %2, <4 x float> %3) {
; SSSE3-SLOW-LABEL: reduction_sum_v4f32_v4f32:
; SSSE3-SLOW: # %bb.0:
; SSSE3-SLOW-NEXT: movshdup {{.*#+}} xmm4 = xmm0[1,1,3,3]
; SSSE3-SLOW-NEXT: addss %xmm0, %xmm4
; SSSE3-SLOW-NEXT: movaps %xmm0, %xmm5
; SSSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm0[1]
; SSSE3-SLOW-NEXT: addss %xmm4, %xmm5
; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSSE3-SLOW-NEXT: addss %xmm5, %xmm0
; SSSE3-SLOW-NEXT: movshdup {{.*#+}} xmm4 = xmm1[1,1,3,3]
; SSSE3-SLOW-NEXT: addss %xmm1, %xmm4
; SSSE3-SLOW-NEXT: movaps %xmm1, %xmm5
; SSSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm1[1]
; SSSE3-SLOW-NEXT: addss %xmm4, %xmm5
; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; SSSE3-SLOW-NEXT: addss %xmm5, %xmm1
; SSSE3-SLOW-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
; SSSE3-SLOW-NEXT: addss %xmm2, %xmm1
; SSSE3-SLOW-NEXT: movaps %xmm2, %xmm4
; SSSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm2[1]
; SSSE3-SLOW-NEXT: addss %xmm1, %xmm4
; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3,3,3]
; SSSE3-SLOW-NEXT: addss %xmm4, %xmm2
; SSSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm3[1,1,3,3]
; SSSE3-SLOW-NEXT: addss %xmm3, %xmm1
; SSSE3-SLOW-NEXT: movaps %xmm3, %xmm4
; SSSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1]
; SSSE3-SLOW-NEXT: addss %xmm1, %xmm4
; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3,3,3]
; SSSE3-SLOW-NEXT: addss %xmm4, %xmm3
; SSSE3-SLOW-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; SSSE3-SLOW-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; SSSE3-SLOW-NEXT: retq
;
; SSSE3-FAST-LABEL: reduction_sum_v4f32_v4f32:
; SSSE3-FAST: # %bb.0:
; SSSE3-FAST-NEXT: movaps %xmm0, %xmm4
; SSSE3-FAST-NEXT: haddps %xmm0, %xmm4
; SSSE3-FAST-NEXT: movaps %xmm0, %xmm5
; SSSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm0[1]
; SSSE3-FAST-NEXT: addss %xmm4, %xmm5
; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSSE3-FAST-NEXT: addss %xmm5, %xmm0
; SSSE3-FAST-NEXT: movaps %xmm1, %xmm4
; SSSE3-FAST-NEXT: haddps %xmm1, %xmm4
; SSSE3-FAST-NEXT: movaps %xmm1, %xmm5
; SSSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm1[1]
; SSSE3-FAST-NEXT: addss %xmm4, %xmm5
; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; SSSE3-FAST-NEXT: addss %xmm5, %xmm1
; SSSE3-FAST-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSSE3-FAST-NEXT: movaps %xmm2, %xmm1
; SSSE3-FAST-NEXT: haddps %xmm2, %xmm1
; SSSE3-FAST-NEXT: movaps %xmm2, %xmm4
; SSSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm2[1]
; SSSE3-FAST-NEXT: addss %xmm1, %xmm4
; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3,3,3]
; SSSE3-FAST-NEXT: addss %xmm4, %xmm2
; SSSE3-FAST-NEXT: movaps %xmm3, %xmm1
; SSSE3-FAST-NEXT: haddps %xmm3, %xmm1
; SSSE3-FAST-NEXT: movaps %xmm3, %xmm4
; SSSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1]
; SSSE3-FAST-NEXT: addss %xmm1, %xmm4
; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3,3,3]
; SSSE3-FAST-NEXT: addss %xmm4, %xmm3
; SSSE3-FAST-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; SSSE3-FAST-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; SSSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: reduction_sum_v4f32_v4f32:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm4 = xmm0[1,1,3,3]
; AVX-SLOW-NEXT: vaddss %xmm4, %xmm0, %xmm4
; AVX-SLOW-NEXT: vshufpd {{.*#+}} xmm5 = xmm0[1,0]
; AVX-SLOW-NEXT: vaddss %xmm5, %xmm4, %xmm4
; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX-SLOW-NEXT: vaddss %xmm0, %xmm4, %xmm0
; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm4 = xmm1[1,1,3,3]
; AVX-SLOW-NEXT: vaddss %xmm4, %xmm1, %xmm4
; AVX-SLOW-NEXT: vshufpd {{.*#+}} xmm5 = xmm1[1,0]
; AVX-SLOW-NEXT: vaddss %xmm5, %xmm4, %xmm4
; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; AVX-SLOW-NEXT: vaddss %xmm1, %xmm4, %xmm1
; AVX-SLOW-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
; AVX-SLOW-NEXT: vaddss %xmm1, %xmm2, %xmm1
; AVX-SLOW-NEXT: vshufpd {{.*#+}} xmm4 = xmm2[1,0]
; AVX-SLOW-NEXT: vaddss %xmm4, %xmm1, %xmm1
; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,3,3,3]
; AVX-SLOW-NEXT: vaddss %xmm2, %xmm1, %xmm1
; AVX-SLOW-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm3[1,1,3,3]
; AVX-SLOW-NEXT: vaddss %xmm1, %xmm3, %xmm1
; AVX-SLOW-NEXT: vshufpd {{.*#+}} xmm2 = xmm3[1,0]
; AVX-SLOW-NEXT: vaddss %xmm2, %xmm1, %xmm1
; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm3[3,3,3,3]
; AVX-SLOW-NEXT: vaddss %xmm2, %xmm1, %xmm1
; AVX-SLOW-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: reduction_sum_v4f32_v4f32:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm4
; AVX-FAST-NEXT: vshufpd {{.*#+}} xmm5 = xmm0[1,0]
; AVX-FAST-NEXT: vaddss %xmm5, %xmm4, %xmm4
; AVX-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX-FAST-NEXT: vaddss %xmm0, %xmm4, %xmm0
; AVX-FAST-NEXT: vhaddps %xmm1, %xmm1, %xmm4
; AVX-FAST-NEXT: vshufpd {{.*#+}} xmm5 = xmm1[1,0]
; AVX-FAST-NEXT: vaddss %xmm5, %xmm4, %xmm4
; AVX-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; AVX-FAST-NEXT: vaddss %xmm1, %xmm4, %xmm1
; AVX-FAST-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
; AVX-FAST-NEXT: vhaddps %xmm2, %xmm2, %xmm1
; AVX-FAST-NEXT: vshufpd {{.*#+}} xmm4 = xmm2[1,0]
; AVX-FAST-NEXT: vaddss %xmm4, %xmm1, %xmm1
; AVX-FAST-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,3,3,3]
; AVX-FAST-NEXT: vaddss %xmm2, %xmm1, %xmm1
; AVX-FAST-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
; AVX-FAST-NEXT: vhaddps %xmm3, %xmm3, %xmm1
; AVX-FAST-NEXT: vshufpd {{.*#+}} xmm2 = xmm3[1,0]
; AVX-FAST-NEXT: vaddss %xmm2, %xmm1, %xmm1
; AVX-FAST-NEXT: vshufps {{.*#+}} xmm2 = xmm3[3,3,3,3]
; AVX-FAST-NEXT: vaddss %xmm2, %xmm1, %xmm1
; AVX-FAST-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
; AVX-FAST-NEXT: retq
  %5 = call float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %0)
  %6 = call float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %1)
  %7 = call float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %2)
  %8 = call float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %3)
  %9 = insertelement <4 x float> undef, float %5, i32 0
  %10 = insertelement <4 x float> %9, float %6, i32 1
  %11 = insertelement <4 x float> %10, float %7, i32 2
  %12 = insertelement <4 x float> %11, float %8, i32 3
  ret <4 x float> %12
}
declare float @llvm.vector.reduce.fadd.f32.v4f32(float, <4 x float>)

define <4 x float> @reduction_sum_v4f32_v4f32_reassoc(<4 x float> %0, <4 x float> %1, <4 x float> %2, <4 x float> %3) {
; SSSE3-SLOW-LABEL: reduction_sum_v4f32_v4f32_reassoc:
; SSSE3-SLOW: # %bb.0:
; SSSE3-SLOW-NEXT: movaps %xmm0, %xmm4
; SSSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm0[1]
; SSSE3-SLOW-NEXT: addps %xmm4, %xmm0
; SSSE3-SLOW-NEXT: movshdup {{.*#+}} xmm4 = xmm0[1,1,3,3]
; SSSE3-SLOW-NEXT: movaps %xmm1, %xmm5
; SSSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm1[1]
; SSSE3-SLOW-NEXT: addps %xmm1, %xmm5
; SSSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm5[1,1,3,3]
; SSSE3-SLOW-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
; SSSE3-SLOW-NEXT: movaps %xmm2, %xmm1
; SSSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1]
; SSSE3-SLOW-NEXT: addps %xmm2, %xmm1
; SSSE3-SLOW-NEXT: movaps %xmm3, %xmm2
; SSSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1]
; SSSE3-SLOW-NEXT: addps %xmm3, %xmm2
; SSSE3-SLOW-NEXT: movaps %xmm2, %xmm3
; SSSE3-SLOW-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm1[0]
; SSSE3-SLOW-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1]
; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,0]
; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm1[1,1]
; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm2[2,0]
; SSSE3-SLOW-NEXT: addps %xmm4, %xmm0
; SSSE3-SLOW-NEXT: retq
;
; SSSE3-FAST-LABEL: reduction_sum_v4f32_v4f32_reassoc:
; SSSE3-FAST: # %bb.0:
; SSSE3-FAST-NEXT: movaps %xmm0, %xmm4
; SSSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm0[1]
; SSSE3-FAST-NEXT: addps %xmm4, %xmm0
; SSSE3-FAST-NEXT: movaps %xmm1, %xmm4
; SSSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm1[1]
; SSSE3-FAST-NEXT: addps %xmm1, %xmm4
; SSSE3-FAST-NEXT: haddps %xmm4, %xmm0
; SSSE3-FAST-NEXT: movaps %xmm2, %xmm1
; SSSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1]
; SSSE3-FAST-NEXT: addps %xmm2, %xmm1
; SSSE3-FAST-NEXT: movaps %xmm3, %xmm2
; SSSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1]
; SSSE3-FAST-NEXT: addps %xmm3, %xmm2
; SSSE3-FAST-NEXT: haddps %xmm2, %xmm1
; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; SSSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: reduction_sum_v4f32_v4f32_reassoc:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vshufpd {{.*#+}} xmm4 = xmm0[1,0]
; AVX-SLOW-NEXT: vaddps %xmm4, %xmm0, %xmm0
; AVX-SLOW-NEXT: vshufpd {{.*#+}} xmm4 = xmm1[1,0]
; AVX-SLOW-NEXT: vaddps %xmm4, %xmm1, %xmm1
; AVX-SLOW-NEXT: vshufpd {{.*#+}} xmm4 = xmm2[1,0]
; AVX-SLOW-NEXT: vaddps %xmm4, %xmm2, %xmm2
; AVX-SLOW-NEXT: vshufpd {{.*#+}} xmm4 = xmm3[1,0]
; AVX-SLOW-NEXT: vaddps %xmm4, %xmm3, %xmm3
; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm4 = xmm3[1,1],xmm2[1,1]
; AVX-SLOW-NEXT: vinsertps {{.*#+}} xmm5 = xmm0[1],xmm1[1],zero,zero
; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,0]
; AVX-SLOW-NEXT: vmovlhps {{.*#+}} xmm2 = xmm3[0],xmm2[0]
; AVX-SLOW-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,0]
; AVX-SLOW-NEXT: vaddps %xmm4, %xmm0, %xmm0
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: reduction_sum_v4f32_v4f32_reassoc:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vshufpd {{.*#+}} xmm4 = xmm0[1,0]
; AVX-FAST-NEXT: vaddps %xmm4, %xmm0, %xmm0
; AVX-FAST-NEXT: vshufpd {{.*#+}} xmm4 = xmm1[1,0]
; AVX-FAST-NEXT: vaddps %xmm4, %xmm1, %xmm1
; AVX-FAST-NEXT: vhaddps %xmm1, %xmm0, %xmm0
; AVX-FAST-NEXT: vshufpd {{.*#+}} xmm1 = xmm2[1,0]
; AVX-FAST-NEXT: vaddps %xmm1, %xmm2, %xmm1
; AVX-FAST-NEXT: vshufpd {{.*#+}} xmm2 = xmm3[1,0]
; AVX-FAST-NEXT: vaddps %xmm2, %xmm3, %xmm2
; AVX-FAST-NEXT: vhaddps %xmm2, %xmm1, %xmm1
; AVX-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX-FAST-NEXT: retq
  %5 = call reassoc float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %0)
  %6 = call reassoc float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %1)
  %7 = call reassoc float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %2)
  %8 = call reassoc float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %3)
  %9 = insertelement <4 x float> undef, float %5, i32 0
  %10 = insertelement <4 x float> %9, float %6, i32 1
  %11 = insertelement <4 x float> %10, float %7, i32 2
  %12 = insertelement <4 x float> %11, float %8, i32 3
  ret <4 x float> %12
}

define <4 x i32> @reduction_sum_v4i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2, <4 x i32> %3) {
; SSSE3-SLOW-LABEL: reduction_sum_v4i32_v4i32:
; SSSE3-SLOW: # %bb.0:
; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,2,3]
; SSSE3-SLOW-NEXT: paddd %xmm4, %xmm0
; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,1,1]
; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm5 = xmm1[2,3,2,3]
; SSSE3-SLOW-NEXT: paddd %xmm1, %xmm5
; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,1,1,1]
; SSSE3-SLOW-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,2,3]
; SSSE3-SLOW-NEXT: paddd %xmm2, %xmm1
; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,1,1]
; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm6 = xmm3[2,3,2,3]
; SSSE3-SLOW-NEXT: paddd %xmm3, %xmm6
; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,1,1]
; SSSE3-SLOW-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; SSSE3-SLOW-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm2[0]
; SSSE3-SLOW-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1]
; SSSE3-SLOW-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1]
; SSSE3-SLOW-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSSE3-SLOW-NEXT: paddd %xmm4, %xmm0
; SSSE3-SLOW-NEXT: retq
;
; SSSE3-FAST-LABEL: reduction_sum_v4i32_v4i32:
; SSSE3-FAST: # %bb.0:
; SSSE3-FAST-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,2,3]
; SSSE3-FAST-NEXT: paddd %xmm4, %xmm0
; SSSE3-FAST-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,2,3]
; SSSE3-FAST-NEXT: paddd %xmm1, %xmm4
; SSSE3-FAST-NEXT: phaddd %xmm4, %xmm0
; SSSE3-FAST-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,2,3]
; SSSE3-FAST-NEXT: paddd %xmm2, %xmm1
; SSSE3-FAST-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,2,3]
; SSSE3-FAST-NEXT: paddd %xmm3, %xmm2
; SSSE3-FAST-NEXT: phaddd %xmm2, %xmm1
; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; SSSE3-FAST-NEXT: retq
;
; AVX1-SLOW-LABEL: reduction_sum_v4i32_v4i32:
; AVX1-SLOW: # %bb.0:
; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[2,3,2,3]
; AVX1-SLOW-NEXT: vpaddd %xmm4, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,1,1]
; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[2,3,2,3]
; AVX1-SLOW-NEXT: vpaddd %xmm5, %xmm1, %xmm1
; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[1,1,1,1]
; AVX1-SLOW-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[2,3,2,3]
; AVX1-SLOW-NEXT: vpaddd %xmm5, %xmm2, %xmm2
; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[1,1,1,1]
; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm3[2,3,2,3]
; AVX1-SLOW-NEXT: vpaddd %xmm6, %xmm3, %xmm3
; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm3[1,1,1,1]
; AVX1-SLOW-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
; AVX1-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; AVX1-SLOW-NEXT: vpaddd %xmm5, %xmm2, %xmm2
; AVX1-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX1-SLOW-NEXT: vpaddd %xmm4, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; AVX1-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: reduction_sum_v4i32_v4i32:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[2,3,2,3]
; AVX-FAST-NEXT: vpaddd %xmm4, %xmm0, %xmm0
; AVX-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[2,3,2,3]
; AVX-FAST-NEXT: vpaddd %xmm4, %xmm1, %xmm1
; AVX-FAST-NEXT: vphaddd %xmm1, %xmm0, %xmm0
; AVX-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[2,3,2,3]
; AVX-FAST-NEXT: vpaddd %xmm1, %xmm2, %xmm1
; AVX-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[2,3,2,3]
; AVX-FAST-NEXT: vpaddd %xmm2, %xmm3, %xmm2
; AVX-FAST-NEXT: vphaddd %xmm2, %xmm1, %xmm1
; AVX-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX-FAST-NEXT: retq
;
; AVX2-SLOW-LABEL: reduction_sum_v4i32_v4i32:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[2,3,2,3]
; AVX2-SLOW-NEXT: vpaddd %xmm4, %xmm0, %xmm0
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,1,1]
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[2,3,2,3]
; AVX2-SLOW-NEXT: vpaddd %xmm5, %xmm1, %xmm1
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[2,3,2,3]
; AVX2-SLOW-NEXT: vpaddd %xmm5, %xmm2, %xmm2
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[2,3,2,3]
; AVX2-SLOW-NEXT: vpaddd %xmm5, %xmm3, %xmm3
; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0],xmm1[1],xmm4[2,3]
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3]
; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX2-SLOW-NEXT: vpbroadcastd %xmm3, %xmm1
; AVX2-SLOW-NEXT: vpbroadcastd %xmm2, %xmm2
; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX2-SLOW-NEXT: vpaddd %xmm4, %xmm0, %xmm0
; AVX2-SLOW-NEXT: retq
  %5 = call i32 @llvm.vector.reduce.add.i32.v4i32(<4 x i32> %0)
  %6 = call i32 @llvm.vector.reduce.add.i32.v4i32(<4 x i32> %1)
  %7 = call i32 @llvm.vector.reduce.add.i32.v4i32(<4 x i32> %2)
  %8 = call i32 @llvm.vector.reduce.add.i32.v4i32(<4 x i32> %3)
  %9 = insertelement <4 x i32> undef, i32 %5, i32 0
  %10 = insertelement <4 x i32> %9, i32 %6, i32 1
  %11 = insertelement <4 x i32> %10, i32 %7, i32 2
  %12 = insertelement <4 x i32> %11, i32 %8, i32 3
  ret <4 x i32> %12
}
declare i32 @llvm.vector.reduce.add.i32.v4i32(<4 x i32>)