; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE2
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=SSSE3-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3,fast-hops | FileCheck %s --check-prefix=SSSE3-FAST
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX1-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx,fast-hops | FileCheck %s --check-prefix=AVX1-FAST
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX2

; PR37890 - subvector reduction followed by shuffle reduction

define float @PR37890_v4f32(<4 x float> %a) {
; SSE2-LABEL: PR37890_v4f32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movaps %xmm0, %xmm1
; SSE2-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE2-NEXT:    addps %xmm1, %xmm0
; SSE2-NEXT:    movaps %xmm0, %xmm1
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1]
; SSE2-NEXT:    addss %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSSE3-SLOW-LABEL: PR37890_v4f32:
; SSSE3-SLOW:       # %bb.0:
; SSSE3-SLOW-NEXT:    movaps %xmm0, %xmm1
; SSSE3-SLOW-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSSE3-SLOW-NEXT:    addps %xmm1, %xmm0
; SSSE3-SLOW-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSSE3-SLOW-NEXT:    addss %xmm1, %xmm0
; SSSE3-SLOW-NEXT:    retq
;
; SSSE3-FAST-LABEL: PR37890_v4f32:
; SSSE3-FAST:       # %bb.0:
; SSSE3-FAST-NEXT:    haddps %xmm0, %xmm0
; SSSE3-FAST-NEXT:    haddps %xmm0, %xmm0
; SSSE3-FAST-NEXT:    retq
;
; AVX1-SLOW-LABEL: PR37890_v4f32:
; AVX1-SLOW:       # %bb.0:
; AVX1-SLOW-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX1-SLOW-NEXT:    vaddps %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX1-SLOW-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    retq
;
; AVX1-FAST-LABEL: PR37890_v4f32:
; AVX1-FAST:       # %bb.0:
; AVX1-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT:    retq
;
; AVX2-LABEL: PR37890_v4f32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX2-NEXT:    vaddps %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX2-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    retq
  %hi0 = shufflevector <4 x float> %a, <4 x float> undef, <2 x i32> <i32 2, i32 3>
  %lo0 = shufflevector <4 x float> %a, <4 x float> undef, <2 x i32> <i32 0, i32 1>
  %sum0 = fadd fast <2 x float> %lo0, %hi0
  %hi1 = shufflevector <2 x float> %sum0, <2 x float> undef, <2 x i32> <i32 1, i32 undef>
  %sum1 = fadd fast <2 x float> %sum0, %hi1
  %e = extractelement <2 x float> %sum1, i32 0
  ret float %e
}

define double @PR37890_v4f64(<4 x double> %a) {
; SSE2-LABEL: PR37890_v4f64:
; SSE2:       # %bb.0:
; SSE2-NEXT:    addpd %xmm1, %xmm0
; SSE2-NEXT:    movapd %xmm0, %xmm1
; SSE2-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE2-NEXT:    addsd %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSSE3-SLOW-LABEL: PR37890_v4f64:
; SSSE3-SLOW:       # %bb.0:
; SSSE3-SLOW-NEXT:    addpd %xmm1, %xmm0
; SSSE3-SLOW-NEXT:    movapd %xmm0, %xmm1
; SSSE3-SLOW-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSSE3-SLOW-NEXT:    addsd %xmm1, %xmm0
; SSSE3-SLOW-NEXT:    retq
;
; SSSE3-FAST-LABEL: PR37890_v4f64:
; SSSE3-FAST:       # %bb.0:
; SSSE3-FAST-NEXT:    addpd %xmm1, %xmm0
; SSSE3-FAST-NEXT:    haddpd %xmm0, %xmm0
; SSSE3-FAST-NEXT:    retq
;
; AVX1-SLOW-LABEL: PR37890_v4f64:
; AVX1-SLOW:       # %bb.0:
; AVX1-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-SLOW-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX1-SLOW-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vzeroupper
; AVX1-SLOW-NEXT:    retq
;
; AVX1-FAST-LABEL: PR37890_v4f64:
; AVX1-FAST:       # %bb.0:
; AVX1-FAST-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-FAST-NEXT:    vhaddpd %xmm0, %xmm1, %xmm0
; AVX1-FAST-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vzeroupper
; AVX1-FAST-NEXT:    retq
;
; AVX2-LABEL: PR37890_v4f64:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX2-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
  %hi0 = shufflevector <4 x double> %a, <4 x double> undef, <2 x i32> <i32 2, i32 3>
  %lo0 = shufflevector <4 x double> %a, <4 x double> undef, <2 x i32> <i32 0, i32 1>
  %sum0 = fadd fast <2 x double> %lo0, %hi0
  %hi1 = shufflevector <2 x double> %sum0, <2 x double> undef, <2 x i32> <i32 1, i32 undef>
  %sum1 = fadd fast <2 x double> %sum0, %hi1
  %e = extractelement <2 x double> %sum1, i32 0
  ret double %e
}

define float @PR37890_v8f32(<8 x float> %a) {
; SSE2-LABEL: PR37890_v8f32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    addps %xmm1, %xmm0
; SSE2-NEXT:    movaps %xmm0, %xmm1
; SSE2-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE2-NEXT:    addps %xmm1, %xmm0
; SSE2-NEXT:    movaps %xmm0, %xmm1
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1]
; SSE2-NEXT:    addss %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSSE3-SLOW-LABEL: PR37890_v8f32:
; SSSE3-SLOW:       # %bb.0:
; SSSE3-SLOW-NEXT:    addps %xmm1, %xmm0
; SSSE3-SLOW-NEXT:    movaps %xmm0, %xmm1
; SSSE3-SLOW-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSSE3-SLOW-NEXT:    addps %xmm1, %xmm0
; SSSE3-SLOW-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSSE3-SLOW-NEXT:    addss %xmm1, %xmm0
; SSSE3-SLOW-NEXT:    retq
;
; SSSE3-FAST-LABEL: PR37890_v8f32:
; SSSE3-FAST:       # %bb.0:
; SSSE3-FAST-NEXT:    addps %xmm1, %xmm0
; SSSE3-FAST-NEXT:    haddps %xmm0, %xmm0
; SSSE3-FAST-NEXT:    haddps %xmm0, %xmm0
; SSSE3-FAST-NEXT:    retq
;
; AVX1-SLOW-LABEL: PR37890_v8f32:
; AVX1-SLOW:       # %bb.0:
; AVX1-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-SLOW-NEXT:    vaddps %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX1-SLOW-NEXT:    vaddps %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX1-SLOW-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vzeroupper
; AVX1-SLOW-NEXT:    retq
;
; AVX1-FAST-LABEL: PR37890_v8f32:
; AVX1-FAST:       # %bb.0:
; AVX1-FAST-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-FAST-NEXT:    vhaddps %xmm0, %xmm1, %xmm0
; AVX1-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vzeroupper
; AVX1-FAST-NEXT:    retq
;
; AVX2-LABEL: PR37890_v8f32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vaddps %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX2-NEXT:    vaddps %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX2-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
  %hi0 = shufflevector <8 x float> %a, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %lo0 = shufflevector <8 x float> %a, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %sum0 = fadd fast <4 x float> %lo0, %hi0
  %hi1 = shufflevector <4 x float> %sum0, <4 x float> undef, <2 x i32> <i32 2, i32 3>
  %lo1 = shufflevector <4 x float> %sum0, <4 x float> undef, <2 x i32> <i32 0, i32 1>
  %sum1 = fadd fast <2 x float> %lo1, %hi1
  %hi2 = shufflevector <2 x float> %sum1, <2 x float> undef, <2 x i32> <i32 1, i32 undef>
  %sum2 = fadd fast <2 x float> %sum1, %hi2
  %e = extractelement <2 x float> %sum2, i32 0
  ret float %e
}

define double @PR37890_v8f64(<8 x double> %a) {
; SSE2-LABEL: PR37890_v8f64:
; SSE2:       # %bb.0:
; SSE2-NEXT:    addpd %xmm3, %xmm1
; SSE2-NEXT:    addpd %xmm2, %xmm0
; SSE2-NEXT:    addpd %xmm1, %xmm0
; SSE2-NEXT:    movapd %xmm0, %xmm1
; SSE2-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE2-NEXT:    addsd %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSSE3-SLOW-LABEL: PR37890_v8f64:
; SSSE3-SLOW:       # %bb.0:
; SSSE3-SLOW-NEXT:    addpd %xmm3, %xmm1
; SSSE3-SLOW-NEXT:    addpd %xmm2, %xmm0
; SSSE3-SLOW-NEXT:    addpd %xmm1, %xmm0
; SSSE3-SLOW-NEXT:    movapd %xmm0, %xmm1
; SSSE3-SLOW-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSSE3-SLOW-NEXT:    addsd %xmm1, %xmm0
; SSSE3-SLOW-NEXT:    retq
;
; SSSE3-FAST-LABEL: PR37890_v8f64:
; SSSE3-FAST:       # %bb.0:
; SSSE3-FAST-NEXT:    addpd %xmm3, %xmm1
; SSSE3-FAST-NEXT:    addpd %xmm2, %xmm0
; SSSE3-FAST-NEXT:    addpd %xmm1, %xmm0
; SSSE3-FAST-NEXT:    haddpd %xmm0, %xmm0
; SSSE3-FAST-NEXT:    retq
;
; AVX1-SLOW-LABEL: PR37890_v8f64:
; AVX1-SLOW:       # %bb.0:
; AVX1-SLOW-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
; AVX1-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-SLOW-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX1-SLOW-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vzeroupper
; AVX1-SLOW-NEXT:    retq
;
; AVX1-FAST-LABEL: PR37890_v8f64:
; AVX1-FAST:       # %bb.0:
; AVX1-FAST-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
; AVX1-FAST-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-FAST-NEXT:    vhaddpd %xmm0, %xmm1, %xmm0
; AVX1-FAST-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vzeroupper
; AVX1-FAST-NEXT:    retq
;
; AVX2-LABEL: PR37890_v8f64:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX2-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
  %hi0 = shufflevector <8 x double> %a, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %lo0 = shufflevector <8 x double> %a, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %sum0 = fadd fast <4 x double> %lo0, %hi0
  %hi1 = shufflevector <4 x double> %sum0, <4 x double> undef, <2 x i32> <i32 2, i32 3>
  %lo1 = shufflevector <4 x double> %sum0, <4 x double> undef, <2 x i32> <i32 0, i32 1>
  %sum1 = fadd fast <2 x double> %lo1, %hi1
  %hi2 = shufflevector <2 x double> %sum1, <2 x double> undef, <2 x i32> <i32 1, i32 undef>
  %sum2 = fadd fast <2 x double> %sum1, %hi2
  %e = extractelement <2 x double> %sum2, i32 0
  ret double %e
}

define float @PR37890_v16f32(<16 x float> %a) {
; SSE2-LABEL: PR37890_v16f32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    addps %xmm3, %xmm1
; SSE2-NEXT:    addps %xmm2, %xmm0
; SSE2-NEXT:    addps %xmm1, %xmm0
; SSE2-NEXT:    movaps %xmm0, %xmm1
; SSE2-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE2-NEXT:    addps %xmm1, %xmm0
; SSE2-NEXT:    movaps %xmm0, %xmm1
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1]
; SSE2-NEXT:    addss %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSSE3-SLOW-LABEL: PR37890_v16f32:
; SSSE3-SLOW:       # %bb.0:
; SSSE3-SLOW-NEXT:    addps %xmm3, %xmm1
; SSSE3-SLOW-NEXT:    addps %xmm2, %xmm0
; SSSE3-SLOW-NEXT:    addps %xmm1, %xmm0
; SSSE3-SLOW-NEXT:    movaps %xmm0, %xmm1
; SSSE3-SLOW-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSSE3-SLOW-NEXT:    addps %xmm1, %xmm0
; SSSE3-SLOW-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSSE3-SLOW-NEXT:    addss %xmm1, %xmm0
; SSSE3-SLOW-NEXT:    retq
;
; SSSE3-FAST-LABEL: PR37890_v16f32:
; SSSE3-FAST:       # %bb.0:
; SSSE3-FAST-NEXT:    addps %xmm3, %xmm1
; SSSE3-FAST-NEXT:    addps %xmm2, %xmm0
; SSSE3-FAST-NEXT:    addps %xmm1, %xmm0
; SSSE3-FAST-NEXT:    movaps %xmm0, %xmm1
; SSSE3-FAST-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSSE3-FAST-NEXT:    addps %xmm1, %xmm0
; SSSE3-FAST-NEXT:    haddps %xmm0, %xmm0
; SSSE3-FAST-NEXT:    retq
;
; AVX1-SLOW-LABEL: PR37890_v16f32:
; AVX1-SLOW:       # %bb.0:
; AVX1-SLOW-NEXT:    vaddps %ymm1, %ymm0, %ymm0
; AVX1-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-SLOW-NEXT:    vaddps %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX1-SLOW-NEXT:    vaddps %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX1-SLOW-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vzeroupper
; AVX1-SLOW-NEXT:    retq
;
; AVX1-FAST-LABEL: PR37890_v16f32:
; AVX1-FAST:       # %bb.0:
; AVX1-FAST-NEXT:    vaddps %ymm1, %ymm0, %ymm0
; AVX1-FAST-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-FAST-NEXT:    vhaddps %xmm0, %xmm1, %xmm0
; AVX1-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vzeroupper
; AVX1-FAST-NEXT:    retq
;
; AVX2-LABEL: PR37890_v16f32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vaddps %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vaddps %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX2-NEXT:    vaddps %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX2-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
  %hi0 = shufflevector <16 x float> %a, <16 x float> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %lo0 = shufflevector <16 x float> %a, <16 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %sum0 = fadd fast <8 x float> %lo0, %hi0
  %hi1 = shufflevector <8 x float> %sum0, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %lo1 = shufflevector <8 x float> %sum0, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %sum1 = fadd fast <4 x float> %lo1, %hi1
  %hi2 = shufflevector <4 x float> %sum1, <4 x float> undef, <2 x i32> <i32 2, i32 3>
  %lo2 = shufflevector <4 x float> %sum1, <4 x float> undef, <2 x i32> <i32 0, i32 1>
  %sum2 = fadd fast <2 x float> %lo2, %hi2
  %hi3 = shufflevector <2 x float> %sum2, <2 x float> undef, <2 x i32> <i32 1, i32 undef>
  %sum3 = fadd fast <2 x float> %sum2, %hi3
  %e = extractelement <2 x float> %sum3, i32 0
  ret float %e
}