1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2 3; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41 4; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1-SLOW 5; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fast-hops | FileCheck %s --check-prefixes=AVX,AVX1-FAST 6; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2 7; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=AVX512 8; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefix=AVX512 9 10; 11; vXf32 (accum) 12; 13 14define float @test_v2f32(float %a0, <2 x float> %a1) { 15; SSE2-LABEL: test_v2f32: 16; SSE2: # %bb.0: 17; SSE2-NEXT: addss %xmm1, %xmm0 18; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,1,1] 19; SSE2-NEXT: addss %xmm1, %xmm0 20; SSE2-NEXT: retq 21; 22; SSE41-LABEL: test_v2f32: 23; SSE41: # %bb.0: 24; SSE41-NEXT: addss %xmm1, %xmm0 25; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm1[1,1,3,3] 26; SSE41-NEXT: addss %xmm1, %xmm0 27; SSE41-NEXT: retq 28; 29; AVX-LABEL: test_v2f32: 30; AVX: # %bb.0: 31; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 32; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm1[1,1,3,3] 33; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 34; AVX-NEXT: retq 35; 36; AVX512-LABEL: test_v2f32: 37; AVX512: # %bb.0: 38; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0 39; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm1[1,1,3,3] 40; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0 41; AVX512-NEXT: retq 42 %1 = call float @llvm.vector.reduce.fadd.f32.v2f32(float %a0, <2 x float> %a1) 43 ret float %1 44} 45 46define float @test_v4f32(float %a0, <4 x float> %a1) { 47; SSE2-LABEL: test_v4f32: 48; SSE2: # %bb.0: 49; SSE2-NEXT: addss %xmm1, %xmm0 50; SSE2-NEXT: movaps %xmm1, %xmm2 51; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm1[1,1] 52; SSE2-NEXT: addss %xmm2, %xmm0 53; SSE2-NEXT: movaps %xmm1, %xmm2 54; SSE2-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1] 55; SSE2-NEXT: addss %xmm2, %xmm0 56; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3] 57; SSE2-NEXT: addss %xmm1, %xmm0 58; SSE2-NEXT: retq 59; 60; SSE41-LABEL: test_v4f32: 61; SSE41: # %bb.0: 62; SSE41-NEXT: addss %xmm1, %xmm0 63; SSE41-NEXT: movshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] 64; SSE41-NEXT: addss %xmm2, %xmm0 65; SSE41-NEXT: movaps %xmm1, %xmm2 66; SSE41-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1] 67; SSE41-NEXT: addss %xmm2, %xmm0 68; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3] 69; SSE41-NEXT: addss %xmm1, %xmm0 70; SSE41-NEXT: retq 71; 72; AVX-LABEL: test_v4f32: 73; AVX: # %bb.0: 74; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 75; AVX-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] 76; AVX-NEXT: vaddss %xmm2, %xmm0, %xmm0 77; AVX-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] 78; AVX-NEXT: vaddss %xmm2, %xmm0, %xmm0 79; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3] 80; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 81; AVX-NEXT: retq 82; 83; AVX512-LABEL: test_v4f32: 84; AVX512: # %bb.0: 85; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0 86; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] 87; AVX512-NEXT: vaddss %xmm2, %xmm0, %xmm0 88; AVX512-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] 89; AVX512-NEXT: vaddss %xmm2, %xmm0, %xmm0 90; AVX512-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3] 91; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0 92; 
AVX512-NEXT: retq 93 %1 = call float @llvm.vector.reduce.fadd.f32.v4f32(float %a0, <4 x float> %a1) 94 ret float %1 95} 96 97define float @test_v8f32(float %a0, <8 x float> %a1) { 98; SSE2-LABEL: test_v8f32: 99; SSE2: # %bb.0: 100; SSE2-NEXT: addss %xmm1, %xmm0 101; SSE2-NEXT: movaps %xmm1, %xmm3 102; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm1[1,1] 103; SSE2-NEXT: addss %xmm3, %xmm0 104; SSE2-NEXT: movaps %xmm1, %xmm3 105; SSE2-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1] 106; SSE2-NEXT: addss %xmm3, %xmm0 107; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3] 108; SSE2-NEXT: addss %xmm1, %xmm0 109; SSE2-NEXT: addss %xmm2, %xmm0 110; SSE2-NEXT: movaps %xmm2, %xmm1 111; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm2[1,1] 112; SSE2-NEXT: addss %xmm1, %xmm0 113; SSE2-NEXT: movaps %xmm2, %xmm1 114; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] 115; SSE2-NEXT: addss %xmm1, %xmm0 116; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3,3,3] 117; SSE2-NEXT: addss %xmm2, %xmm0 118; SSE2-NEXT: retq 119; 120; SSE41-LABEL: test_v8f32: 121; SSE41: # %bb.0: 122; SSE41-NEXT: addss %xmm1, %xmm0 123; SSE41-NEXT: movshdup {{.*#+}} xmm3 = xmm1[1,1,3,3] 124; SSE41-NEXT: addss %xmm3, %xmm0 125; SSE41-NEXT: movaps %xmm1, %xmm3 126; SSE41-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1] 127; SSE41-NEXT: addss %xmm3, %xmm0 128; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3] 129; SSE41-NEXT: addss %xmm1, %xmm0 130; SSE41-NEXT: addss %xmm2, %xmm0 131; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm2[1,1,3,3] 132; SSE41-NEXT: addss %xmm1, %xmm0 133; SSE41-NEXT: movaps %xmm2, %xmm1 134; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] 135; SSE41-NEXT: addss %xmm1, %xmm0 136; SSE41-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3,3,3] 137; SSE41-NEXT: addss %xmm2, %xmm0 138; SSE41-NEXT: retq 139; 140; AVX-LABEL: test_v8f32: 141; AVX: # %bb.0: 142; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 143; AVX-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] 144; AVX-NEXT: vaddss %xmm2, %xmm0, %xmm0 145; AVX-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] 146; AVX-NEXT: vaddss %xmm2, %xmm0, %xmm0 147; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm1[3,3,3,3] 148; AVX-NEXT: vaddss %xmm2, %xmm0, %xmm0 149; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1 150; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 151; AVX-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] 152; AVX-NEXT: vaddss %xmm2, %xmm0, %xmm0 153; AVX-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] 154; AVX-NEXT: vaddss %xmm2, %xmm0, %xmm0 155; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3] 156; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 157; AVX-NEXT: vzeroupper 158; AVX-NEXT: retq 159; 160; AVX512-LABEL: test_v8f32: 161; AVX512: # %bb.0: 162; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0 163; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] 164; AVX512-NEXT: vaddss %xmm2, %xmm0, %xmm0 165; AVX512-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] 166; AVX512-NEXT: vaddss %xmm2, %xmm0, %xmm0 167; AVX512-NEXT: vshufps {{.*#+}} xmm2 = xmm1[3,3,3,3] 168; AVX512-NEXT: vaddss %xmm2, %xmm0, %xmm0 169; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm1 170; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0 171; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] 172; AVX512-NEXT: vaddss %xmm2, %xmm0, %xmm0 173; AVX512-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] 174; AVX512-NEXT: vaddss %xmm2, %xmm0, %xmm0 175; AVX512-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3] 176; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0 177; AVX512-NEXT: vzeroupper 178; AVX512-NEXT: retq 179 %1 = call float @llvm.vector.reduce.fadd.f32.v8f32(float %a0, <8 x float> %a1) 180 ret float %1 181} 182 183define float 
@test_v16f32(float %a0, <16 x float> %a1) { 184; SSE2-LABEL: test_v16f32: 185; SSE2: # %bb.0: 186; SSE2-NEXT: addss %xmm1, %xmm0 187; SSE2-NEXT: movaps %xmm1, %xmm5 188; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm1[1,1] 189; SSE2-NEXT: addss %xmm5, %xmm0 190; SSE2-NEXT: movaps %xmm1, %xmm5 191; SSE2-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm1[1] 192; SSE2-NEXT: addss %xmm5, %xmm0 193; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3] 194; SSE2-NEXT: addss %xmm1, %xmm0 195; SSE2-NEXT: addss %xmm2, %xmm0 196; SSE2-NEXT: movaps %xmm2, %xmm1 197; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm2[1,1] 198; SSE2-NEXT: addss %xmm1, %xmm0 199; SSE2-NEXT: movaps %xmm2, %xmm1 200; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] 201; SSE2-NEXT: addss %xmm1, %xmm0 202; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3,3,3] 203; SSE2-NEXT: addss %xmm2, %xmm0 204; SSE2-NEXT: addss %xmm3, %xmm0 205; SSE2-NEXT: movaps %xmm3, %xmm1 206; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm3[1,1] 207; SSE2-NEXT: addss %xmm1, %xmm0 208; SSE2-NEXT: movaps %xmm3, %xmm1 209; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1] 210; SSE2-NEXT: addss %xmm1, %xmm0 211; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3,3,3] 212; SSE2-NEXT: addss %xmm3, %xmm0 213; SSE2-NEXT: addss %xmm4, %xmm0 214; SSE2-NEXT: movaps %xmm4, %xmm1 215; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm4[1,1] 216; SSE2-NEXT: addss %xmm1, %xmm0 217; SSE2-NEXT: movaps %xmm4, %xmm1 218; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm4[1] 219; SSE2-NEXT: addss %xmm1, %xmm0 220; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,3,3,3] 221; SSE2-NEXT: addss %xmm4, %xmm0 222; SSE2-NEXT: retq 223; 224; SSE41-LABEL: test_v16f32: 225; SSE41: # %bb.0: 226; SSE41-NEXT: addss %xmm1, %xmm0 227; SSE41-NEXT: movshdup {{.*#+}} xmm5 = xmm1[1,1,3,3] 228; SSE41-NEXT: addss %xmm5, %xmm0 229; SSE41-NEXT: movaps %xmm1, %xmm5 230; SSE41-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm1[1] 231; SSE41-NEXT: addss %xmm5, %xmm0 232; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3] 233; SSE41-NEXT: addss %xmm1, %xmm0 234; SSE41-NEXT: addss %xmm2, %xmm0 235; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm2[1,1,3,3] 236; SSE41-NEXT: addss %xmm1, %xmm0 237; SSE41-NEXT: movaps %xmm2, %xmm1 238; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] 239; SSE41-NEXT: addss %xmm1, %xmm0 240; SSE41-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3,3,3] 241; SSE41-NEXT: addss %xmm2, %xmm0 242; SSE41-NEXT: addss %xmm3, %xmm0 243; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm3[1,1,3,3] 244; SSE41-NEXT: addss %xmm1, %xmm0 245; SSE41-NEXT: movaps %xmm3, %xmm1 246; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1] 247; SSE41-NEXT: addss %xmm1, %xmm0 248; SSE41-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3,3,3] 249; SSE41-NEXT: addss %xmm3, %xmm0 250; SSE41-NEXT: addss %xmm4, %xmm0 251; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm4[1,1,3,3] 252; SSE41-NEXT: addss %xmm1, %xmm0 253; SSE41-NEXT: movaps %xmm4, %xmm1 254; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm4[1] 255; SSE41-NEXT: addss %xmm1, %xmm0 256; SSE41-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,3,3,3] 257; SSE41-NEXT: addss %xmm4, %xmm0 258; SSE41-NEXT: retq 259; 260; AVX-LABEL: test_v16f32: 261; AVX: # %bb.0: 262; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 263; AVX-NEXT: vmovshdup {{.*#+}} xmm3 = xmm1[1,1,3,3] 264; AVX-NEXT: vaddss %xmm3, %xmm0, %xmm0 265; AVX-NEXT: vshufpd {{.*#+}} xmm3 = xmm1[1,0] 266; AVX-NEXT: vaddss %xmm3, %xmm0, %xmm0 267; AVX-NEXT: vshufps {{.*#+}} xmm3 = xmm1[3,3,3,3] 268; AVX-NEXT: vaddss %xmm3, %xmm0, %xmm0 269; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1 270; AVX-NEXT: 
vaddss %xmm1, %xmm0, %xmm0 271; AVX-NEXT: vmovshdup {{.*#+}} xmm3 = xmm1[1,1,3,3] 272; AVX-NEXT: vaddss %xmm3, %xmm0, %xmm0 273; AVX-NEXT: vshufpd {{.*#+}} xmm3 = xmm1[1,0] 274; AVX-NEXT: vaddss %xmm3, %xmm0, %xmm0 275; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3] 276; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 277; AVX-NEXT: vaddss %xmm2, %xmm0, %xmm0 278; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm2[1,1,3,3] 279; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 280; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm2[1,0] 281; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 282; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm2[3,3,3,3] 283; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 284; AVX-NEXT: vextractf128 $1, %ymm2, %xmm1 285; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 286; AVX-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] 287; AVX-NEXT: vaddss %xmm2, %xmm0, %xmm0 288; AVX-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] 289; AVX-NEXT: vaddss %xmm2, %xmm0, %xmm0 290; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3] 291; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 292; AVX-NEXT: vzeroupper 293; AVX-NEXT: retq 294; 295; AVX512-LABEL: test_v16f32: 296; AVX512: # %bb.0: 297; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0 298; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] 299; AVX512-NEXT: vaddss %xmm2, %xmm0, %xmm0 300; AVX512-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] 301; AVX512-NEXT: vaddss %xmm2, %xmm0, %xmm0 302; AVX512-NEXT: vshufps {{.*#+}} xmm2 = xmm1[3,3,3,3] 303; AVX512-NEXT: vaddss %xmm2, %xmm0, %xmm0 304; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm2 305; AVX512-NEXT: vaddss %xmm2, %xmm0, %xmm0 306; AVX512-NEXT: vmovshdup {{.*#+}} xmm3 = xmm2[1,1,3,3] 307; AVX512-NEXT: vaddss %xmm3, %xmm0, %xmm0 308; AVX512-NEXT: vshufpd {{.*#+}} xmm3 = xmm2[1,0] 309; AVX512-NEXT: vaddss %xmm3, %xmm0, %xmm0 310; AVX512-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,3,3,3] 311; AVX512-NEXT: vaddss %xmm2, %xmm0, %xmm0 312; AVX512-NEXT: vextractf32x4 $2, %zmm1, %xmm2 313; AVX512-NEXT: vaddss %xmm2, %xmm0, %xmm0 314; AVX512-NEXT: vmovshdup {{.*#+}} xmm3 = xmm2[1,1,3,3] 315; AVX512-NEXT: vaddss %xmm3, %xmm0, %xmm0 316; AVX512-NEXT: vshufpd {{.*#+}} xmm3 = xmm2[1,0] 317; AVX512-NEXT: vaddss %xmm3, %xmm0, %xmm0 318; AVX512-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,3,3,3] 319; AVX512-NEXT: vaddss %xmm2, %xmm0, %xmm0 320; AVX512-NEXT: vextractf32x4 $3, %zmm1, %xmm1 321; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0 322; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] 323; AVX512-NEXT: vaddss %xmm2, %xmm0, %xmm0 324; AVX512-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] 325; AVX512-NEXT: vaddss %xmm2, %xmm0, %xmm0 326; AVX512-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3] 327; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0 328; AVX512-NEXT: vzeroupper 329; AVX512-NEXT: retq 330 %1 = call float @llvm.vector.reduce.fadd.f32.v16f32(float %a0, <16 x float> %a1) 331 ret float %1 332} 333 334; 335; vXf32 (zero) 336; 337 338define float @test_v2f32_zero(<2 x float> %a0) { 339; SSE2-LABEL: test_v2f32_zero: 340; SSE2: # %bb.0: 341; SSE2-NEXT: movaps %xmm0, %xmm1 342; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1] 343; SSE2-NEXT: addss %xmm1, %xmm0 344; SSE2-NEXT: retq 345; 346; SSE41-LABEL: test_v2f32_zero: 347; SSE41: # %bb.0: 348; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] 349; SSE41-NEXT: addss %xmm1, %xmm0 350; SSE41-NEXT: retq 351; 352; AVX1-SLOW-LABEL: test_v2f32_zero: 353; AVX1-SLOW: # %bb.0: 354; AVX1-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] 355; AVX1-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0 356; AVX1-SLOW-NEXT: retq 357; 358; AVX1-FAST-LABEL: test_v2f32_zero: 359; AVX1-FAST: # %bb.0: 360; 
AVX1-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0 361; AVX1-FAST-NEXT: retq 362; 363; AVX2-LABEL: test_v2f32_zero: 364; AVX2: # %bb.0: 365; AVX2-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] 366; AVX2-NEXT: vaddss %xmm1, %xmm0, %xmm0 367; AVX2-NEXT: retq 368; 369; AVX512-LABEL: test_v2f32_zero: 370; AVX512: # %bb.0: 371; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] 372; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0 373; AVX512-NEXT: retq 374 %1 = call float @llvm.vector.reduce.fadd.f32.v2f32(float -0.0, <2 x float> %a0) 375 ret float %1 376} 377 378define float @test_v4f32_zero(<4 x float> %a0) { 379; SSE2-LABEL: test_v4f32_zero: 380; SSE2: # %bb.0: 381; SSE2-NEXT: movaps %xmm0, %xmm1 382; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1] 383; SSE2-NEXT: addss %xmm0, %xmm1 384; SSE2-NEXT: movaps %xmm0, %xmm2 385; SSE2-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] 386; SSE2-NEXT: addss %xmm1, %xmm2 387; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] 388; SSE2-NEXT: addss %xmm2, %xmm0 389; SSE2-NEXT: retq 390; 391; SSE41-LABEL: test_v4f32_zero: 392; SSE41: # %bb.0: 393; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] 394; SSE41-NEXT: addss %xmm0, %xmm1 395; SSE41-NEXT: movaps %xmm0, %xmm2 396; SSE41-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] 397; SSE41-NEXT: addss %xmm1, %xmm2 398; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] 399; SSE41-NEXT: addss %xmm2, %xmm0 400; SSE41-NEXT: retq 401; 402; AVX1-SLOW-LABEL: test_v4f32_zero: 403; AVX1-SLOW: # %bb.0: 404; AVX1-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] 405; AVX1-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm1 406; AVX1-SLOW-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1,0] 407; AVX1-SLOW-NEXT: vaddss %xmm2, %xmm1, %xmm1 408; AVX1-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] 409; AVX1-SLOW-NEXT: vaddss %xmm0, %xmm1, %xmm0 410; AVX1-SLOW-NEXT: retq 411; 412; AVX1-FAST-LABEL: test_v4f32_zero: 413; AVX1-FAST: # %bb.0: 414; AVX1-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm1 415; AVX1-FAST-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1,0] 416; AVX1-FAST-NEXT: vaddss %xmm2, %xmm1, %xmm1 417; AVX1-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] 418; AVX1-FAST-NEXT: vaddss %xmm0, %xmm1, %xmm0 419; AVX1-FAST-NEXT: retq 420; 421; AVX2-LABEL: test_v4f32_zero: 422; AVX2: # %bb.0: 423; AVX2-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] 424; AVX2-NEXT: vaddss %xmm1, %xmm0, %xmm1 425; AVX2-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1,0] 426; AVX2-NEXT: vaddss %xmm2, %xmm1, %xmm1 427; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] 428; AVX2-NEXT: vaddss %xmm0, %xmm1, %xmm0 429; AVX2-NEXT: retq 430; 431; AVX512-LABEL: test_v4f32_zero: 432; AVX512: # %bb.0: 433; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] 434; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm1 435; AVX512-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1,0] 436; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1 437; AVX512-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] 438; AVX512-NEXT: vaddss %xmm0, %xmm1, %xmm0 439; AVX512-NEXT: retq 440 %1 = call float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %a0) 441 ret float %1 442} 443 444define float @test_v8f32_zero(<8 x float> %a0) { 445; SSE2-LABEL: test_v8f32_zero: 446; SSE2: # %bb.0: 447; SSE2-NEXT: movaps %xmm0, %xmm2 448; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm0[1,1] 449; SSE2-NEXT: addss %xmm0, %xmm2 450; SSE2-NEXT: movaps %xmm0, %xmm3 451; SSE2-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm0[1] 452; SSE2-NEXT: addss %xmm2, %xmm3 453; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] 454; SSE2-NEXT: addss %xmm3, %xmm0 455; SSE2-NEXT: addss %xmm1, %xmm0 456; SSE2-NEXT: 
movaps %xmm1, %xmm2 457; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm1[1,1] 458; SSE2-NEXT: addss %xmm2, %xmm0 459; SSE2-NEXT: movaps %xmm1, %xmm2 460; SSE2-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1] 461; SSE2-NEXT: addss %xmm2, %xmm0 462; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3] 463; SSE2-NEXT: addss %xmm1, %xmm0 464; SSE2-NEXT: retq 465; 466; SSE41-LABEL: test_v8f32_zero: 467; SSE41: # %bb.0: 468; SSE41-NEXT: movshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] 469; SSE41-NEXT: addss %xmm0, %xmm2 470; SSE41-NEXT: movaps %xmm0, %xmm3 471; SSE41-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm0[1] 472; SSE41-NEXT: addss %xmm2, %xmm3 473; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] 474; SSE41-NEXT: addss %xmm3, %xmm0 475; SSE41-NEXT: addss %xmm1, %xmm0 476; SSE41-NEXT: movshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] 477; SSE41-NEXT: addss %xmm2, %xmm0 478; SSE41-NEXT: movaps %xmm1, %xmm2 479; SSE41-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1] 480; SSE41-NEXT: addss %xmm2, %xmm0 481; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3] 482; SSE41-NEXT: addss %xmm1, %xmm0 483; SSE41-NEXT: retq 484; 485; AVX1-SLOW-LABEL: test_v8f32_zero: 486; AVX1-SLOW: # %bb.0: 487; AVX1-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] 488; AVX1-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm1 489; AVX1-SLOW-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1,0] 490; AVX1-SLOW-NEXT: vaddss %xmm2, %xmm1, %xmm1 491; AVX1-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm0[3,3,3,3] 492; AVX1-SLOW-NEXT: vaddss %xmm2, %xmm1, %xmm1 493; AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm0 494; AVX1-SLOW-NEXT: vaddss %xmm0, %xmm1, %xmm1 495; AVX1-SLOW-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] 496; AVX1-SLOW-NEXT: vaddss %xmm2, %xmm1, %xmm1 497; AVX1-SLOW-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1,0] 498; AVX1-SLOW-NEXT: vaddss %xmm2, %xmm1, %xmm1 499; AVX1-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] 500; AVX1-SLOW-NEXT: vaddss %xmm0, %xmm1, %xmm0 501; AVX1-SLOW-NEXT: vzeroupper 502; AVX1-SLOW-NEXT: retq 503; 504; AVX1-FAST-LABEL: test_v8f32_zero: 505; AVX1-FAST: # %bb.0: 506; AVX1-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm1 507; AVX1-FAST-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1,0] 508; AVX1-FAST-NEXT: vaddss %xmm2, %xmm1, %xmm1 509; AVX1-FAST-NEXT: vshufps {{.*#+}} xmm2 = xmm0[3,3,3,3] 510; AVX1-FAST-NEXT: vaddss %xmm2, %xmm1, %xmm1 511; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm0 512; AVX1-FAST-NEXT: vaddss %xmm0, %xmm1, %xmm1 513; AVX1-FAST-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] 514; AVX1-FAST-NEXT: vaddss %xmm2, %xmm1, %xmm1 515; AVX1-FAST-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1,0] 516; AVX1-FAST-NEXT: vaddss %xmm2, %xmm1, %xmm1 517; AVX1-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] 518; AVX1-FAST-NEXT: vaddss %xmm0, %xmm1, %xmm0 519; AVX1-FAST-NEXT: vzeroupper 520; AVX1-FAST-NEXT: retq 521; 522; AVX2-LABEL: test_v8f32_zero: 523; AVX2: # %bb.0: 524; AVX2-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] 525; AVX2-NEXT: vaddss %xmm1, %xmm0, %xmm1 526; AVX2-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1,0] 527; AVX2-NEXT: vaddss %xmm2, %xmm1, %xmm1 528; AVX2-NEXT: vshufps {{.*#+}} xmm2 = xmm0[3,3,3,3] 529; AVX2-NEXT: vaddss %xmm2, %xmm1, %xmm1 530; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0 531; AVX2-NEXT: vaddss %xmm0, %xmm1, %xmm1 532; AVX2-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] 533; AVX2-NEXT: vaddss %xmm2, %xmm1, %xmm1 534; AVX2-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1,0] 535; AVX2-NEXT: vaddss %xmm2, %xmm1, %xmm1 536; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] 537; AVX2-NEXT: vaddss %xmm0, %xmm1, %xmm0 538; AVX2-NEXT: vzeroupper 539; AVX2-NEXT: retq 540; 541; 
AVX512-LABEL: test_v8f32_zero: 542; AVX512: # %bb.0: 543; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] 544; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm1 545; AVX512-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1,0] 546; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1 547; AVX512-NEXT: vshufps {{.*#+}} xmm2 = xmm0[3,3,3,3] 548; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1 549; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0 550; AVX512-NEXT: vaddss %xmm0, %xmm1, %xmm1 551; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] 552; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1 553; AVX512-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1,0] 554; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1 555; AVX512-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] 556; AVX512-NEXT: vaddss %xmm0, %xmm1, %xmm0 557; AVX512-NEXT: vzeroupper 558; AVX512-NEXT: retq 559 %1 = call float @llvm.vector.reduce.fadd.f32.v8f32(float -0.0, <8 x float> %a0) 560 ret float %1 561} 562 563define float @test_v16f32_zero(<16 x float> %a0) { 564; SSE2-LABEL: test_v16f32_zero: 565; SSE2: # %bb.0: 566; SSE2-NEXT: movaps %xmm0, %xmm4 567; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm0[1,1] 568; SSE2-NEXT: addss %xmm0, %xmm4 569; SSE2-NEXT: movaps %xmm0, %xmm5 570; SSE2-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm0[1] 571; SSE2-NEXT: addss %xmm4, %xmm5 572; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] 573; SSE2-NEXT: addss %xmm5, %xmm0 574; SSE2-NEXT: addss %xmm1, %xmm0 575; SSE2-NEXT: movaps %xmm1, %xmm4 576; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm1[1,1] 577; SSE2-NEXT: addss %xmm4, %xmm0 578; SSE2-NEXT: movaps %xmm1, %xmm4 579; SSE2-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm1[1] 580; SSE2-NEXT: addss %xmm4, %xmm0 581; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3] 582; SSE2-NEXT: addss %xmm1, %xmm0 583; SSE2-NEXT: addss %xmm2, %xmm0 584; SSE2-NEXT: movaps %xmm2, %xmm1 585; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm2[1,1] 586; SSE2-NEXT: addss %xmm1, %xmm0 587; SSE2-NEXT: movaps %xmm2, %xmm1 588; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] 589; SSE2-NEXT: addss %xmm1, %xmm0 590; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3,3,3] 591; SSE2-NEXT: addss %xmm2, %xmm0 592; SSE2-NEXT: addss %xmm3, %xmm0 593; SSE2-NEXT: movaps %xmm3, %xmm1 594; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm3[1,1] 595; SSE2-NEXT: addss %xmm1, %xmm0 596; SSE2-NEXT: movaps %xmm3, %xmm1 597; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1] 598; SSE2-NEXT: addss %xmm1, %xmm0 599; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3,3,3] 600; SSE2-NEXT: addss %xmm3, %xmm0 601; SSE2-NEXT: retq 602; 603; SSE41-LABEL: test_v16f32_zero: 604; SSE41: # %bb.0: 605; SSE41-NEXT: movshdup {{.*#+}} xmm4 = xmm0[1,1,3,3] 606; SSE41-NEXT: addss %xmm0, %xmm4 607; SSE41-NEXT: movaps %xmm0, %xmm5 608; SSE41-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm0[1] 609; SSE41-NEXT: addss %xmm4, %xmm5 610; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] 611; SSE41-NEXT: addss %xmm5, %xmm0 612; SSE41-NEXT: addss %xmm1, %xmm0 613; SSE41-NEXT: movshdup {{.*#+}} xmm4 = xmm1[1,1,3,3] 614; SSE41-NEXT: addss %xmm4, %xmm0 615; SSE41-NEXT: movaps %xmm1, %xmm4 616; SSE41-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm1[1] 617; SSE41-NEXT: addss %xmm4, %xmm0 618; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3] 619; SSE41-NEXT: addss %xmm1, %xmm0 620; SSE41-NEXT: addss %xmm2, %xmm0 621; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm2[1,1,3,3] 622; SSE41-NEXT: addss %xmm1, %xmm0 623; SSE41-NEXT: movaps %xmm2, %xmm1 624; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] 625; SSE41-NEXT: addss %xmm1, %xmm0 626; SSE41-NEXT: shufps {{.*#+}} xmm2 = 
xmm2[3,3,3,3] 627; SSE41-NEXT: addss %xmm2, %xmm0 628; SSE41-NEXT: addss %xmm3, %xmm0 629; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm3[1,1,3,3] 630; SSE41-NEXT: addss %xmm1, %xmm0 631; SSE41-NEXT: movaps %xmm3, %xmm1 632; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1] 633; SSE41-NEXT: addss %xmm1, %xmm0 634; SSE41-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3,3,3] 635; SSE41-NEXT: addss %xmm3, %xmm0 636; SSE41-NEXT: retq 637; 638; AVX1-SLOW-LABEL: test_v16f32_zero: 639; AVX1-SLOW: # %bb.0: 640; AVX1-SLOW-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] 641; AVX1-SLOW-NEXT: vaddss %xmm2, %xmm0, %xmm2 642; AVX1-SLOW-NEXT: vshufpd {{.*#+}} xmm3 = xmm0[1,0] 643; AVX1-SLOW-NEXT: vaddss %xmm3, %xmm2, %xmm2 644; AVX1-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm0[3,3,3,3] 645; AVX1-SLOW-NEXT: vaddss %xmm3, %xmm2, %xmm2 646; AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm0 647; AVX1-SLOW-NEXT: vaddss %xmm0, %xmm2, %xmm2 648; AVX1-SLOW-NEXT: vmovshdup {{.*#+}} xmm3 = xmm0[1,1,3,3] 649; AVX1-SLOW-NEXT: vaddss %xmm3, %xmm2, %xmm2 650; AVX1-SLOW-NEXT: vshufpd {{.*#+}} xmm3 = xmm0[1,0] 651; AVX1-SLOW-NEXT: vaddss %xmm3, %xmm2, %xmm2 652; AVX1-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] 653; AVX1-SLOW-NEXT: vaddss %xmm0, %xmm2, %xmm0 654; AVX1-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0 655; AVX1-SLOW-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] 656; AVX1-SLOW-NEXT: vaddss %xmm2, %xmm0, %xmm0 657; AVX1-SLOW-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] 658; AVX1-SLOW-NEXT: vaddss %xmm2, %xmm0, %xmm0 659; AVX1-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm1[3,3,3,3] 660; AVX1-SLOW-NEXT: vaddss %xmm2, %xmm0, %xmm0 661; AVX1-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm1 662; AVX1-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0 663; AVX1-SLOW-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] 664; AVX1-SLOW-NEXT: vaddss %xmm2, %xmm0, %xmm0 665; AVX1-SLOW-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] 666; AVX1-SLOW-NEXT: vaddss %xmm2, %xmm0, %xmm0 667; AVX1-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3] 668; AVX1-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0 669; AVX1-SLOW-NEXT: vzeroupper 670; AVX1-SLOW-NEXT: retq 671; 672; AVX1-FAST-LABEL: test_v16f32_zero: 673; AVX1-FAST: # %bb.0: 674; AVX1-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm2 675; AVX1-FAST-NEXT: vshufpd {{.*#+}} xmm3 = xmm0[1,0] 676; AVX1-FAST-NEXT: vaddss %xmm3, %xmm2, %xmm2 677; AVX1-FAST-NEXT: vshufps {{.*#+}} xmm3 = xmm0[3,3,3,3] 678; AVX1-FAST-NEXT: vaddss %xmm3, %xmm2, %xmm2 679; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm0 680; AVX1-FAST-NEXT: vaddss %xmm0, %xmm2, %xmm2 681; AVX1-FAST-NEXT: vmovshdup {{.*#+}} xmm3 = xmm0[1,1,3,3] 682; AVX1-FAST-NEXT: vaddss %xmm3, %xmm2, %xmm2 683; AVX1-FAST-NEXT: vshufpd {{.*#+}} xmm3 = xmm0[1,0] 684; AVX1-FAST-NEXT: vaddss %xmm3, %xmm2, %xmm2 685; AVX1-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] 686; AVX1-FAST-NEXT: vaddss %xmm0, %xmm2, %xmm0 687; AVX1-FAST-NEXT: vaddss %xmm1, %xmm0, %xmm0 688; AVX1-FAST-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] 689; AVX1-FAST-NEXT: vaddss %xmm2, %xmm0, %xmm0 690; AVX1-FAST-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] 691; AVX1-FAST-NEXT: vaddss %xmm2, %xmm0, %xmm0 692; AVX1-FAST-NEXT: vshufps {{.*#+}} xmm2 = xmm1[3,3,3,3] 693; AVX1-FAST-NEXT: vaddss %xmm2, %xmm0, %xmm0 694; AVX1-FAST-NEXT: vextractf128 $1, %ymm1, %xmm1 695; AVX1-FAST-NEXT: vaddss %xmm1, %xmm0, %xmm0 696; AVX1-FAST-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] 697; AVX1-FAST-NEXT: vaddss %xmm2, %xmm0, %xmm0 698; AVX1-FAST-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] 699; AVX1-FAST-NEXT: vaddss %xmm2, %xmm0, %xmm0 700; AVX1-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3] 
701; AVX1-FAST-NEXT: vaddss %xmm1, %xmm0, %xmm0 702; AVX1-FAST-NEXT: vzeroupper 703; AVX1-FAST-NEXT: retq 704; 705; AVX2-LABEL: test_v16f32_zero: 706; AVX2: # %bb.0: 707; AVX2-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] 708; AVX2-NEXT: vaddss %xmm2, %xmm0, %xmm2 709; AVX2-NEXT: vshufpd {{.*#+}} xmm3 = xmm0[1,0] 710; AVX2-NEXT: vaddss %xmm3, %xmm2, %xmm2 711; AVX2-NEXT: vshufps {{.*#+}} xmm3 = xmm0[3,3,3,3] 712; AVX2-NEXT: vaddss %xmm3, %xmm2, %xmm2 713; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0 714; AVX2-NEXT: vaddss %xmm0, %xmm2, %xmm2 715; AVX2-NEXT: vmovshdup {{.*#+}} xmm3 = xmm0[1,1,3,3] 716; AVX2-NEXT: vaddss %xmm3, %xmm2, %xmm2 717; AVX2-NEXT: vshufpd {{.*#+}} xmm3 = xmm0[1,0] 718; AVX2-NEXT: vaddss %xmm3, %xmm2, %xmm2 719; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] 720; AVX2-NEXT: vaddss %xmm0, %xmm2, %xmm0 721; AVX2-NEXT: vaddss %xmm1, %xmm0, %xmm0 722; AVX2-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] 723; AVX2-NEXT: vaddss %xmm2, %xmm0, %xmm0 724; AVX2-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] 725; AVX2-NEXT: vaddss %xmm2, %xmm0, %xmm0 726; AVX2-NEXT: vshufps {{.*#+}} xmm2 = xmm1[3,3,3,3] 727; AVX2-NEXT: vaddss %xmm2, %xmm0, %xmm0 728; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm1 729; AVX2-NEXT: vaddss %xmm1, %xmm0, %xmm0 730; AVX2-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] 731; AVX2-NEXT: vaddss %xmm2, %xmm0, %xmm0 732; AVX2-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] 733; AVX2-NEXT: vaddss %xmm2, %xmm0, %xmm0 734; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3] 735; AVX2-NEXT: vaddss %xmm1, %xmm0, %xmm0 736; AVX2-NEXT: vzeroupper 737; AVX2-NEXT: retq 738; 739; AVX512-LABEL: test_v16f32_zero: 740; AVX512: # %bb.0: 741; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] 742; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm1 743; AVX512-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1,0] 744; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1 745; AVX512-NEXT: vshufps {{.*#+}} xmm2 = xmm0[3,3,3,3] 746; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1 747; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm2 748; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1 749; AVX512-NEXT: vmovshdup {{.*#+}} xmm3 = xmm2[1,1,3,3] 750; AVX512-NEXT: vaddss %xmm3, %xmm1, %xmm1 751; AVX512-NEXT: vshufpd {{.*#+}} xmm3 = xmm2[1,0] 752; AVX512-NEXT: vaddss %xmm3, %xmm1, %xmm1 753; AVX512-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,3,3,3] 754; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1 755; AVX512-NEXT: vextractf32x4 $2, %zmm0, %xmm2 756; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1 757; AVX512-NEXT: vmovshdup {{.*#+}} xmm3 = xmm2[1,1,3,3] 758; AVX512-NEXT: vaddss %xmm3, %xmm1, %xmm1 759; AVX512-NEXT: vshufpd {{.*#+}} xmm3 = xmm2[1,0] 760; AVX512-NEXT: vaddss %xmm3, %xmm1, %xmm1 761; AVX512-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,3,3,3] 762; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1 763; AVX512-NEXT: vextractf32x4 $3, %zmm0, %xmm0 764; AVX512-NEXT: vaddss %xmm0, %xmm1, %xmm1 765; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] 766; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1 767; AVX512-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1,0] 768; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1 769; AVX512-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] 770; AVX512-NEXT: vaddss %xmm0, %xmm1, %xmm0 771; AVX512-NEXT: vzeroupper 772; AVX512-NEXT: retq 773 %1 = call float @llvm.vector.reduce.fadd.f32.v16f32(float -0.0, <16 x float> %a0) 774 ret float %1 775} 776 777; 778; vXf32 (undef) 779; 780 781define float @test_v2f32_undef(<2 x float> %a0) { 782; SSE2-LABEL: test_v2f32_undef: 783; SSE2: # %bb.0: 784; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] 785; SSE2-NEXT: addss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), 
%xmm0 786; SSE2-NEXT: retq 787; 788; SSE41-LABEL: test_v2f32_undef: 789; SSE41: # %bb.0: 790; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] 791; SSE41-NEXT: addss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 792; SSE41-NEXT: retq 793; 794; AVX-LABEL: test_v2f32_undef: 795; AVX: # %bb.0: 796; AVX-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] 797; AVX-NEXT: vaddss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 798; AVX-NEXT: retq 799; 800; AVX512-LABEL: test_v2f32_undef: 801; AVX512: # %bb.0: 802; AVX512-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] 803; AVX512-NEXT: vaddss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 804; AVX512-NEXT: retq 805 %1 = call float @llvm.vector.reduce.fadd.f32.v2f32(float undef, <2 x float> %a0) 806 ret float %1 807} 808 809define float @test_v4f32_undef(<4 x float> %a0) { 810; SSE2-LABEL: test_v4f32_undef: 811; SSE2: # %bb.0: 812; SSE2-NEXT: movaps %xmm0, %xmm1 813; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1] 814; SSE2-NEXT: addss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 815; SSE2-NEXT: movaps %xmm0, %xmm2 816; SSE2-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] 817; SSE2-NEXT: addss %xmm1, %xmm2 818; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] 819; SSE2-NEXT: addss %xmm2, %xmm0 820; SSE2-NEXT: retq 821; 822; SSE41-LABEL: test_v4f32_undef: 823; SSE41: # %bb.0: 824; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] 825; SSE41-NEXT: addss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 826; SSE41-NEXT: movaps %xmm0, %xmm2 827; SSE41-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] 828; SSE41-NEXT: addss %xmm1, %xmm2 829; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] 830; SSE41-NEXT: addss %xmm2, %xmm0 831; SSE41-NEXT: retq 832; 833; AVX-LABEL: test_v4f32_undef: 834; AVX: # %bb.0: 835; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] 836; AVX-NEXT: vaddss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 837; AVX-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1,0] 838; AVX-NEXT: vaddss %xmm2, %xmm1, %xmm1 839; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] 840; AVX-NEXT: vaddss %xmm0, %xmm1, %xmm0 841; AVX-NEXT: retq 842; 843; AVX512-LABEL: test_v4f32_undef: 844; AVX512: # %bb.0: 845; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] 846; AVX512-NEXT: vaddss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 847; AVX512-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1,0] 848; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1 849; AVX512-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] 850; AVX512-NEXT: vaddss %xmm0, %xmm1, %xmm0 851; AVX512-NEXT: retq 852 %1 = call float @llvm.vector.reduce.fadd.f32.v4f32(float undef, <4 x float> %a0) 853 ret float %1 854} 855 856define float @test_v8f32_undef(<8 x float> %a0) { 857; SSE2-LABEL: test_v8f32_undef: 858; SSE2: # %bb.0: 859; SSE2-NEXT: movaps %xmm0, %xmm2 860; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm0[1,1] 861; SSE2-NEXT: addss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 862; SSE2-NEXT: movaps %xmm0, %xmm3 863; SSE2-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm0[1] 864; SSE2-NEXT: addss %xmm2, %xmm3 865; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] 866; SSE2-NEXT: addss %xmm3, %xmm0 867; SSE2-NEXT: addss %xmm1, %xmm0 868; SSE2-NEXT: movaps %xmm1, %xmm2 869; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm1[1,1] 870; SSE2-NEXT: addss %xmm2, %xmm0 871; SSE2-NEXT: movaps %xmm1, %xmm2 872; SSE2-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1] 873; SSE2-NEXT: addss %xmm2, %xmm0 874; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3] 875; SSE2-NEXT: addss %xmm1, %xmm0 876; SSE2-NEXT: retq 877; 878; SSE41-LABEL: test_v8f32_undef: 879; SSE41: # %bb.0: 880; SSE41-NEXT: movshdup 
{{.*#+}} xmm2 = xmm0[1,1,3,3] 881; SSE41-NEXT: addss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 882; SSE41-NEXT: movaps %xmm0, %xmm3 883; SSE41-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm0[1] 884; SSE41-NEXT: addss %xmm2, %xmm3 885; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] 886; SSE41-NEXT: addss %xmm3, %xmm0 887; SSE41-NEXT: addss %xmm1, %xmm0 888; SSE41-NEXT: movshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] 889; SSE41-NEXT: addss %xmm2, %xmm0 890; SSE41-NEXT: movaps %xmm1, %xmm2 891; SSE41-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1] 892; SSE41-NEXT: addss %xmm2, %xmm0 893; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3] 894; SSE41-NEXT: addss %xmm1, %xmm0 895; SSE41-NEXT: retq 896; 897; AVX-LABEL: test_v8f32_undef: 898; AVX: # %bb.0: 899; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] 900; AVX-NEXT: vaddss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 901; AVX-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1,0] 902; AVX-NEXT: vaddss %xmm2, %xmm1, %xmm1 903; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm0[3,3,3,3] 904; AVX-NEXT: vaddss %xmm2, %xmm1, %xmm1 905; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 906; AVX-NEXT: vaddss %xmm0, %xmm1, %xmm1 907; AVX-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] 908; AVX-NEXT: vaddss %xmm2, %xmm1, %xmm1 909; AVX-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1,0] 910; AVX-NEXT: vaddss %xmm2, %xmm1, %xmm1 911; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] 912; AVX-NEXT: vaddss %xmm0, %xmm1, %xmm0 913; AVX-NEXT: vzeroupper 914; AVX-NEXT: retq 915; 916; AVX512-LABEL: test_v8f32_undef: 917; AVX512: # %bb.0: 918; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] 919; AVX512-NEXT: vaddss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 920; AVX512-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1,0] 921; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1 922; AVX512-NEXT: vshufps {{.*#+}} xmm2 = xmm0[3,3,3,3] 923; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1 924; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0 925; AVX512-NEXT: vaddss %xmm0, %xmm1, %xmm1 926; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] 927; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1 928; AVX512-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1,0] 929; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1 930; AVX512-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] 931; AVX512-NEXT: vaddss %xmm0, %xmm1, %xmm0 932; AVX512-NEXT: vzeroupper 933; AVX512-NEXT: retq 934 %1 = call float @llvm.vector.reduce.fadd.f32.v8f32(float undef, <8 x float> %a0) 935 ret float %1 936} 937 938define float @test_v16f32_undef(<16 x float> %a0) { 939; SSE2-LABEL: test_v16f32_undef: 940; SSE2: # %bb.0: 941; SSE2-NEXT: movaps %xmm0, %xmm4 942; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm0[1,1] 943; SSE2-NEXT: addss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 944; SSE2-NEXT: movaps %xmm0, %xmm5 945; SSE2-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm0[1] 946; SSE2-NEXT: addss %xmm4, %xmm5 947; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] 948; SSE2-NEXT: addss %xmm5, %xmm0 949; SSE2-NEXT: addss %xmm1, %xmm0 950; SSE2-NEXT: movaps %xmm1, %xmm4 951; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm1[1,1] 952; SSE2-NEXT: addss %xmm4, %xmm0 953; SSE2-NEXT: movaps %xmm1, %xmm4 954; SSE2-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm1[1] 955; SSE2-NEXT: addss %xmm4, %xmm0 956; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3] 957; SSE2-NEXT: addss %xmm1, %xmm0 958; SSE2-NEXT: addss %xmm2, %xmm0 959; SSE2-NEXT: movaps %xmm2, %xmm1 960; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm2[1,1] 961; SSE2-NEXT: addss %xmm1, %xmm0 962; SSE2-NEXT: movaps %xmm2, %xmm1 963; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] 964; SSE2-NEXT: addss 
%xmm1, %xmm0 965; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3,3,3] 966; SSE2-NEXT: addss %xmm2, %xmm0 967; SSE2-NEXT: addss %xmm3, %xmm0 968; SSE2-NEXT: movaps %xmm3, %xmm1 969; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm3[1,1] 970; SSE2-NEXT: addss %xmm1, %xmm0 971; SSE2-NEXT: movaps %xmm3, %xmm1 972; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1] 973; SSE2-NEXT: addss %xmm1, %xmm0 974; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3,3,3] 975; SSE2-NEXT: addss %xmm3, %xmm0 976; SSE2-NEXT: retq 977; 978; SSE41-LABEL: test_v16f32_undef: 979; SSE41: # %bb.0: 980; SSE41-NEXT: movshdup {{.*#+}} xmm4 = xmm0[1,1,3,3] 981; SSE41-NEXT: addss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 982; SSE41-NEXT: movaps %xmm0, %xmm5 983; SSE41-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm0[1] 984; SSE41-NEXT: addss %xmm4, %xmm5 985; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] 986; SSE41-NEXT: addss %xmm5, %xmm0 987; SSE41-NEXT: addss %xmm1, %xmm0 988; SSE41-NEXT: movshdup {{.*#+}} xmm4 = xmm1[1,1,3,3] 989; SSE41-NEXT: addss %xmm4, %xmm0 990; SSE41-NEXT: movaps %xmm1, %xmm4 991; SSE41-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm1[1] 992; SSE41-NEXT: addss %xmm4, %xmm0 993; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3] 994; SSE41-NEXT: addss %xmm1, %xmm0 995; SSE41-NEXT: addss %xmm2, %xmm0 996; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm2[1,1,3,3] 997; SSE41-NEXT: addss %xmm1, %xmm0 998; SSE41-NEXT: movaps %xmm2, %xmm1 999; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] 1000; SSE41-NEXT: addss %xmm1, %xmm0 1001; SSE41-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3,3,3] 1002; SSE41-NEXT: addss %xmm2, %xmm0 1003; SSE41-NEXT: addss %xmm3, %xmm0 1004; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm3[1,1,3,3] 1005; SSE41-NEXT: addss %xmm1, %xmm0 1006; SSE41-NEXT: movaps %xmm3, %xmm1 1007; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1] 1008; SSE41-NEXT: addss %xmm1, %xmm0 1009; SSE41-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3,3,3] 1010; SSE41-NEXT: addss %xmm3, %xmm0 1011; SSE41-NEXT: retq 1012; 1013; AVX-LABEL: test_v16f32_undef: 1014; AVX: # %bb.0: 1015; AVX-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] 1016; AVX-NEXT: vaddss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 1017; AVX-NEXT: vshufpd {{.*#+}} xmm3 = xmm0[1,0] 1018; AVX-NEXT: vaddss %xmm3, %xmm2, %xmm2 1019; AVX-NEXT: vshufps {{.*#+}} xmm3 = xmm0[3,3,3,3] 1020; AVX-NEXT: vaddss %xmm3, %xmm2, %xmm2 1021; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 1022; AVX-NEXT: vaddss %xmm0, %xmm2, %xmm2 1023; AVX-NEXT: vmovshdup {{.*#+}} xmm3 = xmm0[1,1,3,3] 1024; AVX-NEXT: vaddss %xmm3, %xmm2, %xmm2 1025; AVX-NEXT: vshufpd {{.*#+}} xmm3 = xmm0[1,0] 1026; AVX-NEXT: vaddss %xmm3, %xmm2, %xmm2 1027; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] 1028; AVX-NEXT: vaddss %xmm0, %xmm2, %xmm0 1029; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 1030; AVX-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] 1031; AVX-NEXT: vaddss %xmm2, %xmm0, %xmm0 1032; AVX-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] 1033; AVX-NEXT: vaddss %xmm2, %xmm0, %xmm0 1034; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm1[3,3,3,3] 1035; AVX-NEXT: vaddss %xmm2, %xmm0, %xmm0 1036; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1 1037; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 1038; AVX-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] 1039; AVX-NEXT: vaddss %xmm2, %xmm0, %xmm0 1040; AVX-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] 1041; AVX-NEXT: vaddss %xmm2, %xmm0, %xmm0 1042; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3] 1043; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 1044; AVX-NEXT: vzeroupper 1045; AVX-NEXT: retq 1046; 1047; AVX512-LABEL: test_v16f32_undef: 1048; 
AVX512: # %bb.0: 1049; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] 1050; AVX512-NEXT: vaddss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 1051; AVX512-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1,0] 1052; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1 1053; AVX512-NEXT: vshufps {{.*#+}} xmm2 = xmm0[3,3,3,3] 1054; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1 1055; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm2 1056; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1 1057; AVX512-NEXT: vmovshdup {{.*#+}} xmm3 = xmm2[1,1,3,3] 1058; AVX512-NEXT: vaddss %xmm3, %xmm1, %xmm1 1059; AVX512-NEXT: vshufpd {{.*#+}} xmm3 = xmm2[1,0] 1060; AVX512-NEXT: vaddss %xmm3, %xmm1, %xmm1 1061; AVX512-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,3,3,3] 1062; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1 1063; AVX512-NEXT: vextractf32x4 $2, %zmm0, %xmm2 1064; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1 1065; AVX512-NEXT: vmovshdup {{.*#+}} xmm3 = xmm2[1,1,3,3] 1066; AVX512-NEXT: vaddss %xmm3, %xmm1, %xmm1 1067; AVX512-NEXT: vshufpd {{.*#+}} xmm3 = xmm2[1,0] 1068; AVX512-NEXT: vaddss %xmm3, %xmm1, %xmm1 1069; AVX512-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,3,3,3] 1070; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1 1071; AVX512-NEXT: vextractf32x4 $3, %zmm0, %xmm0 1072; AVX512-NEXT: vaddss %xmm0, %xmm1, %xmm1 1073; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] 1074; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1 1075; AVX512-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1,0] 1076; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1 1077; AVX512-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] 1078; AVX512-NEXT: vaddss %xmm0, %xmm1, %xmm0 1079; AVX512-NEXT: vzeroupper 1080; AVX512-NEXT: retq 1081 %1 = call float @llvm.vector.reduce.fadd.f32.v16f32(float undef, <16 x float> %a0) 1082 ret float %1 1083} 1084 1085; 1086; vXf64 (accum) 1087; 1088 1089define double @test_v2f64(double %a0, <2 x double> %a1) { 1090; SSE-LABEL: test_v2f64: 1091; SSE: # %bb.0: 1092; SSE-NEXT: addsd %xmm1, %xmm0 1093; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1,1] 1094; SSE-NEXT: addsd %xmm1, %xmm0 1095; SSE-NEXT: retq 1096; 1097; AVX-LABEL: test_v2f64: 1098; AVX: # %bb.0: 1099; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0 1100; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0] 1101; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0 1102; AVX-NEXT: retq 1103; 1104; AVX512-LABEL: test_v2f64: 1105; AVX512: # %bb.0: 1106; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 1107; AVX512-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0] 1108; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 1109; AVX512-NEXT: retq 1110 %1 = call double @llvm.vector.reduce.fadd.f64.v2f64(double %a0, <2 x double> %a1) 1111 ret double %1 1112} 1113 1114define double @test_v4f64(double %a0, <4 x double> %a1) { 1115; SSE-LABEL: test_v4f64: 1116; SSE: # %bb.0: 1117; SSE-NEXT: addsd %xmm1, %xmm0 1118; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1,1] 1119; SSE-NEXT: addsd %xmm1, %xmm0 1120; SSE-NEXT: addsd %xmm2, %xmm0 1121; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1,1] 1122; SSE-NEXT: addsd %xmm2, %xmm0 1123; SSE-NEXT: retq 1124; 1125; AVX-LABEL: test_v4f64: 1126; AVX: # %bb.0: 1127; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0 1128; AVX-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] 1129; AVX-NEXT: vaddsd %xmm2, %xmm0, %xmm0 1130; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1 1131; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0 1132; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0] 1133; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0 1134; AVX-NEXT: vzeroupper 1135; AVX-NEXT: retq 1136; 1137; AVX512-LABEL: test_v4f64: 1138; AVX512: # %bb.0: 1139; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 1140; AVX512-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] 1141; AVX512-NEXT: 
vaddsd %xmm2, %xmm0, %xmm0 1142; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm1 1143; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 1144; AVX512-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0] 1145; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 1146; AVX512-NEXT: vzeroupper 1147; AVX512-NEXT: retq 1148 %1 = call double @llvm.vector.reduce.fadd.f64.v4f64(double %a0, <4 x double> %a1) 1149 ret double %1 1150} 1151 1152define double @test_v8f64(double %a0, <8 x double> %a1) { 1153; SSE-LABEL: test_v8f64: 1154; SSE: # %bb.0: 1155; SSE-NEXT: addsd %xmm1, %xmm0 1156; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1,1] 1157; SSE-NEXT: addsd %xmm1, %xmm0 1158; SSE-NEXT: addsd %xmm2, %xmm0 1159; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1,1] 1160; SSE-NEXT: addsd %xmm2, %xmm0 1161; SSE-NEXT: addsd %xmm3, %xmm0 1162; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1,1] 1163; SSE-NEXT: addsd %xmm3, %xmm0 1164; SSE-NEXT: addsd %xmm4, %xmm0 1165; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1,1] 1166; SSE-NEXT: addsd %xmm4, %xmm0 1167; SSE-NEXT: retq 1168; 1169; AVX-LABEL: test_v8f64: 1170; AVX: # %bb.0: 1171; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0 1172; AVX-NEXT: vshufpd {{.*#+}} xmm3 = xmm1[1,0] 1173; AVX-NEXT: vaddsd %xmm3, %xmm0, %xmm0 1174; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1 1175; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0 1176; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0] 1177; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0 1178; AVX-NEXT: vaddsd %xmm2, %xmm0, %xmm0 1179; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm2[1,0] 1180; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0 1181; AVX-NEXT: vextractf128 $1, %ymm2, %xmm1 1182; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0 1183; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0] 1184; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0 1185; AVX-NEXT: vzeroupper 1186; AVX-NEXT: retq 1187; 1188; AVX512-LABEL: test_v8f64: 1189; AVX512: # %bb.0: 1190; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 1191; AVX512-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] 1192; AVX512-NEXT: vaddsd %xmm2, %xmm0, %xmm0 1193; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm2 1194; AVX512-NEXT: vaddsd %xmm2, %xmm0, %xmm0 1195; AVX512-NEXT: vshufpd {{.*#+}} xmm2 = xmm2[1,0] 1196; AVX512-NEXT: vaddsd %xmm2, %xmm0, %xmm0 1197; AVX512-NEXT: vextractf32x4 $2, %zmm1, %xmm2 1198; AVX512-NEXT: vaddsd %xmm2, %xmm0, %xmm0 1199; AVX512-NEXT: vshufpd {{.*#+}} xmm2 = xmm2[1,0] 1200; AVX512-NEXT: vaddsd %xmm2, %xmm0, %xmm0 1201; AVX512-NEXT: vextractf32x4 $3, %zmm1, %xmm1 1202; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 1203; AVX512-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0] 1204; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 1205; AVX512-NEXT: vzeroupper 1206; AVX512-NEXT: retq 1207 %1 = call double @llvm.vector.reduce.fadd.f64.v8f64(double %a0, <8 x double> %a1) 1208 ret double %1 1209} 1210 1211define double @test_v16f64(double %a0, <16 x double> %a1) { 1212; SSE2-LABEL: test_v16f64: 1213; SSE2: # %bb.0: 1214; SSE2-NEXT: movapd {{[0-9]+}}(%rsp), %xmm8 1215; SSE2-NEXT: addsd %xmm1, %xmm0 1216; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1,1] 1217; SSE2-NEXT: addsd %xmm1, %xmm0 1218; SSE2-NEXT: addsd %xmm2, %xmm0 1219; SSE2-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1,1] 1220; SSE2-NEXT: addsd %xmm2, %xmm0 1221; SSE2-NEXT: addsd %xmm3, %xmm0 1222; SSE2-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1,1] 1223; SSE2-NEXT: addsd %xmm3, %xmm0 1224; SSE2-NEXT: addsd %xmm4, %xmm0 1225; SSE2-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1,1] 1226; SSE2-NEXT: addsd %xmm4, %xmm0 1227; SSE2-NEXT: addsd %xmm5, %xmm0 1228; SSE2-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1,1] 1229; SSE2-NEXT: addsd %xmm5, %xmm0 1230; SSE2-NEXT: addsd %xmm6, %xmm0 1231; SSE2-NEXT: unpckhpd 
{{.*#+}} xmm6 = xmm6[1,1] 1232; SSE2-NEXT: addsd %xmm6, %xmm0 1233; SSE2-NEXT: addsd %xmm7, %xmm0 1234; SSE2-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1,1] 1235; SSE2-NEXT: addsd %xmm7, %xmm0 1236; SSE2-NEXT: addsd %xmm8, %xmm0 1237; SSE2-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1,1] 1238; SSE2-NEXT: addsd %xmm8, %xmm0 1239; SSE2-NEXT: retq 1240; 1241; SSE41-LABEL: test_v16f64: 1242; SSE41: # %bb.0: 1243; SSE41-NEXT: addsd %xmm1, %xmm0 1244; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1,1] 1245; SSE41-NEXT: addsd %xmm1, %xmm0 1246; SSE41-NEXT: addsd %xmm2, %xmm0 1247; SSE41-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1,1] 1248; SSE41-NEXT: addsd %xmm2, %xmm0 1249; SSE41-NEXT: addsd %xmm3, %xmm0 1250; SSE41-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1,1] 1251; SSE41-NEXT: addsd %xmm3, %xmm0 1252; SSE41-NEXT: addsd %xmm4, %xmm0 1253; SSE41-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1,1] 1254; SSE41-NEXT: addsd %xmm4, %xmm0 1255; SSE41-NEXT: addsd %xmm5, %xmm0 1256; SSE41-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1,1] 1257; SSE41-NEXT: addsd %xmm5, %xmm0 1258; SSE41-NEXT: addsd %xmm6, %xmm0 1259; SSE41-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1,1] 1260; SSE41-NEXT: addsd %xmm6, %xmm0 1261; SSE41-NEXT: addsd %xmm7, %xmm0 1262; SSE41-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1,1] 1263; SSE41-NEXT: addsd %xmm7, %xmm0 1264; SSE41-NEXT: addsd {{[0-9]+}}(%rsp), %xmm0 1265; SSE41-NEXT: addsd {{[0-9]+}}(%rsp), %xmm0 1266; SSE41-NEXT: retq 1267; 1268; AVX-LABEL: test_v16f64: 1269; AVX: # %bb.0: 1270; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0 1271; AVX-NEXT: vshufpd {{.*#+}} xmm5 = xmm1[1,0] 1272; AVX-NEXT: vaddsd %xmm5, %xmm0, %xmm0 1273; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1 1274; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0 1275; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0] 1276; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0 1277; AVX-NEXT: vaddsd %xmm2, %xmm0, %xmm0 1278; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm2[1,0] 1279; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0 1280; AVX-NEXT: vextractf128 $1, %ymm2, %xmm1 1281; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0 1282; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0] 1283; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0 1284; AVX-NEXT: vaddsd %xmm3, %xmm0, %xmm0 1285; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm3[1,0] 1286; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0 1287; AVX-NEXT: vextractf128 $1, %ymm3, %xmm1 1288; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0 1289; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0] 1290; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0 1291; AVX-NEXT: vaddsd %xmm4, %xmm0, %xmm0 1292; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm4[1,0] 1293; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0 1294; AVX-NEXT: vextractf128 $1, %ymm4, %xmm1 1295; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0 1296; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0] 1297; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0 1298; AVX-NEXT: vzeroupper 1299; AVX-NEXT: retq 1300; 1301; AVX512-LABEL: test_v16f64: 1302; AVX512: # %bb.0: 1303; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 1304; AVX512-NEXT: vshufpd {{.*#+}} xmm3 = xmm1[1,0] 1305; AVX512-NEXT: vaddsd %xmm3, %xmm0, %xmm0 1306; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm3 1307; AVX512-NEXT: vaddsd %xmm3, %xmm0, %xmm0 1308; AVX512-NEXT: vshufpd {{.*#+}} xmm3 = xmm3[1,0] 1309; AVX512-NEXT: vaddsd %xmm3, %xmm0, %xmm0 1310; AVX512-NEXT: vextractf32x4 $2, %zmm1, %xmm3 1311; AVX512-NEXT: vaddsd %xmm3, %xmm0, %xmm0 1312; AVX512-NEXT: vshufpd {{.*#+}} xmm3 = xmm3[1,0] 1313; AVX512-NEXT: vaddsd %xmm3, %xmm0, %xmm0 1314; AVX512-NEXT: vextractf32x4 $3, %zmm1, %xmm1 1315; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 1316; AVX512-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0] 1317; AVX512-NEXT: vaddsd %xmm1, 
%xmm0, %xmm0 1318; AVX512-NEXT: vaddsd %xmm2, %xmm0, %xmm0 1319; AVX512-NEXT: vshufpd {{.*#+}} xmm1 = xmm2[1,0] 1320; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 1321; AVX512-NEXT: vextractf128 $1, %ymm2, %xmm1 1322; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 1323; AVX512-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0] 1324; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 1325; AVX512-NEXT: vextractf32x4 $2, %zmm2, %xmm1 1326; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 1327; AVX512-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0] 1328; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 1329; AVX512-NEXT: vextractf32x4 $3, %zmm2, %xmm1 1330; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 1331; AVX512-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0] 1332; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 1333; AVX512-NEXT: vzeroupper 1334; AVX512-NEXT: retq 1335 %1 = call double @llvm.vector.reduce.fadd.f64.v16f64(double %a0, <16 x double> %a1) 1336 ret double %1 1337} 1338 1339; 1340; vXf64 (zero) 1341; 1342 1343define double @test_v2f64_zero(<2 x double> %a0) { 1344; SSE-LABEL: test_v2f64_zero: 1345; SSE: # %bb.0: 1346; SSE-NEXT: movapd %xmm0, %xmm1 1347; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] 1348; SSE-NEXT: addsd %xmm1, %xmm0 1349; SSE-NEXT: retq 1350; 1351; AVX1-SLOW-LABEL: test_v2f64_zero: 1352; AVX1-SLOW: # %bb.0: 1353; AVX1-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] 1354; AVX1-SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0 1355; AVX1-SLOW-NEXT: retq 1356; 1357; AVX1-FAST-LABEL: test_v2f64_zero: 1358; AVX1-FAST: # %bb.0: 1359; AVX1-FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm0 1360; AVX1-FAST-NEXT: retq 1361; 1362; AVX2-LABEL: test_v2f64_zero: 1363; AVX2: # %bb.0: 1364; AVX2-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] 1365; AVX2-NEXT: vaddsd %xmm1, %xmm0, %xmm0 1366; AVX2-NEXT: retq 1367; 1368; AVX512-LABEL: test_v2f64_zero: 1369; AVX512: # %bb.0: 1370; AVX512-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] 1371; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 1372; AVX512-NEXT: retq 1373 %1 = call double @llvm.vector.reduce.fadd.f64.v2f64(double -0.0, <2 x double> %a0) 1374 ret double %1 1375} 1376 1377define double @test_v4f64_zero(<4 x double> %a0) { 1378; SSE-LABEL: test_v4f64_zero: 1379; SSE: # %bb.0: 1380; SSE-NEXT: movapd %xmm0, %xmm2 1381; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] 1382; SSE-NEXT: addsd %xmm2, %xmm0 1383; SSE-NEXT: addsd %xmm1, %xmm0 1384; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1,1] 1385; SSE-NEXT: addsd %xmm1, %xmm0 1386; SSE-NEXT: retq 1387; 1388; AVX1-SLOW-LABEL: test_v4f64_zero: 1389; AVX1-SLOW: # %bb.0: 1390; AVX1-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] 1391; AVX1-SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm1 1392; AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm0 1393; AVX1-SLOW-NEXT: vaddsd %xmm0, %xmm1, %xmm1 1394; AVX1-SLOW-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] 1395; AVX1-SLOW-NEXT: vaddsd %xmm0, %xmm1, %xmm0 1396; AVX1-SLOW-NEXT: vzeroupper 1397; AVX1-SLOW-NEXT: retq 1398; 1399; AVX1-FAST-LABEL: test_v4f64_zero: 1400; AVX1-FAST: # %bb.0: 1401; AVX1-FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm1 1402; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm0 1403; AVX1-FAST-NEXT: vaddsd %xmm0, %xmm1, %xmm1 1404; AVX1-FAST-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] 1405; AVX1-FAST-NEXT: vaddsd %xmm0, %xmm1, %xmm0 1406; AVX1-FAST-NEXT: vzeroupper 1407; AVX1-FAST-NEXT: retq 1408; 1409; AVX2-LABEL: test_v4f64_zero: 1410; AVX2: # %bb.0: 1411; AVX2-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] 1412; AVX2-NEXT: vaddsd %xmm1, %xmm0, %xmm1 1413; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0 1414; AVX2-NEXT: vaddsd %xmm0, %xmm1, %xmm1 1415; AVX2-NEXT: vshufpd {{.*#+}} xmm0 = 
xmm0[1,0] 1416; AVX2-NEXT: vaddsd %xmm0, %xmm1, %xmm0 1417; AVX2-NEXT: vzeroupper 1418; AVX2-NEXT: retq 1419; 1420; AVX512-LABEL: test_v4f64_zero: 1421; AVX512: # %bb.0: 1422; AVX512-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] 1423; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm1 1424; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0 1425; AVX512-NEXT: vaddsd %xmm0, %xmm1, %xmm1 1426; AVX512-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] 1427; AVX512-NEXT: vaddsd %xmm0, %xmm1, %xmm0 1428; AVX512-NEXT: vzeroupper 1429; AVX512-NEXT: retq 1430 %1 = call double @llvm.vector.reduce.fadd.f64.v4f64(double -0.0, <4 x double> %a0) 1431 ret double %1 1432} 1433 1434define double @test_v8f64_zero(<8 x double> %a0) { 1435; SSE-LABEL: test_v8f64_zero: 1436; SSE: # %bb.0: 1437; SSE-NEXT: movapd %xmm0, %xmm4 1438; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm0[1] 1439; SSE-NEXT: addsd %xmm4, %xmm0 1440; SSE-NEXT: addsd %xmm1, %xmm0 1441; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1,1] 1442; SSE-NEXT: addsd %xmm1, %xmm0 1443; SSE-NEXT: addsd %xmm2, %xmm0 1444; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1,1] 1445; SSE-NEXT: addsd %xmm2, %xmm0 1446; SSE-NEXT: addsd %xmm3, %xmm0 1447; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1,1] 1448; SSE-NEXT: addsd %xmm3, %xmm0 1449; SSE-NEXT: retq 1450; 1451; AVX1-SLOW-LABEL: test_v8f64_zero: 1452; AVX1-SLOW: # %bb.0: 1453; AVX1-SLOW-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1,0] 1454; AVX1-SLOW-NEXT: vaddsd %xmm2, %xmm0, %xmm2 1455; AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm0 1456; AVX1-SLOW-NEXT: vaddsd %xmm0, %xmm2, %xmm2 1457; AVX1-SLOW-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] 1458; AVX1-SLOW-NEXT: vaddsd %xmm0, %xmm2, %xmm0 1459; AVX1-SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0 1460; AVX1-SLOW-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] 1461; AVX1-SLOW-NEXT: vaddsd %xmm2, %xmm0, %xmm0 1462; AVX1-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm1 1463; AVX1-SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0 1464; AVX1-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0] 1465; AVX1-SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0 1466; AVX1-SLOW-NEXT: vzeroupper 1467; AVX1-SLOW-NEXT: retq 1468; 1469; AVX1-FAST-LABEL: test_v8f64_zero: 1470; AVX1-FAST: # %bb.0: 1471; AVX1-FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm2 1472; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm0 1473; AVX1-FAST-NEXT: vaddsd %xmm0, %xmm2, %xmm2 1474; AVX1-FAST-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] 1475; AVX1-FAST-NEXT: vaddsd %xmm0, %xmm2, %xmm0 1476; AVX1-FAST-NEXT: vaddsd %xmm1, %xmm0, %xmm0 1477; AVX1-FAST-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] 1478; AVX1-FAST-NEXT: vaddsd %xmm2, %xmm0, %xmm0 1479; AVX1-FAST-NEXT: vextractf128 $1, %ymm1, %xmm1 1480; AVX1-FAST-NEXT: vaddsd %xmm1, %xmm0, %xmm0 1481; AVX1-FAST-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0] 1482; AVX1-FAST-NEXT: vaddsd %xmm1, %xmm0, %xmm0 1483; AVX1-FAST-NEXT: vzeroupper 1484; AVX1-FAST-NEXT: retq 1485; 1486; AVX2-LABEL: test_v8f64_zero: 1487; AVX2: # %bb.0: 1488; AVX2-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1,0] 1489; AVX2-NEXT: vaddsd %xmm2, %xmm0, %xmm2 1490; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0 1491; AVX2-NEXT: vaddsd %xmm0, %xmm2, %xmm2 1492; AVX2-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] 1493; AVX2-NEXT: vaddsd %xmm0, %xmm2, %xmm0 1494; AVX2-NEXT: vaddsd %xmm1, %xmm0, %xmm0 1495; AVX2-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] 1496; AVX2-NEXT: vaddsd %xmm2, %xmm0, %xmm0 1497; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm1 1498; AVX2-NEXT: vaddsd %xmm1, %xmm0, %xmm0 1499; AVX2-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0] 1500; AVX2-NEXT: vaddsd %xmm1, %xmm0, %xmm0 1501; AVX2-NEXT: vzeroupper 1502; AVX2-NEXT: retq 1503; 1504; AVX512-LABEL: 
; AVX512:       # %bb.0:
; AVX512-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT:    vaddsd %xmm1, %xmm0, %xmm1
; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX512-NEXT:    vaddsd %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vshufpd {{.*#+}} xmm2 = xmm2[1,0]
; AVX512-NEXT:    vaddsd %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vextractf32x4 $2, %zmm0, %xmm2
; AVX512-NEXT:    vaddsd %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vshufpd {{.*#+}} xmm2 = xmm2[1,0]
; AVX512-NEXT:    vaddsd %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vextractf32x4 $3, %zmm0, %xmm0
; AVX512-NEXT:    vaddsd %xmm0, %xmm1, %xmm1
; AVX512-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX512-NEXT:    vaddsd %xmm0, %xmm1, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call double @llvm.vector.reduce.fadd.f64.v8f64(double -0.0, <8 x double> %a0)
  ret double %1
}

define double @test_v16f64_zero(<16 x double> %a0) {
; SSE-LABEL: test_v16f64_zero:
; SSE:       # %bb.0:
; SSE-NEXT:    movapd %xmm0, %xmm8
; SSE-NEXT:    unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm0[1]
; SSE-NEXT:    addsd %xmm8, %xmm0
; SSE-NEXT:    addsd %xmm1, %xmm0
; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1,1]
; SSE-NEXT:    addsd %xmm1, %xmm0
; SSE-NEXT:    addsd %xmm2, %xmm0
; SSE-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1,1]
; SSE-NEXT:    addsd %xmm2, %xmm0
; SSE-NEXT:    addsd %xmm3, %xmm0
; SSE-NEXT:    unpckhpd {{.*#+}} xmm3 = xmm3[1,1]
; SSE-NEXT:    addsd %xmm3, %xmm0
; SSE-NEXT:    addsd %xmm4, %xmm0
; SSE-NEXT:    unpckhpd {{.*#+}} xmm4 = xmm4[1,1]
; SSE-NEXT:    addsd %xmm4, %xmm0
; SSE-NEXT:    addsd %xmm5, %xmm0
; SSE-NEXT:    unpckhpd {{.*#+}} xmm5 = xmm5[1,1]
; SSE-NEXT:    addsd %xmm5, %xmm0
; SSE-NEXT:    addsd %xmm6, %xmm0
; SSE-NEXT:    unpckhpd {{.*#+}} xmm6 = xmm6[1,1]
; SSE-NEXT:    addsd %xmm6, %xmm0
; SSE-NEXT:    addsd %xmm7, %xmm0
; SSE-NEXT:    unpckhpd {{.*#+}} xmm7 = xmm7[1,1]
; SSE-NEXT:    addsd %xmm7, %xmm0
; SSE-NEXT:    retq
;
; AVX1-SLOW-LABEL: test_v16f64_zero:
; AVX1-SLOW:       # %bb.0:
; AVX1-SLOW-NEXT:    vshufpd {{.*#+}} xmm4 = xmm0[1,0]
; AVX1-SLOW-NEXT:    vaddsd %xmm4, %xmm0, %xmm4
; AVX1-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-SLOW-NEXT:    vaddsd %xmm0, %xmm4, %xmm4
; AVX1-SLOW-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX1-SLOW-NEXT:    vaddsd %xmm0, %xmm4, %xmm0
; AVX1-SLOW-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vshufpd {{.*#+}} xmm4 = xmm1[1,0]
; AVX1-SLOW-NEXT:    vaddsd %xmm4, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vextractf128 $1, %ymm1, %xmm1
; AVX1-SLOW-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vshufpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX1-SLOW-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vaddsd %xmm2, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vshufpd {{.*#+}} xmm1 = xmm2[1,0]
; AVX1-SLOW-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vextractf128 $1, %ymm2, %xmm1
; AVX1-SLOW-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vshufpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX1-SLOW-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vaddsd %xmm3, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vshufpd {{.*#+}} xmm1 = xmm3[1,0]
; AVX1-SLOW-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vextractf128 $1, %ymm3, %xmm1
; AVX1-SLOW-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vshufpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX1-SLOW-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vzeroupper
; AVX1-SLOW-NEXT:    retq
;
; AVX1-FAST-LABEL: test_v16f64_zero:
; AVX1-FAST:       # %bb.0:
; AVX1-FAST-NEXT:    vhaddpd %xmm0, %xmm0, %xmm4
; AVX1-FAST-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-FAST-NEXT:    vaddsd %xmm0, %xmm4, %xmm4
; AVX1-FAST-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX1-FAST-NEXT:    vaddsd %xmm0, %xmm4, %xmm0
; AVX1-FAST-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vshufpd {{.*#+}} xmm4 = xmm1[1,0]
; AVX1-FAST-NEXT:    vaddsd %xmm4, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vextractf128 $1, %ymm1, %xmm1
; AVX1-FAST-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vshufpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX1-FAST-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vaddsd %xmm2, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vshufpd {{.*#+}} xmm1 = xmm2[1,0]
; AVX1-FAST-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vextractf128 $1, %ymm2, %xmm1
; AVX1-FAST-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vshufpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX1-FAST-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vaddsd %xmm3, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vshufpd {{.*#+}} xmm1 = xmm3[1,0]
; AVX1-FAST-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vextractf128 $1, %ymm3, %xmm1
; AVX1-FAST-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vshufpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX1-FAST-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vzeroupper
; AVX1-FAST-NEXT:    retq
;
; AVX2-LABEL: test_v16f64_zero:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vshufpd {{.*#+}} xmm4 = xmm0[1,0]
; AVX2-NEXT:    vaddsd %xmm4, %xmm0, %xmm4
; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX2-NEXT:    vaddsd %xmm0, %xmm4, %xmm4
; AVX2-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX2-NEXT:    vaddsd %xmm0, %xmm4, %xmm0
; AVX2-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vshufpd {{.*#+}} xmm4 = xmm1[1,0]
; AVX2-NEXT:    vaddsd %xmm4, %xmm0, %xmm0
; AVX2-NEXT:    vextractf128 $1, %ymm1, %xmm1
; AVX2-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vshufpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX2-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vaddsd %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vshufpd {{.*#+}} xmm1 = xmm2[1,0]
; AVX2-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vextractf128 $1, %ymm2, %xmm1
; AVX2-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vshufpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX2-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vaddsd %xmm3, %xmm0, %xmm0
; AVX2-NEXT:    vshufpd {{.*#+}} xmm1 = xmm3[1,0]
; AVX2-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vextractf128 $1, %ymm3, %xmm1
; AVX2-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vshufpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX2-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v16f64_zero:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vshufpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX512-NEXT:    vaddsd %xmm2, %xmm0, %xmm2
; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm3
; AVX512-NEXT:    vaddsd %xmm3, %xmm2, %xmm2
; AVX512-NEXT:    vshufpd {{.*#+}} xmm3 = xmm3[1,0]
; AVX512-NEXT:    vaddsd %xmm3, %xmm2, %xmm2
; AVX512-NEXT:    vextractf32x4 $2, %zmm0, %xmm3
; AVX512-NEXT:    vaddsd %xmm3, %xmm2, %xmm2
; AVX512-NEXT:    vshufpd {{.*#+}} xmm3 = xmm3[1,0]
; AVX512-NEXT:    vaddsd %xmm3, %xmm2, %xmm2
; AVX512-NEXT:    vextractf32x4 $3, %zmm0, %xmm0
; AVX512-NEXT:    vaddsd %xmm0, %xmm2, %xmm2
; AVX512-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX512-NEXT:    vaddsd %xmm0, %xmm2, %xmm0
; AVX512-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vshufpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX512-NEXT:    vaddsd %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX512-NEXT:    vaddsd %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vshufpd {{.*#+}} xmm2 = xmm2[1,0]
; AVX512-NEXT:    vaddsd %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vextractf32x4 $2, %zmm1, %xmm2
; AVX512-NEXT:    vaddsd %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vshufpd {{.*#+}} xmm2 = xmm2[1,0]
; AVX512-NEXT:    vaddsd %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vextractf32x4 $3, %zmm1, %xmm1
; AVX512-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vshufpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX512-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call double @llvm.vector.reduce.fadd.f64.v16f64(double -0.0, <16 x double> %a0)
  ret double %1
}

;
; vXf64 (undef)
;

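; With an undef accumulator the first partial sum is constant folded, so the
; checks below match an add against a constant-pool operand ({{\.?LCPI..}})
; instead of a register.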
define double @test_v2f64_undef(<2 x double> %a0) {
; SSE-LABEL: test_v2f64_undef:
; SSE:       # %bb.0:
; SSE-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1,1]
; SSE-NEXT:    addsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v2f64_undef:
; AVX:       # %bb.0:
; AVX-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX-NEXT:    vaddsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v2f64_undef:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX512-NEXT:    vaddsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512-NEXT:    retq
  %1 = call double @llvm.vector.reduce.fadd.f64.v2f64(double undef, <2 x double> %a0)
  ret double %1
}

define double @test_v4f64_undef(<4 x double> %a0) {
; SSE-LABEL: test_v4f64_undef:
; SSE:       # %bb.0:
; SSE-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1,1]
; SSE-NEXT:    addsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT:    addsd %xmm1, %xmm0
; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1,1]
; SSE-NEXT:    addsd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v4f64_undef:
; AVX:       # %bb.0:
; AVX-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT:    vaddsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX-NEXT:    vaddsd %xmm0, %xmm1, %xmm1
; AVX-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX-NEXT:    vaddsd %xmm0, %xmm1, %xmm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v4f64_undef:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT:    vaddsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX512-NEXT:    vaddsd %xmm0, %xmm1, %xmm1
; AVX512-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX512-NEXT:    vaddsd %xmm0, %xmm1, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call double @llvm.vector.reduce.fadd.f64.v4f64(double undef, <4 x double> %a0)
  ret double %1
}

define double @test_v8f64_undef(<8 x double> %a0) {
; SSE-LABEL: test_v8f64_undef:
; SSE:       # %bb.0:
; SSE-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1,1]
; SSE-NEXT:    addsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT:    addsd %xmm1, %xmm0
; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1,1]
; SSE-NEXT:    addsd %xmm1, %xmm0
; SSE-NEXT:    addsd %xmm2, %xmm0
; SSE-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1,1]
; SSE-NEXT:    addsd %xmm2, %xmm0
; SSE-NEXT:    addsd %xmm3, %xmm0
; SSE-NEXT:    unpckhpd {{.*#+}} xmm3 = xmm3[1,1]
; SSE-NEXT:    addsd %xmm3, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v8f64_undef:
; AVX:       # %bb.0:
; AVX-NEXT:    vshufpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX-NEXT:    vaddsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX-NEXT:    vaddsd %xmm0, %xmm2, %xmm2
; AVX-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX-NEXT:    vaddsd %xmm0, %xmm2, %xmm0
; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vshufpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX-NEXT:    vaddsd %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm1
; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vshufpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v8f64_undef:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT:    vaddsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX512-NEXT:    vaddsd %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vshufpd {{.*#+}} xmm2 = xmm2[1,0]
; AVX512-NEXT:    vaddsd %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vextractf32x4 $2, %zmm0, %xmm2
; AVX512-NEXT:    vaddsd %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vshufpd {{.*#+}} xmm2 = xmm2[1,0]
; AVX512-NEXT:    vaddsd %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vextractf32x4 $3, %zmm0, %xmm0
; AVX512-NEXT:    vaddsd %xmm0, %xmm1, %xmm1
; AVX512-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX512-NEXT:    vaddsd %xmm0, %xmm1, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call double @llvm.vector.reduce.fadd.f64.v8f64(double undef, <8 x double> %a0)
  ret double %1
}

define double @test_v16f64_undef(<16 x double> %a0) {
; SSE-LABEL: test_v16f64_undef:
; SSE:       # %bb.0:
; SSE-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1,1]
; SSE-NEXT:    addsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT:    addsd %xmm1, %xmm0
; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1,1]
; SSE-NEXT:    addsd %xmm1, %xmm0
; SSE-NEXT:    addsd %xmm2, %xmm0
; SSE-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1,1]
; SSE-NEXT:    addsd %xmm2, %xmm0
; SSE-NEXT:    addsd %xmm3, %xmm0
; SSE-NEXT:    unpckhpd {{.*#+}} xmm3 = xmm3[1,1]
; SSE-NEXT:    addsd %xmm3, %xmm0
; SSE-NEXT:    addsd %xmm4, %xmm0
; SSE-NEXT:    unpckhpd {{.*#+}} xmm4 = xmm4[1,1]
; SSE-NEXT:    addsd %xmm4, %xmm0
; SSE-NEXT:    addsd %xmm5, %xmm0
; SSE-NEXT:    unpckhpd {{.*#+}} xmm5 = xmm5[1,1]
; SSE-NEXT:    addsd %xmm5, %xmm0
; SSE-NEXT:    addsd %xmm6, %xmm0
; SSE-NEXT:    unpckhpd {{.*#+}} xmm6 = xmm6[1,1]
; SSE-NEXT:    addsd %xmm6, %xmm0
; SSE-NEXT:    addsd %xmm7, %xmm0
; SSE-NEXT:    unpckhpd {{.*#+}} xmm7 = xmm7[1,1]
; SSE-NEXT:    addsd %xmm7, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v16f64_undef:
; AVX:       # %bb.0:
; AVX-NEXT:    vshufpd {{.*#+}} xmm4 = xmm0[1,0]
; AVX-NEXT:    vaddsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4
; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX-NEXT:    vaddsd %xmm0, %xmm4, %xmm4
; AVX-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX-NEXT:    vaddsd %xmm0, %xmm4, %xmm0
; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vshufpd {{.*#+}} xmm4 = xmm1[1,0]
; AVX-NEXT:    vaddsd %xmm4, %xmm0, %xmm0
; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm1
; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vshufpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vaddsd %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vshufpd {{.*#+}} xmm1 = xmm2[1,0]
; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vextractf128 $1, %ymm2, %xmm1
; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vshufpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vaddsd %xmm3, %xmm0, %xmm0
; AVX-NEXT:    vshufpd {{.*#+}} xmm1 = xmm3[1,0]
; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vextractf128 $1, %ymm3, %xmm1
; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vshufpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v16f64_undef:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vshufpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX512-NEXT:    vaddsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm3
; AVX512-NEXT:    vaddsd %xmm3, %xmm2, %xmm2
; AVX512-NEXT:    vshufpd {{.*#+}} xmm3 = xmm3[1,0]
; AVX512-NEXT:    vaddsd %xmm3, %xmm2, %xmm2
; AVX512-NEXT:    vextractf32x4 $2, %zmm0, %xmm3
; AVX512-NEXT:    vaddsd %xmm3, %xmm2, %xmm2
; AVX512-NEXT:    vshufpd {{.*#+}} xmm3 = xmm3[1,0]
; AVX512-NEXT:    vaddsd %xmm3, %xmm2, %xmm2
; AVX512-NEXT:    vextractf32x4 $3, %zmm0, %xmm0
; AVX512-NEXT:    vaddsd %xmm0, %xmm2, %xmm2
; AVX512-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX512-NEXT:    vaddsd %xmm0, %xmm2, %xmm0
; AVX512-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vshufpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX512-NEXT:    vaddsd %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX512-NEXT:    vaddsd %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vshufpd {{.*#+}} xmm2 = xmm2[1,0]
; AVX512-NEXT:    vaddsd %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vextractf32x4 $2, %zmm1, %xmm2
; AVX512-NEXT:    vaddsd %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vshufpd {{.*#+}} xmm2 = xmm2[1,0]
; AVX512-NEXT:    vaddsd %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vextractf32x4 $3, %zmm1, %xmm1
; AVX512-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vshufpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX512-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call double @llvm.vector.reduce.fadd.f64.v16f64(double undef, <16 x double> %a0)
  ret double %1
}

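; PR64627: the i5-zero mask selects the <5 x float> splat of 1.0, so the whole
; v5f32 reduction should constant fold to 5.0.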
define float @PR64627() {
; SSE-LABEL: PR64627:
; SSE:       # %bb.0:
; SSE-NEXT:    movss {{.*#+}} xmm0 = [5.0E+0,0.0E+0,0.0E+0,0.0E+0]
; SSE-NEXT:    retq
;
; AVX-LABEL: PR64627:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovss {{.*#+}} xmm0 = [5.0E+0,0.0E+0,0.0E+0,0.0E+0]
; AVX-NEXT:    retq
;
; AVX512-LABEL: PR64627:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovss {{.*#+}} xmm0 = [5.0E+0,0.0E+0,0.0E+0,0.0E+0]
; AVX512-NEXT:    retq
  %1 = bitcast i5 0 to <5 x i1>
  %2 = select <5 x i1> %1, <5 x float> zeroinitializer, <5 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>
  %3 = call float @llvm.vector.reduce.fadd.v5f32(float -0.0, <5 x float> %2)
  ret float %3
}
declare float @llvm.vector.reduce.fadd.v5f32(float, <5 x float>)

declare float @llvm.vector.reduce.fadd.f32.v2f32(float, <2 x float>)
declare float @llvm.vector.reduce.fadd.f32.v4f32(float, <4 x float>)
declare float @llvm.vector.reduce.fadd.f32.v8f32(float, <8 x float>)
declare float @llvm.vector.reduce.fadd.f32.v16f32(float, <16 x float>)

declare double @llvm.vector.reduce.fadd.f64.v2f64(double, <2 x double>)
declare double @llvm.vector.reduce.fadd.f64.v4f64(double, <4 x double>)
declare double @llvm.vector.reduce.fadd.f64.v8f64(double, <8 x double>)
declare double @llvm.vector.reduce.fadd.f64.v16f64(double, <16 x double>)