; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64 -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2
; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64 -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41
; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64 -mattr=+avx | FileCheck %s --check-prefix=AVX1-SLOW
; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64 -mattr=+avx,+fast-hops | FileCheck %s --check-prefix=AVX1-FAST
; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64 -mattr=+avx2 | FileCheck %s --check-prefix=AVX2
; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64 -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=AVX512
; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64 -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefix=AVX512

;
; vXf32 (accum)
;

define float @test_v2f32(float %a0, <2 x float> %a1) {
; SSE2-LABEL: test_v2f32:
; SSE2: # %bb.0:
; SSE2-NEXT: movaps %xmm1, %xmm2
; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm1[1,1]
; SSE2-NEXT: addss %xmm1, %xmm2
; SSE2-NEXT: addss %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v2f32:
; SSE41: # %bb.0:
; SSE41-NEXT: movshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; SSE41-NEXT: addss %xmm1, %xmm2
; SSE41-NEXT: addss %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX1-SLOW-LABEL: test_v2f32:
; AVX1-SLOW: # %bb.0:
; AVX1-SLOW-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX1-SLOW-NEXT: vaddss %xmm2, %xmm1, %xmm1
; AVX1-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: retq
;
; AVX1-FAST-LABEL: test_v2f32:
; AVX1-FAST: # %bb.0:
; AVX1-FAST-NEXT: vhaddps %xmm1, %xmm1, %xmm1
; AVX1-FAST-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT: retq
;
; AVX2-LABEL: test_v2f32:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX2-NEXT: vaddss %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_v2f32:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
  %1 = call fast float @llvm.vector.reduce.fadd.f32.v2f32(float %a0, <2 x float> %a1)
  ret float %1
}

define float @test_v4f32(float %a0, <4 x float> %a1) {
; SSE2-LABEL: test_v4f32:
; SSE2: # %bb.0:
; SSE2-NEXT: movaps %xmm1, %xmm2
; SSE2-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
; SSE2-NEXT: addps %xmm1, %xmm2
; SSE2-NEXT: movaps %xmm2, %xmm1
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm2[1,1]
; SSE2-NEXT: addss %xmm2, %xmm1
; SSE2-NEXT: addss %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v4f32:
; SSE41: # %bb.0:
; SSE41-NEXT: movaps %xmm1, %xmm2
; SSE41-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
; SSE41-NEXT: addps %xmm1, %xmm2
; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
; SSE41-NEXT: addss %xmm2, %xmm1
; SSE41-NEXT: addss %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX1-SLOW-LABEL: test_v4f32:
; AVX1-SLOW: # %bb.0:
; AVX1-SLOW-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX1-SLOW-NEXT: vaddps %xmm2, %xmm1, %xmm1
; AVX1-SLOW-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX1-SLOW-NEXT: vaddss %xmm2, %xmm1, %xmm1
; AVX1-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: retq
;
; AVX1-FAST-LABEL: test_v4f32:
; AVX1-FAST: # %bb.0:
; AVX1-FAST-NEXT: vhaddps %xmm1, %xmm1, %xmm1
; AVX1-FAST-NEXT: vhaddps %xmm1, %xmm1, %xmm1
; AVX1-FAST-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT: retq
;
; AVX2-LABEL: test_v4f32:
; AVX2: # %bb.0:
; AVX2-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX2-NEXT: vaddps %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX2-NEXT: vaddss %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_v4f32:
; AVX512: # %bb.0:
; AVX512-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX512-NEXT: vaddps %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
  %1 = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float %a0, <4 x float> %a1)
  ret float %1
}

define float @test_v8f32(float %a0, <8 x float> %a1) {
; SSE2-LABEL: test_v8f32:
; SSE2: # %bb.0:
; SSE2-NEXT: addps %xmm2, %xmm1
; SSE2-NEXT: movaps %xmm1, %xmm2
; SSE2-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
; SSE2-NEXT: addps %xmm1, %xmm2
; SSE2-NEXT: movaps %xmm2, %xmm1
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm2[1,1]
; SSE2-NEXT: addss %xmm2, %xmm1
; SSE2-NEXT: addss %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v8f32:
; SSE41: # %bb.0:
; SSE41-NEXT: addps %xmm2, %xmm1
; SSE41-NEXT: movaps %xmm1, %xmm2
; SSE41-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
; SSE41-NEXT: addps %xmm1, %xmm2
; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
; SSE41-NEXT: addss %xmm2, %xmm1
; SSE41-NEXT: addss %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX1-SLOW-LABEL: test_v8f32:
; AVX1-SLOW: # %bb.0:
; AVX1-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-SLOW-NEXT: vaddps %xmm2, %xmm1, %xmm1
; AVX1-SLOW-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX1-SLOW-NEXT: vaddps %xmm2, %xmm1, %xmm1
; AVX1-SLOW-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX1-SLOW-NEXT: vaddss %xmm2, %xmm1, %xmm1
; AVX1-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vzeroupper
; AVX1-SLOW-NEXT: retq
;
; AVX1-FAST-LABEL: test_v8f32:
; AVX1-FAST: # %bb.0:
; AVX1-FAST-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-FAST-NEXT: vhaddps %xmm1, %xmm2, %xmm1
; AVX1-FAST-NEXT: vhaddps %xmm1, %xmm1, %xmm1
; AVX1-FAST-NEXT: vhaddps %xmm1, %xmm1, %xmm1
; AVX1-FAST-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT: vzeroupper
; AVX1-FAST-NEXT: retq
;
; AVX2-LABEL: test_v8f32:
; AVX2: # %bb.0:
; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX2-NEXT: vaddps %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX2-NEXT: vaddps %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX2-NEXT: vaddss %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_v8f32:
; AVX512: # %bb.0:
; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX512-NEXT: vaddps %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX512-NEXT: vaddps %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = call fast float @llvm.vector.reduce.fadd.f32.v8f32(float %a0, <8 x float> %a1)
  ret float %1
}

define float @test_v16f32(float %a0, <16 x float> %a1) {
; SSE2-LABEL: test_v16f32:
; SSE2: # %bb.0:
; SSE2-NEXT: addps %xmm4, %xmm2
; SSE2-NEXT: addps %xmm3, %xmm1
; SSE2-NEXT: addps %xmm2, %xmm1
; SSE2-NEXT: movaps %xmm1, %xmm2
; SSE2-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
; SSE2-NEXT: addps %xmm1, %xmm2
; SSE2-NEXT: movaps %xmm2, %xmm1
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm2[1,1]
; SSE2-NEXT: addss %xmm2, %xmm1
; SSE2-NEXT: addss %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v16f32:
; SSE41: # %bb.0:
; SSE41-NEXT: addps %xmm4, %xmm2
; SSE41-NEXT: addps %xmm3, %xmm1
; SSE41-NEXT: addps %xmm2, %xmm1
; SSE41-NEXT: movaps %xmm1, %xmm2
; SSE41-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
; SSE41-NEXT: addps %xmm1, %xmm2
; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
; SSE41-NEXT: addss %xmm2, %xmm1
; SSE41-NEXT: addss %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX1-SLOW-LABEL: test_v16f32:
; AVX1-SLOW: # %bb.0:
; AVX1-SLOW-NEXT: vaddps %ymm2, %ymm1, %ymm1
; AVX1-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-SLOW-NEXT: vaddps %xmm2, %xmm1, %xmm1
; AVX1-SLOW-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX1-SLOW-NEXT: vaddps %xmm2, %xmm1, %xmm1
; AVX1-SLOW-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX1-SLOW-NEXT: vaddss %xmm2, %xmm1, %xmm1
; AVX1-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vzeroupper
; AVX1-SLOW-NEXT: retq
;
; AVX1-FAST-LABEL: test_v16f32:
; AVX1-FAST: # %bb.0:
; AVX1-FAST-NEXT: vaddps %ymm2, %ymm1, %ymm1
; AVX1-FAST-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-FAST-NEXT: vaddps %xmm2, %xmm1, %xmm1
; AVX1-FAST-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX1-FAST-NEXT: vaddps %xmm2, %xmm1, %xmm1
; AVX1-FAST-NEXT: vhaddps %xmm1, %xmm1, %xmm1
; AVX1-FAST-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT: vzeroupper
; AVX1-FAST-NEXT: retq
;
; AVX2-LABEL: test_v16f32:
; AVX2: # %bb.0:
; AVX2-NEXT: vaddps %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX2-NEXT: vaddps %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX2-NEXT: vaddps %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX2-NEXT: vaddss %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_v16f32:
; AVX512: # %bb.0:
; AVX512-NEXT: vextractf64x4 $1, %zmm1, %ymm2
; AVX512-NEXT: vaddps %zmm2, %zmm1, %zmm1
; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX512-NEXT: vaddps %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX512-NEXT: vaddps %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = call fast float @llvm.vector.reduce.fadd.f32.v16f32(float %a0, <16 x float> %a1)
  ret float %1
}

;
; vXf32 (zero)
;

define float @test_v2f32_zero(<2 x float> %a0) {
; SSE2-LABEL: test_v2f32_zero:
; SSE2: # %bb.0:
; SSE2-NEXT: movaps %xmm0, %xmm1
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1]
; SSE2-NEXT: addss %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v2f32_zero:
; SSE41: # %bb.0:
; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE41-NEXT: addss %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX1-SLOW-LABEL: test_v2f32_zero:
; AVX1-SLOW: # %bb.0:
; AVX1-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX1-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: retq
;
; AVX1-FAST-LABEL: test_v2f32_zero:
; AVX1-FAST: # %bb.0:
; AVX1-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT: retq
;
; AVX2-LABEL: test_v2f32_zero:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX2-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_v2f32_zero:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
  %1 = call fast float @llvm.vector.reduce.fadd.f32.v2f32(float 0.0, <2 x float> %a0)
  ret float %1
}

define float @test_v4f32_zero(<4 x float> %a0) {
; SSE2-LABEL: test_v4f32_zero:
; SSE2: # %bb.0:
; SSE2-NEXT: movaps %xmm0, %xmm1
; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE2-NEXT: addps %xmm1, %xmm0
; SSE2-NEXT: movaps %xmm0, %xmm1
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1]
; SSE2-NEXT: addss %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v4f32_zero:
; SSE41: # %bb.0:
; SSE41-NEXT: movaps %xmm0, %xmm1
; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE41-NEXT: addps %xmm1, %xmm0
; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE41-NEXT: addss %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX1-SLOW-LABEL: test_v4f32_zero:
; AVX1-SLOW: # %bb.0:
; AVX1-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX1-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX1-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: retq
;
; AVX1-FAST-LABEL: test_v4f32_zero:
; AVX1-FAST: # %bb.0:
; AVX1-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT: retq
;
; AVX2-LABEL: test_v4f32_zero:
; AVX2: # %bb.0:
; AVX2-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX2-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX2-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_v4f32_zero:
; AVX512: # %bb.0:
; AVX512-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
  %1 = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float 0.0, <4 x float> %a0)
  ret float %1
}

define float @test_v8f32_zero(<8 x float> %a0) {
; SSE2-LABEL: test_v8f32_zero:
; SSE2: # %bb.0:
; SSE2-NEXT: addps %xmm1, %xmm0
; SSE2-NEXT: movaps %xmm0, %xmm1
; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE2-NEXT: addps %xmm1, %xmm0
; SSE2-NEXT: movaps %xmm0, %xmm1
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1]
; SSE2-NEXT: addss %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v8f32_zero:
; SSE41: # %bb.0:
; SSE41-NEXT: addps %xmm1, %xmm0
; SSE41-NEXT: movaps %xmm0, %xmm1
; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE41-NEXT: addps %xmm1, %xmm0
; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE41-NEXT: addss %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX1-SLOW-LABEL: test_v8f32_zero:
; AVX1-SLOW: # %bb.0:
; AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX1-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX1-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vzeroupper
; AVX1-SLOW-NEXT: retq
;
; AVX1-FAST-LABEL: test_v8f32_zero:
; AVX1-FAST: # %bb.0:
; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-FAST-NEXT: vhaddps %xmm0, %xmm1, %xmm0
; AVX1-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT: vzeroupper
; AVX1-FAST-NEXT: retq
;
; AVX2-LABEL: test_v8f32_zero:
; AVX2: # %bb.0:
; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX2-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX2-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX2-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_v8f32_zero:
; AVX512: # %bb.0:
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = call fast float @llvm.vector.reduce.fadd.f32.v8f32(float 0.0, <8 x float> %a0)
  ret float %1
}

define float @test_v16f32_zero(<16 x float> %a0) {
; SSE2-LABEL: test_v16f32_zero:
; SSE2: # %bb.0:
; SSE2-NEXT: addps %xmm3, %xmm1
; SSE2-NEXT: addps %xmm2, %xmm0
; SSE2-NEXT: addps %xmm1, %xmm0
; SSE2-NEXT: movaps %xmm0, %xmm1
; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE2-NEXT: addps %xmm1, %xmm0
; SSE2-NEXT: movaps %xmm0, %xmm1
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1]
; SSE2-NEXT: addss %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v16f32_zero:
; SSE41: # %bb.0:
; SSE41-NEXT: addps %xmm3, %xmm1
; SSE41-NEXT: addps %xmm2, %xmm0
; SSE41-NEXT: addps %xmm1, %xmm0
; SSE41-NEXT: movaps %xmm0, %xmm1
; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE41-NEXT: addps %xmm1, %xmm0
; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE41-NEXT: addss %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX1-SLOW-LABEL: test_v16f32_zero:
; AVX1-SLOW: # %bb.0:
; AVX1-SLOW-NEXT: vaddps %ymm1, %ymm0, %ymm0
; AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX1-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX1-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vzeroupper
; AVX1-SLOW-NEXT: retq
;
; AVX1-FAST-LABEL: test_v16f32_zero:
; AVX1-FAST: # %bb.0:
; AVX1-FAST-NEXT: vaddps %ymm1, %ymm0, %ymm0
; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-FAST-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX1-FAST-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT: vzeroupper
; AVX1-FAST-NEXT: retq
;
; AVX2-LABEL: test_v16f32_zero:
; AVX2: # %bb.0:
; AVX2-NEXT: vaddps %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX2-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX2-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX2-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_v16f32_zero:
; AVX512: # %bb.0:
; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm1
; AVX512-NEXT: vaddps %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = call fast float @llvm.vector.reduce.fadd.f32.v16f32(float 0.0, <16 x float> %a0)
  ret float %1
}

;
; vXf32 (undef)
;

define float @test_v2f32_undef(<2 x float> %a0) {
; SSE2-LABEL: test_v2f32_undef:
; SSE2: # %bb.0:
; SSE2-NEXT: movaps %xmm0, %xmm1
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1]
; SSE2-NEXT: addss %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v2f32_undef:
; SSE41: # %bb.0:
; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE41-NEXT: addss %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX1-SLOW-LABEL: test_v2f32_undef:
; AVX1-SLOW: # %bb.0:
; AVX1-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX1-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: retq
;
; AVX1-FAST-LABEL: test_v2f32_undef:
; AVX1-FAST: # %bb.0:
; AVX1-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT: retq
;
; AVX2-LABEL: test_v2f32_undef:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX2-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_v2f32_undef:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
  %1 = call fast float @llvm.vector.reduce.fadd.f32.v2f32(float 0.0, <2 x float> %a0)
  ret float %1
}

define float @test_v4f32_undef(<4 x float> %a0) {
; SSE2-LABEL: test_v4f32_undef:
; SSE2: # %bb.0:
; SSE2-NEXT: movaps %xmm0, %xmm1
; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE2-NEXT: addps %xmm1, %xmm0
; SSE2-NEXT: movaps %xmm0, %xmm1
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1]
; SSE2-NEXT: addss %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v4f32_undef:
; SSE41: # %bb.0:
; SSE41-NEXT: movaps %xmm0, %xmm1
; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE41-NEXT: addps %xmm1, %xmm0
; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE41-NEXT: addss %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX1-SLOW-LABEL: test_v4f32_undef:
; AVX1-SLOW: # %bb.0:
; AVX1-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX1-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX1-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: retq
;
; AVX1-FAST-LABEL: test_v4f32_undef:
; AVX1-FAST: # %bb.0:
; AVX1-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT: retq
;
; AVX2-LABEL: test_v4f32_undef:
; AVX2: # %bb.0:
; AVX2-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX2-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX2-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_v4f32_undef:
; AVX512: # %bb.0:
; AVX512-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
  %1 = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float 0.0, <4 x float> %a0)
  ret float %1
}

define float @test_v8f32_undef(<8 x float> %a0) {
; SSE2-LABEL: test_v8f32_undef:
; SSE2: # %bb.0:
; SSE2-NEXT: addps %xmm1, %xmm0
; SSE2-NEXT: movaps %xmm0, %xmm1
; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE2-NEXT: addps %xmm1, %xmm0
; SSE2-NEXT: movaps %xmm0, %xmm1
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1]
; SSE2-NEXT: addss %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v8f32_undef:
; SSE41: # %bb.0:
; SSE41-NEXT: addps %xmm1, %xmm0
; SSE41-NEXT: movaps %xmm0, %xmm1
; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE41-NEXT: addps %xmm1, %xmm0
; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE41-NEXT: addss %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX1-SLOW-LABEL: test_v8f32_undef:
; AVX1-SLOW: # %bb.0:
; AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX1-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX1-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vzeroupper
; AVX1-SLOW-NEXT: retq
;
; AVX1-FAST-LABEL: test_v8f32_undef:
; AVX1-FAST: # %bb.0:
; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-FAST-NEXT: vhaddps %xmm0, %xmm1, %xmm0
; AVX1-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT: vzeroupper
; AVX1-FAST-NEXT: retq
;
; AVX2-LABEL: test_v8f32_undef:
; AVX2: # %bb.0:
; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX2-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX2-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX2-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_v8f32_undef:
; AVX512: # %bb.0:
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = call fast float @llvm.vector.reduce.fadd.f32.v8f32(float 0.0, <8 x float> %a0)
  ret float %1
}

define float @test_v16f32_undef(<16 x float> %a0) {
; SSE2-LABEL: test_v16f32_undef:
; SSE2: # %bb.0:
; SSE2-NEXT: addps %xmm3, %xmm1
; SSE2-NEXT: addps %xmm2, %xmm0
; SSE2-NEXT: addps %xmm1, %xmm0
; SSE2-NEXT: movaps %xmm0, %xmm1
; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE2-NEXT: addps %xmm1, %xmm0
; SSE2-NEXT: movaps %xmm0, %xmm1
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1]
; SSE2-NEXT: addss %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v16f32_undef:
; SSE41: # %bb.0:
; SSE41-NEXT: addps %xmm3, %xmm1
; SSE41-NEXT: addps %xmm2, %xmm0
; SSE41-NEXT: addps %xmm1, %xmm0
; SSE41-NEXT: movaps %xmm0, %xmm1
; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE41-NEXT: addps %xmm1, %xmm0
; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE41-NEXT: addss %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX1-SLOW-LABEL: test_v16f32_undef:
; AVX1-SLOW: # %bb.0:
; AVX1-SLOW-NEXT: vaddps %ymm1, %ymm0, %ymm0
; AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX1-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX1-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vzeroupper
; AVX1-SLOW-NEXT: retq
;
; AVX1-FAST-LABEL: test_v16f32_undef:
; AVX1-FAST: # %bb.0:
; AVX1-FAST-NEXT: vaddps %ymm1, %ymm0, %ymm0
; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-FAST-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX1-FAST-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT: vzeroupper
; AVX1-FAST-NEXT: retq
;
; AVX2-LABEL: test_v16f32_undef:
; AVX2: # %bb.0:
; AVX2-NEXT: vaddps %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX2-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX2-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX2-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_v16f32_undef:
; AVX512: # %bb.0:
; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm1
; AVX512-NEXT: vaddps %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = call fast float @llvm.vector.reduce.fadd.f32.v16f32(float 0.0, <16 x float> %a0)
  ret float %1
}

;
; vXf64 (accum)
;

define double @test_v2f64(double %a0, <2 x double> %a1) {
; SSE-LABEL: test_v2f64:
; SSE: # %bb.0:
; SSE-NEXT: movapd %xmm1, %xmm2
; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
; SSE-NEXT: addsd %xmm1, %xmm2
; SSE-NEXT: addsd %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX1-SLOW-LABEL: test_v2f64:
; AVX1-SLOW: # %bb.0:
; AVX1-SLOW-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX1-SLOW-NEXT: vaddsd %xmm2, %xmm1, %xmm1
; AVX1-SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: retq
;
; AVX1-FAST-LABEL: test_v2f64:
; AVX1-FAST: # %bb.0:
; AVX1-FAST-NEXT: vhaddpd %xmm1, %xmm1, %xmm1
; AVX1-FAST-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT: retq
;
; AVX2-LABEL: test_v2f64:
; AVX2: # %bb.0:
; AVX2-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX2-NEXT: vaddsd %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_v2f64:
; AVX512: # %bb.0:
; AVX512-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX512-NEXT: vaddsd %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
  %1 = call fast double @llvm.vector.reduce.fadd.f64.v2f64(double %a0, <2 x double> %a1)
  ret double %1
}

define double @test_v4f64(double %a0, <4 x double> %a1) {
; SSE-LABEL: test_v4f64:
; SSE: # %bb.0:
; SSE-NEXT: addpd %xmm2, %xmm1
; SSE-NEXT: movapd %xmm1, %xmm2
; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
; SSE-NEXT: addsd %xmm1, %xmm2
; SSE-NEXT: addsd %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX1-SLOW-LABEL: test_v4f64:
; AVX1-SLOW: # %bb.0:
; AVX1-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-SLOW-NEXT: vaddpd %xmm2, %xmm1, %xmm1
; AVX1-SLOW-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX1-SLOW-NEXT: vaddsd %xmm2, %xmm1, %xmm1
; AVX1-SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vzeroupper
; AVX1-SLOW-NEXT: retq
;
; AVX1-FAST-LABEL: test_v4f64:
; AVX1-FAST: # %bb.0:
; AVX1-FAST-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-FAST-NEXT: vhaddpd %xmm1, %xmm2, %xmm1
; AVX1-FAST-NEXT: vhaddpd %xmm1, %xmm1, %xmm1
; AVX1-FAST-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT: vzeroupper
; AVX1-FAST-NEXT: retq
;
; AVX2-LABEL: test_v4f64:
; AVX2: # %bb.0:
; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX2-NEXT: vaddpd %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX2-NEXT: vaddsd %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_v4f64:
; AVX512: # %bb.0:
; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX512-NEXT: vaddpd %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX512-NEXT: vaddsd %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = call fast double @llvm.vector.reduce.fadd.f64.v4f64(double %a0, <4 x double> %a1)
  ret double %1
}

define double @test_v8f64(double %a0, <8 x double> %a1) {
; SSE-LABEL: test_v8f64:
; SSE: # %bb.0:
; SSE-NEXT: addpd %xmm4, %xmm2
; SSE-NEXT: addpd %xmm3, %xmm1
; SSE-NEXT: addpd %xmm2, %xmm1
; SSE-NEXT: movapd %xmm1, %xmm2
; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
; SSE-NEXT: addsd %xmm1, %xmm2
; SSE-NEXT: addsd %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX1-SLOW-LABEL: test_v8f64:
; AVX1-SLOW: # %bb.0:
; AVX1-SLOW-NEXT: vaddpd %ymm2, %ymm1, %ymm1
; AVX1-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-SLOW-NEXT: vaddpd %xmm2, %xmm1, %xmm1
; AVX1-SLOW-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX1-SLOW-NEXT: vaddsd %xmm2, %xmm1, %xmm1
; AVX1-SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vzeroupper
; AVX1-SLOW-NEXT: retq
;
; AVX1-FAST-LABEL: test_v8f64:
; AVX1-FAST: # %bb.0:
; AVX1-FAST-NEXT: vaddpd %ymm2, %ymm1, %ymm1
; AVX1-FAST-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-FAST-NEXT: vaddpd %xmm2, %xmm1, %xmm1
; AVX1-FAST-NEXT: vhaddpd %xmm1, %xmm1, %xmm1
; AVX1-FAST-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT: vzeroupper
; AVX1-FAST-NEXT: retq
;
; AVX2-LABEL: test_v8f64:
; AVX2: # %bb.0:
; AVX2-NEXT: vaddpd %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX2-NEXT: vaddpd %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX2-NEXT: vaddsd %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_v8f64:
; AVX512: # %bb.0:
; AVX512-NEXT: vextractf64x4 $1, %zmm1, %ymm2
; AVX512-NEXT: vaddpd %zmm2, %zmm1, %zmm1
; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX512-NEXT: vaddpd %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX512-NEXT: vaddsd %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = call fast double @llvm.vector.reduce.fadd.f64.v8f64(double %a0, <8 x double> %a1)
  ret double %1
}

define double @test_v16f64(double %a0, <16 x double> %a1) {
; SSE-LABEL: test_v16f64:
; SSE: # %bb.0:
; SSE-NEXT: addpd %xmm6, %xmm2
; SSE-NEXT: addpd %xmm7, %xmm3
; SSE-NEXT: addpd %xmm5, %xmm1
; SSE-NEXT: addpd {{[0-9]+}}(%rsp), %xmm4
; SSE-NEXT: addpd %xmm3, %xmm1
; SSE-NEXT: addpd %xmm2, %xmm4
; SSE-NEXT: addpd %xmm1, %xmm4
; SSE-NEXT: movapd %xmm4, %xmm1
; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm4[1]
; SSE-NEXT: addsd %xmm4, %xmm1
; SSE-NEXT: addsd %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX1-SLOW-LABEL: test_v16f64:
; AVX1-SLOW: # %bb.0:
; AVX1-SLOW-NEXT: vaddpd %ymm4, %ymm2, %ymm2
; AVX1-SLOW-NEXT: vaddpd %ymm3, %ymm1, %ymm1
; AVX1-SLOW-NEXT: vaddpd %ymm2, %ymm1, %ymm1
; AVX1-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-SLOW-NEXT: vaddpd %xmm2, %xmm1, %xmm1
; AVX1-SLOW-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX1-SLOW-NEXT: vaddsd %xmm2, %xmm1, %xmm1
; AVX1-SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vzeroupper
; AVX1-SLOW-NEXT: retq
;
; AVX1-FAST-LABEL: test_v16f64:
; AVX1-FAST: # %bb.0:
; AVX1-FAST-NEXT: vaddpd %ymm4, %ymm2, %ymm2
; AVX1-FAST-NEXT: vaddpd %ymm3, %ymm1, %ymm1
; AVX1-FAST-NEXT: vaddpd %ymm2, %ymm1, %ymm1
; AVX1-FAST-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-FAST-NEXT: vaddpd %xmm2, %xmm1, %xmm1
; AVX1-FAST-NEXT: vhaddpd %xmm1, %xmm1, %xmm1
; AVX1-FAST-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT: vzeroupper
; AVX1-FAST-NEXT: retq
;
; AVX2-LABEL: test_v16f64:
; AVX2: # %bb.0:
; AVX2-NEXT: vaddpd %ymm4, %ymm2, %ymm2
; AVX2-NEXT: vaddpd %ymm3, %ymm1, %ymm1
; AVX2-NEXT: vaddpd %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX2-NEXT: vaddpd %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX2-NEXT: vaddsd %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_v16f64:
; AVX512: # %bb.0:
; AVX512-NEXT: vaddpd %zmm2, %zmm1, %zmm1
; AVX512-NEXT: vextractf64x4 $1, %zmm1, %ymm2
; AVX512-NEXT: vaddpd %zmm2, %zmm1, %zmm1
; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX512-NEXT: vaddpd %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX512-NEXT: vaddsd %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = call fast double @llvm.vector.reduce.fadd.f64.v16f64(double %a0, <16 x double> %a1)
  ret double %1
}

;
; vXf64 (zero)
;

define double @test_v2f64_zero(<2 x double> %a0) {
; SSE-LABEL: test_v2f64_zero:
; SSE: # %bb.0:
; SSE-NEXT: movapd %xmm0, %xmm1
; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE-NEXT: addsd %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX1-SLOW-LABEL: test_v2f64_zero:
; AVX1-SLOW: # %bb.0:
; AVX1-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX1-SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: retq
;
; AVX1-FAST-LABEL: test_v2f64_zero:
; AVX1-FAST: # %bb.0:
; AVX1-FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT: retq
;
; AVX2-LABEL: test_v2f64_zero:
; AVX2: # %bb.0:
; AVX2-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX2-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_v2f64_zero:
; AVX512: # %bb.0:
; AVX512-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
  %1 = call fast double @llvm.vector.reduce.fadd.f64.v2f64(double 0.0, <2 x double> %a0)
  ret double %1
}

define double @test_v4f64_zero(<4 x double> %a0) {
; SSE-LABEL: test_v4f64_zero:
; SSE: # %bb.0:
; SSE-NEXT: addpd %xmm1, %xmm0
; SSE-NEXT: movapd %xmm0, %xmm1
; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE-NEXT: addsd %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX1-SLOW-LABEL: test_v4f64_zero:
; AVX1-SLOW: # %bb.0:
; AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-SLOW-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX1-SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vzeroupper
; AVX1-SLOW-NEXT: retq
;
; AVX1-FAST-LABEL: test_v4f64_zero:
; AVX1-FAST: # %bb.0:
; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-FAST-NEXT: vhaddpd %xmm0, %xmm1, %xmm0
; AVX1-FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT: vzeroupper
; AVX1-FAST-NEXT: retq
;
; AVX2-LABEL: test_v4f64_zero:
; AVX2: # %bb.0:
; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX2-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX2-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_v4f64_zero:
; AVX512: # %bb.0:
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = call fast double @llvm.vector.reduce.fadd.f64.v4f64(double 0.0, <4 x double> %a0)
  ret double %1
}

define double @test_v8f64_zero(<8 x double> %a0) {
; SSE-LABEL: test_v8f64_zero:
; SSE: # %bb.0:
; SSE-NEXT: addpd %xmm3, %xmm1
; SSE-NEXT: addpd %xmm2, %xmm0
; SSE-NEXT: addpd %xmm1, %xmm0
; SSE-NEXT: movapd %xmm0, %xmm1
; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE-NEXT: addsd %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX1-SLOW-LABEL: test_v8f64_zero:
; AVX1-SLOW: # %bb.0:
; AVX1-SLOW-NEXT: vaddpd %ymm1, %ymm0, %ymm0
; AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-SLOW-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX1-SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vzeroupper
; AVX1-SLOW-NEXT: retq
;
; AVX1-FAST-LABEL: test_v8f64_zero:
; AVX1-FAST: # %bb.0:
; AVX1-FAST-NEXT: vaddpd %ymm1, %ymm0, %ymm0
; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-FAST-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT: vzeroupper
; AVX1-FAST-NEXT: retq
;
; AVX2-LABEL: test_v8f64_zero:
; AVX2: # %bb.0:
; AVX2-NEXT: vaddpd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX2-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX2-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_v8f64_zero:
; AVX512: # %bb.0:
; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm1
; AVX512-NEXT: vaddpd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = call fast double @llvm.vector.reduce.fadd.f64.v8f64(double 0.0, <8 x double> %a0)
  ret double %1
}

define double @test_v16f64_zero(<16 x double> %a0) {
; SSE-LABEL: test_v16f64_zero:
; SSE: # %bb.0:
; SSE-NEXT: addpd %xmm6, %xmm2
; SSE-NEXT: addpd %xmm4, %xmm0
; SSE-NEXT: addpd %xmm2, %xmm0
; SSE-NEXT: addpd %xmm7, %xmm3
; SSE-NEXT: addpd %xmm5, %xmm1
; SSE-NEXT: addpd %xmm3, %xmm1
; SSE-NEXT: addpd %xmm1, %xmm0
; SSE-NEXT: movapd %xmm0, %xmm1
; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE-NEXT: addsd %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX1-SLOW-LABEL: test_v16f64_zero:
; AVX1-SLOW: # %bb.0:
; AVX1-SLOW-NEXT: vaddpd %ymm3, %ymm1, %ymm1
; AVX1-SLOW-NEXT: vaddpd %ymm2, %ymm0, %ymm0
; AVX1-SLOW-NEXT: vaddpd %ymm1, %ymm0, %ymm0
; AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-SLOW-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX1-SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vzeroupper
; AVX1-SLOW-NEXT: retq
;
; AVX1-FAST-LABEL: test_v16f64_zero:
; AVX1-FAST: # %bb.0:
; AVX1-FAST-NEXT: vaddpd %ymm3, %ymm1, %ymm1
; AVX1-FAST-NEXT: vaddpd %ymm2, %ymm0, %ymm0
; AVX1-FAST-NEXT: vaddpd %ymm1, %ymm0, %ymm0
; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-FAST-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT: vzeroupper
; AVX1-FAST-NEXT: retq
;
; AVX2-LABEL: test_v16f64_zero:
; AVX2: # %bb.0:
; AVX2-NEXT: vaddpd %ymm3, %ymm1, %ymm1
; AVX2-NEXT: vaddpd %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vaddpd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX2-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX2-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_v16f64_zero:
; AVX512: # %bb.0:
; AVX512-NEXT: vaddpd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm1
; AVX512-NEXT: vaddpd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = call fast double @llvm.vector.reduce.fadd.f64.v16f64(double 0.0, <16 x double> %a0)
  ret double %1
}

;
; vXf64 (undef)
;

define double @test_v2f64_undef(<2 x double> %a0) {
; SSE-LABEL: test_v2f64_undef:
; SSE: # %bb.0:
; SSE-NEXT: movapd %xmm0, %xmm1
; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE-NEXT: addsd %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX1-SLOW-LABEL: test_v2f64_undef:
; AVX1-SLOW: # %bb.0:
; AVX1-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX1-SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: retq
;
; AVX1-FAST-LABEL: test_v2f64_undef:
; AVX1-FAST: # %bb.0:
; AVX1-FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT: retq
;
; AVX2-LABEL: test_v2f64_undef:
; AVX2: # %bb.0:
; AVX2-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX2-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_v2f64_undef:
; AVX512: # %bb.0:
; AVX512-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
  %1 = call fast double @llvm.vector.reduce.fadd.f64.v2f64(double 0.0, <2 x double> %a0)
  ret double %1
}

define double @test_v4f64_undef(<4 x double> %a0) {
; SSE-LABEL: test_v4f64_undef:
; SSE: # %bb.0:
; SSE-NEXT: addpd %xmm1, %xmm0
; SSE-NEXT: movapd %xmm0, %xmm1
; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE-NEXT: addsd %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX1-SLOW-LABEL: test_v4f64_undef:
; AVX1-SLOW: # %bb.0:
; AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-SLOW-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX1-SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vzeroupper
; AVX1-SLOW-NEXT: retq
;
; AVX1-FAST-LABEL: test_v4f64_undef:
; AVX1-FAST: # %bb.0:
; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-FAST-NEXT: vhaddpd %xmm0, %xmm1, %xmm0
; AVX1-FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT: vzeroupper
; AVX1-FAST-NEXT: retq
;
; AVX2-LABEL: test_v4f64_undef:
; AVX2: # %bb.0:
; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX2-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX2-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_v4f64_undef:
; AVX512: # %bb.0:
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = call fast double @llvm.vector.reduce.fadd.f64.v4f64(double 0.0, <4 x double> %a0)
  ret double %1
}

define double @test_v8f64_undef(<8 x double> %a0) {
; SSE-LABEL: test_v8f64_undef:
; SSE: # %bb.0:
; SSE-NEXT: addpd %xmm3, %xmm1
; SSE-NEXT: addpd %xmm2, %xmm0
; SSE-NEXT: addpd %xmm1, %xmm0
; SSE-NEXT: movapd %xmm0, %xmm1
; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE-NEXT: addsd %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX1-SLOW-LABEL: test_v8f64_undef:
; AVX1-SLOW: # %bb.0:
; AVX1-SLOW-NEXT: vaddpd %ymm1, %ymm0, %ymm0
; AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-SLOW-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX1-SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vzeroupper
; AVX1-SLOW-NEXT: retq
;
; AVX1-FAST-LABEL: test_v8f64_undef:
; AVX1-FAST: # %bb.0:
; AVX1-FAST-NEXT: vaddpd %ymm1, %ymm0, %ymm0
; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-FAST-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT: vzeroupper
; AVX1-FAST-NEXT: retq
;
; AVX2-LABEL: test_v8f64_undef:
; AVX2: # %bb.0:
; AVX2-NEXT: vaddpd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX2-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX2-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_v8f64_undef:
; AVX512: # %bb.0:
; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm1
; AVX512-NEXT: vaddpd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = call fast double @llvm.vector.reduce.fadd.f64.v8f64(double 0.0, <8 x double> %a0)
  ret double %1
}

define double @test_v16f64_undef(<16 x double> %a0) {
; SSE-LABEL: test_v16f64_undef:
; SSE: # %bb.0:
; SSE-NEXT: addpd %xmm6, %xmm2
; SSE-NEXT: addpd %xmm4, %xmm0
; SSE-NEXT: addpd %xmm2, %xmm0
; SSE-NEXT: addpd %xmm7, %xmm3
; SSE-NEXT: addpd %xmm5, %xmm1
; SSE-NEXT: addpd %xmm3, %xmm1
; SSE-NEXT: addpd %xmm1, %xmm0
; SSE-NEXT: movapd %xmm0, %xmm1
; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE-NEXT: addsd %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX1-SLOW-LABEL: test_v16f64_undef:
; AVX1-SLOW: # %bb.0:
; AVX1-SLOW-NEXT: vaddpd %ymm3, %ymm1, %ymm1
; AVX1-SLOW-NEXT: vaddpd %ymm2, %ymm0, %ymm0
; AVX1-SLOW-NEXT: vaddpd %ymm1, %ymm0, %ymm0
; AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-SLOW-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX1-SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vzeroupper
; AVX1-SLOW-NEXT: retq
;
; AVX1-FAST-LABEL: test_v16f64_undef:
; AVX1-FAST: # %bb.0:
; AVX1-FAST-NEXT: vaddpd %ymm3, %ymm1, %ymm1
; AVX1-FAST-NEXT: vaddpd %ymm2, %ymm0, %ymm0
; AVX1-FAST-NEXT: vaddpd %ymm1, %ymm0, %ymm0
; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-FAST-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT: vzeroupper
; AVX1-FAST-NEXT: retq
;
; AVX2-LABEL: test_v16f64_undef:
; AVX2: # %bb.0:
; AVX2-NEXT: vaddpd %ymm3, %ymm1, %ymm1
; AVX2-NEXT: vaddpd %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vaddpd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX2-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX2-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_v16f64_undef:
; AVX512: # %bb.0:
; AVX512-NEXT: vaddpd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm1
; AVX512-NEXT: vaddpd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = call fast double @llvm.vector.reduce.fadd.f64.v16f64(double 0.0, <16 x double> %a0)
  ret double %1
}

declare float @llvm.vector.reduce.fadd.f32.v2f32(float, <2 x float>)
declare float @llvm.vector.reduce.fadd.f32.v4f32(float, <4 x float>)
declare float @llvm.vector.reduce.fadd.f32.v8f32(float, <8 x float>)
declare float @llvm.vector.reduce.fadd.f32.v16f32(float, <16 x float>)

declare double @llvm.vector.reduce.fadd.f64.v2f64(double, <2 x double>)
declare double @llvm.vector.reduce.fadd.f64.v4f64(double, <4 x double>)
declare double @llvm.vector.reduce.fadd.f64.v8f64(double, <8 x double>)
declare double @llvm.vector.reduce.fadd.f64.v16f64(double, <16 x double>)