; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64 -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2
; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64 -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41
; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64 -mattr=+avx | FileCheck %s --check-prefix=AVX
; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64 -mattr=+avx2 | FileCheck %s --check-prefix=AVX
; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64 -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=AVX512
; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64 -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefix=AVX512

;
; vXf32 (accum)
;

define float @test_v2f32(float %a0, <2 x float> %a1) {
; SSE2-LABEL: test_v2f32:
; SSE2: # %bb.0:
; SSE2-NEXT: movaps %xmm1, %xmm2
; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm1[1,1]
; SSE2-NEXT: mulss %xmm1, %xmm2
; SSE2-NEXT: mulss %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v2f32:
; SSE41: # %bb.0:
; SSE41-NEXT: movshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; SSE41-NEXT: mulss %xmm1, %xmm2
; SSE41-NEXT: mulss %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: test_v2f32:
; AVX: # %bb.0:
; AVX-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX-NEXT: vmulss %xmm2, %xmm1, %xmm1
; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v2f32:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
  %1 = call fast float @llvm.vector.reduce.fmul.f32.v2f32(float %a0, <2 x float> %a1)
  ret float %1
}

define float @test_v4f32(float %a0, <4 x float> %a1) {
; SSE2-LABEL: test_v4f32:
; SSE2: # %bb.0:
; SSE2-NEXT: movaps %xmm1, %xmm2
; SSE2-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
; SSE2-NEXT: mulps %xmm1, %xmm2
; SSE2-NEXT: movaps %xmm2, %xmm1
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm2[1,1]
; SSE2-NEXT: mulss %xmm2, %xmm1
; SSE2-NEXT: mulss %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v4f32:
; SSE41: # %bb.0:
; SSE41-NEXT: movaps %xmm1, %xmm2
; SSE41-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
; SSE41-NEXT: mulps %xmm1, %xmm2
; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
; SSE41-NEXT: mulss %xmm2, %xmm1
; SSE41-NEXT: mulss %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: test_v4f32:
; AVX: # %bb.0:
; AVX-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX-NEXT: vmulps %xmm2, %xmm1, %xmm1
; AVX-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX-NEXT: vmulss %xmm2, %xmm1, %xmm1
; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v4f32:
; AVX512: # %bb.0:
; AVX512-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX512-NEXT: vmulps %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
  %1 = call fast float @llvm.vector.reduce.fmul.f32.v4f32(float %a0, <4 x float> %a1)
  ret float %1
}

define float @test_v8f32(float %a0, <8 x float> %a1) {
; SSE2-LABEL: test_v8f32:
; SSE2: # %bb.0:
; SSE2-NEXT: mulps %xmm2, %xmm1
; SSE2-NEXT: movaps %xmm1, %xmm2
; SSE2-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
; SSE2-NEXT: mulps %xmm1, %xmm2
; SSE2-NEXT: movaps %xmm2, %xmm1
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm2[1,1]
; SSE2-NEXT: mulss %xmm2, %xmm1
; SSE2-NEXT: mulss %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v8f32:
; SSE41: # %bb.0:
; SSE41-NEXT: mulps %xmm2, %xmm1
; SSE41-NEXT: movaps %xmm1, %xmm2
; SSE41-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
; SSE41-NEXT: mulps %xmm1, %xmm2
; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
; SSE41-NEXT: mulss %xmm2, %xmm1
; SSE41-NEXT: mulss %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: test_v8f32:
; AVX: # %bb.0:
; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX-NEXT: vmulps %xmm2, %xmm1, %xmm1
; AVX-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX-NEXT: vmulps %xmm2, %xmm1, %xmm1
; AVX-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX-NEXT: vmulss %xmm2, %xmm1, %xmm1
; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v8f32:
; AVX512: # %bb.0:
; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX512-NEXT: vmulps %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX512-NEXT: vmulps %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = call fast float @llvm.vector.reduce.fmul.f32.v8f32(float %a0, <8 x float> %a1)
  ret float %1
}

define float @test_v16f32(float %a0, <16 x float> %a1) {
; SSE2-LABEL: test_v16f32:
; SSE2: # %bb.0:
; SSE2-NEXT: mulps %xmm4, %xmm2
; SSE2-NEXT: mulps %xmm3, %xmm1
; SSE2-NEXT: mulps %xmm2, %xmm1
; SSE2-NEXT: movaps %xmm1, %xmm2
; SSE2-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
; SSE2-NEXT: mulps %xmm1, %xmm2
; SSE2-NEXT: movaps %xmm2, %xmm1
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm2[1,1]
; SSE2-NEXT: mulss %xmm2, %xmm1
; SSE2-NEXT: mulss %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v16f32:
; SSE41: # %bb.0:
; SSE41-NEXT: mulps %xmm4, %xmm2
; SSE41-NEXT: mulps %xmm3, %xmm1
; SSE41-NEXT: mulps %xmm2, %xmm1
; SSE41-NEXT: movaps %xmm1, %xmm2
; SSE41-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
; SSE41-NEXT: mulps %xmm1, %xmm2
; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
; SSE41-NEXT: mulss %xmm2, %xmm1
; SSE41-NEXT: mulss %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: test_v16f32:
; AVX: # %bb.0:
; AVX-NEXT: vmulps %ymm2, %ymm1, %ymm1
; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX-NEXT: vmulps %xmm2, %xmm1, %xmm1
; AVX-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX-NEXT: vmulps %xmm2, %xmm1, %xmm1
; AVX-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX-NEXT: vmulss %xmm2, %xmm1, %xmm1
; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v16f32:
; AVX512: # %bb.0:
; AVX512-NEXT: vextractf64x4 $1, %zmm1, %ymm2
; AVX512-NEXT: vmulps %zmm2, %zmm1, %zmm1
; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX512-NEXT: vmulps %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX512-NEXT: vmulps %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = call fast float @llvm.vector.reduce.fmul.f32.v16f32(float %a0, <16 x float> %a1)
  ret float %1
}

;
; vXf32 (one)
;

define float @test_v2f32_zero(<2 x float> %a0) {
; SSE2-LABEL: test_v2f32_zero:
; SSE2: # %bb.0:
; SSE2-NEXT: movaps %xmm0, %xmm1
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1]
; SSE2-NEXT: mulss %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v2f32_zero:
; SSE41: # %bb.0:
; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE41-NEXT: mulss %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: test_v2f32_zero:
; AVX: # %bb.0:
; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v2f32_zero:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
  %1 = call fast float @llvm.vector.reduce.fmul.f32.v2f32(float 1.0, <2 x float> %a0)
  ret float %1
}

define float @test_v4f32_zero(<4 x float> %a0) {
; SSE2-LABEL: test_v4f32_zero:
; SSE2: # %bb.0:
; SSE2-NEXT: movaps %xmm0, %xmm1
; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE2-NEXT: mulps %xmm1, %xmm0
; SSE2-NEXT: movaps %xmm0, %xmm1
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1]
; SSE2-NEXT: mulss %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v4f32_zero:
; SSE41: # %bb.0:
; SSE41-NEXT: movaps %xmm0, %xmm1
; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE41-NEXT: mulps %xmm1, %xmm0
; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE41-NEXT: mulss %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: test_v4f32_zero:
; AVX: # %bb.0:
; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT: vmulps %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v4f32_zero:
; AVX512: # %bb.0:
; AVX512-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT: vmulps %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
  %1 = call fast float @llvm.vector.reduce.fmul.f32.v4f32(float 1.0, <4 x float> %a0)
  ret float %1
}

define float @test_v8f32_zero(<8 x float> %a0) {
; SSE2-LABEL: test_v8f32_zero:
; SSE2: # %bb.0:
; SSE2-NEXT: mulps %xmm1, %xmm0
; SSE2-NEXT: movaps %xmm0, %xmm1
; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE2-NEXT: mulps %xmm1, %xmm0
; SSE2-NEXT: movaps %xmm0, %xmm1
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1]
; SSE2-NEXT: mulss %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v8f32_zero:
; SSE41: # %bb.0:
; SSE41-NEXT: mulps %xmm1, %xmm0
; SSE41-NEXT: movaps %xmm0, %xmm1
; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE41-NEXT: mulps %xmm1, %xmm0
; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE41-NEXT: mulss %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: test_v8f32_zero:
; AVX: # %bb.0:
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT: vmulps %xmm1, %xmm0, %xmm0
; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT: vmulps %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v8f32_zero:
; AVX512: # %bb.0:
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT: vmulps %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT: vmulps %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = call fast float @llvm.vector.reduce.fmul.f32.v8f32(float 1.0, <8 x float> %a0)
  ret float %1
}

define float @test_v16f32_zero(<16 x float> %a0) {
; SSE2-LABEL: test_v16f32_zero:
; SSE2: # %bb.0:
; SSE2-NEXT: mulps %xmm3, %xmm1
; SSE2-NEXT: mulps %xmm2, %xmm0
; SSE2-NEXT: mulps %xmm1, %xmm0
; SSE2-NEXT: movaps %xmm0, %xmm1
; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE2-NEXT: mulps %xmm1, %xmm0
; SSE2-NEXT: movaps %xmm0, %xmm1
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1]
; SSE2-NEXT: mulss %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v16f32_zero:
; SSE41: # %bb.0:
; SSE41-NEXT: mulps %xmm3, %xmm1
; SSE41-NEXT: mulps %xmm2, %xmm0
; SSE41-NEXT: mulps %xmm1, %xmm0
; SSE41-NEXT: movaps %xmm0, %xmm1
; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE41-NEXT: mulps %xmm1, %xmm0
; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE41-NEXT: mulss %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: test_v16f32_zero:
; AVX: # %bb.0:
; AVX-NEXT: vmulps %ymm1, %ymm0, %ymm0
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT: vmulps %xmm1, %xmm0, %xmm0
; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT: vmulps %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v16f32_zero:
; AVX512: # %bb.0:
; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm1
; AVX512-NEXT: vmulps %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT: vmulps %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT: vmulps %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = call fast float @llvm.vector.reduce.fmul.f32.v16f32(float 1.0, <16 x float> %a0)
  ret float %1
}

;
; vXf32 (undef)
;

define float @test_v2f32_undef(<2 x float> %a0) {
; SSE2-LABEL: test_v2f32_undef:
; SSE2: # %bb.0:
; SSE2-NEXT: movaps %xmm0, %xmm1
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1]
; SSE2-NEXT: mulss %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v2f32_undef:
; SSE41: # %bb.0:
; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE41-NEXT: mulss %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: test_v2f32_undef:
; AVX: # %bb.0:
; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v2f32_undef:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
  %1 = call fast float @llvm.vector.reduce.fmul.f32.v2f32(float 1.0, <2 x float> %a0)
  ret float %1
}

define float @test_v4f32_undef(<4 x float> %a0) {
; SSE2-LABEL: test_v4f32_undef:
; SSE2: # %bb.0:
; SSE2-NEXT: movaps %xmm0, %xmm1
; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE2-NEXT: mulps %xmm1, %xmm0
; SSE2-NEXT: movaps %xmm0, %xmm1
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1]
; SSE2-NEXT: mulss %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v4f32_undef:
; SSE41: # %bb.0:
; SSE41-NEXT: movaps %xmm0, %xmm1
; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE41-NEXT: mulps %xmm1, %xmm0
; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE41-NEXT: mulss %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: test_v4f32_undef:
; AVX: # %bb.0:
; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT: vmulps %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v4f32_undef:
; AVX512: # %bb.0:
; AVX512-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT: vmulps %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
  %1 = call fast float @llvm.vector.reduce.fmul.f32.v4f32(float 1.0, <4 x float> %a0)
  ret float %1
}

define float @test_v8f32_undef(<8 x float> %a0) {
; SSE2-LABEL: test_v8f32_undef:
; SSE2: # %bb.0:
; SSE2-NEXT: mulps %xmm1, %xmm0
; SSE2-NEXT: movaps %xmm0, %xmm1
; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE2-NEXT: mulps %xmm1, %xmm0
; SSE2-NEXT: movaps %xmm0, %xmm1
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1]
; SSE2-NEXT: mulss %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v8f32_undef:
; SSE41: # %bb.0:
; SSE41-NEXT: mulps %xmm1, %xmm0
; SSE41-NEXT: movaps %xmm0, %xmm1
; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE41-NEXT: mulps %xmm1, %xmm0
; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE41-NEXT: mulss %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: test_v8f32_undef:
; AVX: # %bb.0:
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT: vmulps %xmm1, %xmm0, %xmm0
; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT: vmulps %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v8f32_undef:
; AVX512: # %bb.0:
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT: vmulps %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT: vmulps %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = call fast float @llvm.vector.reduce.fmul.f32.v8f32(float 1.0, <8 x float> %a0)
  ret float %1
}

define float @test_v16f32_undef(<16 x float> %a0) {
; SSE2-LABEL: test_v16f32_undef:
; SSE2: # %bb.0:
; SSE2-NEXT: mulps %xmm3, %xmm1
; SSE2-NEXT: mulps %xmm2, %xmm0
; SSE2-NEXT: mulps %xmm1, %xmm0
; SSE2-NEXT: movaps %xmm0, %xmm1
; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE2-NEXT: mulps %xmm1, %xmm0
; SSE2-NEXT: movaps %xmm0, %xmm1
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1]
; SSE2-NEXT: mulss %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v16f32_undef:
; SSE41: # %bb.0:
; SSE41-NEXT: mulps %xmm3, %xmm1
; SSE41-NEXT: mulps %xmm2, %xmm0
; SSE41-NEXT: mulps %xmm1, %xmm0
; SSE41-NEXT: movaps %xmm0, %xmm1
; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE41-NEXT: mulps %xmm1, %xmm0
; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE41-NEXT: mulss %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: test_v16f32_undef:
; AVX: # %bb.0:
; AVX-NEXT: vmulps %ymm1, %ymm0, %ymm0
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT: vmulps %xmm1, %xmm0, %xmm0
; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT: vmulps %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v16f32_undef:
; AVX512: # %bb.0:
; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm1
; AVX512-NEXT: vmulps %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT: vmulps %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT: vmulps %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = call fast float @llvm.vector.reduce.fmul.f32.v16f32(float 1.0, <16 x float> %a0)
  ret float %1
}

;
; vXf64 (accum)
;

define double @test_v2f64(double %a0, <2 x double> %a1) {
; SSE-LABEL: test_v2f64:
; SSE: # %bb.0:
; SSE-NEXT: movapd %xmm1, %xmm2
; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
; SSE-NEXT: mulsd %xmm1, %xmm2
; SSE-NEXT: mulsd %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test_v2f64:
; AVX: # %bb.0:
; AVX-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX-NEXT: vmulsd %xmm2, %xmm1, %xmm1
; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v2f64:
; AVX512: # %bb.0:
; AVX512-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX512-NEXT: vmulsd %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
  %1 = call fast double @llvm.vector.reduce.fmul.f64.v2f64(double %a0, <2 x double> %a1)
  ret double %1
}

define double @test_v4f64(double %a0, <4 x double> %a1) {
; SSE-LABEL: test_v4f64:
; SSE: # %bb.0:
; SSE-NEXT: mulpd %xmm2, %xmm1
; SSE-NEXT: movapd %xmm1, %xmm2
; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
; SSE-NEXT: mulsd %xmm1, %xmm2
; SSE-NEXT: mulsd %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test_v4f64:
; AVX: # %bb.0:
; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX-NEXT: vmulpd %xmm2, %xmm1, %xmm1
; AVX-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX-NEXT: vmulsd %xmm2, %xmm1, %xmm1
; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v4f64:
; AVX512: # %bb.0:
; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX512-NEXT: vmulpd %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX512-NEXT: vmulsd %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = call fast double @llvm.vector.reduce.fmul.f64.v4f64(double %a0, <4 x double> %a1)
  ret double %1
}

define double @test_v8f64(double %a0, <8 x double> %a1) {
; SSE-LABEL: test_v8f64:
; SSE: # %bb.0:
; SSE-NEXT: mulpd %xmm4, %xmm2
; SSE-NEXT: mulpd %xmm3, %xmm1
; SSE-NEXT: mulpd %xmm2, %xmm1
; SSE-NEXT: movapd %xmm1, %xmm2
; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
; SSE-NEXT: mulsd %xmm1, %xmm2
; SSE-NEXT: mulsd %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test_v8f64:
; AVX: # %bb.0:
; AVX-NEXT: vmulpd %ymm2, %ymm1, %ymm1
; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX-NEXT: vmulpd %xmm2, %xmm1, %xmm1
; AVX-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX-NEXT: vmulsd %xmm2, %xmm1, %xmm1
; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v8f64:
; AVX512: # %bb.0:
; AVX512-NEXT: vextractf64x4 $1, %zmm1, %ymm2
; AVX512-NEXT: vmulpd %zmm2, %zmm1, %zmm1
; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX512-NEXT: vmulpd %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX512-NEXT: vmulsd %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = call fast double @llvm.vector.reduce.fmul.f64.v8f64(double %a0, <8 x double> %a1)
  ret double %1
}

define double @test_v16f64(double %a0, <16 x double> %a1) {
; SSE-LABEL: test_v16f64:
; SSE: # %bb.0:
; SSE-NEXT: mulpd %xmm6, %xmm2
; SSE-NEXT: mulpd %xmm7, %xmm3
; SSE-NEXT: mulpd %xmm5, %xmm1
; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm4
; SSE-NEXT: mulpd %xmm3, %xmm1
; SSE-NEXT: mulpd %xmm2, %xmm4
; SSE-NEXT: mulpd %xmm1, %xmm4
; SSE-NEXT: movapd %xmm4, %xmm1
; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm4[1]
; SSE-NEXT: mulsd %xmm4, %xmm1
; SSE-NEXT: mulsd %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test_v16f64:
; AVX: # %bb.0:
; AVX-NEXT: vmulpd %ymm4, %ymm2, %ymm2
; AVX-NEXT: vmulpd %ymm3, %ymm1, %ymm1
; AVX-NEXT: vmulpd %ymm2, %ymm1, %ymm1
; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX-NEXT: vmulpd %xmm2, %xmm1, %xmm1
; AVX-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX-NEXT: vmulsd %xmm2, %xmm1, %xmm1
; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v16f64:
; AVX512: # %bb.0:
; AVX512-NEXT: vmulpd %zmm2, %zmm1, %zmm1
; AVX512-NEXT: vextractf64x4 $1, %zmm1, %ymm2
; AVX512-NEXT: vmulpd %zmm2, %zmm1, %zmm1
; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX512-NEXT: vmulpd %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX512-NEXT: vmulsd %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = call fast double @llvm.vector.reduce.fmul.f64.v16f64(double %a0, <16 x double> %a1)
  ret double %1
}

;
; vXf64 (one)
;

define double @test_v2f64_zero(<2 x double> %a0) {
; SSE-LABEL: test_v2f64_zero:
; SSE: # %bb.0:
; SSE-NEXT: movapd %xmm0, %xmm1
; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE-NEXT: mulsd %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test_v2f64_zero:
; AVX: # %bb.0:
; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v2f64_zero:
; AVX512: # %bb.0:
; AVX512-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
  %1 = call fast double @llvm.vector.reduce.fmul.f64.v2f64(double 1.0, <2 x double> %a0)
  ret double %1
}

define double @test_v4f64_zero(<4 x double> %a0) {
; SSE-LABEL: test_v4f64_zero:
; SSE: # %bb.0:
; SSE-NEXT: mulpd %xmm1, %xmm0
; SSE-NEXT: movapd %xmm0, %xmm1
; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE-NEXT: mulsd %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test_v4f64_zero:
; AVX: # %bb.0:
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT: vmulpd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v4f64_zero:
; AVX512: # %bb.0:
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT: vmulpd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = call fast double @llvm.vector.reduce.fmul.f64.v4f64(double 1.0, <4 x double> %a0)
  ret double %1
}

define double @test_v8f64_zero(<8 x double> %a0) {
; SSE-LABEL: test_v8f64_zero:
; SSE: # %bb.0:
; SSE-NEXT: mulpd %xmm3, %xmm1
; SSE-NEXT: mulpd %xmm2, %xmm0
; SSE-NEXT: mulpd %xmm1, %xmm0
; SSE-NEXT: movapd %xmm0, %xmm1
; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE-NEXT: mulsd %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test_v8f64_zero:
; AVX: # %bb.0:
; AVX-NEXT: vmulpd %ymm1, %ymm0, %ymm0
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT: vmulpd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v8f64_zero:
; AVX512: # %bb.0:
; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm1
; AVX512-NEXT: vmulpd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT: vmulpd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = call fast double @llvm.vector.reduce.fmul.f64.v8f64(double 1.0, <8 x double> %a0)
  ret double %1
}

define double @test_v16f64_zero(<16 x double> %a0) {
; SSE-LABEL: test_v16f64_zero:
; SSE: # %bb.0:
; SSE-NEXT: mulpd %xmm6, %xmm2
; SSE-NEXT: mulpd %xmm4, %xmm0
; SSE-NEXT: mulpd %xmm2, %xmm0
; SSE-NEXT: mulpd %xmm7, %xmm3
; SSE-NEXT: mulpd %xmm5, %xmm1
; SSE-NEXT: mulpd %xmm3, %xmm1
; SSE-NEXT: mulpd %xmm1, %xmm0
; SSE-NEXT: movapd %xmm0, %xmm1
; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE-NEXT: mulsd %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test_v16f64_zero:
; AVX: # %bb.0:
; AVX-NEXT: vmulpd %ymm3, %ymm1, %ymm1
; AVX-NEXT: vmulpd %ymm2, %ymm0, %ymm0
; AVX-NEXT: vmulpd %ymm1, %ymm0, %ymm0
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT: vmulpd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v16f64_zero:
; AVX512: # %bb.0:
; AVX512-NEXT: vmulpd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm1
; AVX512-NEXT: vmulpd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT: vmulpd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = call fast double @llvm.vector.reduce.fmul.f64.v16f64(double 1.0, <16 x double> %a0)
  ret double %1
}

;
; vXf64 (undef)
;

define double @test_v2f64_undef(<2 x double> %a0) {
; SSE-LABEL: test_v2f64_undef:
; SSE: # %bb.0:
; SSE-NEXT: movapd %xmm0, %xmm1
; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE-NEXT: mulsd %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test_v2f64_undef:
; AVX: # %bb.0:
; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v2f64_undef:
; AVX512: # %bb.0:
; AVX512-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
  %1 = call fast double @llvm.vector.reduce.fmul.f64.v2f64(double 1.0, <2 x double> %a0)
  ret double %1
}

define double @test_v4f64_undef(<4 x double> %a0) {
; SSE-LABEL: test_v4f64_undef:
; SSE: # %bb.0:
; SSE-NEXT: mulpd %xmm1, %xmm0
; SSE-NEXT: movapd %xmm0, %xmm1
; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE-NEXT: mulsd %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test_v4f64_undef:
; AVX: # %bb.0:
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT: vmulpd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v4f64_undef:
; AVX512: # %bb.0:
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT: vmulpd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = call fast double @llvm.vector.reduce.fmul.f64.v4f64(double 1.0, <4 x double> %a0)
  ret double %1
}

define double @test_v8f64_undef(<8 x double> %a0) {
; SSE-LABEL: test_v8f64_undef:
; SSE: # %bb.0:
; SSE-NEXT: mulpd %xmm3, %xmm1
; SSE-NEXT: mulpd %xmm2, %xmm0
; SSE-NEXT: mulpd %xmm1, %xmm0
; SSE-NEXT: movapd %xmm0, %xmm1
; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE-NEXT: mulsd %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test_v8f64_undef:
; AVX: # %bb.0:
; AVX-NEXT: vmulpd %ymm1, %ymm0, %ymm0
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT: vmulpd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v8f64_undef:
; AVX512: # %bb.0:
; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm1
; AVX512-NEXT: vmulpd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT: vmulpd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = call fast double @llvm.vector.reduce.fmul.f64.v8f64(double 1.0, <8 x double> %a0)
  ret double %1
}

define double @test_v16f64_undef(<16 x double> %a0) {
; SSE-LABEL: test_v16f64_undef:
; SSE: # %bb.0:
; SSE-NEXT: mulpd %xmm6, %xmm2
; SSE-NEXT: mulpd %xmm4, %xmm0
; SSE-NEXT: mulpd %xmm2, %xmm0
; SSE-NEXT: mulpd %xmm7, %xmm3
; SSE-NEXT: mulpd %xmm5, %xmm1
; SSE-NEXT: mulpd %xmm3, %xmm1
; SSE-NEXT: mulpd %xmm1, %xmm0
; SSE-NEXT: movapd %xmm0, %xmm1
; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE-NEXT: mulsd %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test_v16f64_undef:
; AVX: # %bb.0:
; AVX-NEXT: vmulpd %ymm3, %ymm1, %ymm1
; AVX-NEXT: vmulpd %ymm2, %ymm0, %ymm0
; AVX-NEXT: vmulpd %ymm1, %ymm0, %ymm0
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT: vmulpd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v16f64_undef:
; AVX512: # %bb.0:
; AVX512-NEXT: vmulpd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm1
; AVX512-NEXT: vmulpd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT: vmulpd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = call fast double @llvm.vector.reduce.fmul.f64.v16f64(double 1.0, <16 x double> %a0)
  ret double %1
}

declare float @llvm.vector.reduce.fmul.f32.v2f32(float, <2 x float>)
declare float @llvm.vector.reduce.fmul.f32.v4f32(float, <4 x float>)
declare float @llvm.vector.reduce.fmul.f32.v8f32(float, <8 x float>)
declare float @llvm.vector.reduce.fmul.f32.v16f32(float, <16 x float>)

declare double @llvm.vector.reduce.fmul.f64.v2f64(double, <2 x double>)
declare double @llvm.vector.reduce.fmul.f64.v4f64(double, <4 x double>)
declare double @llvm.vector.reduce.fmul.f64.v8f64(double, <8 x double>)
declare double @llvm.vector.reduce.fmul.f64.v16f64(double, <16 x double>)