; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=AVX512
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefix=AVX512

;
; vXf32 (accum)
;

define float @test_v2f32(float %a0, <2 x float> %a1) {
; SSE2-LABEL: test_v2f32:
; SSE2: # %bb.0:
; SSE2-NEXT: mulss %xmm1, %xmm0
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,1,1]
; SSE2-NEXT: mulss %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v2f32:
; SSE41: # %bb.0:
; SSE41-NEXT: mulss %xmm1, %xmm0
; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE41-NEXT: mulss %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: test_v2f32:
; AVX: # %bb.0:
; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm1[1,1,3,3]
; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v2f32:
; AVX512: # %bb.0:
; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm1[1,1,3,3]
; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
  %1 = call float @llvm.vector.reduce.fmul.f32.v2f32(float %a0, <2 x float> %a1)
  ret float %1
}

define float @test_v4f32(float %a0, <4 x float> %a1) {
; SSE2-LABEL: test_v4f32:
; SSE2: # %bb.0:
; SSE2-NEXT: mulss %xmm1, %xmm0
; SSE2-NEXT: movaps %xmm1, %xmm2
; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm1[1,1]
; SSE2-NEXT: mulss %xmm2, %xmm0
; SSE2-NEXT: movaps %xmm1, %xmm2
; SSE2-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
; SSE2-NEXT: mulss %xmm2, %xmm0
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; SSE2-NEXT: mulss %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v4f32:
; SSE41: # %bb.0:
; SSE41-NEXT: mulss %xmm1, %xmm0
; SSE41-NEXT: movshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; SSE41-NEXT: mulss %xmm2, %xmm0
; SSE41-NEXT: movaps %xmm1, %xmm2
; SSE41-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
; SSE41-NEXT: mulss %xmm2, %xmm0
; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; SSE41-NEXT: mulss %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: test_v4f32:
; AVX: # %bb.0:
; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX-NEXT: vmulss %xmm2, %xmm0, %xmm0
; AVX-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX-NEXT: vmulss %xmm2, %xmm0, %xmm0
; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v4f32:
; AVX512: # %bb.0:
; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX512-NEXT: vmulss %xmm2, %xmm0, %xmm0
; AVX512-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX512-NEXT: vmulss %xmm2, %xmm0, %xmm0
; AVX512-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
  %1 = call float @llvm.vector.reduce.fmul.f32.v4f32(float %a0, <4 x float> %a1)
  ret float %1
}

define float @test_v8f32(float %a0, <8 x float> %a1) {
; SSE2-LABEL: test_v8f32:
; SSE2: # %bb.0:
; SSE2-NEXT: mulss %xmm1, %xmm0
; SSE2-NEXT: movaps %xmm1, %xmm3
; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm1[1,1]
; SSE2-NEXT: mulss %xmm3, %xmm0
; SSE2-NEXT: movaps %xmm1, %xmm3
; SSE2-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1]
; SSE2-NEXT: mulss %xmm3, %xmm0
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; SSE2-NEXT: mulss %xmm1, %xmm0
; SSE2-NEXT: mulss %xmm2, %xmm0
; SSE2-NEXT: movaps %xmm2, %xmm1
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm2[1,1]
; SSE2-NEXT: mulss %xmm1, %xmm0
; SSE2-NEXT: movaps %xmm2, %xmm1
; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1]
; SSE2-NEXT: mulss %xmm1, %xmm0
; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3,3,3]
; SSE2-NEXT: mulss %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v8f32:
; SSE41: # %bb.0:
; SSE41-NEXT: mulss %xmm1, %xmm0
; SSE41-NEXT: movshdup {{.*#+}} xmm3 = xmm1[1,1,3,3]
; SSE41-NEXT: mulss %xmm3, %xmm0
; SSE41-NEXT: movaps %xmm1, %xmm3
; SSE41-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1]
; SSE41-NEXT: mulss %xmm3, %xmm0
; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; SSE41-NEXT: mulss %xmm1, %xmm0
; SSE41-NEXT: mulss %xmm2, %xmm0
; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
; SSE41-NEXT: mulss %xmm1, %xmm0
; SSE41-NEXT: movaps %xmm2, %xmm1
; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1]
; SSE41-NEXT: mulss %xmm1, %xmm0
; SSE41-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3,3,3]
; SSE41-NEXT: mulss %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: test_v8f32:
; AVX: # %bb.0:
; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX-NEXT: vmulss %xmm2, %xmm0, %xmm0
; AVX-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX-NEXT: vmulss %xmm2, %xmm0, %xmm0
; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm1[3,3,3,3]
; AVX-NEXT: vmulss %xmm2, %xmm0, %xmm0
; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX-NEXT: vmulss %xmm2, %xmm0, %xmm0
; AVX-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX-NEXT: vmulss %xmm2, %xmm0, %xmm0
; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v8f32:
; AVX512: # %bb.0:
; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX512-NEXT: vmulss %xmm2, %xmm0, %xmm0
; AVX512-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX512-NEXT: vmulss %xmm2, %xmm0, %xmm0
; AVX512-NEXT: vshufps {{.*#+}} xmm2 = xmm1[3,3,3,3]
; AVX512-NEXT: vmulss %xmm2, %xmm0, %xmm0
; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX512-NEXT: vmulss %xmm2, %xmm0, %xmm0
; AVX512-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX512-NEXT: vmulss %xmm2, %xmm0, %xmm0
; AVX512-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = call float @llvm.vector.reduce.fmul.f32.v8f32(float %a0, <8 x float> %a1)
  ret float %1
}

define float @test_v16f32(float %a0, <16 x float> %a1) {
; SSE2-LABEL: test_v16f32:
; SSE2: # %bb.0:
; SSE2-NEXT: mulss %xmm1, %xmm0
; SSE2-NEXT: movaps %xmm1, %xmm5
; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm1[1,1]
; SSE2-NEXT: mulss %xmm5, %xmm0
; SSE2-NEXT: movaps %xmm1, %xmm5
; SSE2-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm1[1]
; SSE2-NEXT: mulss %xmm5, %xmm0
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; SSE2-NEXT: mulss %xmm1, %xmm0
; SSE2-NEXT: mulss %xmm2, %xmm0
; SSE2-NEXT: movaps %xmm2, %xmm1
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm2[1,1]
; SSE2-NEXT: mulss %xmm1, %xmm0
; SSE2-NEXT: movaps %xmm2, %xmm1
; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1]
; SSE2-NEXT: mulss %xmm1, %xmm0
; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3,3,3]
; SSE2-NEXT: mulss %xmm2, %xmm0
; SSE2-NEXT: mulss %xmm3, %xmm0
; SSE2-NEXT: movaps %xmm3, %xmm1
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm3[1,1]
; SSE2-NEXT: mulss %xmm1, %xmm0
; SSE2-NEXT: movaps %xmm3, %xmm1
; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1]
; SSE2-NEXT: mulss %xmm1, %xmm0
; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3,3,3]
; SSE2-NEXT: mulss %xmm3, %xmm0
; SSE2-NEXT: mulss %xmm4, %xmm0
; SSE2-NEXT: movaps %xmm4, %xmm1
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm4[1,1]
; SSE2-NEXT: mulss %xmm1, %xmm0
; SSE2-NEXT: movaps %xmm4, %xmm1
; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm4[1]
; SSE2-NEXT: mulss %xmm1, %xmm0
; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,3,3,3]
; SSE2-NEXT: mulss %xmm4, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v16f32:
; SSE41: # %bb.0:
; SSE41-NEXT: mulss %xmm1, %xmm0
; SSE41-NEXT: movshdup {{.*#+}} xmm5 = xmm1[1,1,3,3]
; SSE41-NEXT: mulss %xmm5, %xmm0
; SSE41-NEXT: movaps %xmm1, %xmm5
; SSE41-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm1[1]
; SSE41-NEXT: mulss %xmm5, %xmm0
; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; SSE41-NEXT: mulss %xmm1, %xmm0
; SSE41-NEXT: mulss %xmm2, %xmm0
; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
; SSE41-NEXT: mulss %xmm1, %xmm0
; SSE41-NEXT: movaps %xmm2, %xmm1
; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1]
; SSE41-NEXT: mulss %xmm1, %xmm0
; SSE41-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3,3,3]
; SSE41-NEXT: mulss %xmm2, %xmm0
; SSE41-NEXT: mulss %xmm3, %xmm0
; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm3[1,1,3,3]
; SSE41-NEXT: mulss %xmm1, %xmm0
; SSE41-NEXT: movaps %xmm3, %xmm1
; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1]
; SSE41-NEXT: mulss %xmm1, %xmm0
; SSE41-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3,3,3]
; SSE41-NEXT: mulss %xmm3, %xmm0
; SSE41-NEXT: mulss %xmm4, %xmm0
; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm4[1,1,3,3]
; SSE41-NEXT: mulss %xmm1, %xmm0
; SSE41-NEXT: movaps %xmm4, %xmm1
; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm4[1]
; SSE41-NEXT: mulss %xmm1, %xmm0
; SSE41-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,3,3,3]
; SSE41-NEXT: mulss %xmm4, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: test_v16f32:
; AVX: # %bb.0:
; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovshdup {{.*#+}} xmm3 = xmm1[1,1,3,3]
; AVX-NEXT: vmulss %xmm3, %xmm0, %xmm0
; AVX-NEXT: vshufpd {{.*#+}} xmm3 = xmm1[1,0]
; AVX-NEXT: vmulss %xmm3, %xmm0, %xmm0
; AVX-NEXT: vshufps {{.*#+}} xmm3 = xmm1[3,3,3,3]
; AVX-NEXT: vmulss %xmm3, %xmm0, %xmm0
; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovshdup {{.*#+}} xmm3 = xmm1[1,1,3,3]
; AVX-NEXT: vmulss %xmm3, %xmm0, %xmm0
; AVX-NEXT: vshufpd {{.*#+}} xmm3 = xmm1[1,0]
; AVX-NEXT: vmulss %xmm3, %xmm0, %xmm0
; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmulss %xmm2, %xmm0, %xmm0
; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm2[1,0]
; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm2[3,3,3,3]
; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX-NEXT: vextractf128 $1, %ymm2, %xmm1
; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX-NEXT: vmulss %xmm2, %xmm0, %xmm0
; AVX-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX-NEXT: vmulss %xmm2, %xmm0, %xmm0
; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v16f32:
; AVX512: # %bb.0:
; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX512-NEXT: vmulss %xmm2, %xmm0, %xmm0
; AVX512-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX512-NEXT: vmulss %xmm2, %xmm0, %xmm0
; AVX512-NEXT: vshufps {{.*#+}} xmm2 = xmm1[3,3,3,3]
; AVX512-NEXT: vmulss %xmm2, %xmm0, %xmm0
; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX512-NEXT: vmulss %xmm2, %xmm0, %xmm0
; AVX512-NEXT: vmovshdup {{.*#+}} xmm3 = xmm2[1,1,3,3]
; AVX512-NEXT: vmulss %xmm3, %xmm0, %xmm0
; AVX512-NEXT: vshufpd {{.*#+}} xmm3 = xmm2[1,0]
; AVX512-NEXT: vmulss %xmm3, %xmm0, %xmm0
; AVX512-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,3,3,3]
; AVX512-NEXT: vmulss %xmm2, %xmm0, %xmm0
; AVX512-NEXT: vextractf32x4 $2, %zmm1, %xmm2
; AVX512-NEXT: vmulss %xmm2, %xmm0, %xmm0
; AVX512-NEXT: vmovshdup {{.*#+}} xmm3 = xmm2[1,1,3,3]
; AVX512-NEXT: vmulss %xmm3, %xmm0, %xmm0
; AVX512-NEXT: vshufpd {{.*#+}} xmm3 = xmm2[1,0]
; AVX512-NEXT: vmulss %xmm3, %xmm0, %xmm0
; AVX512-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,3,3,3]
; AVX512-NEXT: vmulss %xmm2, %xmm0, %xmm0
; AVX512-NEXT: vextractf32x4 $3, %zmm1, %xmm1
; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX512-NEXT: vmulss %xmm2, %xmm0, %xmm0
; AVX512-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX512-NEXT: vmulss %xmm2, %xmm0, %xmm0
; AVX512-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = call float @llvm.vector.reduce.fmul.f32.v16f32(float %a0, <16 x float> %a1)
  ret float %1
}

;
; vXf32 (one)
;

define float @test_v2f32_one(<2 x float> %a0) {
; SSE2-LABEL: test_v2f32_one:
; SSE2: # %bb.0:
; SSE2-NEXT: movaps %xmm0, %xmm1
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1]
; SSE2-NEXT: mulss %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v2f32_one:
; SSE41: # %bb.0:
; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE41-NEXT: mulss %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: test_v2f32_one:
; AVX: # %bb.0:
; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v2f32_one:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
  %1 = call float @llvm.vector.reduce.fmul.f32.v2f32(float 1.0, <2 x float> %a0)
  ret float %1
}

define float @test_v4f32_one(<4 x float> %a0) {
; SSE2-LABEL: test_v4f32_one:
; SSE2: # %bb.0:
; SSE2-NEXT: movaps %xmm0, %xmm1
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1]
; SSE2-NEXT: mulss %xmm0, %xmm1
; SSE2-NEXT: movaps %xmm0, %xmm2
; SSE2-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
; SSE2-NEXT: mulss %xmm1, %xmm2
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE2-NEXT: mulss %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v4f32_one:
; SSE41: # %bb.0:
; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE41-NEXT: mulss %xmm0, %xmm1
; SSE41-NEXT: movaps %xmm0, %xmm2
; SSE41-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
; SSE41-NEXT: mulss %xmm1, %xmm2
; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE41-NEXT: mulss %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: test_v4f32_one:
; AVX: # %bb.0:
; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm1
; AVX-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX-NEXT: vmulss %xmm2, %xmm1, %xmm1
; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX-NEXT: vmulss %xmm0, %xmm1, %xmm0
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v4f32_one:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm1
; AVX512-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX512-NEXT: vmulss %xmm0, %xmm1, %xmm0
; AVX512-NEXT: retq
  %1 = call float @llvm.vector.reduce.fmul.f32.v4f32(float 1.0, <4 x float> %a0)
  ret float %1
}

define float @test_v8f32_one(<8 x float> %a0) {
; SSE2-LABEL: test_v8f32_one:
; SSE2: # %bb.0:
; SSE2-NEXT: movaps %xmm0, %xmm2
; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm0[1,1]
; SSE2-NEXT: mulss %xmm0, %xmm2
; SSE2-NEXT: movaps %xmm0, %xmm3
; SSE2-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm0[1]
; SSE2-NEXT: mulss %xmm2, %xmm3
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE2-NEXT: mulss %xmm3, %xmm0
; SSE2-NEXT: mulss %xmm1, %xmm0
; SSE2-NEXT: movaps %xmm1, %xmm2
; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm1[1,1]
; SSE2-NEXT: mulss %xmm2, %xmm0
; SSE2-NEXT: movaps %xmm1, %xmm2
; SSE2-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
; SSE2-NEXT: mulss %xmm2, %xmm0
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; SSE2-NEXT: mulss %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v8f32_one:
; SSE41: # %bb.0:
; SSE41-NEXT: movshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSE41-NEXT: mulss %xmm0, %xmm2
; SSE41-NEXT: movaps %xmm0, %xmm3
; SSE41-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm0[1]
; SSE41-NEXT: mulss %xmm2, %xmm3
; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE41-NEXT: mulss %xmm3, %xmm0
; SSE41-NEXT: mulss %xmm1, %xmm0
; SSE41-NEXT: movshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; SSE41-NEXT: mulss %xmm2, %xmm0
; SSE41-NEXT: movaps %xmm1, %xmm2
; SSE41-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
; SSE41-NEXT: mulss %xmm2, %xmm0
; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; SSE41-NEXT: mulss %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: test_v8f32_one:
; AVX: # %bb.0:
; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm1
; AVX-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX-NEXT: vmulss %xmm2, %xmm1, %xmm1
; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm0[3,3,3,3]
; AVX-NEXT: vmulss %xmm2, %xmm1, %xmm1
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX-NEXT: vmulss %xmm0, %xmm1, %xmm1
; AVX-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
; AVX-NEXT: vmulss %xmm2, %xmm1, %xmm1
; AVX-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX-NEXT: vmulss %xmm2, %xmm1, %xmm1
; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX-NEXT: vmulss %xmm0, %xmm1, %xmm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v8f32_one:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm1
; AVX512-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vshufps {{.*#+}} xmm2 = xmm0[3,3,3,3]
; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX512-NEXT: vmulss %xmm0, %xmm1, %xmm1
; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX512-NEXT: vmulss %xmm0, %xmm1, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = call float @llvm.vector.reduce.fmul.f32.v8f32(float 1.0, <8 x float> %a0)
  ret float %1
}

define float @test_v16f32_one(<16 x float> %a0) {
; SSE2-LABEL: test_v16f32_one:
; SSE2: # %bb.0:
; SSE2-NEXT: movaps %xmm0, %xmm4
; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm0[1,1]
; SSE2-NEXT: mulss %xmm0, %xmm4
; SSE2-NEXT: movaps %xmm0, %xmm5
; SSE2-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm0[1]
; SSE2-NEXT: mulss %xmm4, %xmm5
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE2-NEXT: mulss %xmm5, %xmm0
; SSE2-NEXT: mulss %xmm1, %xmm0
; SSE2-NEXT: movaps %xmm1, %xmm4
; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm1[1,1]
; SSE2-NEXT: mulss %xmm4, %xmm0
; SSE2-NEXT: movaps %xmm1, %xmm4
; SSE2-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm1[1]
; SSE2-NEXT: mulss %xmm4, %xmm0
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; SSE2-NEXT: mulss %xmm1, %xmm0
; SSE2-NEXT: mulss %xmm2, %xmm0
; SSE2-NEXT: movaps %xmm2, %xmm1
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm2[1,1]
; SSE2-NEXT: mulss %xmm1, %xmm0
; SSE2-NEXT: movaps %xmm2, %xmm1
; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1]
; SSE2-NEXT: mulss %xmm1, %xmm0
; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3,3,3]
; SSE2-NEXT: mulss %xmm2, %xmm0
; SSE2-NEXT: mulss %xmm3, %xmm0
; SSE2-NEXT: movaps %xmm3, %xmm1
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm3[1,1]
; SSE2-NEXT: mulss %xmm1, %xmm0
; SSE2-NEXT: movaps %xmm3, %xmm1
; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1]
; SSE2-NEXT: mulss %xmm1, %xmm0
; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3,3,3]
; SSE2-NEXT: mulss %xmm3, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v16f32_one:
; SSE41: # %bb.0:
; SSE41-NEXT: movshdup {{.*#+}} xmm4 = xmm0[1,1,3,3]
; SSE41-NEXT: mulss %xmm0, %xmm4
; SSE41-NEXT: movaps %xmm0, %xmm5
; SSE41-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm0[1]
; SSE41-NEXT: mulss %xmm4, %xmm5
; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE41-NEXT: mulss %xmm5, %xmm0
; SSE41-NEXT: mulss %xmm1, %xmm0
; SSE41-NEXT: movshdup {{.*#+}} xmm4 = xmm1[1,1,3,3]
; SSE41-NEXT: mulss %xmm4, %xmm0
; SSE41-NEXT: movaps %xmm1, %xmm4
; SSE41-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm1[1]
; SSE41-NEXT: mulss %xmm4, %xmm0
; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; SSE41-NEXT: mulss %xmm1, %xmm0
; SSE41-NEXT: mulss %xmm2, %xmm0
; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
; SSE41-NEXT: mulss %xmm1, %xmm0
; SSE41-NEXT: movaps %xmm2, %xmm1
; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1]
; SSE41-NEXT: mulss %xmm1, %xmm0
; SSE41-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3,3,3]
; SSE41-NEXT: mulss %xmm2, %xmm0
; SSE41-NEXT: mulss %xmm3, %xmm0
; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm3[1,1,3,3]
; SSE41-NEXT: mulss %xmm1, %xmm0
; SSE41-NEXT: movaps %xmm3, %xmm1
; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1]
; SSE41-NEXT: mulss %xmm1, %xmm0
; SSE41-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3,3,3]
; SSE41-NEXT: mulss %xmm3, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: test_v16f32_one:
; AVX: # %bb.0:
; AVX-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
; AVX-NEXT: vmulss %xmm2, %xmm0, %xmm2
; AVX-NEXT: vshufpd {{.*#+}} xmm3 = xmm0[1,0]
; AVX-NEXT: vmulss %xmm3, %xmm2, %xmm2
; AVX-NEXT: vshufps {{.*#+}} xmm3 = xmm0[3,3,3,3]
; AVX-NEXT: vmulss %xmm3, %xmm2, %xmm2
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX-NEXT: vmulss %xmm0, %xmm2, %xmm2
; AVX-NEXT: vmovshdup {{.*#+}} xmm3 = xmm0[1,1,3,3]
; AVX-NEXT: vmulss %xmm3, %xmm2, %xmm2
; AVX-NEXT: vshufpd {{.*#+}} xmm3 = xmm0[1,0]
; AVX-NEXT: vmulss %xmm3, %xmm2, %xmm2
; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX-NEXT: vmulss %xmm0, %xmm2, %xmm0
; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX-NEXT: vmulss %xmm2, %xmm0, %xmm0
; AVX-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX-NEXT: vmulss %xmm2, %xmm0, %xmm0
; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm1[3,3,3,3]
; AVX-NEXT: vmulss %xmm2, %xmm0, %xmm0
; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX-NEXT: vmulss %xmm2, %xmm0, %xmm0
; AVX-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX-NEXT: vmulss %xmm2, %xmm0, %xmm0
; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v16f32_one:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm1
; AVX512-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vshufps {{.*#+}} xmm2 = xmm0[3,3,3,3]
; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vmovshdup {{.*#+}} xmm3 = xmm2[1,1,3,3]
; AVX512-NEXT: vmulss %xmm3, %xmm1, %xmm1
; AVX512-NEXT: vshufpd {{.*#+}} xmm3 = xmm2[1,0]
; AVX512-NEXT: vmulss %xmm3, %xmm1, %xmm1
; AVX512-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,3,3,3]
; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vextractf32x4 $2, %zmm0, %xmm2
; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vmovshdup {{.*#+}} xmm3 = xmm2[1,1,3,3]
; AVX512-NEXT: vmulss %xmm3, %xmm1, %xmm1
; AVX512-NEXT: vshufpd {{.*#+}} xmm3 = xmm2[1,0]
; AVX512-NEXT: vmulss %xmm3, %xmm1, %xmm1
; AVX512-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,3,3,3]
; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vextractf32x4 $3, %zmm0, %xmm0
; AVX512-NEXT: vmulss %xmm0, %xmm1, %xmm1
; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX512-NEXT: vmulss %xmm0, %xmm1, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = call float @llvm.vector.reduce.fmul.f32.v16f32(float 1.0, <16 x float> %a0)
  ret float %1
}

;
; vXf32 (undef)
;

define float @test_v2f32_undef(<2 x float> %a0) {
; SSE2-LABEL: test_v2f32_undef:
; SSE2: # %bb.0:
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
; SSE2-NEXT: mulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v2f32_undef:
; SSE41: # %bb.0:
; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSE41-NEXT: mulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: test_v2f32_undef:
; AVX: # %bb.0:
; AVX-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; AVX-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v2f32_undef:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; AVX512-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: retq
  %1 = call float @llvm.vector.reduce.fmul.f32.v2f32(float undef, <2 x float> %a0)
  ret float %1
}

define float @test_v4f32_undef(<4 x float> %a0) {
; SSE2-LABEL: test_v4f32_undef:
; SSE2: # %bb.0:
; SSE2-NEXT: movaps %xmm0, %xmm1
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1]
; SSE2-NEXT: mulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE2-NEXT: movaps %xmm0, %xmm2
; SSE2-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
; SSE2-NEXT: mulss %xmm1, %xmm2
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE2-NEXT: mulss %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v4f32_undef:
; SSE41: # %bb.0:
; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE41-NEXT: mulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE41-NEXT: movaps %xmm0, %xmm2
; SSE41-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
; SSE41-NEXT: mulss %xmm1, %xmm2
; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE41-NEXT: mulss %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: test_v4f32_undef:
; AVX: # %bb.0:
; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX-NEXT: vmulss %xmm2, %xmm1, %xmm1
; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX-NEXT: vmulss %xmm0, %xmm1, %xmm0
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v4f32_undef:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX512-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX512-NEXT: vmulss %xmm0, %xmm1, %xmm0
; AVX512-NEXT: retq
  %1 = call float @llvm.vector.reduce.fmul.f32.v4f32(float undef, <4 x float> %a0)
  ret float %1
}

define float @test_v8f32_undef(<8 x float> %a0) {
; SSE2-LABEL: test_v8f32_undef:
; SSE2: # %bb.0:
; SSE2-NEXT: movaps %xmm0, %xmm2
; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm0[1,1]
; SSE2-NEXT: mulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE2-NEXT: movaps %xmm0, %xmm3
; SSE2-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm0[1]
; SSE2-NEXT: mulss %xmm2, %xmm3
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE2-NEXT: mulss %xmm3, %xmm0
; SSE2-NEXT: mulss %xmm1, %xmm0
; SSE2-NEXT: movaps %xmm1, %xmm2
; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm1[1,1]
; SSE2-NEXT: mulss %xmm2, %xmm0
; SSE2-NEXT: movaps %xmm1, %xmm2
; SSE2-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
; SSE2-NEXT: mulss %xmm2, %xmm0
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; SSE2-NEXT: mulss %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v8f32_undef:
; SSE41: # %bb.0:
; SSE41-NEXT: movshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSE41-NEXT: mulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE41-NEXT: movaps %xmm0, %xmm3
; SSE41-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm0[1]
; SSE41-NEXT: mulss %xmm2, %xmm3
; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE41-NEXT: mulss %xmm3, %xmm0
; SSE41-NEXT: mulss %xmm1, %xmm0
; SSE41-NEXT: movshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; SSE41-NEXT: mulss %xmm2, %xmm0
; SSE41-NEXT: movaps %xmm1, %xmm2
; SSE41-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
; SSE41-NEXT: mulss %xmm2, %xmm0
; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; SSE41-NEXT: mulss %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: test_v8f32_undef:
; AVX: # %bb.0:
; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX-NEXT: vmulss %xmm2, %xmm1, %xmm1
; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm0[3,3,3,3]
; AVX-NEXT: vmulss %xmm2, %xmm1, %xmm1
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX-NEXT: vmulss %xmm0, %xmm1, %xmm1
; AVX-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
; AVX-NEXT: vmulss %xmm2, %xmm1, %xmm1
; AVX-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX-NEXT: vmulss %xmm2, %xmm1, %xmm1
; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX-NEXT: vmulss %xmm0, %xmm1, %xmm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v8f32_undef:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX512-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vshufps {{.*#+}} xmm2 = xmm0[3,3,3,3]
; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX512-NEXT: vmulss %xmm0, %xmm1, %xmm1
; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX512-NEXT: vmulss %xmm0, %xmm1, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = call float @llvm.vector.reduce.fmul.f32.v8f32(float undef, <8 x float> %a0)
  ret float %1
}

define float @test_v16f32_undef(<16 x float> %a0) {
; SSE2-LABEL: test_v16f32_undef:
; SSE2: # %bb.0:
; SSE2-NEXT: movaps %xmm0, %xmm4
; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm0[1,1]
; SSE2-NEXT: mulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4
; SSE2-NEXT: movaps %xmm0, %xmm5
; SSE2-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm0[1]
; SSE2-NEXT: mulss %xmm4, %xmm5
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE2-NEXT: mulss %xmm5, %xmm0
; SSE2-NEXT: mulss %xmm1, %xmm0
; SSE2-NEXT: movaps %xmm1, %xmm4
; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm1[1,1]
; SSE2-NEXT: mulss %xmm4, %xmm0
; SSE2-NEXT: movaps %xmm1, %xmm4
; SSE2-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm1[1]
; SSE2-NEXT: mulss %xmm4, %xmm0
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; SSE2-NEXT: mulss %xmm1, %xmm0
; SSE2-NEXT: mulss %xmm2, %xmm0
; SSE2-NEXT: movaps %xmm2, %xmm1
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm2[1,1]
; SSE2-NEXT: mulss %xmm1, %xmm0
; SSE2-NEXT: movaps %xmm2, %xmm1
; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1]
; SSE2-NEXT: mulss %xmm1, %xmm0
; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3,3,3]
; SSE2-NEXT: mulss %xmm2, %xmm0
; SSE2-NEXT: mulss %xmm3, %xmm0
; SSE2-NEXT: movaps %xmm3, %xmm1
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm3[1,1]
; SSE2-NEXT: mulss %xmm1, %xmm0
; SSE2-NEXT: movaps %xmm3, %xmm1
; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1]
; SSE2-NEXT: mulss %xmm1, %xmm0
; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3,3,3]
; SSE2-NEXT: mulss %xmm3, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v16f32_undef:
; SSE41: # %bb.0:
; SSE41-NEXT: movshdup {{.*#+}} xmm4 = xmm0[1,1,3,3]
; SSE41-NEXT: mulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4
; SSE41-NEXT: movaps %xmm0, %xmm5
; SSE41-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm0[1]
; SSE41-NEXT: mulss %xmm4, %xmm5
; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE41-NEXT: mulss %xmm5, %xmm0
; SSE41-NEXT: mulss %xmm1, %xmm0
; SSE41-NEXT: movshdup {{.*#+}} xmm4 = xmm1[1,1,3,3]
; SSE41-NEXT: mulss %xmm4, %xmm0
; SSE41-NEXT: movaps %xmm1, %xmm4
; SSE41-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm1[1]
; SSE41-NEXT: mulss %xmm4, %xmm0
; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; SSE41-NEXT: mulss %xmm1, %xmm0
; SSE41-NEXT: mulss %xmm2, %xmm0
; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
; SSE41-NEXT: mulss %xmm1, %xmm0
; SSE41-NEXT: movaps %xmm2, %xmm1
; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1]
; SSE41-NEXT: mulss %xmm1, %xmm0
; SSE41-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3,3,3]
; SSE41-NEXT: mulss %xmm2, %xmm0
; SSE41-NEXT: mulss %xmm3, %xmm0
; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm3[1,1,3,3]
; SSE41-NEXT: mulss %xmm1, %xmm0
; SSE41-NEXT: movaps %xmm3, %xmm1
; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1]
; SSE41-NEXT: mulss %xmm1, %xmm0
; SSE41-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3,3,3]
; SSE41-NEXT: mulss %xmm3, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: test_v16f32_undef:
; AVX: # %bb.0:
; AVX-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
; AVX-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX-NEXT: vshufpd {{.*#+}} xmm3 = xmm0[1,0]
; AVX-NEXT: vmulss %xmm3, %xmm2, %xmm2
; AVX-NEXT: vshufps {{.*#+}} xmm3 = xmm0[3,3,3,3]
; AVX-NEXT: vmulss %xmm3, %xmm2, %xmm2
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX-NEXT: vmulss %xmm0, %xmm2, %xmm2
; AVX-NEXT: vmovshdup {{.*#+}} xmm3 = xmm0[1,1,3,3]
; AVX-NEXT: vmulss %xmm3, %xmm2, %xmm2
; AVX-NEXT: vshufpd {{.*#+}} xmm3 = xmm0[1,0]
; AVX-NEXT: vmulss %xmm3, %xmm2, %xmm2
; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX-NEXT: vmulss %xmm0, %xmm2, %xmm0
; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX-NEXT: vmulss %xmm2, %xmm0, %xmm0
; AVX-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX-NEXT: vmulss %xmm2, %xmm0, %xmm0
; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm1[3,3,3,3]
; AVX-NEXT: vmulss %xmm2, %xmm0, %xmm0
; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX-NEXT: vmulss %xmm2, %xmm0, %xmm0
; AVX-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX-NEXT: vmulss %xmm2, %xmm0, %xmm0
; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v16f32_undef:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX512-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vshufps {{.*#+}} xmm2 = xmm0[3,3,3,3]
; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vmovshdup {{.*#+}} xmm3 = xmm2[1,1,3,3]
; AVX512-NEXT: vmulss %xmm3, %xmm1, %xmm1
; AVX512-NEXT: vshufpd {{.*#+}} xmm3 = xmm2[1,0]
; AVX512-NEXT: vmulss %xmm3, %xmm1, %xmm1
; AVX512-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,3,3,3]
; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vextractf32x4 $2, %zmm0, %xmm2
; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vmovshdup {{.*#+}} xmm3 = xmm2[1,1,3,3]
; AVX512-NEXT: vmulss %xmm3, %xmm1, %xmm1
; AVX512-NEXT: vshufpd {{.*#+}} xmm3 = xmm2[1,0]
; AVX512-NEXT: vmulss %xmm3, %xmm1, %xmm1
; AVX512-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,3,3,3]
; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vextractf32x4 $3, %zmm0, %xmm0
; AVX512-NEXT: vmulss %xmm0, %xmm1, %xmm1
; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX512-NEXT: vmulss %xmm0, %xmm1, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = call float @llvm.vector.reduce.fmul.f32.v16f32(float undef, <16 x float> %a0)
  ret float %1
}

;
; vXf64 (accum)
;

define double @test_v2f64(double %a0, <2 x double> %a1) {
; SSE-LABEL: test_v2f64:
; SSE: # %bb.0:
; SSE-NEXT: mulsd %xmm1, %xmm0
; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1,1]
; SSE-NEXT: mulsd %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test_v2f64:
; AVX: # %bb.0:
; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v2f64:
; AVX512: # %bb.0:
; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
  %1 = call double @llvm.vector.reduce.fmul.f64.v2f64(double %a0, <2 x double> %a1)
  ret double %1
}

define double @test_v4f64(double %a0, <4 x double> %a1) {
; SSE-LABEL: test_v4f64:
; SSE: # %bb.0:
; SSE-NEXT: mulsd %xmm1, %xmm0
; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1,1]
; SSE-NEXT: mulsd %xmm1, %xmm0
; SSE-NEXT: mulsd %xmm2, %xmm0
; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1,1]
; SSE-NEXT: mulsd %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test_v4f64:
; AVX: # %bb.0:
; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX-NEXT: vmulsd %xmm2, %xmm0, %xmm0
; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v4f64:
; AVX512: # %bb.0:
; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX512-NEXT: vmulsd %xmm2, %xmm0, %xmm0
; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = call double @llvm.vector.reduce.fmul.f64.v4f64(double %a0, <4 x double> %a1)
  ret double %1
}

define double @test_v8f64(double %a0, <8 x double> %a1) {
; SSE-LABEL: test_v8f64:
; SSE: # %bb.0:
; SSE-NEXT: mulsd %xmm1, %xmm0
; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1,1]
; SSE-NEXT: mulsd %xmm1, %xmm0
; SSE-NEXT: mulsd %xmm2, %xmm0
; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1,1]
; SSE-NEXT: mulsd %xmm2, %xmm0
; SSE-NEXT: mulsd %xmm3, %xmm0
; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1,1]
; SSE-NEXT: mulsd %xmm3, %xmm0
; SSE-NEXT: mulsd %xmm4, %xmm0
; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1,1]
; SSE-NEXT: mulsd %xmm4, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test_v8f64:
; AVX: # %bb.0:
; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vshufpd {{.*#+}} xmm3 = xmm1[1,0]
; AVX-NEXT: vmulsd %xmm3, %xmm0, %xmm0
; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmulsd %xmm2, %xmm0, %xmm0
; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm2[1,0]
; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vextractf128 $1, %ymm2, %xmm1
; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v8f64:
; AVX512: # %bb.0:
; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX512-NEXT: vmulsd %xmm2, %xmm0, %xmm0
; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX512-NEXT: vmulsd %xmm2, %xmm0, %xmm0
; AVX512-NEXT: vshufpd {{.*#+}} xmm2 = xmm2[1,0]
; AVX512-NEXT: vmulsd %xmm2, %xmm0, %xmm0
; AVX512-NEXT: vextractf32x4 $2, %zmm1, %xmm2
; AVX512-NEXT: vmulsd %xmm2, %xmm0, %xmm0
; AVX512-NEXT: vshufpd {{.*#+}} xmm2 = xmm2[1,0]
; AVX512-NEXT: vmulsd %xmm2, %xmm0, %xmm0
; AVX512-NEXT: vextractf32x4 $3, %zmm1, %xmm1
; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = call double @llvm.vector.reduce.fmul.f64.v8f64(double %a0, <8 x double> %a1)
  ret double %1
}

define double @test_v16f64(double %a0, <16 x double> %a1) {
; SSE2-LABEL: test_v16f64:
; SSE2: # %bb.0:
; SSE2-NEXT: movapd {{[0-9]+}}(%rsp), %xmm8
; SSE2-NEXT: mulsd %xmm1, %xmm0
; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1,1]
; SSE2-NEXT: mulsd %xmm1, %xmm0
; SSE2-NEXT: mulsd %xmm2, %xmm0
; SSE2-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1,1]
; SSE2-NEXT: mulsd %xmm2, %xmm0
; SSE2-NEXT: mulsd %xmm3, %xmm0
; SSE2-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1,1]
; SSE2-NEXT: mulsd %xmm3, %xmm0
; SSE2-NEXT: mulsd %xmm4, %xmm0
; SSE2-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1,1]
; SSE2-NEXT: mulsd %xmm4, %xmm0
; SSE2-NEXT: mulsd %xmm5, %xmm0
; SSE2-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1,1]
; SSE2-NEXT: mulsd %xmm5, %xmm0
; SSE2-NEXT: mulsd %xmm6, %xmm0
; SSE2-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1,1]
; SSE2-NEXT: mulsd %xmm6, %xmm0
; SSE2-NEXT: mulsd %xmm7, %xmm0
; SSE2-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1,1]
; SSE2-NEXT: mulsd %xmm7, %xmm0
; SSE2-NEXT: mulsd %xmm8, %xmm0
; SSE2-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1,1]
; SSE2-NEXT: mulsd %xmm8, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v16f64:
; SSE41: # %bb.0:
; SSE41-NEXT: mulsd %xmm1, %xmm0
; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1,1]
; SSE41-NEXT: mulsd %xmm1, %xmm0
; SSE41-NEXT: mulsd %xmm2, %xmm0
; SSE41-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1,1]
; SSE41-NEXT: mulsd %xmm2, %xmm0
; SSE41-NEXT: mulsd %xmm3, %xmm0
; SSE41-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1,1]
; SSE41-NEXT: mulsd %xmm3, %xmm0
; SSE41-NEXT: mulsd %xmm4, %xmm0
; SSE41-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1,1]
; SSE41-NEXT: mulsd %xmm4, %xmm0
; SSE41-NEXT: mulsd %xmm5, %xmm0
; SSE41-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1,1]
; SSE41-NEXT: mulsd %xmm5, %xmm0
; SSE41-NEXT: mulsd %xmm6, %xmm0
; SSE41-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1,1]
; SSE41-NEXT: mulsd %xmm6, %xmm0
; SSE41-NEXT: mulsd %xmm7, %xmm0
; SSE41-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1,1]
; SSE41-NEXT: mulsd %xmm7, %xmm0
; SSE41-NEXT: mulsd {{[0-9]+}}(%rsp), %xmm0
; SSE41-NEXT: mulsd {{[0-9]+}}(%rsp), %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: test_v16f64:
; AVX: # %bb.0:
; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vshufpd {{.*#+}} xmm5 = xmm1[1,0]
; AVX-NEXT: vmulsd %xmm5, %xmm0, %xmm0
; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmulsd %xmm2, %xmm0, %xmm0
; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm2[1,0]
; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vextractf128 $1, %ymm2, %xmm1
; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmulsd %xmm3, %xmm0, %xmm0
; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm3[1,0]
; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vextractf128 $1, %ymm3, %xmm1
; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmulsd %xmm4, %xmm0, %xmm0
; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm4[1,0]
; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vextractf128 $1, %ymm4, %xmm1
; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v16f64:
; AVX512: # %bb.0:
; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vshufpd {{.*#+}} xmm3 = xmm1[1,0]
; AVX512-NEXT: vmulsd %xmm3, %xmm0, %xmm0
; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX512-NEXT: vmulsd %xmm3, %xmm0, %xmm0
; AVX512-NEXT: vshufpd {{.*#+}} xmm3 = xmm3[1,0]
; AVX512-NEXT: vmulsd %xmm3, %xmm0, %xmm0
; AVX512-NEXT: vextractf32x4 $2, %zmm1, %xmm3
; AVX512-NEXT: vmulsd %xmm3, %xmm0, %xmm0
; AVX512-NEXT: vshufpd {{.*#+}} xmm3 = xmm3[1,0]
; AVX512-NEXT: vmulsd %xmm3, %xmm0, %xmm0
; AVX512-NEXT: vextractf32x4 $3, %zmm1, %xmm1
; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmulsd %xmm2, %xmm0, %xmm0
; AVX512-NEXT: vshufpd {{.*#+}} xmm1 = xmm2[1,0]
; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vextractf128 $1, %ymm2, %xmm1
; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vextractf32x4 $2, %zmm2, %xmm1
; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vextractf32x4 $3, %zmm2, %xmm1
; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = call double @llvm.vector.reduce.fmul.f64.v16f64(double %a0, <16 x double> %a1)
  ret double %1
}

;
; vXf64 (one)
;

define double @test_v2f64_one(<2 x double> %a0) {
; SSE-LABEL: test_v2f64_one:
; SSE: # %bb.0:
; SSE-NEXT: movapd %xmm0, %xmm1
; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE-NEXT: mulsd %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test_v2f64_one:
; AVX: # %bb.0:
; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v2f64_one:
; AVX512: # %bb.0:
; AVX512-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
  %1 = call double @llvm.vector.reduce.fmul.f64.v2f64(double 1.0, <2 x double> %a0)
  ret double %1
}

define double @test_v4f64_one(<4 x double> %a0) {
; SSE-LABEL: test_v4f64_one:
; SSE: # %bb.0:
; SSE-NEXT: movapd %xmm0, %xmm2
; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
; SSE-NEXT: mulsd %xmm2, %xmm0
; SSE-NEXT: mulsd %xmm1, %xmm0
; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1,1]
; SSE-NEXT: mulsd %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test_v4f64_one:
; AVX: # %bb.0:
; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm1
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX-NEXT: vmulsd %xmm0, %xmm1, %xmm1
; AVX-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX-NEXT: vmulsd %xmm0, %xmm1, %xmm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v4f64_one:
; AVX512: # %bb.0:
; AVX512-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm1
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX512-NEXT: vmulsd %xmm0, %xmm1, %xmm1
; AVX512-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX512-NEXT: vmulsd %xmm0, %xmm1, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = call double @llvm.vector.reduce.fmul.f64.v4f64(double 1.0, <4 x double> %a0)
  ret double %1
}

define double @test_v8f64_one(<8 x double> %a0) {
; SSE-LABEL: test_v8f64_one:
; SSE: # %bb.0:
; SSE-NEXT: movapd %xmm0, %xmm4
; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm0[1]
; SSE-NEXT: mulsd %xmm4, %xmm0
; SSE-NEXT: mulsd %xmm1, %xmm0
; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1,1]
; SSE-NEXT: mulsd %xmm1, %xmm0
; SSE-NEXT: mulsd %xmm2, %xmm0
; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1,1]
; SSE-NEXT: mulsd %xmm2, %xmm0
; SSE-NEXT: mulsd %xmm3, %xmm0
; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1,1]
; SSE-NEXT: mulsd %xmm3, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test_v8f64_one:
; AVX: # %bb.0:
; AVX-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX-NEXT: vmulsd %xmm2, %xmm0, %xmm2
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX-NEXT: vmulsd %xmm0, %xmm2, %xmm2
; AVX-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX-NEXT: vmulsd %xmm0, %xmm2, %xmm0
; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX-NEXT: vmulsd %xmm2, %xmm0, %xmm0
; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v8f64_one:
; AVX512: # %bb.0:
; AVX512-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm1
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX512-NEXT: vmulsd %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vshufpd {{.*#+}} xmm2 = xmm2[1,0]
; AVX512-NEXT: vmulsd %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vextractf32x4 $2, %zmm0, %xmm2
; AVX512-NEXT: vmulsd %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vshufpd {{.*#+}} xmm2 = xmm2[1,0]
; AVX512-NEXT: vmulsd %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vextractf32x4 $3, %zmm0, %xmm0
; AVX512-NEXT: vmulsd %xmm0, %xmm1, %xmm1
; AVX512-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX512-NEXT: vmulsd %xmm0, %xmm1, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = call double @llvm.vector.reduce.fmul.f64.v8f64(double 1.0, <8 x double> %a0)
  ret double %1
}

define double @test_v16f64_one(<16 x double> %a0) {
; SSE-LABEL: test_v16f64_one:
; SSE: # %bb.0:
; SSE-NEXT: movapd %xmm0, %xmm8
; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm0[1]
; SSE-NEXT: mulsd %xmm8, %xmm0
; SSE-NEXT: mulsd %xmm1, %xmm0
; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1,1]
; SSE-NEXT: mulsd %xmm1, %xmm0
; SSE-NEXT: mulsd %xmm2, %xmm0
; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1,1]
; SSE-NEXT: mulsd %xmm2, %xmm0
; SSE-NEXT: mulsd %xmm3, %xmm0
; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1,1]
; SSE-NEXT: mulsd %xmm3, %xmm0
; SSE-NEXT: mulsd %xmm4, %xmm0
; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1,1]
; SSE-NEXT: mulsd %xmm4, %xmm0
; SSE-NEXT: mulsd %xmm5, %xmm0
; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1,1]
; SSE-NEXT: mulsd %xmm5, %xmm0
; SSE-NEXT: mulsd %xmm6, %xmm0
; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1,1]
; SSE-NEXT: mulsd %xmm6, %xmm0
; SSE-NEXT: mulsd %xmm7, %xmm0
; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1,1]
; SSE-NEXT: mulsd %xmm7, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test_v16f64_one:
; AVX: # %bb.0:
; AVX-NEXT: vshufpd {{.*#+}} xmm4 = xmm0[1,0]
; AVX-NEXT: vmulsd %xmm4, %xmm0, %xmm4
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX-NEXT: vmulsd %xmm0, %xmm4, %xmm4
; AVX-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX-NEXT: vmulsd %xmm0, %xmm4, %xmm0
; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vshufpd {{.*#+}} xmm4 = xmm1[1,0]
; AVX-NEXT: vmulsd %xmm4, %xmm0, %xmm0
; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmulsd %xmm2, %xmm0, %xmm0
; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm2[1,0]
; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vextractf128 $1, %ymm2, %xmm1
; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmulsd %xmm3, %xmm0, %xmm0
; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm3[1,0]
; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vextractf128 $1, %ymm3, %xmm1
; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v16f64_one:
; AVX512: # %bb.0:
; AVX512-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX512-NEXT: vmulsd %xmm2, %xmm0, %xmm2
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX512-NEXT: vmulsd %xmm3, %xmm2, %xmm2
; AVX512-NEXT: vshufpd {{.*#+}} xmm3 = xmm3[1,0]
; AVX512-NEXT: vmulsd %xmm3, %xmm2, %xmm2
; AVX512-NEXT: vextractf32x4 $2, %zmm0, %xmm3
; AVX512-NEXT: vmulsd %xmm3, %xmm2, %xmm2
; AVX512-NEXT: vshufpd {{.*#+}} xmm3 = xmm3[1,0]
; AVX512-NEXT: vmulsd %xmm3, %xmm2, %xmm2
; AVX512-NEXT: vextractf32x4 $3, %zmm0, %xmm0
; AVX512-NEXT: vmulsd %xmm0, %xmm2, %xmm2
; AVX512-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX512-NEXT: vmulsd %xmm0, %xmm2, %xmm0
; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX512-NEXT: vmulsd %xmm2, %xmm0, %xmm0
; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX512-NEXT: vmulsd %xmm2, %xmm0, %xmm0
; AVX512-NEXT: vshufpd {{.*#+}} xmm2 = xmm2[1,0]
; AVX512-NEXT: vmulsd %xmm2, %xmm0, %xmm0
; AVX512-NEXT: vextractf32x4 $2, %zmm1, %xmm2
; AVX512-NEXT: vmulsd %xmm2, %xmm0, %xmm0
; AVX512-NEXT: vshufpd {{.*#+}} xmm2 = xmm2[1,0]
; AVX512-NEXT: vmulsd %xmm2, %xmm0, %xmm0
; AVX512-NEXT: vextractf32x4 $3, %zmm1, %xmm1
; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = call double @llvm.vector.reduce.fmul.f64.v16f64(double 1.0, <16 x double> %a0)
  ret double %1
}

;
; vXf64 (undef)
;

define double @test_v2f64_undef(<2 x double> %a0) {
; SSE-LABEL: test_v2f64_undef:
; SSE: # %bb.0:
; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1]
; SSE-NEXT: mulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test_v2f64_undef:
; AVX: # %bb.0:
; AVX-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX-NEXT: vmulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v2f64_undef:
; AVX512: # %bb.0:
; AVX512-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX512-NEXT: vmulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: retq
  %1 = call double @llvm.vector.reduce.fmul.f64.v2f64(double undef, <2 x double> %a0)
  ret double %1
}

define double @test_v4f64_undef(<4 x double> %a0) {
; SSE-LABEL: test_v4f64_undef:
; SSE: # %bb.0:
; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1]
; SSE-NEXT: mulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT: mulsd %xmm1, %xmm0
; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1,1]
; SSE-NEXT: mulsd %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test_v4f64_undef:
; AVX: # %bb.0:
; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT: vmulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX-NEXT: vmulsd %xmm0, %xmm1, %xmm1
; AVX-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX-NEXT: vmulsd %xmm0, %xmm1, %xmm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v4f64_undef:
; AVX512: # %bb.0:
; AVX512-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT: vmulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX512-NEXT: vmulsd %xmm0, %xmm1, %xmm1
; AVX512-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX512-NEXT: vmulsd %xmm0, %xmm1, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = call double @llvm.vector.reduce.fmul.f64.v4f64(double undef, <4 x double> %a0)
  ret double %1
}

define double @test_v8f64_undef(<8 x double> %a0) {
; SSE-LABEL: test_v8f64_undef:
; SSE: # %bb.0:
; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1]
; SSE-NEXT: mulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT: mulsd %xmm1, %xmm0
; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1,1]
; SSE-NEXT: mulsd %xmm1, %xmm0
; SSE-NEXT: mulsd %xmm2, %xmm0
; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1,1]
; SSE-NEXT: mulsd %xmm2, %xmm0
; SSE-NEXT: mulsd %xmm3, %xmm0
; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1,1]
; SSE-NEXT: mulsd %xmm3, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test_v8f64_undef:
; AVX: # %bb.0:
; AVX-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX-NEXT: vmulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX-NEXT: vmulsd %xmm0, %xmm2, %xmm2
; AVX-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX-NEXT: vmulsd %xmm0, %xmm2, %xmm0
; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX-NEXT: vmulsd %xmm2, %xmm0, %xmm0
; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v8f64_undef:
; AVX512: # %bb.0:
; AVX512-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT: vmulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX512-NEXT: vmulsd %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vshufpd {{.*#+}} xmm2 = xmm2[1,0]
; AVX512-NEXT: vmulsd %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vextractf32x4 $2, %zmm0, %xmm2
; AVX512-NEXT: vmulsd %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vshufpd {{.*#+}} xmm2 = xmm2[1,0]
; AVX512-NEXT: vmulsd %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vextractf32x4 $3, %zmm0, %xmm0
vmulsd %xmm0, %xmm1, %xmm1 1531; AVX512-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] 1532; AVX512-NEXT: vmulsd %xmm0, %xmm1, %xmm0 1533; AVX512-NEXT: vzeroupper 1534; AVX512-NEXT: retq 1535 %1 = call double @llvm.vector.reduce.fmul.f64.v8f64(double undef, <8 x double> %a0) 1536 ret double %1 1537} 1538 1539define double @test_v16f64_undef(<16 x double> %a0) { 1540; SSE-LABEL: test_v16f64_undef: 1541; SSE: # %bb.0: 1542; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1] 1543; SSE-NEXT: mulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 1544; SSE-NEXT: mulsd %xmm1, %xmm0 1545; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1,1] 1546; SSE-NEXT: mulsd %xmm1, %xmm0 1547; SSE-NEXT: mulsd %xmm2, %xmm0 1548; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1,1] 1549; SSE-NEXT: mulsd %xmm2, %xmm0 1550; SSE-NEXT: mulsd %xmm3, %xmm0 1551; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1,1] 1552; SSE-NEXT: mulsd %xmm3, %xmm0 1553; SSE-NEXT: mulsd %xmm4, %xmm0 1554; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1,1] 1555; SSE-NEXT: mulsd %xmm4, %xmm0 1556; SSE-NEXT: mulsd %xmm5, %xmm0 1557; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1,1] 1558; SSE-NEXT: mulsd %xmm5, %xmm0 1559; SSE-NEXT: mulsd %xmm6, %xmm0 1560; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1,1] 1561; SSE-NEXT: mulsd %xmm6, %xmm0 1562; SSE-NEXT: mulsd %xmm7, %xmm0 1563; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1,1] 1564; SSE-NEXT: mulsd %xmm7, %xmm0 1565; SSE-NEXT: retq 1566; 1567; AVX-LABEL: test_v16f64_undef: 1568; AVX: # %bb.0: 1569; AVX-NEXT: vshufpd {{.*#+}} xmm4 = xmm0[1,0] 1570; AVX-NEXT: vmulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4 1571; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 1572; AVX-NEXT: vmulsd %xmm0, %xmm4, %xmm4 1573; AVX-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] 1574; AVX-NEXT: vmulsd %xmm0, %xmm4, %xmm0 1575; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0 1576; AVX-NEXT: vshufpd {{.*#+}} xmm4 = xmm1[1,0] 1577; AVX-NEXT: vmulsd %xmm4, %xmm0, %xmm0 1578; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1 1579; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0 1580; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0] 1581; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0 1582; AVX-NEXT: vmulsd %xmm2, %xmm0, %xmm0 1583; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm2[1,0] 1584; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0 1585; AVX-NEXT: vextractf128 $1, %ymm2, %xmm1 1586; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0 1587; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0] 1588; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0 1589; AVX-NEXT: vmulsd %xmm3, %xmm0, %xmm0 1590; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm3[1,0] 1591; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0 1592; AVX-NEXT: vextractf128 $1, %ymm3, %xmm1 1593; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0 1594; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0] 1595; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0 1596; AVX-NEXT: vzeroupper 1597; AVX-NEXT: retq 1598; 1599; AVX512-LABEL: test_v16f64_undef: 1600; AVX512: # %bb.0: 1601; AVX512-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1,0] 1602; AVX512-NEXT: vmulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 1603; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm3 1604; AVX512-NEXT: vmulsd %xmm3, %xmm2, %xmm2 1605; AVX512-NEXT: vshufpd {{.*#+}} xmm3 = xmm3[1,0] 1606; AVX512-NEXT: vmulsd %xmm3, %xmm2, %xmm2 1607; AVX512-NEXT: vextractf32x4 $2, %zmm0, %xmm3 1608; AVX512-NEXT: vmulsd %xmm3, %xmm2, %xmm2 1609; AVX512-NEXT: vshufpd {{.*#+}} xmm3 = xmm3[1,0] 1610; AVX512-NEXT: vmulsd %xmm3, %xmm2, %xmm2 1611; AVX512-NEXT: vextractf32x4 $3, %zmm0, %xmm0 1612; AVX512-NEXT: vmulsd %xmm0, %xmm2, %xmm2 1613; AVX512-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] 1614; AVX512-NEXT: vmulsd %xmm0, %xmm2, %xmm0 1615; AVX512-NEXT: 
vmulsd %xmm1, %xmm0, %xmm0 1616; AVX512-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] 1617; AVX512-NEXT: vmulsd %xmm2, %xmm0, %xmm0 1618; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm2 1619; AVX512-NEXT: vmulsd %xmm2, %xmm0, %xmm0 1620; AVX512-NEXT: vshufpd {{.*#+}} xmm2 = xmm2[1,0] 1621; AVX512-NEXT: vmulsd %xmm2, %xmm0, %xmm0 1622; AVX512-NEXT: vextractf32x4 $2, %zmm1, %xmm2 1623; AVX512-NEXT: vmulsd %xmm2, %xmm0, %xmm0 1624; AVX512-NEXT: vshufpd {{.*#+}} xmm2 = xmm2[1,0] 1625; AVX512-NEXT: vmulsd %xmm2, %xmm0, %xmm0 1626; AVX512-NEXT: vextractf32x4 $3, %zmm1, %xmm1 1627; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0 1628; AVX512-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0] 1629; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0 1630; AVX512-NEXT: vzeroupper 1631; AVX512-NEXT: retq 1632 %1 = call double @llvm.vector.reduce.fmul.f64.v16f64(double undef, <16 x double> %a0) 1633 ret double %1 1634} 1635 1636declare float @llvm.vector.reduce.fmul.f32.v2f32(float, <2 x float>) 1637declare float @llvm.vector.reduce.fmul.f32.v4f32(float, <4 x float>) 1638declare float @llvm.vector.reduce.fmul.f32.v8f32(float, <8 x float>) 1639declare float @llvm.vector.reduce.fmul.f32.v16f32(float, <16 x float>) 1640 1641declare double @llvm.vector.reduce.fmul.f64.v2f64(double, <2 x double>) 1642declare double @llvm.vector.reduce.fmul.f64.v4f64(double, <4 x double>) 1643declare double @llvm.vector.reduce.fmul.f64.v8f64(double, <8 x double>) 1644declare double @llvm.vector.reduce.fmul.f64.v16f64(double, <16 x double>) 1645
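
; Note: without reassociation fast-math flags, @llvm.vector.reduce.fmul is an
; ordered reduction (start * v0 * v1 * ...), which is why the checks above
; expect a serial chain of scalar mulss/mulsd instructions interleaved with
; element extracts rather than a tree of vector multiplies. In the "undef"
; accumulator variants the first scalar multiply folds its start value into a
; constant-pool memory operand ({{\.?LCPI[0-9]+_[0-9]+}}(%rip)).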