; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefixes=SSE
; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX,AVX512,AVX512F
; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX,AVX512,AVX512VL

;
; Basic matrix multiply tests based on the pattern:
;
; using matrix_ty = float __attribute__((matrix_type(2,2)));
; matrix_ty test_mul2x2(matrix_ty a0, matrix_ty a1) nounwind {
;   return a0 * a1;
; }
;

define <4 x float> @test_mul2x2_f32(<4 x float> %a0, <4 x float> %a1) nounwind {
; SSE-LABEL: test_mul2x2_f32:
; SSE: # %bb.0: # %entry
; SSE-NEXT: movaps %xmm1, %xmm2
; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; SSE-NEXT: mulps %xmm0, %xmm2
; SSE-NEXT: movaps %xmm1, %xmm3
; SSE-NEXT: unpckhps {{.*#+}} xmm3 = xmm3[2],xmm1[2],xmm3[3],xmm1[3]
; SSE-NEXT: mulps %xmm0, %xmm3
; SSE-NEXT: movaps %xmm0, %xmm4
; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm0[1]
; SSE-NEXT: movaps %xmm1, %xmm0
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[1,1]
; SSE-NEXT: mulps %xmm4, %xmm0
; SSE-NEXT: addps %xmm2, %xmm0
; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; SSE-NEXT: mulps %xmm4, %xmm1
; SSE-NEXT: addps %xmm3, %xmm1
; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE-NEXT: retq
;
; AVX1-LABEL: test_mul2x2_f32:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX1-NEXT: vmovsldup {{.*#+}} xmm3 = xmm1[0,0,2,2]
; AVX1-NEXT: vmulps %xmm3, %xmm0, %xmm3
; AVX1-NEXT: vmovshdup {{.*#+}} xmm4 = xmm1[1,1,3,3]
; AVX1-NEXT: vmulps %xmm4, %xmm2, %xmm4
; AVX1-NEXT: vaddps %xmm4, %xmm3, %xmm3
; AVX1-NEXT: vshufps {{.*#+}} xmm4 = xmm1[2,2,2,2]
; AVX1-NEXT: vmulps %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; AVX1-NEXT: vmulps %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm3[0],xmm0[0]
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_mul2x2_f32:
; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX2-NEXT: vbroadcastss %xmm1, %xmm3
; AVX2-NEXT: vmulps %xmm3, %xmm0, %xmm3
; AVX2-NEXT: vmovshdup {{.*#+}} xmm4 = xmm1[1,1,3,3]
; AVX2-NEXT: vmulps %xmm4, %xmm2, %xmm4
; AVX2-NEXT: vaddps %xmm4, %xmm3, %xmm3
; AVX2-NEXT: vshufps {{.*#+}} xmm4 = xmm1[2,2,2,2]
; AVX2-NEXT: vmulps %xmm4, %xmm0, %xmm0
; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; AVX2-NEXT: vmulps %xmm1, %xmm2, %xmm1
; AVX2-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm3[0],xmm0[0]
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_mul2x2_f32:
; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX512-NEXT: vbroadcastss %xmm1, %xmm3
; AVX512-NEXT: vmulps %xmm3, %xmm0, %xmm3
; AVX512-NEXT: vmovshdup {{.*#+}} xmm4 = xmm1[1,1,3,3]
; AVX512-NEXT: vmulps %xmm4, %xmm2, %xmm4
; AVX512-NEXT: vaddps %xmm4, %xmm3, %xmm3
; AVX512-NEXT: vshufps {{.*#+}} xmm4 = xmm1[2,2,2,2]
; AVX512-NEXT: vmulps %xmm4, %xmm0, %xmm0
; AVX512-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; AVX512-NEXT: vmulps %xmm1, %xmm2, %xmm1
; AVX512-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovlhps {{.*#+}} xmm0 = xmm3[0],xmm0[0]
; AVX512-NEXT: retq
entry:
  %split = shufflevector <4 x float> %a0, <4 x float> poison, <2 x i32> <i32 0, i32 1>
  %split1 = shufflevector <4 x float> %a0, <4 x float> poison, <2 x i32> <i32 2, i32 3>
  %splat.splat = shufflevector <4 x float> %a1, <4 x float> undef, <2 x i32> zeroinitializer
  %0 = fmul <2 x float> %split, %splat.splat
  %splat.splat6 = shufflevector <4 x float> %a1, <4 x float> undef, <2 x i32> <i32 1, i32 1>
  %1 = fmul <2 x float> %split1, %splat.splat6
  %2 = fadd <2 x float> %0, %1
  %splat.splat9 = shufflevector <4 x float> %a1, <4 x float> undef, <2 x i32> <i32 2, i32 2>
  %3 = fmul <2 x float> %split, %splat.splat9
  %splat.splat12 = shufflevector <4 x float> %a1, <4 x float> undef, <2 x i32> <i32 3, i32 3>
  %4 = fmul <2 x float> %split1, %splat.splat12
  %5 = fadd <2 x float> %3, %4
  %6 = shufflevector <2 x float> %2, <2 x float> %5, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ret <4 x float> %6
}

define <4 x double> @test_mul2x2_f64(<4 x double> %a0, <4 x double> %a1) nounwind {
; SSE-LABEL: test_mul2x2_f64:
; SSE: # %bb.0: # %entry
; SSE-NEXT: movapd %xmm2, %xmm4
; SSE-NEXT: unpcklpd {{.*#+}} xmm4 = xmm4[0],xmm2[0]
; SSE-NEXT: mulpd %xmm0, %xmm4
; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1,1]
; SSE-NEXT: mulpd %xmm1, %xmm2
; SSE-NEXT: addpd %xmm2, %xmm4
; SSE-NEXT: movapd %xmm3, %xmm2
; SSE-NEXT: unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; SSE-NEXT: mulpd %xmm0, %xmm2
; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1,1]
; SSE-NEXT: mulpd %xmm3, %xmm1
; SSE-NEXT: addpd %xmm2, %xmm1
; SSE-NEXT: movapd %xmm4, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test_mul2x2_f64:
; AVX: # %bb.0: # %entry
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX-NEXT: vmovddup {{.*#+}} xmm3 = xmm1[0,0]
; AVX-NEXT: vmulpd %xmm3, %xmm0, %xmm3
; AVX-NEXT: vshufpd {{.*#+}} xmm4 = xmm1[1,1]
; AVX-NEXT: vmulpd %xmm4, %xmm2, %xmm4
; AVX-NEXT: vaddpd %xmm4, %xmm3, %xmm3
; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX-NEXT: vmovddup {{.*#+}} xmm4 = xmm1[0,0]
; AVX-NEXT: vmulpd %xmm4, %xmm0, %xmm0
; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,1]
; AVX-NEXT: vmulpd %xmm1, %xmm2, %xmm1
; AVX-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0
; AVX-NEXT: retq
entry:
  %split = shufflevector <4 x double> %a0, <4 x double> poison, <2 x i32> <i32 0, i32 1>
  %split1 = shufflevector <4 x double> %a0, <4 x double> poison, <2 x i32> <i32 2, i32 3>
  %splat.splat = shufflevector <4 x double> %a1, <4 x double> undef, <2 x i32> zeroinitializer
  %0 = fmul <2 x double> %split, %splat.splat
  %splat.splat6 = shufflevector <4 x double> %a1, <4 x double> undef, <2 x i32> <i32 1, i32 1>
  %1 = fmul <2 x double> %split1, %splat.splat6
  %2 = fadd <2 x double> %0, %1
  %splat.splat9 = shufflevector <4 x double> %a1, <4 x double> undef, <2 x i32> <i32 2, i32 2>
  %3 = fmul <2 x double> %split, %splat.splat9
  %splat.splat12 = shufflevector <4 x double> %a1, <4 x double> undef, <2 x i32> <i32 3, i32 3>
  %4 = fmul <2 x double> %split1, %splat.splat12
  %5 = fadd <2 x double> %3, %4
  %6 = shufflevector <2 x double> %2, <2 x double> %5, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ret <4 x double> %6
}

define <9 x float> @test_mul3x3_f32(<9 x float> %a0, <9 x float> %a1)
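;
; The 3x3 and larger tests, starting with this function, presumably come from
; the same C source pattern with a larger matrix_type; by analogy with the 2x2
; case above, an illustrative (not verified) source for the 3x3 float variant is:
;
; using matrix3_ty = float __attribute__((matrix_type(3,3)));
; matrix3_ty test_mul3x3(matrix3_ty a0, matrix3_ty a1) nounwind {
;   return a0 * a1;
; }
;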
nounwind { 154; SSE-LABEL: test_mul3x3_f32: 155; SSE: # %bb.0: # %entry 156; SSE-NEXT: movq %rdi, %rax 157; SSE-NEXT: movss {{.*#+}} xmm8 = mem[0],zero,zero,zero 158; SSE-NEXT: movss {{.*#+}} xmm10 = mem[0],zero,zero,zero 159; SSE-NEXT: movss {{.*#+}} xmm9 = mem[0],zero,zero,zero 160; SSE-NEXT: movss {{.*#+}} xmm11 = mem[0],zero,zero,zero 161; SSE-NEXT: movss {{.*#+}} xmm12 = mem[0],zero,zero,zero 162; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 163; SSE-NEXT: movaps %xmm2, %xmm13 164; SSE-NEXT: mulss %xmm12, %xmm13 165; SSE-NEXT: unpcklps {{.*#+}} xmm12 = xmm12[0,0,1,1] 166; SSE-NEXT: mulps %xmm0, %xmm12 167; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] 168; SSE-NEXT: movaps %xmm5, %xmm1 169; SSE-NEXT: mulss %xmm11, %xmm1 170; SSE-NEXT: unpcklps {{.*#+}} xmm11 = xmm11[0,0,1,1] 171; SSE-NEXT: mulps %xmm3, %xmm11 172; SSE-NEXT: addps %xmm12, %xmm11 173; SSE-NEXT: movss {{.*#+}} xmm4 = mem[0],zero,zero,zero 174; SSE-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1] 175; SSE-NEXT: movaps %xmm9, %xmm7 176; SSE-NEXT: mulss %xmm4, %xmm7 177; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0,0,1,1] 178; SSE-NEXT: mulps %xmm6, %xmm4 179; SSE-NEXT: addps %xmm11, %xmm4 180; SSE-NEXT: movss {{.*#+}} xmm11 = mem[0],zero,zero,zero 181; SSE-NEXT: addss %xmm13, %xmm1 182; SSE-NEXT: addss %xmm7, %xmm1 183; SSE-NEXT: movaps %xmm2, %xmm7 184; SSE-NEXT: mulss %xmm11, %xmm7 185; SSE-NEXT: unpcklps {{.*#+}} xmm11 = xmm11[0,0,1,1] 186; SSE-NEXT: mulps %xmm0, %xmm11 187; SSE-NEXT: movaps %xmm5, %xmm12 188; SSE-NEXT: mulss %xmm10, %xmm12 189; SSE-NEXT: unpcklps {{.*#+}} xmm10 = xmm10[0,0,1,1] 190; SSE-NEXT: mulps %xmm3, %xmm10 191; SSE-NEXT: addps %xmm11, %xmm10 192; SSE-NEXT: movaps %xmm9, %xmm11 193; SSE-NEXT: mulss %xmm8, %xmm11 194; SSE-NEXT: unpcklps {{.*#+}} xmm8 = xmm8[0,0,1,1] 195; SSE-NEXT: mulps %xmm6, %xmm8 196; SSE-NEXT: addps %xmm10, %xmm8 197; SSE-NEXT: addss %xmm7, %xmm12 198; SSE-NEXT: addss %xmm11, %xmm12 199; SSE-NEXT: movaps %xmm8, %xmm7 200; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm12[0] 201; SSE-NEXT: movss {{.*#+}} xmm10 = mem[0],zero,zero,zero 202; SSE-NEXT: mulss %xmm10, %xmm2 203; SSE-NEXT: unpcklps {{.*#+}} xmm10 = xmm10[0,0,1,1] 204; SSE-NEXT: mulps %xmm0, %xmm10 205; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero 206; SSE-NEXT: mulss %xmm0, %xmm5 207; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0,0,1,1] 208; SSE-NEXT: mulps %xmm3, %xmm0 209; SSE-NEXT: addps %xmm10, %xmm0 210; SSE-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero 211; SSE-NEXT: mulss %xmm3, %xmm9 212; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0,0,1,1] 213; SSE-NEXT: mulps %xmm6, %xmm3 214; SSE-NEXT: addps %xmm0, %xmm3 215; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,2],xmm3[0,1] 216; SSE-NEXT: addss %xmm2, %xmm5 217; SSE-NEXT: addss %xmm9, %xmm5 218; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm1[0] 219; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm8[2,0] 220; SSE-NEXT: movss %xmm5, 32(%rdi) 221; SSE-NEXT: movaps %xmm7, 16(%rdi) 222; SSE-NEXT: movaps %xmm4, (%rdi) 223; SSE-NEXT: retq 224; 225; AVX1-LABEL: test_mul3x3_f32: 226; AVX1: # %bb.0: # %entry 227; AVX1-NEXT: movq %rdi, %rax 228; AVX1-NEXT: vmovss {{.*#+}} xmm8 = mem[0],zero,zero,zero 229; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] 230; AVX1-NEXT: vbroadcastss {{[0-9]+}}(%rsp), %xmm1 231; AVX1-NEXT: vmulps %xmm1, %xmm0, %xmm9 232; AVX1-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[2,3] 233; AVX1-NEXT: vbroadcastss {{[0-9]+}}(%rsp), %xmm4 234; AVX1-NEXT: vmulps %xmm4, %xmm3, %xmm10 235; 
AVX1-NEXT: vaddps %xmm10, %xmm9, %xmm9 236; AVX1-NEXT: vinsertps {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[2,3] 237; AVX1-NEXT: vbroadcastss {{[0-9]+}}(%rsp), %xmm7 238; AVX1-NEXT: vmulps %xmm7, %xmm6, %xmm10 239; AVX1-NEXT: vaddps %xmm10, %xmm9, %xmm9 240; AVX1-NEXT: vmulss %xmm1, %xmm2, %xmm1 241; AVX1-NEXT: vmulss %xmm4, %xmm5, %xmm4 242; AVX1-NEXT: vaddss %xmm4, %xmm1, %xmm1 243; AVX1-NEXT: vmulss %xmm7, %xmm8, %xmm4 244; AVX1-NEXT: vaddss %xmm4, %xmm1, %xmm1 245; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm9[0,1],xmm1[0],xmm9[3] 246; AVX1-NEXT: vbroadcastss {{[0-9]+}}(%rsp), %xmm4 247; AVX1-NEXT: vmulps %xmm4, %xmm0, %xmm7 248; AVX1-NEXT: vbroadcastss {{[0-9]+}}(%rsp), %xmm9 249; AVX1-NEXT: vmulps %xmm3, %xmm9, %xmm10 250; AVX1-NEXT: vaddps %xmm7, %xmm10, %xmm7 251; AVX1-NEXT: vbroadcastss {{[0-9]+}}(%rsp), %xmm10 252; AVX1-NEXT: vmulps %xmm6, %xmm10, %xmm11 253; AVX1-NEXT: vaddps %xmm7, %xmm11, %xmm7 254; AVX1-NEXT: vmulss %xmm4, %xmm2, %xmm4 255; AVX1-NEXT: vmulss %xmm5, %xmm9, %xmm9 256; AVX1-NEXT: vaddss %xmm4, %xmm9, %xmm4 257; AVX1-NEXT: vmulss %xmm10, %xmm8, %xmm9 258; AVX1-NEXT: vaddss %xmm4, %xmm9, %xmm4 259; AVX1-NEXT: vinsertps {{.*#+}} xmm4 = xmm7[0,1],xmm4[0],xmm7[3] 260; AVX1-NEXT: vbroadcastss {{[0-9]+}}(%rsp), %xmm9 261; AVX1-NEXT: vmulps %xmm0, %xmm9, %xmm0 262; AVX1-NEXT: vbroadcastss {{[0-9]+}}(%rsp), %xmm10 263; AVX1-NEXT: vmulps %xmm3, %xmm10, %xmm3 264; AVX1-NEXT: vaddps %xmm3, %xmm0, %xmm0 265; AVX1-NEXT: vbroadcastss {{[0-9]+}}(%rsp), %xmm3 266; AVX1-NEXT: vmulps %xmm3, %xmm6, %xmm6 267; AVX1-NEXT: vaddps %xmm6, %xmm0, %xmm0 268; AVX1-NEXT: vmulss %xmm2, %xmm9, %xmm2 269; AVX1-NEXT: vmulss %xmm5, %xmm10, %xmm5 270; AVX1-NEXT: vaddss %xmm5, %xmm2, %xmm2 271; AVX1-NEXT: vmulss %xmm3, %xmm8, %xmm3 272; AVX1-NEXT: vaddss %xmm3, %xmm2, %xmm2 273; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm7[0] 274; AVX1-NEXT: vshufps {{.*#+}} xmm3 = xmm4[1,2,2,3] 275; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 276; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] 277; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 278; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] 279; AVX1-NEXT: vmovss %xmm2, 32(%rdi) 280; AVX1-NEXT: vmovaps %ymm0, (%rdi) 281; AVX1-NEXT: vzeroupper 282; AVX1-NEXT: retq 283; 284; AVX2-LABEL: test_mul3x3_f32: 285; AVX2: # %bb.0: # %entry 286; AVX2-NEXT: movq %rdi, %rax 287; AVX2-NEXT: vmovss {{.*#+}} xmm8 = mem[0],zero,zero,zero 288; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] 289; AVX2-NEXT: vbroadcastss {{[0-9]+}}(%rsp), %xmm1 290; AVX2-NEXT: vmulps %xmm1, %xmm0, %xmm9 291; AVX2-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[2,3] 292; AVX2-NEXT: vbroadcastss {{[0-9]+}}(%rsp), %xmm4 293; AVX2-NEXT: vmulps %xmm4, %xmm3, %xmm10 294; AVX2-NEXT: vaddps %xmm10, %xmm9, %xmm9 295; AVX2-NEXT: vinsertps {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[2,3] 296; AVX2-NEXT: vbroadcastss {{[0-9]+}}(%rsp), %xmm7 297; AVX2-NEXT: vmulps %xmm7, %xmm6, %xmm10 298; AVX2-NEXT: vaddps %xmm10, %xmm9, %xmm9 299; AVX2-NEXT: vmulss %xmm1, %xmm2, %xmm1 300; AVX2-NEXT: vmulss %xmm4, %xmm5, %xmm4 301; AVX2-NEXT: vaddss %xmm4, %xmm1, %xmm1 302; AVX2-NEXT: vmulss %xmm7, %xmm8, %xmm4 303; AVX2-NEXT: vaddss %xmm4, %xmm1, %xmm1 304; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm9[0,1],xmm1[0],xmm9[3] 305; AVX2-NEXT: vbroadcastss {{[0-9]+}}(%rsp), %xmm4 306; AVX2-NEXT: vmulps %xmm4, %xmm0, %xmm7 307; AVX2-NEXT: vbroadcastss {{[0-9]+}}(%rsp), %xmm9 308; AVX2-NEXT: vmulps %xmm3, %xmm9, %xmm10 309; AVX2-NEXT: vaddps %xmm7, %xmm10, %xmm7 310; AVX2-NEXT: vbroadcastss 
{{[0-9]+}}(%rsp), %xmm10 311; AVX2-NEXT: vmulps %xmm6, %xmm10, %xmm11 312; AVX2-NEXT: vaddps %xmm7, %xmm11, %xmm7 313; AVX2-NEXT: vmulss %xmm4, %xmm2, %xmm4 314; AVX2-NEXT: vmulss %xmm5, %xmm9, %xmm9 315; AVX2-NEXT: vaddss %xmm4, %xmm9, %xmm4 316; AVX2-NEXT: vmulss %xmm10, %xmm8, %xmm9 317; AVX2-NEXT: vaddss %xmm4, %xmm9, %xmm4 318; AVX2-NEXT: vinsertps {{.*#+}} xmm4 = xmm7[0,1],xmm4[0],xmm7[3] 319; AVX2-NEXT: vbroadcastss {{[0-9]+}}(%rsp), %xmm7 320; AVX2-NEXT: vmulps %xmm7, %xmm0, %xmm0 321; AVX2-NEXT: vbroadcastss {{[0-9]+}}(%rsp), %xmm9 322; AVX2-NEXT: vmulps %xmm3, %xmm9, %xmm3 323; AVX2-NEXT: vaddps %xmm3, %xmm0, %xmm0 324; AVX2-NEXT: vbroadcastss {{[0-9]+}}(%rsp), %xmm3 325; AVX2-NEXT: vmulps %xmm3, %xmm6, %xmm6 326; AVX2-NEXT: vaddps %xmm6, %xmm0, %xmm0 327; AVX2-NEXT: vmulss %xmm7, %xmm2, %xmm2 328; AVX2-NEXT: vmulss %xmm5, %xmm9, %xmm5 329; AVX2-NEXT: vaddss %xmm5, %xmm2, %xmm2 330; AVX2-NEXT: vmulss %xmm3, %xmm8, %xmm3 331; AVX2-NEXT: vaddss %xmm3, %xmm2, %xmm2 332; AVX2-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1 333; AVX2-NEXT: vmovaps {{.*#+}} ymm3 = [0,1,2,4,5,6,u,u] 334; AVX2-NEXT: vpermps %ymm1, %ymm3, %ymm1 335; AVX2-NEXT: vbroadcastsd %xmm0, %ymm0 336; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] 337; AVX2-NEXT: vmovss %xmm2, 32(%rdi) 338; AVX2-NEXT: vmovaps %ymm0, (%rdi) 339; AVX2-NEXT: vzeroupper 340; AVX2-NEXT: retq 341; 342; AVX512F-LABEL: test_mul3x3_f32: 343; AVX512F: # %bb.0: # %entry 344; AVX512F-NEXT: valignd {{.*#+}} zmm2 = zmm0[3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,2] 345; AVX512F-NEXT: vbroadcastss %xmm1, %xmm3 346; AVX512F-NEXT: vmulps %xmm3, %xmm0, %xmm3 347; AVX512F-NEXT: vextractf128 $1, %ymm0, %xmm5 348; AVX512F-NEXT: vmovshdup {{.*#+}} xmm6 = xmm1[1,1,3,3] 349; AVX512F-NEXT: vmulps %xmm6, %xmm2, %xmm4 350; AVX512F-NEXT: vaddps %xmm4, %xmm3, %xmm4 351; AVX512F-NEXT: vshufpd {{.*#+}} xmm3 = xmm5[1,0] 352; AVX512F-NEXT: vshufps {{.*#+}} xmm7 = xmm1[3,3,3,3] 353; AVX512F-NEXT: vshufpd {{.*#+}} xmm8 = xmm1[1,0] 354; AVX512F-NEXT: vshufps {{.*#+}} xmm9 = xmm1[2,2,2,2] 355; AVX512F-NEXT: vmulps %xmm3, %xmm9, %xmm9 356; AVX512F-NEXT: vaddps %xmm4, %xmm9, %xmm9 357; AVX512F-NEXT: vshufpd {{.*#+}} xmm4 = xmm0[1,0] 358; AVX512F-NEXT: vmulss %xmm1, %xmm4, %xmm10 359; AVX512F-NEXT: vmovshdup {{.*#+}} xmm5 = xmm5[1,1,3,3] 360; AVX512F-NEXT: vmulss %xmm6, %xmm5, %xmm6 361; AVX512F-NEXT: vaddss %xmm6, %xmm10, %xmm6 362; AVX512F-NEXT: vextractf32x4 $2, %zmm0, %xmm10 363; AVX512F-NEXT: vmulss %xmm8, %xmm10, %xmm8 364; AVX512F-NEXT: vaddss %xmm6, %xmm8, %xmm6 365; AVX512F-NEXT: vinsertps {{.*#+}} xmm6 = xmm9[0,1],xmm6[0],xmm9[3] 366; AVX512F-NEXT: vmulps %xmm7, %xmm0, %xmm8 367; AVX512F-NEXT: vextractf128 $1, %ymm1, %xmm9 368; AVX512F-NEXT: vmovsldup {{.*#+}} xmm11 = xmm9[0,0,2,2] 369; AVX512F-NEXT: vmulps %xmm2, %xmm11, %xmm11 370; AVX512F-NEXT: vaddps %xmm11, %xmm8, %xmm8 371; AVX512F-NEXT: vmovshdup {{.*#+}} xmm11 = xmm9[1,1,3,3] 372; AVX512F-NEXT: vmulps %xmm3, %xmm11, %xmm12 373; AVX512F-NEXT: vaddps %xmm12, %xmm8, %xmm8 374; AVX512F-NEXT: vmulss %xmm7, %xmm4, %xmm7 375; AVX512F-NEXT: vmulss %xmm5, %xmm9, %xmm12 376; AVX512F-NEXT: vaddss %xmm7, %xmm12, %xmm7 377; AVX512F-NEXT: vmulss %xmm11, %xmm10, %xmm11 378; AVX512F-NEXT: vaddss %xmm7, %xmm11, %xmm7 379; AVX512F-NEXT: vinsertps {{.*#+}} xmm7 = xmm8[0,1],xmm7[0],xmm8[3] 380; AVX512F-NEXT: vshufps {{.*#+}} xmm8 = xmm9[3,3,3,3] 381; AVX512F-NEXT: vshufpd {{.*#+}} xmm11 = xmm9[1,0] 382; AVX512F-NEXT: vshufps {{.*#+}} xmm9 = xmm9[2,2,2,2] 383; AVX512F-NEXT: vmulps %xmm0, %xmm9, %xmm0 384; AVX512F-NEXT: 
vmulps %xmm2, %xmm8, %xmm2 385; AVX512F-NEXT: vaddps %xmm2, %xmm0, %xmm0 386; AVX512F-NEXT: vextractf32x4 $2, %zmm1, %xmm1 387; AVX512F-NEXT: vbroadcastss %xmm1, %xmm2 388; AVX512F-NEXT: vmulps %xmm2, %xmm3, %xmm2 389; AVX512F-NEXT: vaddps %xmm2, %xmm0, %xmm0 390; AVX512F-NEXT: vmulss %xmm4, %xmm11, %xmm2 391; AVX512F-NEXT: vmulss %xmm5, %xmm8, %xmm3 392; AVX512F-NEXT: vaddss %xmm3, %xmm2, %xmm2 393; AVX512F-NEXT: vmulss %xmm1, %xmm10, %xmm1 394; AVX512F-NEXT: vaddss %xmm1, %xmm2, %xmm1 395; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm0[0,1],xmm1[0],xmm0[3] 396; AVX512F-NEXT: vinsertf32x4 $1, %xmm7, %zmm6, %zmm2 397; AVX512F-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,1,2,4,5,6,16,17,18,0,0,0,0,0,0,0] 398; AVX512F-NEXT: vpermi2ps %zmm1, %zmm2, %zmm0 399; AVX512F-NEXT: retq 400; 401; AVX512VL-LABEL: test_mul3x3_f32: 402; AVX512VL: # %bb.0: # %entry 403; AVX512VL-NEXT: valignd {{.*#+}} zmm2 = zmm0[3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,2] 404; AVX512VL-NEXT: vbroadcastss %xmm1, %xmm3 405; AVX512VL-NEXT: vmulps %xmm3, %xmm0, %xmm3 406; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm4 407; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm5 = xmm1[1,1,3,3] 408; AVX512VL-NEXT: vmulps %xmm5, %xmm2, %xmm6 409; AVX512VL-NEXT: vaddps %xmm6, %xmm3, %xmm3 410; AVX512VL-NEXT: vshufpd {{.*#+}} xmm6 = xmm4[1,0] 411; AVX512VL-NEXT: vshufps {{.*#+}} xmm7 = xmm1[3,3,3,3] 412; AVX512VL-NEXT: vshufpd {{.*#+}} xmm8 = xmm1[1,0] 413; AVX512VL-NEXT: vshufps {{.*#+}} xmm9 = xmm1[2,2,2,2] 414; AVX512VL-NEXT: vmulps %xmm6, %xmm9, %xmm9 415; AVX512VL-NEXT: vaddps %xmm3, %xmm9, %xmm3 416; AVX512VL-NEXT: vshufpd {{.*#+}} xmm9 = xmm0[1,0] 417; AVX512VL-NEXT: vmulss %xmm1, %xmm9, %xmm10 418; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm4 = xmm4[1,1,3,3] 419; AVX512VL-NEXT: vmulss %xmm5, %xmm4, %xmm5 420; AVX512VL-NEXT: vaddss %xmm5, %xmm10, %xmm5 421; AVX512VL-NEXT: vextractf32x4 $2, %zmm0, %xmm10 422; AVX512VL-NEXT: vmulss %xmm8, %xmm10, %xmm8 423; AVX512VL-NEXT: vaddss %xmm5, %xmm8, %xmm5 424; AVX512VL-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1],xmm5[0],xmm3[3] 425; AVX512VL-NEXT: vmulps %xmm7, %xmm0, %xmm5 426; AVX512VL-NEXT: vextractf128 $1, %ymm1, %xmm8 427; AVX512VL-NEXT: vmovsldup {{.*#+}} xmm11 = xmm8[0,0,2,2] 428; AVX512VL-NEXT: vmulps %xmm2, %xmm11, %xmm11 429; AVX512VL-NEXT: vaddps %xmm5, %xmm11, %xmm5 430; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm11 = xmm8[1,1,3,3] 431; AVX512VL-NEXT: vmulps %xmm6, %xmm11, %xmm12 432; AVX512VL-NEXT: vaddps %xmm5, %xmm12, %xmm5 433; AVX512VL-NEXT: vmulss %xmm7, %xmm9, %xmm7 434; AVX512VL-NEXT: vmulss %xmm4, %xmm8, %xmm12 435; AVX512VL-NEXT: vaddss %xmm7, %xmm12, %xmm7 436; AVX512VL-NEXT: vmulss %xmm11, %xmm10, %xmm11 437; AVX512VL-NEXT: vaddss %xmm7, %xmm11, %xmm7 438; AVX512VL-NEXT: vinsertps {{.*#+}} xmm5 = xmm5[0,1],xmm7[0],xmm5[3] 439; AVX512VL-NEXT: vshufps {{.*#+}} xmm7 = xmm8[3,3,3,3] 440; AVX512VL-NEXT: vshufpd {{.*#+}} xmm11 = xmm8[1,0] 441; AVX512VL-NEXT: vshufps {{.*#+}} xmm8 = xmm8[2,2,2,2] 442; AVX512VL-NEXT: vmulps %xmm0, %xmm8, %xmm0 443; AVX512VL-NEXT: vmulps %xmm7, %xmm2, %xmm2 444; AVX512VL-NEXT: vaddps %xmm2, %xmm0, %xmm0 445; AVX512VL-NEXT: vextractf32x4 $2, %zmm1, %xmm1 446; AVX512VL-NEXT: vbroadcastss %xmm1, %xmm2 447; AVX512VL-NEXT: vmulps %xmm2, %xmm6, %xmm2 448; AVX512VL-NEXT: vaddps %xmm2, %xmm0, %xmm0 449; AVX512VL-NEXT: vmulss %xmm11, %xmm9, %xmm2 450; AVX512VL-NEXT: vmulss %xmm7, %xmm4, %xmm4 451; AVX512VL-NEXT: vaddss %xmm4, %xmm2, %xmm2 452; AVX512VL-NEXT: vmulss %xmm1, %xmm10, %xmm1 453; AVX512VL-NEXT: vaddss %xmm1, %xmm2, %xmm1 454; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = 
xmm0[0,1],xmm1[0],xmm0[3] 455; AVX512VL-NEXT: vinsertf32x4 $1, %xmm5, %zmm3, %zmm2 456; AVX512VL-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,1,2,4,5,6,16,17,18,0,0,0,0,0,0,0] 457; AVX512VL-NEXT: vpermi2ps %zmm1, %zmm2, %zmm0 458; AVX512VL-NEXT: retq 459entry: 460 %block = shufflevector <9 x float> %a0, <9 x float> poison, <2 x i32> <i32 0, i32 1> 461 %splat.splat = shufflevector <9 x float> %a1, <9 x float> undef, <2 x i32> zeroinitializer 462 %0 = fmul <2 x float> %block, %splat.splat 463 %block6 = shufflevector <9 x float> %a0, <9 x float> poison, <2 x i32> <i32 3, i32 4> 464 %splat.splat8 = shufflevector <9 x float> %a1, <9 x float> undef, <2 x i32> <i32 1, i32 1> 465 %1 = fmul <2 x float> %block6, %splat.splat8 466 %2 = fadd <2 x float> %0, %1 467 %block9 = shufflevector <9 x float> %a0, <9 x float> poison, <2 x i32> <i32 6, i32 7> 468 %splat.splat11 = shufflevector <9 x float> %a1, <9 x float> undef, <2 x i32> <i32 2, i32 2> 469 %3 = fmul <2 x float> %block9, %splat.splat11 470 %4 = fadd <2 x float> %2, %3 471 %5 = shufflevector <2 x float> %4, <2 x float> poison, <3 x i32> <i32 0, i32 1, i32 undef> 472 %block12 = shufflevector <9 x float> %a0, <9 x float> poison, <1 x i32> <i32 2> 473 %splat.splatinsert13 = shufflevector <9 x float> %a1, <9 x float> undef, <1 x i32> zeroinitializer 474 %6 = fmul <1 x float> %block12, %splat.splatinsert13 475 %block15 = shufflevector <9 x float> %a0, <9 x float> poison, <1 x i32> <i32 5> 476 %splat.splatinsert16 = shufflevector <9 x float> %a1, <9 x float> undef, <1 x i32> <i32 1> 477 %7 = fmul <1 x float> %block15, %splat.splatinsert16 478 %8 = fadd <1 x float> %6, %7 479 %block18 = shufflevector <9 x float> %a0, <9 x float> poison, <1 x i32> <i32 8> 480 %splat.splatinsert19 = shufflevector <9 x float> %a1, <9 x float> undef, <1 x i32> <i32 2> 481 %9 = fmul <1 x float> %block18, %splat.splatinsert19 482 %10 = fadd <1 x float> %8, %9 483 %11 = shufflevector <1 x float> %10, <1 x float> poison, <3 x i32> <i32 0, i32 undef, i32 undef> 484 %12 = shufflevector <3 x float> %5, <3 x float> %11, <3 x i32> <i32 0, i32 1, i32 3> 485 %splat.splat23 = shufflevector <9 x float> %a1, <9 x float> undef, <2 x i32> <i32 3, i32 3> 486 %13 = fmul <2 x float> %block, %splat.splat23 487 %splat.splat26 = shufflevector <9 x float> %a1, <9 x float> undef, <2 x i32> <i32 4, i32 4> 488 %14 = fmul <2 x float> %block6, %splat.splat26 489 %15 = fadd <2 x float> %13, %14 490 %splat.splat29 = shufflevector <9 x float> %a1, <9 x float> undef, <2 x i32> <i32 5, i32 5> 491 %16 = fmul <2 x float> %block9, %splat.splat29 492 %17 = fadd <2 x float> %15, %16 493 %18 = shufflevector <2 x float> %17, <2 x float> poison, <3 x i32> <i32 0, i32 1, i32 undef> 494 %splat.splatinsert31 = shufflevector <9 x float> %a1, <9 x float> undef, <1 x i32> <i32 3> 495 %19 = fmul <1 x float> %block12, %splat.splatinsert31 496 %splat.splatinsert34 = shufflevector <9 x float> %a1, <9 x float> undef, <1 x i32> <i32 4> 497 %20 = fmul <1 x float> %block15, %splat.splatinsert34 498 %21 = fadd <1 x float> %19, %20 499 %splat.splatinsert37 = shufflevector <9 x float> %a1, <9 x float> undef, <1 x i32> <i32 5> 500 %22 = fmul <1 x float> %block18, %splat.splatinsert37 501 %23 = fadd <1 x float> %21, %22 502 %24 = shufflevector <1 x float> %23, <1 x float> poison, <3 x i32> <i32 0, i32 undef, i32 undef> 503 %25 = shufflevector <3 x float> %18, <3 x float> %24, <3 x i32> <i32 0, i32 1, i32 3> 504 %splat.splat41 = shufflevector <9 x float> %a1, <9 x float> undef, <2 x i32> <i32 6, i32 6> 505 %26 = fmul <2 x float> %block, 
%splat.splat41 506 %splat.splat44 = shufflevector <9 x float> %a1, <9 x float> undef, <2 x i32> <i32 7, i32 7> 507 %27 = fmul <2 x float> %block6, %splat.splat44 508 %28 = fadd <2 x float> %26, %27 509 %splat.splat47 = shufflevector <9 x float> %a1, <9 x float> undef, <2 x i32> <i32 8, i32 8> 510 %29 = fmul <2 x float> %block9, %splat.splat47 511 %30 = fadd <2 x float> %28, %29 512 %31 = shufflevector <2 x float> %30, <2 x float> poison, <3 x i32> <i32 0, i32 1, i32 undef> 513 %splat.splatinsert49 = shufflevector <9 x float> %a1, <9 x float> undef, <1 x i32> <i32 6> 514 %32 = fmul <1 x float> %block12, %splat.splatinsert49 515 %splat.splatinsert52 = shufflevector <9 x float> %a1, <9 x float> undef, <1 x i32> <i32 7> 516 %33 = fmul <1 x float> %block15, %splat.splatinsert52 517 %34 = fadd <1 x float> %32, %33 518 %35 = fmul <9 x float> %a0, %a1 519 %36 = shufflevector <9 x float> %35, <9 x float> poison, <1 x i32> <i32 8> 520 %37 = fadd <1 x float> %34, %36 521 %38 = shufflevector <1 x float> %37, <1 x float> poison, <3 x i32> <i32 0, i32 undef, i32 undef> 522 %39 = shufflevector <3 x float> %31, <3 x float> %38, <3 x i32> <i32 0, i32 1, i32 3> 523 %40 = shufflevector <3 x float> %12, <3 x float> %25, <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5> 524 %41 = shufflevector <3 x float> %39, <3 x float> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 undef, i32 undef, i32 undef> 525 %42 = shufflevector <6 x float> %40, <6 x float> %41, <9 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8> 526 ret <9 x float> %42 527} 528 529define <9 x double> @test_mul3x3_f64(<9 x double> %a0, <9 x double> %a1) nounwind { 530; SSE-LABEL: test_mul3x3_f64: 531; SSE: # %bb.0: # %entry 532; SSE-NEXT: movq %rdi, %rax 533; SSE-NEXT: movsd {{.*#+}} xmm8 = mem[0],zero 534; SSE-NEXT: movsd {{.*#+}} xmm10 = mem[0],zero 535; SSE-NEXT: movsd {{.*#+}} xmm9 = mem[0],zero 536; SSE-NEXT: movsd {{.*#+}} xmm11 = mem[0],zero 537; SSE-NEXT: movsd {{.*#+}} xmm12 = mem[0],zero 538; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] 539; SSE-NEXT: movapd %xmm2, %xmm13 540; SSE-NEXT: mulsd %xmm12, %xmm13 541; SSE-NEXT: unpcklpd {{.*#+}} xmm12 = xmm12[0,0] 542; SSE-NEXT: mulpd %xmm0, %xmm12 543; SSE-NEXT: unpcklpd {{.*#+}} xmm3 = xmm3[0],xmm4[0] 544; SSE-NEXT: movapd %xmm5, %xmm1 545; SSE-NEXT: mulsd %xmm11, %xmm1 546; SSE-NEXT: unpcklpd {{.*#+}} xmm11 = xmm11[0,0] 547; SSE-NEXT: mulpd %xmm3, %xmm11 548; SSE-NEXT: addpd %xmm12, %xmm11 549; SSE-NEXT: movsd {{.*#+}} xmm4 = mem[0],zero 550; SSE-NEXT: unpcklpd {{.*#+}} xmm6 = xmm6[0],xmm7[0] 551; SSE-NEXT: movapd %xmm9, %xmm7 552; SSE-NEXT: mulsd %xmm4, %xmm7 553; SSE-NEXT: unpcklpd {{.*#+}} xmm4 = xmm4[0,0] 554; SSE-NEXT: mulpd %xmm6, %xmm4 555; SSE-NEXT: addpd %xmm11, %xmm4 556; SSE-NEXT: movsd {{.*#+}} xmm11 = mem[0],zero 557; SSE-NEXT: addsd %xmm13, %xmm1 558; SSE-NEXT: addsd %xmm7, %xmm1 559; SSE-NEXT: movapd %xmm2, %xmm12 560; SSE-NEXT: mulsd %xmm11, %xmm12 561; SSE-NEXT: unpcklpd {{.*#+}} xmm11 = xmm11[0,0] 562; SSE-NEXT: mulpd %xmm0, %xmm11 563; SSE-NEXT: movapd %xmm5, %xmm7 564; SSE-NEXT: mulsd %xmm10, %xmm7 565; SSE-NEXT: unpcklpd {{.*#+}} xmm10 = xmm10[0,0] 566; SSE-NEXT: mulpd %xmm3, %xmm10 567; SSE-NEXT: addpd %xmm11, %xmm10 568; SSE-NEXT: movapd %xmm9, %xmm11 569; SSE-NEXT: mulsd %xmm8, %xmm11 570; SSE-NEXT: unpcklpd {{.*#+}} xmm8 = xmm8[0,0] 571; SSE-NEXT: mulpd %xmm6, %xmm8 572; SSE-NEXT: addpd %xmm10, %xmm8 573; SSE-NEXT: addsd %xmm12, %xmm7 574; SSE-NEXT: addsd %xmm11, %xmm7 575; SSE-NEXT: movsd {{.*#+}} xmm10 = mem[0],zero 576; SSE-NEXT: mulsd 
%xmm10, %xmm2 577; SSE-NEXT: unpcklpd {{.*#+}} xmm10 = xmm10[0,0] 578; SSE-NEXT: mulpd %xmm0, %xmm10 579; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero 580; SSE-NEXT: mulsd %xmm0, %xmm5 581; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0,0] 582; SSE-NEXT: mulpd %xmm3, %xmm0 583; SSE-NEXT: addpd %xmm10, %xmm0 584; SSE-NEXT: movsd {{.*#+}} xmm3 = mem[0],zero 585; SSE-NEXT: mulsd %xmm3, %xmm9 586; SSE-NEXT: unpcklpd {{.*#+}} xmm3 = xmm3[0,0] 587; SSE-NEXT: mulpd %xmm6, %xmm3 588; SSE-NEXT: addpd %xmm0, %xmm3 589; SSE-NEXT: addsd %xmm2, %xmm5 590; SSE-NEXT: addsd %xmm9, %xmm5 591; SSE-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm8[0] 592; SSE-NEXT: shufpd {{.*#+}} xmm8 = xmm8[1],xmm7[0] 593; SSE-NEXT: movsd %xmm5, 64(%rdi) 594; SSE-NEXT: movapd %xmm3, 48(%rdi) 595; SSE-NEXT: movapd %xmm4, (%rdi) 596; SSE-NEXT: movapd %xmm8, 32(%rdi) 597; SSE-NEXT: movapd %xmm1, 16(%rdi) 598; SSE-NEXT: retq 599; 600; AVX1-LABEL: test_mul3x3_f64: 601; AVX1: # %bb.0: # %entry 602; AVX1-NEXT: movq %rdi, %rax 603; AVX1-NEXT: vmovsd {{.*#+}} xmm8 = mem[0],zero 604; AVX1-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm0[0],xmm1[0] 605; AVX1-NEXT: vmovddup {{.*#+}} xmm9 = mem[0,0] 606; AVX1-NEXT: vmulpd %xmm1, %xmm9, %xmm0 607; AVX1-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm3[0],xmm4[0] 608; AVX1-NEXT: vmovddup {{.*#+}} xmm4 = mem[0,0] 609; AVX1-NEXT: vmulpd %xmm4, %xmm3, %xmm10 610; AVX1-NEXT: vaddpd %xmm0, %xmm10, %xmm0 611; AVX1-NEXT: vunpcklpd {{.*#+}} xmm6 = xmm6[0],xmm7[0] 612; AVX1-NEXT: vmovddup {{.*#+}} xmm7 = mem[0,0] 613; AVX1-NEXT: vmulpd %xmm7, %xmm6, %xmm10 614; AVX1-NEXT: vaddpd %xmm0, %xmm10, %xmm0 615; AVX1-NEXT: vmulsd %xmm2, %xmm9, %xmm9 616; AVX1-NEXT: vmulsd %xmm4, %xmm5, %xmm4 617; AVX1-NEXT: vaddsd %xmm4, %xmm9, %xmm4 618; AVX1-NEXT: vmulsd %xmm7, %xmm8, %xmm7 619; AVX1-NEXT: vaddsd %xmm7, %xmm4, %xmm4 620; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 621; AVX1-NEXT: vmovddup {{.*#+}} xmm7 = mem[0,0] 622; AVX1-NEXT: vmulpd %xmm7, %xmm1, %xmm9 623; AVX1-NEXT: vmovddup {{.*#+}} xmm10 = mem[0,0] 624; AVX1-NEXT: vmulpd %xmm3, %xmm10, %xmm11 625; AVX1-NEXT: vaddpd %xmm11, %xmm9, %xmm9 626; AVX1-NEXT: vmovddup {{.*#+}} xmm11 = mem[0,0] 627; AVX1-NEXT: vmulpd %xmm6, %xmm11, %xmm12 628; AVX1-NEXT: vaddpd %xmm12, %xmm9, %xmm9 629; AVX1-NEXT: vmulsd %xmm7, %xmm2, %xmm7 630; AVX1-NEXT: vmulsd %xmm5, %xmm10, %xmm10 631; AVX1-NEXT: vaddsd %xmm7, %xmm10, %xmm7 632; AVX1-NEXT: vmulsd %xmm11, %xmm8, %xmm10 633; AVX1-NEXT: vaddsd %xmm7, %xmm10, %xmm7 634; AVX1-NEXT: vmovddup {{.*#+}} xmm10 = mem[0,0] 635; AVX1-NEXT: vmulpd %xmm1, %xmm10, %xmm1 636; AVX1-NEXT: vmovddup {{.*#+}} xmm11 = mem[0,0] 637; AVX1-NEXT: vmulpd %xmm3, %xmm11, %xmm3 638; AVX1-NEXT: vaddpd %xmm3, %xmm1, %xmm1 639; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0] 640; AVX1-NEXT: vmulpd %xmm3, %xmm6, %xmm6 641; AVX1-NEXT: vaddpd %xmm6, %xmm1, %xmm1 642; AVX1-NEXT: vmulsd %xmm2, %xmm10, %xmm2 643; AVX1-NEXT: vmulsd %xmm5, %xmm11, %xmm5 644; AVX1-NEXT: vaddsd %xmm5, %xmm2, %xmm2 645; AVX1-NEXT: vmulsd %xmm3, %xmm8, %xmm3 646; AVX1-NEXT: vaddsd %xmm3, %xmm2, %xmm2 647; AVX1-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm0 648; AVX1-NEXT: vshufpd {{.*#+}} ymm0 = ymm4[0],ymm0[1],ymm4[2],ymm0[2] 649; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm7, %ymm3 650; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm9, %ymm1 651; AVX1-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[1],ymm3[0],ymm1[2],ymm3[3] 652; AVX1-NEXT: vmovsd %xmm2, 64(%rdi) 653; AVX1-NEXT: vmovapd %ymm1, 32(%rdi) 654; AVX1-NEXT: vmovapd %ymm0, (%rdi) 655; AVX1-NEXT: vzeroupper 656; AVX1-NEXT: retq 657; 658; AVX2-LABEL: test_mul3x3_f64: 659; AVX2: # %bb.0: 
# %entry 660; AVX2-NEXT: movq %rdi, %rax 661; AVX2-NEXT: vmovsd {{.*#+}} xmm8 = mem[0],zero 662; AVX2-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm0[0],xmm1[0] 663; AVX2-NEXT: vmovddup {{.*#+}} xmm9 = mem[0,0] 664; AVX2-NEXT: vmulpd %xmm1, %xmm9, %xmm0 665; AVX2-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm3[0],xmm4[0] 666; AVX2-NEXT: vmovddup {{.*#+}} xmm4 = mem[0,0] 667; AVX2-NEXT: vmulpd %xmm4, %xmm3, %xmm10 668; AVX2-NEXT: vaddpd %xmm0, %xmm10, %xmm0 669; AVX2-NEXT: vunpcklpd {{.*#+}} xmm6 = xmm6[0],xmm7[0] 670; AVX2-NEXT: vmovddup {{.*#+}} xmm7 = mem[0,0] 671; AVX2-NEXT: vmulpd %xmm7, %xmm6, %xmm10 672; AVX2-NEXT: vaddpd %xmm0, %xmm10, %xmm0 673; AVX2-NEXT: vmulsd %xmm2, %xmm9, %xmm9 674; AVX2-NEXT: vmulsd %xmm4, %xmm5, %xmm4 675; AVX2-NEXT: vaddsd %xmm4, %xmm9, %xmm4 676; AVX2-NEXT: vmulsd %xmm7, %xmm8, %xmm7 677; AVX2-NEXT: vaddsd %xmm7, %xmm4, %xmm4 678; AVX2-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 679; AVX2-NEXT: vmovddup {{.*#+}} xmm7 = mem[0,0] 680; AVX2-NEXT: vmulpd %xmm7, %xmm1, %xmm9 681; AVX2-NEXT: vmovddup {{.*#+}} xmm10 = mem[0,0] 682; AVX2-NEXT: vmulpd %xmm3, %xmm10, %xmm11 683; AVX2-NEXT: vaddpd %xmm11, %xmm9, %xmm9 684; AVX2-NEXT: vmovddup {{.*#+}} xmm11 = mem[0,0] 685; AVX2-NEXT: vmulpd %xmm6, %xmm11, %xmm12 686; AVX2-NEXT: vaddpd %xmm12, %xmm9, %xmm9 687; AVX2-NEXT: vmulsd %xmm7, %xmm2, %xmm7 688; AVX2-NEXT: vmulsd %xmm5, %xmm10, %xmm10 689; AVX2-NEXT: vaddsd %xmm7, %xmm10, %xmm7 690; AVX2-NEXT: vmulsd %xmm11, %xmm8, %xmm10 691; AVX2-NEXT: vaddsd %xmm7, %xmm10, %xmm7 692; AVX2-NEXT: vmovddup {{.*#+}} xmm10 = mem[0,0] 693; AVX2-NEXT: vmulpd %xmm1, %xmm10, %xmm1 694; AVX2-NEXT: vmovddup {{.*#+}} xmm11 = mem[0,0] 695; AVX2-NEXT: vmulpd %xmm3, %xmm11, %xmm3 696; AVX2-NEXT: vaddpd %xmm3, %xmm1, %xmm1 697; AVX2-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0] 698; AVX2-NEXT: vmulpd %xmm3, %xmm6, %xmm6 699; AVX2-NEXT: vaddpd %xmm6, %xmm1, %xmm1 700; AVX2-NEXT: vmulsd %xmm2, %xmm10, %xmm2 701; AVX2-NEXT: vmulsd %xmm5, %xmm11, %xmm5 702; AVX2-NEXT: vaddsd %xmm5, %xmm2, %xmm2 703; AVX2-NEXT: vmulsd %xmm3, %xmm8, %xmm3 704; AVX2-NEXT: vaddsd %xmm3, %xmm2, %xmm2 705; AVX2-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm0 706; AVX2-NEXT: vshufpd {{.*#+}} ymm0 = ymm4[0],ymm0[1],ymm4[2],ymm0[2] 707; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm7, %ymm3 708; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm9, %ymm1 709; AVX2-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[1],ymm3[0],ymm1[2],ymm3[3] 710; AVX2-NEXT: vmovsd %xmm2, 64(%rdi) 711; AVX2-NEXT: vmovapd %ymm1, 32(%rdi) 712; AVX2-NEXT: vmovapd %ymm0, (%rdi) 713; AVX2-NEXT: vzeroupper 714; AVX2-NEXT: retq 715; 716; AVX512F-LABEL: test_mul3x3_f64: 717; AVX512F: # %bb.0: # %entry 718; AVX512F-NEXT: movq %rdi, %rax 719; AVX512F-NEXT: vmovsd {{.*#+}} xmm8 = mem[0],zero 720; AVX512F-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] 721; AVX512F-NEXT: vmovddup {{.*#+}} xmm9 = mem[0,0] 722; AVX512F-NEXT: vmulpd %xmm0, %xmm9, %xmm10 723; AVX512F-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm3[0],xmm4[0] 724; AVX512F-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0] 725; AVX512F-NEXT: vmulpd %xmm3, %xmm1, %xmm4 726; AVX512F-NEXT: vaddpd %xmm4, %xmm10, %xmm4 727; AVX512F-NEXT: vunpcklpd {{.*#+}} xmm6 = xmm6[0],xmm7[0] 728; AVX512F-NEXT: vmovddup {{.*#+}} xmm7 = mem[0,0] 729; AVX512F-NEXT: vmulpd %xmm7, %xmm6, %xmm10 730; AVX512F-NEXT: vaddpd %xmm4, %xmm10, %xmm4 731; AVX512F-NEXT: vmulsd %xmm2, %xmm9, %xmm9 732; AVX512F-NEXT: vmulsd %xmm3, %xmm5, %xmm3 733; AVX512F-NEXT: vaddsd %xmm3, %xmm9, %xmm3 734; AVX512F-NEXT: vmulsd %xmm7, %xmm8, %xmm7 735; AVX512F-NEXT: vaddsd %xmm7, %xmm3, %xmm3 736; AVX512F-NEXT: vinsertf128 $1, 
%xmm3, %ymm4, %ymm3 737; AVX512F-NEXT: vmovddup {{.*#+}} xmm4 = mem[0,0] 738; AVX512F-NEXT: vmulpd %xmm4, %xmm0, %xmm7 739; AVX512F-NEXT: vmovddup {{.*#+}} xmm9 = mem[0,0] 740; AVX512F-NEXT: vmulpd %xmm1, %xmm9, %xmm10 741; AVX512F-NEXT: vaddpd %xmm7, %xmm10, %xmm7 742; AVX512F-NEXT: vmovddup {{.*#+}} xmm10 = mem[0,0] 743; AVX512F-NEXT: vmulpd %xmm6, %xmm10, %xmm11 744; AVX512F-NEXT: vaddpd %xmm7, %xmm11, %xmm7 745; AVX512F-NEXT: vmulsd %xmm4, %xmm2, %xmm4 746; AVX512F-NEXT: vmulsd %xmm5, %xmm9, %xmm9 747; AVX512F-NEXT: vaddsd %xmm4, %xmm9, %xmm4 748; AVX512F-NEXT: vmulsd %xmm10, %xmm8, %xmm9 749; AVX512F-NEXT: vaddsd %xmm4, %xmm9, %xmm4 750; AVX512F-NEXT: vinsertf128 $1, %xmm4, %ymm7, %ymm4 751; AVX512F-NEXT: vmovddup {{.*#+}} xmm7 = mem[0,0] 752; AVX512F-NEXT: vmulpd %xmm7, %xmm0, %xmm0 753; AVX512F-NEXT: vmovddup {{.*#+}} xmm9 = mem[0,0] 754; AVX512F-NEXT: vmulpd %xmm1, %xmm9, %xmm1 755; AVX512F-NEXT: vaddpd %xmm1, %xmm0, %xmm0 756; AVX512F-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] 757; AVX512F-NEXT: vmulpd %xmm1, %xmm6, %xmm6 758; AVX512F-NEXT: vaddpd %xmm6, %xmm0, %xmm0 759; AVX512F-NEXT: vmulsd %xmm7, %xmm2, %xmm2 760; AVX512F-NEXT: vmulsd %xmm5, %xmm9, %xmm5 761; AVX512F-NEXT: vaddsd %xmm5, %xmm2, %xmm2 762; AVX512F-NEXT: vmulsd %xmm1, %xmm8, %xmm1 763; AVX512F-NEXT: vaddsd %xmm1, %xmm2, %xmm1 764; AVX512F-NEXT: vinsertf64x4 $1, %ymm4, %zmm3, %zmm2 765; AVX512F-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,1,2,4,5,6,8,9] 766; AVX512F-NEXT: vpermi2pd %zmm0, %zmm2, %zmm3 767; AVX512F-NEXT: vmovsd %xmm1, 64(%rdi) 768; AVX512F-NEXT: vmovapd %zmm3, (%rdi) 769; AVX512F-NEXT: vzeroupper 770; AVX512F-NEXT: retq 771; 772; AVX512VL-LABEL: test_mul3x3_f64: 773; AVX512VL: # %bb.0: # %entry 774; AVX512VL-NEXT: movq %rdi, %rax 775; AVX512VL-NEXT: vmovsd {{.*#+}} xmm8 = mem[0],zero 776; AVX512VL-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] 777; AVX512VL-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] 778; AVX512VL-NEXT: vmulpd %xmm1, %xmm0, %xmm9 779; AVX512VL-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm3[0],xmm4[0] 780; AVX512VL-NEXT: vmovddup {{.*#+}} xmm4 = mem[0,0] 781; AVX512VL-NEXT: vmulpd %xmm4, %xmm3, %xmm10 782; AVX512VL-NEXT: vaddpd %xmm10, %xmm9, %xmm9 783; AVX512VL-NEXT: vunpcklpd {{.*#+}} xmm6 = xmm6[0],xmm7[0] 784; AVX512VL-NEXT: vmovddup {{.*#+}} xmm7 = mem[0,0] 785; AVX512VL-NEXT: vmulpd %xmm7, %xmm6, %xmm10 786; AVX512VL-NEXT: vaddpd %xmm10, %xmm9, %xmm9 787; AVX512VL-NEXT: vmulsd %xmm1, %xmm2, %xmm1 788; AVX512VL-NEXT: vmulsd %xmm4, %xmm5, %xmm4 789; AVX512VL-NEXT: vaddsd %xmm4, %xmm1, %xmm1 790; AVX512VL-NEXT: vmulsd %xmm7, %xmm8, %xmm4 791; AVX512VL-NEXT: vaddsd %xmm4, %xmm1, %xmm1 792; AVX512VL-NEXT: vinsertf128 $1, %xmm1, %ymm9, %ymm1 793; AVX512VL-NEXT: vmovddup {{.*#+}} xmm4 = mem[0,0] 794; AVX512VL-NEXT: vmulpd %xmm4, %xmm0, %xmm7 795; AVX512VL-NEXT: vmovddup {{.*#+}} xmm9 = mem[0,0] 796; AVX512VL-NEXT: vmulpd %xmm3, %xmm9, %xmm10 797; AVX512VL-NEXT: vaddpd %xmm7, %xmm10, %xmm7 798; AVX512VL-NEXT: vmovddup {{.*#+}} xmm10 = mem[0,0] 799; AVX512VL-NEXT: vmulpd %xmm6, %xmm10, %xmm11 800; AVX512VL-NEXT: vaddpd %xmm7, %xmm11, %xmm7 801; AVX512VL-NEXT: vmulsd %xmm4, %xmm2, %xmm4 802; AVX512VL-NEXT: vmulsd %xmm5, %xmm9, %xmm9 803; AVX512VL-NEXT: vaddsd %xmm4, %xmm9, %xmm4 804; AVX512VL-NEXT: vmulsd %xmm10, %xmm8, %xmm9 805; AVX512VL-NEXT: vaddsd %xmm4, %xmm9, %xmm4 806; AVX512VL-NEXT: vinsertf128 $1, %xmm4, %ymm7, %ymm4 807; AVX512VL-NEXT: vmovddup {{.*#+}} xmm7 = mem[0,0] 808; AVX512VL-NEXT: vmulpd %xmm7, %xmm0, %xmm0 809; AVX512VL-NEXT: vmovddup {{.*#+}} xmm9 = mem[0,0] 810; AVX512VL-NEXT: vmulpd %xmm3, 
%xmm9, %xmm3 811; AVX512VL-NEXT: vaddpd %xmm3, %xmm0, %xmm0 812; AVX512VL-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0] 813; AVX512VL-NEXT: vmulpd %xmm3, %xmm6, %xmm6 814; AVX512VL-NEXT: vaddpd %xmm6, %xmm0, %xmm0 815; AVX512VL-NEXT: vmulsd %xmm7, %xmm2, %xmm2 816; AVX512VL-NEXT: vmulsd %xmm5, %xmm9, %xmm5 817; AVX512VL-NEXT: vaddsd %xmm5, %xmm2, %xmm2 818; AVX512VL-NEXT: vmulsd %xmm3, %xmm8, %xmm3 819; AVX512VL-NEXT: vaddsd %xmm3, %xmm2, %xmm2 820; AVX512VL-NEXT: vinsertf64x4 $1, %ymm4, %zmm1, %zmm1 821; AVX512VL-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,1,2,4,5,6,8,9] 822; AVX512VL-NEXT: vpermi2pd %zmm0, %zmm1, %zmm3 823; AVX512VL-NEXT: vmovsd %xmm2, 64(%rdi) 824; AVX512VL-NEXT: vmovapd %zmm3, (%rdi) 825; AVX512VL-NEXT: vzeroupper 826; AVX512VL-NEXT: retq 827entry: 828 %block = shufflevector <9 x double> %a0, <9 x double> poison, <2 x i32> <i32 0, i32 1> 829 %splat.splat = shufflevector <9 x double> %a1, <9 x double> undef, <2 x i32> zeroinitializer 830 %0 = fmul <2 x double> %block, %splat.splat 831 %block6 = shufflevector <9 x double> %a0, <9 x double> poison, <2 x i32> <i32 3, i32 4> 832 %splat.splat8 = shufflevector <9 x double> %a1, <9 x double> undef, <2 x i32> <i32 1, i32 1> 833 %1 = fmul <2 x double> %block6, %splat.splat8 834 %2 = fadd <2 x double> %0, %1 835 %block9 = shufflevector <9 x double> %a0, <9 x double> poison, <2 x i32> <i32 6, i32 7> 836 %splat.splat11 = shufflevector <9 x double> %a1, <9 x double> undef, <2 x i32> <i32 2, i32 2> 837 %3 = fmul <2 x double> %block9, %splat.splat11 838 %4 = fadd <2 x double> %2, %3 839 %5 = shufflevector <2 x double> %4, <2 x double> poison, <3 x i32> <i32 0, i32 1, i32 undef> 840 %block12 = shufflevector <9 x double> %a0, <9 x double> poison, <1 x i32> <i32 2> 841 %splat.splatinsert13 = shufflevector <9 x double> %a1, <9 x double> undef, <1 x i32> zeroinitializer 842 %6 = fmul <1 x double> %block12, %splat.splatinsert13 843 %block15 = shufflevector <9 x double> %a0, <9 x double> poison, <1 x i32> <i32 5> 844 %splat.splatinsert16 = shufflevector <9 x double> %a1, <9 x double> undef, <1 x i32> <i32 1> 845 %7 = fmul <1 x double> %block15, %splat.splatinsert16 846 %8 = fadd <1 x double> %6, %7 847 %block18 = shufflevector <9 x double> %a0, <9 x double> poison, <1 x i32> <i32 8> 848 %splat.splatinsert19 = shufflevector <9 x double> %a1, <9 x double> undef, <1 x i32> <i32 2> 849 %9 = fmul <1 x double> %block18, %splat.splatinsert19 850 %10 = fadd <1 x double> %8, %9 851 %11 = shufflevector <1 x double> %10, <1 x double> poison, <3 x i32> <i32 0, i32 undef, i32 undef> 852 %12 = shufflevector <3 x double> %5, <3 x double> %11, <3 x i32> <i32 0, i32 1, i32 3> 853 %splat.splat23 = shufflevector <9 x double> %a1, <9 x double> undef, <2 x i32> <i32 3, i32 3> 854 %13 = fmul <2 x double> %block, %splat.splat23 855 %splat.splat26 = shufflevector <9 x double> %a1, <9 x double> undef, <2 x i32> <i32 4, i32 4> 856 %14 = fmul <2 x double> %block6, %splat.splat26 857 %15 = fadd <2 x double> %13, %14 858 %splat.splat29 = shufflevector <9 x double> %a1, <9 x double> undef, <2 x i32> <i32 5, i32 5> 859 %16 = fmul <2 x double> %block9, %splat.splat29 860 %17 = fadd <2 x double> %15, %16 861 %18 = shufflevector <2 x double> %17, <2 x double> poison, <3 x i32> <i32 0, i32 1, i32 undef> 862 %splat.splatinsert31 = shufflevector <9 x double> %a1, <9 x double> undef, <1 x i32> <i32 3> 863 %19 = fmul <1 x double> %block12, %splat.splatinsert31 864 %splat.splatinsert34 = shufflevector <9 x double> %a1, <9 x double> undef, <1 x i32> <i32 4> 865 %20 = fmul <1 x double> %block15, 
%splat.splatinsert34 866 %21 = fadd <1 x double> %19, %20 867 %splat.splatinsert37 = shufflevector <9 x double> %a1, <9 x double> undef, <1 x i32> <i32 5> 868 %22 = fmul <1 x double> %block18, %splat.splatinsert37 869 %23 = fadd <1 x double> %21, %22 870 %24 = shufflevector <1 x double> %23, <1 x double> poison, <3 x i32> <i32 0, i32 undef, i32 undef> 871 %25 = shufflevector <3 x double> %18, <3 x double> %24, <3 x i32> <i32 0, i32 1, i32 3> 872 %splat.splat41 = shufflevector <9 x double> %a1, <9 x double> undef, <2 x i32> <i32 6, i32 6> 873 %26 = fmul <2 x double> %block, %splat.splat41 874 %splat.splat44 = shufflevector <9 x double> %a1, <9 x double> undef, <2 x i32> <i32 7, i32 7> 875 %27 = fmul <2 x double> %block6, %splat.splat44 876 %28 = fadd <2 x double> %26, %27 877 %splat.splat47 = shufflevector <9 x double> %a1, <9 x double> undef, <2 x i32> <i32 8, i32 8> 878 %29 = fmul <2 x double> %block9, %splat.splat47 879 %30 = fadd <2 x double> %28, %29 880 %31 = shufflevector <2 x double> %30, <2 x double> poison, <3 x i32> <i32 0, i32 1, i32 undef> 881 %splat.splatinsert49 = shufflevector <9 x double> %a1, <9 x double> undef, <1 x i32> <i32 6> 882 %32 = fmul <1 x double> %block12, %splat.splatinsert49 883 %splat.splatinsert52 = shufflevector <9 x double> %a1, <9 x double> undef, <1 x i32> <i32 7> 884 %33 = fmul <1 x double> %block15, %splat.splatinsert52 885 %34 = fadd <1 x double> %32, %33 886 %35 = fmul <9 x double> %a0, %a1 887 %36 = shufflevector <9 x double> %35, <9 x double> poison, <1 x i32> <i32 8> 888 %37 = fadd <1 x double> %34, %36 889 %38 = shufflevector <1 x double> %37, <1 x double> poison, <3 x i32> <i32 0, i32 undef, i32 undef> 890 %39 = shufflevector <3 x double> %31, <3 x double> %38, <3 x i32> <i32 0, i32 1, i32 3> 891 %40 = shufflevector <3 x double> %12, <3 x double> %25, <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5> 892 %41 = shufflevector <3 x double> %39, <3 x double> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 undef, i32 undef, i32 undef> 893 %42 = shufflevector <6 x double> %40, <6 x double> %41, <9 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8> 894 ret <9 x double> %42 895} 896 897define <16 x float> @test_mul4x4_f32(<16 x float> %a0, <16 x float> %a1) nounwind { 898; SSE-LABEL: test_mul4x4_f32: 899; SSE: # %bb.0: # %entry 900; SSE-NEXT: movaps %xmm0, %xmm9 901; SSE-NEXT: movaps %xmm4, %xmm0 902; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm4[0,0] 903; SSE-NEXT: mulps %xmm9, %xmm0 904; SSE-NEXT: movaps %xmm4, %xmm8 905; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,1],xmm4[1,1] 906; SSE-NEXT: mulps %xmm1, %xmm8 907; SSE-NEXT: addps %xmm0, %xmm8 908; SSE-NEXT: movaps %xmm4, %xmm0 909; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm4[2,2] 910; SSE-NEXT: mulps %xmm2, %xmm0 911; SSE-NEXT: addps %xmm8, %xmm0 912; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,3,3,3] 913; SSE-NEXT: mulps %xmm3, %xmm4 914; SSE-NEXT: addps %xmm4, %xmm0 915; SSE-NEXT: movaps %xmm5, %xmm4 916; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,0],xmm5[0,0] 917; SSE-NEXT: mulps %xmm9, %xmm4 918; SSE-NEXT: movaps %xmm5, %xmm10 919; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[1,1],xmm5[1,1] 920; SSE-NEXT: mulps %xmm1, %xmm10 921; SSE-NEXT: addps %xmm4, %xmm10 922; SSE-NEXT: movaps %xmm5, %xmm8 923; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,2],xmm5[2,2] 924; SSE-NEXT: mulps %xmm2, %xmm8 925; SSE-NEXT: addps %xmm10, %xmm8 926; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[3,3,3,3] 927; SSE-NEXT: mulps %xmm3, %xmm5 928; SSE-NEXT: addps %xmm5, %xmm8 929; SSE-NEXT: movaps %xmm6, %xmm4 930; SSE-NEXT: 
shufps {{.*#+}} xmm4 = xmm4[0,0],xmm6[0,0] 931; SSE-NEXT: mulps %xmm9, %xmm4 932; SSE-NEXT: movaps %xmm6, %xmm10 933; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[1,1],xmm6[1,1] 934; SSE-NEXT: mulps %xmm1, %xmm10 935; SSE-NEXT: addps %xmm4, %xmm10 936; SSE-NEXT: movaps %xmm6, %xmm5 937; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,2],xmm6[2,2] 938; SSE-NEXT: mulps %xmm2, %xmm5 939; SSE-NEXT: addps %xmm10, %xmm5 940; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[3,3,3,3] 941; SSE-NEXT: mulps %xmm3, %xmm6 942; SSE-NEXT: addps %xmm6, %xmm5 943; SSE-NEXT: movaps %xmm7, %xmm4 944; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,0],xmm7[0,0] 945; SSE-NEXT: mulps %xmm9, %xmm4 946; SSE-NEXT: movaps %xmm7, %xmm6 947; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,1],xmm7[1,1] 948; SSE-NEXT: mulps %xmm1, %xmm6 949; SSE-NEXT: addps %xmm4, %xmm6 950; SSE-NEXT: movaps %xmm7, %xmm1 951; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,2],xmm7[2,2] 952; SSE-NEXT: mulps %xmm2, %xmm1 953; SSE-NEXT: addps %xmm6, %xmm1 954; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[3,3,3,3] 955; SSE-NEXT: mulps %xmm7, %xmm3 956; SSE-NEXT: addps %xmm1, %xmm3 957; SSE-NEXT: movaps %xmm8, %xmm1 958; SSE-NEXT: movaps %xmm5, %xmm2 959; SSE-NEXT: retq 960; 961; AVX1-LABEL: test_mul4x4_f32: 962; AVX1: # %bb.0: # %entry 963; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 964; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 965; AVX1-NEXT: vshufps {{.*#+}} xmm6 = xmm2[0,0,0,0] 966; AVX1-NEXT: vmulps %xmm6, %xmm0, %xmm6 967; AVX1-NEXT: vshufps {{.*#+}} xmm7 = xmm2[1,1,1,1] 968; AVX1-NEXT: vmulps %xmm7, %xmm5, %xmm7 969; AVX1-NEXT: vaddps %xmm7, %xmm6, %xmm6 970; AVX1-NEXT: vshufps {{.*#+}} xmm7 = xmm2[2,2,2,2] 971; AVX1-NEXT: vmulps %xmm7, %xmm1, %xmm7 972; AVX1-NEXT: vaddps %xmm7, %xmm6, %xmm6 973; AVX1-NEXT: vshufps {{.*#+}} xmm7 = xmm2[3,3,3,3] 974; AVX1-NEXT: vmulps %xmm7, %xmm4, %xmm7 975; AVX1-NEXT: vaddps %xmm7, %xmm6, %xmm6 976; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2 977; AVX1-NEXT: vshufps {{.*#+}} xmm7 = xmm2[0,0,0,0] 978; AVX1-NEXT: vmulps %xmm7, %xmm0, %xmm7 979; AVX1-NEXT: vshufps {{.*#+}} xmm8 = xmm2[1,1,1,1] 980; AVX1-NEXT: vmulps %xmm5, %xmm8, %xmm8 981; AVX1-NEXT: vaddps %xmm7, %xmm8, %xmm7 982; AVX1-NEXT: vshufps {{.*#+}} xmm8 = xmm2[2,2,2,2] 983; AVX1-NEXT: vmulps %xmm1, %xmm8, %xmm8 984; AVX1-NEXT: vaddps %xmm7, %xmm8, %xmm7 985; AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,3,3,3] 986; AVX1-NEXT: vmulps %xmm2, %xmm4, %xmm2 987; AVX1-NEXT: vaddps %xmm2, %xmm7, %xmm2 988; AVX1-NEXT: vshufps {{.*#+}} xmm7 = xmm3[0,0,0,0] 989; AVX1-NEXT: vmulps %xmm7, %xmm0, %xmm7 990; AVX1-NEXT: vshufps {{.*#+}} xmm8 = xmm3[1,1,1,1] 991; AVX1-NEXT: vmulps %xmm5, %xmm8, %xmm8 992; AVX1-NEXT: vaddps %xmm7, %xmm8, %xmm7 993; AVX1-NEXT: vshufps {{.*#+}} xmm8 = xmm3[2,2,2,2] 994; AVX1-NEXT: vmulps %xmm1, %xmm8, %xmm8 995; AVX1-NEXT: vaddps %xmm7, %xmm8, %xmm7 996; AVX1-NEXT: vshufps {{.*#+}} xmm8 = xmm3[3,3,3,3] 997; AVX1-NEXT: vmulps %xmm4, %xmm8, %xmm8 998; AVX1-NEXT: vaddps %xmm7, %xmm8, %xmm7 999; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3 1000; AVX1-NEXT: vshufps {{.*#+}} xmm8 = xmm3[0,0,0,0] 1001; AVX1-NEXT: vmulps %xmm0, %xmm8, %xmm0 1002; AVX1-NEXT: vshufps {{.*#+}} xmm8 = xmm3[1,1,1,1] 1003; AVX1-NEXT: vmulps %xmm5, %xmm8, %xmm5 1004; AVX1-NEXT: vaddps %xmm5, %xmm0, %xmm0 1005; AVX1-NEXT: vshufps {{.*#+}} xmm5 = xmm3[2,2,2,2] 1006; AVX1-NEXT: vmulps %xmm5, %xmm1, %xmm1 1007; AVX1-NEXT: vaddps %xmm1, %xmm0, %xmm0 1008; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm3[3,3,3,3] 1009; AVX1-NEXT: vmulps %xmm1, %xmm4, %xmm1 1010; AVX1-NEXT: vaddps %xmm1, %xmm0, %xmm1 1011; AVX1-NEXT: vinsertf128 $1, %xmm2, 
%ymm6, %ymm0 1012; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm7, %ymm1 1013; AVX1-NEXT: retq 1014; 1015; AVX2-LABEL: test_mul4x4_f32: 1016; AVX2: # %bb.0: # %entry 1017; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm5 1018; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm4 1019; AVX2-NEXT: vbroadcastss %xmm2, %xmm6 1020; AVX2-NEXT: vmulps %xmm6, %xmm0, %xmm6 1021; AVX2-NEXT: vshufps {{.*#+}} xmm7 = xmm2[1,1,1,1] 1022; AVX2-NEXT: vmulps %xmm7, %xmm5, %xmm7 1023; AVX2-NEXT: vaddps %xmm7, %xmm6, %xmm6 1024; AVX2-NEXT: vshufps {{.*#+}} xmm7 = xmm2[2,2,2,2] 1025; AVX2-NEXT: vmulps %xmm7, %xmm1, %xmm7 1026; AVX2-NEXT: vaddps %xmm7, %xmm6, %xmm6 1027; AVX2-NEXT: vshufps {{.*#+}} xmm7 = xmm2[3,3,3,3] 1028; AVX2-NEXT: vmulps %xmm7, %xmm4, %xmm7 1029; AVX2-NEXT: vaddps %xmm7, %xmm6, %xmm6 1030; AVX2-NEXT: vextractf128 $1, %ymm2, %xmm2 1031; AVX2-NEXT: vbroadcastss %xmm2, %xmm7 1032; AVX2-NEXT: vmulps %xmm7, %xmm0, %xmm7 1033; AVX2-NEXT: vshufps {{.*#+}} xmm8 = xmm2[1,1,1,1] 1034; AVX2-NEXT: vmulps %xmm5, %xmm8, %xmm8 1035; AVX2-NEXT: vaddps %xmm7, %xmm8, %xmm7 1036; AVX2-NEXT: vshufps {{.*#+}} xmm8 = xmm2[2,2,2,2] 1037; AVX2-NEXT: vmulps %xmm1, %xmm8, %xmm8 1038; AVX2-NEXT: vaddps %xmm7, %xmm8, %xmm7 1039; AVX2-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,3,3,3] 1040; AVX2-NEXT: vmulps %xmm2, %xmm4, %xmm2 1041; AVX2-NEXT: vaddps %xmm2, %xmm7, %xmm2 1042; AVX2-NEXT: vbroadcastss %xmm3, %xmm7 1043; AVX2-NEXT: vmulps %xmm7, %xmm0, %xmm7 1044; AVX2-NEXT: vshufps {{.*#+}} xmm8 = xmm3[1,1,1,1] 1045; AVX2-NEXT: vmulps %xmm5, %xmm8, %xmm8 1046; AVX2-NEXT: vaddps %xmm7, %xmm8, %xmm7 1047; AVX2-NEXT: vshufps {{.*#+}} xmm8 = xmm3[2,2,2,2] 1048; AVX2-NEXT: vmulps %xmm1, %xmm8, %xmm8 1049; AVX2-NEXT: vaddps %xmm7, %xmm8, %xmm7 1050; AVX2-NEXT: vshufps {{.*#+}} xmm8 = xmm3[3,3,3,3] 1051; AVX2-NEXT: vmulps %xmm4, %xmm8, %xmm8 1052; AVX2-NEXT: vaddps %xmm7, %xmm8, %xmm7 1053; AVX2-NEXT: vextractf128 $1, %ymm3, %xmm3 1054; AVX2-NEXT: vbroadcastss %xmm3, %xmm8 1055; AVX2-NEXT: vmulps %xmm0, %xmm8, %xmm0 1056; AVX2-NEXT: vshufps {{.*#+}} xmm8 = xmm3[1,1,1,1] 1057; AVX2-NEXT: vmulps %xmm5, %xmm8, %xmm5 1058; AVX2-NEXT: vaddps %xmm5, %xmm0, %xmm0 1059; AVX2-NEXT: vshufps {{.*#+}} xmm5 = xmm3[2,2,2,2] 1060; AVX2-NEXT: vmulps %xmm5, %xmm1, %xmm1 1061; AVX2-NEXT: vaddps %xmm1, %xmm0, %xmm0 1062; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm3[3,3,3,3] 1063; AVX2-NEXT: vmulps %xmm1, %xmm4, %xmm1 1064; AVX2-NEXT: vaddps %xmm1, %xmm0, %xmm1 1065; AVX2-NEXT: vinsertf128 $1, %xmm2, %ymm6, %ymm0 1066; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm7, %ymm1 1067; AVX2-NEXT: retq 1068; 1069; AVX512F-LABEL: test_mul4x4_f32: 1070; AVX512F: # %bb.0: # %entry 1071; AVX512F-NEXT: vextractf128 $1, %ymm0, %xmm4 1072; AVX512F-NEXT: vextractf32x4 $2, %zmm0, %xmm3 1073; AVX512F-NEXT: vextractf32x4 $3, %zmm0, %xmm2 1074; AVX512F-NEXT: vbroadcastss %xmm1, %xmm5 1075; AVX512F-NEXT: vmulps %xmm5, %xmm0, %xmm5 1076; AVX512F-NEXT: vshufps {{.*#+}} xmm6 = xmm1[1,1,1,1] 1077; AVX512F-NEXT: vmulps %xmm6, %xmm4, %xmm6 1078; AVX512F-NEXT: vaddps %xmm6, %xmm5, %xmm5 1079; AVX512F-NEXT: vshufps {{.*#+}} xmm6 = xmm1[2,2,2,2] 1080; AVX512F-NEXT: vmulps %xmm6, %xmm3, %xmm6 1081; AVX512F-NEXT: vaddps %xmm6, %xmm5, %xmm5 1082; AVX512F-NEXT: vshufps {{.*#+}} xmm6 = xmm1[3,3,3,3] 1083; AVX512F-NEXT: vmulps %xmm6, %xmm2, %xmm6 1084; AVX512F-NEXT: vaddps %xmm6, %xmm5, %xmm5 1085; AVX512F-NEXT: vextractf128 $1, %ymm1, %xmm6 1086; AVX512F-NEXT: vbroadcastss %xmm6, %xmm7 1087; AVX512F-NEXT: vmulps %xmm7, %xmm0, %xmm7 1088; AVX512F-NEXT: vshufps {{.*#+}} xmm8 = xmm6[1,1,1,1] 1089; AVX512F-NEXT: vmulps %xmm4, 
%xmm8, %xmm8 1090; AVX512F-NEXT: vaddps %xmm7, %xmm8, %xmm7 1091; AVX512F-NEXT: vshufps {{.*#+}} xmm8 = xmm6[2,2,2,2] 1092; AVX512F-NEXT: vmulps %xmm3, %xmm8, %xmm8 1093; AVX512F-NEXT: vaddps %xmm7, %xmm8, %xmm7 1094; AVX512F-NEXT: vshufps {{.*#+}} xmm6 = xmm6[3,3,3,3] 1095; AVX512F-NEXT: vmulps %xmm6, %xmm2, %xmm6 1096; AVX512F-NEXT: vaddps %xmm6, %xmm7, %xmm6 1097; AVX512F-NEXT: vextractf32x4 $2, %zmm1, %xmm7 1098; AVX512F-NEXT: vbroadcastss %xmm7, %xmm8 1099; AVX512F-NEXT: vmulps %xmm0, %xmm8, %xmm8 1100; AVX512F-NEXT: vshufps {{.*#+}} xmm9 = xmm7[1,1,1,1] 1101; AVX512F-NEXT: vmulps %xmm4, %xmm9, %xmm9 1102; AVX512F-NEXT: vaddps %xmm9, %xmm8, %xmm8 1103; AVX512F-NEXT: vshufps {{.*#+}} xmm9 = xmm7[2,2,2,2] 1104; AVX512F-NEXT: vmulps %xmm3, %xmm9, %xmm9 1105; AVX512F-NEXT: vaddps %xmm9, %xmm8, %xmm8 1106; AVX512F-NEXT: vshufps {{.*#+}} xmm7 = xmm7[3,3,3,3] 1107; AVX512F-NEXT: vmulps %xmm7, %xmm2, %xmm7 1108; AVX512F-NEXT: vaddps %xmm7, %xmm8, %xmm7 1109; AVX512F-NEXT: vextractf32x4 $3, %zmm1, %xmm1 1110; AVX512F-NEXT: vbroadcastss %xmm1, %xmm8 1111; AVX512F-NEXT: vmulps %xmm0, %xmm8, %xmm0 1112; AVX512F-NEXT: vshufps {{.*#+}} xmm8 = xmm1[1,1,1,1] 1113; AVX512F-NEXT: vmulps %xmm4, %xmm8, %xmm4 1114; AVX512F-NEXT: vaddps %xmm4, %xmm0, %xmm0 1115; AVX512F-NEXT: vshufps {{.*#+}} xmm4 = xmm1[2,2,2,2] 1116; AVX512F-NEXT: vmulps %xmm4, %xmm3, %xmm3 1117; AVX512F-NEXT: vaddps %xmm3, %xmm0, %xmm0 1118; AVX512F-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3] 1119; AVX512F-NEXT: vmulps %xmm1, %xmm2, %xmm1 1120; AVX512F-NEXT: vaddps %xmm1, %xmm0, %xmm0 1121; AVX512F-NEXT: vinsertf128 $1, %xmm0, %ymm7, %ymm0 1122; AVX512F-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm1 1123; AVX512F-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0 1124; AVX512F-NEXT: retq 1125; 1126; AVX512VL-LABEL: test_mul4x4_f32: 1127; AVX512VL: # %bb.0: # %entry 1128; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm2 1129; AVX512VL-NEXT: vextractf32x4 $2, %zmm0, %xmm3 1130; AVX512VL-NEXT: vextractf32x4 $3, %zmm0, %xmm4 1131; AVX512VL-NEXT: vbroadcastss %xmm1, %xmm5 1132; AVX512VL-NEXT: vmulps %xmm5, %xmm0, %xmm5 1133; AVX512VL-NEXT: vshufps {{.*#+}} xmm6 = xmm1[1,1,1,1] 1134; AVX512VL-NEXT: vmulps %xmm6, %xmm2, %xmm6 1135; AVX512VL-NEXT: vaddps %xmm6, %xmm5, %xmm5 1136; AVX512VL-NEXT: vshufps {{.*#+}} xmm6 = xmm1[2,2,2,2] 1137; AVX512VL-NEXT: vmulps %xmm6, %xmm3, %xmm6 1138; AVX512VL-NEXT: vaddps %xmm6, %xmm5, %xmm5 1139; AVX512VL-NEXT: vshufps {{.*#+}} xmm6 = xmm1[3,3,3,3] 1140; AVX512VL-NEXT: vmulps %xmm6, %xmm4, %xmm6 1141; AVX512VL-NEXT: vaddps %xmm6, %xmm5, %xmm5 1142; AVX512VL-NEXT: vextractf128 $1, %ymm1, %xmm6 1143; AVX512VL-NEXT: vbroadcastss %xmm6, %xmm7 1144; AVX512VL-NEXT: vmulps %xmm7, %xmm0, %xmm7 1145; AVX512VL-NEXT: vshufps {{.*#+}} xmm8 = xmm6[1,1,1,1] 1146; AVX512VL-NEXT: vmulps %xmm2, %xmm8, %xmm8 1147; AVX512VL-NEXT: vaddps %xmm7, %xmm8, %xmm7 1148; AVX512VL-NEXT: vshufps {{.*#+}} xmm8 = xmm6[2,2,2,2] 1149; AVX512VL-NEXT: vmulps %xmm3, %xmm8, %xmm8 1150; AVX512VL-NEXT: vaddps %xmm7, %xmm8, %xmm7 1151; AVX512VL-NEXT: vshufps {{.*#+}} xmm6 = xmm6[3,3,3,3] 1152; AVX512VL-NEXT: vmulps %xmm6, %xmm4, %xmm6 1153; AVX512VL-NEXT: vaddps %xmm6, %xmm7, %xmm6 1154; AVX512VL-NEXT: vextractf32x4 $2, %zmm1, %xmm7 1155; AVX512VL-NEXT: vbroadcastss %xmm7, %xmm8 1156; AVX512VL-NEXT: vmulps %xmm0, %xmm8, %xmm8 1157; AVX512VL-NEXT: vshufps {{.*#+}} xmm9 = xmm7[1,1,1,1] 1158; AVX512VL-NEXT: vmulps %xmm2, %xmm9, %xmm9 1159; AVX512VL-NEXT: vaddps %xmm9, %xmm8, %xmm8 1160; AVX512VL-NEXT: vshufps {{.*#+}} xmm9 = xmm7[2,2,2,2] 1161; AVX512VL-NEXT: vmulps 
%xmm3, %xmm9, %xmm9 1162; AVX512VL-NEXT: vaddps %xmm9, %xmm8, %xmm8 1163; AVX512VL-NEXT: vshufps {{.*#+}} xmm7 = xmm7[3,3,3,3] 1164; AVX512VL-NEXT: vmulps %xmm7, %xmm4, %xmm7 1165; AVX512VL-NEXT: vaddps %xmm7, %xmm8, %xmm7 1166; AVX512VL-NEXT: vextractf32x4 $3, %zmm1, %xmm1 1167; AVX512VL-NEXT: vbroadcastss %xmm1, %xmm8 1168; AVX512VL-NEXT: vmulps %xmm0, %xmm8, %xmm0 1169; AVX512VL-NEXT: vshufps {{.*#+}} xmm8 = xmm1[1,1,1,1] 1170; AVX512VL-NEXT: vmulps %xmm2, %xmm8, %xmm2 1171; AVX512VL-NEXT: vaddps %xmm2, %xmm0, %xmm0 1172; AVX512VL-NEXT: vshufps {{.*#+}} xmm2 = xmm1[2,2,2,2] 1173; AVX512VL-NEXT: vmulps %xmm2, %xmm3, %xmm2 1174; AVX512VL-NEXT: vaddps %xmm2, %xmm0, %xmm0 1175; AVX512VL-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3] 1176; AVX512VL-NEXT: vmulps %xmm1, %xmm4, %xmm1 1177; AVX512VL-NEXT: vaddps %xmm1, %xmm0, %xmm0 1178; AVX512VL-NEXT: vinsertf128 $1, %xmm0, %ymm7, %ymm0 1179; AVX512VL-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm1 1180; AVX512VL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0 1181; AVX512VL-NEXT: retq 1182entry: 1183 %split = shufflevector <16 x float> %a0, <16 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 1184 %split1 = shufflevector <16 x float> %a0, <16 x float> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7> 1185 %split2 = shufflevector <16 x float> %a0, <16 x float> poison, <4 x i32> <i32 8, i32 9, i32 10, i32 11> 1186 %split3 = shufflevector <16 x float> %a0, <16 x float> poison, <4 x i32> <i32 12, i32 13, i32 14, i32 15> 1187 %splat.splat = shufflevector <16 x float> %a1, <16 x float> undef, <4 x i32> zeroinitializer 1188 %0 = fmul <4 x float> %split, %splat.splat 1189 %splat.splat10 = shufflevector <16 x float> %a1, <16 x float> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> 1190 %1 = fmul <4 x float> %split1, %splat.splat10 1191 %2 = fadd <4 x float> %0, %1 1192 %splat.splat13 = shufflevector <16 x float> %a1, <16 x float> undef, <4 x i32> <i32 2, i32 2, i32 2, i32 2> 1193 %3 = fmul <4 x float> %split2, %splat.splat13 1194 %4 = fadd <4 x float> %2, %3 1195 %splat.splat16 = shufflevector <16 x float> %a1, <16 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3> 1196 %5 = fmul <4 x float> %split3, %splat.splat16 1197 %6 = fadd <4 x float> %4, %5 1198 %splat.splat19 = shufflevector <16 x float> %a1, <16 x float> undef, <4 x i32> <i32 4, i32 4, i32 4, i32 4> 1199 %7 = fmul <4 x float> %split, %splat.splat19 1200 %splat.splat22 = shufflevector <16 x float> %a1, <16 x float> undef, <4 x i32> <i32 5, i32 5, i32 5, i32 5> 1201 %8 = fmul <4 x float> %split1, %splat.splat22 1202 %9 = fadd <4 x float> %7, %8 1203 %splat.splat25 = shufflevector <16 x float> %a1, <16 x float> undef, <4 x i32> <i32 6, i32 6, i32 6, i32 6> 1204 %10 = fmul <4 x float> %split2, %splat.splat25 1205 %11 = fadd <4 x float> %9, %10 1206 %splat.splat28 = shufflevector <16 x float> %a1, <16 x float> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7> 1207 %12 = fmul <4 x float> %split3, %splat.splat28 1208 %13 = fadd <4 x float> %11, %12 1209 %splat.splat31 = shufflevector <16 x float> %a1, <16 x float> undef, <4 x i32> <i32 8, i32 8, i32 8, i32 8> 1210 %14 = fmul <4 x float> %split, %splat.splat31 1211 %splat.splat34 = shufflevector <16 x float> %a1, <16 x float> undef, <4 x i32> <i32 9, i32 9, i32 9, i32 9> 1212 %15 = fmul <4 x float> %split1, %splat.splat34 1213 %16 = fadd <4 x float> %14, %15 1214 %splat.splat37 = shufflevector <16 x float> %a1, <16 x float> undef, <4 x i32> <i32 10, i32 10, i32 10, i32 10> 1215 %17 = fmul <4 x float> %split2, %splat.splat37 1216 %18 = fadd <4 x float> %16, %17 
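  ; For reference: in this 4x4 test, %split through %split3 are the four
  ; 4-element column blocks of %a0 (matrix_type is column-major), and each
  ; result column j is accumulated as the sum over k of %split<k> multiplied by
  ; the splat of element 4*j+k of %a1 (the %splat.splat* values).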
1217 %splat.splat40 = shufflevector <16 x float> %a1, <16 x float> undef, <4 x i32> <i32 11, i32 11, i32 11, i32 11> 1218 %19 = fmul <4 x float> %split3, %splat.splat40 1219 %20 = fadd <4 x float> %18, %19 1220 %splat.splat43 = shufflevector <16 x float> %a1, <16 x float> undef, <4 x i32> <i32 12, i32 12, i32 12, i32 12> 1221 %21 = fmul <4 x float> %split, %splat.splat43 1222 %splat.splat46 = shufflevector <16 x float> %a1, <16 x float> undef, <4 x i32> <i32 13, i32 13, i32 13, i32 13> 1223 %22 = fmul <4 x float> %split1, %splat.splat46 1224 %23 = fadd <4 x float> %21, %22 1225 %splat.splat49 = shufflevector <16 x float> %a1, <16 x float> undef, <4 x i32> <i32 14, i32 14, i32 14, i32 14> 1226 %24 = fmul <4 x float> %split2, %splat.splat49 1227 %25 = fadd <4 x float> %23, %24 1228 %splat.splat52 = shufflevector <16 x float> %a1, <16 x float> undef, <4 x i32> <i32 15, i32 15, i32 15, i32 15> 1229 %26 = fmul <4 x float> %split3, %splat.splat52 1230 %27 = fadd <4 x float> %25, %26 1231 %28 = shufflevector <4 x float> %6, <4 x float> %13, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 1232 %29 = shufflevector <4 x float> %20, <4 x float> %27, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 1233 %30 = shufflevector <8 x float> %28, <8 x float> %29, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> 1234 ret <16 x float> %30 1235} 1236 1237define <16 x double> @test_mul4x4_f64(<16 x double> %a0, <16 x double> %a1) nounwind { 1238; SSE-LABEL: test_mul4x4_f64: 1239; SSE: # %bb.0: # %entry 1240; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1241; SSE-NEXT: movapd %xmm5, %xmm6 1242; SSE-NEXT: movapd %xmm4, %xmm5 1243; SSE-NEXT: movq %rdi, %rax 1244; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm11 1245; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm9 1246; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm12 1247; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm8 1248; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm10 1249; SSE-NEXT: movapd %xmm10, %xmm13 1250; SSE-NEXT: unpcklpd {{.*#+}} xmm13 = xmm13[0],xmm10[0] 1251; SSE-NEXT: movapd %xmm1, %xmm14 1252; SSE-NEXT: mulpd %xmm13, %xmm14 1253; SSE-NEXT: mulpd %xmm0, %xmm13 1254; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1,1] 1255; SSE-NEXT: movapd %xmm3, %xmm15 1256; SSE-NEXT: mulpd %xmm10, %xmm15 1257; SSE-NEXT: addpd %xmm14, %xmm15 1258; SSE-NEXT: mulpd %xmm2, %xmm10 1259; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1260; SSE-NEXT: addpd %xmm13, %xmm10 1261; SSE-NEXT: movapd %xmm8, %xmm13 1262; SSE-NEXT: unpcklpd {{.*#+}} xmm13 = xmm13[0],xmm8[0] 1263; SSE-NEXT: movapd %xmm4, %xmm14 1264; SSE-NEXT: mulpd %xmm13, %xmm14 1265; SSE-NEXT: addpd %xmm10, %xmm14 1266; SSE-NEXT: movapd %xmm6, %xmm4 1267; SSE-NEXT: mulpd %xmm6, %xmm13 1268; SSE-NEXT: addpd %xmm15, %xmm13 1269; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1,1] 1270; SSE-NEXT: movapd %xmm7, %xmm10 1271; SSE-NEXT: mulpd %xmm8, %xmm10 1272; SSE-NEXT: addpd %xmm13, %xmm10 1273; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload 1274; SSE-NEXT: mulpd %xmm6, %xmm8 1275; SSE-NEXT: addpd %xmm14, %xmm8 1276; SSE-NEXT: movapd %xmm12, %xmm13 1277; SSE-NEXT: unpcklpd {{.*#+}} xmm13 = xmm13[0],xmm12[0] 1278; SSE-NEXT: movapd %xmm1, %xmm14 1279; SSE-NEXT: mulpd %xmm13, %xmm14 1280; SSE-NEXT: mulpd %xmm0, %xmm13 1281; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1,1] 1282; SSE-NEXT: movapd %xmm3, %xmm15 1283; SSE-NEXT: mulpd %xmm12, %xmm15 1284; SSE-NEXT: addpd %xmm14, %xmm15 1285; 
SSE-NEXT: mulpd %xmm2, %xmm12 1286; SSE-NEXT: addpd %xmm13, %xmm12 1287; SSE-NEXT: movapd %xmm9, %xmm13 1288; SSE-NEXT: unpcklpd {{.*#+}} xmm13 = xmm13[0],xmm9[0] 1289; SSE-NEXT: movapd %xmm5, %xmm14 1290; SSE-NEXT: mulpd %xmm13, %xmm14 1291; SSE-NEXT: addpd %xmm12, %xmm14 1292; SSE-NEXT: mulpd %xmm4, %xmm13 1293; SSE-NEXT: movapd %xmm4, %xmm2 1294; SSE-NEXT: addpd %xmm15, %xmm13 1295; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1,1] 1296; SSE-NEXT: movapd %xmm7, %xmm12 1297; SSE-NEXT: mulpd %xmm9, %xmm12 1298; SSE-NEXT: addpd %xmm13, %xmm12 1299; SSE-NEXT: mulpd %xmm6, %xmm9 1300; SSE-NEXT: addpd %xmm14, %xmm9 1301; SSE-NEXT: movapd %xmm11, %xmm14 1302; SSE-NEXT: unpcklpd {{.*#+}} xmm14 = xmm14[0],xmm11[0] 1303; SSE-NEXT: movapd %xmm1, %xmm13 1304; SSE-NEXT: mulpd %xmm14, %xmm13 1305; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1,1] 1306; SSE-NEXT: movapd %xmm3, %xmm15 1307; SSE-NEXT: mulpd %xmm11, %xmm15 1308; SSE-NEXT: addpd %xmm13, %xmm15 1309; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm13 1310; SSE-NEXT: mulpd %xmm0, %xmm14 1311; SSE-NEXT: movapd %xmm0, %xmm6 1312; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 1313; SSE-NEXT: mulpd %xmm0, %xmm11 1314; SSE-NEXT: addpd %xmm14, %xmm11 1315; SSE-NEXT: movapd %xmm13, %xmm14 1316; SSE-NEXT: unpcklpd {{.*#+}} xmm14 = xmm14[0],xmm13[0] 1317; SSE-NEXT: movapd %xmm5, %xmm4 1318; SSE-NEXT: mulpd %xmm14, %xmm4 1319; SSE-NEXT: addpd %xmm11, %xmm4 1320; SSE-NEXT: mulpd %xmm2, %xmm14 1321; SSE-NEXT: addpd %xmm15, %xmm14 1322; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1,1] 1323; SSE-NEXT: movapd %xmm7, %xmm11 1324; SSE-NEXT: mulpd %xmm13, %xmm11 1325; SSE-NEXT: addpd %xmm14, %xmm11 1326; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm14 1327; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload 1328; SSE-NEXT: mulpd %xmm15, %xmm13 1329; SSE-NEXT: addpd %xmm4, %xmm13 1330; SSE-NEXT: movapd %xmm14, %xmm4 1331; SSE-NEXT: unpcklpd {{.*#+}} xmm4 = xmm4[0],xmm14[0] 1332; SSE-NEXT: mulpd %xmm4, %xmm1 1333; SSE-NEXT: mulpd %xmm6, %xmm4 1334; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1,1] 1335; SSE-NEXT: mulpd %xmm14, %xmm3 1336; SSE-NEXT: addpd %xmm1, %xmm3 1337; SSE-NEXT: mulpd %xmm0, %xmm14 1338; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 1339; SSE-NEXT: addpd %xmm4, %xmm14 1340; SSE-NEXT: movapd %xmm0, %xmm1 1341; SSE-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm0[0] 1342; SSE-NEXT: mulpd %xmm1, %xmm5 1343; SSE-NEXT: addpd %xmm14, %xmm5 1344; SSE-NEXT: mulpd %xmm2, %xmm1 1345; SSE-NEXT: addpd %xmm3, %xmm1 1346; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1] 1347; SSE-NEXT: mulpd %xmm0, %xmm7 1348; SSE-NEXT: addpd %xmm1, %xmm7 1349; SSE-NEXT: mulpd %xmm15, %xmm0 1350; SSE-NEXT: addpd %xmm5, %xmm0 1351; SSE-NEXT: movapd %xmm7, 112(%rdi) 1352; SSE-NEXT: movapd %xmm0, 96(%rdi) 1353; SSE-NEXT: movapd %xmm11, 80(%rdi) 1354; SSE-NEXT: movapd %xmm13, 64(%rdi) 1355; SSE-NEXT: movapd %xmm12, 48(%rdi) 1356; SSE-NEXT: movapd %xmm9, 32(%rdi) 1357; SSE-NEXT: movapd %xmm10, 16(%rdi) 1358; SSE-NEXT: movapd %xmm8, (%rdi) 1359; SSE-NEXT: retq 1360; 1361; AVX1-LABEL: test_mul4x4_f64: 1362; AVX1: # %bb.0: # %entry 1363; AVX1-NEXT: vmovddup {{.*#+}} xmm8 = xmm4[0,0] 1364; AVX1-NEXT: vinsertf128 $1, %xmm8, %ymm8, %ymm8 1365; AVX1-NEXT: vmulpd %ymm0, %ymm8, %ymm8 1366; AVX1-NEXT: vshufpd {{.*#+}} xmm9 = xmm4[1,1] 1367; AVX1-NEXT: vinsertf128 $1, %xmm9, %ymm9, %ymm9 1368; AVX1-NEXT: vmulpd %ymm1, %ymm9, %ymm9 1369; AVX1-NEXT: vaddpd %ymm9, %ymm8, %ymm8 1370; AVX1-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3,2,3] 1371; AVX1-NEXT: vmovddup {{.*#+}} ymm9 = ymm4[0,0,2,2] 1372; 
AVX1-NEXT: vmulpd %ymm2, %ymm9, %ymm9 1373; AVX1-NEXT: vaddpd %ymm9, %ymm8, %ymm8 1374; AVX1-NEXT: vshufpd {{.*#+}} ymm4 = ymm4[1,1,3,3] 1375; AVX1-NEXT: vmulpd %ymm4, %ymm3, %ymm4 1376; AVX1-NEXT: vaddpd %ymm4, %ymm8, %ymm4 1377; AVX1-NEXT: vmovddup {{.*#+}} xmm8 = xmm5[0,0] 1378; AVX1-NEXT: vinsertf128 $1, %xmm8, %ymm8, %ymm8 1379; AVX1-NEXT: vmulpd %ymm0, %ymm8, %ymm8 1380; AVX1-NEXT: vshufpd {{.*#+}} xmm9 = xmm5[1,1] 1381; AVX1-NEXT: vinsertf128 $1, %xmm9, %ymm9, %ymm9 1382; AVX1-NEXT: vmulpd %ymm1, %ymm9, %ymm9 1383; AVX1-NEXT: vaddpd %ymm9, %ymm8, %ymm8 1384; AVX1-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm5[2,3,2,3] 1385; AVX1-NEXT: vmovddup {{.*#+}} ymm9 = ymm5[0,0,2,2] 1386; AVX1-NEXT: vmulpd %ymm2, %ymm9, %ymm9 1387; AVX1-NEXT: vaddpd %ymm9, %ymm8, %ymm8 1388; AVX1-NEXT: vshufpd {{.*#+}} ymm5 = ymm5[1,1,3,3] 1389; AVX1-NEXT: vmulpd %ymm5, %ymm3, %ymm5 1390; AVX1-NEXT: vaddpd %ymm5, %ymm8, %ymm5 1391; AVX1-NEXT: vmovddup {{.*#+}} xmm8 = xmm6[0,0] 1392; AVX1-NEXT: vinsertf128 $1, %xmm8, %ymm8, %ymm8 1393; AVX1-NEXT: vmulpd %ymm0, %ymm8, %ymm8 1394; AVX1-NEXT: vshufpd {{.*#+}} xmm9 = xmm6[1,1] 1395; AVX1-NEXT: vinsertf128 $1, %xmm9, %ymm9, %ymm9 1396; AVX1-NEXT: vmulpd %ymm1, %ymm9, %ymm9 1397; AVX1-NEXT: vaddpd %ymm9, %ymm8, %ymm8 1398; AVX1-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm6[2,3,2,3] 1399; AVX1-NEXT: vmovddup {{.*#+}} ymm9 = ymm6[0,0,2,2] 1400; AVX1-NEXT: vmulpd %ymm2, %ymm9, %ymm9 1401; AVX1-NEXT: vaddpd %ymm9, %ymm8, %ymm8 1402; AVX1-NEXT: vshufpd {{.*#+}} ymm6 = ymm6[1,1,3,3] 1403; AVX1-NEXT: vmulpd %ymm6, %ymm3, %ymm6 1404; AVX1-NEXT: vaddpd %ymm6, %ymm8, %ymm6 1405; AVX1-NEXT: vmovddup {{.*#+}} xmm8 = xmm7[0,0] 1406; AVX1-NEXT: vinsertf128 $1, %xmm8, %ymm8, %ymm8 1407; AVX1-NEXT: vmulpd %ymm0, %ymm8, %ymm0 1408; AVX1-NEXT: vshufpd {{.*#+}} xmm8 = xmm7[1,1] 1409; AVX1-NEXT: vinsertf128 $1, %xmm8, %ymm8, %ymm8 1410; AVX1-NEXT: vmulpd %ymm1, %ymm8, %ymm1 1411; AVX1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 1412; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm7[2,3,2,3] 1413; AVX1-NEXT: vmovddup {{.*#+}} ymm7 = ymm1[0,0,2,2] 1414; AVX1-NEXT: vmulpd %ymm7, %ymm2, %ymm2 1415; AVX1-NEXT: vaddpd %ymm2, %ymm0, %ymm0 1416; AVX1-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[1,1,3,3] 1417; AVX1-NEXT: vmulpd %ymm1, %ymm3, %ymm1 1418; AVX1-NEXT: vaddpd %ymm1, %ymm0, %ymm3 1419; AVX1-NEXT: vmovapd %ymm4, %ymm0 1420; AVX1-NEXT: vmovapd %ymm5, %ymm1 1421; AVX1-NEXT: vmovapd %ymm6, %ymm2 1422; AVX1-NEXT: retq 1423; 1424; AVX2-LABEL: test_mul4x4_f64: 1425; AVX2: # %bb.0: # %entry 1426; AVX2-NEXT: vbroadcastsd %xmm4, %ymm8 1427; AVX2-NEXT: vmulpd %ymm0, %ymm8, %ymm8 1428; AVX2-NEXT: vpermpd {{.*#+}} ymm9 = ymm4[1,1,1,1] 1429; AVX2-NEXT: vmulpd %ymm1, %ymm9, %ymm9 1430; AVX2-NEXT: vaddpd %ymm9, %ymm8, %ymm8 1431; AVX2-NEXT: vpermpd {{.*#+}} ymm9 = ymm4[2,2,2,2] 1432; AVX2-NEXT: vmulpd %ymm2, %ymm9, %ymm9 1433; AVX2-NEXT: vaddpd %ymm9, %ymm8, %ymm8 1434; AVX2-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[3,3,3,3] 1435; AVX2-NEXT: vmulpd %ymm4, %ymm3, %ymm4 1436; AVX2-NEXT: vaddpd %ymm4, %ymm8, %ymm4 1437; AVX2-NEXT: vbroadcastsd %xmm5, %ymm8 1438; AVX2-NEXT: vmulpd %ymm0, %ymm8, %ymm8 1439; AVX2-NEXT: vpermpd {{.*#+}} ymm9 = ymm5[1,1,1,1] 1440; AVX2-NEXT: vmulpd %ymm1, %ymm9, %ymm9 1441; AVX2-NEXT: vaddpd %ymm9, %ymm8, %ymm8 1442; AVX2-NEXT: vpermpd {{.*#+}} ymm9 = ymm5[2,2,2,2] 1443; AVX2-NEXT: vmulpd %ymm2, %ymm9, %ymm9 1444; AVX2-NEXT: vaddpd %ymm9, %ymm8, %ymm8 1445; AVX2-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[3,3,3,3] 1446; AVX2-NEXT: vmulpd %ymm5, %ymm3, %ymm5 1447; AVX2-NEXT: vaddpd %ymm5, %ymm8, %ymm5 1448; AVX2-NEXT: vbroadcastsd 
%xmm6, %ymm8 1449; AVX2-NEXT: vmulpd %ymm0, %ymm8, %ymm8 1450; AVX2-NEXT: vpermpd {{.*#+}} ymm9 = ymm6[1,1,1,1] 1451; AVX2-NEXT: vmulpd %ymm1, %ymm9, %ymm9 1452; AVX2-NEXT: vaddpd %ymm9, %ymm8, %ymm8 1453; AVX2-NEXT: vpermpd {{.*#+}} ymm9 = ymm6[2,2,2,2] 1454; AVX2-NEXT: vmulpd %ymm2, %ymm9, %ymm9 1455; AVX2-NEXT: vaddpd %ymm9, %ymm8, %ymm8 1456; AVX2-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[3,3,3,3] 1457; AVX2-NEXT: vmulpd %ymm6, %ymm3, %ymm6 1458; AVX2-NEXT: vaddpd %ymm6, %ymm8, %ymm6 1459; AVX2-NEXT: vbroadcastsd %xmm7, %ymm8 1460; AVX2-NEXT: vmulpd %ymm0, %ymm8, %ymm0 1461; AVX2-NEXT: vpermpd {{.*#+}} ymm8 = ymm7[1,1,1,1] 1462; AVX2-NEXT: vmulpd %ymm1, %ymm8, %ymm1 1463; AVX2-NEXT: vaddpd %ymm1, %ymm0, %ymm0 1464; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm7[2,2,2,2] 1465; AVX2-NEXT: vmulpd %ymm1, %ymm2, %ymm1 1466; AVX2-NEXT: vaddpd %ymm1, %ymm0, %ymm0 1467; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm7[3,3,3,3] 1468; AVX2-NEXT: vmulpd %ymm1, %ymm3, %ymm1 1469; AVX2-NEXT: vaddpd %ymm1, %ymm0, %ymm3 1470; AVX2-NEXT: vmovapd %ymm4, %ymm0 1471; AVX2-NEXT: vmovapd %ymm5, %ymm1 1472; AVX2-NEXT: vmovapd %ymm6, %ymm2 1473; AVX2-NEXT: retq 1474; 1475; AVX512F-LABEL: test_mul4x4_f64: 1476; AVX512F: # %bb.0: # %entry 1477; AVX512F-NEXT: vextractf64x4 $1, %zmm0, %ymm5 1478; AVX512F-NEXT: vextractf64x4 $1, %zmm1, %ymm4 1479; AVX512F-NEXT: vbroadcastsd %xmm2, %ymm6 1480; AVX512F-NEXT: vmulpd %ymm6, %ymm0, %ymm6 1481; AVX512F-NEXT: vpermpd {{.*#+}} ymm7 = ymm2[1,1,1,1] 1482; AVX512F-NEXT: vmulpd %ymm7, %ymm5, %ymm7 1483; AVX512F-NEXT: vaddpd %ymm7, %ymm6, %ymm6 1484; AVX512F-NEXT: vpermpd {{.*#+}} ymm7 = ymm2[2,2,2,2] 1485; AVX512F-NEXT: vmulpd %ymm7, %ymm1, %ymm7 1486; AVX512F-NEXT: vaddpd %ymm7, %ymm6, %ymm6 1487; AVX512F-NEXT: vpermpd {{.*#+}} ymm7 = ymm2[3,3,3,3] 1488; AVX512F-NEXT: vmulpd %ymm7, %ymm4, %ymm7 1489; AVX512F-NEXT: vaddpd %ymm7, %ymm6, %ymm6 1490; AVX512F-NEXT: vextractf64x4 $1, %zmm2, %ymm2 1491; AVX512F-NEXT: vbroadcastsd %xmm2, %ymm7 1492; AVX512F-NEXT: vmulpd %ymm7, %ymm0, %ymm7 1493; AVX512F-NEXT: vpermpd {{.*#+}} ymm8 = ymm2[1,1,1,1] 1494; AVX512F-NEXT: vmulpd %ymm5, %ymm8, %ymm8 1495; AVX512F-NEXT: vaddpd %ymm7, %ymm8, %ymm7 1496; AVX512F-NEXT: vpermpd {{.*#+}} ymm8 = ymm2[2,2,2,2] 1497; AVX512F-NEXT: vmulpd %ymm1, %ymm8, %ymm8 1498; AVX512F-NEXT: vaddpd %ymm7, %ymm8, %ymm7 1499; AVX512F-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[3,3,3,3] 1500; AVX512F-NEXT: vmulpd %ymm2, %ymm4, %ymm2 1501; AVX512F-NEXT: vaddpd %ymm2, %ymm7, %ymm2 1502; AVX512F-NEXT: vbroadcastsd %xmm3, %ymm7 1503; AVX512F-NEXT: vmulpd %ymm7, %ymm0, %ymm7 1504; AVX512F-NEXT: vpermpd {{.*#+}} ymm8 = ymm3[1,1,1,1] 1505; AVX512F-NEXT: vmulpd %ymm5, %ymm8, %ymm8 1506; AVX512F-NEXT: vaddpd %ymm7, %ymm8, %ymm7 1507; AVX512F-NEXT: vpermpd {{.*#+}} ymm8 = ymm3[2,2,2,2] 1508; AVX512F-NEXT: vmulpd %ymm1, %ymm8, %ymm8 1509; AVX512F-NEXT: vaddpd %ymm7, %ymm8, %ymm7 1510; AVX512F-NEXT: vpermpd {{.*#+}} ymm8 = ymm3[3,3,3,3] 1511; AVX512F-NEXT: vmulpd %ymm4, %ymm8, %ymm8 1512; AVX512F-NEXT: vaddpd %ymm7, %ymm8, %ymm7 1513; AVX512F-NEXT: vextractf64x4 $1, %zmm3, %ymm3 1514; AVX512F-NEXT: vbroadcastsd %xmm3, %ymm8 1515; AVX512F-NEXT: vmulpd %ymm0, %ymm8, %ymm0 1516; AVX512F-NEXT: vpermpd {{.*#+}} ymm8 = ymm3[1,1,1,1] 1517; AVX512F-NEXT: vmulpd %ymm5, %ymm8, %ymm5 1518; AVX512F-NEXT: vaddpd %ymm5, %ymm0, %ymm0 1519; AVX512F-NEXT: vpermpd {{.*#+}} ymm5 = ymm3[2,2,2,2] 1520; AVX512F-NEXT: vmulpd %ymm5, %ymm1, %ymm1 1521; AVX512F-NEXT: vaddpd %ymm1, %ymm0, %ymm0 1522; AVX512F-NEXT: vpermpd {{.*#+}} ymm1 = ymm3[3,3,3,3] 1523; AVX512F-NEXT: vmulpd %ymm1, 
%ymm4, %ymm1 1524; AVX512F-NEXT: vaddpd %ymm1, %ymm0, %ymm1 1525; AVX512F-NEXT: vinsertf64x4 $1, %ymm2, %zmm6, %zmm0 1526; AVX512F-NEXT: vinsertf64x4 $1, %ymm1, %zmm7, %zmm1 1527; AVX512F-NEXT: retq 1528; 1529; AVX512VL-LABEL: test_mul4x4_f64: 1530; AVX512VL: # %bb.0: # %entry 1531; AVX512VL-NEXT: vextractf64x4 $1, %zmm0, %ymm4 1532; AVX512VL-NEXT: vextractf64x4 $1, %zmm1, %ymm5 1533; AVX512VL-NEXT: vbroadcastsd %xmm2, %ymm6 1534; AVX512VL-NEXT: vmulpd %ymm6, %ymm0, %ymm6 1535; AVX512VL-NEXT: vpermpd {{.*#+}} ymm7 = ymm2[1,1,1,1] 1536; AVX512VL-NEXT: vmulpd %ymm7, %ymm4, %ymm7 1537; AVX512VL-NEXT: vaddpd %ymm7, %ymm6, %ymm6 1538; AVX512VL-NEXT: vpermpd {{.*#+}} ymm7 = ymm2[2,2,2,2] 1539; AVX512VL-NEXT: vmulpd %ymm7, %ymm1, %ymm7 1540; AVX512VL-NEXT: vaddpd %ymm7, %ymm6, %ymm6 1541; AVX512VL-NEXT: vpermpd {{.*#+}} ymm7 = ymm2[3,3,3,3] 1542; AVX512VL-NEXT: vmulpd %ymm7, %ymm5, %ymm7 1543; AVX512VL-NEXT: vaddpd %ymm7, %ymm6, %ymm6 1544; AVX512VL-NEXT: vextractf64x4 $1, %zmm2, %ymm2 1545; AVX512VL-NEXT: vbroadcastsd %xmm2, %ymm7 1546; AVX512VL-NEXT: vmulpd %ymm7, %ymm0, %ymm7 1547; AVX512VL-NEXT: vpermpd {{.*#+}} ymm8 = ymm2[1,1,1,1] 1548; AVX512VL-NEXT: vmulpd %ymm4, %ymm8, %ymm8 1549; AVX512VL-NEXT: vaddpd %ymm7, %ymm8, %ymm7 1550; AVX512VL-NEXT: vpermpd {{.*#+}} ymm8 = ymm2[2,2,2,2] 1551; AVX512VL-NEXT: vmulpd %ymm1, %ymm8, %ymm8 1552; AVX512VL-NEXT: vaddpd %ymm7, %ymm8, %ymm7 1553; AVX512VL-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[3,3,3,3] 1554; AVX512VL-NEXT: vmulpd %ymm2, %ymm5, %ymm2 1555; AVX512VL-NEXT: vaddpd %ymm2, %ymm7, %ymm2 1556; AVX512VL-NEXT: vbroadcastsd %xmm3, %ymm7 1557; AVX512VL-NEXT: vmulpd %ymm7, %ymm0, %ymm7 1558; AVX512VL-NEXT: vpermpd {{.*#+}} ymm8 = ymm3[1,1,1,1] 1559; AVX512VL-NEXT: vmulpd %ymm4, %ymm8, %ymm8 1560; AVX512VL-NEXT: vaddpd %ymm7, %ymm8, %ymm7 1561; AVX512VL-NEXT: vpermpd {{.*#+}} ymm8 = ymm3[2,2,2,2] 1562; AVX512VL-NEXT: vmulpd %ymm1, %ymm8, %ymm8 1563; AVX512VL-NEXT: vaddpd %ymm7, %ymm8, %ymm7 1564; AVX512VL-NEXT: vpermpd {{.*#+}} ymm8 = ymm3[3,3,3,3] 1565; AVX512VL-NEXT: vmulpd %ymm5, %ymm8, %ymm8 1566; AVX512VL-NEXT: vaddpd %ymm7, %ymm8, %ymm7 1567; AVX512VL-NEXT: vextractf64x4 $1, %zmm3, %ymm3 1568; AVX512VL-NEXT: vbroadcastsd %xmm3, %ymm8 1569; AVX512VL-NEXT: vmulpd %ymm0, %ymm8, %ymm0 1570; AVX512VL-NEXT: vpermpd {{.*#+}} ymm8 = ymm3[1,1,1,1] 1571; AVX512VL-NEXT: vmulpd %ymm4, %ymm8, %ymm4 1572; AVX512VL-NEXT: vaddpd %ymm4, %ymm0, %ymm0 1573; AVX512VL-NEXT: vpermpd {{.*#+}} ymm4 = ymm3[2,2,2,2] 1574; AVX512VL-NEXT: vmulpd %ymm4, %ymm1, %ymm1 1575; AVX512VL-NEXT: vaddpd %ymm1, %ymm0, %ymm0 1576; AVX512VL-NEXT: vpermpd {{.*#+}} ymm1 = ymm3[3,3,3,3] 1577; AVX512VL-NEXT: vmulpd %ymm1, %ymm5, %ymm1 1578; AVX512VL-NEXT: vaddpd %ymm1, %ymm0, %ymm1 1579; AVX512VL-NEXT: vinsertf64x4 $1, %ymm2, %zmm6, %zmm0 1580; AVX512VL-NEXT: vinsertf64x4 $1, %ymm1, %zmm7, %zmm1 1581; AVX512VL-NEXT: retq 1582entry: 1583 %split = shufflevector <16 x double> %a0, <16 x double> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 1584 %split1 = shufflevector <16 x double> %a0, <16 x double> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7> 1585 %split2 = shufflevector <16 x double> %a0, <16 x double> poison, <4 x i32> <i32 8, i32 9, i32 10, i32 11> 1586 %split3 = shufflevector <16 x double> %a0, <16 x double> poison, <4 x i32> <i32 12, i32 13, i32 14, i32 15> 1587 %splat.splat = shufflevector <16 x double> %a1, <16 x double> undef, <4 x i32> zeroinitializer 1588 %0 = fmul <4 x double> %split, %splat.splat 1589 %splat.splat10 = shufflevector <16 x double> %a1, <16 x double> undef, <4 x i32> 
<i32 1, i32 1, i32 1, i32 1> 1590 %1 = fmul <4 x double> %split1, %splat.splat10 1591 %2 = fadd <4 x double> %0, %1 1592 %splat.splat13 = shufflevector <16 x double> %a1, <16 x double> undef, <4 x i32> <i32 2, i32 2, i32 2, i32 2> 1593 %3 = fmul <4 x double> %split2, %splat.splat13 1594 %4 = fadd <4 x double> %2, %3 1595 %splat.splat16 = shufflevector <16 x double> %a1, <16 x double> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3> 1596 %5 = fmul <4 x double> %split3, %splat.splat16 1597 %6 = fadd <4 x double> %4, %5 1598 %splat.splat19 = shufflevector <16 x double> %a1, <16 x double> undef, <4 x i32> <i32 4, i32 4, i32 4, i32 4> 1599 %7 = fmul <4 x double> %split, %splat.splat19 1600 %splat.splat22 = shufflevector <16 x double> %a1, <16 x double> undef, <4 x i32> <i32 5, i32 5, i32 5, i32 5> 1601 %8 = fmul <4 x double> %split1, %splat.splat22 1602 %9 = fadd <4 x double> %7, %8 1603 %splat.splat25 = shufflevector <16 x double> %a1, <16 x double> undef, <4 x i32> <i32 6, i32 6, i32 6, i32 6> 1604 %10 = fmul <4 x double> %split2, %splat.splat25 1605 %11 = fadd <4 x double> %9, %10 1606 %splat.splat28 = shufflevector <16 x double> %a1, <16 x double> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7> 1607 %12 = fmul <4 x double> %split3, %splat.splat28 1608 %13 = fadd <4 x double> %11, %12 1609 %splat.splat31 = shufflevector <16 x double> %a1, <16 x double> undef, <4 x i32> <i32 8, i32 8, i32 8, i32 8> 1610 %14 = fmul <4 x double> %split, %splat.splat31 1611 %splat.splat34 = shufflevector <16 x double> %a1, <16 x double> undef, <4 x i32> <i32 9, i32 9, i32 9, i32 9> 1612 %15 = fmul <4 x double> %split1, %splat.splat34 1613 %16 = fadd <4 x double> %14, %15 1614 %splat.splat37 = shufflevector <16 x double> %a1, <16 x double> undef, <4 x i32> <i32 10, i32 10, i32 10, i32 10> 1615 %17 = fmul <4 x double> %split2, %splat.splat37 1616 %18 = fadd <4 x double> %16, %17 1617 %splat.splat40 = shufflevector <16 x double> %a1, <16 x double> undef, <4 x i32> <i32 11, i32 11, i32 11, i32 11> 1618 %19 = fmul <4 x double> %split3, %splat.splat40 1619 %20 = fadd <4 x double> %18, %19 1620 %splat.splat43 = shufflevector <16 x double> %a1, <16 x double> undef, <4 x i32> <i32 12, i32 12, i32 12, i32 12> 1621 %21 = fmul <4 x double> %split, %splat.splat43 1622 %splat.splat46 = shufflevector <16 x double> %a1, <16 x double> undef, <4 x i32> <i32 13, i32 13, i32 13, i32 13> 1623 %22 = fmul <4 x double> %split1, %splat.splat46 1624 %23 = fadd <4 x double> %21, %22 1625 %splat.splat49 = shufflevector <16 x double> %a1, <16 x double> undef, <4 x i32> <i32 14, i32 14, i32 14, i32 14> 1626 %24 = fmul <4 x double> %split2, %splat.splat49 1627 %25 = fadd <4 x double> %23, %24 1628 %splat.splat52 = shufflevector <16 x double> %a1, <16 x double> undef, <4 x i32> <i32 15, i32 15, i32 15, i32 15> 1629 %26 = fmul <4 x double> %split3, %splat.splat52 1630 %27 = fadd <4 x double> %25, %26 1631 %28 = shufflevector <4 x double> %6, <4 x double> %13, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 1632 %29 = shufflevector <4 x double> %20, <4 x double> %27, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 1633 %30 = shufflevector <8 x double> %28, <8 x double> %29, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> 1634 ret <16 x double> %30 1635} 1636 1637define <64 x float> @test_mul8x8_f32(<64 x float> %a0, <64 x float> %a1) nounwind { 1638; SSE-LABEL: test_mul8x8_f32: 1639; SSE: # %bb.0: # %entry 1640; SSE-NEXT: subq $120, 
%rsp 1641; SSE-NEXT: movaps %xmm5, %xmm11 1642; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1643; SSE-NEXT: movaps %xmm1, %xmm9 1644; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1645; SSE-NEXT: movq %rdi, %rax 1646; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm8 1647; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm13 1648; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm14 1649; SSE-NEXT: movaps %xmm14, %xmm15 1650; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,0],xmm14[0,0] 1651; SSE-NEXT: movaps %xmm1, %xmm5 1652; SSE-NEXT: mulps %xmm15, %xmm5 1653; SSE-NEXT: mulps %xmm0, %xmm15 1654; SSE-NEXT: movaps %xmm14, %xmm0 1655; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm14[1,1] 1656; SSE-NEXT: movaps %xmm3, %xmm10 1657; SSE-NEXT: movaps %xmm3, %xmm12 1658; SSE-NEXT: mulps %xmm0, %xmm10 1659; SSE-NEXT: addps %xmm5, %xmm10 1660; SSE-NEXT: mulps %xmm2, %xmm0 1661; SSE-NEXT: addps %xmm15, %xmm0 1662; SSE-NEXT: movaps %xmm14, %xmm1 1663; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,2],xmm14[2,2] 1664; SSE-NEXT: movaps %xmm4, %xmm2 1665; SSE-NEXT: movaps %xmm4, %xmm15 1666; SSE-NEXT: mulps %xmm1, %xmm2 1667; SSE-NEXT: addps %xmm0, %xmm2 1668; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm5 1669; SSE-NEXT: mulps %xmm11, %xmm1 1670; SSE-NEXT: addps %xmm10, %xmm1 1671; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[3,3,3,3] 1672; SSE-NEXT: movaps %xmm7, %xmm3 1673; SSE-NEXT: mulps %xmm14, %xmm3 1674; SSE-NEXT: addps %xmm1, %xmm3 1675; SSE-NEXT: mulps %xmm6, %xmm14 1676; SSE-NEXT: addps %xmm2, %xmm14 1677; SSE-NEXT: movaps %xmm5, %xmm1 1678; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm5[0,0] 1679; SSE-NEXT: movaps %xmm13, %xmm2 1680; SSE-NEXT: mulps %xmm1, %xmm2 1681; SSE-NEXT: addps %xmm14, %xmm2 1682; SSE-NEXT: mulps {{[0-9]+}}(%rsp), %xmm1 1683; SSE-NEXT: addps %xmm3, %xmm1 1684; SSE-NEXT: movaps %xmm5, %xmm0 1685; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm5[1,1] 1686; SSE-NEXT: movaps %xmm8, %xmm3 1687; SSE-NEXT: mulps %xmm0, %xmm3 1688; SSE-NEXT: addps %xmm1, %xmm3 1689; SSE-NEXT: mulps {{[0-9]+}}(%rsp), %xmm0 1690; SSE-NEXT: addps %xmm2, %xmm0 1691; SSE-NEXT: movaps %xmm5, %xmm1 1692; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,2],xmm5[2,2] 1693; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm2 1694; SSE-NEXT: mulps %xmm1, %xmm2 1695; SSE-NEXT: addps %xmm0, %xmm2 1696; SSE-NEXT: mulps {{[0-9]+}}(%rsp), %xmm1 1697; SSE-NEXT: addps %xmm3, %xmm1 1698; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[3,3,3,3] 1699; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 1700; SSE-NEXT: mulps %xmm5, %xmm0 1701; SSE-NEXT: addps %xmm1, %xmm0 1702; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1703; SSE-NEXT: mulps {{[0-9]+}}(%rsp), %xmm5 1704; SSE-NEXT: addps %xmm2, %xmm5 1705; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1706; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 1707; SSE-NEXT: movaps %xmm0, %xmm1 1708; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,0] 1709; SSE-NEXT: movaps %xmm9, %xmm2 1710; SSE-NEXT: mulps %xmm1, %xmm2 1711; SSE-NEXT: movaps %xmm0, %xmm3 1712; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm0[1,1] 1713; SSE-NEXT: movaps %xmm12, %xmm4 1714; SSE-NEXT: mulps %xmm3, %xmm4 1715; SSE-NEXT: addps %xmm2, %xmm4 1716; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload 1717; SSE-NEXT: mulps %xmm10, %xmm1 1718; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload 1719; SSE-NEXT: mulps %xmm13, %xmm3 1720; SSE-NEXT: addps %xmm1, %xmm3 1721; SSE-NEXT: movaps %xmm0, %xmm1 1722; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,2],xmm0[2,2] 1723; SSE-NEXT: movaps 
%xmm15, %xmm2 1724; SSE-NEXT: mulps %xmm1, %xmm2 1725; SSE-NEXT: addps %xmm3, %xmm2 1726; SSE-NEXT: movaps %xmm11, %xmm8 1727; SSE-NEXT: mulps %xmm11, %xmm1 1728; SSE-NEXT: addps %xmm4, %xmm1 1729; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] 1730; SSE-NEXT: movaps %xmm7, %xmm3 1731; SSE-NEXT: mulps %xmm0, %xmm3 1732; SSE-NEXT: addps %xmm1, %xmm3 1733; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1734; SSE-NEXT: mulps %xmm6, %xmm0 1735; SSE-NEXT: addps %xmm2, %xmm0 1736; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm4 1737; SSE-NEXT: movaps %xmm4, %xmm1 1738; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm4[0,0] 1739; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm14 1740; SSE-NEXT: movaps %xmm14, %xmm2 1741; SSE-NEXT: mulps %xmm1, %xmm2 1742; SSE-NEXT: addps %xmm0, %xmm2 1743; SSE-NEXT: mulps {{[0-9]+}}(%rsp), %xmm1 1744; SSE-NEXT: addps %xmm3, %xmm1 1745; SSE-NEXT: movaps %xmm4, %xmm0 1746; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm4[1,1] 1747; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm11 1748; SSE-NEXT: movaps %xmm11, %xmm3 1749; SSE-NEXT: mulps %xmm0, %xmm3 1750; SSE-NEXT: addps %xmm1, %xmm3 1751; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1 1752; SSE-NEXT: mulps %xmm1, %xmm0 1753; SSE-NEXT: addps %xmm2, %xmm0 1754; SSE-NEXT: movaps %xmm4, %xmm1 1755; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,2],xmm4[2,2] 1756; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm2 1757; SSE-NEXT: mulps %xmm1, %xmm2 1758; SSE-NEXT: addps %xmm0, %xmm2 1759; SSE-NEXT: mulps {{[0-9]+}}(%rsp), %xmm1 1760; SSE-NEXT: addps %xmm3, %xmm1 1761; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,3,3,3] 1762; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 1763; SSE-NEXT: mulps %xmm4, %xmm0 1764; SSE-NEXT: addps %xmm1, %xmm0 1765; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1766; SSE-NEXT: mulps {{[0-9]+}}(%rsp), %xmm4 1767; SSE-NEXT: addps %xmm2, %xmm4 1768; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1769; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 1770; SSE-NEXT: movaps %xmm0, %xmm1 1771; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,0] 1772; SSE-NEXT: movaps %xmm9, %xmm2 1773; SSE-NEXT: mulps %xmm1, %xmm2 1774; SSE-NEXT: movaps %xmm0, %xmm3 1775; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm0[1,1] 1776; SSE-NEXT: movaps %xmm12, %xmm4 1777; SSE-NEXT: mulps %xmm3, %xmm4 1778; SSE-NEXT: addps %xmm2, %xmm4 1779; SSE-NEXT: mulps %xmm10, %xmm1 1780; SSE-NEXT: mulps %xmm13, %xmm3 1781; SSE-NEXT: addps %xmm1, %xmm3 1782; SSE-NEXT: movaps %xmm0, %xmm1 1783; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,2],xmm0[2,2] 1784; SSE-NEXT: movaps %xmm15, %xmm2 1785; SSE-NEXT: movaps %xmm15, %xmm5 1786; SSE-NEXT: mulps %xmm1, %xmm2 1787; SSE-NEXT: addps %xmm3, %xmm2 1788; SSE-NEXT: mulps %xmm8, %xmm1 1789; SSE-NEXT: addps %xmm4, %xmm1 1790; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] 1791; SSE-NEXT: movaps %xmm7, %xmm3 1792; SSE-NEXT: mulps %xmm0, %xmm3 1793; SSE-NEXT: addps %xmm1, %xmm3 1794; SSE-NEXT: mulps %xmm6, %xmm0 1795; SSE-NEXT: addps %xmm2, %xmm0 1796; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm4 1797; SSE-NEXT: movaps %xmm4, %xmm1 1798; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm4[0,0] 1799; SSE-NEXT: movaps %xmm14, %xmm2 1800; SSE-NEXT: mulps %xmm1, %xmm2 1801; SSE-NEXT: addps %xmm0, %xmm2 1802; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm14 1803; SSE-NEXT: mulps %xmm14, %xmm1 1804; SSE-NEXT: addps %xmm3, %xmm1 1805; SSE-NEXT: movaps %xmm4, %xmm0 1806; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm4[1,1] 1807; SSE-NEXT: movaps %xmm11, %xmm3 1808; SSE-NEXT: mulps %xmm0, %xmm3 1809; SSE-NEXT: addps %xmm1, %xmm3 
1810; SSE-NEXT: mulps {{[0-9]+}}(%rsp), %xmm0 1811; SSE-NEXT: addps %xmm2, %xmm0 1812; SSE-NEXT: movaps %xmm4, %xmm1 1813; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,2],xmm4[2,2] 1814; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm2 1815; SSE-NEXT: mulps %xmm1, %xmm2 1816; SSE-NEXT: addps %xmm0, %xmm2 1817; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm11 1818; SSE-NEXT: mulps %xmm11, %xmm1 1819; SSE-NEXT: addps %xmm3, %xmm1 1820; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,3,3,3] 1821; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 1822; SSE-NEXT: mulps %xmm4, %xmm0 1823; SSE-NEXT: addps %xmm1, %xmm0 1824; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1825; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 1826; SSE-NEXT: mulps %xmm0, %xmm4 1827; SSE-NEXT: addps %xmm2, %xmm4 1828; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1829; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 1830; SSE-NEXT: movaps %xmm0, %xmm1 1831; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,0] 1832; SSE-NEXT: movaps %xmm9, %xmm2 1833; SSE-NEXT: mulps %xmm1, %xmm2 1834; SSE-NEXT: movaps %xmm0, %xmm3 1835; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm0[1,1] 1836; SSE-NEXT: movaps %xmm12, %xmm4 1837; SSE-NEXT: mulps %xmm3, %xmm4 1838; SSE-NEXT: addps %xmm2, %xmm4 1839; SSE-NEXT: movaps %xmm10, %xmm15 1840; SSE-NEXT: mulps %xmm10, %xmm1 1841; SSE-NEXT: mulps %xmm13, %xmm3 1842; SSE-NEXT: addps %xmm1, %xmm3 1843; SSE-NEXT: movaps %xmm0, %xmm1 1844; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,2],xmm0[2,2] 1845; SSE-NEXT: movaps %xmm5, %xmm2 1846; SSE-NEXT: mulps %xmm1, %xmm2 1847; SSE-NEXT: addps %xmm3, %xmm2 1848; SSE-NEXT: mulps %xmm8, %xmm1 1849; SSE-NEXT: addps %xmm4, %xmm1 1850; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] 1851; SSE-NEXT: movaps %xmm7, %xmm4 1852; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1853; SSE-NEXT: movaps %xmm7, %xmm3 1854; SSE-NEXT: mulps %xmm0, %xmm3 1855; SSE-NEXT: addps %xmm1, %xmm3 1856; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload 1857; SSE-NEXT: mulps %xmm6, %xmm0 1858; SSE-NEXT: addps %xmm2, %xmm0 1859; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm10 1860; SSE-NEXT: movaps %xmm10, %xmm1 1861; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm10[0,0] 1862; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm2 1863; SSE-NEXT: mulps %xmm1, %xmm2 1864; SSE-NEXT: addps %xmm0, %xmm2 1865; SSE-NEXT: mulps %xmm14, %xmm1 1866; SSE-NEXT: addps %xmm3, %xmm1 1867; SSE-NEXT: movaps %xmm10, %xmm0 1868; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm10[1,1] 1869; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm3 1870; SSE-NEXT: mulps %xmm0, %xmm3 1871; SSE-NEXT: addps %xmm1, %xmm3 1872; SSE-NEXT: mulps {{[0-9]+}}(%rsp), %xmm0 1873; SSE-NEXT: addps %xmm2, %xmm0 1874; SSE-NEXT: movaps %xmm10, %xmm1 1875; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,2],xmm10[2,2] 1876; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm2 1877; SSE-NEXT: mulps %xmm1, %xmm2 1878; SSE-NEXT: addps %xmm0, %xmm2 1879; SSE-NEXT: mulps %xmm11, %xmm1 1880; SSE-NEXT: addps %xmm3, %xmm1 1881; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[3,3,3,3] 1882; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm11 1883; SSE-NEXT: movaps %xmm11, %xmm0 1884; SSE-NEXT: mulps %xmm10, %xmm0 1885; SSE-NEXT: addps %xmm1, %xmm0 1886; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1887; SSE-NEXT: mulps {{[0-9]+}}(%rsp), %xmm10 1888; SSE-NEXT: addps %xmm2, %xmm10 1889; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 1890; SSE-NEXT: movaps %xmm0, %xmm1 1891; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,0] 1892; SSE-NEXT: movaps %xmm9, %xmm2 1893; SSE-NEXT: 
movaps %xmm9, %xmm14 1894; SSE-NEXT: mulps %xmm1, %xmm2 1895; SSE-NEXT: movaps %xmm0, %xmm3 1896; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm0[1,1] 1897; SSE-NEXT: movaps %xmm12, %xmm7 1898; SSE-NEXT: mulps %xmm3, %xmm7 1899; SSE-NEXT: addps %xmm2, %xmm7 1900; SSE-NEXT: mulps %xmm15, %xmm1 1901; SSE-NEXT: mulps %xmm13, %xmm3 1902; SSE-NEXT: addps %xmm1, %xmm3 1903; SSE-NEXT: movaps %xmm0, %xmm1 1904; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,2],xmm0[2,2] 1905; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1906; SSE-NEXT: movaps %xmm5, %xmm2 1907; SSE-NEXT: mulps %xmm1, %xmm2 1908; SSE-NEXT: addps %xmm3, %xmm2 1909; SSE-NEXT: movaps %xmm8, %xmm9 1910; SSE-NEXT: mulps %xmm8, %xmm1 1911; SSE-NEXT: addps %xmm7, %xmm1 1912; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] 1913; SSE-NEXT: movaps %xmm4, %xmm7 1914; SSE-NEXT: mulps %xmm0, %xmm7 1915; SSE-NEXT: addps %xmm1, %xmm7 1916; SSE-NEXT: movaps %xmm6, %xmm3 1917; SSE-NEXT: mulps %xmm6, %xmm0 1918; SSE-NEXT: addps %xmm2, %xmm0 1919; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm4 1920; SSE-NEXT: movaps %xmm4, %xmm1 1921; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm4[0,0] 1922; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm2 1923; SSE-NEXT: mulps %xmm1, %xmm2 1924; SSE-NEXT: addps %xmm0, %xmm2 1925; SSE-NEXT: mulps {{[0-9]+}}(%rsp), %xmm1 1926; SSE-NEXT: addps %xmm7, %xmm1 1927; SSE-NEXT: movaps %xmm4, %xmm0 1928; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm4[1,1] 1929; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm7 1930; SSE-NEXT: mulps %xmm0, %xmm7 1931; SSE-NEXT: addps %xmm1, %xmm7 1932; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1 1933; SSE-NEXT: mulps %xmm1, %xmm0 1934; SSE-NEXT: addps %xmm2, %xmm0 1935; SSE-NEXT: movaps %xmm4, %xmm1 1936; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,2],xmm4[2,2] 1937; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm2 1938; SSE-NEXT: mulps %xmm1, %xmm2 1939; SSE-NEXT: addps %xmm0, %xmm2 1940; SSE-NEXT: mulps {{[0-9]+}}(%rsp), %xmm1 1941; SSE-NEXT: addps %xmm7, %xmm1 1942; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,3,3,3] 1943; SSE-NEXT: movaps %xmm11, %xmm0 1944; SSE-NEXT: mulps %xmm4, %xmm0 1945; SSE-NEXT: addps %xmm1, %xmm0 1946; SSE-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill 1947; SSE-NEXT: mulps {{[0-9]+}}(%rsp), %xmm4 1948; SSE-NEXT: addps %xmm2, %xmm4 1949; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 1950; SSE-NEXT: movaps %xmm0, %xmm1 1951; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,0] 1952; SSE-NEXT: movaps %xmm14, %xmm6 1953; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1954; SSE-NEXT: movaps %xmm14, %xmm2 1955; SSE-NEXT: mulps %xmm1, %xmm2 1956; SSE-NEXT: movaps %xmm0, %xmm14 1957; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[1,1],xmm0[1,1] 1958; SSE-NEXT: movaps %xmm12, %xmm15 1959; SSE-NEXT: movaps %xmm12, %xmm13 1960; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1961; SSE-NEXT: mulps %xmm14, %xmm15 1962; SSE-NEXT: addps %xmm2, %xmm15 1963; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload 1964; SSE-NEXT: mulps %xmm8, %xmm1 1965; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload 1966; SSE-NEXT: mulps %xmm7, %xmm14 1967; SSE-NEXT: addps %xmm1, %xmm14 1968; SSE-NEXT: movaps %xmm0, %xmm1 1969; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,2],xmm0[2,2] 1970; SSE-NEXT: movaps %xmm5, %xmm2 1971; SSE-NEXT: mulps %xmm1, %xmm2 1972; SSE-NEXT: addps %xmm14, %xmm2 1973; SSE-NEXT: mulps %xmm9, %xmm1 1974; SSE-NEXT: movaps %xmm9, %xmm11 1975; SSE-NEXT: addps %xmm15, %xmm1 1976; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] 1977; SSE-NEXT: movaps 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload 1978; SSE-NEXT: movaps %xmm5, %xmm14 1979; SSE-NEXT: mulps %xmm0, %xmm14 1980; SSE-NEXT: addps %xmm1, %xmm14 1981; SSE-NEXT: mulps %xmm3, %xmm0 1982; SSE-NEXT: movaps %xmm3, %xmm12 1983; SSE-NEXT: addps %xmm2, %xmm0 1984; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm3 1985; SSE-NEXT: movaps %xmm3, %xmm1 1986; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm3[0,0] 1987; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm15 1988; SSE-NEXT: mulps %xmm1, %xmm15 1989; SSE-NEXT: addps %xmm0, %xmm15 1990; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 1991; SSE-NEXT: mulps %xmm0, %xmm1 1992; SSE-NEXT: addps %xmm14, %xmm1 1993; SSE-NEXT: movaps %xmm3, %xmm0 1994; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm3[1,1] 1995; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm14 1996; SSE-NEXT: mulps %xmm0, %xmm14 1997; SSE-NEXT: addps %xmm1, %xmm14 1998; SSE-NEXT: mulps {{[0-9]+}}(%rsp), %xmm0 1999; SSE-NEXT: addps %xmm15, %xmm0 2000; SSE-NEXT: movaps %xmm3, %xmm1 2001; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,2],xmm3[2,2] 2002; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm15 2003; SSE-NEXT: mulps %xmm1, %xmm15 2004; SSE-NEXT: addps %xmm0, %xmm15 2005; SSE-NEXT: mulps {{[0-9]+}}(%rsp), %xmm1 2006; SSE-NEXT: addps %xmm14, %xmm1 2007; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3,3,3] 2008; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm14 2009; SSE-NEXT: mulps %xmm3, %xmm14 2010; SSE-NEXT: addps %xmm1, %xmm14 2011; SSE-NEXT: mulps {{[0-9]+}}(%rsp), %xmm3 2012; SSE-NEXT: addps %xmm15, %xmm3 2013; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 2014; SSE-NEXT: movaps %xmm0, %xmm1 2015; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,0] 2016; SSE-NEXT: mulps %xmm1, %xmm6 2017; SSE-NEXT: movaps %xmm0, %xmm15 2018; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[1,1],xmm0[1,1] 2019; SSE-NEXT: mulps %xmm15, %xmm13 2020; SSE-NEXT: addps %xmm6, %xmm13 2021; SSE-NEXT: mulps %xmm8, %xmm1 2022; SSE-NEXT: mulps %xmm7, %xmm15 2023; SSE-NEXT: addps %xmm1, %xmm15 2024; SSE-NEXT: movaps %xmm0, %xmm1 2025; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,2],xmm0[2,2] 2026; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload 2027; SSE-NEXT: movaps %xmm6, %xmm2 2028; SSE-NEXT: mulps %xmm1, %xmm2 2029; SSE-NEXT: addps %xmm15, %xmm2 2030; SSE-NEXT: mulps %xmm9, %xmm1 2031; SSE-NEXT: addps %xmm13, %xmm1 2032; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] 2033; SSE-NEXT: movaps %xmm5, %xmm9 2034; SSE-NEXT: mulps %xmm0, %xmm9 2035; SSE-NEXT: addps %xmm1, %xmm9 2036; SSE-NEXT: mulps %xmm12, %xmm0 2037; SSE-NEXT: movaps %xmm12, %xmm5 2038; SSE-NEXT: addps %xmm2, %xmm0 2039; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1 2040; SSE-NEXT: movaps %xmm1, %xmm2 2041; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm1[0,0] 2042; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm15 2043; SSE-NEXT: mulps %xmm2, %xmm15 2044; SSE-NEXT: addps %xmm0, %xmm15 2045; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 2046; SSE-NEXT: mulps %xmm0, %xmm2 2047; SSE-NEXT: addps %xmm9, %xmm2 2048; SSE-NEXT: movaps %xmm1, %xmm0 2049; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[1,1] 2050; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm9 2051; SSE-NEXT: mulps %xmm0, %xmm9 2052; SSE-NEXT: addps %xmm2, %xmm9 2053; SSE-NEXT: mulps {{[0-9]+}}(%rsp), %xmm0 2054; SSE-NEXT: addps %xmm15, %xmm0 2055; SSE-NEXT: movaps %xmm1, %xmm2 2056; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,2],xmm1[2,2] 2057; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm13 2058; SSE-NEXT: mulps %xmm2, %xmm13 2059; SSE-NEXT: addps %xmm0, %xmm13 2060; SSE-NEXT: mulps {{[0-9]+}}(%rsp), %xmm2 2061; SSE-NEXT: addps %xmm9, %xmm2 2062; SSE-NEXT: shufps 
{{.*#+}} xmm1 = xmm1[3,3,3,3] 2063; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm15 2064; SSE-NEXT: mulps %xmm1, %xmm15 2065; SSE-NEXT: addps %xmm2, %xmm15 2066; SSE-NEXT: mulps {{[0-9]+}}(%rsp), %xmm1 2067; SSE-NEXT: addps %xmm13, %xmm1 2068; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 2069; SSE-NEXT: movaps %xmm0, %xmm2 2070; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm0[0,0] 2071; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload 2072; SSE-NEXT: mulps %xmm2, %xmm13 2073; SSE-NEXT: mulps %xmm8, %xmm2 2074; SSE-NEXT: movaps %xmm0, %xmm9 2075; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[1,1],xmm0[1,1] 2076; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload 2077; SSE-NEXT: mulps %xmm9, %xmm8 2078; SSE-NEXT: addps %xmm13, %xmm8 2079; SSE-NEXT: mulps %xmm7, %xmm9 2080; SSE-NEXT: addps %xmm2, %xmm9 2081; SSE-NEXT: movaps %xmm0, %xmm2 2082; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,2],xmm0[2,2] 2083; SSE-NEXT: mulps %xmm2, %xmm6 2084; SSE-NEXT: addps %xmm9, %xmm6 2085; SSE-NEXT: mulps %xmm11, %xmm2 2086; SSE-NEXT: addps %xmm8, %xmm2 2087; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] 2088; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload 2089; SSE-NEXT: mulps %xmm0, %xmm9 2090; SSE-NEXT: addps %xmm2, %xmm9 2091; SSE-NEXT: movaps %xmm9, %xmm12 2092; SSE-NEXT: mulps %xmm5, %xmm0 2093; SSE-NEXT: addps %xmm6, %xmm0 2094; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm9 2095; SSE-NEXT: movaps %xmm9, %xmm2 2096; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm9[0,0] 2097; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm13 2098; SSE-NEXT: mulps %xmm2, %xmm13 2099; SSE-NEXT: addps %xmm0, %xmm13 2100; SSE-NEXT: mulps {{[0-9]+}}(%rsp), %xmm2 2101; SSE-NEXT: addps %xmm12, %xmm2 2102; SSE-NEXT: movaps %xmm9, %xmm0 2103; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm9[1,1] 2104; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm12 2105; SSE-NEXT: mulps %xmm0, %xmm12 2106; SSE-NEXT: addps %xmm2, %xmm12 2107; SSE-NEXT: mulps {{[0-9]+}}(%rsp), %xmm0 2108; SSE-NEXT: addps %xmm13, %xmm0 2109; SSE-NEXT: movaps %xmm9, %xmm2 2110; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,2],xmm9[2,2] 2111; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm5 2112; SSE-NEXT: mulps %xmm2, %xmm5 2113; SSE-NEXT: addps %xmm0, %xmm5 2114; SSE-NEXT: mulps {{[0-9]+}}(%rsp), %xmm2 2115; SSE-NEXT: addps %xmm12, %xmm2 2116; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[3,3,3,3] 2117; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 2118; SSE-NEXT: mulps %xmm9, %xmm0 2119; SSE-NEXT: addps %xmm2, %xmm0 2120; SSE-NEXT: mulps {{[0-9]+}}(%rsp), %xmm9 2121; SSE-NEXT: addps %xmm5, %xmm9 2122; SSE-NEXT: movaps %xmm0, 240(%rdi) 2123; SSE-NEXT: movaps %xmm9, 224(%rdi) 2124; SSE-NEXT: movaps %xmm15, 208(%rdi) 2125; SSE-NEXT: movaps %xmm1, 192(%rdi) 2126; SSE-NEXT: movaps %xmm14, 176(%rdi) 2127; SSE-NEXT: movaps %xmm3, 160(%rdi) 2128; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload 2129; SSE-NEXT: movaps %xmm0, 144(%rdi) 2130; SSE-NEXT: movaps %xmm4, 128(%rdi) 2131; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 2132; SSE-NEXT: movaps %xmm0, 112(%rdi) 2133; SSE-NEXT: movaps %xmm10, 96(%rdi) 2134; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 2135; SSE-NEXT: movaps %xmm0, 80(%rdi) 2136; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 2137; SSE-NEXT: movaps %xmm0, 64(%rdi) 2138; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 2139; SSE-NEXT: movaps %xmm0, 48(%rdi) 2140; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 2141; SSE-NEXT: movaps %xmm0, 32(%rdi) 2142; SSE-NEXT: movaps 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 2143; SSE-NEXT: movaps %xmm0, 16(%rdi) 2144; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 2145; SSE-NEXT: movaps %xmm0, (%rdi) 2146; SSE-NEXT: addq $120, %rsp 2147; SSE-NEXT: retq 2148; 2149; AVX1-LABEL: test_mul8x8_f32: 2150; AVX1: # %bb.0: # %entry 2151; AVX1-NEXT: pushq %rbp 2152; AVX1-NEXT: movq %rsp, %rbp 2153; AVX1-NEXT: andq $-32, %rsp 2154; AVX1-NEXT: subq $32, %rsp 2155; AVX1-NEXT: movq %rdi, %rax 2156; AVX1-NEXT: vbroadcastss 16(%rbp), %ymm8 2157; AVX1-NEXT: vmulps %ymm0, %ymm8, %ymm8 2158; AVX1-NEXT: vbroadcastss 20(%rbp), %ymm9 2159; AVX1-NEXT: vmulps %ymm1, %ymm9, %ymm9 2160; AVX1-NEXT: vaddps %ymm9, %ymm8, %ymm8 2161; AVX1-NEXT: vbroadcastss 24(%rbp), %ymm9 2162; AVX1-NEXT: vmulps %ymm2, %ymm9, %ymm9 2163; AVX1-NEXT: vaddps %ymm9, %ymm8, %ymm8 2164; AVX1-NEXT: vbroadcastss 28(%rbp), %ymm9 2165; AVX1-NEXT: vmulps %ymm3, %ymm9, %ymm9 2166; AVX1-NEXT: vaddps %ymm9, %ymm8, %ymm8 2167; AVX1-NEXT: vbroadcastss 32(%rbp), %ymm9 2168; AVX1-NEXT: vmulps %ymm4, %ymm9, %ymm9 2169; AVX1-NEXT: vaddps %ymm9, %ymm8, %ymm8 2170; AVX1-NEXT: vbroadcastss 36(%rbp), %ymm9 2171; AVX1-NEXT: vmulps %ymm5, %ymm9, %ymm9 2172; AVX1-NEXT: vaddps %ymm9, %ymm8, %ymm8 2173; AVX1-NEXT: vbroadcastss 40(%rbp), %ymm9 2174; AVX1-NEXT: vmulps %ymm6, %ymm9, %ymm9 2175; AVX1-NEXT: vaddps %ymm9, %ymm8, %ymm8 2176; AVX1-NEXT: vbroadcastss 44(%rbp), %ymm9 2177; AVX1-NEXT: vmulps %ymm7, %ymm9, %ymm9 2178; AVX1-NEXT: vaddps %ymm9, %ymm8, %ymm8 2179; AVX1-NEXT: vbroadcastss 48(%rbp), %ymm9 2180; AVX1-NEXT: vmulps %ymm0, %ymm9, %ymm9 2181; AVX1-NEXT: vbroadcastss 52(%rbp), %ymm10 2182; AVX1-NEXT: vmulps %ymm1, %ymm10, %ymm10 2183; AVX1-NEXT: vaddps %ymm10, %ymm9, %ymm9 2184; AVX1-NEXT: vbroadcastss 56(%rbp), %ymm10 2185; AVX1-NEXT: vmulps %ymm2, %ymm10, %ymm10 2186; AVX1-NEXT: vaddps %ymm10, %ymm9, %ymm9 2187; AVX1-NEXT: vbroadcastss 60(%rbp), %ymm10 2188; AVX1-NEXT: vmulps %ymm3, %ymm10, %ymm10 2189; AVX1-NEXT: vaddps %ymm10, %ymm9, %ymm9 2190; AVX1-NEXT: vbroadcastss 64(%rbp), %ymm10 2191; AVX1-NEXT: vmulps %ymm4, %ymm10, %ymm10 2192; AVX1-NEXT: vaddps %ymm10, %ymm9, %ymm9 2193; AVX1-NEXT: vbroadcastss 68(%rbp), %ymm10 2194; AVX1-NEXT: vmulps %ymm5, %ymm10, %ymm10 2195; AVX1-NEXT: vaddps %ymm10, %ymm9, %ymm9 2196; AVX1-NEXT: vbroadcastss 72(%rbp), %ymm10 2197; AVX1-NEXT: vmulps %ymm6, %ymm10, %ymm10 2198; AVX1-NEXT: vaddps %ymm10, %ymm9, %ymm9 2199; AVX1-NEXT: vbroadcastss 76(%rbp), %ymm10 2200; AVX1-NEXT: vmulps %ymm7, %ymm10, %ymm10 2201; AVX1-NEXT: vaddps %ymm10, %ymm9, %ymm9 2202; AVX1-NEXT: vbroadcastss 80(%rbp), %ymm10 2203; AVX1-NEXT: vmulps %ymm0, %ymm10, %ymm10 2204; AVX1-NEXT: vbroadcastss 84(%rbp), %ymm11 2205; AVX1-NEXT: vmulps %ymm1, %ymm11, %ymm11 2206; AVX1-NEXT: vaddps %ymm11, %ymm10, %ymm10 2207; AVX1-NEXT: vbroadcastss 88(%rbp), %ymm11 2208; AVX1-NEXT: vmulps %ymm2, %ymm11, %ymm11 2209; AVX1-NEXT: vaddps %ymm11, %ymm10, %ymm10 2210; AVX1-NEXT: vbroadcastss 92(%rbp), %ymm11 2211; AVX1-NEXT: vmulps %ymm3, %ymm11, %ymm11 2212; AVX1-NEXT: vaddps %ymm11, %ymm10, %ymm10 2213; AVX1-NEXT: vbroadcastss 96(%rbp), %ymm11 2214; AVX1-NEXT: vmulps %ymm4, %ymm11, %ymm11 2215; AVX1-NEXT: vaddps %ymm11, %ymm10, %ymm10 2216; AVX1-NEXT: vbroadcastss 100(%rbp), %ymm11 2217; AVX1-NEXT: vmulps %ymm5, %ymm11, %ymm11 2218; AVX1-NEXT: vaddps %ymm11, %ymm10, %ymm10 2219; AVX1-NEXT: vbroadcastss 104(%rbp), %ymm11 2220; AVX1-NEXT: vmulps %ymm6, %ymm11, %ymm11 2221; AVX1-NEXT: vaddps %ymm11, %ymm10, %ymm10 2222; AVX1-NEXT: vbroadcastss 108(%rbp), %ymm11 2223; 
AVX1-NEXT: vmulps %ymm7, %ymm11, %ymm11 2224; AVX1-NEXT: vaddps %ymm11, %ymm10, %ymm10 2225; AVX1-NEXT: vbroadcastss 112(%rbp), %ymm11 2226; AVX1-NEXT: vmulps %ymm0, %ymm11, %ymm11 2227; AVX1-NEXT: vbroadcastss 116(%rbp), %ymm12 2228; AVX1-NEXT: vmulps %ymm1, %ymm12, %ymm12 2229; AVX1-NEXT: vaddps %ymm12, %ymm11, %ymm11 2230; AVX1-NEXT: vbroadcastss 120(%rbp), %ymm12 2231; AVX1-NEXT: vmulps %ymm2, %ymm12, %ymm12 2232; AVX1-NEXT: vaddps %ymm12, %ymm11, %ymm11 2233; AVX1-NEXT: vbroadcastss 124(%rbp), %ymm12 2234; AVX1-NEXT: vmulps %ymm3, %ymm12, %ymm12 2235; AVX1-NEXT: vaddps %ymm12, %ymm11, %ymm11 2236; AVX1-NEXT: vbroadcastss 128(%rbp), %ymm12 2237; AVX1-NEXT: vmulps %ymm4, %ymm12, %ymm12 2238; AVX1-NEXT: vaddps %ymm12, %ymm11, %ymm11 2239; AVX1-NEXT: vbroadcastss 132(%rbp), %ymm12 2240; AVX1-NEXT: vmulps %ymm5, %ymm12, %ymm12 2241; AVX1-NEXT: vaddps %ymm12, %ymm11, %ymm11 2242; AVX1-NEXT: vbroadcastss 136(%rbp), %ymm12 2243; AVX1-NEXT: vmulps %ymm6, %ymm12, %ymm12 2244; AVX1-NEXT: vaddps %ymm12, %ymm11, %ymm11 2245; AVX1-NEXT: vbroadcastss 140(%rbp), %ymm12 2246; AVX1-NEXT: vmulps %ymm7, %ymm12, %ymm12 2247; AVX1-NEXT: vaddps %ymm12, %ymm11, %ymm11 2248; AVX1-NEXT: vbroadcastss 144(%rbp), %ymm12 2249; AVX1-NEXT: vmulps %ymm0, %ymm12, %ymm12 2250; AVX1-NEXT: vbroadcastss 148(%rbp), %ymm13 2251; AVX1-NEXT: vmulps %ymm1, %ymm13, %ymm13 2252; AVX1-NEXT: vaddps %ymm13, %ymm12, %ymm12 2253; AVX1-NEXT: vbroadcastss 152(%rbp), %ymm13 2254; AVX1-NEXT: vmulps %ymm2, %ymm13, %ymm13 2255; AVX1-NEXT: vaddps %ymm13, %ymm12, %ymm12 2256; AVX1-NEXT: vbroadcastss 156(%rbp), %ymm13 2257; AVX1-NEXT: vmulps %ymm3, %ymm13, %ymm13 2258; AVX1-NEXT: vaddps %ymm13, %ymm12, %ymm12 2259; AVX1-NEXT: vbroadcastss 160(%rbp), %ymm13 2260; AVX1-NEXT: vmulps %ymm4, %ymm13, %ymm13 2261; AVX1-NEXT: vaddps %ymm13, %ymm12, %ymm12 2262; AVX1-NEXT: vbroadcastss 164(%rbp), %ymm13 2263; AVX1-NEXT: vmulps %ymm5, %ymm13, %ymm13 2264; AVX1-NEXT: vaddps %ymm13, %ymm12, %ymm12 2265; AVX1-NEXT: vbroadcastss 168(%rbp), %ymm13 2266; AVX1-NEXT: vmulps %ymm6, %ymm13, %ymm13 2267; AVX1-NEXT: vaddps %ymm13, %ymm12, %ymm12 2268; AVX1-NEXT: vbroadcastss 172(%rbp), %ymm13 2269; AVX1-NEXT: vmulps %ymm7, %ymm13, %ymm13 2270; AVX1-NEXT: vaddps %ymm13, %ymm12, %ymm12 2271; AVX1-NEXT: vbroadcastss 176(%rbp), %ymm13 2272; AVX1-NEXT: vmulps %ymm0, %ymm13, %ymm13 2273; AVX1-NEXT: vbroadcastss 180(%rbp), %ymm14 2274; AVX1-NEXT: vmulps %ymm1, %ymm14, %ymm14 2275; AVX1-NEXT: vaddps %ymm14, %ymm13, %ymm13 2276; AVX1-NEXT: vbroadcastss 184(%rbp), %ymm14 2277; AVX1-NEXT: vmulps %ymm2, %ymm14, %ymm14 2278; AVX1-NEXT: vaddps %ymm14, %ymm13, %ymm13 2279; AVX1-NEXT: vbroadcastss 188(%rbp), %ymm14 2280; AVX1-NEXT: vmulps %ymm3, %ymm14, %ymm14 2281; AVX1-NEXT: vaddps %ymm14, %ymm13, %ymm13 2282; AVX1-NEXT: vbroadcastss 192(%rbp), %ymm14 2283; AVX1-NEXT: vmulps %ymm4, %ymm14, %ymm14 2284; AVX1-NEXT: vaddps %ymm14, %ymm13, %ymm13 2285; AVX1-NEXT: vbroadcastss 196(%rbp), %ymm14 2286; AVX1-NEXT: vmulps %ymm5, %ymm14, %ymm14 2287; AVX1-NEXT: vaddps %ymm14, %ymm13, %ymm13 2288; AVX1-NEXT: vbroadcastss 200(%rbp), %ymm14 2289; AVX1-NEXT: vmulps %ymm6, %ymm14, %ymm14 2290; AVX1-NEXT: vaddps %ymm14, %ymm13, %ymm13 2291; AVX1-NEXT: vbroadcastss 204(%rbp), %ymm14 2292; AVX1-NEXT: vmulps %ymm7, %ymm14, %ymm14 2293; AVX1-NEXT: vaddps %ymm14, %ymm13, %ymm13 2294; AVX1-NEXT: vbroadcastss 208(%rbp), %ymm14 2295; AVX1-NEXT: vmulps %ymm0, %ymm14, %ymm14 2296; AVX1-NEXT: vbroadcastss 212(%rbp), %ymm15 2297; AVX1-NEXT: vmulps %ymm1, %ymm15, %ymm15 2298; AVX1-NEXT: vaddps %ymm15, 
%ymm14, %ymm14 2299; AVX1-NEXT: vbroadcastss 216(%rbp), %ymm15 2300; AVX1-NEXT: vmulps %ymm2, %ymm15, %ymm15 2301; AVX1-NEXT: vaddps %ymm15, %ymm14, %ymm14 2302; AVX1-NEXT: vbroadcastss 220(%rbp), %ymm15 2303; AVX1-NEXT: vmulps %ymm3, %ymm15, %ymm15 2304; AVX1-NEXT: vaddps %ymm15, %ymm14, %ymm14 2305; AVX1-NEXT: vbroadcastss 224(%rbp), %ymm15 2306; AVX1-NEXT: vmulps %ymm4, %ymm15, %ymm15 2307; AVX1-NEXT: vaddps %ymm15, %ymm14, %ymm14 2308; AVX1-NEXT: vbroadcastss 228(%rbp), %ymm15 2309; AVX1-NEXT: vmulps %ymm5, %ymm15, %ymm15 2310; AVX1-NEXT: vaddps %ymm15, %ymm14, %ymm14 2311; AVX1-NEXT: vbroadcastss 232(%rbp), %ymm15 2312; AVX1-NEXT: vmulps %ymm6, %ymm15, %ymm15 2313; AVX1-NEXT: vaddps %ymm15, %ymm14, %ymm14 2314; AVX1-NEXT: vbroadcastss 236(%rbp), %ymm15 2315; AVX1-NEXT: vmulps %ymm7, %ymm15, %ymm15 2316; AVX1-NEXT: vaddps %ymm15, %ymm14, %ymm14 2317; AVX1-NEXT: vbroadcastss 240(%rbp), %ymm15 2318; AVX1-NEXT: vmulps %ymm0, %ymm15, %ymm0 2319; AVX1-NEXT: vbroadcastss 244(%rbp), %ymm15 2320; AVX1-NEXT: vmulps %ymm1, %ymm15, %ymm1 2321; AVX1-NEXT: vaddps %ymm1, %ymm0, %ymm0 2322; AVX1-NEXT: vbroadcastss 248(%rbp), %ymm1 2323; AVX1-NEXT: vmulps %ymm1, %ymm2, %ymm1 2324; AVX1-NEXT: vaddps %ymm1, %ymm0, %ymm0 2325; AVX1-NEXT: vbroadcastss 252(%rbp), %ymm1 2326; AVX1-NEXT: vmulps %ymm1, %ymm3, %ymm1 2327; AVX1-NEXT: vaddps %ymm1, %ymm0, %ymm0 2328; AVX1-NEXT: vbroadcastss 256(%rbp), %ymm1 2329; AVX1-NEXT: vmulps %ymm1, %ymm4, %ymm1 2330; AVX1-NEXT: vaddps %ymm1, %ymm0, %ymm0 2331; AVX1-NEXT: vbroadcastss 260(%rbp), %ymm1 2332; AVX1-NEXT: vmulps %ymm1, %ymm5, %ymm1 2333; AVX1-NEXT: vaddps %ymm1, %ymm0, %ymm0 2334; AVX1-NEXT: vbroadcastss 264(%rbp), %ymm1 2335; AVX1-NEXT: vmulps %ymm1, %ymm6, %ymm1 2336; AVX1-NEXT: vaddps %ymm1, %ymm0, %ymm0 2337; AVX1-NEXT: vbroadcastss 268(%rbp), %ymm1 2338; AVX1-NEXT: vmulps %ymm1, %ymm7, %ymm1 2339; AVX1-NEXT: vaddps %ymm1, %ymm0, %ymm0 2340; AVX1-NEXT: vmovaps %ymm0, 224(%rdi) 2341; AVX1-NEXT: vmovaps %ymm14, 192(%rdi) 2342; AVX1-NEXT: vmovaps %ymm13, 160(%rdi) 2343; AVX1-NEXT: vmovaps %ymm12, 128(%rdi) 2344; AVX1-NEXT: vmovaps %ymm11, 96(%rdi) 2345; AVX1-NEXT: vmovaps %ymm10, 64(%rdi) 2346; AVX1-NEXT: vmovaps %ymm9, 32(%rdi) 2347; AVX1-NEXT: vmovaps %ymm8, (%rdi) 2348; AVX1-NEXT: movq %rbp, %rsp 2349; AVX1-NEXT: popq %rbp 2350; AVX1-NEXT: vzeroupper 2351; AVX1-NEXT: retq 2352; 2353; AVX2-LABEL: test_mul8x8_f32: 2354; AVX2: # %bb.0: # %entry 2355; AVX2-NEXT: pushq %rbp 2356; AVX2-NEXT: movq %rsp, %rbp 2357; AVX2-NEXT: andq $-32, %rsp 2358; AVX2-NEXT: subq $32, %rsp 2359; AVX2-NEXT: movq %rdi, %rax 2360; AVX2-NEXT: vbroadcastss 16(%rbp), %ymm8 2361; AVX2-NEXT: vmulps %ymm0, %ymm8, %ymm8 2362; AVX2-NEXT: vbroadcastss 20(%rbp), %ymm9 2363; AVX2-NEXT: vmulps %ymm1, %ymm9, %ymm9 2364; AVX2-NEXT: vaddps %ymm9, %ymm8, %ymm8 2365; AVX2-NEXT: vbroadcastss 24(%rbp), %ymm9 2366; AVX2-NEXT: vmulps %ymm2, %ymm9, %ymm9 2367; AVX2-NEXT: vaddps %ymm9, %ymm8, %ymm8 2368; AVX2-NEXT: vbroadcastss 28(%rbp), %ymm9 2369; AVX2-NEXT: vmulps %ymm3, %ymm9, %ymm9 2370; AVX2-NEXT: vaddps %ymm9, %ymm8, %ymm8 2371; AVX2-NEXT: vbroadcastss 32(%rbp), %ymm9 2372; AVX2-NEXT: vmulps %ymm4, %ymm9, %ymm9 2373; AVX2-NEXT: vaddps %ymm9, %ymm8, %ymm8 2374; AVX2-NEXT: vbroadcastss 36(%rbp), %ymm9 2375; AVX2-NEXT: vmulps %ymm5, %ymm9, %ymm9 2376; AVX2-NEXT: vaddps %ymm9, %ymm8, %ymm8 2377; AVX2-NEXT: vbroadcastss 40(%rbp), %ymm9 2378; AVX2-NEXT: vmulps %ymm6, %ymm9, %ymm9 2379; AVX2-NEXT: vaddps %ymm9, %ymm8, %ymm8 2380; AVX2-NEXT: vbroadcastss 44(%rbp), %ymm9 2381; AVX2-NEXT: vmulps %ymm7, %ymm9, 
%ymm9 2382; AVX2-NEXT: vaddps %ymm9, %ymm8, %ymm8 2383; AVX2-NEXT: vbroadcastss 48(%rbp), %ymm9 2384; AVX2-NEXT: vmulps %ymm0, %ymm9, %ymm9 2385; AVX2-NEXT: vbroadcastss 52(%rbp), %ymm10 2386; AVX2-NEXT: vmulps %ymm1, %ymm10, %ymm10 2387; AVX2-NEXT: vaddps %ymm10, %ymm9, %ymm9 2388; AVX2-NEXT: vbroadcastss 56(%rbp), %ymm10 2389; AVX2-NEXT: vmulps %ymm2, %ymm10, %ymm10 2390; AVX2-NEXT: vaddps %ymm10, %ymm9, %ymm9 2391; AVX2-NEXT: vbroadcastss 60(%rbp), %ymm10 2392; AVX2-NEXT: vmulps %ymm3, %ymm10, %ymm10 2393; AVX2-NEXT: vaddps %ymm10, %ymm9, %ymm9 2394; AVX2-NEXT: vbroadcastss 64(%rbp), %ymm10 2395; AVX2-NEXT: vmulps %ymm4, %ymm10, %ymm10 2396; AVX2-NEXT: vaddps %ymm10, %ymm9, %ymm9 2397; AVX2-NEXT: vbroadcastss 68(%rbp), %ymm10 2398; AVX2-NEXT: vmulps %ymm5, %ymm10, %ymm10 2399; AVX2-NEXT: vaddps %ymm10, %ymm9, %ymm9 2400; AVX2-NEXT: vbroadcastss 72(%rbp), %ymm10 2401; AVX2-NEXT: vmulps %ymm6, %ymm10, %ymm10 2402; AVX2-NEXT: vaddps %ymm10, %ymm9, %ymm9 2403; AVX2-NEXT: vbroadcastss 76(%rbp), %ymm10 2404; AVX2-NEXT: vmulps %ymm7, %ymm10, %ymm10 2405; AVX2-NEXT: vaddps %ymm10, %ymm9, %ymm9 2406; AVX2-NEXT: vbroadcastss 80(%rbp), %ymm10 2407; AVX2-NEXT: vmulps %ymm0, %ymm10, %ymm10 2408; AVX2-NEXT: vbroadcastss 84(%rbp), %ymm11 2409; AVX2-NEXT: vmulps %ymm1, %ymm11, %ymm11 2410; AVX2-NEXT: vaddps %ymm11, %ymm10, %ymm10 2411; AVX2-NEXT: vbroadcastss 88(%rbp), %ymm11 2412; AVX2-NEXT: vmulps %ymm2, %ymm11, %ymm11 2413; AVX2-NEXT: vaddps %ymm11, %ymm10, %ymm10 2414; AVX2-NEXT: vbroadcastss 92(%rbp), %ymm11 2415; AVX2-NEXT: vmulps %ymm3, %ymm11, %ymm11 2416; AVX2-NEXT: vaddps %ymm11, %ymm10, %ymm10 2417; AVX2-NEXT: vbroadcastss 96(%rbp), %ymm11 2418; AVX2-NEXT: vmulps %ymm4, %ymm11, %ymm11 2419; AVX2-NEXT: vaddps %ymm11, %ymm10, %ymm10 2420; AVX2-NEXT: vbroadcastss 100(%rbp), %ymm11 2421; AVX2-NEXT: vmulps %ymm5, %ymm11, %ymm11 2422; AVX2-NEXT: vaddps %ymm11, %ymm10, %ymm10 2423; AVX2-NEXT: vbroadcastss 104(%rbp), %ymm11 2424; AVX2-NEXT: vmulps %ymm6, %ymm11, %ymm11 2425; AVX2-NEXT: vaddps %ymm11, %ymm10, %ymm10 2426; AVX2-NEXT: vbroadcastss 108(%rbp), %ymm11 2427; AVX2-NEXT: vmulps %ymm7, %ymm11, %ymm11 2428; AVX2-NEXT: vaddps %ymm11, %ymm10, %ymm10 2429; AVX2-NEXT: vbroadcastss 112(%rbp), %ymm11 2430; AVX2-NEXT: vmulps %ymm0, %ymm11, %ymm11 2431; AVX2-NEXT: vbroadcastss 116(%rbp), %ymm12 2432; AVX2-NEXT: vmulps %ymm1, %ymm12, %ymm12 2433; AVX2-NEXT: vaddps %ymm12, %ymm11, %ymm11 2434; AVX2-NEXT: vbroadcastss 120(%rbp), %ymm12 2435; AVX2-NEXT: vmulps %ymm2, %ymm12, %ymm12 2436; AVX2-NEXT: vaddps %ymm12, %ymm11, %ymm11 2437; AVX2-NEXT: vbroadcastss 124(%rbp), %ymm12 2438; AVX2-NEXT: vmulps %ymm3, %ymm12, %ymm12 2439; AVX2-NEXT: vaddps %ymm12, %ymm11, %ymm11 2440; AVX2-NEXT: vbroadcastss 128(%rbp), %ymm12 2441; AVX2-NEXT: vmulps %ymm4, %ymm12, %ymm12 2442; AVX2-NEXT: vaddps %ymm12, %ymm11, %ymm11 2443; AVX2-NEXT: vbroadcastss 132(%rbp), %ymm12 2444; AVX2-NEXT: vmulps %ymm5, %ymm12, %ymm12 2445; AVX2-NEXT: vaddps %ymm12, %ymm11, %ymm11 2446; AVX2-NEXT: vbroadcastss 136(%rbp), %ymm12 2447; AVX2-NEXT: vmulps %ymm6, %ymm12, %ymm12 2448; AVX2-NEXT: vaddps %ymm12, %ymm11, %ymm11 2449; AVX2-NEXT: vbroadcastss 140(%rbp), %ymm12 2450; AVX2-NEXT: vmulps %ymm7, %ymm12, %ymm12 2451; AVX2-NEXT: vaddps %ymm12, %ymm11, %ymm11 2452; AVX2-NEXT: vbroadcastss 144(%rbp), %ymm12 2453; AVX2-NEXT: vmulps %ymm0, %ymm12, %ymm12 2454; AVX2-NEXT: vbroadcastss 148(%rbp), %ymm13 2455; AVX2-NEXT: vmulps %ymm1, %ymm13, %ymm13 2456; AVX2-NEXT: vaddps %ymm13, %ymm12, %ymm12 2457; AVX2-NEXT: vbroadcastss 152(%rbp), %ymm13 2458; 
AVX2-NEXT: vmulps %ymm2, %ymm13, %ymm13 2459; AVX2-NEXT: vaddps %ymm13, %ymm12, %ymm12 2460; AVX2-NEXT: vbroadcastss 156(%rbp), %ymm13 2461; AVX2-NEXT: vmulps %ymm3, %ymm13, %ymm13 2462; AVX2-NEXT: vaddps %ymm13, %ymm12, %ymm12 2463; AVX2-NEXT: vbroadcastss 160(%rbp), %ymm13 2464; AVX2-NEXT: vmulps %ymm4, %ymm13, %ymm13 2465; AVX2-NEXT: vaddps %ymm13, %ymm12, %ymm12 2466; AVX2-NEXT: vbroadcastss 164(%rbp), %ymm13 2467; AVX2-NEXT: vmulps %ymm5, %ymm13, %ymm13 2468; AVX2-NEXT: vaddps %ymm13, %ymm12, %ymm12 2469; AVX2-NEXT: vbroadcastss 168(%rbp), %ymm13 2470; AVX2-NEXT: vmulps %ymm6, %ymm13, %ymm13 2471; AVX2-NEXT: vaddps %ymm13, %ymm12, %ymm12 2472; AVX2-NEXT: vbroadcastss 172(%rbp), %ymm13 2473; AVX2-NEXT: vmulps %ymm7, %ymm13, %ymm13 2474; AVX2-NEXT: vaddps %ymm13, %ymm12, %ymm12 2475; AVX2-NEXT: vbroadcastss 176(%rbp), %ymm13 2476; AVX2-NEXT: vmulps %ymm0, %ymm13, %ymm13 2477; AVX2-NEXT: vbroadcastss 180(%rbp), %ymm14 2478; AVX2-NEXT: vmulps %ymm1, %ymm14, %ymm14 2479; AVX2-NEXT: vaddps %ymm14, %ymm13, %ymm13 2480; AVX2-NEXT: vbroadcastss 184(%rbp), %ymm14 2481; AVX2-NEXT: vmulps %ymm2, %ymm14, %ymm14 2482; AVX2-NEXT: vaddps %ymm14, %ymm13, %ymm13 2483; AVX2-NEXT: vbroadcastss 188(%rbp), %ymm14 2484; AVX2-NEXT: vmulps %ymm3, %ymm14, %ymm14 2485; AVX2-NEXT: vaddps %ymm14, %ymm13, %ymm13 2486; AVX2-NEXT: vbroadcastss 192(%rbp), %ymm14 2487; AVX2-NEXT: vmulps %ymm4, %ymm14, %ymm14 2488; AVX2-NEXT: vaddps %ymm14, %ymm13, %ymm13 2489; AVX2-NEXT: vbroadcastss 196(%rbp), %ymm14 2490; AVX2-NEXT: vmulps %ymm5, %ymm14, %ymm14 2491; AVX2-NEXT: vaddps %ymm14, %ymm13, %ymm13 2492; AVX2-NEXT: vbroadcastss 200(%rbp), %ymm14 2493; AVX2-NEXT: vmulps %ymm6, %ymm14, %ymm14 2494; AVX2-NEXT: vaddps %ymm14, %ymm13, %ymm13 2495; AVX2-NEXT: vbroadcastss 204(%rbp), %ymm14 2496; AVX2-NEXT: vmulps %ymm7, %ymm14, %ymm14 2497; AVX2-NEXT: vaddps %ymm14, %ymm13, %ymm13 2498; AVX2-NEXT: vbroadcastss 208(%rbp), %ymm14 2499; AVX2-NEXT: vmulps %ymm0, %ymm14, %ymm14 2500; AVX2-NEXT: vbroadcastss 212(%rbp), %ymm15 2501; AVX2-NEXT: vmulps %ymm1, %ymm15, %ymm15 2502; AVX2-NEXT: vaddps %ymm15, %ymm14, %ymm14 2503; AVX2-NEXT: vbroadcastss 216(%rbp), %ymm15 2504; AVX2-NEXT: vmulps %ymm2, %ymm15, %ymm15 2505; AVX2-NEXT: vaddps %ymm15, %ymm14, %ymm14 2506; AVX2-NEXT: vbroadcastss 220(%rbp), %ymm15 2507; AVX2-NEXT: vmulps %ymm3, %ymm15, %ymm15 2508; AVX2-NEXT: vaddps %ymm15, %ymm14, %ymm14 2509; AVX2-NEXT: vbroadcastss 224(%rbp), %ymm15 2510; AVX2-NEXT: vmulps %ymm4, %ymm15, %ymm15 2511; AVX2-NEXT: vaddps %ymm15, %ymm14, %ymm14 2512; AVX2-NEXT: vbroadcastss 228(%rbp), %ymm15 2513; AVX2-NEXT: vmulps %ymm5, %ymm15, %ymm15 2514; AVX2-NEXT: vaddps %ymm15, %ymm14, %ymm14 2515; AVX2-NEXT: vbroadcastss 232(%rbp), %ymm15 2516; AVX2-NEXT: vmulps %ymm6, %ymm15, %ymm15 2517; AVX2-NEXT: vaddps %ymm15, %ymm14, %ymm14 2518; AVX2-NEXT: vbroadcastss 236(%rbp), %ymm15 2519; AVX2-NEXT: vmulps %ymm7, %ymm15, %ymm15 2520; AVX2-NEXT: vaddps %ymm15, %ymm14, %ymm14 2521; AVX2-NEXT: vbroadcastss 240(%rbp), %ymm15 2522; AVX2-NEXT: vmulps %ymm0, %ymm15, %ymm0 2523; AVX2-NEXT: vbroadcastss 244(%rbp), %ymm15 2524; AVX2-NEXT: vmulps %ymm1, %ymm15, %ymm1 2525; AVX2-NEXT: vaddps %ymm1, %ymm0, %ymm0 2526; AVX2-NEXT: vbroadcastss 248(%rbp), %ymm1 2527; AVX2-NEXT: vmulps %ymm1, %ymm2, %ymm1 2528; AVX2-NEXT: vaddps %ymm1, %ymm0, %ymm0 2529; AVX2-NEXT: vbroadcastss 252(%rbp), %ymm1 2530; AVX2-NEXT: vmulps %ymm1, %ymm3, %ymm1 2531; AVX2-NEXT: vaddps %ymm1, %ymm0, %ymm0 2532; AVX2-NEXT: vbroadcastss 256(%rbp), %ymm1 2533; AVX2-NEXT: vmulps %ymm1, %ymm4, %ymm1 2534; 
AVX2-NEXT: vaddps %ymm1, %ymm0, %ymm0 2535; AVX2-NEXT: vbroadcastss 260(%rbp), %ymm1 2536; AVX2-NEXT: vmulps %ymm1, %ymm5, %ymm1 2537; AVX2-NEXT: vaddps %ymm1, %ymm0, %ymm0 2538; AVX2-NEXT: vbroadcastss 264(%rbp), %ymm1 2539; AVX2-NEXT: vmulps %ymm1, %ymm6, %ymm1 2540; AVX2-NEXT: vaddps %ymm1, %ymm0, %ymm0 2541; AVX2-NEXT: vbroadcastss 268(%rbp), %ymm1 2542; AVX2-NEXT: vmulps %ymm1, %ymm7, %ymm1 2543; AVX2-NEXT: vaddps %ymm1, %ymm0, %ymm0 2544; AVX2-NEXT: vmovaps %ymm0, 224(%rdi) 2545; AVX2-NEXT: vmovaps %ymm14, 192(%rdi) 2546; AVX2-NEXT: vmovaps %ymm13, 160(%rdi) 2547; AVX2-NEXT: vmovaps %ymm12, 128(%rdi) 2548; AVX2-NEXT: vmovaps %ymm11, 96(%rdi) 2549; AVX2-NEXT: vmovaps %ymm10, 64(%rdi) 2550; AVX2-NEXT: vmovaps %ymm9, 32(%rdi) 2551; AVX2-NEXT: vmovaps %ymm8, (%rdi) 2552; AVX2-NEXT: movq %rbp, %rsp 2553; AVX2-NEXT: popq %rbp 2554; AVX2-NEXT: vzeroupper 2555; AVX2-NEXT: retq 2556; 2557; AVX512F-LABEL: test_mul8x8_f32: 2558; AVX512F: # %bb.0: # %entry 2559; AVX512F-NEXT: vextractf64x4 $1, %zmm0, %ymm11 2560; AVX512F-NEXT: vextractf64x4 $1, %zmm1, %ymm10 2561; AVX512F-NEXT: vextractf64x4 $1, %zmm2, %ymm9 2562; AVX512F-NEXT: vextractf64x4 $1, %zmm3, %ymm8 2563; AVX512F-NEXT: vbroadcastss %xmm4, %ymm12 2564; AVX512F-NEXT: vmulps %ymm0, %ymm12, %ymm12 2565; AVX512F-NEXT: vmovshdup {{.*#+}} xmm13 = xmm4[1,1,3,3] 2566; AVX512F-NEXT: vbroadcastss %xmm13, %ymm13 2567; AVX512F-NEXT: vmulps %ymm13, %ymm11, %ymm13 2568; AVX512F-NEXT: vaddps %ymm13, %ymm12, %ymm12 2569; AVX512F-NEXT: vshufpd {{.*#+}} xmm13 = xmm4[1,0] 2570; AVX512F-NEXT: vbroadcastss %xmm13, %ymm13 2571; AVX512F-NEXT: vmulps %ymm1, %ymm13, %ymm13 2572; AVX512F-NEXT: vaddps %ymm13, %ymm12, %ymm12 2573; AVX512F-NEXT: vshufps {{.*#+}} xmm13 = xmm4[3,3,3,3] 2574; AVX512F-NEXT: vbroadcastss %xmm13, %ymm13 2575; AVX512F-NEXT: vmulps %ymm13, %ymm10, %ymm13 2576; AVX512F-NEXT: vaddps %ymm13, %ymm12, %ymm12 2577; AVX512F-NEXT: vextractf128 $1, %ymm4, %xmm13 2578; AVX512F-NEXT: vbroadcastss %xmm13, %ymm13 2579; AVX512F-NEXT: vmulps %ymm2, %ymm13, %ymm13 2580; AVX512F-NEXT: vaddps %ymm13, %ymm12, %ymm12 2581; AVX512F-NEXT: vmovshdup {{.*#+}} ymm13 = ymm4[1,1,3,3,5,5,7,7] 2582; AVX512F-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[2,2,2,2] 2583; AVX512F-NEXT: vmulps %ymm13, %ymm9, %ymm13 2584; AVX512F-NEXT: vaddps %ymm13, %ymm12, %ymm12 2585; AVX512F-NEXT: vshufps {{.*#+}} ymm13 = ymm4[2,2,2,2,6,6,6,6] 2586; AVX512F-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[2,2,2,2] 2587; AVX512F-NEXT: vmulps %ymm3, %ymm13, %ymm13 2588; AVX512F-NEXT: vaddps %ymm13, %ymm12, %ymm12 2589; AVX512F-NEXT: vshufps {{.*#+}} ymm13 = ymm4[3,3,3,3,7,7,7,7] 2590; AVX512F-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[2,2,2,2] 2591; AVX512F-NEXT: vmulps %ymm13, %ymm8, %ymm13 2592; AVX512F-NEXT: vaddps %ymm13, %ymm12, %ymm12 2593; AVX512F-NEXT: vmovups %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 2594; AVX512F-NEXT: vextractf64x4 $1, %zmm4, %ymm13 2595; AVX512F-NEXT: vextractf32x4 $2, %zmm4, %xmm14 2596; AVX512F-NEXT: vbroadcastss %xmm14, %ymm14 2597; AVX512F-NEXT: vmulps %ymm0, %ymm14, %ymm14 2598; AVX512F-NEXT: vmovshdup {{.*#+}} xmm15 = xmm13[1,1,3,3] 2599; AVX512F-NEXT: vbroadcastsd %xmm15, %ymm15 2600; AVX512F-NEXT: vmulps %ymm15, %ymm11, %ymm15 2601; AVX512F-NEXT: vaddps %ymm15, %ymm14, %ymm14 2602; AVX512F-NEXT: vshufps {{.*#+}} xmm15 = xmm13[2,2,2,2] 2603; AVX512F-NEXT: vbroadcastsd %xmm15, %ymm15 2604; AVX512F-NEXT: vmulps %ymm1, %ymm15, %ymm15 2605; AVX512F-NEXT: vaddps %ymm15, %ymm14, %ymm14 2606; AVX512F-NEXT: vshufps {{.*#+}} xmm15 = xmm13[3,3,3,3] 2607; AVX512F-NEXT: vbroadcastsd 
%xmm15, %ymm15 2608; AVX512F-NEXT: vmulps %ymm15, %ymm10, %ymm15 2609; AVX512F-NEXT: vaddps %ymm15, %ymm14, %ymm14 2610; AVX512F-NEXT: vextractf32x4 $3, %zmm4, %xmm4 2611; AVX512F-NEXT: vbroadcastss %xmm4, %ymm4 2612; AVX512F-NEXT: vmulps %ymm4, %ymm2, %ymm4 2613; AVX512F-NEXT: vaddps %ymm4, %ymm14, %ymm4 2614; AVX512F-NEXT: vmovshdup {{.*#+}} ymm14 = ymm13[1,1,3,3,5,5,7,7] 2615; AVX512F-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[2,2,2,2] 2616; AVX512F-NEXT: vmulps %ymm14, %ymm9, %ymm14 2617; AVX512F-NEXT: vaddps %ymm4, %ymm14, %ymm4 2618; AVX512F-NEXT: vshufps {{.*#+}} ymm14 = ymm13[2,2,2,2,6,6,6,6] 2619; AVX512F-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[2,2,2,2] 2620; AVX512F-NEXT: vmulps %ymm3, %ymm14, %ymm14 2621; AVX512F-NEXT: vaddps %ymm4, %ymm14, %ymm4 2622; AVX512F-NEXT: vshufps {{.*#+}} ymm13 = ymm13[3,3,3,3,7,7,7,7] 2623; AVX512F-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[2,2,2,2] 2624; AVX512F-NEXT: vmulps %ymm13, %ymm8, %ymm13 2625; AVX512F-NEXT: vaddps %ymm4, %ymm13, %ymm4 2626; AVX512F-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2627; AVX512F-NEXT: vbroadcastss %xmm5, %ymm13 2628; AVX512F-NEXT: vmulps %ymm0, %ymm13, %ymm13 2629; AVX512F-NEXT: vmovshdup {{.*#+}} xmm14 = xmm5[1,1,3,3] 2630; AVX512F-NEXT: vbroadcastss %xmm14, %ymm14 2631; AVX512F-NEXT: vmulps %ymm14, %ymm11, %ymm14 2632; AVX512F-NEXT: vaddps %ymm14, %ymm13, %ymm13 2633; AVX512F-NEXT: vshufpd {{.*#+}} xmm14 = xmm5[1,0] 2634; AVX512F-NEXT: vbroadcastss %xmm14, %ymm14 2635; AVX512F-NEXT: vmulps %ymm1, %ymm14, %ymm14 2636; AVX512F-NEXT: vaddps %ymm14, %ymm13, %ymm13 2637; AVX512F-NEXT: vshufps {{.*#+}} xmm14 = xmm5[3,3,3,3] 2638; AVX512F-NEXT: vbroadcastss %xmm14, %ymm14 2639; AVX512F-NEXT: vmulps %ymm14, %ymm10, %ymm14 2640; AVX512F-NEXT: vaddps %ymm14, %ymm13, %ymm13 2641; AVX512F-NEXT: vextractf128 $1, %ymm5, %xmm14 2642; AVX512F-NEXT: vbroadcastss %xmm14, %ymm14 2643; AVX512F-NEXT: vmulps %ymm2, %ymm14, %ymm14 2644; AVX512F-NEXT: vaddps %ymm14, %ymm13, %ymm13 2645; AVX512F-NEXT: vmovshdup {{.*#+}} ymm14 = ymm5[1,1,3,3,5,5,7,7] 2646; AVX512F-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[2,2,2,2] 2647; AVX512F-NEXT: vmulps %ymm14, %ymm9, %ymm14 2648; AVX512F-NEXT: vaddps %ymm14, %ymm13, %ymm13 2649; AVX512F-NEXT: vshufps {{.*#+}} ymm14 = ymm5[2,2,2,2,6,6,6,6] 2650; AVX512F-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[2,2,2,2] 2651; AVX512F-NEXT: vmulps %ymm3, %ymm14, %ymm14 2652; AVX512F-NEXT: vaddps %ymm14, %ymm13, %ymm13 2653; AVX512F-NEXT: vshufps {{.*#+}} ymm14 = ymm5[3,3,3,3,7,7,7,7] 2654; AVX512F-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[2,2,2,2] 2655; AVX512F-NEXT: vmulps %ymm14, %ymm8, %ymm14 2656; AVX512F-NEXT: vaddps %ymm14, %ymm13, %ymm13 2657; AVX512F-NEXT: vextractf64x4 $1, %zmm5, %ymm14 2658; AVX512F-NEXT: vextractf32x4 $2, %zmm5, %xmm15 2659; AVX512F-NEXT: vbroadcastss %xmm15, %ymm15 2660; AVX512F-NEXT: vmulps %ymm0, %ymm15, %ymm15 2661; AVX512F-NEXT: vmovshdup {{.*#+}} xmm12 = xmm14[1,1,3,3] 2662; AVX512F-NEXT: vbroadcastsd %xmm12, %ymm12 2663; AVX512F-NEXT: vmulps %ymm12, %ymm11, %ymm12 2664; AVX512F-NEXT: vaddps %ymm12, %ymm15, %ymm12 2665; AVX512F-NEXT: vshufps {{.*#+}} xmm15 = xmm14[2,2,2,2] 2666; AVX512F-NEXT: vbroadcastsd %xmm15, %ymm15 2667; AVX512F-NEXT: vmulps %ymm1, %ymm15, %ymm15 2668; AVX512F-NEXT: vaddps %ymm15, %ymm12, %ymm12 2669; AVX512F-NEXT: vshufps {{.*#+}} xmm15 = xmm14[3,3,3,3] 2670; AVX512F-NEXT: vbroadcastsd %xmm15, %ymm15 2671; AVX512F-NEXT: vmulps %ymm15, %ymm10, %ymm15 2672; AVX512F-NEXT: vaddps %ymm15, %ymm12, %ymm12 2673; AVX512F-NEXT: vextractf32x4 $3, %zmm5, %xmm5 2674; AVX512F-NEXT: 
vbroadcastss %xmm5, %ymm5 2675; AVX512F-NEXT: vmulps %ymm5, %ymm2, %ymm5 2676; AVX512F-NEXT: vaddps %ymm5, %ymm12, %ymm5 2677; AVX512F-NEXT: vmovshdup {{.*#+}} ymm12 = ymm14[1,1,3,3,5,5,7,7] 2678; AVX512F-NEXT: vpermpd {{.*#+}} ymm12 = ymm12[2,2,2,2] 2679; AVX512F-NEXT: vmulps %ymm12, %ymm9, %ymm12 2680; AVX512F-NEXT: vaddps %ymm5, %ymm12, %ymm5 2681; AVX512F-NEXT: vshufps {{.*#+}} ymm12 = ymm14[2,2,2,2,6,6,6,6] 2682; AVX512F-NEXT: vpermpd {{.*#+}} ymm12 = ymm12[2,2,2,2] 2683; AVX512F-NEXT: vmulps %ymm3, %ymm12, %ymm12 2684; AVX512F-NEXT: vaddps %ymm5, %ymm12, %ymm5 2685; AVX512F-NEXT: vshufps {{.*#+}} ymm12 = ymm14[3,3,3,3,7,7,7,7] 2686; AVX512F-NEXT: vpermpd {{.*#+}} ymm12 = ymm12[2,2,2,2] 2687; AVX512F-NEXT: vmulps %ymm12, %ymm8, %ymm12 2688; AVX512F-NEXT: vaddps %ymm5, %ymm12, %ymm5 2689; AVX512F-NEXT: vbroadcastss %xmm6, %ymm12 2690; AVX512F-NEXT: vmulps %ymm0, %ymm12, %ymm12 2691; AVX512F-NEXT: vmovshdup {{.*#+}} xmm14 = xmm6[1,1,3,3] 2692; AVX512F-NEXT: vbroadcastss %xmm14, %ymm14 2693; AVX512F-NEXT: vmulps %ymm14, %ymm11, %ymm14 2694; AVX512F-NEXT: vaddps %ymm14, %ymm12, %ymm12 2695; AVX512F-NEXT: vshufpd {{.*#+}} xmm14 = xmm6[1,0] 2696; AVX512F-NEXT: vbroadcastss %xmm14, %ymm14 2697; AVX512F-NEXT: vmulps %ymm1, %ymm14, %ymm14 2698; AVX512F-NEXT: vaddps %ymm14, %ymm12, %ymm12 2699; AVX512F-NEXT: vshufps {{.*#+}} xmm14 = xmm6[3,3,3,3] 2700; AVX512F-NEXT: vbroadcastss %xmm14, %ymm14 2701; AVX512F-NEXT: vmulps %ymm14, %ymm10, %ymm14 2702; AVX512F-NEXT: vaddps %ymm14, %ymm12, %ymm12 2703; AVX512F-NEXT: vextractf128 $1, %ymm6, %xmm14 2704; AVX512F-NEXT: vbroadcastss %xmm14, %ymm14 2705; AVX512F-NEXT: vmulps %ymm2, %ymm14, %ymm14 2706; AVX512F-NEXT: vaddps %ymm14, %ymm12, %ymm12 2707; AVX512F-NEXT: vmovshdup {{.*#+}} ymm14 = ymm6[1,1,3,3,5,5,7,7] 2708; AVX512F-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[2,2,2,2] 2709; AVX512F-NEXT: vmulps %ymm14, %ymm9, %ymm14 2710; AVX512F-NEXT: vaddps %ymm14, %ymm12, %ymm12 2711; AVX512F-NEXT: vshufps {{.*#+}} ymm14 = ymm6[2,2,2,2,6,6,6,6] 2712; AVX512F-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[2,2,2,2] 2713; AVX512F-NEXT: vmulps %ymm3, %ymm14, %ymm14 2714; AVX512F-NEXT: vaddps %ymm14, %ymm12, %ymm12 2715; AVX512F-NEXT: vshufps {{.*#+}} ymm14 = ymm6[3,3,3,3,7,7,7,7] 2716; AVX512F-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[2,2,2,2] 2717; AVX512F-NEXT: vmulps %ymm14, %ymm8, %ymm14 2718; AVX512F-NEXT: vaddps %ymm14, %ymm12, %ymm14 2719; AVX512F-NEXT: vextractf32x4 $2, %zmm6, %xmm12 2720; AVX512F-NEXT: vbroadcastss %xmm12, %ymm12 2721; AVX512F-NEXT: vmulps %ymm0, %ymm12, %ymm12 2722; AVX512F-NEXT: vextractf64x4 $1, %zmm6, %ymm15 2723; AVX512F-NEXT: vmovshdup {{.*#+}} xmm4 = xmm15[1,1,3,3] 2724; AVX512F-NEXT: vbroadcastsd %xmm4, %ymm4 2725; AVX512F-NEXT: vmulps %ymm4, %ymm11, %ymm4 2726; AVX512F-NEXT: vaddps %ymm4, %ymm12, %ymm4 2727; AVX512F-NEXT: vshufps {{.*#+}} xmm12 = xmm15[2,2,2,2] 2728; AVX512F-NEXT: vbroadcastsd %xmm12, %ymm12 2729; AVX512F-NEXT: vmulps %ymm1, %ymm12, %ymm12 2730; AVX512F-NEXT: vaddps %ymm4, %ymm12, %ymm4 2731; AVX512F-NEXT: vshufps {{.*#+}} xmm12 = xmm15[3,3,3,3] 2732; AVX512F-NEXT: vbroadcastsd %xmm12, %ymm12 2733; AVX512F-NEXT: vmulps %ymm12, %ymm10, %ymm12 2734; AVX512F-NEXT: vaddps %ymm4, %ymm12, %ymm4 2735; AVX512F-NEXT: vextractf32x4 $3, %zmm6, %xmm6 2736; AVX512F-NEXT: vbroadcastss %xmm6, %ymm6 2737; AVX512F-NEXT: vmulps %ymm6, %ymm2, %ymm6 2738; AVX512F-NEXT: vaddps %ymm6, %ymm4, %ymm4 2739; AVX512F-NEXT: vmovshdup {{.*#+}} ymm6 = ymm15[1,1,3,3,5,5,7,7] 2740; AVX512F-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[2,2,2,2] 2741; AVX512F-NEXT: vmulps 
%ymm6, %ymm9, %ymm6 2742; AVX512F-NEXT: vaddps %ymm6, %ymm4, %ymm4 2743; AVX512F-NEXT: vshufps {{.*#+}} ymm6 = ymm15[2,2,2,2,6,6,6,6] 2744; AVX512F-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[2,2,2,2] 2745; AVX512F-NEXT: vmulps %ymm6, %ymm3, %ymm6 2746; AVX512F-NEXT: vaddps %ymm6, %ymm4, %ymm4 2747; AVX512F-NEXT: vshufps {{.*#+}} ymm6 = ymm15[3,3,3,3,7,7,7,7] 2748; AVX512F-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[2,2,2,2] 2749; AVX512F-NEXT: vmulps %ymm6, %ymm8, %ymm6 2750; AVX512F-NEXT: vaddps %ymm6, %ymm4, %ymm6 2751; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload 2752; AVX512F-NEXT: vinsertf64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 32-byte Folded Reload 2753; AVX512F-NEXT: vbroadcastss %xmm7, %ymm12 2754; AVX512F-NEXT: vmulps %ymm0, %ymm12, %ymm12 2755; AVX512F-NEXT: vmovshdup {{.*#+}} xmm15 = xmm7[1,1,3,3] 2756; AVX512F-NEXT: vbroadcastss %xmm15, %ymm15 2757; AVX512F-NEXT: vmulps %ymm15, %ymm11, %ymm15 2758; AVX512F-NEXT: vaddps %ymm15, %ymm12, %ymm12 2759; AVX512F-NEXT: vshufpd {{.*#+}} xmm15 = xmm7[1,0] 2760; AVX512F-NEXT: vbroadcastss %xmm15, %ymm15 2761; AVX512F-NEXT: vmulps %ymm1, %ymm15, %ymm15 2762; AVX512F-NEXT: vaddps %ymm15, %ymm12, %ymm12 2763; AVX512F-NEXT: vshufps {{.*#+}} xmm15 = xmm7[3,3,3,3] 2764; AVX512F-NEXT: vbroadcastss %xmm15, %ymm15 2765; AVX512F-NEXT: vmulps %ymm15, %ymm10, %ymm15 2766; AVX512F-NEXT: vaddps %ymm15, %ymm12, %ymm12 2767; AVX512F-NEXT: vextractf128 $1, %ymm7, %xmm15 2768; AVX512F-NEXT: vbroadcastss %xmm15, %ymm15 2769; AVX512F-NEXT: vmulps %ymm2, %ymm15, %ymm15 2770; AVX512F-NEXT: vaddps %ymm15, %ymm12, %ymm12 2771; AVX512F-NEXT: vmovshdup {{.*#+}} ymm15 = ymm7[1,1,3,3,5,5,7,7] 2772; AVX512F-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[2,2,2,2] 2773; AVX512F-NEXT: vmulps %ymm15, %ymm9, %ymm15 2774; AVX512F-NEXT: vaddps %ymm15, %ymm12, %ymm12 2775; AVX512F-NEXT: vshufps {{.*#+}} ymm15 = ymm7[2,2,2,2,6,6,6,6] 2776; AVX512F-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[2,2,2,2] 2777; AVX512F-NEXT: vmulps %ymm3, %ymm15, %ymm15 2778; AVX512F-NEXT: vaddps %ymm15, %ymm12, %ymm12 2779; AVX512F-NEXT: vshufps {{.*#+}} ymm15 = ymm7[3,3,3,3,7,7,7,7] 2780; AVX512F-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[2,2,2,2] 2781; AVX512F-NEXT: vmulps %ymm15, %ymm8, %ymm15 2782; AVX512F-NEXT: vaddps %ymm15, %ymm12, %ymm12 2783; AVX512F-NEXT: vinsertf64x4 $1, %ymm5, %zmm13, %zmm5 2784; AVX512F-NEXT: vextractf64x4 $1, %zmm7, %ymm13 2785; AVX512F-NEXT: vextractf32x4 $2, %zmm7, %xmm15 2786; AVX512F-NEXT: vbroadcastss %xmm15, %ymm15 2787; AVX512F-NEXT: vmulps %ymm0, %ymm15, %ymm0 2788; AVX512F-NEXT: vmovshdup {{.*#+}} xmm15 = xmm13[1,1,3,3] 2789; AVX512F-NEXT: vbroadcastsd %xmm15, %ymm15 2790; AVX512F-NEXT: vmulps %ymm15, %ymm11, %ymm11 2791; AVX512F-NEXT: vaddps %ymm0, %ymm11, %ymm0 2792; AVX512F-NEXT: vshufps {{.*#+}} xmm11 = xmm13[2,2,2,2] 2793; AVX512F-NEXT: vbroadcastsd %xmm11, %ymm11 2794; AVX512F-NEXT: vmulps %ymm1, %ymm11, %ymm1 2795; AVX512F-NEXT: vaddps %ymm1, %ymm0, %ymm0 2796; AVX512F-NEXT: vshufps {{.*#+}} xmm1 = xmm13[3,3,3,3] 2797; AVX512F-NEXT: vbroadcastsd %xmm1, %ymm1 2798; AVX512F-NEXT: vmulps %ymm1, %ymm10, %ymm1 2799; AVX512F-NEXT: vaddps %ymm1, %ymm0, %ymm0 2800; AVX512F-NEXT: vextractf32x4 $3, %zmm7, %xmm1 2801; AVX512F-NEXT: vbroadcastss %xmm1, %ymm1 2802; AVX512F-NEXT: vmulps %ymm1, %ymm2, %ymm1 2803; AVX512F-NEXT: vaddps %ymm1, %ymm0, %ymm0 2804; AVX512F-NEXT: vmovshdup {{.*#+}} ymm1 = ymm13[1,1,3,3,5,5,7,7] 2805; AVX512F-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,2] 2806; AVX512F-NEXT: vmulps %ymm1, %ymm9, %ymm1 2807; AVX512F-NEXT: vaddps %ymm1, %ymm0, %ymm0 
2808; AVX512F-NEXT: vshufps {{.*#+}} ymm1 = ymm13[2,2,2,2,6,6,6,6] 2809; AVX512F-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,2] 2810; AVX512F-NEXT: vmulps %ymm1, %ymm3, %ymm1 2811; AVX512F-NEXT: vaddps %ymm1, %ymm0, %ymm0 2812; AVX512F-NEXT: vshufps {{.*#+}} ymm1 = ymm13[3,3,3,3,7,7,7,7] 2813; AVX512F-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,2] 2814; AVX512F-NEXT: vmulps %ymm1, %ymm8, %ymm1 2815; AVX512F-NEXT: vaddps %ymm1, %ymm0, %ymm0 2816; AVX512F-NEXT: vinsertf64x4 $1, %ymm6, %zmm14, %zmm2 2817; AVX512F-NEXT: vinsertf64x4 $1, %ymm0, %zmm12, %zmm3 2818; AVX512F-NEXT: vmovaps %zmm4, %zmm0 2819; AVX512F-NEXT: vmovaps %zmm5, %zmm1 2820; AVX512F-NEXT: retq 2821; 2822; AVX512VL-LABEL: test_mul8x8_f32: 2823; AVX512VL: # %bb.0: # %entry 2824; AVX512VL-NEXT: vextractf64x4 $1, %zmm0, %ymm11 2825; AVX512VL-NEXT: vextractf64x4 $1, %zmm1, %ymm10 2826; AVX512VL-NEXT: vextractf64x4 $1, %zmm2, %ymm9 2827; AVX512VL-NEXT: vextractf64x4 $1, %zmm3, %ymm8 2828; AVX512VL-NEXT: vbroadcastss %xmm4, %ymm12 2829; AVX512VL-NEXT: vmulps %ymm0, %ymm12, %ymm12 2830; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm13 = xmm4[1,1,3,3] 2831; AVX512VL-NEXT: vbroadcastss %xmm13, %ymm13 2832; AVX512VL-NEXT: vmulps %ymm13, %ymm11, %ymm13 2833; AVX512VL-NEXT: vaddps %ymm13, %ymm12, %ymm12 2834; AVX512VL-NEXT: vshufpd {{.*#+}} xmm13 = xmm4[1,0] 2835; AVX512VL-NEXT: vbroadcastss %xmm13, %ymm13 2836; AVX512VL-NEXT: vmulps %ymm1, %ymm13, %ymm13 2837; AVX512VL-NEXT: vaddps %ymm13, %ymm12, %ymm12 2838; AVX512VL-NEXT: vshufps {{.*#+}} xmm13 = xmm4[3,3,3,3] 2839; AVX512VL-NEXT: vbroadcastss %xmm13, %ymm13 2840; AVX512VL-NEXT: vmulps %ymm13, %ymm10, %ymm13 2841; AVX512VL-NEXT: vaddps %ymm13, %ymm12, %ymm12 2842; AVX512VL-NEXT: vextractf128 $1, %ymm4, %xmm13 2843; AVX512VL-NEXT: vbroadcastss %xmm13, %ymm13 2844; AVX512VL-NEXT: vmulps %ymm2, %ymm13, %ymm13 2845; AVX512VL-NEXT: vaddps %ymm13, %ymm12, %ymm12 2846; AVX512VL-NEXT: vmovshdup {{.*#+}} ymm13 = ymm4[1,1,3,3,5,5,7,7] 2847; AVX512VL-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[2,2,2,2] 2848; AVX512VL-NEXT: vmulps %ymm13, %ymm9, %ymm13 2849; AVX512VL-NEXT: vaddps %ymm13, %ymm12, %ymm12 2850; AVX512VL-NEXT: vshufps {{.*#+}} ymm13 = ymm4[2,2,2,2,6,6,6,6] 2851; AVX512VL-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[2,2,2,2] 2852; AVX512VL-NEXT: vmulps %ymm3, %ymm13, %ymm13 2853; AVX512VL-NEXT: vaddps %ymm13, %ymm12, %ymm12 2854; AVX512VL-NEXT: vshufps {{.*#+}} ymm13 = ymm4[3,3,3,3,7,7,7,7] 2855; AVX512VL-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[2,2,2,2] 2856; AVX512VL-NEXT: vmulps %ymm13, %ymm8, %ymm13 2857; AVX512VL-NEXT: vaddps %ymm13, %ymm12, %ymm12 2858; AVX512VL-NEXT: vextractf64x4 $1, %zmm4, %ymm13 2859; AVX512VL-NEXT: vextractf32x4 $2, %zmm4, %xmm14 2860; AVX512VL-NEXT: vbroadcastss %xmm14, %ymm14 2861; AVX512VL-NEXT: vmulps %ymm0, %ymm14, %ymm14 2862; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm15 = xmm13[1,1,3,3] 2863; AVX512VL-NEXT: vbroadcastsd %xmm15, %ymm15 2864; AVX512VL-NEXT: vmulps %ymm15, %ymm11, %ymm15 2865; AVX512VL-NEXT: vaddps %ymm15, %ymm14, %ymm14 2866; AVX512VL-NEXT: vshufps {{.*#+}} xmm15 = xmm13[2,2,2,2] 2867; AVX512VL-NEXT: vbroadcastsd %xmm15, %ymm15 2868; AVX512VL-NEXT: vmulps %ymm1, %ymm15, %ymm15 2869; AVX512VL-NEXT: vaddps %ymm15, %ymm14, %ymm14 2870; AVX512VL-NEXT: vshufps {{.*#+}} xmm15 = xmm13[3,3,3,3] 2871; AVX512VL-NEXT: vbroadcastsd %xmm15, %ymm15 2872; AVX512VL-NEXT: vmulps %ymm15, %ymm10, %ymm15 2873; AVX512VL-NEXT: vaddps %ymm15, %ymm14, %ymm14 2874; AVX512VL-NEXT: vextractf32x4 $3, %zmm4, %xmm4 2875; AVX512VL-NEXT: vbroadcastss %xmm4, %ymm4 2876; AVX512VL-NEXT: vmulps %ymm4, %ymm2, 
%ymm4 2877; AVX512VL-NEXT: vaddps %ymm4, %ymm14, %ymm4 2878; AVX512VL-NEXT: vmovshdup {{.*#+}} ymm14 = ymm13[1,1,3,3,5,5,7,7] 2879; AVX512VL-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[2,2,2,2] 2880; AVX512VL-NEXT: vmulps %ymm14, %ymm9, %ymm14 2881; AVX512VL-NEXT: vaddps %ymm4, %ymm14, %ymm4 2882; AVX512VL-NEXT: vshufps {{.*#+}} ymm14 = ymm13[2,2,2,2,6,6,6,6] 2883; AVX512VL-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[2,2,2,2] 2884; AVX512VL-NEXT: vmulps %ymm3, %ymm14, %ymm14 2885; AVX512VL-NEXT: vaddps %ymm4, %ymm14, %ymm4 2886; AVX512VL-NEXT: vshufps {{.*#+}} ymm13 = ymm13[3,3,3,3,7,7,7,7] 2887; AVX512VL-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[2,2,2,2] 2888; AVX512VL-NEXT: vmulps %ymm13, %ymm8, %ymm13 2889; AVX512VL-NEXT: vaddps %ymm4, %ymm13, %ymm4 2890; AVX512VL-NEXT: vbroadcastss %xmm5, %ymm13 2891; AVX512VL-NEXT: vmulps %ymm0, %ymm13, %ymm13 2892; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm14 = xmm5[1,1,3,3] 2893; AVX512VL-NEXT: vbroadcastss %xmm14, %ymm14 2894; AVX512VL-NEXT: vmulps %ymm14, %ymm11, %ymm14 2895; AVX512VL-NEXT: vaddps %ymm14, %ymm13, %ymm13 2896; AVX512VL-NEXT: vshufpd {{.*#+}} xmm14 = xmm5[1,0] 2897; AVX512VL-NEXT: vbroadcastss %xmm14, %ymm14 2898; AVX512VL-NEXT: vmulps %ymm1, %ymm14, %ymm14 2899; AVX512VL-NEXT: vaddps %ymm14, %ymm13, %ymm13 2900; AVX512VL-NEXT: vshufps {{.*#+}} xmm14 = xmm5[3,3,3,3] 2901; AVX512VL-NEXT: vbroadcastss %xmm14, %ymm14 2902; AVX512VL-NEXT: vmulps %ymm14, %ymm10, %ymm14 2903; AVX512VL-NEXT: vaddps %ymm14, %ymm13, %ymm13 2904; AVX512VL-NEXT: vextractf128 $1, %ymm5, %xmm14 2905; AVX512VL-NEXT: vbroadcastss %xmm14, %ymm14 2906; AVX512VL-NEXT: vmulps %ymm2, %ymm14, %ymm14 2907; AVX512VL-NEXT: vaddps %ymm14, %ymm13, %ymm13 2908; AVX512VL-NEXT: vmovshdup {{.*#+}} ymm14 = ymm5[1,1,3,3,5,5,7,7] 2909; AVX512VL-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[2,2,2,2] 2910; AVX512VL-NEXT: vmulps %ymm14, %ymm9, %ymm14 2911; AVX512VL-NEXT: vaddps %ymm14, %ymm13, %ymm13 2912; AVX512VL-NEXT: vshufps {{.*#+}} ymm14 = ymm5[2,2,2,2,6,6,6,6] 2913; AVX512VL-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[2,2,2,2] 2914; AVX512VL-NEXT: vmulps %ymm3, %ymm14, %ymm14 2915; AVX512VL-NEXT: vaddps %ymm14, %ymm13, %ymm13 2916; AVX512VL-NEXT: vshufps {{.*#+}} ymm14 = ymm5[3,3,3,3,7,7,7,7] 2917; AVX512VL-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[2,2,2,2] 2918; AVX512VL-NEXT: vmulps %ymm14, %ymm8, %ymm14 2919; AVX512VL-NEXT: vaddps %ymm14, %ymm13, %ymm13 2920; AVX512VL-NEXT: vextractf64x4 $1, %zmm5, %ymm14 2921; AVX512VL-NEXT: vextractf32x4 $2, %zmm5, %xmm15 2922; AVX512VL-NEXT: vbroadcastss %xmm15, %ymm15 2923; AVX512VL-NEXT: vmulps %ymm0, %ymm15, %ymm15 2924; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm16 = xmm14[1,1,3,3] 2925; AVX512VL-NEXT: vbroadcastsd %xmm16, %ymm16 2926; AVX512VL-NEXT: vmulps %ymm16, %ymm11, %ymm16 2927; AVX512VL-NEXT: vaddps %ymm16, %ymm15, %ymm15 2928; AVX512VL-NEXT: vshufps {{.*#+}} xmm16 = xmm14[2,2,2,2] 2929; AVX512VL-NEXT: vbroadcastsd %xmm16, %ymm16 2930; AVX512VL-NEXT: vmulps %ymm16, %ymm1, %ymm16 2931; AVX512VL-NEXT: vaddps %ymm16, %ymm15, %ymm15 2932; AVX512VL-NEXT: vshufps {{.*#+}} xmm16 = xmm14[3,3,3,3] 2933; AVX512VL-NEXT: vbroadcastsd %xmm16, %ymm16 2934; AVX512VL-NEXT: vmulps %ymm16, %ymm10, %ymm16 2935; AVX512VL-NEXT: vaddps %ymm16, %ymm15, %ymm15 2936; AVX512VL-NEXT: vextractf32x4 $3, %zmm5, %xmm5 2937; AVX512VL-NEXT: vbroadcastss %xmm5, %ymm5 2938; AVX512VL-NEXT: vmulps %ymm5, %ymm2, %ymm5 2939; AVX512VL-NEXT: vaddps %ymm5, %ymm15, %ymm5 2940; AVX512VL-NEXT: vmovshdup {{.*#+}} ymm15 = ymm14[1,1,3,3,5,5,7,7] 2941; AVX512VL-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[2,2,2,2] 2942; 
AVX512VL-NEXT: vmulps %ymm15, %ymm9, %ymm15 2943; AVX512VL-NEXT: vaddps %ymm5, %ymm15, %ymm5 2944; AVX512VL-NEXT: vshufps {{.*#+}} ymm15 = ymm14[2,2,2,2,6,6,6,6] 2945; AVX512VL-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[2,2,2,2] 2946; AVX512VL-NEXT: vmulps %ymm3, %ymm15, %ymm15 2947; AVX512VL-NEXT: vaddps %ymm5, %ymm15, %ymm5 2948; AVX512VL-NEXT: vshufps {{.*#+}} ymm14 = ymm14[3,3,3,3,7,7,7,7] 2949; AVX512VL-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[2,2,2,2] 2950; AVX512VL-NEXT: vmulps %ymm14, %ymm8, %ymm14 2951; AVX512VL-NEXT: vaddps %ymm5, %ymm14, %ymm5 2952; AVX512VL-NEXT: vbroadcastss %xmm6, %ymm14 2953; AVX512VL-NEXT: vmulps %ymm0, %ymm14, %ymm14 2954; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm15 = xmm6[1,1,3,3] 2955; AVX512VL-NEXT: vbroadcastss %xmm15, %ymm15 2956; AVX512VL-NEXT: vmulps %ymm15, %ymm11, %ymm15 2957; AVX512VL-NEXT: vaddps %ymm15, %ymm14, %ymm14 2958; AVX512VL-NEXT: vshufpd {{.*#+}} xmm15 = xmm6[1,0] 2959; AVX512VL-NEXT: vbroadcastss %xmm15, %ymm15 2960; AVX512VL-NEXT: vmulps %ymm1, %ymm15, %ymm15 2961; AVX512VL-NEXT: vaddps %ymm15, %ymm14, %ymm14 2962; AVX512VL-NEXT: vshufps {{.*#+}} xmm15 = xmm6[3,3,3,3] 2963; AVX512VL-NEXT: vbroadcastss %xmm15, %ymm15 2964; AVX512VL-NEXT: vmulps %ymm15, %ymm10, %ymm15 2965; AVX512VL-NEXT: vaddps %ymm15, %ymm14, %ymm14 2966; AVX512VL-NEXT: vextractf128 $1, %ymm6, %xmm15 2967; AVX512VL-NEXT: vbroadcastss %xmm15, %ymm15 2968; AVX512VL-NEXT: vmulps %ymm2, %ymm15, %ymm15 2969; AVX512VL-NEXT: vaddps %ymm15, %ymm14, %ymm14 2970; AVX512VL-NEXT: vmovshdup {{.*#+}} ymm15 = ymm6[1,1,3,3,5,5,7,7] 2971; AVX512VL-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[2,2,2,2] 2972; AVX512VL-NEXT: vmulps %ymm15, %ymm9, %ymm15 2973; AVX512VL-NEXT: vaddps %ymm15, %ymm14, %ymm14 2974; AVX512VL-NEXT: vshufps {{.*#+}} ymm15 = ymm6[2,2,2,2,6,6,6,6] 2975; AVX512VL-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[2,2,2,2] 2976; AVX512VL-NEXT: vmulps %ymm3, %ymm15, %ymm15 2977; AVX512VL-NEXT: vaddps %ymm15, %ymm14, %ymm14 2978; AVX512VL-NEXT: vshufps {{.*#+}} ymm15 = ymm6[3,3,3,3,7,7,7,7] 2979; AVX512VL-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[2,2,2,2] 2980; AVX512VL-NEXT: vmulps %ymm15, %ymm8, %ymm15 2981; AVX512VL-NEXT: vaddps %ymm15, %ymm14, %ymm14 2982; AVX512VL-NEXT: vextractf64x4 $1, %zmm6, %ymm15 2983; AVX512VL-NEXT: vextractf32x4 $2, %zmm6, %xmm16 2984; AVX512VL-NEXT: vbroadcastss %xmm16, %ymm16 2985; AVX512VL-NEXT: vmulps %ymm16, %ymm0, %ymm16 2986; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm17 = xmm15[1,1,3,3] 2987; AVX512VL-NEXT: vbroadcastsd %xmm17, %ymm17 2988; AVX512VL-NEXT: vmulps %ymm17, %ymm11, %ymm17 2989; AVX512VL-NEXT: vaddps %ymm17, %ymm16, %ymm16 2990; AVX512VL-NEXT: vshufps {{.*#+}} xmm17 = xmm15[2,2,2,2] 2991; AVX512VL-NEXT: vbroadcastsd %xmm17, %ymm17 2992; AVX512VL-NEXT: vmulps %ymm17, %ymm1, %ymm17 2993; AVX512VL-NEXT: vaddps %ymm17, %ymm16, %ymm16 2994; AVX512VL-NEXT: vshufps {{.*#+}} xmm17 = xmm15[3,3,3,3] 2995; AVX512VL-NEXT: vbroadcastsd %xmm17, %ymm17 2996; AVX512VL-NEXT: vmulps %ymm17, %ymm10, %ymm17 2997; AVX512VL-NEXT: vaddps %ymm17, %ymm16, %ymm16 2998; AVX512VL-NEXT: vextractf32x4 $3, %zmm6, %xmm6 2999; AVX512VL-NEXT: vbroadcastss %xmm6, %ymm6 3000; AVX512VL-NEXT: vmulps %ymm6, %ymm2, %ymm6 3001; AVX512VL-NEXT: vaddps %ymm6, %ymm16, %ymm6 3002; AVX512VL-NEXT: vmovshdup {{.*#+}} ymm16 = ymm15[1,1,3,3,5,5,7,7] 3003; AVX512VL-NEXT: vpermpd {{.*#+}} ymm16 = ymm16[2,2,2,2] 3004; AVX512VL-NEXT: vmulps %ymm16, %ymm9, %ymm16 3005; AVX512VL-NEXT: vaddps %ymm16, %ymm6, %ymm6 3006; AVX512VL-NEXT: vshufps {{.*#+}} ymm16 = ymm15[2,2,2,2,6,6,6,6] 3007; AVX512VL-NEXT: vpermpd {{.*#+}} ymm16 
= ymm16[2,2,2,2] 3008; AVX512VL-NEXT: vmulps %ymm16, %ymm3, %ymm16 3009; AVX512VL-NEXT: vaddps %ymm16, %ymm6, %ymm6 3010; AVX512VL-NEXT: vshufps {{.*#+}} ymm15 = ymm15[3,3,3,3,7,7,7,7] 3011; AVX512VL-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[2,2,2,2] 3012; AVX512VL-NEXT: vmulps %ymm15, %ymm8, %ymm15 3013; AVX512VL-NEXT: vaddps %ymm6, %ymm15, %ymm6 3014; AVX512VL-NEXT: vbroadcastss %xmm7, %ymm15 3015; AVX512VL-NEXT: vmulps %ymm0, %ymm15, %ymm15 3016; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm16 = xmm7[1,1,3,3] 3017; AVX512VL-NEXT: vbroadcastss %xmm16, %ymm16 3018; AVX512VL-NEXT: vmulps %ymm16, %ymm11, %ymm16 3019; AVX512VL-NEXT: vaddps %ymm16, %ymm15, %ymm15 3020; AVX512VL-NEXT: vshufpd {{.*#+}} xmm16 = xmm7[1,0] 3021; AVX512VL-NEXT: vbroadcastss %xmm16, %ymm16 3022; AVX512VL-NEXT: vmulps %ymm16, %ymm1, %ymm16 3023; AVX512VL-NEXT: vaddps %ymm16, %ymm15, %ymm15 3024; AVX512VL-NEXT: vshufps {{.*#+}} xmm16 = xmm7[3,3,3,3] 3025; AVX512VL-NEXT: vbroadcastss %xmm16, %ymm16 3026; AVX512VL-NEXT: vmulps %ymm16, %ymm10, %ymm16 3027; AVX512VL-NEXT: vaddps %ymm16, %ymm15, %ymm15 3028; AVX512VL-NEXT: vextractf32x4 $1, %ymm7, %xmm16 3029; AVX512VL-NEXT: vbroadcastss %xmm16, %ymm16 3030; AVX512VL-NEXT: vmulps %ymm16, %ymm2, %ymm16 3031; AVX512VL-NEXT: vaddps %ymm16, %ymm15, %ymm15 3032; AVX512VL-NEXT: vmovshdup {{.*#+}} ymm16 = ymm7[1,1,3,3,5,5,7,7] 3033; AVX512VL-NEXT: vpermpd {{.*#+}} ymm16 = ymm16[2,2,2,2] 3034; AVX512VL-NEXT: vmulps %ymm16, %ymm9, %ymm16 3035; AVX512VL-NEXT: vaddps %ymm16, %ymm15, %ymm15 3036; AVX512VL-NEXT: vshufps {{.*#+}} ymm16 = ymm7[2,2,2,2,6,6,6,6] 3037; AVX512VL-NEXT: vpermpd {{.*#+}} ymm16 = ymm16[2,2,2,2] 3038; AVX512VL-NEXT: vmulps %ymm16, %ymm3, %ymm16 3039; AVX512VL-NEXT: vaddps %ymm16, %ymm15, %ymm15 3040; AVX512VL-NEXT: vshufps {{.*#+}} ymm16 = ymm7[3,3,3,3,7,7,7,7] 3041; AVX512VL-NEXT: vpermpd {{.*#+}} ymm16 = ymm16[2,2,2,2] 3042; AVX512VL-NEXT: vmulps %ymm16, %ymm8, %ymm16 3043; AVX512VL-NEXT: vaddps %ymm16, %ymm15, %ymm15 3044; AVX512VL-NEXT: vextractf64x4 $1, %zmm7, %ymm16 3045; AVX512VL-NEXT: vextractf32x4 $2, %zmm7, %xmm17 3046; AVX512VL-NEXT: vbroadcastss %xmm17, %ymm17 3047; AVX512VL-NEXT: vmulps %ymm17, %ymm0, %ymm0 3048; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm17 = xmm16[1,1,3,3] 3049; AVX512VL-NEXT: vbroadcastsd %xmm17, %ymm17 3050; AVX512VL-NEXT: vmulps %ymm17, %ymm11, %ymm11 3051; AVX512VL-NEXT: vaddps %ymm0, %ymm11, %ymm0 3052; AVX512VL-NEXT: vshufps {{.*#+}} xmm11 = xmm16[2,2,2,2] 3053; AVX512VL-NEXT: vbroadcastsd %xmm11, %ymm11 3054; AVX512VL-NEXT: vmulps %ymm1, %ymm11, %ymm1 3055; AVX512VL-NEXT: vaddps %ymm1, %ymm0, %ymm0 3056; AVX512VL-NEXT: vshufps {{.*#+}} xmm1 = xmm16[3,3,3,3] 3057; AVX512VL-NEXT: vbroadcastsd %xmm1, %ymm1 3058; AVX512VL-NEXT: vmulps %ymm1, %ymm10, %ymm1 3059; AVX512VL-NEXT: vaddps %ymm1, %ymm0, %ymm0 3060; AVX512VL-NEXT: vextractf32x4 $3, %zmm7, %xmm1 3061; AVX512VL-NEXT: vbroadcastss %xmm1, %ymm1 3062; AVX512VL-NEXT: vmulps %ymm1, %ymm2, %ymm1 3063; AVX512VL-NEXT: vaddps %ymm1, %ymm0, %ymm0 3064; AVX512VL-NEXT: vmovshdup {{.*#+}} ymm1 = ymm16[1,1,3,3,5,5,7,7] 3065; AVX512VL-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,2] 3066; AVX512VL-NEXT: vmulps %ymm1, %ymm9, %ymm1 3067; AVX512VL-NEXT: vaddps %ymm1, %ymm0, %ymm0 3068; AVX512VL-NEXT: vshufps {{.*#+}} ymm1 = ymm16[2,2,2,2,6,6,6,6] 3069; AVX512VL-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,2] 3070; AVX512VL-NEXT: vmulps %ymm1, %ymm3, %ymm1 3071; AVX512VL-NEXT: vaddps %ymm1, %ymm0, %ymm0 3072; AVX512VL-NEXT: vshufps {{.*#+}} ymm1 = ymm16[3,3,3,3,7,7,7,7] 3073; AVX512VL-NEXT: vpermpd {{.*#+}} ymm1 = 
ymm1[2,2,2,2] 3074; AVX512VL-NEXT: vmulps %ymm1, %ymm8, %ymm1 3075; AVX512VL-NEXT: vaddps %ymm1, %ymm0, %ymm3 3076; AVX512VL-NEXT: vinsertf64x4 $1, %ymm4, %zmm12, %zmm0 3077; AVX512VL-NEXT: vinsertf64x4 $1, %ymm5, %zmm13, %zmm1 3078; AVX512VL-NEXT: vinsertf64x4 $1, %ymm6, %zmm14, %zmm2 3079; AVX512VL-NEXT: vinsertf64x4 $1, %ymm3, %zmm15, %zmm3 3080; AVX512VL-NEXT: retq 3081entry: 3082 %split = shufflevector <64 x float> %a0, <64 x float> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 3083 %split1 = shufflevector <64 x float> %a0, <64 x float> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> 3084 %split2 = shufflevector <64 x float> %a0, <64 x float> poison, <8 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23> 3085 %split3 = shufflevector <64 x float> %a0, <64 x float> poison, <8 x i32> <i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> 3086 %split4 = shufflevector <64 x float> %a0, <64 x float> poison, <8 x i32> <i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39> 3087 %split5 = shufflevector <64 x float> %a0, <64 x float> poison, <8 x i32> <i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47> 3088 %split6 = shufflevector <64 x float> %a0, <64 x float> poison, <8 x i32> <i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55> 3089 %split7 = shufflevector <64 x float> %a0, <64 x float> poison, <8 x i32> <i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63> 3090 %splat.splat = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> zeroinitializer 3091 %0 = fmul <8 x float> %split, %splat.splat 3092 %splat.splat18 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> 3093 %1 = fmul <8 x float> %split1, %splat.splat18 3094 %2 = fadd <8 x float> %0, %1 3095 %splat.splat21 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2> 3096 %3 = fmul <8 x float> %split2, %splat.splat21 3097 %4 = fadd <8 x float> %2, %3 3098 %splat.splat24 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3> 3099 %5 = fmul <8 x float> %split3, %splat.splat24 3100 %6 = fadd <8 x float> %4, %5 3101 %splat.splat27 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4> 3102 %7 = fmul <8 x float> %split4, %splat.splat27 3103 %8 = fadd <8 x float> %6, %7 3104 %splat.splat30 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5> 3105 %9 = fmul <8 x float> %split5, %splat.splat30 3106 %10 = fadd <8 x float> %8, %9 3107 %splat.splat33 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> <i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6> 3108 %11 = fmul <8 x float> %split6, %splat.splat33 3109 %12 = fadd <8 x float> %10, %11 3110 %splat.splat36 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7> 3111 %13 = fmul <8 x float> %split7, %splat.splat36 3112 %14 = fadd <8 x float> %12, %13 3113 %splat.splat39 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> <i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8> 3114 %15 = fmul <8 x float> %split, %splat.splat39 3115 %splat.splat42 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> <i32 9, i32 9, i32 9, i32 
9, i32 9, i32 9, i32 9, i32 9> 3116 %16 = fmul <8 x float> %split1, %splat.splat42 3117 %17 = fadd <8 x float> %15, %16 3118 %splat.splat45 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> <i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10> 3119 %18 = fmul <8 x float> %split2, %splat.splat45 3120 %19 = fadd <8 x float> %17, %18 3121 %splat.splat48 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> <i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11> 3122 %20 = fmul <8 x float> %split3, %splat.splat48 3123 %21 = fadd <8 x float> %19, %20 3124 %splat.splat51 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> <i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12> 3125 %22 = fmul <8 x float> %split4, %splat.splat51 3126 %23 = fadd <8 x float> %21, %22 3127 %splat.splat54 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> <i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13> 3128 %24 = fmul <8 x float> %split5, %splat.splat54 3129 %25 = fadd <8 x float> %23, %24 3130 %splat.splat57 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> <i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14> 3131 %26 = fmul <8 x float> %split6, %splat.splat57 3132 %27 = fadd <8 x float> %25, %26 3133 %splat.splat60 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> <i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15> 3134 %28 = fmul <8 x float> %split7, %splat.splat60 3135 %29 = fadd <8 x float> %27, %28 3136 %splat.splat63 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16> 3137 %30 = fmul <8 x float> %split, %splat.splat63 3138 %splat.splat66 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> <i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17> 3139 %31 = fmul <8 x float> %split1, %splat.splat66 3140 %32 = fadd <8 x float> %30, %31 3141 %splat.splat69 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> <i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18> 3142 %33 = fmul <8 x float> %split2, %splat.splat69 3143 %34 = fadd <8 x float> %32, %33 3144 %splat.splat72 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> <i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19> 3145 %35 = fmul <8 x float> %split3, %splat.splat72 3146 %36 = fadd <8 x float> %34, %35 3147 %splat.splat75 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> <i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20> 3148 %37 = fmul <8 x float> %split4, %splat.splat75 3149 %38 = fadd <8 x float> %36, %37 3150 %splat.splat78 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> <i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21> 3151 %39 = fmul <8 x float> %split5, %splat.splat78 3152 %40 = fadd <8 x float> %38, %39 3153 %splat.splat81 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> <i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22> 3154 %41 = fmul <8 x float> %split6, %splat.splat81 3155 %42 = fadd <8 x float> %40, %41 3156 %splat.splat84 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> <i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23> 3157 %43 = fmul <8 x float> %split7, %splat.splat84 3158 %44 = fadd <8 x float> %42, %43 3159 %splat.splat87 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> <i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, 
i32 24> 3160 %45 = fmul <8 x float> %split, %splat.splat87 3161 %splat.splat90 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> <i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25> 3162 %46 = fmul <8 x float> %split1, %splat.splat90 3163 %47 = fadd <8 x float> %45, %46 3164 %splat.splat93 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> <i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26> 3165 %48 = fmul <8 x float> %split2, %splat.splat93 3166 %49 = fadd <8 x float> %47, %48 3167 %splat.splat96 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> <i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27> 3168 %50 = fmul <8 x float> %split3, %splat.splat96 3169 %51 = fadd <8 x float> %49, %50 3170 %splat.splat99 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> <i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28> 3171 %52 = fmul <8 x float> %split4, %splat.splat99 3172 %53 = fadd <8 x float> %51, %52 3173 %splat.splat102 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> <i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29> 3174 %54 = fmul <8 x float> %split5, %splat.splat102 3175 %55 = fadd <8 x float> %53, %54 3176 %splat.splat105 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> <i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30> 3177 %56 = fmul <8 x float> %split6, %splat.splat105 3178 %57 = fadd <8 x float> %55, %56 3179 %splat.splat108 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> <i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31> 3180 %58 = fmul <8 x float> %split7, %splat.splat108 3181 %59 = fadd <8 x float> %57, %58 3182 %splat.splat111 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> <i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32> 3183 %60 = fmul <8 x float> %split, %splat.splat111 3184 %splat.splat114 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> <i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33> 3185 %61 = fmul <8 x float> %split1, %splat.splat114 3186 %62 = fadd <8 x float> %60, %61 3187 %splat.splat117 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> <i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34> 3188 %63 = fmul <8 x float> %split2, %splat.splat117 3189 %64 = fadd <8 x float> %62, %63 3190 %splat.splat120 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> <i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35> 3191 %65 = fmul <8 x float> %split3, %splat.splat120 3192 %66 = fadd <8 x float> %64, %65 3193 %splat.splat123 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> <i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36> 3194 %67 = fmul <8 x float> %split4, %splat.splat123 3195 %68 = fadd <8 x float> %66, %67 3196 %splat.splat126 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> <i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37> 3197 %69 = fmul <8 x float> %split5, %splat.splat126 3198 %70 = fadd <8 x float> %68, %69 3199 %splat.splat129 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> <i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38> 3200 %71 = fmul <8 x float> %split6, %splat.splat129 3201 %72 = fadd <8 x float> %70, %71 3202 %splat.splat132 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> <i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39> 3203 %73 = fmul <8 x float> 
%split7, %splat.splat132 3204 %74 = fadd <8 x float> %72, %73 3205 %splat.splat135 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> <i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40> 3206 %75 = fmul <8 x float> %split, %splat.splat135 3207 %splat.splat138 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> <i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41> 3208 %76 = fmul <8 x float> %split1, %splat.splat138 3209 %77 = fadd <8 x float> %75, %76 3210 %splat.splat141 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> <i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42> 3211 %78 = fmul <8 x float> %split2, %splat.splat141 3212 %79 = fadd <8 x float> %77, %78 3213 %splat.splat144 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> <i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43> 3214 %80 = fmul <8 x float> %split3, %splat.splat144 3215 %81 = fadd <8 x float> %79, %80 3216 %splat.splat147 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> <i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44> 3217 %82 = fmul <8 x float> %split4, %splat.splat147 3218 %83 = fadd <8 x float> %81, %82 3219 %splat.splat150 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> <i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45> 3220 %84 = fmul <8 x float> %split5, %splat.splat150 3221 %85 = fadd <8 x float> %83, %84 3222 %splat.splat153 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> <i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46> 3223 %86 = fmul <8 x float> %split6, %splat.splat153 3224 %87 = fadd <8 x float> %85, %86 3225 %splat.splat156 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> <i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47> 3226 %88 = fmul <8 x float> %split7, %splat.splat156 3227 %89 = fadd <8 x float> %87, %88 3228 %splat.splat159 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> <i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48> 3229 %90 = fmul <8 x float> %split, %splat.splat159 3230 %splat.splat162 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> <i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49> 3231 %91 = fmul <8 x float> %split1, %splat.splat162 3232 %92 = fadd <8 x float> %90, %91 3233 %splat.splat165 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> <i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50> 3234 %93 = fmul <8 x float> %split2, %splat.splat165 3235 %94 = fadd <8 x float> %92, %93 3236 %splat.splat168 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> <i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51> 3237 %95 = fmul <8 x float> %split3, %splat.splat168 3238 %96 = fadd <8 x float> %94, %95 3239 %splat.splat171 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> <i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52> 3240 %97 = fmul <8 x float> %split4, %splat.splat171 3241 %98 = fadd <8 x float> %96, %97 3242 %splat.splat174 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> <i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53> 3243 %99 = fmul <8 x float> %split5, %splat.splat174 3244 %100 = fadd <8 x float> %98, %99 3245 %splat.splat177 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> <i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54> 3246 %101 = fmul <8 x float> %split6, %splat.splat177 
3247 %102 = fadd <8 x float> %100, %101 3248 %splat.splat180 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> <i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55> 3249 %103 = fmul <8 x float> %split7, %splat.splat180 3250 %104 = fadd <8 x float> %102, %103 3251 %splat.splat183 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> <i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56> 3252 %105 = fmul <8 x float> %split, %splat.splat183 3253 %splat.splat186 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> <i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57> 3254 %106 = fmul <8 x float> %split1, %splat.splat186 3255 %107 = fadd <8 x float> %105, %106 3256 %splat.splat189 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> <i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58> 3257 %108 = fmul <8 x float> %split2, %splat.splat189 3258 %109 = fadd <8 x float> %107, %108 3259 %splat.splat192 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> <i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59> 3260 %110 = fmul <8 x float> %split3, %splat.splat192 3261 %111 = fadd <8 x float> %109, %110 3262 %splat.splat195 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> <i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60> 3263 %112 = fmul <8 x float> %split4, %splat.splat195 3264 %113 = fadd <8 x float> %111, %112 3265 %splat.splat198 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> <i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61> 3266 %114 = fmul <8 x float> %split5, %splat.splat198 3267 %115 = fadd <8 x float> %113, %114 3268 %splat.splat201 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> <i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62> 3269 %116 = fmul <8 x float> %split6, %splat.splat201 3270 %117 = fadd <8 x float> %115, %116 3271 %splat.splat204 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> <i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63> 3272 %118 = fmul <8 x float> %split7, %splat.splat204 3273 %119 = fadd <8 x float> %117, %118 3274 %120 = shufflevector <8 x float> %14, <8 x float> %29, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> 3275 %121 = shufflevector <8 x float> %44, <8 x float> %59, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> 3276 %122 = shufflevector <8 x float> %74, <8 x float> %89, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> 3277 %123 = shufflevector <8 x float> %104, <8 x float> %119, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> 3278 %124 = shufflevector <16 x float> %120, <16 x float> %121, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> 3279 %125 = shufflevector <16 x float> %122, <16 x float> %123, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, 
i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> 3280 %126 = shufflevector <32 x float> %124, <32 x float> %125, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63> 3281 ret <64 x float> %126 3282} 3283 3284define <64 x double> @test_mul8x8_f64(<64 x double> %a0, <64 x double> %a1) nounwind { 3285; SSE-LABEL: test_mul8x8_f64: 3286; SSE: # %bb.0: # %entry 3287; SSE-NEXT: subq $328, %rsp # imm = 0x148 3288; SSE-NEXT: movapd %xmm7, %xmm15 3289; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3290; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3291; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3292; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3293; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3294; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3295; SSE-NEXT: movq %rdi, %rax 3296; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm14 3297; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm11 3298; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm13 3299; SSE-NEXT: movapd %xmm13, %xmm12 3300; SSE-NEXT: unpcklpd {{.*#+}} xmm12 = xmm12[0],xmm13[0] 3301; SSE-NEXT: movapd %xmm3, %xmm10 3302; SSE-NEXT: mulpd %xmm12, %xmm10 3303; SSE-NEXT: movapd %xmm2, %xmm8 3304; SSE-NEXT: mulpd %xmm12, %xmm8 3305; SSE-NEXT: movapd %xmm1, %xmm9 3306; SSE-NEXT: mulpd %xmm12, %xmm9 3307; SSE-NEXT: mulpd %xmm0, %xmm12 3308; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1,1] 3309; SSE-NEXT: movapd %xmm7, %xmm2 3310; SSE-NEXT: mulpd %xmm13, %xmm2 3311; SSE-NEXT: addpd %xmm10, %xmm2 3312; SSE-NEXT: movapd %xmm6, %xmm7 3313; SSE-NEXT: movapd %xmm6, %xmm10 3314; SSE-NEXT: mulpd %xmm13, %xmm7 3315; SSE-NEXT: addpd %xmm8, %xmm7 3316; SSE-NEXT: movapd %xmm5, %xmm8 3317; SSE-NEXT: mulpd %xmm13, %xmm8 3318; SSE-NEXT: addpd %xmm9, %xmm8 3319; SSE-NEXT: mulpd %xmm4, %xmm13 3320; SSE-NEXT: addpd %xmm12, %xmm13 3321; SSE-NEXT: movapd %xmm11, %xmm6 3322; SSE-NEXT: unpcklpd {{.*#+}} xmm6 = xmm6[0],xmm11[0] 3323; SSE-NEXT: movapd %xmm14, %xmm1 3324; SSE-NEXT: mulpd %xmm6, %xmm1 3325; SSE-NEXT: addpd %xmm13, %xmm1 3326; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm3 3327; SSE-NEXT: mulpd %xmm6, %xmm3 3328; SSE-NEXT: addpd %xmm8, %xmm3 3329; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm5 3330; SSE-NEXT: mulpd %xmm6, %xmm5 3331; SSE-NEXT: addpd %xmm7, %xmm5 3332; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm6 3333; SSE-NEXT: addpd %xmm2, %xmm6 3334; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1,1] 3335; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm2 3336; SSE-NEXT: mulpd %xmm11, %xmm2 3337; SSE-NEXT: addpd %xmm6, %xmm2 3338; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm4 3339; SSE-NEXT: mulpd %xmm11, %xmm4 3340; SSE-NEXT: addpd %xmm5, %xmm4 3341; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm5 3342; SSE-NEXT: mulpd %xmm11, %xmm5 3343; SSE-NEXT: addpd %xmm3, %xmm5 3344; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm11 3345; SSE-NEXT: addpd %xmm1, %xmm11 3346; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm1 3347; SSE-NEXT: movapd %xmm1, %xmm6 3348; SSE-NEXT: unpcklpd {{.*#+}} xmm6 = xmm6[0],xmm1[0] 3349; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm3 3350; SSE-NEXT: 
mulpd %xmm6, %xmm3 3351; SSE-NEXT: addpd %xmm11, %xmm3 3352; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm7 3353; SSE-NEXT: mulpd %xmm6, %xmm7 3354; SSE-NEXT: addpd %xmm5, %xmm7 3355; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm5 3356; SSE-NEXT: mulpd %xmm6, %xmm5 3357; SSE-NEXT: addpd %xmm4, %xmm5 3358; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm6 3359; SSE-NEXT: addpd %xmm2, %xmm6 3360; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1,1] 3361; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 3362; SSE-NEXT: mulpd %xmm1, %xmm0 3363; SSE-NEXT: addpd %xmm6, %xmm0 3364; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm4 3365; SSE-NEXT: mulpd %xmm1, %xmm4 3366; SSE-NEXT: addpd %xmm5, %xmm4 3367; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm5 3368; SSE-NEXT: mulpd %xmm1, %xmm5 3369; SSE-NEXT: addpd %xmm7, %xmm5 3370; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm1 3371; SSE-NEXT: addpd %xmm3, %xmm1 3372; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm6 3373; SSE-NEXT: movapd %xmm6, %xmm3 3374; SSE-NEXT: unpcklpd {{.*#+}} xmm3 = xmm3[0],xmm6[0] 3375; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm2 3376; SSE-NEXT: mulpd %xmm3, %xmm2 3377; SSE-NEXT: addpd %xmm1, %xmm2 3378; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm1 3379; SSE-NEXT: mulpd %xmm3, %xmm1 3380; SSE-NEXT: addpd %xmm5, %xmm1 3381; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm5 3382; SSE-NEXT: mulpd %xmm3, %xmm5 3383; SSE-NEXT: addpd %xmm4, %xmm5 3384; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm3 3385; SSE-NEXT: addpd %xmm0, %xmm3 3386; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1,1] 3387; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 3388; SSE-NEXT: mulpd %xmm6, %xmm0 3389; SSE-NEXT: addpd %xmm3, %xmm0 3390; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3391; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 3392; SSE-NEXT: mulpd %xmm6, %xmm0 3393; SSE-NEXT: addpd %xmm5, %xmm0 3394; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3395; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 3396; SSE-NEXT: mulpd %xmm6, %xmm0 3397; SSE-NEXT: addpd %xmm1, %xmm0 3398; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3399; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm6 3400; SSE-NEXT: addpd %xmm2, %xmm6 3401; SSE-NEXT: movapd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3402; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm1 3403; SSE-NEXT: movapd %xmm1, %xmm0 3404; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] 3405; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload 3406; SSE-NEXT: movapd %xmm11, %xmm3 3407; SSE-NEXT: mulpd %xmm0, %xmm3 3408; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1,1] 3409; SSE-NEXT: movapd %xmm15, %xmm8 3410; SSE-NEXT: movapd %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3411; SSE-NEXT: movapd %xmm15, %xmm2 3412; SSE-NEXT: mulpd %xmm1, %xmm2 3413; SSE-NEXT: addpd %xmm3, %xmm2 3414; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload 3415; SSE-NEXT: movapd %xmm9, %xmm3 3416; SSE-NEXT: mulpd %xmm0, %xmm3 3417; SSE-NEXT: movapd %xmm10, %xmm15 3418; SSE-NEXT: movapd %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3419; SSE-NEXT: movapd %xmm10, %xmm4 3420; SSE-NEXT: mulpd %xmm1, %xmm4 3421; SSE-NEXT: addpd %xmm3, %xmm4 3422; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload 3423; SSE-NEXT: movapd %xmm13, %xmm3 3424; SSE-NEXT: mulpd %xmm0, %xmm3 3425; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload 3426; SSE-NEXT: movapd %xmm10, %xmm5 3427; SSE-NEXT: mulpd %xmm1, %xmm5 3428; SSE-NEXT: addpd %xmm3, %xmm5 3429; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload 3430; SSE-NEXT: mulpd %xmm12, %xmm0 3431; SSE-NEXT: 
movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload 3432; SSE-NEXT: mulpd %xmm14, %xmm1 3433; SSE-NEXT: addpd %xmm0, %xmm1 3434; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 3435; SSE-NEXT: movapd %xmm0, %xmm6 3436; SSE-NEXT: unpcklpd {{.*#+}} xmm6 = xmm6[0],xmm0[0] 3437; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm3 3438; SSE-NEXT: mulpd %xmm6, %xmm3 3439; SSE-NEXT: addpd %xmm1, %xmm3 3440; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm1 3441; SSE-NEXT: mulpd %xmm6, %xmm1 3442; SSE-NEXT: addpd %xmm5, %xmm1 3443; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm5 3444; SSE-NEXT: mulpd %xmm6, %xmm5 3445; SSE-NEXT: addpd %xmm4, %xmm5 3446; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm6 3447; SSE-NEXT: addpd %xmm2, %xmm6 3448; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1] 3449; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm2 3450; SSE-NEXT: mulpd %xmm0, %xmm2 3451; SSE-NEXT: addpd %xmm6, %xmm2 3452; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm4 3453; SSE-NEXT: mulpd %xmm0, %xmm4 3454; SSE-NEXT: addpd %xmm5, %xmm4 3455; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm5 3456; SSE-NEXT: mulpd %xmm0, %xmm5 3457; SSE-NEXT: addpd %xmm1, %xmm5 3458; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm1 3459; SSE-NEXT: mulpd %xmm1, %xmm0 3460; SSE-NEXT: addpd %xmm3, %xmm0 3461; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm1 3462; SSE-NEXT: movapd %xmm1, %xmm6 3463; SSE-NEXT: unpcklpd {{.*#+}} xmm6 = xmm6[0],xmm1[0] 3464; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm3 3465; SSE-NEXT: mulpd %xmm6, %xmm3 3466; SSE-NEXT: addpd %xmm0, %xmm3 3467; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm7 3468; SSE-NEXT: mulpd %xmm6, %xmm7 3469; SSE-NEXT: addpd %xmm5, %xmm7 3470; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm5 3471; SSE-NEXT: mulpd %xmm6, %xmm5 3472; SSE-NEXT: addpd %xmm4, %xmm5 3473; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm6 3474; SSE-NEXT: addpd %xmm2, %xmm6 3475; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1,1] 3476; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 3477; SSE-NEXT: mulpd %xmm1, %xmm0 3478; SSE-NEXT: addpd %xmm6, %xmm0 3479; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm4 3480; SSE-NEXT: mulpd %xmm1, %xmm4 3481; SSE-NEXT: addpd %xmm5, %xmm4 3482; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm5 3483; SSE-NEXT: mulpd %xmm1, %xmm5 3484; SSE-NEXT: addpd %xmm7, %xmm5 3485; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm2 3486; SSE-NEXT: mulpd %xmm2, %xmm1 3487; SSE-NEXT: addpd %xmm3, %xmm1 3488; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm6 3489; SSE-NEXT: movapd %xmm6, %xmm3 3490; SSE-NEXT: unpcklpd {{.*#+}} xmm3 = xmm3[0],xmm6[0] 3491; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm2 3492; SSE-NEXT: mulpd %xmm3, %xmm2 3493; SSE-NEXT: addpd %xmm1, %xmm2 3494; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm1 3495; SSE-NEXT: mulpd %xmm3, %xmm1 3496; SSE-NEXT: addpd %xmm5, %xmm1 3497; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm5 3498; SSE-NEXT: mulpd %xmm3, %xmm5 3499; SSE-NEXT: addpd %xmm4, %xmm5 3500; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm3 3501; SSE-NEXT: addpd %xmm0, %xmm3 3502; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1,1] 3503; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 3504; SSE-NEXT: mulpd %xmm6, %xmm0 3505; SSE-NEXT: addpd %xmm3, %xmm0 3506; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3507; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 3508; SSE-NEXT: mulpd %xmm6, %xmm0 3509; SSE-NEXT: addpd %xmm5, %xmm0 3510; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3511; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 3512; SSE-NEXT: mulpd %xmm6, %xmm0 3513; SSE-NEXT: addpd %xmm1, %xmm0 3514; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3515; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm6 3516; 
SSE-NEXT: addpd %xmm2, %xmm6 3517; SSE-NEXT: movapd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3518; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm1 3519; SSE-NEXT: movapd %xmm1, %xmm0 3520; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] 3521; SSE-NEXT: movapd %xmm11, %xmm3 3522; SSE-NEXT: mulpd %xmm0, %xmm3 3523; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1,1] 3524; SSE-NEXT: movapd %xmm8, %xmm2 3525; SSE-NEXT: mulpd %xmm1, %xmm2 3526; SSE-NEXT: addpd %xmm3, %xmm2 3527; SSE-NEXT: movapd %xmm9, %xmm3 3528; SSE-NEXT: mulpd %xmm0, %xmm3 3529; SSE-NEXT: movapd %xmm15, %xmm4 3530; SSE-NEXT: mulpd %xmm1, %xmm4 3531; SSE-NEXT: addpd %xmm3, %xmm4 3532; SSE-NEXT: movapd %xmm13, %xmm8 3533; SSE-NEXT: movapd %xmm13, %xmm3 3534; SSE-NEXT: mulpd %xmm0, %xmm3 3535; SSE-NEXT: movapd %xmm10, %xmm5 3536; SSE-NEXT: movapd %xmm10, %xmm15 3537; SSE-NEXT: mulpd %xmm1, %xmm5 3538; SSE-NEXT: addpd %xmm3, %xmm5 3539; SSE-NEXT: movapd %xmm12, %xmm10 3540; SSE-NEXT: mulpd %xmm12, %xmm0 3541; SSE-NEXT: movapd %xmm14, %xmm9 3542; SSE-NEXT: mulpd %xmm14, %xmm1 3543; SSE-NEXT: addpd %xmm0, %xmm1 3544; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 3545; SSE-NEXT: movapd %xmm0, %xmm6 3546; SSE-NEXT: unpcklpd {{.*#+}} xmm6 = xmm6[0],xmm0[0] 3547; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm3 3548; SSE-NEXT: mulpd %xmm6, %xmm3 3549; SSE-NEXT: addpd %xmm1, %xmm3 3550; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm1 3551; SSE-NEXT: mulpd %xmm6, %xmm1 3552; SSE-NEXT: addpd %xmm5, %xmm1 3553; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm5 3554; SSE-NEXT: mulpd %xmm6, %xmm5 3555; SSE-NEXT: addpd %xmm4, %xmm5 3556; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm4 3557; SSE-NEXT: mulpd %xmm4, %xmm6 3558; SSE-NEXT: addpd %xmm2, %xmm6 3559; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1] 3560; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm2 3561; SSE-NEXT: mulpd %xmm0, %xmm2 3562; SSE-NEXT: addpd %xmm6, %xmm2 3563; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm4 3564; SSE-NEXT: mulpd %xmm0, %xmm4 3565; SSE-NEXT: addpd %xmm5, %xmm4 3566; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm5 3567; SSE-NEXT: mulpd %xmm0, %xmm5 3568; SSE-NEXT: addpd %xmm1, %xmm5 3569; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm0 3570; SSE-NEXT: addpd %xmm3, %xmm0 3571; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm1 3572; SSE-NEXT: movapd %xmm1, %xmm6 3573; SSE-NEXT: unpcklpd {{.*#+}} xmm6 = xmm6[0],xmm1[0] 3574; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm3 3575; SSE-NEXT: mulpd %xmm6, %xmm3 3576; SSE-NEXT: addpd %xmm0, %xmm3 3577; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm7 3578; SSE-NEXT: mulpd %xmm6, %xmm7 3579; SSE-NEXT: addpd %xmm5, %xmm7 3580; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm5 3581; SSE-NEXT: mulpd %xmm6, %xmm5 3582; SSE-NEXT: addpd %xmm4, %xmm5 3583; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm6 3584; SSE-NEXT: addpd %xmm2, %xmm6 3585; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1,1] 3586; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 3587; SSE-NEXT: mulpd %xmm1, %xmm0 3588; SSE-NEXT: addpd %xmm6, %xmm0 3589; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm4 3590; SSE-NEXT: mulpd %xmm1, %xmm4 3591; SSE-NEXT: addpd %xmm5, %xmm4 3592; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm5 3593; SSE-NEXT: mulpd %xmm1, %xmm5 3594; SSE-NEXT: addpd %xmm7, %xmm5 3595; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm2 3596; SSE-NEXT: mulpd %xmm2, %xmm1 3597; SSE-NEXT: addpd %xmm3, %xmm1 3598; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm7 3599; SSE-NEXT: movapd %xmm7, %xmm3 3600; SSE-NEXT: unpcklpd {{.*#+}} xmm3 = xmm3[0],xmm7[0] 3601; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm2 3602; SSE-NEXT: mulpd %xmm3, %xmm2 3603; SSE-NEXT: addpd %xmm1, %xmm2 3604; SSE-NEXT: movapd 
{{[0-9]+}}(%rsp), %xmm1 3605; SSE-NEXT: mulpd %xmm3, %xmm1 3606; SSE-NEXT: addpd %xmm5, %xmm1 3607; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm5 3608; SSE-NEXT: mulpd %xmm3, %xmm5 3609; SSE-NEXT: addpd %xmm4, %xmm5 3610; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm4 3611; SSE-NEXT: mulpd %xmm4, %xmm3 3612; SSE-NEXT: addpd %xmm0, %xmm3 3613; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1,1] 3614; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 3615; SSE-NEXT: mulpd %xmm7, %xmm0 3616; SSE-NEXT: addpd %xmm3, %xmm0 3617; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3618; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 3619; SSE-NEXT: mulpd %xmm7, %xmm0 3620; SSE-NEXT: addpd %xmm5, %xmm0 3621; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3622; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 3623; SSE-NEXT: mulpd %xmm7, %xmm0 3624; SSE-NEXT: addpd %xmm1, %xmm0 3625; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3626; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 3627; SSE-NEXT: mulpd %xmm0, %xmm7 3628; SSE-NEXT: addpd %xmm2, %xmm7 3629; SSE-NEXT: movapd %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3630; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm1 3631; SSE-NEXT: movapd %xmm1, %xmm0 3632; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] 3633; SSE-NEXT: movapd %xmm11, %xmm3 3634; SSE-NEXT: movapd %xmm11, %xmm12 3635; SSE-NEXT: mulpd %xmm0, %xmm3 3636; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1,1] 3637; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload 3638; SSE-NEXT: movapd %xmm6, %xmm2 3639; SSE-NEXT: mulpd %xmm1, %xmm2 3640; SSE-NEXT: addpd %xmm3, %xmm2 3641; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload 3642; SSE-NEXT: movapd %xmm11, %xmm3 3643; SSE-NEXT: mulpd %xmm0, %xmm3 3644; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload 3645; SSE-NEXT: movapd %xmm13, %xmm4 3646; SSE-NEXT: mulpd %xmm1, %xmm4 3647; SSE-NEXT: addpd %xmm3, %xmm4 3648; SSE-NEXT: movapd %xmm8, %xmm3 3649; SSE-NEXT: movapd %xmm8, %xmm14 3650; SSE-NEXT: mulpd %xmm0, %xmm3 3651; SSE-NEXT: movapd %xmm15, %xmm8 3652; SSE-NEXT: movapd %xmm15, %xmm5 3653; SSE-NEXT: mulpd %xmm1, %xmm5 3654; SSE-NEXT: addpd %xmm3, %xmm5 3655; SSE-NEXT: mulpd %xmm10, %xmm0 3656; SSE-NEXT: mulpd %xmm9, %xmm1 3657; SSE-NEXT: movapd %xmm9, %xmm10 3658; SSE-NEXT: addpd %xmm0, %xmm1 3659; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 3660; SSE-NEXT: movapd %xmm0, %xmm7 3661; SSE-NEXT: unpcklpd {{.*#+}} xmm7 = xmm7[0],xmm0[0] 3662; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm3 3663; SSE-NEXT: mulpd %xmm7, %xmm3 3664; SSE-NEXT: addpd %xmm1, %xmm3 3665; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm1 3666; SSE-NEXT: mulpd %xmm7, %xmm1 3667; SSE-NEXT: addpd %xmm5, %xmm1 3668; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm5 3669; SSE-NEXT: mulpd %xmm7, %xmm5 3670; SSE-NEXT: addpd %xmm4, %xmm5 3671; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm7 3672; SSE-NEXT: addpd %xmm2, %xmm7 3673; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1] 3674; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm2 3675; SSE-NEXT: mulpd %xmm0, %xmm2 3676; SSE-NEXT: addpd %xmm7, %xmm2 3677; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm4 3678; SSE-NEXT: mulpd %xmm0, %xmm4 3679; SSE-NEXT: addpd %xmm5, %xmm4 3680; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm5 3681; SSE-NEXT: mulpd %xmm0, %xmm5 3682; SSE-NEXT: addpd %xmm1, %xmm5 3683; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm1 3684; SSE-NEXT: mulpd %xmm1, %xmm0 3685; SSE-NEXT: addpd %xmm3, %xmm0 3686; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm1 3687; SSE-NEXT: movapd %xmm1, %xmm7 3688; SSE-NEXT: unpcklpd {{.*#+}} xmm7 = 
xmm7[0],xmm1[0] 3689; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm3 3690; SSE-NEXT: mulpd %xmm7, %xmm3 3691; SSE-NEXT: addpd %xmm0, %xmm3 3692; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm9 3693; SSE-NEXT: mulpd %xmm7, %xmm9 3694; SSE-NEXT: addpd %xmm5, %xmm9 3695; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm5 3696; SSE-NEXT: mulpd %xmm7, %xmm5 3697; SSE-NEXT: addpd %xmm4, %xmm5 3698; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 3699; SSE-NEXT: mulpd %xmm0, %xmm7 3700; SSE-NEXT: addpd %xmm2, %xmm7 3701; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1,1] 3702; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 3703; SSE-NEXT: mulpd %xmm1, %xmm0 3704; SSE-NEXT: addpd %xmm7, %xmm0 3705; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm4 3706; SSE-NEXT: mulpd %xmm1, %xmm4 3707; SSE-NEXT: addpd %xmm5, %xmm4 3708; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm7 3709; SSE-NEXT: mulpd %xmm1, %xmm7 3710; SSE-NEXT: addpd %xmm9, %xmm7 3711; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm1 3712; SSE-NEXT: addpd %xmm3, %xmm1 3713; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm15 3714; SSE-NEXT: movapd %xmm15, %xmm3 3715; SSE-NEXT: unpcklpd {{.*#+}} xmm3 = xmm3[0],xmm15[0] 3716; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm2 3717; SSE-NEXT: mulpd %xmm3, %xmm2 3718; SSE-NEXT: addpd %xmm1, %xmm2 3719; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm1 3720; SSE-NEXT: mulpd %xmm3, %xmm1 3721; SSE-NEXT: addpd %xmm7, %xmm1 3722; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm7 3723; SSE-NEXT: mulpd %xmm3, %xmm7 3724; SSE-NEXT: addpd %xmm4, %xmm7 3725; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm3 3726; SSE-NEXT: addpd %xmm0, %xmm3 3727; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1,1] 3728; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 3729; SSE-NEXT: mulpd %xmm15, %xmm0 3730; SSE-NEXT: addpd %xmm3, %xmm0 3731; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3732; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 3733; SSE-NEXT: mulpd %xmm15, %xmm0 3734; SSE-NEXT: addpd %xmm7, %xmm0 3735; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3736; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 3737; SSE-NEXT: mulpd %xmm15, %xmm0 3738; SSE-NEXT: addpd %xmm1, %xmm0 3739; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3740; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm15 3741; SSE-NEXT: addpd %xmm2, %xmm15 3742; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm1 3743; SSE-NEXT: movapd %xmm1, %xmm0 3744; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] 3745; SSE-NEXT: movapd %xmm12, %xmm3 3746; SSE-NEXT: mulpd %xmm0, %xmm3 3747; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1,1] 3748; SSE-NEXT: movapd %xmm6, %xmm2 3749; SSE-NEXT: movapd %xmm6, %xmm12 3750; SSE-NEXT: mulpd %xmm1, %xmm2 3751; SSE-NEXT: addpd %xmm3, %xmm2 3752; SSE-NEXT: mulpd %xmm0, %xmm11 3753; SSE-NEXT: movapd %xmm13, %xmm6 3754; SSE-NEXT: movapd %xmm13, %xmm4 3755; SSE-NEXT: mulpd %xmm1, %xmm4 3756; SSE-NEXT: addpd %xmm11, %xmm4 3757; SSE-NEXT: mulpd %xmm0, %xmm14 3758; SSE-NEXT: movapd %xmm8, %xmm7 3759; SSE-NEXT: mulpd %xmm1, %xmm7 3760; SSE-NEXT: addpd %xmm14, %xmm7 3761; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload 3762; SSE-NEXT: mulpd %xmm8, %xmm0 3763; SSE-NEXT: movapd %xmm10, %xmm5 3764; SSE-NEXT: mulpd %xmm10, %xmm1 3765; SSE-NEXT: addpd %xmm0, %xmm1 3766; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 3767; SSE-NEXT: movapd %xmm0, %xmm9 3768; SSE-NEXT: unpcklpd {{.*#+}} xmm9 = xmm9[0],xmm0[0] 3769; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm3 3770; SSE-NEXT: mulpd %xmm9, %xmm3 3771; SSE-NEXT: addpd %xmm1, %xmm3 3772; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm1 3773; SSE-NEXT: mulpd %xmm9, %xmm1 3774; SSE-NEXT: addpd 
%xmm7, %xmm1 3775; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm7 3776; SSE-NEXT: mulpd %xmm9, %xmm7 3777; SSE-NEXT: addpd %xmm4, %xmm7 3778; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm9 3779; SSE-NEXT: addpd %xmm2, %xmm9 3780; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1] 3781; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm2 3782; SSE-NEXT: mulpd %xmm0, %xmm2 3783; SSE-NEXT: addpd %xmm9, %xmm2 3784; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm4 3785; SSE-NEXT: mulpd %xmm0, %xmm4 3786; SSE-NEXT: addpd %xmm7, %xmm4 3787; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm7 3788; SSE-NEXT: mulpd %xmm0, %xmm7 3789; SSE-NEXT: addpd %xmm1, %xmm7 3790; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm0 3791; SSE-NEXT: addpd %xmm3, %xmm0 3792; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm1 3793; SSE-NEXT: movapd %xmm1, %xmm9 3794; SSE-NEXT: unpcklpd {{.*#+}} xmm9 = xmm9[0],xmm1[0] 3795; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm3 3796; SSE-NEXT: mulpd %xmm9, %xmm3 3797; SSE-NEXT: addpd %xmm0, %xmm3 3798; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm10 3799; SSE-NEXT: mulpd %xmm9, %xmm10 3800; SSE-NEXT: addpd %xmm7, %xmm10 3801; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm7 3802; SSE-NEXT: mulpd %xmm9, %xmm7 3803; SSE-NEXT: addpd %xmm4, %xmm7 3804; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm9 3805; SSE-NEXT: addpd %xmm2, %xmm9 3806; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1,1] 3807; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 3808; SSE-NEXT: mulpd %xmm1, %xmm0 3809; SSE-NEXT: addpd %xmm9, %xmm0 3810; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm9 3811; SSE-NEXT: mulpd %xmm1, %xmm9 3812; SSE-NEXT: addpd %xmm7, %xmm9 3813; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm7 3814; SSE-NEXT: mulpd %xmm1, %xmm7 3815; SSE-NEXT: addpd %xmm10, %xmm7 3816; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm1 3817; SSE-NEXT: addpd %xmm3, %xmm1 3818; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm11 3819; SSE-NEXT: movapd %xmm11, %xmm3 3820; SSE-NEXT: unpcklpd {{.*#+}} xmm3 = xmm3[0],xmm11[0] 3821; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm2 3822; SSE-NEXT: mulpd %xmm3, %xmm2 3823; SSE-NEXT: addpd %xmm1, %xmm2 3824; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm1 3825; SSE-NEXT: mulpd %xmm3, %xmm1 3826; SSE-NEXT: addpd %xmm7, %xmm1 3827; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm7 3828; SSE-NEXT: mulpd %xmm3, %xmm7 3829; SSE-NEXT: addpd %xmm9, %xmm7 3830; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm3 3831; SSE-NEXT: addpd %xmm0, %xmm3 3832; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1,1] 3833; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 3834; SSE-NEXT: mulpd %xmm11, %xmm0 3835; SSE-NEXT: addpd %xmm3, %xmm0 3836; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3837; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 3838; SSE-NEXT: mulpd %xmm11, %xmm0 3839; SSE-NEXT: addpd %xmm7, %xmm0 3840; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3841; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 3842; SSE-NEXT: mulpd %xmm11, %xmm0 3843; SSE-NEXT: addpd %xmm1, %xmm0 3844; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3845; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm11 3846; SSE-NEXT: addpd %xmm2, %xmm11 3847; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm1 3848; SSE-NEXT: movapd %xmm1, %xmm0 3849; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] 3850; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload 3851; SSE-NEXT: movapd %xmm13, %xmm3 3852; SSE-NEXT: mulpd %xmm0, %xmm3 3853; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1,1] 3854; SSE-NEXT: movapd %xmm12, %xmm2 3855; SSE-NEXT: mulpd %xmm1, %xmm2 3856; SSE-NEXT: addpd %xmm3, %xmm2 3857; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload 3858; 
SSE-NEXT: movapd %xmm14, %xmm3 3859; SSE-NEXT: mulpd %xmm0, %xmm3 3860; SSE-NEXT: movapd %xmm6, %xmm7 3861; SSE-NEXT: mulpd %xmm1, %xmm7 3862; SSE-NEXT: addpd %xmm3, %xmm7 3863; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload 3864; SSE-NEXT: movapd %xmm4, %xmm3 3865; SSE-NEXT: mulpd %xmm0, %xmm3 3866; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload 3867; SSE-NEXT: movapd %xmm6, %xmm9 3868; SSE-NEXT: mulpd %xmm1, %xmm9 3869; SSE-NEXT: addpd %xmm3, %xmm9 3870; SSE-NEXT: mulpd %xmm8, %xmm0 3871; SSE-NEXT: mulpd %xmm5, %xmm1 3872; SSE-NEXT: addpd %xmm0, %xmm1 3873; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 3874; SSE-NEXT: movapd %xmm0, %xmm10 3875; SSE-NEXT: unpcklpd {{.*#+}} xmm10 = xmm10[0],xmm0[0] 3876; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm3 3877; SSE-NEXT: mulpd %xmm10, %xmm3 3878; SSE-NEXT: addpd %xmm1, %xmm3 3879; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm12 3880; SSE-NEXT: mulpd %xmm10, %xmm12 3881; SSE-NEXT: addpd %xmm9, %xmm12 3882; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm9 3883; SSE-NEXT: mulpd %xmm10, %xmm9 3884; SSE-NEXT: addpd %xmm7, %xmm9 3885; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm10 3886; SSE-NEXT: addpd %xmm2, %xmm10 3887; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1] 3888; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm1 3889; SSE-NEXT: mulpd %xmm0, %xmm1 3890; SSE-NEXT: addpd %xmm10, %xmm1 3891; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm10 3892; SSE-NEXT: mulpd %xmm0, %xmm10 3893; SSE-NEXT: addpd %xmm9, %xmm10 3894; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm9 3895; SSE-NEXT: mulpd %xmm0, %xmm9 3896; SSE-NEXT: addpd %xmm12, %xmm9 3897; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm2 3898; SSE-NEXT: mulpd %xmm2, %xmm0 3899; SSE-NEXT: addpd %xmm3, %xmm0 3900; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm7 3901; SSE-NEXT: movapd %xmm7, %xmm3 3902; SSE-NEXT: unpcklpd {{.*#+}} xmm3 = xmm3[0],xmm7[0] 3903; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm2 3904; SSE-NEXT: mulpd %xmm3, %xmm2 3905; SSE-NEXT: addpd %xmm0, %xmm2 3906; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm12 3907; SSE-NEXT: mulpd %xmm3, %xmm12 3908; SSE-NEXT: addpd %xmm9, %xmm12 3909; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm9 3910; SSE-NEXT: mulpd %xmm3, %xmm9 3911; SSE-NEXT: addpd %xmm10, %xmm9 3912; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 3913; SSE-NEXT: mulpd %xmm0, %xmm3 3914; SSE-NEXT: addpd %xmm1, %xmm3 3915; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1,1] 3916; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 3917; SSE-NEXT: mulpd %xmm7, %xmm0 3918; SSE-NEXT: addpd %xmm3, %xmm0 3919; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm10 3920; SSE-NEXT: mulpd %xmm7, %xmm10 3921; SSE-NEXT: addpd %xmm9, %xmm10 3922; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm9 3923; SSE-NEXT: mulpd %xmm7, %xmm9 3924; SSE-NEXT: addpd %xmm12, %xmm9 3925; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm7 3926; SSE-NEXT: addpd %xmm2, %xmm7 3927; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm8 3928; SSE-NEXT: movapd %xmm8, %xmm2 3929; SSE-NEXT: unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm8[0] 3930; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm1 3931; SSE-NEXT: mulpd %xmm2, %xmm1 3932; SSE-NEXT: addpd %xmm7, %xmm1 3933; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm12 3934; SSE-NEXT: mulpd %xmm2, %xmm12 3935; SSE-NEXT: addpd %xmm9, %xmm12 3936; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm7 3937; SSE-NEXT: mulpd %xmm2, %xmm7 3938; SSE-NEXT: addpd %xmm10, %xmm7 3939; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm3 3940; SSE-NEXT: mulpd %xmm3, %xmm2 3941; SSE-NEXT: addpd %xmm0, %xmm2 3942; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1,1] 3943; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 3944; SSE-NEXT: mulpd %xmm8, %xmm0 3945; 
SSE-NEXT: addpd %xmm2, %xmm0 3946; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3947; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 3948; SSE-NEXT: mulpd %xmm8, %xmm0 3949; SSE-NEXT: addpd %xmm7, %xmm0 3950; SSE-NEXT: movapd %xmm0, (%rsp) # 16-byte Spill 3951; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm9 3952; SSE-NEXT: mulpd %xmm8, %xmm9 3953; SSE-NEXT: addpd %xmm12, %xmm9 3954; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 3955; SSE-NEXT: mulpd %xmm0, %xmm8 3956; SSE-NEXT: addpd %xmm1, %xmm8 3957; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm1 3958; SSE-NEXT: movapd %xmm1, %xmm0 3959; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] 3960; SSE-NEXT: movapd %xmm13, %xmm12 3961; SSE-NEXT: mulpd %xmm0, %xmm12 3962; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1,1] 3963; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload 3964; SSE-NEXT: mulpd %xmm1, %xmm3 3965; SSE-NEXT: addpd %xmm12, %xmm3 3966; SSE-NEXT: movapd %xmm14, %xmm12 3967; SSE-NEXT: movapd %xmm14, %xmm5 3968; SSE-NEXT: mulpd %xmm0, %xmm12 3969; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload 3970; SSE-NEXT: mulpd %xmm1, %xmm13 3971; SSE-NEXT: addpd %xmm12, %xmm13 3972; SSE-NEXT: mulpd %xmm0, %xmm4 3973; SSE-NEXT: movapd %xmm6, %xmm14 3974; SSE-NEXT: mulpd %xmm1, %xmm14 3975; SSE-NEXT: addpd %xmm4, %xmm14 3976; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload 3977; SSE-NEXT: mulpd %xmm6, %xmm0 3978; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload 3979; SSE-NEXT: mulpd %xmm10, %xmm1 3980; SSE-NEXT: addpd %xmm0, %xmm1 3981; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm2 3982; SSE-NEXT: movapd %xmm2, %xmm0 3983; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0] 3984; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm12 3985; SSE-NEXT: mulpd %xmm0, %xmm12 3986; SSE-NEXT: addpd %xmm1, %xmm12 3987; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm1 3988; SSE-NEXT: mulpd %xmm0, %xmm1 3989; SSE-NEXT: addpd %xmm14, %xmm1 3990; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm14 3991; SSE-NEXT: mulpd %xmm0, %xmm14 3992; SSE-NEXT: addpd %xmm13, %xmm14 3993; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm0 3994; SSE-NEXT: addpd %xmm3, %xmm0 3995; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1,1] 3996; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm13 3997; SSE-NEXT: mulpd %xmm2, %xmm13 3998; SSE-NEXT: addpd %xmm0, %xmm13 3999; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 4000; SSE-NEXT: mulpd %xmm2, %xmm0 4001; SSE-NEXT: addpd %xmm14, %xmm0 4002; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm14 4003; SSE-NEXT: mulpd %xmm2, %xmm14 4004; SSE-NEXT: addpd %xmm1, %xmm14 4005; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm1 4006; SSE-NEXT: mulpd %xmm1, %xmm2 4007; SSE-NEXT: addpd %xmm12, %xmm2 4008; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm12 4009; SSE-NEXT: movapd %xmm12, %xmm1 4010; SSE-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm12[0] 4011; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm3 4012; SSE-NEXT: mulpd %xmm1, %xmm3 4013; SSE-NEXT: addpd %xmm2, %xmm3 4014; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm2 4015; SSE-NEXT: mulpd %xmm1, %xmm2 4016; SSE-NEXT: addpd %xmm14, %xmm2 4017; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm14 4018; SSE-NEXT: mulpd %xmm1, %xmm14 4019; SSE-NEXT: addpd %xmm0, %xmm14 4020; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 4021; SSE-NEXT: mulpd %xmm0, %xmm1 4022; SSE-NEXT: addpd %xmm13, %xmm1 4023; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1,1] 4024; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm4 4025; SSE-NEXT: mulpd %xmm12, %xmm4 4026; SSE-NEXT: addpd %xmm1, %xmm4 4027; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm13 4028; SSE-NEXT: mulpd %xmm12, %xmm13 4029; 
SSE-NEXT: addpd %xmm14, %xmm13 4030; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm14 4031; SSE-NEXT: mulpd %xmm12, %xmm14 4032; SSE-NEXT: addpd %xmm2, %xmm14 4033; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm12 4034; SSE-NEXT: addpd %xmm3, %xmm12 4035; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm2 4036; SSE-NEXT: movapd %xmm2, %xmm3 4037; SSE-NEXT: unpcklpd {{.*#+}} xmm3 = xmm3[0],xmm2[0] 4038; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm1 4039; SSE-NEXT: mulpd %xmm3, %xmm1 4040; SSE-NEXT: addpd %xmm12, %xmm1 4041; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm12 4042; SSE-NEXT: mulpd %xmm3, %xmm12 4043; SSE-NEXT: addpd %xmm14, %xmm12 4044; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 4045; SSE-NEXT: mulpd %xmm3, %xmm0 4046; SSE-NEXT: addpd %xmm13, %xmm0 4047; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm7 4048; SSE-NEXT: mulpd %xmm7, %xmm3 4049; SSE-NEXT: addpd %xmm4, %xmm3 4050; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1,1] 4051; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm14 4052; SSE-NEXT: mulpd %xmm2, %xmm14 4053; SSE-NEXT: addpd %xmm3, %xmm14 4054; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm13 4055; SSE-NEXT: mulpd %xmm2, %xmm13 4056; SSE-NEXT: addpd %xmm0, %xmm13 4057; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm7 4058; SSE-NEXT: mulpd %xmm2, %xmm7 4059; SSE-NEXT: addpd %xmm12, %xmm7 4060; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 4061; SSE-NEXT: mulpd %xmm0, %xmm2 4062; SSE-NEXT: addpd %xmm1, %xmm2 4063; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm1 4064; SSE-NEXT: movapd %xmm1, %xmm0 4065; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] 4066; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload 4067; SSE-NEXT: mulpd %xmm0, %xmm12 4068; SSE-NEXT: mulpd %xmm0, %xmm5 4069; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload 4070; SSE-NEXT: mulpd %xmm0, %xmm3 4071; SSE-NEXT: mulpd %xmm6, %xmm0 4072; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1,1] 4073; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload 4074; SSE-NEXT: mulpd %xmm1, %xmm4 4075; SSE-NEXT: addpd %xmm12, %xmm4 4076; SSE-NEXT: movapd %xmm4, %xmm12 4077; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload 4078; SSE-NEXT: mulpd %xmm1, %xmm4 4079; SSE-NEXT: addpd %xmm5, %xmm4 4080; SSE-NEXT: movapd %xmm4, %xmm5 4081; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload 4082; SSE-NEXT: mulpd %xmm1, %xmm4 4083; SSE-NEXT: addpd %xmm3, %xmm4 4084; SSE-NEXT: movapd %xmm4, %xmm3 4085; SSE-NEXT: mulpd %xmm10, %xmm1 4086; SSE-NEXT: addpd %xmm0, %xmm1 4087; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 4088; SSE-NEXT: movapd %xmm0, %xmm4 4089; SSE-NEXT: unpcklpd {{.*#+}} xmm4 = xmm4[0],xmm0[0] 4090; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm10 4091; SSE-NEXT: mulpd %xmm4, %xmm10 4092; SSE-NEXT: addpd %xmm1, %xmm10 4093; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm1 4094; SSE-NEXT: mulpd %xmm4, %xmm1 4095; SSE-NEXT: addpd %xmm3, %xmm1 4096; SSE-NEXT: movapd %xmm1, %xmm3 4097; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm1 4098; SSE-NEXT: mulpd %xmm4, %xmm1 4099; SSE-NEXT: addpd %xmm5, %xmm1 4100; SSE-NEXT: movapd %xmm1, %xmm5 4101; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm4 4102; SSE-NEXT: addpd %xmm12, %xmm4 4103; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1] 4104; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm1 4105; SSE-NEXT: mulpd %xmm0, %xmm1 4106; SSE-NEXT: addpd %xmm4, %xmm1 4107; SSE-NEXT: movapd %xmm1, %xmm12 4108; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm6 4109; SSE-NEXT: mulpd %xmm0, %xmm6 4110; SSE-NEXT: addpd %xmm5, %xmm6 4111; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm1 4112; SSE-NEXT: mulpd %xmm0, %xmm1 4113; SSE-NEXT: addpd %xmm3, %xmm1 4114; 
SSE-NEXT: movapd %xmm1, %xmm3 4115; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm0 4116; SSE-NEXT: addpd %xmm10, %xmm0 4117; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm1 4118; SSE-NEXT: movapd %xmm1, %xmm4 4119; SSE-NEXT: unpcklpd {{.*#+}} xmm4 = xmm4[0],xmm1[0] 4120; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm5 4121; SSE-NEXT: mulpd %xmm4, %xmm5 4122; SSE-NEXT: addpd %xmm0, %xmm5 4123; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 4124; SSE-NEXT: mulpd %xmm4, %xmm0 4125; SSE-NEXT: addpd %xmm3, %xmm0 4126; SSE-NEXT: movapd %xmm0, %xmm10 4127; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 4128; SSE-NEXT: mulpd %xmm4, %xmm0 4129; SSE-NEXT: addpd %xmm6, %xmm0 4130; SSE-NEXT: movapd %xmm0, %xmm6 4131; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm4 4132; SSE-NEXT: addpd %xmm12, %xmm4 4133; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1,1] 4134; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 4135; SSE-NEXT: mulpd %xmm1, %xmm0 4136; SSE-NEXT: addpd %xmm4, %xmm0 4137; SSE-NEXT: movapd %xmm0, %xmm3 4138; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 4139; SSE-NEXT: mulpd %xmm1, %xmm0 4140; SSE-NEXT: addpd %xmm6, %xmm0 4141; SSE-NEXT: movapd %xmm0, %xmm6 4142; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 4143; SSE-NEXT: mulpd %xmm1, %xmm0 4144; SSE-NEXT: addpd %xmm10, %xmm0 4145; SSE-NEXT: movapd %xmm0, %xmm10 4146; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm1 4147; SSE-NEXT: addpd %xmm5, %xmm1 4148; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 4149; SSE-NEXT: movapd %xmm0, %xmm4 4150; SSE-NEXT: unpcklpd {{.*#+}} xmm4 = xmm4[0],xmm0[0] 4151; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm5 4152; SSE-NEXT: mulpd %xmm4, %xmm5 4153; SSE-NEXT: addpd %xmm1, %xmm5 4154; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm1 4155; SSE-NEXT: mulpd %xmm4, %xmm1 4156; SSE-NEXT: addpd %xmm10, %xmm1 4157; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm10 4158; SSE-NEXT: mulpd %xmm4, %xmm10 4159; SSE-NEXT: addpd %xmm6, %xmm10 4160; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm4 4161; SSE-NEXT: addpd %xmm3, %xmm4 4162; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1] 4163; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm3 4164; SSE-NEXT: mulpd %xmm0, %xmm3 4165; SSE-NEXT: addpd %xmm4, %xmm3 4166; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm4 4167; SSE-NEXT: mulpd %xmm0, %xmm4 4168; SSE-NEXT: addpd %xmm10, %xmm4 4169; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm6 4170; SSE-NEXT: mulpd %xmm0, %xmm6 4171; SSE-NEXT: addpd %xmm1, %xmm6 4172; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm0 4173; SSE-NEXT: addpd %xmm5, %xmm0 4174; SSE-NEXT: movapd %xmm3, 496(%rdi) 4175; SSE-NEXT: movapd %xmm4, 480(%rdi) 4176; SSE-NEXT: movapd %xmm6, 464(%rdi) 4177; SSE-NEXT: movapd %xmm0, 448(%rdi) 4178; SSE-NEXT: movapd %xmm14, 432(%rdi) 4179; SSE-NEXT: movapd %xmm13, 416(%rdi) 4180; SSE-NEXT: movapd %xmm7, 400(%rdi) 4181; SSE-NEXT: movapd %xmm2, 384(%rdi) 4182; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 4183; SSE-NEXT: movaps %xmm0, 368(%rdi) 4184; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload 4185; SSE-NEXT: movaps %xmm0, 352(%rdi) 4186; SSE-NEXT: movapd %xmm9, 336(%rdi) 4187; SSE-NEXT: movapd %xmm8, 320(%rdi) 4188; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 4189; SSE-NEXT: movaps %xmm0, 304(%rdi) 4190; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 4191; SSE-NEXT: movaps %xmm0, 288(%rdi) 4192; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 4193; SSE-NEXT: movaps %xmm0, 272(%rdi) 4194; SSE-NEXT: movapd %xmm11, 256(%rdi) 4195; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 4196; SSE-NEXT: movaps %xmm0, 240(%rdi) 4197; SSE-NEXT: movaps 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 4198; SSE-NEXT: movaps %xmm0, 224(%rdi) 4199; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 4200; SSE-NEXT: movaps %xmm0, 208(%rdi) 4201; SSE-NEXT: movapd %xmm15, 192(%rdi) 4202; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 4203; SSE-NEXT: movaps %xmm0, 176(%rdi) 4204; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 4205; SSE-NEXT: movaps %xmm0, 160(%rdi) 4206; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 4207; SSE-NEXT: movaps %xmm0, 144(%rdi) 4208; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 4209; SSE-NEXT: movaps %xmm0, 128(%rdi) 4210; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 4211; SSE-NEXT: movaps %xmm0, 112(%rdi) 4212; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 4213; SSE-NEXT: movaps %xmm0, 96(%rdi) 4214; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 4215; SSE-NEXT: movaps %xmm0, 80(%rdi) 4216; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 4217; SSE-NEXT: movaps %xmm0, 64(%rdi) 4218; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 4219; SSE-NEXT: movaps %xmm0, 48(%rdi) 4220; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 4221; SSE-NEXT: movaps %xmm0, 32(%rdi) 4222; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 4223; SSE-NEXT: movaps %xmm0, 16(%rdi) 4224; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 4225; SSE-NEXT: movaps %xmm0, (%rdi) 4226; SSE-NEXT: addq $328, %rsp # imm = 0x148 4227; SSE-NEXT: retq 4228; 4229; AVX1-LABEL: test_mul8x8_f64: 4230; AVX1: # %bb.0: # %entry 4231; AVX1-NEXT: pushq %rbp 4232; AVX1-NEXT: movq %rsp, %rbp 4233; AVX1-NEXT: andq $-32, %rsp 4234; AVX1-NEXT: subq $448, %rsp # imm = 0x1C0 4235; AVX1-NEXT: vmovapd %ymm2, %ymm12 4236; AVX1-NEXT: vmovapd %ymm0, (%rsp) # 32-byte Spill 4237; AVX1-NEXT: movq %rdi, %rax 4238; AVX1-NEXT: vmovapd 144(%rbp), %ymm13 4239; AVX1-NEXT: vmovapd 112(%rbp), %ymm14 4240; AVX1-NEXT: vbroadcastsd 272(%rbp), %ymm10 4241; AVX1-NEXT: vmulpd %ymm1, %ymm10, %ymm8 4242; AVX1-NEXT: vmovapd %ymm1, %ymm9 4243; AVX1-NEXT: vmulpd %ymm0, %ymm10, %ymm0 4244; AVX1-NEXT: vbroadcastsd 280(%rbp), %ymm10 4245; AVX1-NEXT: vmulpd %ymm3, %ymm10, %ymm11 4246; AVX1-NEXT: vaddpd %ymm11, %ymm8, %ymm1 4247; AVX1-NEXT: vmulpd %ymm2, %ymm10, %ymm10 4248; AVX1-NEXT: vaddpd %ymm0, %ymm10, %ymm0 4249; AVX1-NEXT: vbroadcastsd 288(%rbp), %ymm10 4250; AVX1-NEXT: vmulpd %ymm4, %ymm10, %ymm11 4251; AVX1-NEXT: vaddpd %ymm0, %ymm11, %ymm0 4252; AVX1-NEXT: vmulpd %ymm5, %ymm10, %ymm10 4253; AVX1-NEXT: vaddpd %ymm1, %ymm10, %ymm1 4254; AVX1-NEXT: vbroadcastsd 296(%rbp), %ymm10 4255; AVX1-NEXT: vmulpd %ymm7, %ymm10, %ymm11 4256; AVX1-NEXT: vaddpd %ymm1, %ymm11, %ymm1 4257; AVX1-NEXT: vmulpd %ymm6, %ymm10, %ymm10 4258; AVX1-NEXT: vaddpd %ymm0, %ymm10, %ymm0 4259; AVX1-NEXT: vbroadcastsd 304(%rbp), %ymm10 4260; AVX1-NEXT: vmulpd 16(%rbp), %ymm10, %ymm11 4261; AVX1-NEXT: vaddpd %ymm0, %ymm11, %ymm0 4262; AVX1-NEXT: vmulpd 48(%rbp), %ymm10, %ymm10 4263; AVX1-NEXT: vaddpd %ymm1, %ymm10, %ymm1 4264; AVX1-NEXT: vbroadcastsd 312(%rbp), %ymm10 4265; AVX1-NEXT: vmulpd %ymm10, %ymm14, %ymm11 4266; AVX1-NEXT: vaddpd %ymm1, %ymm11, %ymm1 4267; AVX1-NEXT: vmulpd 80(%rbp), %ymm10, %ymm10 4268; AVX1-NEXT: vaddpd %ymm0, %ymm10, %ymm0 4269; AVX1-NEXT: vbroadcastsd 320(%rbp), %ymm10 4270; AVX1-NEXT: vmulpd %ymm10, %ymm13, %ymm11 4271; AVX1-NEXT: vaddpd %ymm0, %ymm11, %ymm0 4272; 
AVX1-NEXT: vmulpd 176(%rbp), %ymm10, %ymm10 4273; AVX1-NEXT: vaddpd %ymm1, %ymm10, %ymm1 4274; AVX1-NEXT: vbroadcastsd 328(%rbp), %ymm10 4275; AVX1-NEXT: vmulpd 240(%rbp), %ymm10, %ymm11 4276; AVX1-NEXT: vaddpd %ymm1, %ymm11, %ymm1 4277; AVX1-NEXT: vmovapd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4278; AVX1-NEXT: vmulpd 208(%rbp), %ymm10, %ymm1 4279; AVX1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 4280; AVX1-NEXT: vmovapd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4281; AVX1-NEXT: vbroadcastsd 336(%rbp), %ymm0 4282; AVX1-NEXT: vmulpd %ymm0, %ymm9, %ymm1 4283; AVX1-NEXT: vbroadcastsd 344(%rbp), %ymm10 4284; AVX1-NEXT: vmulpd %ymm3, %ymm10, %ymm11 4285; AVX1-NEXT: vmovapd %ymm3, %ymm8 4286; AVX1-NEXT: vaddpd %ymm1, %ymm11, %ymm1 4287; AVX1-NEXT: vmovapd (%rsp), %ymm15 # 32-byte Reload 4288; AVX1-NEXT: vmulpd %ymm0, %ymm15, %ymm0 4289; AVX1-NEXT: vmulpd %ymm2, %ymm10, %ymm10 4290; AVX1-NEXT: vaddpd %ymm0, %ymm10, %ymm0 4291; AVX1-NEXT: vbroadcastsd 352(%rbp), %ymm10 4292; AVX1-NEXT: vmulpd %ymm4, %ymm10, %ymm11 4293; AVX1-NEXT: vaddpd %ymm0, %ymm11, %ymm0 4294; AVX1-NEXT: vmulpd %ymm5, %ymm10, %ymm10 4295; AVX1-NEXT: vmovapd %ymm5, %ymm3 4296; AVX1-NEXT: vaddpd %ymm1, %ymm10, %ymm1 4297; AVX1-NEXT: vbroadcastsd 360(%rbp), %ymm10 4298; AVX1-NEXT: vmulpd %ymm7, %ymm10, %ymm11 4299; AVX1-NEXT: vaddpd %ymm1, %ymm11, %ymm1 4300; AVX1-NEXT: vmulpd %ymm6, %ymm10, %ymm10 4301; AVX1-NEXT: vaddpd %ymm0, %ymm10, %ymm0 4302; AVX1-NEXT: vbroadcastsd 368(%rbp), %ymm10 4303; AVX1-NEXT: vmovapd 16(%rbp), %ymm2 4304; AVX1-NEXT: vmulpd %ymm2, %ymm10, %ymm11 4305; AVX1-NEXT: vaddpd %ymm0, %ymm11, %ymm0 4306; AVX1-NEXT: vmulpd 48(%rbp), %ymm10, %ymm10 4307; AVX1-NEXT: vaddpd %ymm1, %ymm10, %ymm1 4308; AVX1-NEXT: vbroadcastsd 376(%rbp), %ymm10 4309; AVX1-NEXT: vmulpd %ymm10, %ymm14, %ymm11 4310; AVX1-NEXT: vaddpd %ymm1, %ymm11, %ymm1 4311; AVX1-NEXT: vmovapd 80(%rbp), %ymm2 4312; AVX1-NEXT: vmulpd %ymm2, %ymm10, %ymm10 4313; AVX1-NEXT: vaddpd %ymm0, %ymm10, %ymm0 4314; AVX1-NEXT: vbroadcastsd 384(%rbp), %ymm10 4315; AVX1-NEXT: vmulpd %ymm10, %ymm13, %ymm11 4316; AVX1-NEXT: vaddpd %ymm0, %ymm11, %ymm0 4317; AVX1-NEXT: vmovapd 176(%rbp), %ymm14 4318; AVX1-NEXT: vmulpd %ymm10, %ymm14, %ymm10 4319; AVX1-NEXT: vaddpd %ymm1, %ymm10, %ymm1 4320; AVX1-NEXT: vbroadcastsd 392(%rbp), %ymm10 4321; AVX1-NEXT: vmovapd 240(%rbp), %ymm2 4322; AVX1-NEXT: vmulpd %ymm2, %ymm10, %ymm11 4323; AVX1-NEXT: vaddpd %ymm1, %ymm11, %ymm1 4324; AVX1-NEXT: vmovapd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4325; AVX1-NEXT: vmulpd 208(%rbp), %ymm10, %ymm1 4326; AVX1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 4327; AVX1-NEXT: vmovapd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4328; AVX1-NEXT: vbroadcastsd 400(%rbp), %ymm0 4329; AVX1-NEXT: vmulpd %ymm0, %ymm9, %ymm1 4330; AVX1-NEXT: vbroadcastsd 408(%rbp), %ymm10 4331; AVX1-NEXT: vmovapd %ymm8, %ymm5 4332; AVX1-NEXT: vmulpd %ymm10, %ymm8, %ymm11 4333; AVX1-NEXT: vaddpd %ymm1, %ymm11, %ymm1 4334; AVX1-NEXT: vmulpd %ymm0, %ymm15, %ymm0 4335; AVX1-NEXT: vmulpd %ymm10, %ymm12, %ymm10 4336; AVX1-NEXT: vaddpd %ymm0, %ymm10, %ymm0 4337; AVX1-NEXT: vbroadcastsd 416(%rbp), %ymm10 4338; AVX1-NEXT: vmulpd %ymm4, %ymm10, %ymm11 4339; AVX1-NEXT: vaddpd %ymm0, %ymm11, %ymm0 4340; AVX1-NEXT: vmulpd %ymm3, %ymm10, %ymm10 4341; AVX1-NEXT: vmovapd %ymm3, %ymm2 4342; AVX1-NEXT: vaddpd %ymm1, %ymm10, %ymm1 4343; AVX1-NEXT: vbroadcastsd 424(%rbp), %ymm10 4344; AVX1-NEXT: vmulpd %ymm7, %ymm10, %ymm11 4345; AVX1-NEXT: vaddpd %ymm1, %ymm11, %ymm1 4346; AVX1-NEXT: vmulpd %ymm6, %ymm10, %ymm10 4347; AVX1-NEXT: vaddpd 
%ymm0, %ymm10, %ymm0 4348; AVX1-NEXT: vbroadcastsd 432(%rbp), %ymm10 4349; AVX1-NEXT: vmulpd 16(%rbp), %ymm10, %ymm11 4350; AVX1-NEXT: vaddpd %ymm0, %ymm11, %ymm0 4351; AVX1-NEXT: vmulpd 48(%rbp), %ymm10, %ymm10 4352; AVX1-NEXT: vaddpd %ymm1, %ymm10, %ymm1 4353; AVX1-NEXT: vbroadcastsd 440(%rbp), %ymm10 4354; AVX1-NEXT: vmulpd 112(%rbp), %ymm10, %ymm11 4355; AVX1-NEXT: vaddpd %ymm1, %ymm11, %ymm1 4356; AVX1-NEXT: vmulpd 80(%rbp), %ymm10, %ymm10 4357; AVX1-NEXT: vaddpd %ymm0, %ymm10, %ymm0 4358; AVX1-NEXT: vbroadcastsd 448(%rbp), %ymm10 4359; AVX1-NEXT: vmulpd %ymm10, %ymm13, %ymm11 4360; AVX1-NEXT: vaddpd %ymm0, %ymm11, %ymm0 4361; AVX1-NEXT: vmulpd %ymm10, %ymm14, %ymm10 4362; AVX1-NEXT: vaddpd %ymm1, %ymm10, %ymm1 4363; AVX1-NEXT: vbroadcastsd 456(%rbp), %ymm10 4364; AVX1-NEXT: vmulpd 240(%rbp), %ymm10, %ymm11 4365; AVX1-NEXT: vaddpd %ymm1, %ymm11, %ymm1 4366; AVX1-NEXT: vmovapd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4367; AVX1-NEXT: vmulpd 208(%rbp), %ymm10, %ymm1 4368; AVX1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 4369; AVX1-NEXT: vmovapd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4370; AVX1-NEXT: vbroadcastsd 464(%rbp), %ymm0 4371; AVX1-NEXT: vmulpd %ymm0, %ymm9, %ymm1 4372; AVX1-NEXT: vmovapd %ymm9, %ymm13 4373; AVX1-NEXT: vbroadcastsd 472(%rbp), %ymm10 4374; AVX1-NEXT: vmulpd %ymm10, %ymm8, %ymm11 4375; AVX1-NEXT: vaddpd %ymm1, %ymm11, %ymm1 4376; AVX1-NEXT: vmulpd %ymm0, %ymm15, %ymm0 4377; AVX1-NEXT: vmovapd %ymm15, %ymm9 4378; AVX1-NEXT: vmulpd %ymm10, %ymm12, %ymm10 4379; AVX1-NEXT: vaddpd %ymm0, %ymm10, %ymm0 4380; AVX1-NEXT: vbroadcastsd 480(%rbp), %ymm10 4381; AVX1-NEXT: vmulpd %ymm4, %ymm10, %ymm11 4382; AVX1-NEXT: vmovapd %ymm4, %ymm3 4383; AVX1-NEXT: vaddpd %ymm0, %ymm11, %ymm0 4384; AVX1-NEXT: vmovapd %ymm2, %ymm15 4385; AVX1-NEXT: vmulpd %ymm2, %ymm10, %ymm10 4386; AVX1-NEXT: vaddpd %ymm1, %ymm10, %ymm1 4387; AVX1-NEXT: vbroadcastsd 488(%rbp), %ymm10 4388; AVX1-NEXT: vmovapd %ymm7, %ymm8 4389; AVX1-NEXT: vmulpd %ymm7, %ymm10, %ymm11 4390; AVX1-NEXT: vaddpd %ymm1, %ymm11, %ymm1 4391; AVX1-NEXT: vmovapd %ymm6, %ymm7 4392; AVX1-NEXT: vmulpd %ymm6, %ymm10, %ymm10 4393; AVX1-NEXT: vaddpd %ymm0, %ymm10, %ymm0 4394; AVX1-NEXT: vbroadcastsd 496(%rbp), %ymm10 4395; AVX1-NEXT: vmulpd 16(%rbp), %ymm10, %ymm11 4396; AVX1-NEXT: vaddpd %ymm0, %ymm11, %ymm0 4397; AVX1-NEXT: vmovapd 48(%rbp), %ymm4 4398; AVX1-NEXT: vmulpd %ymm4, %ymm10, %ymm10 4399; AVX1-NEXT: vaddpd %ymm1, %ymm10, %ymm1 4400; AVX1-NEXT: vbroadcastsd 504(%rbp), %ymm10 4401; AVX1-NEXT: vmovapd 112(%rbp), %ymm2 4402; AVX1-NEXT: vmulpd %ymm2, %ymm10, %ymm11 4403; AVX1-NEXT: vaddpd %ymm1, %ymm11, %ymm1 4404; AVX1-NEXT: vmovapd 80(%rbp), %ymm14 4405; AVX1-NEXT: vmulpd %ymm10, %ymm14, %ymm10 4406; AVX1-NEXT: vaddpd %ymm0, %ymm10, %ymm0 4407; AVX1-NEXT: vbroadcastsd 512(%rbp), %ymm10 4408; AVX1-NEXT: vmulpd 144(%rbp), %ymm10, %ymm11 4409; AVX1-NEXT: vaddpd %ymm0, %ymm11, %ymm0 4410; AVX1-NEXT: vmovapd 176(%rbp), %ymm2 4411; AVX1-NEXT: vmulpd %ymm2, %ymm10, %ymm10 4412; AVX1-NEXT: vaddpd %ymm1, %ymm10, %ymm1 4413; AVX1-NEXT: vbroadcastsd 520(%rbp), %ymm10 4414; AVX1-NEXT: vmulpd 240(%rbp), %ymm10, %ymm11 4415; AVX1-NEXT: vaddpd %ymm1, %ymm11, %ymm1 4416; AVX1-NEXT: vmovapd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4417; AVX1-NEXT: vmulpd 208(%rbp), %ymm10, %ymm1 4418; AVX1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 4419; AVX1-NEXT: vmovapd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4420; AVX1-NEXT: vbroadcastsd 528(%rbp), %ymm0 4421; AVX1-NEXT: vmulpd %ymm0, %ymm13, %ymm1 4422; AVX1-NEXT: vbroadcastsd 536(%rbp), %ymm10 
4423; AVX1-NEXT: vmulpd %ymm5, %ymm10, %ymm11 4424; AVX1-NEXT: vmovapd %ymm5, %ymm6 4425; AVX1-NEXT: vaddpd %ymm1, %ymm11, %ymm1 4426; AVX1-NEXT: vmulpd %ymm0, %ymm9, %ymm0 4427; AVX1-NEXT: vmulpd %ymm10, %ymm12, %ymm10 4428; AVX1-NEXT: vmovapd %ymm12, %ymm5 4429; AVX1-NEXT: vaddpd %ymm0, %ymm10, %ymm0 4430; AVX1-NEXT: vbroadcastsd 544(%rbp), %ymm10 4431; AVX1-NEXT: vmulpd %ymm3, %ymm10, %ymm11 4432; AVX1-NEXT: vmovapd %ymm3, %ymm12 4433; AVX1-NEXT: vaddpd %ymm0, %ymm11, %ymm0 4434; AVX1-NEXT: vmulpd %ymm10, %ymm15, %ymm10 4435; AVX1-NEXT: vaddpd %ymm1, %ymm10, %ymm1 4436; AVX1-NEXT: vbroadcastsd 552(%rbp), %ymm10 4437; AVX1-NEXT: vmulpd %ymm10, %ymm8, %ymm11 4438; AVX1-NEXT: vaddpd %ymm1, %ymm11, %ymm1 4439; AVX1-NEXT: vmulpd %ymm7, %ymm10, %ymm10 4440; AVX1-NEXT: vaddpd %ymm0, %ymm10, %ymm0 4441; AVX1-NEXT: vbroadcastsd 560(%rbp), %ymm10 4442; AVX1-NEXT: vmulpd 16(%rbp), %ymm10, %ymm11 4443; AVX1-NEXT: vaddpd %ymm0, %ymm11, %ymm0 4444; AVX1-NEXT: vmulpd %ymm4, %ymm10, %ymm10 4445; AVX1-NEXT: vmovapd %ymm4, %ymm3 4446; AVX1-NEXT: vaddpd %ymm1, %ymm10, %ymm1 4447; AVX1-NEXT: vbroadcastsd 568(%rbp), %ymm10 4448; AVX1-NEXT: vmulpd 112(%rbp), %ymm10, %ymm11 4449; AVX1-NEXT: vaddpd %ymm1, %ymm11, %ymm1 4450; AVX1-NEXT: vmulpd %ymm10, %ymm14, %ymm10 4451; AVX1-NEXT: vaddpd %ymm0, %ymm10, %ymm0 4452; AVX1-NEXT: vbroadcastsd 576(%rbp), %ymm10 4453; AVX1-NEXT: vmovapd 144(%rbp), %ymm4 4454; AVX1-NEXT: vmulpd %ymm4, %ymm10, %ymm11 4455; AVX1-NEXT: vaddpd %ymm0, %ymm11, %ymm0 4456; AVX1-NEXT: vmulpd %ymm2, %ymm10, %ymm10 4457; AVX1-NEXT: vaddpd %ymm1, %ymm10, %ymm1 4458; AVX1-NEXT: vbroadcastsd 584(%rbp), %ymm10 4459; AVX1-NEXT: vmovapd 240(%rbp), %ymm14 4460; AVX1-NEXT: vmulpd %ymm10, %ymm14, %ymm11 4461; AVX1-NEXT: vaddpd %ymm1, %ymm11, %ymm1 4462; AVX1-NEXT: vmovapd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4463; AVX1-NEXT: vmovapd 208(%rbp), %ymm2 4464; AVX1-NEXT: vmulpd %ymm2, %ymm10, %ymm1 4465; AVX1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 4466; AVX1-NEXT: vmovapd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4467; AVX1-NEXT: vbroadcastsd 592(%rbp), %ymm0 4468; AVX1-NEXT: vmulpd %ymm0, %ymm13, %ymm1 4469; AVX1-NEXT: vbroadcastsd 600(%rbp), %ymm10 4470; AVX1-NEXT: vmulpd %ymm6, %ymm10, %ymm11 4471; AVX1-NEXT: vaddpd %ymm1, %ymm11, %ymm1 4472; AVX1-NEXT: vmulpd %ymm0, %ymm9, %ymm0 4473; AVX1-NEXT: vmulpd %ymm5, %ymm10, %ymm10 4474; AVX1-NEXT: vaddpd %ymm0, %ymm10, %ymm0 4475; AVX1-NEXT: vbroadcastsd 608(%rbp), %ymm10 4476; AVX1-NEXT: vmulpd %ymm10, %ymm12, %ymm11 4477; AVX1-NEXT: vaddpd %ymm0, %ymm11, %ymm0 4478; AVX1-NEXT: vmulpd %ymm10, %ymm15, %ymm10 4479; AVX1-NEXT: vaddpd %ymm1, %ymm10, %ymm1 4480; AVX1-NEXT: vbroadcastsd 616(%rbp), %ymm10 4481; AVX1-NEXT: vmulpd %ymm10, %ymm8, %ymm11 4482; AVX1-NEXT: vaddpd %ymm1, %ymm11, %ymm1 4483; AVX1-NEXT: vmulpd %ymm7, %ymm10, %ymm10 4484; AVX1-NEXT: vaddpd %ymm0, %ymm10, %ymm0 4485; AVX1-NEXT: vbroadcastsd 624(%rbp), %ymm10 4486; AVX1-NEXT: vmulpd 16(%rbp), %ymm10, %ymm11 4487; AVX1-NEXT: vaddpd %ymm0, %ymm11, %ymm0 4488; AVX1-NEXT: vmulpd %ymm3, %ymm10, %ymm10 4489; AVX1-NEXT: vaddpd %ymm1, %ymm10, %ymm1 4490; AVX1-NEXT: vbroadcastsd 632(%rbp), %ymm10 4491; AVX1-NEXT: vmovapd 112(%rbp), %ymm3 4492; AVX1-NEXT: vmulpd %ymm3, %ymm10, %ymm11 4493; AVX1-NEXT: vaddpd %ymm1, %ymm11, %ymm1 4494; AVX1-NEXT: vmovapd 80(%rbp), %ymm3 4495; AVX1-NEXT: vmulpd %ymm3, %ymm10, %ymm10 4496; AVX1-NEXT: vaddpd %ymm0, %ymm10, %ymm0 4497; AVX1-NEXT: vbroadcastsd 640(%rbp), %ymm10 4498; AVX1-NEXT: vmulpd %ymm4, %ymm10, %ymm11 4499; AVX1-NEXT: vaddpd %ymm0, %ymm11, %ymm0 
4500; AVX1-NEXT: vmovapd 176(%rbp), %ymm3 4501; AVX1-NEXT: vmulpd %ymm3, %ymm10, %ymm10 4502; AVX1-NEXT: vaddpd %ymm1, %ymm10, %ymm1 4503; AVX1-NEXT: vbroadcastsd 648(%rbp), %ymm10 4504; AVX1-NEXT: vmovapd %ymm14, %ymm4 4505; AVX1-NEXT: vmulpd %ymm10, %ymm14, %ymm11 4506; AVX1-NEXT: vaddpd %ymm1, %ymm11, %ymm1 4507; AVX1-NEXT: vmovapd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4508; AVX1-NEXT: vmulpd %ymm2, %ymm10, %ymm1 4509; AVX1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 4510; AVX1-NEXT: vmovapd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4511; AVX1-NEXT: vbroadcastsd 656(%rbp), %ymm2 4512; AVX1-NEXT: vmovapd %ymm13, %ymm3 4513; AVX1-NEXT: vmulpd %ymm2, %ymm13, %ymm1 4514; AVX1-NEXT: vbroadcastsd 664(%rbp), %ymm0 4515; AVX1-NEXT: vmulpd %ymm0, %ymm6, %ymm14 4516; AVX1-NEXT: vmovapd %ymm6, %ymm10 4517; AVX1-NEXT: vaddpd %ymm1, %ymm14, %ymm1 4518; AVX1-NEXT: vmulpd %ymm2, %ymm9, %ymm2 4519; AVX1-NEXT: vmulpd %ymm0, %ymm5, %ymm0 4520; AVX1-NEXT: vmovapd %ymm5, %ymm6 4521; AVX1-NEXT: vaddpd %ymm0, %ymm2, %ymm0 4522; AVX1-NEXT: vbroadcastsd 672(%rbp), %ymm2 4523; AVX1-NEXT: vmulpd %ymm2, %ymm12, %ymm14 4524; AVX1-NEXT: vaddpd %ymm0, %ymm14, %ymm0 4525; AVX1-NEXT: vmulpd %ymm2, %ymm15, %ymm2 4526; AVX1-NEXT: vaddpd %ymm2, %ymm1, %ymm1 4527; AVX1-NEXT: vbroadcastsd 680(%rbp), %ymm2 4528; AVX1-NEXT: vmulpd %ymm2, %ymm8, %ymm14 4529; AVX1-NEXT: vaddpd %ymm1, %ymm14, %ymm1 4530; AVX1-NEXT: vmulpd %ymm2, %ymm7, %ymm2 4531; AVX1-NEXT: vaddpd %ymm2, %ymm0, %ymm0 4532; AVX1-NEXT: vbroadcastsd 688(%rbp), %ymm2 4533; AVX1-NEXT: vmovapd 16(%rbp), %ymm11 4534; AVX1-NEXT: vmulpd %ymm2, %ymm11, %ymm14 4535; AVX1-NEXT: vaddpd %ymm0, %ymm14, %ymm0 4536; AVX1-NEXT: vmulpd 48(%rbp), %ymm2, %ymm2 4537; AVX1-NEXT: vaddpd %ymm2, %ymm1, %ymm1 4538; AVX1-NEXT: vbroadcastsd 696(%rbp), %ymm2 4539; AVX1-NEXT: vmovapd 112(%rbp), %ymm5 4540; AVX1-NEXT: vmulpd %ymm2, %ymm5, %ymm14 4541; AVX1-NEXT: vaddpd %ymm1, %ymm14, %ymm1 4542; AVX1-NEXT: vmovapd 80(%rbp), %ymm5 4543; AVX1-NEXT: vmulpd %ymm2, %ymm5, %ymm2 4544; AVX1-NEXT: vaddpd %ymm2, %ymm0, %ymm0 4545; AVX1-NEXT: vbroadcastsd 704(%rbp), %ymm2 4546; AVX1-NEXT: vmulpd 144(%rbp), %ymm2, %ymm14 4547; AVX1-NEXT: vaddpd %ymm0, %ymm14, %ymm0 4548; AVX1-NEXT: vmovapd 176(%rbp), %ymm13 4549; AVX1-NEXT: vmulpd %ymm2, %ymm13, %ymm2 4550; AVX1-NEXT: vaddpd %ymm2, %ymm1, %ymm1 4551; AVX1-NEXT: vbroadcastsd 712(%rbp), %ymm2 4552; AVX1-NEXT: vmulpd %ymm2, %ymm4, %ymm14 4553; AVX1-NEXT: vaddpd %ymm1, %ymm14, %ymm1 4554; AVX1-NEXT: vmovapd 208(%rbp), %ymm14 4555; AVX1-NEXT: vmulpd %ymm2, %ymm14, %ymm2 4556; AVX1-NEXT: vaddpd %ymm2, %ymm0, %ymm0 4557; AVX1-NEXT: vbroadcastsd 720(%rbp), %ymm2 4558; AVX1-NEXT: vmulpd %ymm2, %ymm3, %ymm3 4559; AVX1-NEXT: vmulpd %ymm2, %ymm9, %ymm2 4560; AVX1-NEXT: vbroadcastsd 728(%rbp), %ymm4 4561; AVX1-NEXT: vmulpd %ymm4, %ymm10, %ymm5 4562; AVX1-NEXT: vaddpd %ymm5, %ymm3, %ymm3 4563; AVX1-NEXT: vmulpd %ymm4, %ymm6, %ymm4 4564; AVX1-NEXT: vaddpd %ymm4, %ymm2, %ymm2 4565; AVX1-NEXT: vbroadcastsd 736(%rbp), %ymm4 4566; AVX1-NEXT: vmulpd %ymm4, %ymm12, %ymm5 4567; AVX1-NEXT: vaddpd %ymm5, %ymm2, %ymm2 4568; AVX1-NEXT: vmulpd %ymm4, %ymm15, %ymm4 4569; AVX1-NEXT: vaddpd %ymm4, %ymm3, %ymm3 4570; AVX1-NEXT: vbroadcastsd 744(%rbp), %ymm4 4571; AVX1-NEXT: vmulpd %ymm4, %ymm8, %ymm5 4572; AVX1-NEXT: vaddpd %ymm5, %ymm3, %ymm3 4573; AVX1-NEXT: vmulpd %ymm4, %ymm7, %ymm4 4574; AVX1-NEXT: vaddpd %ymm4, %ymm2, %ymm2 4575; AVX1-NEXT: vbroadcastsd 752(%rbp), %ymm4 4576; AVX1-NEXT: vmulpd %ymm4, %ymm11, %ymm5 4577; AVX1-NEXT: vaddpd %ymm5, %ymm2, %ymm2 4578; 
AVX1-NEXT: vmulpd 48(%rbp), %ymm4, %ymm4 4579; AVX1-NEXT: vaddpd %ymm4, %ymm3, %ymm3 4580; AVX1-NEXT: vbroadcastsd 760(%rbp), %ymm4 4581; AVX1-NEXT: vmulpd 112(%rbp), %ymm4, %ymm5 4582; AVX1-NEXT: vaddpd %ymm5, %ymm3, %ymm3 4583; AVX1-NEXT: vmulpd 80(%rbp), %ymm4, %ymm4 4584; AVX1-NEXT: vaddpd %ymm4, %ymm2, %ymm2 4585; AVX1-NEXT: vbroadcastsd 768(%rbp), %ymm4 4586; AVX1-NEXT: vmulpd 144(%rbp), %ymm4, %ymm5 4587; AVX1-NEXT: vaddpd %ymm5, %ymm2, %ymm2 4588; AVX1-NEXT: vmulpd %ymm4, %ymm13, %ymm4 4589; AVX1-NEXT: vaddpd %ymm4, %ymm3, %ymm3 4590; AVX1-NEXT: vbroadcastsd 776(%rbp), %ymm4 4591; AVX1-NEXT: vmulpd 240(%rbp), %ymm4, %ymm5 4592; AVX1-NEXT: vaddpd %ymm5, %ymm3, %ymm3 4593; AVX1-NEXT: vmulpd %ymm4, %ymm14, %ymm4 4594; AVX1-NEXT: vaddpd %ymm4, %ymm2, %ymm2 4595; AVX1-NEXT: vmovapd %ymm3, 480(%rdi) 4596; AVX1-NEXT: vmovapd %ymm2, 448(%rdi) 4597; AVX1-NEXT: vmovapd %ymm1, 416(%rdi) 4598; AVX1-NEXT: vmovapd %ymm0, 384(%rdi) 4599; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 4600; AVX1-NEXT: vmovaps %ymm0, 352(%rdi) 4601; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 4602; AVX1-NEXT: vmovaps %ymm0, 320(%rdi) 4603; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 4604; AVX1-NEXT: vmovaps %ymm0, 288(%rdi) 4605; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 4606; AVX1-NEXT: vmovaps %ymm0, 256(%rdi) 4607; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 4608; AVX1-NEXT: vmovaps %ymm0, 224(%rdi) 4609; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 4610; AVX1-NEXT: vmovaps %ymm0, 192(%rdi) 4611; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 4612; AVX1-NEXT: vmovaps %ymm0, 160(%rdi) 4613; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 4614; AVX1-NEXT: vmovaps %ymm0, 128(%rdi) 4615; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 4616; AVX1-NEXT: vmovaps %ymm0, 96(%rdi) 4617; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 4618; AVX1-NEXT: vmovaps %ymm0, 64(%rdi) 4619; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 4620; AVX1-NEXT: vmovaps %ymm0, 32(%rdi) 4621; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 4622; AVX1-NEXT: vmovaps %ymm0, (%rdi) 4623; AVX1-NEXT: movq %rbp, %rsp 4624; AVX1-NEXT: popq %rbp 4625; AVX1-NEXT: vzeroupper 4626; AVX1-NEXT: retq 4627; 4628; AVX2-LABEL: test_mul8x8_f64: 4629; AVX2: # %bb.0: # %entry 4630; AVX2-NEXT: pushq %rbp 4631; AVX2-NEXT: movq %rsp, %rbp 4632; AVX2-NEXT: andq $-32, %rsp 4633; AVX2-NEXT: subq $448, %rsp # imm = 0x1C0 4634; AVX2-NEXT: vmovapd %ymm2, %ymm12 4635; AVX2-NEXT: vmovapd %ymm0, (%rsp) # 32-byte Spill 4636; AVX2-NEXT: movq %rdi, %rax 4637; AVX2-NEXT: vmovapd 144(%rbp), %ymm13 4638; AVX2-NEXT: vmovapd 112(%rbp), %ymm14 4639; AVX2-NEXT: vbroadcastsd 272(%rbp), %ymm10 4640; AVX2-NEXT: vmulpd %ymm1, %ymm10, %ymm8 4641; AVX2-NEXT: vmovapd %ymm1, %ymm9 4642; AVX2-NEXT: vmulpd %ymm0, %ymm10, %ymm0 4643; AVX2-NEXT: vbroadcastsd 280(%rbp), %ymm10 4644; AVX2-NEXT: vmulpd %ymm3, %ymm10, %ymm11 4645; AVX2-NEXT: vaddpd %ymm11, %ymm8, %ymm1 4646; AVX2-NEXT: vmulpd %ymm2, %ymm10, %ymm10 4647; AVX2-NEXT: vaddpd %ymm0, %ymm10, %ymm0 4648; AVX2-NEXT: vbroadcastsd 288(%rbp), %ymm10 4649; AVX2-NEXT: vmulpd %ymm4, %ymm10, %ymm11 4650; AVX2-NEXT: vaddpd %ymm0, %ymm11, %ymm0 4651; AVX2-NEXT: vmulpd %ymm5, %ymm10, %ymm10 4652; AVX2-NEXT: vaddpd %ymm1, %ymm10, %ymm1 4653; AVX2-NEXT: vbroadcastsd 296(%rbp), %ymm10 4654; 
AVX2-NEXT: vmulpd %ymm7, %ymm10, %ymm11 4655; AVX2-NEXT: vaddpd %ymm1, %ymm11, %ymm1 4656; AVX2-NEXT: vmulpd %ymm6, %ymm10, %ymm10 4657; AVX2-NEXT: vaddpd %ymm0, %ymm10, %ymm0 4658; AVX2-NEXT: vbroadcastsd 304(%rbp), %ymm10 4659; AVX2-NEXT: vmulpd 16(%rbp), %ymm10, %ymm11 4660; AVX2-NEXT: vaddpd %ymm0, %ymm11, %ymm0 4661; AVX2-NEXT: vmulpd 48(%rbp), %ymm10, %ymm10 4662; AVX2-NEXT: vaddpd %ymm1, %ymm10, %ymm1 4663; AVX2-NEXT: vbroadcastsd 312(%rbp), %ymm10 4664; AVX2-NEXT: vmulpd %ymm10, %ymm14, %ymm11 4665; AVX2-NEXT: vaddpd %ymm1, %ymm11, %ymm1 4666; AVX2-NEXT: vmulpd 80(%rbp), %ymm10, %ymm10 4667; AVX2-NEXT: vaddpd %ymm0, %ymm10, %ymm0 4668; AVX2-NEXT: vbroadcastsd 320(%rbp), %ymm10 4669; AVX2-NEXT: vmulpd %ymm10, %ymm13, %ymm11 4670; AVX2-NEXT: vaddpd %ymm0, %ymm11, %ymm0 4671; AVX2-NEXT: vmulpd 176(%rbp), %ymm10, %ymm10 4672; AVX2-NEXT: vaddpd %ymm1, %ymm10, %ymm1 4673; AVX2-NEXT: vbroadcastsd 328(%rbp), %ymm10 4674; AVX2-NEXT: vmulpd 240(%rbp), %ymm10, %ymm11 4675; AVX2-NEXT: vaddpd %ymm1, %ymm11, %ymm1 4676; AVX2-NEXT: vmovapd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4677; AVX2-NEXT: vmulpd 208(%rbp), %ymm10, %ymm1 4678; AVX2-NEXT: vaddpd %ymm1, %ymm0, %ymm0 4679; AVX2-NEXT: vmovapd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4680; AVX2-NEXT: vbroadcastsd 336(%rbp), %ymm0 4681; AVX2-NEXT: vmulpd %ymm0, %ymm9, %ymm1 4682; AVX2-NEXT: vbroadcastsd 344(%rbp), %ymm10 4683; AVX2-NEXT: vmulpd %ymm3, %ymm10, %ymm11 4684; AVX2-NEXT: vmovapd %ymm3, %ymm8 4685; AVX2-NEXT: vaddpd %ymm1, %ymm11, %ymm1 4686; AVX2-NEXT: vmovapd (%rsp), %ymm15 # 32-byte Reload 4687; AVX2-NEXT: vmulpd %ymm0, %ymm15, %ymm0 4688; AVX2-NEXT: vmulpd %ymm2, %ymm10, %ymm10 4689; AVX2-NEXT: vaddpd %ymm0, %ymm10, %ymm0 4690; AVX2-NEXT: vbroadcastsd 352(%rbp), %ymm10 4691; AVX2-NEXT: vmulpd %ymm4, %ymm10, %ymm11 4692; AVX2-NEXT: vaddpd %ymm0, %ymm11, %ymm0 4693; AVX2-NEXT: vmulpd %ymm5, %ymm10, %ymm10 4694; AVX2-NEXT: vmovapd %ymm5, %ymm3 4695; AVX2-NEXT: vaddpd %ymm1, %ymm10, %ymm1 4696; AVX2-NEXT: vbroadcastsd 360(%rbp), %ymm10 4697; AVX2-NEXT: vmulpd %ymm7, %ymm10, %ymm11 4698; AVX2-NEXT: vaddpd %ymm1, %ymm11, %ymm1 4699; AVX2-NEXT: vmulpd %ymm6, %ymm10, %ymm10 4700; AVX2-NEXT: vaddpd %ymm0, %ymm10, %ymm0 4701; AVX2-NEXT: vbroadcastsd 368(%rbp), %ymm10 4702; AVX2-NEXT: vmovapd 16(%rbp), %ymm2 4703; AVX2-NEXT: vmulpd %ymm2, %ymm10, %ymm11 4704; AVX2-NEXT: vaddpd %ymm0, %ymm11, %ymm0 4705; AVX2-NEXT: vmulpd 48(%rbp), %ymm10, %ymm10 4706; AVX2-NEXT: vaddpd %ymm1, %ymm10, %ymm1 4707; AVX2-NEXT: vbroadcastsd 376(%rbp), %ymm10 4708; AVX2-NEXT: vmulpd %ymm10, %ymm14, %ymm11 4709; AVX2-NEXT: vaddpd %ymm1, %ymm11, %ymm1 4710; AVX2-NEXT: vmovapd 80(%rbp), %ymm2 4711; AVX2-NEXT: vmulpd %ymm2, %ymm10, %ymm10 4712; AVX2-NEXT: vaddpd %ymm0, %ymm10, %ymm0 4713; AVX2-NEXT: vbroadcastsd 384(%rbp), %ymm10 4714; AVX2-NEXT: vmulpd %ymm10, %ymm13, %ymm11 4715; AVX2-NEXT: vaddpd %ymm0, %ymm11, %ymm0 4716; AVX2-NEXT: vmovapd 176(%rbp), %ymm14 4717; AVX2-NEXT: vmulpd %ymm10, %ymm14, %ymm10 4718; AVX2-NEXT: vaddpd %ymm1, %ymm10, %ymm1 4719; AVX2-NEXT: vbroadcastsd 392(%rbp), %ymm10 4720; AVX2-NEXT: vmovapd 240(%rbp), %ymm2 4721; AVX2-NEXT: vmulpd %ymm2, %ymm10, %ymm11 4722; AVX2-NEXT: vaddpd %ymm1, %ymm11, %ymm1 4723; AVX2-NEXT: vmovapd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4724; AVX2-NEXT: vmulpd 208(%rbp), %ymm10, %ymm1 4725; AVX2-NEXT: vaddpd %ymm1, %ymm0, %ymm0 4726; AVX2-NEXT: vmovapd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4727; AVX2-NEXT: vbroadcastsd 400(%rbp), %ymm0 4728; AVX2-NEXT: vmulpd %ymm0, %ymm9, %ymm1 
4729; AVX2-NEXT: vbroadcastsd 408(%rbp), %ymm10 4730; AVX2-NEXT: vmovapd %ymm8, %ymm5 4731; AVX2-NEXT: vmulpd %ymm10, %ymm8, %ymm11 4732; AVX2-NEXT: vaddpd %ymm1, %ymm11, %ymm1 4733; AVX2-NEXT: vmulpd %ymm0, %ymm15, %ymm0 4734; AVX2-NEXT: vmulpd %ymm10, %ymm12, %ymm10 4735; AVX2-NEXT: vaddpd %ymm0, %ymm10, %ymm0 4736; AVX2-NEXT: vbroadcastsd 416(%rbp), %ymm10 4737; AVX2-NEXT: vmulpd %ymm4, %ymm10, %ymm11 4738; AVX2-NEXT: vaddpd %ymm0, %ymm11, %ymm0 4739; AVX2-NEXT: vmulpd %ymm3, %ymm10, %ymm10 4740; AVX2-NEXT: vmovapd %ymm3, %ymm2 4741; AVX2-NEXT: vaddpd %ymm1, %ymm10, %ymm1 4742; AVX2-NEXT: vbroadcastsd 424(%rbp), %ymm10 4743; AVX2-NEXT: vmulpd %ymm7, %ymm10, %ymm11 4744; AVX2-NEXT: vaddpd %ymm1, %ymm11, %ymm1 4745; AVX2-NEXT: vmulpd %ymm6, %ymm10, %ymm10 4746; AVX2-NEXT: vaddpd %ymm0, %ymm10, %ymm0 4747; AVX2-NEXT: vbroadcastsd 432(%rbp), %ymm10 4748; AVX2-NEXT: vmulpd 16(%rbp), %ymm10, %ymm11 4749; AVX2-NEXT: vaddpd %ymm0, %ymm11, %ymm0 4750; AVX2-NEXT: vmulpd 48(%rbp), %ymm10, %ymm10 4751; AVX2-NEXT: vaddpd %ymm1, %ymm10, %ymm1 4752; AVX2-NEXT: vbroadcastsd 440(%rbp), %ymm10 4753; AVX2-NEXT: vmulpd 112(%rbp), %ymm10, %ymm11 4754; AVX2-NEXT: vaddpd %ymm1, %ymm11, %ymm1 4755; AVX2-NEXT: vmulpd 80(%rbp), %ymm10, %ymm10 4756; AVX2-NEXT: vaddpd %ymm0, %ymm10, %ymm0 4757; AVX2-NEXT: vbroadcastsd 448(%rbp), %ymm10 4758; AVX2-NEXT: vmulpd %ymm10, %ymm13, %ymm11 4759; AVX2-NEXT: vaddpd %ymm0, %ymm11, %ymm0 4760; AVX2-NEXT: vmulpd %ymm10, %ymm14, %ymm10 4761; AVX2-NEXT: vaddpd %ymm1, %ymm10, %ymm1 4762; AVX2-NEXT: vbroadcastsd 456(%rbp), %ymm10 4763; AVX2-NEXT: vmulpd 240(%rbp), %ymm10, %ymm11 4764; AVX2-NEXT: vaddpd %ymm1, %ymm11, %ymm1 4765; AVX2-NEXT: vmovapd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4766; AVX2-NEXT: vmulpd 208(%rbp), %ymm10, %ymm1 4767; AVX2-NEXT: vaddpd %ymm1, %ymm0, %ymm0 4768; AVX2-NEXT: vmovapd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4769; AVX2-NEXT: vbroadcastsd 464(%rbp), %ymm0 4770; AVX2-NEXT: vmulpd %ymm0, %ymm9, %ymm1 4771; AVX2-NEXT: vmovapd %ymm9, %ymm13 4772; AVX2-NEXT: vbroadcastsd 472(%rbp), %ymm10 4773; AVX2-NEXT: vmulpd %ymm10, %ymm8, %ymm11 4774; AVX2-NEXT: vaddpd %ymm1, %ymm11, %ymm1 4775; AVX2-NEXT: vmulpd %ymm0, %ymm15, %ymm0 4776; AVX2-NEXT: vmovapd %ymm15, %ymm9 4777; AVX2-NEXT: vmulpd %ymm10, %ymm12, %ymm10 4778; AVX2-NEXT: vaddpd %ymm0, %ymm10, %ymm0 4779; AVX2-NEXT: vbroadcastsd 480(%rbp), %ymm10 4780; AVX2-NEXT: vmulpd %ymm4, %ymm10, %ymm11 4781; AVX2-NEXT: vmovapd %ymm4, %ymm3 4782; AVX2-NEXT: vaddpd %ymm0, %ymm11, %ymm0 4783; AVX2-NEXT: vmovapd %ymm2, %ymm15 4784; AVX2-NEXT: vmulpd %ymm2, %ymm10, %ymm10 4785; AVX2-NEXT: vaddpd %ymm1, %ymm10, %ymm1 4786; AVX2-NEXT: vbroadcastsd 488(%rbp), %ymm10 4787; AVX2-NEXT: vmovapd %ymm7, %ymm8 4788; AVX2-NEXT: vmulpd %ymm7, %ymm10, %ymm11 4789; AVX2-NEXT: vaddpd %ymm1, %ymm11, %ymm1 4790; AVX2-NEXT: vmovapd %ymm6, %ymm7 4791; AVX2-NEXT: vmulpd %ymm6, %ymm10, %ymm10 4792; AVX2-NEXT: vaddpd %ymm0, %ymm10, %ymm0 4793; AVX2-NEXT: vbroadcastsd 496(%rbp), %ymm10 4794; AVX2-NEXT: vmulpd 16(%rbp), %ymm10, %ymm11 4795; AVX2-NEXT: vaddpd %ymm0, %ymm11, %ymm0 4796; AVX2-NEXT: vmovapd 48(%rbp), %ymm4 4797; AVX2-NEXT: vmulpd %ymm4, %ymm10, %ymm10 4798; AVX2-NEXT: vaddpd %ymm1, %ymm10, %ymm1 4799; AVX2-NEXT: vbroadcastsd 504(%rbp), %ymm10 4800; AVX2-NEXT: vmovapd 112(%rbp), %ymm2 4801; AVX2-NEXT: vmulpd %ymm2, %ymm10, %ymm11 4802; AVX2-NEXT: vaddpd %ymm1, %ymm11, %ymm1 4803; AVX2-NEXT: vmovapd 80(%rbp), %ymm14 4804; AVX2-NEXT: vmulpd %ymm10, %ymm14, %ymm10 4805; AVX2-NEXT: vaddpd %ymm0, %ymm10, %ymm0 4806; 
AVX2-NEXT: vbroadcastsd 512(%rbp), %ymm10 4807; AVX2-NEXT: vmulpd 144(%rbp), %ymm10, %ymm11 4808; AVX2-NEXT: vaddpd %ymm0, %ymm11, %ymm0 4809; AVX2-NEXT: vmovapd 176(%rbp), %ymm2 4810; AVX2-NEXT: vmulpd %ymm2, %ymm10, %ymm10 4811; AVX2-NEXT: vaddpd %ymm1, %ymm10, %ymm1 4812; AVX2-NEXT: vbroadcastsd 520(%rbp), %ymm10 4813; AVX2-NEXT: vmulpd 240(%rbp), %ymm10, %ymm11 4814; AVX2-NEXT: vaddpd %ymm1, %ymm11, %ymm1 4815; AVX2-NEXT: vmovapd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4816; AVX2-NEXT: vmulpd 208(%rbp), %ymm10, %ymm1 4817; AVX2-NEXT: vaddpd %ymm1, %ymm0, %ymm0 4818; AVX2-NEXT: vmovapd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4819; AVX2-NEXT: vbroadcastsd 528(%rbp), %ymm0 4820; AVX2-NEXT: vmulpd %ymm0, %ymm13, %ymm1 4821; AVX2-NEXT: vbroadcastsd 536(%rbp), %ymm10 4822; AVX2-NEXT: vmulpd %ymm5, %ymm10, %ymm11 4823; AVX2-NEXT: vmovapd %ymm5, %ymm6 4824; AVX2-NEXT: vaddpd %ymm1, %ymm11, %ymm1 4825; AVX2-NEXT: vmulpd %ymm0, %ymm9, %ymm0 4826; AVX2-NEXT: vmulpd %ymm10, %ymm12, %ymm10 4827; AVX2-NEXT: vmovapd %ymm12, %ymm5 4828; AVX2-NEXT: vaddpd %ymm0, %ymm10, %ymm0 4829; AVX2-NEXT: vbroadcastsd 544(%rbp), %ymm10 4830; AVX2-NEXT: vmulpd %ymm3, %ymm10, %ymm11 4831; AVX2-NEXT: vmovapd %ymm3, %ymm12 4832; AVX2-NEXT: vaddpd %ymm0, %ymm11, %ymm0 4833; AVX2-NEXT: vmulpd %ymm10, %ymm15, %ymm10 4834; AVX2-NEXT: vaddpd %ymm1, %ymm10, %ymm1 4835; AVX2-NEXT: vbroadcastsd 552(%rbp), %ymm10 4836; AVX2-NEXT: vmulpd %ymm10, %ymm8, %ymm11 4837; AVX2-NEXT: vaddpd %ymm1, %ymm11, %ymm1 4838; AVX2-NEXT: vmulpd %ymm7, %ymm10, %ymm10 4839; AVX2-NEXT: vaddpd %ymm0, %ymm10, %ymm0 4840; AVX2-NEXT: vbroadcastsd 560(%rbp), %ymm10 4841; AVX2-NEXT: vmulpd 16(%rbp), %ymm10, %ymm11 4842; AVX2-NEXT: vaddpd %ymm0, %ymm11, %ymm0 4843; AVX2-NEXT: vmulpd %ymm4, %ymm10, %ymm10 4844; AVX2-NEXT: vmovapd %ymm4, %ymm3 4845; AVX2-NEXT: vaddpd %ymm1, %ymm10, %ymm1 4846; AVX2-NEXT: vbroadcastsd 568(%rbp), %ymm10 4847; AVX2-NEXT: vmulpd 112(%rbp), %ymm10, %ymm11 4848; AVX2-NEXT: vaddpd %ymm1, %ymm11, %ymm1 4849; AVX2-NEXT: vmulpd %ymm10, %ymm14, %ymm10 4850; AVX2-NEXT: vaddpd %ymm0, %ymm10, %ymm0 4851; AVX2-NEXT: vbroadcastsd 576(%rbp), %ymm10 4852; AVX2-NEXT: vmovapd 144(%rbp), %ymm4 4853; AVX2-NEXT: vmulpd %ymm4, %ymm10, %ymm11 4854; AVX2-NEXT: vaddpd %ymm0, %ymm11, %ymm0 4855; AVX2-NEXT: vmulpd %ymm2, %ymm10, %ymm10 4856; AVX2-NEXT: vaddpd %ymm1, %ymm10, %ymm1 4857; AVX2-NEXT: vbroadcastsd 584(%rbp), %ymm10 4858; AVX2-NEXT: vmovapd 240(%rbp), %ymm14 4859; AVX2-NEXT: vmulpd %ymm10, %ymm14, %ymm11 4860; AVX2-NEXT: vaddpd %ymm1, %ymm11, %ymm1 4861; AVX2-NEXT: vmovapd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4862; AVX2-NEXT: vmovapd 208(%rbp), %ymm2 4863; AVX2-NEXT: vmulpd %ymm2, %ymm10, %ymm1 4864; AVX2-NEXT: vaddpd %ymm1, %ymm0, %ymm0 4865; AVX2-NEXT: vmovapd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4866; AVX2-NEXT: vbroadcastsd 592(%rbp), %ymm0 4867; AVX2-NEXT: vmulpd %ymm0, %ymm13, %ymm1 4868; AVX2-NEXT: vbroadcastsd 600(%rbp), %ymm10 4869; AVX2-NEXT: vmulpd %ymm6, %ymm10, %ymm11 4870; AVX2-NEXT: vaddpd %ymm1, %ymm11, %ymm1 4871; AVX2-NEXT: vmulpd %ymm0, %ymm9, %ymm0 4872; AVX2-NEXT: vmulpd %ymm5, %ymm10, %ymm10 4873; AVX2-NEXT: vaddpd %ymm0, %ymm10, %ymm0 4874; AVX2-NEXT: vbroadcastsd 608(%rbp), %ymm10 4875; AVX2-NEXT: vmulpd %ymm10, %ymm12, %ymm11 4876; AVX2-NEXT: vaddpd %ymm0, %ymm11, %ymm0 4877; AVX2-NEXT: vmulpd %ymm10, %ymm15, %ymm10 4878; AVX2-NEXT: vaddpd %ymm1, %ymm10, %ymm1 4879; AVX2-NEXT: vbroadcastsd 616(%rbp), %ymm10 4880; AVX2-NEXT: vmulpd %ymm10, %ymm8, %ymm11 4881; AVX2-NEXT: vaddpd 
%ymm1, %ymm11, %ymm1 4882; AVX2-NEXT: vmulpd %ymm7, %ymm10, %ymm10 4883; AVX2-NEXT: vaddpd %ymm0, %ymm10, %ymm0 4884; AVX2-NEXT: vbroadcastsd 624(%rbp), %ymm10 4885; AVX2-NEXT: vmulpd 16(%rbp), %ymm10, %ymm11 4886; AVX2-NEXT: vaddpd %ymm0, %ymm11, %ymm0 4887; AVX2-NEXT: vmulpd %ymm3, %ymm10, %ymm10 4888; AVX2-NEXT: vaddpd %ymm1, %ymm10, %ymm1 4889; AVX2-NEXT: vbroadcastsd 632(%rbp), %ymm10 4890; AVX2-NEXT: vmovapd 112(%rbp), %ymm3 4891; AVX2-NEXT: vmulpd %ymm3, %ymm10, %ymm11 4892; AVX2-NEXT: vaddpd %ymm1, %ymm11, %ymm1 4893; AVX2-NEXT: vmovapd 80(%rbp), %ymm3 4894; AVX2-NEXT: vmulpd %ymm3, %ymm10, %ymm10 4895; AVX2-NEXT: vaddpd %ymm0, %ymm10, %ymm0 4896; AVX2-NEXT: vbroadcastsd 640(%rbp), %ymm10 4897; AVX2-NEXT: vmulpd %ymm4, %ymm10, %ymm11 4898; AVX2-NEXT: vaddpd %ymm0, %ymm11, %ymm0 4899; AVX2-NEXT: vmovapd 176(%rbp), %ymm3 4900; AVX2-NEXT: vmulpd %ymm3, %ymm10, %ymm10 4901; AVX2-NEXT: vaddpd %ymm1, %ymm10, %ymm1 4902; AVX2-NEXT: vbroadcastsd 648(%rbp), %ymm10 4903; AVX2-NEXT: vmovapd %ymm14, %ymm4 4904; AVX2-NEXT: vmulpd %ymm10, %ymm14, %ymm11 4905; AVX2-NEXT: vaddpd %ymm1, %ymm11, %ymm1 4906; AVX2-NEXT: vmovapd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4907; AVX2-NEXT: vmulpd %ymm2, %ymm10, %ymm1 4908; AVX2-NEXT: vaddpd %ymm1, %ymm0, %ymm0 4909; AVX2-NEXT: vmovapd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4910; AVX2-NEXT: vbroadcastsd 656(%rbp), %ymm2 4911; AVX2-NEXT: vmovapd %ymm13, %ymm3 4912; AVX2-NEXT: vmulpd %ymm2, %ymm13, %ymm1 4913; AVX2-NEXT: vbroadcastsd 664(%rbp), %ymm0 4914; AVX2-NEXT: vmulpd %ymm0, %ymm6, %ymm14 4915; AVX2-NEXT: vmovapd %ymm6, %ymm10 4916; AVX2-NEXT: vaddpd %ymm1, %ymm14, %ymm1 4917; AVX2-NEXT: vmulpd %ymm2, %ymm9, %ymm2 4918; AVX2-NEXT: vmulpd %ymm0, %ymm5, %ymm0 4919; AVX2-NEXT: vmovapd %ymm5, %ymm6 4920; AVX2-NEXT: vaddpd %ymm0, %ymm2, %ymm0 4921; AVX2-NEXT: vbroadcastsd 672(%rbp), %ymm2 4922; AVX2-NEXT: vmulpd %ymm2, %ymm12, %ymm14 4923; AVX2-NEXT: vaddpd %ymm0, %ymm14, %ymm0 4924; AVX2-NEXT: vmulpd %ymm2, %ymm15, %ymm2 4925; AVX2-NEXT: vaddpd %ymm2, %ymm1, %ymm1 4926; AVX2-NEXT: vbroadcastsd 680(%rbp), %ymm2 4927; AVX2-NEXT: vmulpd %ymm2, %ymm8, %ymm14 4928; AVX2-NEXT: vaddpd %ymm1, %ymm14, %ymm1 4929; AVX2-NEXT: vmulpd %ymm2, %ymm7, %ymm2 4930; AVX2-NEXT: vaddpd %ymm2, %ymm0, %ymm0 4931; AVX2-NEXT: vbroadcastsd 688(%rbp), %ymm2 4932; AVX2-NEXT: vmovapd 16(%rbp), %ymm11 4933; AVX2-NEXT: vmulpd %ymm2, %ymm11, %ymm14 4934; AVX2-NEXT: vaddpd %ymm0, %ymm14, %ymm0 4935; AVX2-NEXT: vmulpd 48(%rbp), %ymm2, %ymm2 4936; AVX2-NEXT: vaddpd %ymm2, %ymm1, %ymm1 4937; AVX2-NEXT: vbroadcastsd 696(%rbp), %ymm2 4938; AVX2-NEXT: vmovapd 112(%rbp), %ymm5 4939; AVX2-NEXT: vmulpd %ymm2, %ymm5, %ymm14 4940; AVX2-NEXT: vaddpd %ymm1, %ymm14, %ymm1 4941; AVX2-NEXT: vmovapd 80(%rbp), %ymm5 4942; AVX2-NEXT: vmulpd %ymm2, %ymm5, %ymm2 4943; AVX2-NEXT: vaddpd %ymm2, %ymm0, %ymm0 4944; AVX2-NEXT: vbroadcastsd 704(%rbp), %ymm2 4945; AVX2-NEXT: vmulpd 144(%rbp), %ymm2, %ymm14 4946; AVX2-NEXT: vaddpd %ymm0, %ymm14, %ymm0 4947; AVX2-NEXT: vmovapd 176(%rbp), %ymm13 4948; AVX2-NEXT: vmulpd %ymm2, %ymm13, %ymm2 4949; AVX2-NEXT: vaddpd %ymm2, %ymm1, %ymm1 4950; AVX2-NEXT: vbroadcastsd 712(%rbp), %ymm2 4951; AVX2-NEXT: vmulpd %ymm2, %ymm4, %ymm14 4952; AVX2-NEXT: vaddpd %ymm1, %ymm14, %ymm1 4953; AVX2-NEXT: vmovapd 208(%rbp), %ymm14 4954; AVX2-NEXT: vmulpd %ymm2, %ymm14, %ymm2 4955; AVX2-NEXT: vaddpd %ymm2, %ymm0, %ymm0 4956; AVX2-NEXT: vbroadcastsd 720(%rbp), %ymm2 4957; AVX2-NEXT: vmulpd %ymm2, %ymm3, %ymm3 4958; AVX2-NEXT: vmulpd %ymm2, %ymm9, %ymm2 4959; AVX2-NEXT: 
vbroadcastsd 728(%rbp), %ymm4 4960; AVX2-NEXT: vmulpd %ymm4, %ymm10, %ymm5 4961; AVX2-NEXT: vaddpd %ymm5, %ymm3, %ymm3 4962; AVX2-NEXT: vmulpd %ymm4, %ymm6, %ymm4 4963; AVX2-NEXT: vaddpd %ymm4, %ymm2, %ymm2 4964; AVX2-NEXT: vbroadcastsd 736(%rbp), %ymm4 4965; AVX2-NEXT: vmulpd %ymm4, %ymm12, %ymm5 4966; AVX2-NEXT: vaddpd %ymm5, %ymm2, %ymm2 4967; AVX2-NEXT: vmulpd %ymm4, %ymm15, %ymm4 4968; AVX2-NEXT: vaddpd %ymm4, %ymm3, %ymm3 4969; AVX2-NEXT: vbroadcastsd 744(%rbp), %ymm4 4970; AVX2-NEXT: vmulpd %ymm4, %ymm8, %ymm5 4971; AVX2-NEXT: vaddpd %ymm5, %ymm3, %ymm3 4972; AVX2-NEXT: vmulpd %ymm4, %ymm7, %ymm4 4973; AVX2-NEXT: vaddpd %ymm4, %ymm2, %ymm2 4974; AVX2-NEXT: vbroadcastsd 752(%rbp), %ymm4 4975; AVX2-NEXT: vmulpd %ymm4, %ymm11, %ymm5 4976; AVX2-NEXT: vaddpd %ymm5, %ymm2, %ymm2 4977; AVX2-NEXT: vmulpd 48(%rbp), %ymm4, %ymm4 4978; AVX2-NEXT: vaddpd %ymm4, %ymm3, %ymm3 4979; AVX2-NEXT: vbroadcastsd 760(%rbp), %ymm4 4980; AVX2-NEXT: vmulpd 112(%rbp), %ymm4, %ymm5 4981; AVX2-NEXT: vaddpd %ymm5, %ymm3, %ymm3 4982; AVX2-NEXT: vmulpd 80(%rbp), %ymm4, %ymm4 4983; AVX2-NEXT: vaddpd %ymm4, %ymm2, %ymm2 4984; AVX2-NEXT: vbroadcastsd 768(%rbp), %ymm4 4985; AVX2-NEXT: vmulpd 144(%rbp), %ymm4, %ymm5 4986; AVX2-NEXT: vaddpd %ymm5, %ymm2, %ymm2 4987; AVX2-NEXT: vmulpd %ymm4, %ymm13, %ymm4 4988; AVX2-NEXT: vaddpd %ymm4, %ymm3, %ymm3 4989; AVX2-NEXT: vbroadcastsd 776(%rbp), %ymm4 4990; AVX2-NEXT: vmulpd 240(%rbp), %ymm4, %ymm5 4991; AVX2-NEXT: vaddpd %ymm5, %ymm3, %ymm3 4992; AVX2-NEXT: vmulpd %ymm4, %ymm14, %ymm4 4993; AVX2-NEXT: vaddpd %ymm4, %ymm2, %ymm2 4994; AVX2-NEXT: vmovapd %ymm3, 480(%rdi) 4995; AVX2-NEXT: vmovapd %ymm2, 448(%rdi) 4996; AVX2-NEXT: vmovapd %ymm1, 416(%rdi) 4997; AVX2-NEXT: vmovapd %ymm0, 384(%rdi) 4998; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 4999; AVX2-NEXT: vmovaps %ymm0, 352(%rdi) 5000; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 5001; AVX2-NEXT: vmovaps %ymm0, 320(%rdi) 5002; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 5003; AVX2-NEXT: vmovaps %ymm0, 288(%rdi) 5004; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 5005; AVX2-NEXT: vmovaps %ymm0, 256(%rdi) 5006; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 5007; AVX2-NEXT: vmovaps %ymm0, 224(%rdi) 5008; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 5009; AVX2-NEXT: vmovaps %ymm0, 192(%rdi) 5010; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 5011; AVX2-NEXT: vmovaps %ymm0, 160(%rdi) 5012; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 5013; AVX2-NEXT: vmovaps %ymm0, 128(%rdi) 5014; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 5015; AVX2-NEXT: vmovaps %ymm0, 96(%rdi) 5016; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 5017; AVX2-NEXT: vmovaps %ymm0, 64(%rdi) 5018; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 5019; AVX2-NEXT: vmovaps %ymm0, 32(%rdi) 5020; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 5021; AVX2-NEXT: vmovaps %ymm0, (%rdi) 5022; AVX2-NEXT: movq %rbp, %rsp 5023; AVX2-NEXT: popq %rbp 5024; AVX2-NEXT: vzeroupper 5025; AVX2-NEXT: retq 5026; 5027; AVX512-LABEL: test_mul8x8_f64: 5028; AVX512: # %bb.0: # %entry 5029; AVX512-NEXT: pushq %rbp 5030; AVX512-NEXT: movq %rsp, %rbp 5031; AVX512-NEXT: andq $-64, %rsp 5032; AVX512-NEXT: subq $64, %rsp 5033; AVX512-NEXT: movq %rdi, %rax 5034; AVX512-NEXT: vmulpd 16(%rbp){1to8}, %zmm0, %zmm8 5035; AVX512-NEXT: 
vmulpd 24(%rbp){1to8}, %zmm1, %zmm9 5036; AVX512-NEXT: vaddpd %zmm9, %zmm8, %zmm8 5037; AVX512-NEXT: vmulpd 32(%rbp){1to8}, %zmm2, %zmm9 5038; AVX512-NEXT: vaddpd %zmm9, %zmm8, %zmm8 5039; AVX512-NEXT: vmulpd 40(%rbp){1to8}, %zmm3, %zmm9 5040; AVX512-NEXT: vaddpd %zmm9, %zmm8, %zmm8 5041; AVX512-NEXT: vmulpd 48(%rbp){1to8}, %zmm4, %zmm9 5042; AVX512-NEXT: vaddpd %zmm9, %zmm8, %zmm8 5043; AVX512-NEXT: vmulpd 56(%rbp){1to8}, %zmm5, %zmm9 5044; AVX512-NEXT: vaddpd %zmm9, %zmm8, %zmm8 5045; AVX512-NEXT: vmulpd 64(%rbp){1to8}, %zmm6, %zmm9 5046; AVX512-NEXT: vaddpd %zmm9, %zmm8, %zmm8 5047; AVX512-NEXT: vmulpd 72(%rbp){1to8}, %zmm7, %zmm9 5048; AVX512-NEXT: vaddpd %zmm9, %zmm8, %zmm8 5049; AVX512-NEXT: vmulpd 80(%rbp){1to8}, %zmm0, %zmm9 5050; AVX512-NEXT: vmulpd 88(%rbp){1to8}, %zmm1, %zmm10 5051; AVX512-NEXT: vaddpd %zmm10, %zmm9, %zmm9 5052; AVX512-NEXT: vmulpd 96(%rbp){1to8}, %zmm2, %zmm10 5053; AVX512-NEXT: vaddpd %zmm10, %zmm9, %zmm9 5054; AVX512-NEXT: vmulpd 104(%rbp){1to8}, %zmm3, %zmm10 5055; AVX512-NEXT: vaddpd %zmm10, %zmm9, %zmm9 5056; AVX512-NEXT: vmulpd 112(%rbp){1to8}, %zmm4, %zmm10 5057; AVX512-NEXT: vaddpd %zmm10, %zmm9, %zmm9 5058; AVX512-NEXT: vmulpd 120(%rbp){1to8}, %zmm5, %zmm10 5059; AVX512-NEXT: vaddpd %zmm10, %zmm9, %zmm9 5060; AVX512-NEXT: vmulpd 128(%rbp){1to8}, %zmm6, %zmm10 5061; AVX512-NEXT: vaddpd %zmm10, %zmm9, %zmm9 5062; AVX512-NEXT: vmulpd 136(%rbp){1to8}, %zmm7, %zmm10 5063; AVX512-NEXT: vaddpd %zmm10, %zmm9, %zmm9 5064; AVX512-NEXT: vmulpd 144(%rbp){1to8}, %zmm0, %zmm10 5065; AVX512-NEXT: vmulpd 152(%rbp){1to8}, %zmm1, %zmm11 5066; AVX512-NEXT: vaddpd %zmm11, %zmm10, %zmm10 5067; AVX512-NEXT: vmulpd 160(%rbp){1to8}, %zmm2, %zmm11 5068; AVX512-NEXT: vaddpd %zmm11, %zmm10, %zmm10 5069; AVX512-NEXT: vmulpd 168(%rbp){1to8}, %zmm3, %zmm11 5070; AVX512-NEXT: vaddpd %zmm11, %zmm10, %zmm10 5071; AVX512-NEXT: vmulpd 176(%rbp){1to8}, %zmm4, %zmm11 5072; AVX512-NEXT: vaddpd %zmm11, %zmm10, %zmm10 5073; AVX512-NEXT: vmulpd 184(%rbp){1to8}, %zmm5, %zmm11 5074; AVX512-NEXT: vaddpd %zmm11, %zmm10, %zmm10 5075; AVX512-NEXT: vmulpd 192(%rbp){1to8}, %zmm6, %zmm11 5076; AVX512-NEXT: vaddpd %zmm11, %zmm10, %zmm10 5077; AVX512-NEXT: vmulpd 200(%rbp){1to8}, %zmm7, %zmm11 5078; AVX512-NEXT: vaddpd %zmm11, %zmm10, %zmm10 5079; AVX512-NEXT: vmulpd 208(%rbp){1to8}, %zmm0, %zmm11 5080; AVX512-NEXT: vmulpd 216(%rbp){1to8}, %zmm1, %zmm12 5081; AVX512-NEXT: vaddpd %zmm12, %zmm11, %zmm11 5082; AVX512-NEXT: vmulpd 224(%rbp){1to8}, %zmm2, %zmm12 5083; AVX512-NEXT: vaddpd %zmm12, %zmm11, %zmm11 5084; AVX512-NEXT: vmulpd 232(%rbp){1to8}, %zmm3, %zmm12 5085; AVX512-NEXT: vaddpd %zmm12, %zmm11, %zmm11 5086; AVX512-NEXT: vmulpd 240(%rbp){1to8}, %zmm4, %zmm12 5087; AVX512-NEXT: vaddpd %zmm12, %zmm11, %zmm11 5088; AVX512-NEXT: vmulpd 248(%rbp){1to8}, %zmm5, %zmm12 5089; AVX512-NEXT: vaddpd %zmm12, %zmm11, %zmm11 5090; AVX512-NEXT: vmulpd 256(%rbp){1to8}, %zmm6, %zmm12 5091; AVX512-NEXT: vaddpd %zmm12, %zmm11, %zmm11 5092; AVX512-NEXT: vmulpd 264(%rbp){1to8}, %zmm7, %zmm12 5093; AVX512-NEXT: vaddpd %zmm12, %zmm11, %zmm11 5094; AVX512-NEXT: vmulpd 272(%rbp){1to8}, %zmm0, %zmm12 5095; AVX512-NEXT: vmulpd 280(%rbp){1to8}, %zmm1, %zmm13 5096; AVX512-NEXT: vaddpd %zmm13, %zmm12, %zmm12 5097; AVX512-NEXT: vmulpd 288(%rbp){1to8}, %zmm2, %zmm13 5098; AVX512-NEXT: vaddpd %zmm13, %zmm12, %zmm12 5099; AVX512-NEXT: vmulpd 296(%rbp){1to8}, %zmm3, %zmm13 5100; AVX512-NEXT: vaddpd %zmm13, %zmm12, %zmm12 5101; AVX512-NEXT: vmulpd 304(%rbp){1to8}, %zmm4, %zmm13 5102; AVX512-NEXT: vaddpd %zmm13, %zmm12, %zmm12 5103; 
AVX512-NEXT: vmulpd 312(%rbp){1to8}, %zmm5, %zmm13 5104; AVX512-NEXT: vaddpd %zmm13, %zmm12, %zmm12 5105; AVX512-NEXT: vmulpd 320(%rbp){1to8}, %zmm6, %zmm13 5106; AVX512-NEXT: vaddpd %zmm13, %zmm12, %zmm12 5107; AVX512-NEXT: vmulpd 328(%rbp){1to8}, %zmm7, %zmm13 5108; AVX512-NEXT: vaddpd %zmm13, %zmm12, %zmm12 5109; AVX512-NEXT: vmulpd 336(%rbp){1to8}, %zmm0, %zmm13 5110; AVX512-NEXT: vmulpd 344(%rbp){1to8}, %zmm1, %zmm14 5111; AVX512-NEXT: vaddpd %zmm14, %zmm13, %zmm13 5112; AVX512-NEXT: vmulpd 352(%rbp){1to8}, %zmm2, %zmm14 5113; AVX512-NEXT: vaddpd %zmm14, %zmm13, %zmm13 5114; AVX512-NEXT: vmulpd 360(%rbp){1to8}, %zmm3, %zmm14 5115; AVX512-NEXT: vaddpd %zmm14, %zmm13, %zmm13 5116; AVX512-NEXT: vmulpd 368(%rbp){1to8}, %zmm4, %zmm14 5117; AVX512-NEXT: vaddpd %zmm14, %zmm13, %zmm13 5118; AVX512-NEXT: vmulpd 376(%rbp){1to8}, %zmm5, %zmm14 5119; AVX512-NEXT: vaddpd %zmm14, %zmm13, %zmm13 5120; AVX512-NEXT: vmulpd 384(%rbp){1to8}, %zmm6, %zmm14 5121; AVX512-NEXT: vaddpd %zmm14, %zmm13, %zmm13 5122; AVX512-NEXT: vmulpd 392(%rbp){1to8}, %zmm7, %zmm14 5123; AVX512-NEXT: vaddpd %zmm14, %zmm13, %zmm13 5124; AVX512-NEXT: vmulpd 400(%rbp){1to8}, %zmm0, %zmm14 5125; AVX512-NEXT: vmulpd 408(%rbp){1to8}, %zmm1, %zmm15 5126; AVX512-NEXT: vaddpd %zmm15, %zmm14, %zmm14 5127; AVX512-NEXT: vmulpd 416(%rbp){1to8}, %zmm2, %zmm15 5128; AVX512-NEXT: vaddpd %zmm15, %zmm14, %zmm14 5129; AVX512-NEXT: vmulpd 424(%rbp){1to8}, %zmm3, %zmm15 5130; AVX512-NEXT: vaddpd %zmm15, %zmm14, %zmm14 5131; AVX512-NEXT: vmulpd 432(%rbp){1to8}, %zmm4, %zmm15 5132; AVX512-NEXT: vaddpd %zmm15, %zmm14, %zmm14 5133; AVX512-NEXT: vmulpd 440(%rbp){1to8}, %zmm5, %zmm15 5134; AVX512-NEXT: vaddpd %zmm15, %zmm14, %zmm14 5135; AVX512-NEXT: vmulpd 448(%rbp){1to8}, %zmm6, %zmm15 5136; AVX512-NEXT: vaddpd %zmm15, %zmm14, %zmm14 5137; AVX512-NEXT: vmulpd 456(%rbp){1to8}, %zmm7, %zmm15 5138; AVX512-NEXT: vaddpd %zmm15, %zmm14, %zmm14 5139; AVX512-NEXT: vmulpd 464(%rbp){1to8}, %zmm0, %zmm0 5140; AVX512-NEXT: vmulpd 472(%rbp){1to8}, %zmm1, %zmm1 5141; AVX512-NEXT: vaddpd %zmm1, %zmm0, %zmm0 5142; AVX512-NEXT: vmulpd 480(%rbp){1to8}, %zmm2, %zmm1 5143; AVX512-NEXT: vaddpd %zmm1, %zmm0, %zmm0 5144; AVX512-NEXT: vmulpd 488(%rbp){1to8}, %zmm3, %zmm1 5145; AVX512-NEXT: vaddpd %zmm1, %zmm0, %zmm0 5146; AVX512-NEXT: vmulpd 496(%rbp){1to8}, %zmm4, %zmm1 5147; AVX512-NEXT: vaddpd %zmm1, %zmm0, %zmm0 5148; AVX512-NEXT: vmulpd 504(%rbp){1to8}, %zmm5, %zmm1 5149; AVX512-NEXT: vaddpd %zmm1, %zmm0, %zmm0 5150; AVX512-NEXT: vmulpd 512(%rbp){1to8}, %zmm6, %zmm1 5151; AVX512-NEXT: vaddpd %zmm1, %zmm0, %zmm0 5152; AVX512-NEXT: vmulpd 520(%rbp){1to8}, %zmm7, %zmm1 5153; AVX512-NEXT: vaddpd %zmm1, %zmm0, %zmm0 5154; AVX512-NEXT: vmovapd %zmm0, 448(%rdi) 5155; AVX512-NEXT: vmovapd %zmm14, 384(%rdi) 5156; AVX512-NEXT: vmovapd %zmm13, 320(%rdi) 5157; AVX512-NEXT: vmovapd %zmm12, 256(%rdi) 5158; AVX512-NEXT: vmovapd %zmm11, 192(%rdi) 5159; AVX512-NEXT: vmovapd %zmm10, 128(%rdi) 5160; AVX512-NEXT: vmovapd %zmm9, 64(%rdi) 5161; AVX512-NEXT: vmovapd %zmm8, (%rdi) 5162; AVX512-NEXT: movq %rbp, %rsp 5163; AVX512-NEXT: popq %rbp 5164; AVX512-NEXT: vzeroupper 5165; AVX512-NEXT: retq 5166entry: 5167 %split = shufflevector <64 x double> %a0, <64 x double> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 5168 %split1 = shufflevector <64 x double> %a0, <64 x double> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> 5169 %split2 = shufflevector <64 x double> %a0, <64 x double> poison, <8 x i32> <i32 16, i32 17, i32 18, i32 19, 
i32 20, i32 21, i32 22, i32 23> 5170 %split3 = shufflevector <64 x double> %a0, <64 x double> poison, <8 x i32> <i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> 5171 %split4 = shufflevector <64 x double> %a0, <64 x double> poison, <8 x i32> <i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39> 5172 %split5 = shufflevector <64 x double> %a0, <64 x double> poison, <8 x i32> <i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47> 5173 %split6 = shufflevector <64 x double> %a0, <64 x double> poison, <8 x i32> <i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55> 5174 %split7 = shufflevector <64 x double> %a0, <64 x double> poison, <8 x i32> <i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63> 5175 %splat.splat = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> zeroinitializer 5176 %0 = fmul <8 x double> %split, %splat.splat 5177 %splat.splat18 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> 5178 %1 = fmul <8 x double> %split1, %splat.splat18 5179 %2 = fadd <8 x double> %0, %1 5180 %splat.splat21 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2> 5181 %3 = fmul <8 x double> %split2, %splat.splat21 5182 %4 = fadd <8 x double> %2, %3 5183 %splat.splat24 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3> 5184 %5 = fmul <8 x double> %split3, %splat.splat24 5185 %6 = fadd <8 x double> %4, %5 5186 %splat.splat27 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4> 5187 %7 = fmul <8 x double> %split4, %splat.splat27 5188 %8 = fadd <8 x double> %6, %7 5189 %splat.splat30 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5> 5190 %9 = fmul <8 x double> %split5, %splat.splat30 5191 %10 = fadd <8 x double> %8, %9 5192 %splat.splat33 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> <i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6> 5193 %11 = fmul <8 x double> %split6, %splat.splat33 5194 %12 = fadd <8 x double> %10, %11 5195 %splat.splat36 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7> 5196 %13 = fmul <8 x double> %split7, %splat.splat36 5197 %14 = fadd <8 x double> %12, %13 5198 %splat.splat39 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> <i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8> 5199 %15 = fmul <8 x double> %split, %splat.splat39 5200 %splat.splat42 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> <i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9> 5201 %16 = fmul <8 x double> %split1, %splat.splat42 5202 %17 = fadd <8 x double> %15, %16 5203 %splat.splat45 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> <i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10> 5204 %18 = fmul <8 x double> %split2, %splat.splat45 5205 %19 = fadd <8 x double> %17, %18 5206 %splat.splat48 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> <i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11> 5207 %20 = fmul <8 x double> %split3, %splat.splat48 5208 %21 = fadd <8 x double> %19, %20 5209 %splat.splat51 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> <i32 12, i32 12, i32 12, i32 
12, i32 12, i32 12, i32 12, i32 12> 5210 %22 = fmul <8 x double> %split4, %splat.splat51 5211 %23 = fadd <8 x double> %21, %22 5212 %splat.splat54 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> <i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13> 5213 %24 = fmul <8 x double> %split5, %splat.splat54 5214 %25 = fadd <8 x double> %23, %24 5215 %splat.splat57 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> <i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14> 5216 %26 = fmul <8 x double> %split6, %splat.splat57 5217 %27 = fadd <8 x double> %25, %26 5218 %splat.splat60 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> <i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15> 5219 %28 = fmul <8 x double> %split7, %splat.splat60 5220 %29 = fadd <8 x double> %27, %28 5221 %splat.splat63 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16> 5222 %30 = fmul <8 x double> %split, %splat.splat63 5223 %splat.splat66 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> <i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17> 5224 %31 = fmul <8 x double> %split1, %splat.splat66 5225 %32 = fadd <8 x double> %30, %31 5226 %splat.splat69 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> <i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18> 5227 %33 = fmul <8 x double> %split2, %splat.splat69 5228 %34 = fadd <8 x double> %32, %33 5229 %splat.splat72 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> <i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19> 5230 %35 = fmul <8 x double> %split3, %splat.splat72 5231 %36 = fadd <8 x double> %34, %35 5232 %splat.splat75 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> <i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20> 5233 %37 = fmul <8 x double> %split4, %splat.splat75 5234 %38 = fadd <8 x double> %36, %37 5235 %splat.splat78 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> <i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21> 5236 %39 = fmul <8 x double> %split5, %splat.splat78 5237 %40 = fadd <8 x double> %38, %39 5238 %splat.splat81 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> <i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22> 5239 %41 = fmul <8 x double> %split6, %splat.splat81 5240 %42 = fadd <8 x double> %40, %41 5241 %splat.splat84 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> <i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23> 5242 %43 = fmul <8 x double> %split7, %splat.splat84 5243 %44 = fadd <8 x double> %42, %43 5244 %splat.splat87 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> <i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24> 5245 %45 = fmul <8 x double> %split, %splat.splat87 5246 %splat.splat90 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> <i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25> 5247 %46 = fmul <8 x double> %split1, %splat.splat90 5248 %47 = fadd <8 x double> %45, %46 5249 %splat.splat93 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> <i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26> 5250 %48 = fmul <8 x double> %split2, %splat.splat93 5251 %49 = fadd <8 x double> %47, %48 5252 %splat.splat96 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> <i32 27, i32 27, i32 27, i32 
27, i32 27, i32 27, i32 27, i32 27> 5253 %50 = fmul <8 x double> %split3, %splat.splat96 5254 %51 = fadd <8 x double> %49, %50 5255 %splat.splat99 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> <i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28> 5256 %52 = fmul <8 x double> %split4, %splat.splat99 5257 %53 = fadd <8 x double> %51, %52 5258 %splat.splat102 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> <i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29> 5259 %54 = fmul <8 x double> %split5, %splat.splat102 5260 %55 = fadd <8 x double> %53, %54 5261 %splat.splat105 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> <i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30> 5262 %56 = fmul <8 x double> %split6, %splat.splat105 5263 %57 = fadd <8 x double> %55, %56 5264 %splat.splat108 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> <i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31> 5265 %58 = fmul <8 x double> %split7, %splat.splat108 5266 %59 = fadd <8 x double> %57, %58 5267 %splat.splat111 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> <i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32> 5268 %60 = fmul <8 x double> %split, %splat.splat111 5269 %splat.splat114 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> <i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33> 5270 %61 = fmul <8 x double> %split1, %splat.splat114 5271 %62 = fadd <8 x double> %60, %61 5272 %splat.splat117 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> <i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34> 5273 %63 = fmul <8 x double> %split2, %splat.splat117 5274 %64 = fadd <8 x double> %62, %63 5275 %splat.splat120 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> <i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35> 5276 %65 = fmul <8 x double> %split3, %splat.splat120 5277 %66 = fadd <8 x double> %64, %65 5278 %splat.splat123 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> <i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36> 5279 %67 = fmul <8 x double> %split4, %splat.splat123 5280 %68 = fadd <8 x double> %66, %67 5281 %splat.splat126 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> <i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37> 5282 %69 = fmul <8 x double> %split5, %splat.splat126 5283 %70 = fadd <8 x double> %68, %69 5284 %splat.splat129 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> <i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38> 5285 %71 = fmul <8 x double> %split6, %splat.splat129 5286 %72 = fadd <8 x double> %70, %71 5287 %splat.splat132 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> <i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39> 5288 %73 = fmul <8 x double> %split7, %splat.splat132 5289 %74 = fadd <8 x double> %72, %73 5290 %splat.splat135 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> <i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40> 5291 %75 = fmul <8 x double> %split, %splat.splat135 5292 %splat.splat138 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> <i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41> 5293 %76 = fmul <8 x double> %split1, %splat.splat138 5294 %77 = fadd <8 x double> %75, %76 5295 %splat.splat141 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> <i32 
42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42> 5296 %78 = fmul <8 x double> %split2, %splat.splat141 5297 %79 = fadd <8 x double> %77, %78 5298 %splat.splat144 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> <i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43> 5299 %80 = fmul <8 x double> %split3, %splat.splat144 5300 %81 = fadd <8 x double> %79, %80 5301 %splat.splat147 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> <i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44> 5302 %82 = fmul <8 x double> %split4, %splat.splat147 5303 %83 = fadd <8 x double> %81, %82 5304 %splat.splat150 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> <i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45> 5305 %84 = fmul <8 x double> %split5, %splat.splat150 5306 %85 = fadd <8 x double> %83, %84 5307 %splat.splat153 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> <i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46> 5308 %86 = fmul <8 x double> %split6, %splat.splat153 5309 %87 = fadd <8 x double> %85, %86 5310 %splat.splat156 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> <i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47> 5311 %88 = fmul <8 x double> %split7, %splat.splat156 5312 %89 = fadd <8 x double> %87, %88 5313 %splat.splat159 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> <i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48> 5314 %90 = fmul <8 x double> %split, %splat.splat159 5315 %splat.splat162 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> <i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49> 5316 %91 = fmul <8 x double> %split1, %splat.splat162 5317 %92 = fadd <8 x double> %90, %91 5318 %splat.splat165 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> <i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50> 5319 %93 = fmul <8 x double> %split2, %splat.splat165 5320 %94 = fadd <8 x double> %92, %93 5321 %splat.splat168 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> <i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51> 5322 %95 = fmul <8 x double> %split3, %splat.splat168 5323 %96 = fadd <8 x double> %94, %95 5324 %splat.splat171 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> <i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52> 5325 %97 = fmul <8 x double> %split4, %splat.splat171 5326 %98 = fadd <8 x double> %96, %97 5327 %splat.splat174 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> <i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53> 5328 %99 = fmul <8 x double> %split5, %splat.splat174 5329 %100 = fadd <8 x double> %98, %99 5330 %splat.splat177 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> <i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54> 5331 %101 = fmul <8 x double> %split6, %splat.splat177 5332 %102 = fadd <8 x double> %100, %101 5333 %splat.splat180 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> <i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55> 5334 %103 = fmul <8 x double> %split7, %splat.splat180 5335 %104 = fadd <8 x double> %102, %103 5336 %splat.splat183 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> <i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56> 5337 %105 = fmul <8 x double> %split, %splat.splat183 5338 %splat.splat186 = shufflevector <64 x double> 
%a1, <64 x double> undef, <8 x i32> <i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57> 5339 %106 = fmul <8 x double> %split1, %splat.splat186 5340 %107 = fadd <8 x double> %105, %106 5341 %splat.splat189 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> <i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58> 5342 %108 = fmul <8 x double> %split2, %splat.splat189 5343 %109 = fadd <8 x double> %107, %108 5344 %splat.splat192 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> <i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59> 5345 %110 = fmul <8 x double> %split3, %splat.splat192 5346 %111 = fadd <8 x double> %109, %110 5347 %splat.splat195 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> <i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60> 5348 %112 = fmul <8 x double> %split4, %splat.splat195 5349 %113 = fadd <8 x double> %111, %112 5350 %splat.splat198 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> <i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61> 5351 %114 = fmul <8 x double> %split5, %splat.splat198 5352 %115 = fadd <8 x double> %113, %114 5353 %splat.splat201 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> <i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62> 5354 %116 = fmul <8 x double> %split6, %splat.splat201 5355 %117 = fadd <8 x double> %115, %116 5356 %splat.splat204 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> <i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63> 5357 %118 = fmul <8 x double> %split7, %splat.splat204 5358 %119 = fadd <8 x double> %117, %118 5359 %120 = shufflevector <8 x double> %14, <8 x double> %29, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> 5360 %121 = shufflevector <8 x double> %44, <8 x double> %59, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> 5361 %122 = shufflevector <8 x double> %74, <8 x double> %89, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> 5362 %123 = shufflevector <8 x double> %104, <8 x double> %119, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> 5363 %124 = shufflevector <16 x double> %120, <16 x double> %121, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> 5364 %125 = shufflevector <16 x double> %122, <16 x double> %123, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> 5365 %126 = shufflevector <32 x double> %124, <32 x double> %125, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, 
i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63> 5366 ret <64 x double> %126 5367} 5368
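; The 8x8 double-precision test above presumably follows the same source-level
; pattern as the 2x2 case documented at the top of this file; the type alias,
; function name, and dimensions below are inferred from the <64 x double>
; operands and 8-wide splats in the IR, not taken from the original source:
;
; using matrix_ty = double __attribute__((matrix_type(8,8)));
; matrix_ty test_mul8x8(matrix_ty a0, matrix_ty a1) nounwind {
;   return a0 * a1;
; }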