xref: /llvm-project/llvm/test/CodeGen/X86/matrix-multiply.ll (revision be6c752e157638849f1f59f7e2b7ecbe11a022fe)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefixes=SSE
3; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
4; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
5; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX,AVX512,AVX512F
6; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX,AVX512,AVX512VL
7
8;
9; Basic matrix multiply tests based on the pattern:
10;
11; using matrix_ty = float __attribute__((matrix_type(2,2)));
12; matrix_ty test_mul2x2(matrix_ty a0, matrix_ty a1) nounwind {
13;     return a0 * a1;
14; }
15;
16
17define <4 x float> @test_mul2x2_f32(<4 x float> %a0, <4 x float> %a1) nounwind {
18; SSE-LABEL: test_mul2x2_f32:
19; SSE:       # %bb.0: # %entry
20; SSE-NEXT:    movaps %xmm1, %xmm2
21; SSE-NEXT:    unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
22; SSE-NEXT:    mulps %xmm0, %xmm2
23; SSE-NEXT:    movaps %xmm1, %xmm3
24; SSE-NEXT:    unpckhps {{.*#+}} xmm3 = xmm3[2],xmm1[2],xmm3[3],xmm1[3]
25; SSE-NEXT:    mulps %xmm0, %xmm3
26; SSE-NEXT:    movaps %xmm0, %xmm4
27; SSE-NEXT:    unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm0[1]
28; SSE-NEXT:    movaps %xmm1, %xmm0
29; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[1,1]
30; SSE-NEXT:    mulps %xmm4, %xmm0
31; SSE-NEXT:    addps %xmm2, %xmm0
32; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
33; SSE-NEXT:    mulps %xmm4, %xmm1
34; SSE-NEXT:    addps %xmm3, %xmm1
35; SSE-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
36; SSE-NEXT:    retq
37;
38; AVX1-LABEL: test_mul2x2_f32:
39; AVX1:       # %bb.0: # %entry
40; AVX1-NEXT:    vshufpd {{.*#+}} xmm2 = xmm0[1,0]
41; AVX1-NEXT:    vmovsldup {{.*#+}} xmm3 = xmm1[0,0,2,2]
42; AVX1-NEXT:    vmulps %xmm3, %xmm0, %xmm3
43; AVX1-NEXT:    vmovshdup {{.*#+}} xmm4 = xmm1[1,1,3,3]
44; AVX1-NEXT:    vmulps %xmm4, %xmm2, %xmm4
45; AVX1-NEXT:    vaddps %xmm4, %xmm3, %xmm3
46; AVX1-NEXT:    vshufps {{.*#+}} xmm4 = xmm1[2,2,2,2]
47; AVX1-NEXT:    vmulps %xmm4, %xmm0, %xmm0
48; AVX1-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
49; AVX1-NEXT:    vmulps %xmm1, %xmm2, %xmm1
50; AVX1-NEXT:    vaddps %xmm1, %xmm0, %xmm0
51; AVX1-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm3[0],xmm0[0]
52; AVX1-NEXT:    retq
53;
54; AVX2-LABEL: test_mul2x2_f32:
55; AVX2:       # %bb.0: # %entry
56; AVX2-NEXT:    vshufpd {{.*#+}} xmm2 = xmm0[1,0]
57; AVX2-NEXT:    vbroadcastss %xmm1, %xmm3
58; AVX2-NEXT:    vmulps %xmm3, %xmm0, %xmm3
59; AVX2-NEXT:    vmovshdup {{.*#+}} xmm4 = xmm1[1,1,3,3]
60; AVX2-NEXT:    vmulps %xmm4, %xmm2, %xmm4
61; AVX2-NEXT:    vaddps %xmm4, %xmm3, %xmm3
62; AVX2-NEXT:    vshufps {{.*#+}} xmm4 = xmm1[2,2,2,2]
63; AVX2-NEXT:    vmulps %xmm4, %xmm0, %xmm0
64; AVX2-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
65; AVX2-NEXT:    vmulps %xmm1, %xmm2, %xmm1
66; AVX2-NEXT:    vaddps %xmm1, %xmm0, %xmm0
67; AVX2-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm3[0],xmm0[0]
68; AVX2-NEXT:    retq
69;
70; AVX512-LABEL: test_mul2x2_f32:
71; AVX512:       # %bb.0: # %entry
72; AVX512-NEXT:    vshufpd {{.*#+}} xmm2 = xmm0[1,0]
73; AVX512-NEXT:    vbroadcastss %xmm1, %xmm3
74; AVX512-NEXT:    vmulps %xmm3, %xmm0, %xmm3
75; AVX512-NEXT:    vmovshdup {{.*#+}} xmm4 = xmm1[1,1,3,3]
76; AVX512-NEXT:    vmulps %xmm4, %xmm2, %xmm4
77; AVX512-NEXT:    vaddps %xmm4, %xmm3, %xmm3
78; AVX512-NEXT:    vshufps {{.*#+}} xmm4 = xmm1[2,2,2,2]
79; AVX512-NEXT:    vmulps %xmm4, %xmm0, %xmm0
80; AVX512-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
81; AVX512-NEXT:    vmulps %xmm1, %xmm2, %xmm1
82; AVX512-NEXT:    vaddps %xmm1, %xmm0, %xmm0
83; AVX512-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm3[0],xmm0[0]
84; AVX512-NEXT:    retq
85entry:
86  %split = shufflevector <4 x float> %a0, <4 x float> poison, <2 x i32> <i32 0, i32 1>
87  %split1 = shufflevector <4 x float> %a0, <4 x float> poison, <2 x i32> <i32 2, i32 3>
88  %splat.splat = shufflevector <4 x float> %a1, <4 x float> undef, <2 x i32> zeroinitializer
89  %0 = fmul <2 x float> %split, %splat.splat
90  %splat.splat6 = shufflevector <4 x float> %a1, <4 x float> undef, <2 x i32> <i32 1, i32 1>
91  %1 = fmul <2 x float> %split1, %splat.splat6
92  %2 = fadd <2 x float> %0, %1
93  %splat.splat9 = shufflevector <4 x float> %a1, <4 x float> undef, <2 x i32> <i32 2, i32 2>
94  %3 = fmul <2 x float> %split, %splat.splat9
95  %splat.splat12 = shufflevector <4 x float> %a1, <4 x float> undef, <2 x i32> <i32 3, i32 3>
96  %4 = fmul <2 x float> %split1, %splat.splat12
97  %5 = fadd <2 x float> %3, %4
98  %6 = shufflevector <2 x float> %2, <2 x float> %5, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
99  ret <4 x float> %6
100}
101
102define <4 x double> @test_mul2x2_f64(<4 x double> %a0, <4 x double> %a1) nounwind {
103; SSE-LABEL: test_mul2x2_f64:
104; SSE:       # %bb.0: # %entry
105; SSE-NEXT:    movapd %xmm2, %xmm4
106; SSE-NEXT:    unpcklpd {{.*#+}} xmm4 = xmm4[0],xmm2[0]
107; SSE-NEXT:    mulpd %xmm0, %xmm4
108; SSE-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1,1]
109; SSE-NEXT:    mulpd %xmm1, %xmm2
110; SSE-NEXT:    addpd %xmm2, %xmm4
111; SSE-NEXT:    movapd %xmm3, %xmm2
112; SSE-NEXT:    unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0]
113; SSE-NEXT:    mulpd %xmm0, %xmm2
114; SSE-NEXT:    unpckhpd {{.*#+}} xmm3 = xmm3[1,1]
115; SSE-NEXT:    mulpd %xmm3, %xmm1
116; SSE-NEXT:    addpd %xmm2, %xmm1
117; SSE-NEXT:    movapd %xmm4, %xmm0
118; SSE-NEXT:    retq
119;
120; AVX-LABEL: test_mul2x2_f64:
121; AVX:       # %bb.0: # %entry
122; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm2
123; AVX-NEXT:    vmovddup {{.*#+}} xmm3 = xmm1[0,0]
124; AVX-NEXT:    vmulpd %xmm3, %xmm0, %xmm3
125; AVX-NEXT:    vshufpd {{.*#+}} xmm4 = xmm1[1,1]
126; AVX-NEXT:    vmulpd %xmm4, %xmm2, %xmm4
127; AVX-NEXT:    vaddpd %xmm4, %xmm3, %xmm3
128; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm1
129; AVX-NEXT:    vmovddup {{.*#+}} xmm4 = xmm1[0,0]
130; AVX-NEXT:    vmulpd %xmm4, %xmm0, %xmm0
131; AVX-NEXT:    vshufpd {{.*#+}} xmm1 = xmm1[1,1]
132; AVX-NEXT:    vmulpd %xmm1, %xmm2, %xmm1
133; AVX-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
134; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm3, %ymm0
135; AVX-NEXT:    retq
136entry:
137  %split = shufflevector <4 x double> %a0, <4 x double> poison, <2 x i32> <i32 0, i32 1>
138  %split1 = shufflevector <4 x double> %a0, <4 x double> poison, <2 x i32> <i32 2, i32 3>
139  %splat.splat = shufflevector <4 x double> %a1, <4 x double> undef, <2 x i32> zeroinitializer
140  %0 = fmul <2 x double> %split, %splat.splat
141  %splat.splat6 = shufflevector <4 x double> %a1, <4 x double> undef, <2 x i32> <i32 1, i32 1>
142  %1 = fmul <2 x double> %split1, %splat.splat6
143  %2 = fadd <2 x double> %0, %1
144  %splat.splat9 = shufflevector <4 x double> %a1, <4 x double> undef, <2 x i32> <i32 2, i32 2>
145  %3 = fmul <2 x double> %split, %splat.splat9
146  %splat.splat12 = shufflevector <4 x double> %a1, <4 x double> undef, <2 x i32> <i32 3, i32 3>
147  %4 = fmul <2 x double> %split1, %splat.splat12
148  %5 = fadd <2 x double> %3, %4
149  %6 = shufflevector <2 x double> %2, <2 x double> %5, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
150  ret <4 x double> %6
151}
152
153define <9 x float> @test_mul3x3_f32(<9 x float> %a0, <9 x float> %a1) nounwind {
154; SSE-LABEL: test_mul3x3_f32:
155; SSE:       # %bb.0: # %entry
156; SSE-NEXT:    movq %rdi, %rax
157; SSE-NEXT:    movss {{.*#+}} xmm8 = mem[0],zero,zero,zero
158; SSE-NEXT:    movss {{.*#+}} xmm10 = mem[0],zero,zero,zero
159; SSE-NEXT:    movss {{.*#+}} xmm9 = mem[0],zero,zero,zero
160; SSE-NEXT:    movss {{.*#+}} xmm11 = mem[0],zero,zero,zero
161; SSE-NEXT:    movss {{.*#+}} xmm12 = mem[0],zero,zero,zero
162; SSE-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
163; SSE-NEXT:    movaps %xmm2, %xmm13
164; SSE-NEXT:    mulss %xmm12, %xmm13
165; SSE-NEXT:    unpcklps {{.*#+}} xmm12 = xmm12[0,0,1,1]
166; SSE-NEXT:    mulps %xmm0, %xmm12
167; SSE-NEXT:    unpcklps {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
168; SSE-NEXT:    movaps %xmm5, %xmm1
169; SSE-NEXT:    mulss %xmm11, %xmm1
170; SSE-NEXT:    unpcklps {{.*#+}} xmm11 = xmm11[0,0,1,1]
171; SSE-NEXT:    mulps %xmm3, %xmm11
172; SSE-NEXT:    addps %xmm12, %xmm11
173; SSE-NEXT:    movss {{.*#+}} xmm4 = mem[0],zero,zero,zero
174; SSE-NEXT:    unpcklps {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1]
175; SSE-NEXT:    movaps %xmm9, %xmm7
176; SSE-NEXT:    mulss %xmm4, %xmm7
177; SSE-NEXT:    unpcklps {{.*#+}} xmm4 = xmm4[0,0,1,1]
178; SSE-NEXT:    mulps %xmm6, %xmm4
179; SSE-NEXT:    addps %xmm11, %xmm4
180; SSE-NEXT:    movss {{.*#+}} xmm11 = mem[0],zero,zero,zero
181; SSE-NEXT:    addss %xmm13, %xmm1
182; SSE-NEXT:    addss %xmm7, %xmm1
183; SSE-NEXT:    movaps %xmm2, %xmm7
184; SSE-NEXT:    mulss %xmm11, %xmm7
185; SSE-NEXT:    unpcklps {{.*#+}} xmm11 = xmm11[0,0,1,1]
186; SSE-NEXT:    mulps %xmm0, %xmm11
187; SSE-NEXT:    movaps %xmm5, %xmm12
188; SSE-NEXT:    mulss %xmm10, %xmm12
189; SSE-NEXT:    unpcklps {{.*#+}} xmm10 = xmm10[0,0,1,1]
190; SSE-NEXT:    mulps %xmm3, %xmm10
191; SSE-NEXT:    addps %xmm11, %xmm10
192; SSE-NEXT:    movaps %xmm9, %xmm11
193; SSE-NEXT:    mulss %xmm8, %xmm11
194; SSE-NEXT:    unpcklps {{.*#+}} xmm8 = xmm8[0,0,1,1]
195; SSE-NEXT:    mulps %xmm6, %xmm8
196; SSE-NEXT:    addps %xmm10, %xmm8
197; SSE-NEXT:    addss %xmm7, %xmm12
198; SSE-NEXT:    addss %xmm11, %xmm12
199; SSE-NEXT:    movaps %xmm8, %xmm7
200; SSE-NEXT:    movlhps {{.*#+}} xmm7 = xmm7[0],xmm12[0]
201; SSE-NEXT:    movss {{.*#+}} xmm10 = mem[0],zero,zero,zero
202; SSE-NEXT:    mulss %xmm10, %xmm2
203; SSE-NEXT:    unpcklps {{.*#+}} xmm10 = xmm10[0,0,1,1]
204; SSE-NEXT:    mulps %xmm0, %xmm10
205; SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
206; SSE-NEXT:    mulss %xmm0, %xmm5
207; SSE-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0,0,1,1]
208; SSE-NEXT:    mulps %xmm3, %xmm0
209; SSE-NEXT:    addps %xmm10, %xmm0
210; SSE-NEXT:    movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
211; SSE-NEXT:    mulss %xmm3, %xmm9
212; SSE-NEXT:    unpcklps {{.*#+}} xmm3 = xmm3[0,0,1,1]
213; SSE-NEXT:    mulps %xmm6, %xmm3
214; SSE-NEXT:    addps %xmm0, %xmm3
215; SSE-NEXT:    shufps {{.*#+}} xmm7 = xmm7[1,2],xmm3[0,1]
216; SSE-NEXT:    addss %xmm2, %xmm5
217; SSE-NEXT:    addss %xmm9, %xmm5
218; SSE-NEXT:    movlhps {{.*#+}} xmm8 = xmm8[0],xmm1[0]
219; SSE-NEXT:    shufps {{.*#+}} xmm4 = xmm4[0,1],xmm8[2,0]
220; SSE-NEXT:    movss %xmm5, 32(%rdi)
221; SSE-NEXT:    movaps %xmm7, 16(%rdi)
222; SSE-NEXT:    movaps %xmm4, (%rdi)
223; SSE-NEXT:    retq
224;
225; AVX1-LABEL: test_mul3x3_f32:
226; AVX1:       # %bb.0: # %entry
227; AVX1-NEXT:    movq %rdi, %rax
228; AVX1-NEXT:    vmovss {{.*#+}} xmm8 = mem[0],zero,zero,zero
229; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
230; AVX1-NEXT:    vbroadcastss {{[0-9]+}}(%rsp), %xmm1
231; AVX1-NEXT:    vmulps %xmm1, %xmm0, %xmm9
232; AVX1-NEXT:    vinsertps {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[2,3]
233; AVX1-NEXT:    vbroadcastss {{[0-9]+}}(%rsp), %xmm4
234; AVX1-NEXT:    vmulps %xmm4, %xmm3, %xmm10
235; AVX1-NEXT:    vaddps %xmm10, %xmm9, %xmm9
236; AVX1-NEXT:    vinsertps {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[2,3]
237; AVX1-NEXT:    vbroadcastss {{[0-9]+}}(%rsp), %xmm7
238; AVX1-NEXT:    vmulps %xmm7, %xmm6, %xmm10
239; AVX1-NEXT:    vaddps %xmm10, %xmm9, %xmm9
240; AVX1-NEXT:    vmulss %xmm1, %xmm2, %xmm1
241; AVX1-NEXT:    vmulss %xmm4, %xmm5, %xmm4
242; AVX1-NEXT:    vaddss %xmm4, %xmm1, %xmm1
243; AVX1-NEXT:    vmulss %xmm7, %xmm8, %xmm4
244; AVX1-NEXT:    vaddss %xmm4, %xmm1, %xmm1
245; AVX1-NEXT:    vinsertps {{.*#+}} xmm1 = xmm9[0,1],xmm1[0],xmm9[3]
246; AVX1-NEXT:    vbroadcastss {{[0-9]+}}(%rsp), %xmm4
247; AVX1-NEXT:    vmulps %xmm4, %xmm0, %xmm7
248; AVX1-NEXT:    vbroadcastss {{[0-9]+}}(%rsp), %xmm9
249; AVX1-NEXT:    vmulps %xmm3, %xmm9, %xmm10
250; AVX1-NEXT:    vaddps %xmm7, %xmm10, %xmm7
251; AVX1-NEXT:    vbroadcastss {{[0-9]+}}(%rsp), %xmm10
252; AVX1-NEXT:    vmulps %xmm6, %xmm10, %xmm11
253; AVX1-NEXT:    vaddps %xmm7, %xmm11, %xmm7
254; AVX1-NEXT:    vmulss %xmm4, %xmm2, %xmm4
255; AVX1-NEXT:    vmulss %xmm5, %xmm9, %xmm9
256; AVX1-NEXT:    vaddss %xmm4, %xmm9, %xmm4
257; AVX1-NEXT:    vmulss %xmm10, %xmm8, %xmm9
258; AVX1-NEXT:    vaddss %xmm4, %xmm9, %xmm4
259; AVX1-NEXT:    vinsertps {{.*#+}} xmm4 = xmm7[0,1],xmm4[0],xmm7[3]
260; AVX1-NEXT:    vbroadcastss {{[0-9]+}}(%rsp), %xmm9
261; AVX1-NEXT:    vmulps %xmm0, %xmm9, %xmm0
262; AVX1-NEXT:    vbroadcastss {{[0-9]+}}(%rsp), %xmm10
263; AVX1-NEXT:    vmulps %xmm3, %xmm10, %xmm3
264; AVX1-NEXT:    vaddps %xmm3, %xmm0, %xmm0
265; AVX1-NEXT:    vbroadcastss {{[0-9]+}}(%rsp), %xmm3
266; AVX1-NEXT:    vmulps %xmm3, %xmm6, %xmm6
267; AVX1-NEXT:    vaddps %xmm6, %xmm0, %xmm0
268; AVX1-NEXT:    vmulss %xmm2, %xmm9, %xmm2
269; AVX1-NEXT:    vmulss %xmm5, %xmm10, %xmm5
270; AVX1-NEXT:    vaddss %xmm5, %xmm2, %xmm2
271; AVX1-NEXT:    vmulss %xmm3, %xmm8, %xmm3
272; AVX1-NEXT:    vaddss %xmm3, %xmm2, %xmm2
273; AVX1-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm7[0]
274; AVX1-NEXT:    vshufps {{.*#+}} xmm3 = xmm4[1,2,2,3]
275; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm1
276; AVX1-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
277; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
278; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
279; AVX1-NEXT:    vmovss %xmm2, 32(%rdi)
280; AVX1-NEXT:    vmovaps %ymm0, (%rdi)
281; AVX1-NEXT:    vzeroupper
282; AVX1-NEXT:    retq
283;
284; AVX2-LABEL: test_mul3x3_f32:
285; AVX2:       # %bb.0: # %entry
286; AVX2-NEXT:    movq %rdi, %rax
287; AVX2-NEXT:    vmovss {{.*#+}} xmm8 = mem[0],zero,zero,zero
288; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
289; AVX2-NEXT:    vbroadcastss {{[0-9]+}}(%rsp), %xmm1
290; AVX2-NEXT:    vmulps %xmm1, %xmm0, %xmm9
291; AVX2-NEXT:    vinsertps {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[2,3]
292; AVX2-NEXT:    vbroadcastss {{[0-9]+}}(%rsp), %xmm4
293; AVX2-NEXT:    vmulps %xmm4, %xmm3, %xmm10
294; AVX2-NEXT:    vaddps %xmm10, %xmm9, %xmm9
295; AVX2-NEXT:    vinsertps {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[2,3]
296; AVX2-NEXT:    vbroadcastss {{[0-9]+}}(%rsp), %xmm7
297; AVX2-NEXT:    vmulps %xmm7, %xmm6, %xmm10
298; AVX2-NEXT:    vaddps %xmm10, %xmm9, %xmm9
299; AVX2-NEXT:    vmulss %xmm1, %xmm2, %xmm1
300; AVX2-NEXT:    vmulss %xmm4, %xmm5, %xmm4
301; AVX2-NEXT:    vaddss %xmm4, %xmm1, %xmm1
302; AVX2-NEXT:    vmulss %xmm7, %xmm8, %xmm4
303; AVX2-NEXT:    vaddss %xmm4, %xmm1, %xmm1
304; AVX2-NEXT:    vinsertps {{.*#+}} xmm1 = xmm9[0,1],xmm1[0],xmm9[3]
305; AVX2-NEXT:    vbroadcastss {{[0-9]+}}(%rsp), %xmm4
306; AVX2-NEXT:    vmulps %xmm4, %xmm0, %xmm7
307; AVX2-NEXT:    vbroadcastss {{[0-9]+}}(%rsp), %xmm9
308; AVX2-NEXT:    vmulps %xmm3, %xmm9, %xmm10
309; AVX2-NEXT:    vaddps %xmm7, %xmm10, %xmm7
310; AVX2-NEXT:    vbroadcastss {{[0-9]+}}(%rsp), %xmm10
311; AVX2-NEXT:    vmulps %xmm6, %xmm10, %xmm11
312; AVX2-NEXT:    vaddps %xmm7, %xmm11, %xmm7
313; AVX2-NEXT:    vmulss %xmm4, %xmm2, %xmm4
314; AVX2-NEXT:    vmulss %xmm5, %xmm9, %xmm9
315; AVX2-NEXT:    vaddss %xmm4, %xmm9, %xmm4
316; AVX2-NEXT:    vmulss %xmm10, %xmm8, %xmm9
317; AVX2-NEXT:    vaddss %xmm4, %xmm9, %xmm4
318; AVX2-NEXT:    vinsertps {{.*#+}} xmm4 = xmm7[0,1],xmm4[0],xmm7[3]
319; AVX2-NEXT:    vbroadcastss {{[0-9]+}}(%rsp), %xmm7
320; AVX2-NEXT:    vmulps %xmm7, %xmm0, %xmm0
321; AVX2-NEXT:    vbroadcastss {{[0-9]+}}(%rsp), %xmm9
322; AVX2-NEXT:    vmulps %xmm3, %xmm9, %xmm3
323; AVX2-NEXT:    vaddps %xmm3, %xmm0, %xmm0
324; AVX2-NEXT:    vbroadcastss {{[0-9]+}}(%rsp), %xmm3
325; AVX2-NEXT:    vmulps %xmm3, %xmm6, %xmm6
326; AVX2-NEXT:    vaddps %xmm6, %xmm0, %xmm0
327; AVX2-NEXT:    vmulss %xmm7, %xmm2, %xmm2
328; AVX2-NEXT:    vmulss %xmm5, %xmm9, %xmm5
329; AVX2-NEXT:    vaddss %xmm5, %xmm2, %xmm2
330; AVX2-NEXT:    vmulss %xmm3, %xmm8, %xmm3
331; AVX2-NEXT:    vaddss %xmm3, %xmm2, %xmm2
332; AVX2-NEXT:    vinsertf128 $1, %xmm4, %ymm1, %ymm1
333; AVX2-NEXT:    vmovaps {{.*#+}} ymm3 = [0,1,2,4,5,6,u,u]
334; AVX2-NEXT:    vpermps %ymm1, %ymm3, %ymm1
335; AVX2-NEXT:    vbroadcastsd %xmm0, %ymm0
336; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
337; AVX2-NEXT:    vmovss %xmm2, 32(%rdi)
338; AVX2-NEXT:    vmovaps %ymm0, (%rdi)
339; AVX2-NEXT:    vzeroupper
340; AVX2-NEXT:    retq
341;
342; AVX512F-LABEL: test_mul3x3_f32:
343; AVX512F:       # %bb.0: # %entry
344; AVX512F-NEXT:    valignd {{.*#+}} zmm2 = zmm0[3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,2]
345; AVX512F-NEXT:    vbroadcastss %xmm1, %xmm3
346; AVX512F-NEXT:    vmulps %xmm3, %xmm0, %xmm3
347; AVX512F-NEXT:    vextractf128 $1, %ymm0, %xmm5
348; AVX512F-NEXT:    vmovshdup {{.*#+}} xmm6 = xmm1[1,1,3,3]
349; AVX512F-NEXT:    vmulps %xmm6, %xmm2, %xmm4
350; AVX512F-NEXT:    vaddps %xmm4, %xmm3, %xmm4
351; AVX512F-NEXT:    vshufpd {{.*#+}} xmm3 = xmm5[1,0]
352; AVX512F-NEXT:    vshufps {{.*#+}} xmm7 = xmm1[3,3,3,3]
353; AVX512F-NEXT:    vshufpd {{.*#+}} xmm8 = xmm1[1,0]
354; AVX512F-NEXT:    vshufps {{.*#+}} xmm9 = xmm1[2,2,2,2]
355; AVX512F-NEXT:    vmulps %xmm3, %xmm9, %xmm9
356; AVX512F-NEXT:    vaddps %xmm4, %xmm9, %xmm9
357; AVX512F-NEXT:    vshufpd {{.*#+}} xmm4 = xmm0[1,0]
358; AVX512F-NEXT:    vmulss %xmm1, %xmm4, %xmm10
359; AVX512F-NEXT:    vmovshdup {{.*#+}} xmm5 = xmm5[1,1,3,3]
360; AVX512F-NEXT:    vmulss %xmm6, %xmm5, %xmm6
361; AVX512F-NEXT:    vaddss %xmm6, %xmm10, %xmm6
362; AVX512F-NEXT:    vextractf32x4 $2, %zmm0, %xmm10
363; AVX512F-NEXT:    vmulss %xmm8, %xmm10, %xmm8
364; AVX512F-NEXT:    vaddss %xmm6, %xmm8, %xmm6
365; AVX512F-NEXT:    vinsertps {{.*#+}} xmm6 = xmm9[0,1],xmm6[0],xmm9[3]
366; AVX512F-NEXT:    vmulps %xmm7, %xmm0, %xmm8
367; AVX512F-NEXT:    vextractf128 $1, %ymm1, %xmm9
368; AVX512F-NEXT:    vmovsldup {{.*#+}} xmm11 = xmm9[0,0,2,2]
369; AVX512F-NEXT:    vmulps %xmm2, %xmm11, %xmm11
370; AVX512F-NEXT:    vaddps %xmm11, %xmm8, %xmm8
371; AVX512F-NEXT:    vmovshdup {{.*#+}} xmm11 = xmm9[1,1,3,3]
372; AVX512F-NEXT:    vmulps %xmm3, %xmm11, %xmm12
373; AVX512F-NEXT:    vaddps %xmm12, %xmm8, %xmm8
374; AVX512F-NEXT:    vmulss %xmm7, %xmm4, %xmm7
375; AVX512F-NEXT:    vmulss %xmm5, %xmm9, %xmm12
376; AVX512F-NEXT:    vaddss %xmm7, %xmm12, %xmm7
377; AVX512F-NEXT:    vmulss %xmm11, %xmm10, %xmm11
378; AVX512F-NEXT:    vaddss %xmm7, %xmm11, %xmm7
379; AVX512F-NEXT:    vinsertps {{.*#+}} xmm7 = xmm8[0,1],xmm7[0],xmm8[3]
380; AVX512F-NEXT:    vshufps {{.*#+}} xmm8 = xmm9[3,3,3,3]
381; AVX512F-NEXT:    vshufpd {{.*#+}} xmm11 = xmm9[1,0]
382; AVX512F-NEXT:    vshufps {{.*#+}} xmm9 = xmm9[2,2,2,2]
383; AVX512F-NEXT:    vmulps %xmm0, %xmm9, %xmm0
384; AVX512F-NEXT:    vmulps %xmm2, %xmm8, %xmm2
385; AVX512F-NEXT:    vaddps %xmm2, %xmm0, %xmm0
386; AVX512F-NEXT:    vextractf32x4 $2, %zmm1, %xmm1
387; AVX512F-NEXT:    vbroadcastss %xmm1, %xmm2
388; AVX512F-NEXT:    vmulps %xmm2, %xmm3, %xmm2
389; AVX512F-NEXT:    vaddps %xmm2, %xmm0, %xmm0
390; AVX512F-NEXT:    vmulss %xmm4, %xmm11, %xmm2
391; AVX512F-NEXT:    vmulss %xmm5, %xmm8, %xmm3
392; AVX512F-NEXT:    vaddss %xmm3, %xmm2, %xmm2
393; AVX512F-NEXT:    vmulss %xmm1, %xmm10, %xmm1
394; AVX512F-NEXT:    vaddss %xmm1, %xmm2, %xmm1
395; AVX512F-NEXT:    vinsertps {{.*#+}} xmm1 = xmm0[0,1],xmm1[0],xmm0[3]
396; AVX512F-NEXT:    vinsertf32x4 $1, %xmm7, %zmm6, %zmm2
397; AVX512F-NEXT:    vpmovsxbd {{.*#+}} zmm0 = [0,1,2,4,5,6,16,17,18,0,0,0,0,0,0,0]
398; AVX512F-NEXT:    vpermi2ps %zmm1, %zmm2, %zmm0
399; AVX512F-NEXT:    retq
400;
401; AVX512VL-LABEL: test_mul3x3_f32:
402; AVX512VL:       # %bb.0: # %entry
403; AVX512VL-NEXT:    valignd {{.*#+}} zmm2 = zmm0[3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,2]
404; AVX512VL-NEXT:    vbroadcastss %xmm1, %xmm3
405; AVX512VL-NEXT:    vmulps %xmm3, %xmm0, %xmm3
406; AVX512VL-NEXT:    vextractf128 $1, %ymm0, %xmm4
407; AVX512VL-NEXT:    vmovshdup {{.*#+}} xmm5 = xmm1[1,1,3,3]
408; AVX512VL-NEXT:    vmulps %xmm5, %xmm2, %xmm6
409; AVX512VL-NEXT:    vaddps %xmm6, %xmm3, %xmm3
410; AVX512VL-NEXT:    vshufpd {{.*#+}} xmm6 = xmm4[1,0]
411; AVX512VL-NEXT:    vshufps {{.*#+}} xmm7 = xmm1[3,3,3,3]
412; AVX512VL-NEXT:    vshufpd {{.*#+}} xmm8 = xmm1[1,0]
413; AVX512VL-NEXT:    vshufps {{.*#+}} xmm9 = xmm1[2,2,2,2]
414; AVX512VL-NEXT:    vmulps %xmm6, %xmm9, %xmm9
415; AVX512VL-NEXT:    vaddps %xmm3, %xmm9, %xmm3
416; AVX512VL-NEXT:    vshufpd {{.*#+}} xmm9 = xmm0[1,0]
417; AVX512VL-NEXT:    vmulss %xmm1, %xmm9, %xmm10
418; AVX512VL-NEXT:    vmovshdup {{.*#+}} xmm4 = xmm4[1,1,3,3]
419; AVX512VL-NEXT:    vmulss %xmm5, %xmm4, %xmm5
420; AVX512VL-NEXT:    vaddss %xmm5, %xmm10, %xmm5
421; AVX512VL-NEXT:    vextractf32x4 $2, %zmm0, %xmm10
422; AVX512VL-NEXT:    vmulss %xmm8, %xmm10, %xmm8
423; AVX512VL-NEXT:    vaddss %xmm5, %xmm8, %xmm5
424; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm3 = xmm3[0,1],xmm5[0],xmm3[3]
425; AVX512VL-NEXT:    vmulps %xmm7, %xmm0, %xmm5
426; AVX512VL-NEXT:    vextractf128 $1, %ymm1, %xmm8
427; AVX512VL-NEXT:    vmovsldup {{.*#+}} xmm11 = xmm8[0,0,2,2]
428; AVX512VL-NEXT:    vmulps %xmm2, %xmm11, %xmm11
429; AVX512VL-NEXT:    vaddps %xmm5, %xmm11, %xmm5
430; AVX512VL-NEXT:    vmovshdup {{.*#+}} xmm11 = xmm8[1,1,3,3]
431; AVX512VL-NEXT:    vmulps %xmm6, %xmm11, %xmm12
432; AVX512VL-NEXT:    vaddps %xmm5, %xmm12, %xmm5
433; AVX512VL-NEXT:    vmulss %xmm7, %xmm9, %xmm7
434; AVX512VL-NEXT:    vmulss %xmm4, %xmm8, %xmm12
435; AVX512VL-NEXT:    vaddss %xmm7, %xmm12, %xmm7
436; AVX512VL-NEXT:    vmulss %xmm11, %xmm10, %xmm11
437; AVX512VL-NEXT:    vaddss %xmm7, %xmm11, %xmm7
438; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm5 = xmm5[0,1],xmm7[0],xmm5[3]
439; AVX512VL-NEXT:    vshufps {{.*#+}} xmm7 = xmm8[3,3,3,3]
440; AVX512VL-NEXT:    vshufpd {{.*#+}} xmm11 = xmm8[1,0]
441; AVX512VL-NEXT:    vshufps {{.*#+}} xmm8 = xmm8[2,2,2,2]
442; AVX512VL-NEXT:    vmulps %xmm0, %xmm8, %xmm0
443; AVX512VL-NEXT:    vmulps %xmm7, %xmm2, %xmm2
444; AVX512VL-NEXT:    vaddps %xmm2, %xmm0, %xmm0
445; AVX512VL-NEXT:    vextractf32x4 $2, %zmm1, %xmm1
446; AVX512VL-NEXT:    vbroadcastss %xmm1, %xmm2
447; AVX512VL-NEXT:    vmulps %xmm2, %xmm6, %xmm2
448; AVX512VL-NEXT:    vaddps %xmm2, %xmm0, %xmm0
449; AVX512VL-NEXT:    vmulss %xmm11, %xmm9, %xmm2
450; AVX512VL-NEXT:    vmulss %xmm7, %xmm4, %xmm4
451; AVX512VL-NEXT:    vaddss %xmm4, %xmm2, %xmm2
452; AVX512VL-NEXT:    vmulss %xmm1, %xmm10, %xmm1
453; AVX512VL-NEXT:    vaddss %xmm1, %xmm2, %xmm1
454; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm1 = xmm0[0,1],xmm1[0],xmm0[3]
455; AVX512VL-NEXT:    vinsertf32x4 $1, %xmm5, %zmm3, %zmm2
456; AVX512VL-NEXT:    vpmovsxbd {{.*#+}} zmm0 = [0,1,2,4,5,6,16,17,18,0,0,0,0,0,0,0]
457; AVX512VL-NEXT:    vpermi2ps %zmm1, %zmm2, %zmm0
458; AVX512VL-NEXT:    retq
459entry:
460  %block = shufflevector <9 x float> %a0, <9 x float> poison, <2 x i32> <i32 0, i32 1>
461  %splat.splat = shufflevector <9 x float> %a1, <9 x float> undef, <2 x i32> zeroinitializer
462  %0 = fmul <2 x float> %block, %splat.splat
463  %block6 = shufflevector <9 x float> %a0, <9 x float> poison, <2 x i32> <i32 3, i32 4>
464  %splat.splat8 = shufflevector <9 x float> %a1, <9 x float> undef, <2 x i32> <i32 1, i32 1>
465  %1 = fmul <2 x float> %block6, %splat.splat8
466  %2 = fadd <2 x float> %0, %1
467  %block9 = shufflevector <9 x float> %a0, <9 x float> poison, <2 x i32> <i32 6, i32 7>
468  %splat.splat11 = shufflevector <9 x float> %a1, <9 x float> undef, <2 x i32> <i32 2, i32 2>
469  %3 = fmul <2 x float> %block9, %splat.splat11
470  %4 = fadd <2 x float> %2, %3
471  %5 = shufflevector <2 x float> %4, <2 x float> poison, <3 x i32> <i32 0, i32 1, i32 undef>
472  %block12 = shufflevector <9 x float> %a0, <9 x float> poison, <1 x i32> <i32 2>
473  %splat.splatinsert13 = shufflevector <9 x float> %a1, <9 x float> undef, <1 x i32> zeroinitializer
474  %6 = fmul <1 x float> %block12, %splat.splatinsert13
475  %block15 = shufflevector <9 x float> %a0, <9 x float> poison, <1 x i32> <i32 5>
476  %splat.splatinsert16 = shufflevector <9 x float> %a1, <9 x float> undef, <1 x i32> <i32 1>
477  %7 = fmul <1 x float> %block15, %splat.splatinsert16
478  %8 = fadd <1 x float> %6, %7
479  %block18 = shufflevector <9 x float> %a0, <9 x float> poison, <1 x i32> <i32 8>
480  %splat.splatinsert19 = shufflevector <9 x float> %a1, <9 x float> undef, <1 x i32> <i32 2>
481  %9 = fmul <1 x float> %block18, %splat.splatinsert19
482  %10 = fadd <1 x float> %8, %9
483  %11 = shufflevector <1 x float> %10, <1 x float> poison, <3 x i32> <i32 0, i32 undef, i32 undef>
484  %12 = shufflevector <3 x float> %5, <3 x float> %11, <3 x i32> <i32 0, i32 1, i32 3>
485  %splat.splat23 = shufflevector <9 x float> %a1, <9 x float> undef, <2 x i32> <i32 3, i32 3>
486  %13 = fmul <2 x float> %block, %splat.splat23
487  %splat.splat26 = shufflevector <9 x float> %a1, <9 x float> undef, <2 x i32> <i32 4, i32 4>
488  %14 = fmul <2 x float> %block6, %splat.splat26
489  %15 = fadd <2 x float> %13, %14
490  %splat.splat29 = shufflevector <9 x float> %a1, <9 x float> undef, <2 x i32> <i32 5, i32 5>
491  %16 = fmul <2 x float> %block9, %splat.splat29
492  %17 = fadd <2 x float> %15, %16
493  %18 = shufflevector <2 x float> %17, <2 x float> poison, <3 x i32> <i32 0, i32 1, i32 undef>
494  %splat.splatinsert31 = shufflevector <9 x float> %a1, <9 x float> undef, <1 x i32> <i32 3>
495  %19 = fmul <1 x float> %block12, %splat.splatinsert31
496  %splat.splatinsert34 = shufflevector <9 x float> %a1, <9 x float> undef, <1 x i32> <i32 4>
497  %20 = fmul <1 x float> %block15, %splat.splatinsert34
498  %21 = fadd <1 x float> %19, %20
499  %splat.splatinsert37 = shufflevector <9 x float> %a1, <9 x float> undef, <1 x i32> <i32 5>
500  %22 = fmul <1 x float> %block18, %splat.splatinsert37
501  %23 = fadd <1 x float> %21, %22
502  %24 = shufflevector <1 x float> %23, <1 x float> poison, <3 x i32> <i32 0, i32 undef, i32 undef>
503  %25 = shufflevector <3 x float> %18, <3 x float> %24, <3 x i32> <i32 0, i32 1, i32 3>
504  %splat.splat41 = shufflevector <9 x float> %a1, <9 x float> undef, <2 x i32> <i32 6, i32 6>
505  %26 = fmul <2 x float> %block, %splat.splat41
506  %splat.splat44 = shufflevector <9 x float> %a1, <9 x float> undef, <2 x i32> <i32 7, i32 7>
507  %27 = fmul <2 x float> %block6, %splat.splat44
508  %28 = fadd <2 x float> %26, %27
509  %splat.splat47 = shufflevector <9 x float> %a1, <9 x float> undef, <2 x i32> <i32 8, i32 8>
510  %29 = fmul <2 x float> %block9, %splat.splat47
511  %30 = fadd <2 x float> %28, %29
512  %31 = shufflevector <2 x float> %30, <2 x float> poison, <3 x i32> <i32 0, i32 1, i32 undef>
513  %splat.splatinsert49 = shufflevector <9 x float> %a1, <9 x float> undef, <1 x i32> <i32 6>
514  %32 = fmul <1 x float> %block12, %splat.splatinsert49
515  %splat.splatinsert52 = shufflevector <9 x float> %a1, <9 x float> undef, <1 x i32> <i32 7>
516  %33 = fmul <1 x float> %block15, %splat.splatinsert52
517  %34 = fadd <1 x float> %32, %33
518  %35 = fmul <9 x float> %a0, %a1
519  %36 = shufflevector <9 x float> %35, <9 x float> poison, <1 x i32> <i32 8>
520  %37 = fadd <1 x float> %34, %36
521  %38 = shufflevector <1 x float> %37, <1 x float> poison, <3 x i32> <i32 0, i32 undef, i32 undef>
522  %39 = shufflevector <3 x float> %31, <3 x float> %38, <3 x i32> <i32 0, i32 1, i32 3>
523  %40 = shufflevector <3 x float> %12, <3 x float> %25, <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5>
524  %41 = shufflevector <3 x float> %39, <3 x float> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 undef, i32 undef, i32 undef>
525  %42 = shufflevector <6 x float> %40, <6 x float> %41, <9 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
526  ret <9 x float> %42
527}
528
529define <9 x double> @test_mul3x3_f64(<9 x double> %a0, <9 x double> %a1) nounwind {
530; SSE-LABEL: test_mul3x3_f64:
531; SSE:       # %bb.0: # %entry
532; SSE-NEXT:    movq %rdi, %rax
533; SSE-NEXT:    movsd {{.*#+}} xmm8 = mem[0],zero
534; SSE-NEXT:    movsd {{.*#+}} xmm10 = mem[0],zero
535; SSE-NEXT:    movsd {{.*#+}} xmm9 = mem[0],zero
536; SSE-NEXT:    movsd {{.*#+}} xmm11 = mem[0],zero
537; SSE-NEXT:    movsd {{.*#+}} xmm12 = mem[0],zero
538; SSE-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
539; SSE-NEXT:    movapd %xmm2, %xmm13
540; SSE-NEXT:    mulsd %xmm12, %xmm13
541; SSE-NEXT:    unpcklpd {{.*#+}} xmm12 = xmm12[0,0]
542; SSE-NEXT:    mulpd %xmm0, %xmm12
543; SSE-NEXT:    unpcklpd {{.*#+}} xmm3 = xmm3[0],xmm4[0]
544; SSE-NEXT:    movapd %xmm5, %xmm1
545; SSE-NEXT:    mulsd %xmm11, %xmm1
546; SSE-NEXT:    unpcklpd {{.*#+}} xmm11 = xmm11[0,0]
547; SSE-NEXT:    mulpd %xmm3, %xmm11
548; SSE-NEXT:    addpd %xmm12, %xmm11
549; SSE-NEXT:    movsd {{.*#+}} xmm4 = mem[0],zero
550; SSE-NEXT:    unpcklpd {{.*#+}} xmm6 = xmm6[0],xmm7[0]
551; SSE-NEXT:    movapd %xmm9, %xmm7
552; SSE-NEXT:    mulsd %xmm4, %xmm7
553; SSE-NEXT:    unpcklpd {{.*#+}} xmm4 = xmm4[0,0]
554; SSE-NEXT:    mulpd %xmm6, %xmm4
555; SSE-NEXT:    addpd %xmm11, %xmm4
556; SSE-NEXT:    movsd {{.*#+}} xmm11 = mem[0],zero
557; SSE-NEXT:    addsd %xmm13, %xmm1
558; SSE-NEXT:    addsd %xmm7, %xmm1
559; SSE-NEXT:    movapd %xmm2, %xmm12
560; SSE-NEXT:    mulsd %xmm11, %xmm12
561; SSE-NEXT:    unpcklpd {{.*#+}} xmm11 = xmm11[0,0]
562; SSE-NEXT:    mulpd %xmm0, %xmm11
563; SSE-NEXT:    movapd %xmm5, %xmm7
564; SSE-NEXT:    mulsd %xmm10, %xmm7
565; SSE-NEXT:    unpcklpd {{.*#+}} xmm10 = xmm10[0,0]
566; SSE-NEXT:    mulpd %xmm3, %xmm10
567; SSE-NEXT:    addpd %xmm11, %xmm10
568; SSE-NEXT:    movapd %xmm9, %xmm11
569; SSE-NEXT:    mulsd %xmm8, %xmm11
570; SSE-NEXT:    unpcklpd {{.*#+}} xmm8 = xmm8[0,0]
571; SSE-NEXT:    mulpd %xmm6, %xmm8
572; SSE-NEXT:    addpd %xmm10, %xmm8
573; SSE-NEXT:    addsd %xmm12, %xmm7
574; SSE-NEXT:    addsd %xmm11, %xmm7
575; SSE-NEXT:    movsd {{.*#+}} xmm10 = mem[0],zero
576; SSE-NEXT:    mulsd %xmm10, %xmm2
577; SSE-NEXT:    unpcklpd {{.*#+}} xmm10 = xmm10[0,0]
578; SSE-NEXT:    mulpd %xmm0, %xmm10
579; SSE-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
580; SSE-NEXT:    mulsd %xmm0, %xmm5
581; SSE-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0,0]
582; SSE-NEXT:    mulpd %xmm3, %xmm0
583; SSE-NEXT:    addpd %xmm10, %xmm0
584; SSE-NEXT:    movsd {{.*#+}} xmm3 = mem[0],zero
585; SSE-NEXT:    mulsd %xmm3, %xmm9
586; SSE-NEXT:    unpcklpd {{.*#+}} xmm3 = xmm3[0,0]
587; SSE-NEXT:    mulpd %xmm6, %xmm3
588; SSE-NEXT:    addpd %xmm0, %xmm3
589; SSE-NEXT:    addsd %xmm2, %xmm5
590; SSE-NEXT:    addsd %xmm9, %xmm5
591; SSE-NEXT:    unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm8[0]
592; SSE-NEXT:    shufpd {{.*#+}} xmm8 = xmm8[1],xmm7[0]
593; SSE-NEXT:    movsd %xmm5, 64(%rdi)
594; SSE-NEXT:    movapd %xmm3, 48(%rdi)
595; SSE-NEXT:    movapd %xmm4, (%rdi)
596; SSE-NEXT:    movapd %xmm8, 32(%rdi)
597; SSE-NEXT:    movapd %xmm1, 16(%rdi)
598; SSE-NEXT:    retq
599;
600; AVX1-LABEL: test_mul3x3_f64:
601; AVX1:       # %bb.0: # %entry
602; AVX1-NEXT:    movq %rdi, %rax
603; AVX1-NEXT:    vmovsd {{.*#+}} xmm8 = mem[0],zero
604; AVX1-NEXT:    vunpcklpd {{.*#+}} xmm1 = xmm0[0],xmm1[0]
605; AVX1-NEXT:    vmovddup {{.*#+}} xmm9 = mem[0,0]
606; AVX1-NEXT:    vmulpd %xmm1, %xmm9, %xmm0
607; AVX1-NEXT:    vunpcklpd {{.*#+}} xmm3 = xmm3[0],xmm4[0]
608; AVX1-NEXT:    vmovddup {{.*#+}} xmm4 = mem[0,0]
609; AVX1-NEXT:    vmulpd %xmm4, %xmm3, %xmm10
610; AVX1-NEXT:    vaddpd %xmm0, %xmm10, %xmm0
611; AVX1-NEXT:    vunpcklpd {{.*#+}} xmm6 = xmm6[0],xmm7[0]
612; AVX1-NEXT:    vmovddup {{.*#+}} xmm7 = mem[0,0]
613; AVX1-NEXT:    vmulpd %xmm7, %xmm6, %xmm10
614; AVX1-NEXT:    vaddpd %xmm0, %xmm10, %xmm0
615; AVX1-NEXT:    vmulsd %xmm2, %xmm9, %xmm9
616; AVX1-NEXT:    vmulsd %xmm4, %xmm5, %xmm4
617; AVX1-NEXT:    vaddsd %xmm4, %xmm9, %xmm4
618; AVX1-NEXT:    vmulsd %xmm7, %xmm8, %xmm7
619; AVX1-NEXT:    vaddsd %xmm7, %xmm4, %xmm4
620; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm4
621; AVX1-NEXT:    vmovddup {{.*#+}} xmm7 = mem[0,0]
622; AVX1-NEXT:    vmulpd %xmm7, %xmm1, %xmm9
623; AVX1-NEXT:    vmovddup {{.*#+}} xmm10 = mem[0,0]
624; AVX1-NEXT:    vmulpd %xmm3, %xmm10, %xmm11
625; AVX1-NEXT:    vaddpd %xmm11, %xmm9, %xmm9
626; AVX1-NEXT:    vmovddup {{.*#+}} xmm11 = mem[0,0]
627; AVX1-NEXT:    vmulpd %xmm6, %xmm11, %xmm12
628; AVX1-NEXT:    vaddpd %xmm12, %xmm9, %xmm9
629; AVX1-NEXT:    vmulsd %xmm7, %xmm2, %xmm7
630; AVX1-NEXT:    vmulsd %xmm5, %xmm10, %xmm10
631; AVX1-NEXT:    vaddsd %xmm7, %xmm10, %xmm7
632; AVX1-NEXT:    vmulsd %xmm11, %xmm8, %xmm10
633; AVX1-NEXT:    vaddsd %xmm7, %xmm10, %xmm7
634; AVX1-NEXT:    vmovddup {{.*#+}} xmm10 = mem[0,0]
635; AVX1-NEXT:    vmulpd %xmm1, %xmm10, %xmm1
636; AVX1-NEXT:    vmovddup {{.*#+}} xmm11 = mem[0,0]
637; AVX1-NEXT:    vmulpd %xmm3, %xmm11, %xmm3
638; AVX1-NEXT:    vaddpd %xmm3, %xmm1, %xmm1
639; AVX1-NEXT:    vmovddup {{.*#+}} xmm3 = mem[0,0]
640; AVX1-NEXT:    vmulpd %xmm3, %xmm6, %xmm6
641; AVX1-NEXT:    vaddpd %xmm6, %xmm1, %xmm1
642; AVX1-NEXT:    vmulsd %xmm2, %xmm10, %xmm2
643; AVX1-NEXT:    vmulsd %xmm5, %xmm11, %xmm5
644; AVX1-NEXT:    vaddsd %xmm5, %xmm2, %xmm2
645; AVX1-NEXT:    vmulsd %xmm3, %xmm8, %xmm3
646; AVX1-NEXT:    vaddsd %xmm3, %xmm2, %xmm2
647; AVX1-NEXT:    vinsertf128 $1, %xmm9, %ymm0, %ymm0
648; AVX1-NEXT:    vshufpd {{.*#+}} ymm0 = ymm4[0],ymm0[1],ymm4[2],ymm0[2]
649; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm7, %ymm3
650; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm9, %ymm1
651; AVX1-NEXT:    vshufpd {{.*#+}} ymm1 = ymm1[1],ymm3[0],ymm1[2],ymm3[3]
652; AVX1-NEXT:    vmovsd %xmm2, 64(%rdi)
653; AVX1-NEXT:    vmovapd %ymm1, 32(%rdi)
654; AVX1-NEXT:    vmovapd %ymm0, (%rdi)
655; AVX1-NEXT:    vzeroupper
656; AVX1-NEXT:    retq
657;
658; AVX2-LABEL: test_mul3x3_f64:
659; AVX2:       # %bb.0: # %entry
660; AVX2-NEXT:    movq %rdi, %rax
661; AVX2-NEXT:    vmovsd {{.*#+}} xmm8 = mem[0],zero
662; AVX2-NEXT:    vunpcklpd {{.*#+}} xmm1 = xmm0[0],xmm1[0]
663; AVX2-NEXT:    vmovddup {{.*#+}} xmm9 = mem[0,0]
664; AVX2-NEXT:    vmulpd %xmm1, %xmm9, %xmm0
665; AVX2-NEXT:    vunpcklpd {{.*#+}} xmm3 = xmm3[0],xmm4[0]
666; AVX2-NEXT:    vmovddup {{.*#+}} xmm4 = mem[0,0]
667; AVX2-NEXT:    vmulpd %xmm4, %xmm3, %xmm10
668; AVX2-NEXT:    vaddpd %xmm0, %xmm10, %xmm0
669; AVX2-NEXT:    vunpcklpd {{.*#+}} xmm6 = xmm6[0],xmm7[0]
670; AVX2-NEXT:    vmovddup {{.*#+}} xmm7 = mem[0,0]
671; AVX2-NEXT:    vmulpd %xmm7, %xmm6, %xmm10
672; AVX2-NEXT:    vaddpd %xmm0, %xmm10, %xmm0
673; AVX2-NEXT:    vmulsd %xmm2, %xmm9, %xmm9
674; AVX2-NEXT:    vmulsd %xmm4, %xmm5, %xmm4
675; AVX2-NEXT:    vaddsd %xmm4, %xmm9, %xmm4
676; AVX2-NEXT:    vmulsd %xmm7, %xmm8, %xmm7
677; AVX2-NEXT:    vaddsd %xmm7, %xmm4, %xmm4
678; AVX2-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm4
679; AVX2-NEXT:    vmovddup {{.*#+}} xmm7 = mem[0,0]
680; AVX2-NEXT:    vmulpd %xmm7, %xmm1, %xmm9
681; AVX2-NEXT:    vmovddup {{.*#+}} xmm10 = mem[0,0]
682; AVX2-NEXT:    vmulpd %xmm3, %xmm10, %xmm11
683; AVX2-NEXT:    vaddpd %xmm11, %xmm9, %xmm9
684; AVX2-NEXT:    vmovddup {{.*#+}} xmm11 = mem[0,0]
685; AVX2-NEXT:    vmulpd %xmm6, %xmm11, %xmm12
686; AVX2-NEXT:    vaddpd %xmm12, %xmm9, %xmm9
687; AVX2-NEXT:    vmulsd %xmm7, %xmm2, %xmm7
688; AVX2-NEXT:    vmulsd %xmm5, %xmm10, %xmm10
689; AVX2-NEXT:    vaddsd %xmm7, %xmm10, %xmm7
690; AVX2-NEXT:    vmulsd %xmm11, %xmm8, %xmm10
691; AVX2-NEXT:    vaddsd %xmm7, %xmm10, %xmm7
692; AVX2-NEXT:    vmovddup {{.*#+}} xmm10 = mem[0,0]
693; AVX2-NEXT:    vmulpd %xmm1, %xmm10, %xmm1
694; AVX2-NEXT:    vmovddup {{.*#+}} xmm11 = mem[0,0]
695; AVX2-NEXT:    vmulpd %xmm3, %xmm11, %xmm3
696; AVX2-NEXT:    vaddpd %xmm3, %xmm1, %xmm1
697; AVX2-NEXT:    vmovddup {{.*#+}} xmm3 = mem[0,0]
698; AVX2-NEXT:    vmulpd %xmm3, %xmm6, %xmm6
699; AVX2-NEXT:    vaddpd %xmm6, %xmm1, %xmm1
700; AVX2-NEXT:    vmulsd %xmm2, %xmm10, %xmm2
701; AVX2-NEXT:    vmulsd %xmm5, %xmm11, %xmm5
702; AVX2-NEXT:    vaddsd %xmm5, %xmm2, %xmm2
703; AVX2-NEXT:    vmulsd %xmm3, %xmm8, %xmm3
704; AVX2-NEXT:    vaddsd %xmm3, %xmm2, %xmm2
705; AVX2-NEXT:    vinsertf128 $1, %xmm9, %ymm0, %ymm0
706; AVX2-NEXT:    vshufpd {{.*#+}} ymm0 = ymm4[0],ymm0[1],ymm4[2],ymm0[2]
707; AVX2-NEXT:    vinsertf128 $1, %xmm1, %ymm7, %ymm3
708; AVX2-NEXT:    vinsertf128 $1, %xmm1, %ymm9, %ymm1
709; AVX2-NEXT:    vshufpd {{.*#+}} ymm1 = ymm1[1],ymm3[0],ymm1[2],ymm3[3]
710; AVX2-NEXT:    vmovsd %xmm2, 64(%rdi)
711; AVX2-NEXT:    vmovapd %ymm1, 32(%rdi)
712; AVX2-NEXT:    vmovapd %ymm0, (%rdi)
713; AVX2-NEXT:    vzeroupper
714; AVX2-NEXT:    retq
715;
716; AVX512F-LABEL: test_mul3x3_f64:
717; AVX512F:       # %bb.0: # %entry
718; AVX512F-NEXT:    movq %rdi, %rax
719; AVX512F-NEXT:    vmovsd {{.*#+}} xmm8 = mem[0],zero
720; AVX512F-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
721; AVX512F-NEXT:    vmovddup {{.*#+}} xmm9 = mem[0,0]
722; AVX512F-NEXT:    vmulpd %xmm0, %xmm9, %xmm10
723; AVX512F-NEXT:    vunpcklpd {{.*#+}} xmm1 = xmm3[0],xmm4[0]
724; AVX512F-NEXT:    vmovddup {{.*#+}} xmm3 = mem[0,0]
725; AVX512F-NEXT:    vmulpd %xmm3, %xmm1, %xmm4
726; AVX512F-NEXT:    vaddpd %xmm4, %xmm10, %xmm4
727; AVX512F-NEXT:    vunpcklpd {{.*#+}} xmm6 = xmm6[0],xmm7[0]
728; AVX512F-NEXT:    vmovddup {{.*#+}} xmm7 = mem[0,0]
729; AVX512F-NEXT:    vmulpd %xmm7, %xmm6, %xmm10
730; AVX512F-NEXT:    vaddpd %xmm4, %xmm10, %xmm4
731; AVX512F-NEXT:    vmulsd %xmm2, %xmm9, %xmm9
732; AVX512F-NEXT:    vmulsd %xmm3, %xmm5, %xmm3
733; AVX512F-NEXT:    vaddsd %xmm3, %xmm9, %xmm3
734; AVX512F-NEXT:    vmulsd %xmm7, %xmm8, %xmm7
735; AVX512F-NEXT:    vaddsd %xmm7, %xmm3, %xmm3
736; AVX512F-NEXT:    vinsertf128 $1, %xmm3, %ymm4, %ymm3
737; AVX512F-NEXT:    vmovddup {{.*#+}} xmm4 = mem[0,0]
738; AVX512F-NEXT:    vmulpd %xmm4, %xmm0, %xmm7
739; AVX512F-NEXT:    vmovddup {{.*#+}} xmm9 = mem[0,0]
740; AVX512F-NEXT:    vmulpd %xmm1, %xmm9, %xmm10
741; AVX512F-NEXT:    vaddpd %xmm7, %xmm10, %xmm7
742; AVX512F-NEXT:    vmovddup {{.*#+}} xmm10 = mem[0,0]
743; AVX512F-NEXT:    vmulpd %xmm6, %xmm10, %xmm11
744; AVX512F-NEXT:    vaddpd %xmm7, %xmm11, %xmm7
745; AVX512F-NEXT:    vmulsd %xmm4, %xmm2, %xmm4
746; AVX512F-NEXT:    vmulsd %xmm5, %xmm9, %xmm9
747; AVX512F-NEXT:    vaddsd %xmm4, %xmm9, %xmm4
748; AVX512F-NEXT:    vmulsd %xmm10, %xmm8, %xmm9
749; AVX512F-NEXT:    vaddsd %xmm4, %xmm9, %xmm4
750; AVX512F-NEXT:    vinsertf128 $1, %xmm4, %ymm7, %ymm4
751; AVX512F-NEXT:    vmovddup {{.*#+}} xmm7 = mem[0,0]
752; AVX512F-NEXT:    vmulpd %xmm7, %xmm0, %xmm0
753; AVX512F-NEXT:    vmovddup {{.*#+}} xmm9 = mem[0,0]
754; AVX512F-NEXT:    vmulpd %xmm1, %xmm9, %xmm1
755; AVX512F-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
756; AVX512F-NEXT:    vmovddup {{.*#+}} xmm1 = mem[0,0]
757; AVX512F-NEXT:    vmulpd %xmm1, %xmm6, %xmm6
758; AVX512F-NEXT:    vaddpd %xmm6, %xmm0, %xmm0
759; AVX512F-NEXT:    vmulsd %xmm7, %xmm2, %xmm2
760; AVX512F-NEXT:    vmulsd %xmm5, %xmm9, %xmm5
761; AVX512F-NEXT:    vaddsd %xmm5, %xmm2, %xmm2
762; AVX512F-NEXT:    vmulsd %xmm1, %xmm8, %xmm1
763; AVX512F-NEXT:    vaddsd %xmm1, %xmm2, %xmm1
764; AVX512F-NEXT:    vinsertf64x4 $1, %ymm4, %zmm3, %zmm2
765; AVX512F-NEXT:    vpmovsxbq {{.*#+}} zmm3 = [0,1,2,4,5,6,8,9]
766; AVX512F-NEXT:    vpermi2pd %zmm0, %zmm2, %zmm3
767; AVX512F-NEXT:    vmovsd %xmm1, 64(%rdi)
768; AVX512F-NEXT:    vmovapd %zmm3, (%rdi)
769; AVX512F-NEXT:    vzeroupper
770; AVX512F-NEXT:    retq
771;
772; AVX512VL-LABEL: test_mul3x3_f64:
773; AVX512VL:       # %bb.0: # %entry
774; AVX512VL-NEXT:    movq %rdi, %rax
775; AVX512VL-NEXT:    vmovsd {{.*#+}} xmm8 = mem[0],zero
776; AVX512VL-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
777; AVX512VL-NEXT:    vmovddup {{.*#+}} xmm1 = mem[0,0]
778; AVX512VL-NEXT:    vmulpd %xmm1, %xmm0, %xmm9
779; AVX512VL-NEXT:    vunpcklpd {{.*#+}} xmm3 = xmm3[0],xmm4[0]
780; AVX512VL-NEXT:    vmovddup {{.*#+}} xmm4 = mem[0,0]
781; AVX512VL-NEXT:    vmulpd %xmm4, %xmm3, %xmm10
782; AVX512VL-NEXT:    vaddpd %xmm10, %xmm9, %xmm9
783; AVX512VL-NEXT:    vunpcklpd {{.*#+}} xmm6 = xmm6[0],xmm7[0]
784; AVX512VL-NEXT:    vmovddup {{.*#+}} xmm7 = mem[0,0]
785; AVX512VL-NEXT:    vmulpd %xmm7, %xmm6, %xmm10
786; AVX512VL-NEXT:    vaddpd %xmm10, %xmm9, %xmm9
787; AVX512VL-NEXT:    vmulsd %xmm1, %xmm2, %xmm1
788; AVX512VL-NEXT:    vmulsd %xmm4, %xmm5, %xmm4
789; AVX512VL-NEXT:    vaddsd %xmm4, %xmm1, %xmm1
790; AVX512VL-NEXT:    vmulsd %xmm7, %xmm8, %xmm4
791; AVX512VL-NEXT:    vaddsd %xmm4, %xmm1, %xmm1
792; AVX512VL-NEXT:    vinsertf128 $1, %xmm1, %ymm9, %ymm1
793; AVX512VL-NEXT:    vmovddup {{.*#+}} xmm4 = mem[0,0]
794; AVX512VL-NEXT:    vmulpd %xmm4, %xmm0, %xmm7
795; AVX512VL-NEXT:    vmovddup {{.*#+}} xmm9 = mem[0,0]
796; AVX512VL-NEXT:    vmulpd %xmm3, %xmm9, %xmm10
797; AVX512VL-NEXT:    vaddpd %xmm7, %xmm10, %xmm7
798; AVX512VL-NEXT:    vmovddup {{.*#+}} xmm10 = mem[0,0]
799; AVX512VL-NEXT:    vmulpd %xmm6, %xmm10, %xmm11
800; AVX512VL-NEXT:    vaddpd %xmm7, %xmm11, %xmm7
801; AVX512VL-NEXT:    vmulsd %xmm4, %xmm2, %xmm4
802; AVX512VL-NEXT:    vmulsd %xmm5, %xmm9, %xmm9
803; AVX512VL-NEXT:    vaddsd %xmm4, %xmm9, %xmm4
804; AVX512VL-NEXT:    vmulsd %xmm10, %xmm8, %xmm9
805; AVX512VL-NEXT:    vaddsd %xmm4, %xmm9, %xmm4
806; AVX512VL-NEXT:    vinsertf128 $1, %xmm4, %ymm7, %ymm4
807; AVX512VL-NEXT:    vmovddup {{.*#+}} xmm7 = mem[0,0]
808; AVX512VL-NEXT:    vmulpd %xmm7, %xmm0, %xmm0
809; AVX512VL-NEXT:    vmovddup {{.*#+}} xmm9 = mem[0,0]
810; AVX512VL-NEXT:    vmulpd %xmm3, %xmm9, %xmm3
811; AVX512VL-NEXT:    vaddpd %xmm3, %xmm0, %xmm0
812; AVX512VL-NEXT:    vmovddup {{.*#+}} xmm3 = mem[0,0]
813; AVX512VL-NEXT:    vmulpd %xmm3, %xmm6, %xmm6
814; AVX512VL-NEXT:    vaddpd %xmm6, %xmm0, %xmm0
815; AVX512VL-NEXT:    vmulsd %xmm7, %xmm2, %xmm2
816; AVX512VL-NEXT:    vmulsd %xmm5, %xmm9, %xmm5
817; AVX512VL-NEXT:    vaddsd %xmm5, %xmm2, %xmm2
818; AVX512VL-NEXT:    vmulsd %xmm3, %xmm8, %xmm3
819; AVX512VL-NEXT:    vaddsd %xmm3, %xmm2, %xmm2
820; AVX512VL-NEXT:    vinsertf64x4 $1, %ymm4, %zmm1, %zmm1
821; AVX512VL-NEXT:    vpmovsxbq {{.*#+}} zmm3 = [0,1,2,4,5,6,8,9]
822; AVX512VL-NEXT:    vpermi2pd %zmm0, %zmm1, %zmm3
823; AVX512VL-NEXT:    vmovsd %xmm2, 64(%rdi)
824; AVX512VL-NEXT:    vmovapd %zmm3, (%rdi)
825; AVX512VL-NEXT:    vzeroupper
826; AVX512VL-NEXT:    retq
827entry:
828  %block = shufflevector <9 x double> %a0, <9 x double> poison, <2 x i32> <i32 0, i32 1>
829  %splat.splat = shufflevector <9 x double> %a1, <9 x double> undef, <2 x i32> zeroinitializer
830  %0 = fmul <2 x double> %block, %splat.splat
831  %block6 = shufflevector <9 x double> %a0, <9 x double> poison, <2 x i32> <i32 3, i32 4>
832  %splat.splat8 = shufflevector <9 x double> %a1, <9 x double> undef, <2 x i32> <i32 1, i32 1>
833  %1 = fmul <2 x double> %block6, %splat.splat8
834  %2 = fadd <2 x double> %0, %1
835  %block9 = shufflevector <9 x double> %a0, <9 x double> poison, <2 x i32> <i32 6, i32 7>
836  %splat.splat11 = shufflevector <9 x double> %a1, <9 x double> undef, <2 x i32> <i32 2, i32 2>
837  %3 = fmul <2 x double> %block9, %splat.splat11
838  %4 = fadd <2 x double> %2, %3
839  %5 = shufflevector <2 x double> %4, <2 x double> poison, <3 x i32> <i32 0, i32 1, i32 undef>
840  %block12 = shufflevector <9 x double> %a0, <9 x double> poison, <1 x i32> <i32 2>
841  %splat.splatinsert13 = shufflevector <9 x double> %a1, <9 x double> undef, <1 x i32> zeroinitializer
842  %6 = fmul <1 x double> %block12, %splat.splatinsert13
843  %block15 = shufflevector <9 x double> %a0, <9 x double> poison, <1 x i32> <i32 5>
844  %splat.splatinsert16 = shufflevector <9 x double> %a1, <9 x double> undef, <1 x i32> <i32 1>
845  %7 = fmul <1 x double> %block15, %splat.splatinsert16
846  %8 = fadd <1 x double> %6, %7
847  %block18 = shufflevector <9 x double> %a0, <9 x double> poison, <1 x i32> <i32 8>
848  %splat.splatinsert19 = shufflevector <9 x double> %a1, <9 x double> undef, <1 x i32> <i32 2>
849  %9 = fmul <1 x double> %block18, %splat.splatinsert19
850  %10 = fadd <1 x double> %8, %9
851  %11 = shufflevector <1 x double> %10, <1 x double> poison, <3 x i32> <i32 0, i32 undef, i32 undef>
852  %12 = shufflevector <3 x double> %5, <3 x double> %11, <3 x i32> <i32 0, i32 1, i32 3>
853  %splat.splat23 = shufflevector <9 x double> %a1, <9 x double> undef, <2 x i32> <i32 3, i32 3>
854  %13 = fmul <2 x double> %block, %splat.splat23
855  %splat.splat26 = shufflevector <9 x double> %a1, <9 x double> undef, <2 x i32> <i32 4, i32 4>
856  %14 = fmul <2 x double> %block6, %splat.splat26
857  %15 = fadd <2 x double> %13, %14
858  %splat.splat29 = shufflevector <9 x double> %a1, <9 x double> undef, <2 x i32> <i32 5, i32 5>
859  %16 = fmul <2 x double> %block9, %splat.splat29
860  %17 = fadd <2 x double> %15, %16
861  %18 = shufflevector <2 x double> %17, <2 x double> poison, <3 x i32> <i32 0, i32 1, i32 undef>
862  %splat.splatinsert31 = shufflevector <9 x double> %a1, <9 x double> undef, <1 x i32> <i32 3>
863  %19 = fmul <1 x double> %block12, %splat.splatinsert31
864  %splat.splatinsert34 = shufflevector <9 x double> %a1, <9 x double> undef, <1 x i32> <i32 4>
865  %20 = fmul <1 x double> %block15, %splat.splatinsert34
866  %21 = fadd <1 x double> %19, %20
867  %splat.splatinsert37 = shufflevector <9 x double> %a1, <9 x double> undef, <1 x i32> <i32 5>
868  %22 = fmul <1 x double> %block18, %splat.splatinsert37
869  %23 = fadd <1 x double> %21, %22
870  %24 = shufflevector <1 x double> %23, <1 x double> poison, <3 x i32> <i32 0, i32 undef, i32 undef>
871  %25 = shufflevector <3 x double> %18, <3 x double> %24, <3 x i32> <i32 0, i32 1, i32 3>
872  %splat.splat41 = shufflevector <9 x double> %a1, <9 x double> undef, <2 x i32> <i32 6, i32 6>
873  %26 = fmul <2 x double> %block, %splat.splat41
874  %splat.splat44 = shufflevector <9 x double> %a1, <9 x double> undef, <2 x i32> <i32 7, i32 7>
875  %27 = fmul <2 x double> %block6, %splat.splat44
876  %28 = fadd <2 x double> %26, %27
877  %splat.splat47 = shufflevector <9 x double> %a1, <9 x double> undef, <2 x i32> <i32 8, i32 8>
878  %29 = fmul <2 x double> %block9, %splat.splat47
879  %30 = fadd <2 x double> %28, %29
880  %31 = shufflevector <2 x double> %30, <2 x double> poison, <3 x i32> <i32 0, i32 1, i32 undef>
881  %splat.splatinsert49 = shufflevector <9 x double> %a1, <9 x double> undef, <1 x i32> <i32 6>
882  %32 = fmul <1 x double> %block12, %splat.splatinsert49
883  %splat.splatinsert52 = shufflevector <9 x double> %a1, <9 x double> undef, <1 x i32> <i32 7>
884  %33 = fmul <1 x double> %block15, %splat.splatinsert52
885  %34 = fadd <1 x double> %32, %33
886  %35 = fmul <9 x double> %a0, %a1
887  %36 = shufflevector <9 x double> %35, <9 x double> poison, <1 x i32> <i32 8>
888  %37 = fadd <1 x double> %34, %36
889  %38 = shufflevector <1 x double> %37, <1 x double> poison, <3 x i32> <i32 0, i32 undef, i32 undef>
890  %39 = shufflevector <3 x double> %31, <3 x double> %38, <3 x i32> <i32 0, i32 1, i32 3>
891  %40 = shufflevector <3 x double> %12, <3 x double> %25, <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5>
892  %41 = shufflevector <3 x double> %39, <3 x double> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 undef, i32 undef, i32 undef>
893  %42 = shufflevector <6 x double> %40, <6 x double> %41, <9 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
894  ret <9 x double> %42
895}
896
897define <16 x float> @test_mul4x4_f32(<16 x float> %a0, <16 x float> %a1) nounwind {
898; SSE-LABEL: test_mul4x4_f32:
899; SSE:       # %bb.0: # %entry
900; SSE-NEXT:    movaps %xmm0, %xmm9
901; SSE-NEXT:    movaps %xmm4, %xmm0
902; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0],xmm4[0,0]
903; SSE-NEXT:    mulps %xmm9, %xmm0
904; SSE-NEXT:    movaps %xmm4, %xmm8
905; SSE-NEXT:    shufps {{.*#+}} xmm8 = xmm8[1,1],xmm4[1,1]
906; SSE-NEXT:    mulps %xmm1, %xmm8
907; SSE-NEXT:    addps %xmm0, %xmm8
908; SSE-NEXT:    movaps %xmm4, %xmm0
909; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,2],xmm4[2,2]
910; SSE-NEXT:    mulps %xmm2, %xmm0
911; SSE-NEXT:    addps %xmm8, %xmm0
912; SSE-NEXT:    shufps {{.*#+}} xmm4 = xmm4[3,3,3,3]
913; SSE-NEXT:    mulps %xmm3, %xmm4
914; SSE-NEXT:    addps %xmm4, %xmm0
915; SSE-NEXT:    movaps %xmm5, %xmm4
916; SSE-NEXT:    shufps {{.*#+}} xmm4 = xmm4[0,0],xmm5[0,0]
917; SSE-NEXT:    mulps %xmm9, %xmm4
918; SSE-NEXT:    movaps %xmm5, %xmm10
919; SSE-NEXT:    shufps {{.*#+}} xmm10 = xmm10[1,1],xmm5[1,1]
920; SSE-NEXT:    mulps %xmm1, %xmm10
921; SSE-NEXT:    addps %xmm4, %xmm10
922; SSE-NEXT:    movaps %xmm5, %xmm8
923; SSE-NEXT:    shufps {{.*#+}} xmm8 = xmm8[2,2],xmm5[2,2]
924; SSE-NEXT:    mulps %xmm2, %xmm8
925; SSE-NEXT:    addps %xmm10, %xmm8
926; SSE-NEXT:    shufps {{.*#+}} xmm5 = xmm5[3,3,3,3]
927; SSE-NEXT:    mulps %xmm3, %xmm5
928; SSE-NEXT:    addps %xmm5, %xmm8
929; SSE-NEXT:    movaps %xmm6, %xmm4
930; SSE-NEXT:    shufps {{.*#+}} xmm4 = xmm4[0,0],xmm6[0,0]
931; SSE-NEXT:    mulps %xmm9, %xmm4
932; SSE-NEXT:    movaps %xmm6, %xmm10
933; SSE-NEXT:    shufps {{.*#+}} xmm10 = xmm10[1,1],xmm6[1,1]
934; SSE-NEXT:    mulps %xmm1, %xmm10
935; SSE-NEXT:    addps %xmm4, %xmm10
936; SSE-NEXT:    movaps %xmm6, %xmm5
937; SSE-NEXT:    shufps {{.*#+}} xmm5 = xmm5[2,2],xmm6[2,2]
938; SSE-NEXT:    mulps %xmm2, %xmm5
939; SSE-NEXT:    addps %xmm10, %xmm5
940; SSE-NEXT:    shufps {{.*#+}} xmm6 = xmm6[3,3,3,3]
941; SSE-NEXT:    mulps %xmm3, %xmm6
942; SSE-NEXT:    addps %xmm6, %xmm5
943; SSE-NEXT:    movaps %xmm7, %xmm4
944; SSE-NEXT:    shufps {{.*#+}} xmm4 = xmm4[0,0],xmm7[0,0]
945; SSE-NEXT:    mulps %xmm9, %xmm4
946; SSE-NEXT:    movaps %xmm7, %xmm6
947; SSE-NEXT:    shufps {{.*#+}} xmm6 = xmm6[1,1],xmm7[1,1]
948; SSE-NEXT:    mulps %xmm1, %xmm6
949; SSE-NEXT:    addps %xmm4, %xmm6
950; SSE-NEXT:    movaps %xmm7, %xmm1
951; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,2],xmm7[2,2]
952; SSE-NEXT:    mulps %xmm2, %xmm1
953; SSE-NEXT:    addps %xmm6, %xmm1
954; SSE-NEXT:    shufps {{.*#+}} xmm7 = xmm7[3,3,3,3]
955; SSE-NEXT:    mulps %xmm7, %xmm3
956; SSE-NEXT:    addps %xmm1, %xmm3
957; SSE-NEXT:    movaps %xmm8, %xmm1
958; SSE-NEXT:    movaps %xmm5, %xmm2
959; SSE-NEXT:    retq
960;
961; AVX1-LABEL: test_mul4x4_f32:
962; AVX1:       # %bb.0: # %entry
963; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm5
964; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
965; AVX1-NEXT:    vshufps {{.*#+}} xmm6 = xmm2[0,0,0,0]
966; AVX1-NEXT:    vmulps %xmm6, %xmm0, %xmm6
967; AVX1-NEXT:    vshufps {{.*#+}} xmm7 = xmm2[1,1,1,1]
968; AVX1-NEXT:    vmulps %xmm7, %xmm5, %xmm7
969; AVX1-NEXT:    vaddps %xmm7, %xmm6, %xmm6
970; AVX1-NEXT:    vshufps {{.*#+}} xmm7 = xmm2[2,2,2,2]
971; AVX1-NEXT:    vmulps %xmm7, %xmm1, %xmm7
972; AVX1-NEXT:    vaddps %xmm7, %xmm6, %xmm6
973; AVX1-NEXT:    vshufps {{.*#+}} xmm7 = xmm2[3,3,3,3]
974; AVX1-NEXT:    vmulps %xmm7, %xmm4, %xmm7
975; AVX1-NEXT:    vaddps %xmm7, %xmm6, %xmm6
976; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm2
977; AVX1-NEXT:    vshufps {{.*#+}} xmm7 = xmm2[0,0,0,0]
978; AVX1-NEXT:    vmulps %xmm7, %xmm0, %xmm7
979; AVX1-NEXT:    vshufps {{.*#+}} xmm8 = xmm2[1,1,1,1]
980; AVX1-NEXT:    vmulps %xmm5, %xmm8, %xmm8
981; AVX1-NEXT:    vaddps %xmm7, %xmm8, %xmm7
982; AVX1-NEXT:    vshufps {{.*#+}} xmm8 = xmm2[2,2,2,2]
983; AVX1-NEXT:    vmulps %xmm1, %xmm8, %xmm8
984; AVX1-NEXT:    vaddps %xmm7, %xmm8, %xmm7
985; AVX1-NEXT:    vshufps {{.*#+}} xmm2 = xmm2[3,3,3,3]
986; AVX1-NEXT:    vmulps %xmm2, %xmm4, %xmm2
987; AVX1-NEXT:    vaddps %xmm2, %xmm7, %xmm2
988; AVX1-NEXT:    vshufps {{.*#+}} xmm7 = xmm3[0,0,0,0]
989; AVX1-NEXT:    vmulps %xmm7, %xmm0, %xmm7
990; AVX1-NEXT:    vshufps {{.*#+}} xmm8 = xmm3[1,1,1,1]
991; AVX1-NEXT:    vmulps %xmm5, %xmm8, %xmm8
992; AVX1-NEXT:    vaddps %xmm7, %xmm8, %xmm7
993; AVX1-NEXT:    vshufps {{.*#+}} xmm8 = xmm3[2,2,2,2]
994; AVX1-NEXT:    vmulps %xmm1, %xmm8, %xmm8
995; AVX1-NEXT:    vaddps %xmm7, %xmm8, %xmm7
996; AVX1-NEXT:    vshufps {{.*#+}} xmm8 = xmm3[3,3,3,3]
997; AVX1-NEXT:    vmulps %xmm4, %xmm8, %xmm8
998; AVX1-NEXT:    vaddps %xmm7, %xmm8, %xmm7
999; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm3
1000; AVX1-NEXT:    vshufps {{.*#+}} xmm8 = xmm3[0,0,0,0]
1001; AVX1-NEXT:    vmulps %xmm0, %xmm8, %xmm0
1002; AVX1-NEXT:    vshufps {{.*#+}} xmm8 = xmm3[1,1,1,1]
1003; AVX1-NEXT:    vmulps %xmm5, %xmm8, %xmm5
1004; AVX1-NEXT:    vaddps %xmm5, %xmm0, %xmm0
1005; AVX1-NEXT:    vshufps {{.*#+}} xmm5 = xmm3[2,2,2,2]
1006; AVX1-NEXT:    vmulps %xmm5, %xmm1, %xmm1
1007; AVX1-NEXT:    vaddps %xmm1, %xmm0, %xmm0
1008; AVX1-NEXT:    vshufps {{.*#+}} xmm1 = xmm3[3,3,3,3]
1009; AVX1-NEXT:    vmulps %xmm1, %xmm4, %xmm1
1010; AVX1-NEXT:    vaddps %xmm1, %xmm0, %xmm1
1011; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm6, %ymm0
1012; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm7, %ymm1
1013; AVX1-NEXT:    retq
1014;
1015; AVX2-LABEL: test_mul4x4_f32:
1016; AVX2:       # %bb.0: # %entry
1017; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm5
1018; AVX2-NEXT:    vextractf128 $1, %ymm1, %xmm4
1019; AVX2-NEXT:    vbroadcastss %xmm2, %xmm6
1020; AVX2-NEXT:    vmulps %xmm6, %xmm0, %xmm6
1021; AVX2-NEXT:    vshufps {{.*#+}} xmm7 = xmm2[1,1,1,1]
1022; AVX2-NEXT:    vmulps %xmm7, %xmm5, %xmm7
1023; AVX2-NEXT:    vaddps %xmm7, %xmm6, %xmm6
1024; AVX2-NEXT:    vshufps {{.*#+}} xmm7 = xmm2[2,2,2,2]
1025; AVX2-NEXT:    vmulps %xmm7, %xmm1, %xmm7
1026; AVX2-NEXT:    vaddps %xmm7, %xmm6, %xmm6
1027; AVX2-NEXT:    vshufps {{.*#+}} xmm7 = xmm2[3,3,3,3]
1028; AVX2-NEXT:    vmulps %xmm7, %xmm4, %xmm7
1029; AVX2-NEXT:    vaddps %xmm7, %xmm6, %xmm6
1030; AVX2-NEXT:    vextractf128 $1, %ymm2, %xmm2
1031; AVX2-NEXT:    vbroadcastss %xmm2, %xmm7
1032; AVX2-NEXT:    vmulps %xmm7, %xmm0, %xmm7
1033; AVX2-NEXT:    vshufps {{.*#+}} xmm8 = xmm2[1,1,1,1]
1034; AVX2-NEXT:    vmulps %xmm5, %xmm8, %xmm8
1035; AVX2-NEXT:    vaddps %xmm7, %xmm8, %xmm7
1036; AVX2-NEXT:    vshufps {{.*#+}} xmm8 = xmm2[2,2,2,2]
1037; AVX2-NEXT:    vmulps %xmm1, %xmm8, %xmm8
1038; AVX2-NEXT:    vaddps %xmm7, %xmm8, %xmm7
1039; AVX2-NEXT:    vshufps {{.*#+}} xmm2 = xmm2[3,3,3,3]
1040; AVX2-NEXT:    vmulps %xmm2, %xmm4, %xmm2
1041; AVX2-NEXT:    vaddps %xmm2, %xmm7, %xmm2
1042; AVX2-NEXT:    vbroadcastss %xmm3, %xmm7
1043; AVX2-NEXT:    vmulps %xmm7, %xmm0, %xmm7
1044; AVX2-NEXT:    vshufps {{.*#+}} xmm8 = xmm3[1,1,1,1]
1045; AVX2-NEXT:    vmulps %xmm5, %xmm8, %xmm8
1046; AVX2-NEXT:    vaddps %xmm7, %xmm8, %xmm7
1047; AVX2-NEXT:    vshufps {{.*#+}} xmm8 = xmm3[2,2,2,2]
1048; AVX2-NEXT:    vmulps %xmm1, %xmm8, %xmm8
1049; AVX2-NEXT:    vaddps %xmm7, %xmm8, %xmm7
1050; AVX2-NEXT:    vshufps {{.*#+}} xmm8 = xmm3[3,3,3,3]
1051; AVX2-NEXT:    vmulps %xmm4, %xmm8, %xmm8
1052; AVX2-NEXT:    vaddps %xmm7, %xmm8, %xmm7
1053; AVX2-NEXT:    vextractf128 $1, %ymm3, %xmm3
1054; AVX2-NEXT:    vbroadcastss %xmm3, %xmm8
1055; AVX2-NEXT:    vmulps %xmm0, %xmm8, %xmm0
1056; AVX2-NEXT:    vshufps {{.*#+}} xmm8 = xmm3[1,1,1,1]
1057; AVX2-NEXT:    vmulps %xmm5, %xmm8, %xmm5
1058; AVX2-NEXT:    vaddps %xmm5, %xmm0, %xmm0
1059; AVX2-NEXT:    vshufps {{.*#+}} xmm5 = xmm3[2,2,2,2]
1060; AVX2-NEXT:    vmulps %xmm5, %xmm1, %xmm1
1061; AVX2-NEXT:    vaddps %xmm1, %xmm0, %xmm0
1062; AVX2-NEXT:    vshufps {{.*#+}} xmm1 = xmm3[3,3,3,3]
1063; AVX2-NEXT:    vmulps %xmm1, %xmm4, %xmm1
1064; AVX2-NEXT:    vaddps %xmm1, %xmm0, %xmm1
1065; AVX2-NEXT:    vinsertf128 $1, %xmm2, %ymm6, %ymm0
1066; AVX2-NEXT:    vinsertf128 $1, %xmm1, %ymm7, %ymm1
1067; AVX2-NEXT:    retq
1068;
1069; AVX512F-LABEL: test_mul4x4_f32:
1070; AVX512F:       # %bb.0: # %entry
1071; AVX512F-NEXT:    vextractf128 $1, %ymm0, %xmm4
1072; AVX512F-NEXT:    vextractf32x4 $2, %zmm0, %xmm3
1073; AVX512F-NEXT:    vextractf32x4 $3, %zmm0, %xmm2
1074; AVX512F-NEXT:    vbroadcastss %xmm1, %xmm5
1075; AVX512F-NEXT:    vmulps %xmm5, %xmm0, %xmm5
1076; AVX512F-NEXT:    vshufps {{.*#+}} xmm6 = xmm1[1,1,1,1]
1077; AVX512F-NEXT:    vmulps %xmm6, %xmm4, %xmm6
1078; AVX512F-NEXT:    vaddps %xmm6, %xmm5, %xmm5
1079; AVX512F-NEXT:    vshufps {{.*#+}} xmm6 = xmm1[2,2,2,2]
1080; AVX512F-NEXT:    vmulps %xmm6, %xmm3, %xmm6
1081; AVX512F-NEXT:    vaddps %xmm6, %xmm5, %xmm5
1082; AVX512F-NEXT:    vshufps {{.*#+}} xmm6 = xmm1[3,3,3,3]
1083; AVX512F-NEXT:    vmulps %xmm6, %xmm2, %xmm6
1084; AVX512F-NEXT:    vaddps %xmm6, %xmm5, %xmm5
1085; AVX512F-NEXT:    vextractf128 $1, %ymm1, %xmm6
1086; AVX512F-NEXT:    vbroadcastss %xmm6, %xmm7
1087; AVX512F-NEXT:    vmulps %xmm7, %xmm0, %xmm7
1088; AVX512F-NEXT:    vshufps {{.*#+}} xmm8 = xmm6[1,1,1,1]
1089; AVX512F-NEXT:    vmulps %xmm4, %xmm8, %xmm8
1090; AVX512F-NEXT:    vaddps %xmm7, %xmm8, %xmm7
1091; AVX512F-NEXT:    vshufps {{.*#+}} xmm8 = xmm6[2,2,2,2]
1092; AVX512F-NEXT:    vmulps %xmm3, %xmm8, %xmm8
1093; AVX512F-NEXT:    vaddps %xmm7, %xmm8, %xmm7
1094; AVX512F-NEXT:    vshufps {{.*#+}} xmm6 = xmm6[3,3,3,3]
1095; AVX512F-NEXT:    vmulps %xmm6, %xmm2, %xmm6
1096; AVX512F-NEXT:    vaddps %xmm6, %xmm7, %xmm6
1097; AVX512F-NEXT:    vextractf32x4 $2, %zmm1, %xmm7
1098; AVX512F-NEXT:    vbroadcastss %xmm7, %xmm8
1099; AVX512F-NEXT:    vmulps %xmm0, %xmm8, %xmm8
1100; AVX512F-NEXT:    vshufps {{.*#+}} xmm9 = xmm7[1,1,1,1]
1101; AVX512F-NEXT:    vmulps %xmm4, %xmm9, %xmm9
1102; AVX512F-NEXT:    vaddps %xmm9, %xmm8, %xmm8
1103; AVX512F-NEXT:    vshufps {{.*#+}} xmm9 = xmm7[2,2,2,2]
1104; AVX512F-NEXT:    vmulps %xmm3, %xmm9, %xmm9
1105; AVX512F-NEXT:    vaddps %xmm9, %xmm8, %xmm8
1106; AVX512F-NEXT:    vshufps {{.*#+}} xmm7 = xmm7[3,3,3,3]
1107; AVX512F-NEXT:    vmulps %xmm7, %xmm2, %xmm7
1108; AVX512F-NEXT:    vaddps %xmm7, %xmm8, %xmm7
1109; AVX512F-NEXT:    vextractf32x4 $3, %zmm1, %xmm1
1110; AVX512F-NEXT:    vbroadcastss %xmm1, %xmm8
1111; AVX512F-NEXT:    vmulps %xmm0, %xmm8, %xmm0
1112; AVX512F-NEXT:    vshufps {{.*#+}} xmm8 = xmm1[1,1,1,1]
1113; AVX512F-NEXT:    vmulps %xmm4, %xmm8, %xmm4
1114; AVX512F-NEXT:    vaddps %xmm4, %xmm0, %xmm0
1115; AVX512F-NEXT:    vshufps {{.*#+}} xmm4 = xmm1[2,2,2,2]
1116; AVX512F-NEXT:    vmulps %xmm4, %xmm3, %xmm3
1117; AVX512F-NEXT:    vaddps %xmm3, %xmm0, %xmm0
1118; AVX512F-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
1119; AVX512F-NEXT:    vmulps %xmm1, %xmm2, %xmm1
1120; AVX512F-NEXT:    vaddps %xmm1, %xmm0, %xmm0
1121; AVX512F-NEXT:    vinsertf128 $1, %xmm0, %ymm7, %ymm0
1122; AVX512F-NEXT:    vinsertf128 $1, %xmm6, %ymm5, %ymm1
1123; AVX512F-NEXT:    vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
1124; AVX512F-NEXT:    retq
1125;
1126; AVX512VL-LABEL: test_mul4x4_f32:
1127; AVX512VL:       # %bb.0: # %entry
1128; AVX512VL-NEXT:    vextractf128 $1, %ymm0, %xmm2
1129; AVX512VL-NEXT:    vextractf32x4 $2, %zmm0, %xmm3
1130; AVX512VL-NEXT:    vextractf32x4 $3, %zmm0, %xmm4
1131; AVX512VL-NEXT:    vbroadcastss %xmm1, %xmm5
1132; AVX512VL-NEXT:    vmulps %xmm5, %xmm0, %xmm5
1133; AVX512VL-NEXT:    vshufps {{.*#+}} xmm6 = xmm1[1,1,1,1]
1134; AVX512VL-NEXT:    vmulps %xmm6, %xmm2, %xmm6
1135; AVX512VL-NEXT:    vaddps %xmm6, %xmm5, %xmm5
1136; AVX512VL-NEXT:    vshufps {{.*#+}} xmm6 = xmm1[2,2,2,2]
1137; AVX512VL-NEXT:    vmulps %xmm6, %xmm3, %xmm6
1138; AVX512VL-NEXT:    vaddps %xmm6, %xmm5, %xmm5
1139; AVX512VL-NEXT:    vshufps {{.*#+}} xmm6 = xmm1[3,3,3,3]
1140; AVX512VL-NEXT:    vmulps %xmm6, %xmm4, %xmm6
1141; AVX512VL-NEXT:    vaddps %xmm6, %xmm5, %xmm5
1142; AVX512VL-NEXT:    vextractf128 $1, %ymm1, %xmm6
1143; AVX512VL-NEXT:    vbroadcastss %xmm6, %xmm7
1144; AVX512VL-NEXT:    vmulps %xmm7, %xmm0, %xmm7
1145; AVX512VL-NEXT:    vshufps {{.*#+}} xmm8 = xmm6[1,1,1,1]
1146; AVX512VL-NEXT:    vmulps %xmm2, %xmm8, %xmm8
1147; AVX512VL-NEXT:    vaddps %xmm7, %xmm8, %xmm7
1148; AVX512VL-NEXT:    vshufps {{.*#+}} xmm8 = xmm6[2,2,2,2]
1149; AVX512VL-NEXT:    vmulps %xmm3, %xmm8, %xmm8
1150; AVX512VL-NEXT:    vaddps %xmm7, %xmm8, %xmm7
1151; AVX512VL-NEXT:    vshufps {{.*#+}} xmm6 = xmm6[3,3,3,3]
1152; AVX512VL-NEXT:    vmulps %xmm6, %xmm4, %xmm6
1153; AVX512VL-NEXT:    vaddps %xmm6, %xmm7, %xmm6
1154; AVX512VL-NEXT:    vextractf32x4 $2, %zmm1, %xmm7
1155; AVX512VL-NEXT:    vbroadcastss %xmm7, %xmm8
1156; AVX512VL-NEXT:    vmulps %xmm0, %xmm8, %xmm8
1157; AVX512VL-NEXT:    vshufps {{.*#+}} xmm9 = xmm7[1,1,1,1]
1158; AVX512VL-NEXT:    vmulps %xmm2, %xmm9, %xmm9
1159; AVX512VL-NEXT:    vaddps %xmm9, %xmm8, %xmm8
1160; AVX512VL-NEXT:    vshufps {{.*#+}} xmm9 = xmm7[2,2,2,2]
1161; AVX512VL-NEXT:    vmulps %xmm3, %xmm9, %xmm9
1162; AVX512VL-NEXT:    vaddps %xmm9, %xmm8, %xmm8
1163; AVX512VL-NEXT:    vshufps {{.*#+}} xmm7 = xmm7[3,3,3,3]
1164; AVX512VL-NEXT:    vmulps %xmm7, %xmm4, %xmm7
1165; AVX512VL-NEXT:    vaddps %xmm7, %xmm8, %xmm7
1166; AVX512VL-NEXT:    vextractf32x4 $3, %zmm1, %xmm1
1167; AVX512VL-NEXT:    vbroadcastss %xmm1, %xmm8
1168; AVX512VL-NEXT:    vmulps %xmm0, %xmm8, %xmm0
1169; AVX512VL-NEXT:    vshufps {{.*#+}} xmm8 = xmm1[1,1,1,1]
1170; AVX512VL-NEXT:    vmulps %xmm2, %xmm8, %xmm2
1171; AVX512VL-NEXT:    vaddps %xmm2, %xmm0, %xmm0
1172; AVX512VL-NEXT:    vshufps {{.*#+}} xmm2 = xmm1[2,2,2,2]
1173; AVX512VL-NEXT:    vmulps %xmm2, %xmm3, %xmm2
1174; AVX512VL-NEXT:    vaddps %xmm2, %xmm0, %xmm0
1175; AVX512VL-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
1176; AVX512VL-NEXT:    vmulps %xmm1, %xmm4, %xmm1
1177; AVX512VL-NEXT:    vaddps %xmm1, %xmm0, %xmm0
1178; AVX512VL-NEXT:    vinsertf128 $1, %xmm0, %ymm7, %ymm0
1179; AVX512VL-NEXT:    vinsertf128 $1, %xmm6, %ymm5, %ymm1
1180; AVX512VL-NEXT:    vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
1181; AVX512VL-NEXT:    retq
1182entry:
1183  %split = shufflevector <16 x float> %a0, <16 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
1184  %split1 = shufflevector <16 x float> %a0, <16 x float> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
1185  %split2 = shufflevector <16 x float> %a0, <16 x float> poison, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
1186  %split3 = shufflevector <16 x float> %a0, <16 x float> poison, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
1187  %splat.splat = shufflevector <16 x float> %a1, <16 x float> undef, <4 x i32> zeroinitializer
1188  %0 = fmul <4 x float> %split, %splat.splat
1189  %splat.splat10 = shufflevector <16 x float> %a1, <16 x float> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
1190  %1 = fmul <4 x float> %split1, %splat.splat10
1191  %2 = fadd <4 x float> %0, %1
1192  %splat.splat13 = shufflevector <16 x float> %a1, <16 x float> undef, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
1193  %3 = fmul <4 x float> %split2, %splat.splat13
1194  %4 = fadd <4 x float> %2, %3
1195  %splat.splat16 = shufflevector <16 x float> %a1, <16 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
1196  %5 = fmul <4 x float> %split3, %splat.splat16
1197  %6 = fadd <4 x float> %4, %5
1198  %splat.splat19 = shufflevector <16 x float> %a1, <16 x float> undef, <4 x i32> <i32 4, i32 4, i32 4, i32 4>
1199  %7 = fmul <4 x float> %split, %splat.splat19
1200  %splat.splat22 = shufflevector <16 x float> %a1, <16 x float> undef, <4 x i32> <i32 5, i32 5, i32 5, i32 5>
1201  %8 = fmul <4 x float> %split1, %splat.splat22
1202  %9 = fadd <4 x float> %7, %8
1203  %splat.splat25 = shufflevector <16 x float> %a1, <16 x float> undef, <4 x i32> <i32 6, i32 6, i32 6, i32 6>
1204  %10 = fmul <4 x float> %split2, %splat.splat25
1205  %11 = fadd <4 x float> %9, %10
1206  %splat.splat28 = shufflevector <16 x float> %a1, <16 x float> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
1207  %12 = fmul <4 x float> %split3, %splat.splat28
1208  %13 = fadd <4 x float> %11, %12
1209  %splat.splat31 = shufflevector <16 x float> %a1, <16 x float> undef, <4 x i32> <i32 8, i32 8, i32 8, i32 8>
1210  %14 = fmul <4 x float> %split, %splat.splat31
1211  %splat.splat34 = shufflevector <16 x float> %a1, <16 x float> undef, <4 x i32> <i32 9, i32 9, i32 9, i32 9>
1212  %15 = fmul <4 x float> %split1, %splat.splat34
1213  %16 = fadd <4 x float> %14, %15
1214  %splat.splat37 = shufflevector <16 x float> %a1, <16 x float> undef, <4 x i32> <i32 10, i32 10, i32 10, i32 10>
1215  %17 = fmul <4 x float> %split2, %splat.splat37
1216  %18 = fadd <4 x float> %16, %17
1217  %splat.splat40 = shufflevector <16 x float> %a1, <16 x float> undef, <4 x i32> <i32 11, i32 11, i32 11, i32 11>
1218  %19 = fmul <4 x float> %split3, %splat.splat40
1219  %20 = fadd <4 x float> %18, %19
1220  %splat.splat43 = shufflevector <16 x float> %a1, <16 x float> undef, <4 x i32> <i32 12, i32 12, i32 12, i32 12>
1221  %21 = fmul <4 x float> %split, %splat.splat43
1222  %splat.splat46 = shufflevector <16 x float> %a1, <16 x float> undef, <4 x i32> <i32 13, i32 13, i32 13, i32 13>
1223  %22 = fmul <4 x float> %split1, %splat.splat46
1224  %23 = fadd <4 x float> %21, %22
1225  %splat.splat49 = shufflevector <16 x float> %a1, <16 x float> undef, <4 x i32> <i32 14, i32 14, i32 14, i32 14>
1226  %24 = fmul <4 x float> %split2, %splat.splat49
1227  %25 = fadd <4 x float> %23, %24
1228  %splat.splat52 = shufflevector <16 x float> %a1, <16 x float> undef, <4 x i32> <i32 15, i32 15, i32 15, i32 15>
1229  %26 = fmul <4 x float> %split3, %splat.splat52
1230  %27 = fadd <4 x float> %25, %26
1231  %28 = shufflevector <4 x float> %6, <4 x float> %13, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
1232  %29 = shufflevector <4 x float> %20, <4 x float> %27, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
1233  %30 = shufflevector <8 x float> %28, <8 x float> %29, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
1234  ret <16 x float> %30
1235}
1236
1237define <16 x double> @test_mul4x4_f64(<16 x double> %a0, <16 x double> %a1) nounwind {
1238; SSE-LABEL: test_mul4x4_f64:
1239; SSE:       # %bb.0: # %entry
1240; SSE-NEXT:    movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1241; SSE-NEXT:    movapd %xmm5, %xmm6
1242; SSE-NEXT:    movapd %xmm4, %xmm5
1243; SSE-NEXT:    movq %rdi, %rax
1244; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm11
1245; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm9
1246; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm12
1247; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm8
1248; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm10
1249; SSE-NEXT:    movapd %xmm10, %xmm13
1250; SSE-NEXT:    unpcklpd {{.*#+}} xmm13 = xmm13[0],xmm10[0]
1251; SSE-NEXT:    movapd %xmm1, %xmm14
1252; SSE-NEXT:    mulpd %xmm13, %xmm14
1253; SSE-NEXT:    mulpd %xmm0, %xmm13
1254; SSE-NEXT:    unpckhpd {{.*#+}} xmm10 = xmm10[1,1]
1255; SSE-NEXT:    movapd %xmm3, %xmm15
1256; SSE-NEXT:    mulpd %xmm10, %xmm15
1257; SSE-NEXT:    addpd %xmm14, %xmm15
1258; SSE-NEXT:    mulpd %xmm2, %xmm10
1259; SSE-NEXT:    movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1260; SSE-NEXT:    addpd %xmm13, %xmm10
1261; SSE-NEXT:    movapd %xmm8, %xmm13
1262; SSE-NEXT:    unpcklpd {{.*#+}} xmm13 = xmm13[0],xmm8[0]
1263; SSE-NEXT:    movapd %xmm4, %xmm14
1264; SSE-NEXT:    mulpd %xmm13, %xmm14
1265; SSE-NEXT:    addpd %xmm10, %xmm14
1266; SSE-NEXT:    movapd %xmm6, %xmm4
1267; SSE-NEXT:    mulpd %xmm6, %xmm13
1268; SSE-NEXT:    addpd %xmm15, %xmm13
1269; SSE-NEXT:    unpckhpd {{.*#+}} xmm8 = xmm8[1,1]
1270; SSE-NEXT:    movapd %xmm7, %xmm10
1271; SSE-NEXT:    mulpd %xmm8, %xmm10
1272; SSE-NEXT:    addpd %xmm13, %xmm10
1273; SSE-NEXT:    movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
1274; SSE-NEXT:    mulpd %xmm6, %xmm8
1275; SSE-NEXT:    addpd %xmm14, %xmm8
1276; SSE-NEXT:    movapd %xmm12, %xmm13
1277; SSE-NEXT:    unpcklpd {{.*#+}} xmm13 = xmm13[0],xmm12[0]
1278; SSE-NEXT:    movapd %xmm1, %xmm14
1279; SSE-NEXT:    mulpd %xmm13, %xmm14
1280; SSE-NEXT:    mulpd %xmm0, %xmm13
1281; SSE-NEXT:    unpckhpd {{.*#+}} xmm12 = xmm12[1,1]
1282; SSE-NEXT:    movapd %xmm3, %xmm15
1283; SSE-NEXT:    mulpd %xmm12, %xmm15
1284; SSE-NEXT:    addpd %xmm14, %xmm15
1285; SSE-NEXT:    mulpd %xmm2, %xmm12
1286; SSE-NEXT:    addpd %xmm13, %xmm12
1287; SSE-NEXT:    movapd %xmm9, %xmm13
1288; SSE-NEXT:    unpcklpd {{.*#+}} xmm13 = xmm13[0],xmm9[0]
1289; SSE-NEXT:    movapd %xmm5, %xmm14
1290; SSE-NEXT:    mulpd %xmm13, %xmm14
1291; SSE-NEXT:    addpd %xmm12, %xmm14
1292; SSE-NEXT:    mulpd %xmm4, %xmm13
1293; SSE-NEXT:    movapd %xmm4, %xmm2
1294; SSE-NEXT:    addpd %xmm15, %xmm13
1295; SSE-NEXT:    unpckhpd {{.*#+}} xmm9 = xmm9[1,1]
1296; SSE-NEXT:    movapd %xmm7, %xmm12
1297; SSE-NEXT:    mulpd %xmm9, %xmm12
1298; SSE-NEXT:    addpd %xmm13, %xmm12
1299; SSE-NEXT:    mulpd %xmm6, %xmm9
1300; SSE-NEXT:    addpd %xmm14, %xmm9
1301; SSE-NEXT:    movapd %xmm11, %xmm14
1302; SSE-NEXT:    unpcklpd {{.*#+}} xmm14 = xmm14[0],xmm11[0]
1303; SSE-NEXT:    movapd %xmm1, %xmm13
1304; SSE-NEXT:    mulpd %xmm14, %xmm13
1305; SSE-NEXT:    unpckhpd {{.*#+}} xmm11 = xmm11[1,1]
1306; SSE-NEXT:    movapd %xmm3, %xmm15
1307; SSE-NEXT:    mulpd %xmm11, %xmm15
1308; SSE-NEXT:    addpd %xmm13, %xmm15
1309; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm13
1310; SSE-NEXT:    mulpd %xmm0, %xmm14
1311; SSE-NEXT:    movapd %xmm0, %xmm6
1312; SSE-NEXT:    movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1313; SSE-NEXT:    mulpd %xmm0, %xmm11
1314; SSE-NEXT:    addpd %xmm14, %xmm11
1315; SSE-NEXT:    movapd %xmm13, %xmm14
1316; SSE-NEXT:    unpcklpd {{.*#+}} xmm14 = xmm14[0],xmm13[0]
1317; SSE-NEXT:    movapd %xmm5, %xmm4
1318; SSE-NEXT:    mulpd %xmm14, %xmm4
1319; SSE-NEXT:    addpd %xmm11, %xmm4
1320; SSE-NEXT:    mulpd %xmm2, %xmm14
1321; SSE-NEXT:    addpd %xmm15, %xmm14
1322; SSE-NEXT:    unpckhpd {{.*#+}} xmm13 = xmm13[1,1]
1323; SSE-NEXT:    movapd %xmm7, %xmm11
1324; SSE-NEXT:    mulpd %xmm13, %xmm11
1325; SSE-NEXT:    addpd %xmm14, %xmm11
1326; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm14
1327; SSE-NEXT:    movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
1328; SSE-NEXT:    mulpd %xmm15, %xmm13
1329; SSE-NEXT:    addpd %xmm4, %xmm13
1330; SSE-NEXT:    movapd %xmm14, %xmm4
1331; SSE-NEXT:    unpcklpd {{.*#+}} xmm4 = xmm4[0],xmm14[0]
1332; SSE-NEXT:    mulpd %xmm4, %xmm1
1333; SSE-NEXT:    mulpd %xmm6, %xmm4
1334; SSE-NEXT:    unpckhpd {{.*#+}} xmm14 = xmm14[1,1]
1335; SSE-NEXT:    mulpd %xmm14, %xmm3
1336; SSE-NEXT:    addpd %xmm1, %xmm3
1337; SSE-NEXT:    mulpd %xmm0, %xmm14
1338; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm0
1339; SSE-NEXT:    addpd %xmm4, %xmm14
1340; SSE-NEXT:    movapd %xmm0, %xmm1
1341; SSE-NEXT:    unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm0[0]
1342; SSE-NEXT:    mulpd %xmm1, %xmm5
1343; SSE-NEXT:    addpd %xmm14, %xmm5
1344; SSE-NEXT:    mulpd %xmm2, %xmm1
1345; SSE-NEXT:    addpd %xmm3, %xmm1
1346; SSE-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1,1]
1347; SSE-NEXT:    mulpd %xmm0, %xmm7
1348; SSE-NEXT:    addpd %xmm1, %xmm7
1349; SSE-NEXT:    mulpd %xmm15, %xmm0
1350; SSE-NEXT:    addpd %xmm5, %xmm0
1351; SSE-NEXT:    movapd %xmm7, 112(%rdi)
1352; SSE-NEXT:    movapd %xmm0, 96(%rdi)
1353; SSE-NEXT:    movapd %xmm11, 80(%rdi)
1354; SSE-NEXT:    movapd %xmm13, 64(%rdi)
1355; SSE-NEXT:    movapd %xmm12, 48(%rdi)
1356; SSE-NEXT:    movapd %xmm9, 32(%rdi)
1357; SSE-NEXT:    movapd %xmm10, 16(%rdi)
1358; SSE-NEXT:    movapd %xmm8, (%rdi)
1359; SSE-NEXT:    retq
1360;
1361; AVX1-LABEL: test_mul4x4_f64:
1362; AVX1:       # %bb.0: # %entry
1363; AVX1-NEXT:    vmovddup {{.*#+}} xmm8 = xmm4[0,0]
1364; AVX1-NEXT:    vinsertf128 $1, %xmm8, %ymm8, %ymm8
1365; AVX1-NEXT:    vmulpd %ymm0, %ymm8, %ymm8
1366; AVX1-NEXT:    vshufpd {{.*#+}} xmm9 = xmm4[1,1]
1367; AVX1-NEXT:    vinsertf128 $1, %xmm9, %ymm9, %ymm9
1368; AVX1-NEXT:    vmulpd %ymm1, %ymm9, %ymm9
1369; AVX1-NEXT:    vaddpd %ymm9, %ymm8, %ymm8
1370; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm4 = ymm4[2,3,2,3]
1371; AVX1-NEXT:    vmovddup {{.*#+}} ymm9 = ymm4[0,0,2,2]
1372; AVX1-NEXT:    vmulpd %ymm2, %ymm9, %ymm9
1373; AVX1-NEXT:    vaddpd %ymm9, %ymm8, %ymm8
1374; AVX1-NEXT:    vshufpd {{.*#+}} ymm4 = ymm4[1,1,3,3]
1375; AVX1-NEXT:    vmulpd %ymm4, %ymm3, %ymm4
1376; AVX1-NEXT:    vaddpd %ymm4, %ymm8, %ymm4
1377; AVX1-NEXT:    vmovddup {{.*#+}} xmm8 = xmm5[0,0]
1378; AVX1-NEXT:    vinsertf128 $1, %xmm8, %ymm8, %ymm8
1379; AVX1-NEXT:    vmulpd %ymm0, %ymm8, %ymm8
1380; AVX1-NEXT:    vshufpd {{.*#+}} xmm9 = xmm5[1,1]
1381; AVX1-NEXT:    vinsertf128 $1, %xmm9, %ymm9, %ymm9
1382; AVX1-NEXT:    vmulpd %ymm1, %ymm9, %ymm9
1383; AVX1-NEXT:    vaddpd %ymm9, %ymm8, %ymm8
1384; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm5 = ymm5[2,3,2,3]
1385; AVX1-NEXT:    vmovddup {{.*#+}} ymm9 = ymm5[0,0,2,2]
1386; AVX1-NEXT:    vmulpd %ymm2, %ymm9, %ymm9
1387; AVX1-NEXT:    vaddpd %ymm9, %ymm8, %ymm8
1388; AVX1-NEXT:    vshufpd {{.*#+}} ymm5 = ymm5[1,1,3,3]
1389; AVX1-NEXT:    vmulpd %ymm5, %ymm3, %ymm5
1390; AVX1-NEXT:    vaddpd %ymm5, %ymm8, %ymm5
1391; AVX1-NEXT:    vmovddup {{.*#+}} xmm8 = xmm6[0,0]
1392; AVX1-NEXT:    vinsertf128 $1, %xmm8, %ymm8, %ymm8
1393; AVX1-NEXT:    vmulpd %ymm0, %ymm8, %ymm8
1394; AVX1-NEXT:    vshufpd {{.*#+}} xmm9 = xmm6[1,1]
1395; AVX1-NEXT:    vinsertf128 $1, %xmm9, %ymm9, %ymm9
1396; AVX1-NEXT:    vmulpd %ymm1, %ymm9, %ymm9
1397; AVX1-NEXT:    vaddpd %ymm9, %ymm8, %ymm8
1398; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm6 = ymm6[2,3,2,3]
1399; AVX1-NEXT:    vmovddup {{.*#+}} ymm9 = ymm6[0,0,2,2]
1400; AVX1-NEXT:    vmulpd %ymm2, %ymm9, %ymm9
1401; AVX1-NEXT:    vaddpd %ymm9, %ymm8, %ymm8
1402; AVX1-NEXT:    vshufpd {{.*#+}} ymm6 = ymm6[1,1,3,3]
1403; AVX1-NEXT:    vmulpd %ymm6, %ymm3, %ymm6
1404; AVX1-NEXT:    vaddpd %ymm6, %ymm8, %ymm6
1405; AVX1-NEXT:    vmovddup {{.*#+}} xmm8 = xmm7[0,0]
1406; AVX1-NEXT:    vinsertf128 $1, %xmm8, %ymm8, %ymm8
1407; AVX1-NEXT:    vmulpd %ymm0, %ymm8, %ymm0
1408; AVX1-NEXT:    vshufpd {{.*#+}} xmm8 = xmm7[1,1]
1409; AVX1-NEXT:    vinsertf128 $1, %xmm8, %ymm8, %ymm8
1410; AVX1-NEXT:    vmulpd %ymm1, %ymm8, %ymm1
1411; AVX1-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
1412; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm7[2,3,2,3]
1413; AVX1-NEXT:    vmovddup {{.*#+}} ymm7 = ymm1[0,0,2,2]
1414; AVX1-NEXT:    vmulpd %ymm7, %ymm2, %ymm2
1415; AVX1-NEXT:    vaddpd %ymm2, %ymm0, %ymm0
1416; AVX1-NEXT:    vshufpd {{.*#+}} ymm1 = ymm1[1,1,3,3]
1417; AVX1-NEXT:    vmulpd %ymm1, %ymm3, %ymm1
1418; AVX1-NEXT:    vaddpd %ymm1, %ymm0, %ymm3
1419; AVX1-NEXT:    vmovapd %ymm4, %ymm0
1420; AVX1-NEXT:    vmovapd %ymm5, %ymm1
1421; AVX1-NEXT:    vmovapd %ymm6, %ymm2
1422; AVX1-NEXT:    retq
1423;
1424; AVX2-LABEL: test_mul4x4_f64:
1425; AVX2:       # %bb.0: # %entry
1426; AVX2-NEXT:    vbroadcastsd %xmm4, %ymm8
1427; AVX2-NEXT:    vmulpd %ymm0, %ymm8, %ymm8
1428; AVX2-NEXT:    vpermpd {{.*#+}} ymm9 = ymm4[1,1,1,1]
1429; AVX2-NEXT:    vmulpd %ymm1, %ymm9, %ymm9
1430; AVX2-NEXT:    vaddpd %ymm9, %ymm8, %ymm8
1431; AVX2-NEXT:    vpermpd {{.*#+}} ymm9 = ymm4[2,2,2,2]
1432; AVX2-NEXT:    vmulpd %ymm2, %ymm9, %ymm9
1433; AVX2-NEXT:    vaddpd %ymm9, %ymm8, %ymm8
1434; AVX2-NEXT:    vpermpd {{.*#+}} ymm4 = ymm4[3,3,3,3]
1435; AVX2-NEXT:    vmulpd %ymm4, %ymm3, %ymm4
1436; AVX2-NEXT:    vaddpd %ymm4, %ymm8, %ymm4
1437; AVX2-NEXT:    vbroadcastsd %xmm5, %ymm8
1438; AVX2-NEXT:    vmulpd %ymm0, %ymm8, %ymm8
1439; AVX2-NEXT:    vpermpd {{.*#+}} ymm9 = ymm5[1,1,1,1]
1440; AVX2-NEXT:    vmulpd %ymm1, %ymm9, %ymm9
1441; AVX2-NEXT:    vaddpd %ymm9, %ymm8, %ymm8
1442; AVX2-NEXT:    vpermpd {{.*#+}} ymm9 = ymm5[2,2,2,2]
1443; AVX2-NEXT:    vmulpd %ymm2, %ymm9, %ymm9
1444; AVX2-NEXT:    vaddpd %ymm9, %ymm8, %ymm8
1445; AVX2-NEXT:    vpermpd {{.*#+}} ymm5 = ymm5[3,3,3,3]
1446; AVX2-NEXT:    vmulpd %ymm5, %ymm3, %ymm5
1447; AVX2-NEXT:    vaddpd %ymm5, %ymm8, %ymm5
1448; AVX2-NEXT:    vbroadcastsd %xmm6, %ymm8
1449; AVX2-NEXT:    vmulpd %ymm0, %ymm8, %ymm8
1450; AVX2-NEXT:    vpermpd {{.*#+}} ymm9 = ymm6[1,1,1,1]
1451; AVX2-NEXT:    vmulpd %ymm1, %ymm9, %ymm9
1452; AVX2-NEXT:    vaddpd %ymm9, %ymm8, %ymm8
1453; AVX2-NEXT:    vpermpd {{.*#+}} ymm9 = ymm6[2,2,2,2]
1454; AVX2-NEXT:    vmulpd %ymm2, %ymm9, %ymm9
1455; AVX2-NEXT:    vaddpd %ymm9, %ymm8, %ymm8
1456; AVX2-NEXT:    vpermpd {{.*#+}} ymm6 = ymm6[3,3,3,3]
1457; AVX2-NEXT:    vmulpd %ymm6, %ymm3, %ymm6
1458; AVX2-NEXT:    vaddpd %ymm6, %ymm8, %ymm6
1459; AVX2-NEXT:    vbroadcastsd %xmm7, %ymm8
1460; AVX2-NEXT:    vmulpd %ymm0, %ymm8, %ymm0
1461; AVX2-NEXT:    vpermpd {{.*#+}} ymm8 = ymm7[1,1,1,1]
1462; AVX2-NEXT:    vmulpd %ymm1, %ymm8, %ymm1
1463; AVX2-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
1464; AVX2-NEXT:    vpermpd {{.*#+}} ymm1 = ymm7[2,2,2,2]
1465; AVX2-NEXT:    vmulpd %ymm1, %ymm2, %ymm1
1466; AVX2-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
1467; AVX2-NEXT:    vpermpd {{.*#+}} ymm1 = ymm7[3,3,3,3]
1468; AVX2-NEXT:    vmulpd %ymm1, %ymm3, %ymm1
1469; AVX2-NEXT:    vaddpd %ymm1, %ymm0, %ymm3
1470; AVX2-NEXT:    vmovapd %ymm4, %ymm0
1471; AVX2-NEXT:    vmovapd %ymm5, %ymm1
1472; AVX2-NEXT:    vmovapd %ymm6, %ymm2
1473; AVX2-NEXT:    retq
1474;
1475; AVX512F-LABEL: test_mul4x4_f64:
1476; AVX512F:       # %bb.0: # %entry
1477; AVX512F-NEXT:    vextractf64x4 $1, %zmm0, %ymm5
1478; AVX512F-NEXT:    vextractf64x4 $1, %zmm1, %ymm4
1479; AVX512F-NEXT:    vbroadcastsd %xmm2, %ymm6
1480; AVX512F-NEXT:    vmulpd %ymm6, %ymm0, %ymm6
1481; AVX512F-NEXT:    vpermpd {{.*#+}} ymm7 = ymm2[1,1,1,1]
1482; AVX512F-NEXT:    vmulpd %ymm7, %ymm5, %ymm7
1483; AVX512F-NEXT:    vaddpd %ymm7, %ymm6, %ymm6
1484; AVX512F-NEXT:    vpermpd {{.*#+}} ymm7 = ymm2[2,2,2,2]
1485; AVX512F-NEXT:    vmulpd %ymm7, %ymm1, %ymm7
1486; AVX512F-NEXT:    vaddpd %ymm7, %ymm6, %ymm6
1487; AVX512F-NEXT:    vpermpd {{.*#+}} ymm7 = ymm2[3,3,3,3]
1488; AVX512F-NEXT:    vmulpd %ymm7, %ymm4, %ymm7
1489; AVX512F-NEXT:    vaddpd %ymm7, %ymm6, %ymm6
1490; AVX512F-NEXT:    vextractf64x4 $1, %zmm2, %ymm2
1491; AVX512F-NEXT:    vbroadcastsd %xmm2, %ymm7
1492; AVX512F-NEXT:    vmulpd %ymm7, %ymm0, %ymm7
1493; AVX512F-NEXT:    vpermpd {{.*#+}} ymm8 = ymm2[1,1,1,1]
1494; AVX512F-NEXT:    vmulpd %ymm5, %ymm8, %ymm8
1495; AVX512F-NEXT:    vaddpd %ymm7, %ymm8, %ymm7
1496; AVX512F-NEXT:    vpermpd {{.*#+}} ymm8 = ymm2[2,2,2,2]
1497; AVX512F-NEXT:    vmulpd %ymm1, %ymm8, %ymm8
1498; AVX512F-NEXT:    vaddpd %ymm7, %ymm8, %ymm7
1499; AVX512F-NEXT:    vpermpd {{.*#+}} ymm2 = ymm2[3,3,3,3]
1500; AVX512F-NEXT:    vmulpd %ymm2, %ymm4, %ymm2
1501; AVX512F-NEXT:    vaddpd %ymm2, %ymm7, %ymm2
1502; AVX512F-NEXT:    vbroadcastsd %xmm3, %ymm7
1503; AVX512F-NEXT:    vmulpd %ymm7, %ymm0, %ymm7
1504; AVX512F-NEXT:    vpermpd {{.*#+}} ymm8 = ymm3[1,1,1,1]
1505; AVX512F-NEXT:    vmulpd %ymm5, %ymm8, %ymm8
1506; AVX512F-NEXT:    vaddpd %ymm7, %ymm8, %ymm7
1507; AVX512F-NEXT:    vpermpd {{.*#+}} ymm8 = ymm3[2,2,2,2]
1508; AVX512F-NEXT:    vmulpd %ymm1, %ymm8, %ymm8
1509; AVX512F-NEXT:    vaddpd %ymm7, %ymm8, %ymm7
1510; AVX512F-NEXT:    vpermpd {{.*#+}} ymm8 = ymm3[3,3,3,3]
1511; AVX512F-NEXT:    vmulpd %ymm4, %ymm8, %ymm8
1512; AVX512F-NEXT:    vaddpd %ymm7, %ymm8, %ymm7
1513; AVX512F-NEXT:    vextractf64x4 $1, %zmm3, %ymm3
1514; AVX512F-NEXT:    vbroadcastsd %xmm3, %ymm8
1515; AVX512F-NEXT:    vmulpd %ymm0, %ymm8, %ymm0
1516; AVX512F-NEXT:    vpermpd {{.*#+}} ymm8 = ymm3[1,1,1,1]
1517; AVX512F-NEXT:    vmulpd %ymm5, %ymm8, %ymm5
1518; AVX512F-NEXT:    vaddpd %ymm5, %ymm0, %ymm0
1519; AVX512F-NEXT:    vpermpd {{.*#+}} ymm5 = ymm3[2,2,2,2]
1520; AVX512F-NEXT:    vmulpd %ymm5, %ymm1, %ymm1
1521; AVX512F-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
1522; AVX512F-NEXT:    vpermpd {{.*#+}} ymm1 = ymm3[3,3,3,3]
1523; AVX512F-NEXT:    vmulpd %ymm1, %ymm4, %ymm1
1524; AVX512F-NEXT:    vaddpd %ymm1, %ymm0, %ymm1
1525; AVX512F-NEXT:    vinsertf64x4 $1, %ymm2, %zmm6, %zmm0
1526; AVX512F-NEXT:    vinsertf64x4 $1, %ymm1, %zmm7, %zmm1
1527; AVX512F-NEXT:    retq
1528;
1529; AVX512VL-LABEL: test_mul4x4_f64:
1530; AVX512VL:       # %bb.0: # %entry
1531; AVX512VL-NEXT:    vextractf64x4 $1, %zmm0, %ymm4
1532; AVX512VL-NEXT:    vextractf64x4 $1, %zmm1, %ymm5
1533; AVX512VL-NEXT:    vbroadcastsd %xmm2, %ymm6
1534; AVX512VL-NEXT:    vmulpd %ymm6, %ymm0, %ymm6
1535; AVX512VL-NEXT:    vpermpd {{.*#+}} ymm7 = ymm2[1,1,1,1]
1536; AVX512VL-NEXT:    vmulpd %ymm7, %ymm4, %ymm7
1537; AVX512VL-NEXT:    vaddpd %ymm7, %ymm6, %ymm6
1538; AVX512VL-NEXT:    vpermpd {{.*#+}} ymm7 = ymm2[2,2,2,2]
1539; AVX512VL-NEXT:    vmulpd %ymm7, %ymm1, %ymm7
1540; AVX512VL-NEXT:    vaddpd %ymm7, %ymm6, %ymm6
1541; AVX512VL-NEXT:    vpermpd {{.*#+}} ymm7 = ymm2[3,3,3,3]
1542; AVX512VL-NEXT:    vmulpd %ymm7, %ymm5, %ymm7
1543; AVX512VL-NEXT:    vaddpd %ymm7, %ymm6, %ymm6
1544; AVX512VL-NEXT:    vextractf64x4 $1, %zmm2, %ymm2
1545; AVX512VL-NEXT:    vbroadcastsd %xmm2, %ymm7
1546; AVX512VL-NEXT:    vmulpd %ymm7, %ymm0, %ymm7
1547; AVX512VL-NEXT:    vpermpd {{.*#+}} ymm8 = ymm2[1,1,1,1]
1548; AVX512VL-NEXT:    vmulpd %ymm4, %ymm8, %ymm8
1549; AVX512VL-NEXT:    vaddpd %ymm7, %ymm8, %ymm7
1550; AVX512VL-NEXT:    vpermpd {{.*#+}} ymm8 = ymm2[2,2,2,2]
1551; AVX512VL-NEXT:    vmulpd %ymm1, %ymm8, %ymm8
1552; AVX512VL-NEXT:    vaddpd %ymm7, %ymm8, %ymm7
1553; AVX512VL-NEXT:    vpermpd {{.*#+}} ymm2 = ymm2[3,3,3,3]
1554; AVX512VL-NEXT:    vmulpd %ymm2, %ymm5, %ymm2
1555; AVX512VL-NEXT:    vaddpd %ymm2, %ymm7, %ymm2
1556; AVX512VL-NEXT:    vbroadcastsd %xmm3, %ymm7
1557; AVX512VL-NEXT:    vmulpd %ymm7, %ymm0, %ymm7
1558; AVX512VL-NEXT:    vpermpd {{.*#+}} ymm8 = ymm3[1,1,1,1]
1559; AVX512VL-NEXT:    vmulpd %ymm4, %ymm8, %ymm8
1560; AVX512VL-NEXT:    vaddpd %ymm7, %ymm8, %ymm7
1561; AVX512VL-NEXT:    vpermpd {{.*#+}} ymm8 = ymm3[2,2,2,2]
1562; AVX512VL-NEXT:    vmulpd %ymm1, %ymm8, %ymm8
1563; AVX512VL-NEXT:    vaddpd %ymm7, %ymm8, %ymm7
1564; AVX512VL-NEXT:    vpermpd {{.*#+}} ymm8 = ymm3[3,3,3,3]
1565; AVX512VL-NEXT:    vmulpd %ymm5, %ymm8, %ymm8
1566; AVX512VL-NEXT:    vaddpd %ymm7, %ymm8, %ymm7
1567; AVX512VL-NEXT:    vextractf64x4 $1, %zmm3, %ymm3
1568; AVX512VL-NEXT:    vbroadcastsd %xmm3, %ymm8
1569; AVX512VL-NEXT:    vmulpd %ymm0, %ymm8, %ymm0
1570; AVX512VL-NEXT:    vpermpd {{.*#+}} ymm8 = ymm3[1,1,1,1]
1571; AVX512VL-NEXT:    vmulpd %ymm4, %ymm8, %ymm4
1572; AVX512VL-NEXT:    vaddpd %ymm4, %ymm0, %ymm0
1573; AVX512VL-NEXT:    vpermpd {{.*#+}} ymm4 = ymm3[2,2,2,2]
1574; AVX512VL-NEXT:    vmulpd %ymm4, %ymm1, %ymm1
1575; AVX512VL-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
1576; AVX512VL-NEXT:    vpermpd {{.*#+}} ymm1 = ymm3[3,3,3,3]
1577; AVX512VL-NEXT:    vmulpd %ymm1, %ymm5, %ymm1
1578; AVX512VL-NEXT:    vaddpd %ymm1, %ymm0, %ymm1
1579; AVX512VL-NEXT:    vinsertf64x4 $1, %ymm2, %zmm6, %zmm0
1580; AVX512VL-NEXT:    vinsertf64x4 $1, %ymm1, %zmm7, %zmm1
1581; AVX512VL-NEXT:    retq
1582entry:
1583  %split = shufflevector <16 x double> %a0, <16 x double> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
1584  %split1 = shufflevector <16 x double> %a0, <16 x double> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
1585  %split2 = shufflevector <16 x double> %a0, <16 x double> poison, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
1586  %split3 = shufflevector <16 x double> %a0, <16 x double> poison, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
1587  %splat.splat = shufflevector <16 x double> %a1, <16 x double> undef, <4 x i32> zeroinitializer
1588  %0 = fmul <4 x double> %split, %splat.splat
1589  %splat.splat10 = shufflevector <16 x double> %a1, <16 x double> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
1590  %1 = fmul <4 x double> %split1, %splat.splat10
1591  %2 = fadd <4 x double> %0, %1
1592  %splat.splat13 = shufflevector <16 x double> %a1, <16 x double> undef, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
1593  %3 = fmul <4 x double> %split2, %splat.splat13
1594  %4 = fadd <4 x double> %2, %3
1595  %splat.splat16 = shufflevector <16 x double> %a1, <16 x double> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
1596  %5 = fmul <4 x double> %split3, %splat.splat16
1597  %6 = fadd <4 x double> %4, %5
1598  %splat.splat19 = shufflevector <16 x double> %a1, <16 x double> undef, <4 x i32> <i32 4, i32 4, i32 4, i32 4>
1599  %7 = fmul <4 x double> %split, %splat.splat19
1600  %splat.splat22 = shufflevector <16 x double> %a1, <16 x double> undef, <4 x i32> <i32 5, i32 5, i32 5, i32 5>
1601  %8 = fmul <4 x double> %split1, %splat.splat22
1602  %9 = fadd <4 x double> %7, %8
1603  %splat.splat25 = shufflevector <16 x double> %a1, <16 x double> undef, <4 x i32> <i32 6, i32 6, i32 6, i32 6>
1604  %10 = fmul <4 x double> %split2, %splat.splat25
1605  %11 = fadd <4 x double> %9, %10
1606  %splat.splat28 = shufflevector <16 x double> %a1, <16 x double> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
1607  %12 = fmul <4 x double> %split3, %splat.splat28
1608  %13 = fadd <4 x double> %11, %12
1609  %splat.splat31 = shufflevector <16 x double> %a1, <16 x double> undef, <4 x i32> <i32 8, i32 8, i32 8, i32 8>
1610  %14 = fmul <4 x double> %split, %splat.splat31
1611  %splat.splat34 = shufflevector <16 x double> %a1, <16 x double> undef, <4 x i32> <i32 9, i32 9, i32 9, i32 9>
1612  %15 = fmul <4 x double> %split1, %splat.splat34
1613  %16 = fadd <4 x double> %14, %15
1614  %splat.splat37 = shufflevector <16 x double> %a1, <16 x double> undef, <4 x i32> <i32 10, i32 10, i32 10, i32 10>
1615  %17 = fmul <4 x double> %split2, %splat.splat37
1616  %18 = fadd <4 x double> %16, %17
1617  %splat.splat40 = shufflevector <16 x double> %a1, <16 x double> undef, <4 x i32> <i32 11, i32 11, i32 11, i32 11>
1618  %19 = fmul <4 x double> %split3, %splat.splat40
1619  %20 = fadd <4 x double> %18, %19
1620  %splat.splat43 = shufflevector <16 x double> %a1, <16 x double> undef, <4 x i32> <i32 12, i32 12, i32 12, i32 12>
1621  %21 = fmul <4 x double> %split, %splat.splat43
1622  %splat.splat46 = shufflevector <16 x double> %a1, <16 x double> undef, <4 x i32> <i32 13, i32 13, i32 13, i32 13>
1623  %22 = fmul <4 x double> %split1, %splat.splat46
1624  %23 = fadd <4 x double> %21, %22
1625  %splat.splat49 = shufflevector <16 x double> %a1, <16 x double> undef, <4 x i32> <i32 14, i32 14, i32 14, i32 14>
1626  %24 = fmul <4 x double> %split2, %splat.splat49
1627  %25 = fadd <4 x double> %23, %24
1628  %splat.splat52 = shufflevector <16 x double> %a1, <16 x double> undef, <4 x i32> <i32 15, i32 15, i32 15, i32 15>
1629  %26 = fmul <4 x double> %split3, %splat.splat52
1630  %27 = fadd <4 x double> %25, %26
1631  %28 = shufflevector <4 x double> %6, <4 x double> %13, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
1632  %29 = shufflevector <4 x double> %20, <4 x double> %27, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
1633  %30 = shufflevector <8 x double> %28, <8 x double> %29, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
1634  ret <16 x double> %30
1635}
1636
1637define <64 x float> @test_mul8x8_f32(<64 x float> %a0, <64 x float> %a1) nounwind {
1638; SSE-LABEL: test_mul8x8_f32:
1639; SSE:       # %bb.0: # %entry
1640; SSE-NEXT:    subq $120, %rsp
1641; SSE-NEXT:    movaps %xmm5, %xmm11
1642; SSE-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1643; SSE-NEXT:    movaps %xmm1, %xmm9
1644; SSE-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1645; SSE-NEXT:    movq %rdi, %rax
1646; SSE-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm8
1647; SSE-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm13
1648; SSE-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm14
1649; SSE-NEXT:    movaps %xmm14, %xmm15
1650; SSE-NEXT:    shufps {{.*#+}} xmm15 = xmm15[0,0],xmm14[0,0]
1651; SSE-NEXT:    movaps %xmm1, %xmm5
1652; SSE-NEXT:    mulps %xmm15, %xmm5
1653; SSE-NEXT:    mulps %xmm0, %xmm15
1654; SSE-NEXT:    movaps %xmm14, %xmm0
1655; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1],xmm14[1,1]
1656; SSE-NEXT:    movaps %xmm3, %xmm10
1657; SSE-NEXT:    movaps %xmm3, %xmm12
1658; SSE-NEXT:    mulps %xmm0, %xmm10
1659; SSE-NEXT:    addps %xmm5, %xmm10
1660; SSE-NEXT:    mulps %xmm2, %xmm0
1661; SSE-NEXT:    addps %xmm15, %xmm0
1662; SSE-NEXT:    movaps %xmm14, %xmm1
1663; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,2],xmm14[2,2]
1664; SSE-NEXT:    movaps %xmm4, %xmm2
1665; SSE-NEXT:    movaps %xmm4, %xmm15
1666; SSE-NEXT:    mulps %xmm1, %xmm2
1667; SSE-NEXT:    addps %xmm0, %xmm2
1668; SSE-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm5
1669; SSE-NEXT:    mulps %xmm11, %xmm1
1670; SSE-NEXT:    addps %xmm10, %xmm1
1671; SSE-NEXT:    shufps {{.*#+}} xmm14 = xmm14[3,3,3,3]
1672; SSE-NEXT:    movaps %xmm7, %xmm3
1673; SSE-NEXT:    mulps %xmm14, %xmm3
1674; SSE-NEXT:    addps %xmm1, %xmm3
1675; SSE-NEXT:    mulps %xmm6, %xmm14
1676; SSE-NEXT:    addps %xmm2, %xmm14
1677; SSE-NEXT:    movaps %xmm5, %xmm1
1678; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm5[0,0]
1679; SSE-NEXT:    movaps %xmm13, %xmm2
1680; SSE-NEXT:    mulps %xmm1, %xmm2
1681; SSE-NEXT:    addps %xmm14, %xmm2
1682; SSE-NEXT:    mulps {{[0-9]+}}(%rsp), %xmm1
1683; SSE-NEXT:    addps %xmm3, %xmm1
1684; SSE-NEXT:    movaps %xmm5, %xmm0
1685; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1],xmm5[1,1]
1686; SSE-NEXT:    movaps %xmm8, %xmm3
1687; SSE-NEXT:    mulps %xmm0, %xmm3
1688; SSE-NEXT:    addps %xmm1, %xmm3
1689; SSE-NEXT:    mulps {{[0-9]+}}(%rsp), %xmm0
1690; SSE-NEXT:    addps %xmm2, %xmm0
1691; SSE-NEXT:    movaps %xmm5, %xmm1
1692; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,2],xmm5[2,2]
1693; SSE-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm2
1694; SSE-NEXT:    mulps %xmm1, %xmm2
1695; SSE-NEXT:    addps %xmm0, %xmm2
1696; SSE-NEXT:    mulps {{[0-9]+}}(%rsp), %xmm1
1697; SSE-NEXT:    addps %xmm3, %xmm1
1698; SSE-NEXT:    shufps {{.*#+}} xmm5 = xmm5[3,3,3,3]
1699; SSE-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
1700; SSE-NEXT:    mulps %xmm5, %xmm0
1701; SSE-NEXT:    addps %xmm1, %xmm0
1702; SSE-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1703; SSE-NEXT:    mulps {{[0-9]+}}(%rsp), %xmm5
1704; SSE-NEXT:    addps %xmm2, %xmm5
1705; SSE-NEXT:    movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1706; SSE-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
1707; SSE-NEXT:    movaps %xmm0, %xmm1
1708; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,0]
1709; SSE-NEXT:    movaps %xmm9, %xmm2
1710; SSE-NEXT:    mulps %xmm1, %xmm2
1711; SSE-NEXT:    movaps %xmm0, %xmm3
1712; SSE-NEXT:    shufps {{.*#+}} xmm3 = xmm3[1,1],xmm0[1,1]
1713; SSE-NEXT:    movaps %xmm12, %xmm4
1714; SSE-NEXT:    mulps %xmm3, %xmm4
1715; SSE-NEXT:    addps %xmm2, %xmm4
1716; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
1717; SSE-NEXT:    mulps %xmm10, %xmm1
1718; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
1719; SSE-NEXT:    mulps %xmm13, %xmm3
1720; SSE-NEXT:    addps %xmm1, %xmm3
1721; SSE-NEXT:    movaps %xmm0, %xmm1
1722; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,2],xmm0[2,2]
1723; SSE-NEXT:    movaps %xmm15, %xmm2
1724; SSE-NEXT:    mulps %xmm1, %xmm2
1725; SSE-NEXT:    addps %xmm3, %xmm2
1726; SSE-NEXT:    movaps %xmm11, %xmm8
1727; SSE-NEXT:    mulps %xmm11, %xmm1
1728; SSE-NEXT:    addps %xmm4, %xmm1
1729; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
1730; SSE-NEXT:    movaps %xmm7, %xmm3
1731; SSE-NEXT:    mulps %xmm0, %xmm3
1732; SSE-NEXT:    addps %xmm1, %xmm3
1733; SSE-NEXT:    movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1734; SSE-NEXT:    mulps %xmm6, %xmm0
1735; SSE-NEXT:    addps %xmm2, %xmm0
1736; SSE-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm4
1737; SSE-NEXT:    movaps %xmm4, %xmm1
1738; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm4[0,0]
1739; SSE-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm14
1740; SSE-NEXT:    movaps %xmm14, %xmm2
1741; SSE-NEXT:    mulps %xmm1, %xmm2
1742; SSE-NEXT:    addps %xmm0, %xmm2
1743; SSE-NEXT:    mulps {{[0-9]+}}(%rsp), %xmm1
1744; SSE-NEXT:    addps %xmm3, %xmm1
1745; SSE-NEXT:    movaps %xmm4, %xmm0
1746; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1],xmm4[1,1]
1747; SSE-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm11
1748; SSE-NEXT:    movaps %xmm11, %xmm3
1749; SSE-NEXT:    mulps %xmm0, %xmm3
1750; SSE-NEXT:    addps %xmm1, %xmm3
1751; SSE-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm1
1752; SSE-NEXT:    mulps %xmm1, %xmm0
1753; SSE-NEXT:    addps %xmm2, %xmm0
1754; SSE-NEXT:    movaps %xmm4, %xmm1
1755; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,2],xmm4[2,2]
1756; SSE-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm2
1757; SSE-NEXT:    mulps %xmm1, %xmm2
1758; SSE-NEXT:    addps %xmm0, %xmm2
1759; SSE-NEXT:    mulps {{[0-9]+}}(%rsp), %xmm1
1760; SSE-NEXT:    addps %xmm3, %xmm1
1761; SSE-NEXT:    shufps {{.*#+}} xmm4 = xmm4[3,3,3,3]
1762; SSE-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
1763; SSE-NEXT:    mulps %xmm4, %xmm0
1764; SSE-NEXT:    addps %xmm1, %xmm0
1765; SSE-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1766; SSE-NEXT:    mulps {{[0-9]+}}(%rsp), %xmm4
1767; SSE-NEXT:    addps %xmm2, %xmm4
1768; SSE-NEXT:    movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1769; SSE-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
1770; SSE-NEXT:    movaps %xmm0, %xmm1
1771; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,0]
1772; SSE-NEXT:    movaps %xmm9, %xmm2
1773; SSE-NEXT:    mulps %xmm1, %xmm2
1774; SSE-NEXT:    movaps %xmm0, %xmm3
1775; SSE-NEXT:    shufps {{.*#+}} xmm3 = xmm3[1,1],xmm0[1,1]
1776; SSE-NEXT:    movaps %xmm12, %xmm4
1777; SSE-NEXT:    mulps %xmm3, %xmm4
1778; SSE-NEXT:    addps %xmm2, %xmm4
1779; SSE-NEXT:    mulps %xmm10, %xmm1
1780; SSE-NEXT:    mulps %xmm13, %xmm3
1781; SSE-NEXT:    addps %xmm1, %xmm3
1782; SSE-NEXT:    movaps %xmm0, %xmm1
1783; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,2],xmm0[2,2]
1784; SSE-NEXT:    movaps %xmm15, %xmm2
1785; SSE-NEXT:    movaps %xmm15, %xmm5
1786; SSE-NEXT:    mulps %xmm1, %xmm2
1787; SSE-NEXT:    addps %xmm3, %xmm2
1788; SSE-NEXT:    mulps %xmm8, %xmm1
1789; SSE-NEXT:    addps %xmm4, %xmm1
1790; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
1791; SSE-NEXT:    movaps %xmm7, %xmm3
1792; SSE-NEXT:    mulps %xmm0, %xmm3
1793; SSE-NEXT:    addps %xmm1, %xmm3
1794; SSE-NEXT:    mulps %xmm6, %xmm0
1795; SSE-NEXT:    addps %xmm2, %xmm0
1796; SSE-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm4
1797; SSE-NEXT:    movaps %xmm4, %xmm1
1798; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm4[0,0]
1799; SSE-NEXT:    movaps %xmm14, %xmm2
1800; SSE-NEXT:    mulps %xmm1, %xmm2
1801; SSE-NEXT:    addps %xmm0, %xmm2
1802; SSE-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm14
1803; SSE-NEXT:    mulps %xmm14, %xmm1
1804; SSE-NEXT:    addps %xmm3, %xmm1
1805; SSE-NEXT:    movaps %xmm4, %xmm0
1806; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1],xmm4[1,1]
1807; SSE-NEXT:    movaps %xmm11, %xmm3
1808; SSE-NEXT:    mulps %xmm0, %xmm3
1809; SSE-NEXT:    addps %xmm1, %xmm3
1810; SSE-NEXT:    mulps {{[0-9]+}}(%rsp), %xmm0
1811; SSE-NEXT:    addps %xmm2, %xmm0
1812; SSE-NEXT:    movaps %xmm4, %xmm1
1813; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,2],xmm4[2,2]
1814; SSE-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm2
1815; SSE-NEXT:    mulps %xmm1, %xmm2
1816; SSE-NEXT:    addps %xmm0, %xmm2
1817; SSE-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm11
1818; SSE-NEXT:    mulps %xmm11, %xmm1
1819; SSE-NEXT:    addps %xmm3, %xmm1
1820; SSE-NEXT:    shufps {{.*#+}} xmm4 = xmm4[3,3,3,3]
1821; SSE-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
1822; SSE-NEXT:    mulps %xmm4, %xmm0
1823; SSE-NEXT:    addps %xmm1, %xmm0
1824; SSE-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1825; SSE-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
1826; SSE-NEXT:    mulps %xmm0, %xmm4
1827; SSE-NEXT:    addps %xmm2, %xmm4
1828; SSE-NEXT:    movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1829; SSE-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
1830; SSE-NEXT:    movaps %xmm0, %xmm1
1831; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,0]
1832; SSE-NEXT:    movaps %xmm9, %xmm2
1833; SSE-NEXT:    mulps %xmm1, %xmm2
1834; SSE-NEXT:    movaps %xmm0, %xmm3
1835; SSE-NEXT:    shufps {{.*#+}} xmm3 = xmm3[1,1],xmm0[1,1]
1836; SSE-NEXT:    movaps %xmm12, %xmm4
1837; SSE-NEXT:    mulps %xmm3, %xmm4
1838; SSE-NEXT:    addps %xmm2, %xmm4
1839; SSE-NEXT:    movaps %xmm10, %xmm15
1840; SSE-NEXT:    mulps %xmm10, %xmm1
1841; SSE-NEXT:    mulps %xmm13, %xmm3
1842; SSE-NEXT:    addps %xmm1, %xmm3
1843; SSE-NEXT:    movaps %xmm0, %xmm1
1844; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,2],xmm0[2,2]
1845; SSE-NEXT:    movaps %xmm5, %xmm2
1846; SSE-NEXT:    mulps %xmm1, %xmm2
1847; SSE-NEXT:    addps %xmm3, %xmm2
1848; SSE-NEXT:    mulps %xmm8, %xmm1
1849; SSE-NEXT:    addps %xmm4, %xmm1
1850; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
1851; SSE-NEXT:    movaps %xmm7, %xmm4
1852; SSE-NEXT:    movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1853; SSE-NEXT:    movaps %xmm7, %xmm3
1854; SSE-NEXT:    mulps %xmm0, %xmm3
1855; SSE-NEXT:    addps %xmm1, %xmm3
1856; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
1857; SSE-NEXT:    mulps %xmm6, %xmm0
1858; SSE-NEXT:    addps %xmm2, %xmm0
1859; SSE-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm10
1860; SSE-NEXT:    movaps %xmm10, %xmm1
1861; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm10[0,0]
1862; SSE-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm2
1863; SSE-NEXT:    mulps %xmm1, %xmm2
1864; SSE-NEXT:    addps %xmm0, %xmm2
1865; SSE-NEXT:    mulps %xmm14, %xmm1
1866; SSE-NEXT:    addps %xmm3, %xmm1
1867; SSE-NEXT:    movaps %xmm10, %xmm0
1868; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1],xmm10[1,1]
1869; SSE-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm3
1870; SSE-NEXT:    mulps %xmm0, %xmm3
1871; SSE-NEXT:    addps %xmm1, %xmm3
1872; SSE-NEXT:    mulps {{[0-9]+}}(%rsp), %xmm0
1873; SSE-NEXT:    addps %xmm2, %xmm0
1874; SSE-NEXT:    movaps %xmm10, %xmm1
1875; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,2],xmm10[2,2]
1876; SSE-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm2
1877; SSE-NEXT:    mulps %xmm1, %xmm2
1878; SSE-NEXT:    addps %xmm0, %xmm2
1879; SSE-NEXT:    mulps %xmm11, %xmm1
1880; SSE-NEXT:    addps %xmm3, %xmm1
1881; SSE-NEXT:    shufps {{.*#+}} xmm10 = xmm10[3,3,3,3]
1882; SSE-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm11
1883; SSE-NEXT:    movaps %xmm11, %xmm0
1884; SSE-NEXT:    mulps %xmm10, %xmm0
1885; SSE-NEXT:    addps %xmm1, %xmm0
1886; SSE-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1887; SSE-NEXT:    mulps {{[0-9]+}}(%rsp), %xmm10
1888; SSE-NEXT:    addps %xmm2, %xmm10
1889; SSE-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
1890; SSE-NEXT:    movaps %xmm0, %xmm1
1891; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,0]
1892; SSE-NEXT:    movaps %xmm9, %xmm2
1893; SSE-NEXT:    movaps %xmm9, %xmm14
1894; SSE-NEXT:    mulps %xmm1, %xmm2
1895; SSE-NEXT:    movaps %xmm0, %xmm3
1896; SSE-NEXT:    shufps {{.*#+}} xmm3 = xmm3[1,1],xmm0[1,1]
1897; SSE-NEXT:    movaps %xmm12, %xmm7
1898; SSE-NEXT:    mulps %xmm3, %xmm7
1899; SSE-NEXT:    addps %xmm2, %xmm7
1900; SSE-NEXT:    mulps %xmm15, %xmm1
1901; SSE-NEXT:    mulps %xmm13, %xmm3
1902; SSE-NEXT:    addps %xmm1, %xmm3
1903; SSE-NEXT:    movaps %xmm0, %xmm1
1904; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,2],xmm0[2,2]
1905; SSE-NEXT:    movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1906; SSE-NEXT:    movaps %xmm5, %xmm2
1907; SSE-NEXT:    mulps %xmm1, %xmm2
1908; SSE-NEXT:    addps %xmm3, %xmm2
1909; SSE-NEXT:    movaps %xmm8, %xmm9
1910; SSE-NEXT:    mulps %xmm8, %xmm1
1911; SSE-NEXT:    addps %xmm7, %xmm1
1912; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
1913; SSE-NEXT:    movaps %xmm4, %xmm7
1914; SSE-NEXT:    mulps %xmm0, %xmm7
1915; SSE-NEXT:    addps %xmm1, %xmm7
1916; SSE-NEXT:    movaps %xmm6, %xmm3
1917; SSE-NEXT:    mulps %xmm6, %xmm0
1918; SSE-NEXT:    addps %xmm2, %xmm0
1919; SSE-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm4
1920; SSE-NEXT:    movaps %xmm4, %xmm1
1921; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm4[0,0]
1922; SSE-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm2
1923; SSE-NEXT:    mulps %xmm1, %xmm2
1924; SSE-NEXT:    addps %xmm0, %xmm2
1925; SSE-NEXT:    mulps {{[0-9]+}}(%rsp), %xmm1
1926; SSE-NEXT:    addps %xmm7, %xmm1
1927; SSE-NEXT:    movaps %xmm4, %xmm0
1928; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1],xmm4[1,1]
1929; SSE-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm7
1930; SSE-NEXT:    mulps %xmm0, %xmm7
1931; SSE-NEXT:    addps %xmm1, %xmm7
1932; SSE-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm1
1933; SSE-NEXT:    mulps %xmm1, %xmm0
1934; SSE-NEXT:    addps %xmm2, %xmm0
1935; SSE-NEXT:    movaps %xmm4, %xmm1
1936; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,2],xmm4[2,2]
1937; SSE-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm2
1938; SSE-NEXT:    mulps %xmm1, %xmm2
1939; SSE-NEXT:    addps %xmm0, %xmm2
1940; SSE-NEXT:    mulps {{[0-9]+}}(%rsp), %xmm1
1941; SSE-NEXT:    addps %xmm7, %xmm1
1942; SSE-NEXT:    shufps {{.*#+}} xmm4 = xmm4[3,3,3,3]
1943; SSE-NEXT:    movaps %xmm11, %xmm0
1944; SSE-NEXT:    mulps %xmm4, %xmm0
1945; SSE-NEXT:    addps %xmm1, %xmm0
1946; SSE-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
1947; SSE-NEXT:    mulps {{[0-9]+}}(%rsp), %xmm4
1948; SSE-NEXT:    addps %xmm2, %xmm4
1949; SSE-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
1950; SSE-NEXT:    movaps %xmm0, %xmm1
1951; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,0]
1952; SSE-NEXT:    movaps %xmm14, %xmm6
1953; SSE-NEXT:    movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1954; SSE-NEXT:    movaps %xmm14, %xmm2
1955; SSE-NEXT:    mulps %xmm1, %xmm2
1956; SSE-NEXT:    movaps %xmm0, %xmm14
1957; SSE-NEXT:    shufps {{.*#+}} xmm14 = xmm14[1,1],xmm0[1,1]
1958; SSE-NEXT:    movaps %xmm12, %xmm15
1959; SSE-NEXT:    movaps %xmm12, %xmm13
1960; SSE-NEXT:    movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1961; SSE-NEXT:    mulps %xmm14, %xmm15
1962; SSE-NEXT:    addps %xmm2, %xmm15
1963; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
1964; SSE-NEXT:    mulps %xmm8, %xmm1
1965; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
1966; SSE-NEXT:    mulps %xmm7, %xmm14
1967; SSE-NEXT:    addps %xmm1, %xmm14
1968; SSE-NEXT:    movaps %xmm0, %xmm1
1969; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,2],xmm0[2,2]
1970; SSE-NEXT:    movaps %xmm5, %xmm2
1971; SSE-NEXT:    mulps %xmm1, %xmm2
1972; SSE-NEXT:    addps %xmm14, %xmm2
1973; SSE-NEXT:    mulps %xmm9, %xmm1
1974; SSE-NEXT:    movaps %xmm9, %xmm11
1975; SSE-NEXT:    addps %xmm15, %xmm1
1976; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
1977; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
1978; SSE-NEXT:    movaps %xmm5, %xmm14
1979; SSE-NEXT:    mulps %xmm0, %xmm14
1980; SSE-NEXT:    addps %xmm1, %xmm14
1981; SSE-NEXT:    mulps %xmm3, %xmm0
1982; SSE-NEXT:    movaps %xmm3, %xmm12
1983; SSE-NEXT:    addps %xmm2, %xmm0
1984; SSE-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm3
1985; SSE-NEXT:    movaps %xmm3, %xmm1
1986; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm3[0,0]
1987; SSE-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm15
1988; SSE-NEXT:    mulps %xmm1, %xmm15
1989; SSE-NEXT:    addps %xmm0, %xmm15
1990; SSE-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
1991; SSE-NEXT:    mulps %xmm0, %xmm1
1992; SSE-NEXT:    addps %xmm14, %xmm1
1993; SSE-NEXT:    movaps %xmm3, %xmm0
1994; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1],xmm3[1,1]
1995; SSE-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm14
1996; SSE-NEXT:    mulps %xmm0, %xmm14
1997; SSE-NEXT:    addps %xmm1, %xmm14
1998; SSE-NEXT:    mulps {{[0-9]+}}(%rsp), %xmm0
1999; SSE-NEXT:    addps %xmm15, %xmm0
2000; SSE-NEXT:    movaps %xmm3, %xmm1
2001; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,2],xmm3[2,2]
2002; SSE-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm15
2003; SSE-NEXT:    mulps %xmm1, %xmm15
2004; SSE-NEXT:    addps %xmm0, %xmm15
2005; SSE-NEXT:    mulps {{[0-9]+}}(%rsp), %xmm1
2006; SSE-NEXT:    addps %xmm14, %xmm1
2007; SSE-NEXT:    shufps {{.*#+}} xmm3 = xmm3[3,3,3,3]
2008; SSE-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm14
2009; SSE-NEXT:    mulps %xmm3, %xmm14
2010; SSE-NEXT:    addps %xmm1, %xmm14
2011; SSE-NEXT:    mulps {{[0-9]+}}(%rsp), %xmm3
2012; SSE-NEXT:    addps %xmm15, %xmm3
2013; SSE-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
2014; SSE-NEXT:    movaps %xmm0, %xmm1
2015; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,0]
2016; SSE-NEXT:    mulps %xmm1, %xmm6
2017; SSE-NEXT:    movaps %xmm0, %xmm15
2018; SSE-NEXT:    shufps {{.*#+}} xmm15 = xmm15[1,1],xmm0[1,1]
2019; SSE-NEXT:    mulps %xmm15, %xmm13
2020; SSE-NEXT:    addps %xmm6, %xmm13
2021; SSE-NEXT:    mulps %xmm8, %xmm1
2022; SSE-NEXT:    mulps %xmm7, %xmm15
2023; SSE-NEXT:    addps %xmm1, %xmm15
2024; SSE-NEXT:    movaps %xmm0, %xmm1
2025; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,2],xmm0[2,2]
2026; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
2027; SSE-NEXT:    movaps %xmm6, %xmm2
2028; SSE-NEXT:    mulps %xmm1, %xmm2
2029; SSE-NEXT:    addps %xmm15, %xmm2
2030; SSE-NEXT:    mulps %xmm9, %xmm1
2031; SSE-NEXT:    addps %xmm13, %xmm1
2032; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
2033; SSE-NEXT:    movaps %xmm5, %xmm9
2034; SSE-NEXT:    mulps %xmm0, %xmm9
2035; SSE-NEXT:    addps %xmm1, %xmm9
2036; SSE-NEXT:    mulps %xmm12, %xmm0
2037; SSE-NEXT:    movaps %xmm12, %xmm5
2038; SSE-NEXT:    addps %xmm2, %xmm0
2039; SSE-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm1
2040; SSE-NEXT:    movaps %xmm1, %xmm2
2041; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,0],xmm1[0,0]
2042; SSE-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm15
2043; SSE-NEXT:    mulps %xmm2, %xmm15
2044; SSE-NEXT:    addps %xmm0, %xmm15
2045; SSE-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
2046; SSE-NEXT:    mulps %xmm0, %xmm2
2047; SSE-NEXT:    addps %xmm9, %xmm2
2048; SSE-NEXT:    movaps %xmm1, %xmm0
2049; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[1,1]
2050; SSE-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm9
2051; SSE-NEXT:    mulps %xmm0, %xmm9
2052; SSE-NEXT:    addps %xmm2, %xmm9
2053; SSE-NEXT:    mulps {{[0-9]+}}(%rsp), %xmm0
2054; SSE-NEXT:    addps %xmm15, %xmm0
2055; SSE-NEXT:    movaps %xmm1, %xmm2
2056; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[2,2],xmm1[2,2]
2057; SSE-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm13
2058; SSE-NEXT:    mulps %xmm2, %xmm13
2059; SSE-NEXT:    addps %xmm0, %xmm13
2060; SSE-NEXT:    mulps {{[0-9]+}}(%rsp), %xmm2
2061; SSE-NEXT:    addps %xmm9, %xmm2
2062; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
2063; SSE-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm15
2064; SSE-NEXT:    mulps %xmm1, %xmm15
2065; SSE-NEXT:    addps %xmm2, %xmm15
2066; SSE-NEXT:    mulps {{[0-9]+}}(%rsp), %xmm1
2067; SSE-NEXT:    addps %xmm13, %xmm1
2068; SSE-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
2069; SSE-NEXT:    movaps %xmm0, %xmm2
2070; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,0],xmm0[0,0]
2071; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
2072; SSE-NEXT:    mulps %xmm2, %xmm13
2073; SSE-NEXT:    mulps %xmm8, %xmm2
2074; SSE-NEXT:    movaps %xmm0, %xmm9
2075; SSE-NEXT:    shufps {{.*#+}} xmm9 = xmm9[1,1],xmm0[1,1]
2076; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
2077; SSE-NEXT:    mulps %xmm9, %xmm8
2078; SSE-NEXT:    addps %xmm13, %xmm8
2079; SSE-NEXT:    mulps %xmm7, %xmm9
2080; SSE-NEXT:    addps %xmm2, %xmm9
2081; SSE-NEXT:    movaps %xmm0, %xmm2
2082; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[2,2],xmm0[2,2]
2083; SSE-NEXT:    mulps %xmm2, %xmm6
2084; SSE-NEXT:    addps %xmm9, %xmm6
2085; SSE-NEXT:    mulps %xmm11, %xmm2
2086; SSE-NEXT:    addps %xmm8, %xmm2
2087; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
2088; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
2089; SSE-NEXT:    mulps %xmm0, %xmm9
2090; SSE-NEXT:    addps %xmm2, %xmm9
2091; SSE-NEXT:    movaps %xmm9, %xmm12
2092; SSE-NEXT:    mulps %xmm5, %xmm0
2093; SSE-NEXT:    addps %xmm6, %xmm0
2094; SSE-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm9
2095; SSE-NEXT:    movaps %xmm9, %xmm2
2096; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,0],xmm9[0,0]
2097; SSE-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm13
2098; SSE-NEXT:    mulps %xmm2, %xmm13
2099; SSE-NEXT:    addps %xmm0, %xmm13
2100; SSE-NEXT:    mulps {{[0-9]+}}(%rsp), %xmm2
2101; SSE-NEXT:    addps %xmm12, %xmm2
2102; SSE-NEXT:    movaps %xmm9, %xmm0
2103; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1],xmm9[1,1]
2104; SSE-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm12
2105; SSE-NEXT:    mulps %xmm0, %xmm12
2106; SSE-NEXT:    addps %xmm2, %xmm12
2107; SSE-NEXT:    mulps {{[0-9]+}}(%rsp), %xmm0
2108; SSE-NEXT:    addps %xmm13, %xmm0
2109; SSE-NEXT:    movaps %xmm9, %xmm2
2110; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[2,2],xmm9[2,2]
2111; SSE-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm5
2112; SSE-NEXT:    mulps %xmm2, %xmm5
2113; SSE-NEXT:    addps %xmm0, %xmm5
2114; SSE-NEXT:    mulps {{[0-9]+}}(%rsp), %xmm2
2115; SSE-NEXT:    addps %xmm12, %xmm2
2116; SSE-NEXT:    shufps {{.*#+}} xmm9 = xmm9[3,3,3,3]
2117; SSE-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
2118; SSE-NEXT:    mulps %xmm9, %xmm0
2119; SSE-NEXT:    addps %xmm2, %xmm0
2120; SSE-NEXT:    mulps {{[0-9]+}}(%rsp), %xmm9
2121; SSE-NEXT:    addps %xmm5, %xmm9
2122; SSE-NEXT:    movaps %xmm0, 240(%rdi)
2123; SSE-NEXT:    movaps %xmm9, 224(%rdi)
2124; SSE-NEXT:    movaps %xmm15, 208(%rdi)
2125; SSE-NEXT:    movaps %xmm1, 192(%rdi)
2126; SSE-NEXT:    movaps %xmm14, 176(%rdi)
2127; SSE-NEXT:    movaps %xmm3, 160(%rdi)
2128; SSE-NEXT:    movaps (%rsp), %xmm0 # 16-byte Reload
2129; SSE-NEXT:    movaps %xmm0, 144(%rdi)
2130; SSE-NEXT:    movaps %xmm4, 128(%rdi)
2131; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2132; SSE-NEXT:    movaps %xmm0, 112(%rdi)
2133; SSE-NEXT:    movaps %xmm10, 96(%rdi)
2134; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2135; SSE-NEXT:    movaps %xmm0, 80(%rdi)
2136; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2137; SSE-NEXT:    movaps %xmm0, 64(%rdi)
2138; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2139; SSE-NEXT:    movaps %xmm0, 48(%rdi)
2140; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2141; SSE-NEXT:    movaps %xmm0, 32(%rdi)
2142; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2143; SSE-NEXT:    movaps %xmm0, 16(%rdi)
2144; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2145; SSE-NEXT:    movaps %xmm0, (%rdi)
2146; SSE-NEXT:    addq $120, %rsp
2147; SSE-NEXT:    retq
2148;
2149; AVX1-LABEL: test_mul8x8_f32:
2150; AVX1:       # %bb.0: # %entry
2151; AVX1-NEXT:    pushq %rbp
2152; AVX1-NEXT:    movq %rsp, %rbp
2153; AVX1-NEXT:    andq $-32, %rsp
2154; AVX1-NEXT:    subq $32, %rsp
2155; AVX1-NEXT:    movq %rdi, %rax
2156; AVX1-NEXT:    vbroadcastss 16(%rbp), %ymm8
2157; AVX1-NEXT:    vmulps %ymm0, %ymm8, %ymm8
2158; AVX1-NEXT:    vbroadcastss 20(%rbp), %ymm9
2159; AVX1-NEXT:    vmulps %ymm1, %ymm9, %ymm9
2160; AVX1-NEXT:    vaddps %ymm9, %ymm8, %ymm8
2161; AVX1-NEXT:    vbroadcastss 24(%rbp), %ymm9
2162; AVX1-NEXT:    vmulps %ymm2, %ymm9, %ymm9
2163; AVX1-NEXT:    vaddps %ymm9, %ymm8, %ymm8
2164; AVX1-NEXT:    vbroadcastss 28(%rbp), %ymm9
2165; AVX1-NEXT:    vmulps %ymm3, %ymm9, %ymm9
2166; AVX1-NEXT:    vaddps %ymm9, %ymm8, %ymm8
2167; AVX1-NEXT:    vbroadcastss 32(%rbp), %ymm9
2168; AVX1-NEXT:    vmulps %ymm4, %ymm9, %ymm9
2169; AVX1-NEXT:    vaddps %ymm9, %ymm8, %ymm8
2170; AVX1-NEXT:    vbroadcastss 36(%rbp), %ymm9
2171; AVX1-NEXT:    vmulps %ymm5, %ymm9, %ymm9
2172; AVX1-NEXT:    vaddps %ymm9, %ymm8, %ymm8
2173; AVX1-NEXT:    vbroadcastss 40(%rbp), %ymm9
2174; AVX1-NEXT:    vmulps %ymm6, %ymm9, %ymm9
2175; AVX1-NEXT:    vaddps %ymm9, %ymm8, %ymm8
2176; AVX1-NEXT:    vbroadcastss 44(%rbp), %ymm9
2177; AVX1-NEXT:    vmulps %ymm7, %ymm9, %ymm9
2178; AVX1-NEXT:    vaddps %ymm9, %ymm8, %ymm8
2179; AVX1-NEXT:    vbroadcastss 48(%rbp), %ymm9
2180; AVX1-NEXT:    vmulps %ymm0, %ymm9, %ymm9
2181; AVX1-NEXT:    vbroadcastss 52(%rbp), %ymm10
2182; AVX1-NEXT:    vmulps %ymm1, %ymm10, %ymm10
2183; AVX1-NEXT:    vaddps %ymm10, %ymm9, %ymm9
2184; AVX1-NEXT:    vbroadcastss 56(%rbp), %ymm10
2185; AVX1-NEXT:    vmulps %ymm2, %ymm10, %ymm10
2186; AVX1-NEXT:    vaddps %ymm10, %ymm9, %ymm9
2187; AVX1-NEXT:    vbroadcastss 60(%rbp), %ymm10
2188; AVX1-NEXT:    vmulps %ymm3, %ymm10, %ymm10
2189; AVX1-NEXT:    vaddps %ymm10, %ymm9, %ymm9
2190; AVX1-NEXT:    vbroadcastss 64(%rbp), %ymm10
2191; AVX1-NEXT:    vmulps %ymm4, %ymm10, %ymm10
2192; AVX1-NEXT:    vaddps %ymm10, %ymm9, %ymm9
2193; AVX1-NEXT:    vbroadcastss 68(%rbp), %ymm10
2194; AVX1-NEXT:    vmulps %ymm5, %ymm10, %ymm10
2195; AVX1-NEXT:    vaddps %ymm10, %ymm9, %ymm9
2196; AVX1-NEXT:    vbroadcastss 72(%rbp), %ymm10
2197; AVX1-NEXT:    vmulps %ymm6, %ymm10, %ymm10
2198; AVX1-NEXT:    vaddps %ymm10, %ymm9, %ymm9
2199; AVX1-NEXT:    vbroadcastss 76(%rbp), %ymm10
2200; AVX1-NEXT:    vmulps %ymm7, %ymm10, %ymm10
2201; AVX1-NEXT:    vaddps %ymm10, %ymm9, %ymm9
2202; AVX1-NEXT:    vbroadcastss 80(%rbp), %ymm10
2203; AVX1-NEXT:    vmulps %ymm0, %ymm10, %ymm10
2204; AVX1-NEXT:    vbroadcastss 84(%rbp), %ymm11
2205; AVX1-NEXT:    vmulps %ymm1, %ymm11, %ymm11
2206; AVX1-NEXT:    vaddps %ymm11, %ymm10, %ymm10
2207; AVX1-NEXT:    vbroadcastss 88(%rbp), %ymm11
2208; AVX1-NEXT:    vmulps %ymm2, %ymm11, %ymm11
2209; AVX1-NEXT:    vaddps %ymm11, %ymm10, %ymm10
2210; AVX1-NEXT:    vbroadcastss 92(%rbp), %ymm11
2211; AVX1-NEXT:    vmulps %ymm3, %ymm11, %ymm11
2212; AVX1-NEXT:    vaddps %ymm11, %ymm10, %ymm10
2213; AVX1-NEXT:    vbroadcastss 96(%rbp), %ymm11
2214; AVX1-NEXT:    vmulps %ymm4, %ymm11, %ymm11
2215; AVX1-NEXT:    vaddps %ymm11, %ymm10, %ymm10
2216; AVX1-NEXT:    vbroadcastss 100(%rbp), %ymm11
2217; AVX1-NEXT:    vmulps %ymm5, %ymm11, %ymm11
2218; AVX1-NEXT:    vaddps %ymm11, %ymm10, %ymm10
2219; AVX1-NEXT:    vbroadcastss 104(%rbp), %ymm11
2220; AVX1-NEXT:    vmulps %ymm6, %ymm11, %ymm11
2221; AVX1-NEXT:    vaddps %ymm11, %ymm10, %ymm10
2222; AVX1-NEXT:    vbroadcastss 108(%rbp), %ymm11
2223; AVX1-NEXT:    vmulps %ymm7, %ymm11, %ymm11
2224; AVX1-NEXT:    vaddps %ymm11, %ymm10, %ymm10
2225; AVX1-NEXT:    vbroadcastss 112(%rbp), %ymm11
2226; AVX1-NEXT:    vmulps %ymm0, %ymm11, %ymm11
2227; AVX1-NEXT:    vbroadcastss 116(%rbp), %ymm12
2228; AVX1-NEXT:    vmulps %ymm1, %ymm12, %ymm12
2229; AVX1-NEXT:    vaddps %ymm12, %ymm11, %ymm11
2230; AVX1-NEXT:    vbroadcastss 120(%rbp), %ymm12
2231; AVX1-NEXT:    vmulps %ymm2, %ymm12, %ymm12
2232; AVX1-NEXT:    vaddps %ymm12, %ymm11, %ymm11
2233; AVX1-NEXT:    vbroadcastss 124(%rbp), %ymm12
2234; AVX1-NEXT:    vmulps %ymm3, %ymm12, %ymm12
2235; AVX1-NEXT:    vaddps %ymm12, %ymm11, %ymm11
2236; AVX1-NEXT:    vbroadcastss 128(%rbp), %ymm12
2237; AVX1-NEXT:    vmulps %ymm4, %ymm12, %ymm12
2238; AVX1-NEXT:    vaddps %ymm12, %ymm11, %ymm11
2239; AVX1-NEXT:    vbroadcastss 132(%rbp), %ymm12
2240; AVX1-NEXT:    vmulps %ymm5, %ymm12, %ymm12
2241; AVX1-NEXT:    vaddps %ymm12, %ymm11, %ymm11
2242; AVX1-NEXT:    vbroadcastss 136(%rbp), %ymm12
2243; AVX1-NEXT:    vmulps %ymm6, %ymm12, %ymm12
2244; AVX1-NEXT:    vaddps %ymm12, %ymm11, %ymm11
2245; AVX1-NEXT:    vbroadcastss 140(%rbp), %ymm12
2246; AVX1-NEXT:    vmulps %ymm7, %ymm12, %ymm12
2247; AVX1-NEXT:    vaddps %ymm12, %ymm11, %ymm11
2248; AVX1-NEXT:    vbroadcastss 144(%rbp), %ymm12
2249; AVX1-NEXT:    vmulps %ymm0, %ymm12, %ymm12
2250; AVX1-NEXT:    vbroadcastss 148(%rbp), %ymm13
2251; AVX1-NEXT:    vmulps %ymm1, %ymm13, %ymm13
2252; AVX1-NEXT:    vaddps %ymm13, %ymm12, %ymm12
2253; AVX1-NEXT:    vbroadcastss 152(%rbp), %ymm13
2254; AVX1-NEXT:    vmulps %ymm2, %ymm13, %ymm13
2255; AVX1-NEXT:    vaddps %ymm13, %ymm12, %ymm12
2256; AVX1-NEXT:    vbroadcastss 156(%rbp), %ymm13
2257; AVX1-NEXT:    vmulps %ymm3, %ymm13, %ymm13
2258; AVX1-NEXT:    vaddps %ymm13, %ymm12, %ymm12
2259; AVX1-NEXT:    vbroadcastss 160(%rbp), %ymm13
2260; AVX1-NEXT:    vmulps %ymm4, %ymm13, %ymm13
2261; AVX1-NEXT:    vaddps %ymm13, %ymm12, %ymm12
2262; AVX1-NEXT:    vbroadcastss 164(%rbp), %ymm13
2263; AVX1-NEXT:    vmulps %ymm5, %ymm13, %ymm13
2264; AVX1-NEXT:    vaddps %ymm13, %ymm12, %ymm12
2265; AVX1-NEXT:    vbroadcastss 168(%rbp), %ymm13
2266; AVX1-NEXT:    vmulps %ymm6, %ymm13, %ymm13
2267; AVX1-NEXT:    vaddps %ymm13, %ymm12, %ymm12
2268; AVX1-NEXT:    vbroadcastss 172(%rbp), %ymm13
2269; AVX1-NEXT:    vmulps %ymm7, %ymm13, %ymm13
2270; AVX1-NEXT:    vaddps %ymm13, %ymm12, %ymm12
2271; AVX1-NEXT:    vbroadcastss 176(%rbp), %ymm13
2272; AVX1-NEXT:    vmulps %ymm0, %ymm13, %ymm13
2273; AVX1-NEXT:    vbroadcastss 180(%rbp), %ymm14
2274; AVX1-NEXT:    vmulps %ymm1, %ymm14, %ymm14
2275; AVX1-NEXT:    vaddps %ymm14, %ymm13, %ymm13
2276; AVX1-NEXT:    vbroadcastss 184(%rbp), %ymm14
2277; AVX1-NEXT:    vmulps %ymm2, %ymm14, %ymm14
2278; AVX1-NEXT:    vaddps %ymm14, %ymm13, %ymm13
2279; AVX1-NEXT:    vbroadcastss 188(%rbp), %ymm14
2280; AVX1-NEXT:    vmulps %ymm3, %ymm14, %ymm14
2281; AVX1-NEXT:    vaddps %ymm14, %ymm13, %ymm13
2282; AVX1-NEXT:    vbroadcastss 192(%rbp), %ymm14
2283; AVX1-NEXT:    vmulps %ymm4, %ymm14, %ymm14
2284; AVX1-NEXT:    vaddps %ymm14, %ymm13, %ymm13
2285; AVX1-NEXT:    vbroadcastss 196(%rbp), %ymm14
2286; AVX1-NEXT:    vmulps %ymm5, %ymm14, %ymm14
2287; AVX1-NEXT:    vaddps %ymm14, %ymm13, %ymm13
2288; AVX1-NEXT:    vbroadcastss 200(%rbp), %ymm14
2289; AVX1-NEXT:    vmulps %ymm6, %ymm14, %ymm14
2290; AVX1-NEXT:    vaddps %ymm14, %ymm13, %ymm13
2291; AVX1-NEXT:    vbroadcastss 204(%rbp), %ymm14
2292; AVX1-NEXT:    vmulps %ymm7, %ymm14, %ymm14
2293; AVX1-NEXT:    vaddps %ymm14, %ymm13, %ymm13
2294; AVX1-NEXT:    vbroadcastss 208(%rbp), %ymm14
2295; AVX1-NEXT:    vmulps %ymm0, %ymm14, %ymm14
2296; AVX1-NEXT:    vbroadcastss 212(%rbp), %ymm15
2297; AVX1-NEXT:    vmulps %ymm1, %ymm15, %ymm15
2298; AVX1-NEXT:    vaddps %ymm15, %ymm14, %ymm14
2299; AVX1-NEXT:    vbroadcastss 216(%rbp), %ymm15
2300; AVX1-NEXT:    vmulps %ymm2, %ymm15, %ymm15
2301; AVX1-NEXT:    vaddps %ymm15, %ymm14, %ymm14
2302; AVX1-NEXT:    vbroadcastss 220(%rbp), %ymm15
2303; AVX1-NEXT:    vmulps %ymm3, %ymm15, %ymm15
2304; AVX1-NEXT:    vaddps %ymm15, %ymm14, %ymm14
2305; AVX1-NEXT:    vbroadcastss 224(%rbp), %ymm15
2306; AVX1-NEXT:    vmulps %ymm4, %ymm15, %ymm15
2307; AVX1-NEXT:    vaddps %ymm15, %ymm14, %ymm14
2308; AVX1-NEXT:    vbroadcastss 228(%rbp), %ymm15
2309; AVX1-NEXT:    vmulps %ymm5, %ymm15, %ymm15
2310; AVX1-NEXT:    vaddps %ymm15, %ymm14, %ymm14
2311; AVX1-NEXT:    vbroadcastss 232(%rbp), %ymm15
2312; AVX1-NEXT:    vmulps %ymm6, %ymm15, %ymm15
2313; AVX1-NEXT:    vaddps %ymm15, %ymm14, %ymm14
2314; AVX1-NEXT:    vbroadcastss 236(%rbp), %ymm15
2315; AVX1-NEXT:    vmulps %ymm7, %ymm15, %ymm15
2316; AVX1-NEXT:    vaddps %ymm15, %ymm14, %ymm14
2317; AVX1-NEXT:    vbroadcastss 240(%rbp), %ymm15
2318; AVX1-NEXT:    vmulps %ymm0, %ymm15, %ymm0
2319; AVX1-NEXT:    vbroadcastss 244(%rbp), %ymm15
2320; AVX1-NEXT:    vmulps %ymm1, %ymm15, %ymm1
2321; AVX1-NEXT:    vaddps %ymm1, %ymm0, %ymm0
2322; AVX1-NEXT:    vbroadcastss 248(%rbp), %ymm1
2323; AVX1-NEXT:    vmulps %ymm1, %ymm2, %ymm1
2324; AVX1-NEXT:    vaddps %ymm1, %ymm0, %ymm0
2325; AVX1-NEXT:    vbroadcastss 252(%rbp), %ymm1
2326; AVX1-NEXT:    vmulps %ymm1, %ymm3, %ymm1
2327; AVX1-NEXT:    vaddps %ymm1, %ymm0, %ymm0
2328; AVX1-NEXT:    vbroadcastss 256(%rbp), %ymm1
2329; AVX1-NEXT:    vmulps %ymm1, %ymm4, %ymm1
2330; AVX1-NEXT:    vaddps %ymm1, %ymm0, %ymm0
2331; AVX1-NEXT:    vbroadcastss 260(%rbp), %ymm1
2332; AVX1-NEXT:    vmulps %ymm1, %ymm5, %ymm1
2333; AVX1-NEXT:    vaddps %ymm1, %ymm0, %ymm0
2334; AVX1-NEXT:    vbroadcastss 264(%rbp), %ymm1
2335; AVX1-NEXT:    vmulps %ymm1, %ymm6, %ymm1
2336; AVX1-NEXT:    vaddps %ymm1, %ymm0, %ymm0
2337; AVX1-NEXT:    vbroadcastss 268(%rbp), %ymm1
2338; AVX1-NEXT:    vmulps %ymm1, %ymm7, %ymm1
2339; AVX1-NEXT:    vaddps %ymm1, %ymm0, %ymm0
2340; AVX1-NEXT:    vmovaps %ymm0, 224(%rdi)
2341; AVX1-NEXT:    vmovaps %ymm14, 192(%rdi)
2342; AVX1-NEXT:    vmovaps %ymm13, 160(%rdi)
2343; AVX1-NEXT:    vmovaps %ymm12, 128(%rdi)
2344; AVX1-NEXT:    vmovaps %ymm11, 96(%rdi)
2345; AVX1-NEXT:    vmovaps %ymm10, 64(%rdi)
2346; AVX1-NEXT:    vmovaps %ymm9, 32(%rdi)
2347; AVX1-NEXT:    vmovaps %ymm8, (%rdi)
2348; AVX1-NEXT:    movq %rbp, %rsp
2349; AVX1-NEXT:    popq %rbp
2350; AVX1-NEXT:    vzeroupper
2351; AVX1-NEXT:    retq
2352;
2353; AVX2-LABEL: test_mul8x8_f32:
2354; AVX2:       # %bb.0: # %entry
2355; AVX2-NEXT:    pushq %rbp
2356; AVX2-NEXT:    movq %rsp, %rbp
2357; AVX2-NEXT:    andq $-32, %rsp
2358; AVX2-NEXT:    subq $32, %rsp
2359; AVX2-NEXT:    movq %rdi, %rax
2360; AVX2-NEXT:    vbroadcastss 16(%rbp), %ymm8
2361; AVX2-NEXT:    vmulps %ymm0, %ymm8, %ymm8
2362; AVX2-NEXT:    vbroadcastss 20(%rbp), %ymm9
2363; AVX2-NEXT:    vmulps %ymm1, %ymm9, %ymm9
2364; AVX2-NEXT:    vaddps %ymm9, %ymm8, %ymm8
2365; AVX2-NEXT:    vbroadcastss 24(%rbp), %ymm9
2366; AVX2-NEXT:    vmulps %ymm2, %ymm9, %ymm9
2367; AVX2-NEXT:    vaddps %ymm9, %ymm8, %ymm8
2368; AVX2-NEXT:    vbroadcastss 28(%rbp), %ymm9
2369; AVX2-NEXT:    vmulps %ymm3, %ymm9, %ymm9
2370; AVX2-NEXT:    vaddps %ymm9, %ymm8, %ymm8
2371; AVX2-NEXT:    vbroadcastss 32(%rbp), %ymm9
2372; AVX2-NEXT:    vmulps %ymm4, %ymm9, %ymm9
2373; AVX2-NEXT:    vaddps %ymm9, %ymm8, %ymm8
2374; AVX2-NEXT:    vbroadcastss 36(%rbp), %ymm9
2375; AVX2-NEXT:    vmulps %ymm5, %ymm9, %ymm9
2376; AVX2-NEXT:    vaddps %ymm9, %ymm8, %ymm8
2377; AVX2-NEXT:    vbroadcastss 40(%rbp), %ymm9
2378; AVX2-NEXT:    vmulps %ymm6, %ymm9, %ymm9
2379; AVX2-NEXT:    vaddps %ymm9, %ymm8, %ymm8
2380; AVX2-NEXT:    vbroadcastss 44(%rbp), %ymm9
2381; AVX2-NEXT:    vmulps %ymm7, %ymm9, %ymm9
2382; AVX2-NEXT:    vaddps %ymm9, %ymm8, %ymm8
2383; AVX2-NEXT:    vbroadcastss 48(%rbp), %ymm9
2384; AVX2-NEXT:    vmulps %ymm0, %ymm9, %ymm9
2385; AVX2-NEXT:    vbroadcastss 52(%rbp), %ymm10
2386; AVX2-NEXT:    vmulps %ymm1, %ymm10, %ymm10
2387; AVX2-NEXT:    vaddps %ymm10, %ymm9, %ymm9
2388; AVX2-NEXT:    vbroadcastss 56(%rbp), %ymm10
2389; AVX2-NEXT:    vmulps %ymm2, %ymm10, %ymm10
2390; AVX2-NEXT:    vaddps %ymm10, %ymm9, %ymm9
2391; AVX2-NEXT:    vbroadcastss 60(%rbp), %ymm10
2392; AVX2-NEXT:    vmulps %ymm3, %ymm10, %ymm10
2393; AVX2-NEXT:    vaddps %ymm10, %ymm9, %ymm9
2394; AVX2-NEXT:    vbroadcastss 64(%rbp), %ymm10
2395; AVX2-NEXT:    vmulps %ymm4, %ymm10, %ymm10
2396; AVX2-NEXT:    vaddps %ymm10, %ymm9, %ymm9
2397; AVX2-NEXT:    vbroadcastss 68(%rbp), %ymm10
2398; AVX2-NEXT:    vmulps %ymm5, %ymm10, %ymm10
2399; AVX2-NEXT:    vaddps %ymm10, %ymm9, %ymm9
2400; AVX2-NEXT:    vbroadcastss 72(%rbp), %ymm10
2401; AVX2-NEXT:    vmulps %ymm6, %ymm10, %ymm10
2402; AVX2-NEXT:    vaddps %ymm10, %ymm9, %ymm9
2403; AVX2-NEXT:    vbroadcastss 76(%rbp), %ymm10
2404; AVX2-NEXT:    vmulps %ymm7, %ymm10, %ymm10
2405; AVX2-NEXT:    vaddps %ymm10, %ymm9, %ymm9
2406; AVX2-NEXT:    vbroadcastss 80(%rbp), %ymm10
2407; AVX2-NEXT:    vmulps %ymm0, %ymm10, %ymm10
2408; AVX2-NEXT:    vbroadcastss 84(%rbp), %ymm11
2409; AVX2-NEXT:    vmulps %ymm1, %ymm11, %ymm11
2410; AVX2-NEXT:    vaddps %ymm11, %ymm10, %ymm10
2411; AVX2-NEXT:    vbroadcastss 88(%rbp), %ymm11
2412; AVX2-NEXT:    vmulps %ymm2, %ymm11, %ymm11
2413; AVX2-NEXT:    vaddps %ymm11, %ymm10, %ymm10
2414; AVX2-NEXT:    vbroadcastss 92(%rbp), %ymm11
2415; AVX2-NEXT:    vmulps %ymm3, %ymm11, %ymm11
2416; AVX2-NEXT:    vaddps %ymm11, %ymm10, %ymm10
2417; AVX2-NEXT:    vbroadcastss 96(%rbp), %ymm11
2418; AVX2-NEXT:    vmulps %ymm4, %ymm11, %ymm11
2419; AVX2-NEXT:    vaddps %ymm11, %ymm10, %ymm10
2420; AVX2-NEXT:    vbroadcastss 100(%rbp), %ymm11
2421; AVX2-NEXT:    vmulps %ymm5, %ymm11, %ymm11
2422; AVX2-NEXT:    vaddps %ymm11, %ymm10, %ymm10
2423; AVX2-NEXT:    vbroadcastss 104(%rbp), %ymm11
2424; AVX2-NEXT:    vmulps %ymm6, %ymm11, %ymm11
2425; AVX2-NEXT:    vaddps %ymm11, %ymm10, %ymm10
2426; AVX2-NEXT:    vbroadcastss 108(%rbp), %ymm11
2427; AVX2-NEXT:    vmulps %ymm7, %ymm11, %ymm11
2428; AVX2-NEXT:    vaddps %ymm11, %ymm10, %ymm10
2429; AVX2-NEXT:    vbroadcastss 112(%rbp), %ymm11
2430; AVX2-NEXT:    vmulps %ymm0, %ymm11, %ymm11
2431; AVX2-NEXT:    vbroadcastss 116(%rbp), %ymm12
2432; AVX2-NEXT:    vmulps %ymm1, %ymm12, %ymm12
2433; AVX2-NEXT:    vaddps %ymm12, %ymm11, %ymm11
2434; AVX2-NEXT:    vbroadcastss 120(%rbp), %ymm12
2435; AVX2-NEXT:    vmulps %ymm2, %ymm12, %ymm12
2436; AVX2-NEXT:    vaddps %ymm12, %ymm11, %ymm11
2437; AVX2-NEXT:    vbroadcastss 124(%rbp), %ymm12
2438; AVX2-NEXT:    vmulps %ymm3, %ymm12, %ymm12
2439; AVX2-NEXT:    vaddps %ymm12, %ymm11, %ymm11
2440; AVX2-NEXT:    vbroadcastss 128(%rbp), %ymm12
2441; AVX2-NEXT:    vmulps %ymm4, %ymm12, %ymm12
2442; AVX2-NEXT:    vaddps %ymm12, %ymm11, %ymm11
2443; AVX2-NEXT:    vbroadcastss 132(%rbp), %ymm12
2444; AVX2-NEXT:    vmulps %ymm5, %ymm12, %ymm12
2445; AVX2-NEXT:    vaddps %ymm12, %ymm11, %ymm11
2446; AVX2-NEXT:    vbroadcastss 136(%rbp), %ymm12
2447; AVX2-NEXT:    vmulps %ymm6, %ymm12, %ymm12
2448; AVX2-NEXT:    vaddps %ymm12, %ymm11, %ymm11
2449; AVX2-NEXT:    vbroadcastss 140(%rbp), %ymm12
2450; AVX2-NEXT:    vmulps %ymm7, %ymm12, %ymm12
2451; AVX2-NEXT:    vaddps %ymm12, %ymm11, %ymm11
2452; AVX2-NEXT:    vbroadcastss 144(%rbp), %ymm12
2453; AVX2-NEXT:    vmulps %ymm0, %ymm12, %ymm12
2454; AVX2-NEXT:    vbroadcastss 148(%rbp), %ymm13
2455; AVX2-NEXT:    vmulps %ymm1, %ymm13, %ymm13
2456; AVX2-NEXT:    vaddps %ymm13, %ymm12, %ymm12
2457; AVX2-NEXT:    vbroadcastss 152(%rbp), %ymm13
2458; AVX2-NEXT:    vmulps %ymm2, %ymm13, %ymm13
2459; AVX2-NEXT:    vaddps %ymm13, %ymm12, %ymm12
2460; AVX2-NEXT:    vbroadcastss 156(%rbp), %ymm13
2461; AVX2-NEXT:    vmulps %ymm3, %ymm13, %ymm13
2462; AVX2-NEXT:    vaddps %ymm13, %ymm12, %ymm12
2463; AVX2-NEXT:    vbroadcastss 160(%rbp), %ymm13
2464; AVX2-NEXT:    vmulps %ymm4, %ymm13, %ymm13
2465; AVX2-NEXT:    vaddps %ymm13, %ymm12, %ymm12
2466; AVX2-NEXT:    vbroadcastss 164(%rbp), %ymm13
2467; AVX2-NEXT:    vmulps %ymm5, %ymm13, %ymm13
2468; AVX2-NEXT:    vaddps %ymm13, %ymm12, %ymm12
2469; AVX2-NEXT:    vbroadcastss 168(%rbp), %ymm13
2470; AVX2-NEXT:    vmulps %ymm6, %ymm13, %ymm13
2471; AVX2-NEXT:    vaddps %ymm13, %ymm12, %ymm12
2472; AVX2-NEXT:    vbroadcastss 172(%rbp), %ymm13
2473; AVX2-NEXT:    vmulps %ymm7, %ymm13, %ymm13
2474; AVX2-NEXT:    vaddps %ymm13, %ymm12, %ymm12
2475; AVX2-NEXT:    vbroadcastss 176(%rbp), %ymm13
2476; AVX2-NEXT:    vmulps %ymm0, %ymm13, %ymm13
2477; AVX2-NEXT:    vbroadcastss 180(%rbp), %ymm14
2478; AVX2-NEXT:    vmulps %ymm1, %ymm14, %ymm14
2479; AVX2-NEXT:    vaddps %ymm14, %ymm13, %ymm13
2480; AVX2-NEXT:    vbroadcastss 184(%rbp), %ymm14
2481; AVX2-NEXT:    vmulps %ymm2, %ymm14, %ymm14
2482; AVX2-NEXT:    vaddps %ymm14, %ymm13, %ymm13
2483; AVX2-NEXT:    vbroadcastss 188(%rbp), %ymm14
2484; AVX2-NEXT:    vmulps %ymm3, %ymm14, %ymm14
2485; AVX2-NEXT:    vaddps %ymm14, %ymm13, %ymm13
2486; AVX2-NEXT:    vbroadcastss 192(%rbp), %ymm14
2487; AVX2-NEXT:    vmulps %ymm4, %ymm14, %ymm14
2488; AVX2-NEXT:    vaddps %ymm14, %ymm13, %ymm13
2489; AVX2-NEXT:    vbroadcastss 196(%rbp), %ymm14
2490; AVX2-NEXT:    vmulps %ymm5, %ymm14, %ymm14
2491; AVX2-NEXT:    vaddps %ymm14, %ymm13, %ymm13
2492; AVX2-NEXT:    vbroadcastss 200(%rbp), %ymm14
2493; AVX2-NEXT:    vmulps %ymm6, %ymm14, %ymm14
2494; AVX2-NEXT:    vaddps %ymm14, %ymm13, %ymm13
2495; AVX2-NEXT:    vbroadcastss 204(%rbp), %ymm14
2496; AVX2-NEXT:    vmulps %ymm7, %ymm14, %ymm14
2497; AVX2-NEXT:    vaddps %ymm14, %ymm13, %ymm13
2498; AVX2-NEXT:    vbroadcastss 208(%rbp), %ymm14
2499; AVX2-NEXT:    vmulps %ymm0, %ymm14, %ymm14
2500; AVX2-NEXT:    vbroadcastss 212(%rbp), %ymm15
2501; AVX2-NEXT:    vmulps %ymm1, %ymm15, %ymm15
2502; AVX2-NEXT:    vaddps %ymm15, %ymm14, %ymm14
2503; AVX2-NEXT:    vbroadcastss 216(%rbp), %ymm15
2504; AVX2-NEXT:    vmulps %ymm2, %ymm15, %ymm15
2505; AVX2-NEXT:    vaddps %ymm15, %ymm14, %ymm14
2506; AVX2-NEXT:    vbroadcastss 220(%rbp), %ymm15
2507; AVX2-NEXT:    vmulps %ymm3, %ymm15, %ymm15
2508; AVX2-NEXT:    vaddps %ymm15, %ymm14, %ymm14
2509; AVX2-NEXT:    vbroadcastss 224(%rbp), %ymm15
2510; AVX2-NEXT:    vmulps %ymm4, %ymm15, %ymm15
2511; AVX2-NEXT:    vaddps %ymm15, %ymm14, %ymm14
2512; AVX2-NEXT:    vbroadcastss 228(%rbp), %ymm15
2513; AVX2-NEXT:    vmulps %ymm5, %ymm15, %ymm15
2514; AVX2-NEXT:    vaddps %ymm15, %ymm14, %ymm14
2515; AVX2-NEXT:    vbroadcastss 232(%rbp), %ymm15
2516; AVX2-NEXT:    vmulps %ymm6, %ymm15, %ymm15
2517; AVX2-NEXT:    vaddps %ymm15, %ymm14, %ymm14
2518; AVX2-NEXT:    vbroadcastss 236(%rbp), %ymm15
2519; AVX2-NEXT:    vmulps %ymm7, %ymm15, %ymm15
2520; AVX2-NEXT:    vaddps %ymm15, %ymm14, %ymm14
2521; AVX2-NEXT:    vbroadcastss 240(%rbp), %ymm15
2522; AVX2-NEXT:    vmulps %ymm0, %ymm15, %ymm0
2523; AVX2-NEXT:    vbroadcastss 244(%rbp), %ymm15
2524; AVX2-NEXT:    vmulps %ymm1, %ymm15, %ymm1
2525; AVX2-NEXT:    vaddps %ymm1, %ymm0, %ymm0
2526; AVX2-NEXT:    vbroadcastss 248(%rbp), %ymm1
2527; AVX2-NEXT:    vmulps %ymm1, %ymm2, %ymm1
2528; AVX2-NEXT:    vaddps %ymm1, %ymm0, %ymm0
2529; AVX2-NEXT:    vbroadcastss 252(%rbp), %ymm1
2530; AVX2-NEXT:    vmulps %ymm1, %ymm3, %ymm1
2531; AVX2-NEXT:    vaddps %ymm1, %ymm0, %ymm0
2532; AVX2-NEXT:    vbroadcastss 256(%rbp), %ymm1
2533; AVX2-NEXT:    vmulps %ymm1, %ymm4, %ymm1
2534; AVX2-NEXT:    vaddps %ymm1, %ymm0, %ymm0
2535; AVX2-NEXT:    vbroadcastss 260(%rbp), %ymm1
2536; AVX2-NEXT:    vmulps %ymm1, %ymm5, %ymm1
2537; AVX2-NEXT:    vaddps %ymm1, %ymm0, %ymm0
2538; AVX2-NEXT:    vbroadcastss 264(%rbp), %ymm1
2539; AVX2-NEXT:    vmulps %ymm1, %ymm6, %ymm1
2540; AVX2-NEXT:    vaddps %ymm1, %ymm0, %ymm0
2541; AVX2-NEXT:    vbroadcastss 268(%rbp), %ymm1
2542; AVX2-NEXT:    vmulps %ymm1, %ymm7, %ymm1
2543; AVX2-NEXT:    vaddps %ymm1, %ymm0, %ymm0
2544; AVX2-NEXT:    vmovaps %ymm0, 224(%rdi)
2545; AVX2-NEXT:    vmovaps %ymm14, 192(%rdi)
2546; AVX2-NEXT:    vmovaps %ymm13, 160(%rdi)
2547; AVX2-NEXT:    vmovaps %ymm12, 128(%rdi)
2548; AVX2-NEXT:    vmovaps %ymm11, 96(%rdi)
2549; AVX2-NEXT:    vmovaps %ymm10, 64(%rdi)
2550; AVX2-NEXT:    vmovaps %ymm9, 32(%rdi)
2551; AVX2-NEXT:    vmovaps %ymm8, (%rdi)
2552; AVX2-NEXT:    movq %rbp, %rsp
2553; AVX2-NEXT:    popq %rbp
2554; AVX2-NEXT:    vzeroupper
2555; AVX2-NEXT:    retq
2556;
2557; AVX512F-LABEL: test_mul8x8_f32:
2558; AVX512F:       # %bb.0: # %entry
2559; AVX512F-NEXT:    vextractf64x4 $1, %zmm0, %ymm11
2560; AVX512F-NEXT:    vextractf64x4 $1, %zmm1, %ymm10
2561; AVX512F-NEXT:    vextractf64x4 $1, %zmm2, %ymm9
2562; AVX512F-NEXT:    vextractf64x4 $1, %zmm3, %ymm8
2563; AVX512F-NEXT:    vbroadcastss %xmm4, %ymm12
2564; AVX512F-NEXT:    vmulps %ymm0, %ymm12, %ymm12
2565; AVX512F-NEXT:    vmovshdup {{.*#+}} xmm13 = xmm4[1,1,3,3]
2566; AVX512F-NEXT:    vbroadcastss %xmm13, %ymm13
2567; AVX512F-NEXT:    vmulps %ymm13, %ymm11, %ymm13
2568; AVX512F-NEXT:    vaddps %ymm13, %ymm12, %ymm12
2569; AVX512F-NEXT:    vshufpd {{.*#+}} xmm13 = xmm4[1,0]
2570; AVX512F-NEXT:    vbroadcastss %xmm13, %ymm13
2571; AVX512F-NEXT:    vmulps %ymm1, %ymm13, %ymm13
2572; AVX512F-NEXT:    vaddps %ymm13, %ymm12, %ymm12
2573; AVX512F-NEXT:    vshufps {{.*#+}} xmm13 = xmm4[3,3,3,3]
2574; AVX512F-NEXT:    vbroadcastss %xmm13, %ymm13
2575; AVX512F-NEXT:    vmulps %ymm13, %ymm10, %ymm13
2576; AVX512F-NEXT:    vaddps %ymm13, %ymm12, %ymm12
2577; AVX512F-NEXT:    vextractf128 $1, %ymm4, %xmm13
2578; AVX512F-NEXT:    vbroadcastss %xmm13, %ymm13
2579; AVX512F-NEXT:    vmulps %ymm2, %ymm13, %ymm13
2580; AVX512F-NEXT:    vaddps %ymm13, %ymm12, %ymm12
2581; AVX512F-NEXT:    vmovshdup {{.*#+}} ymm13 = ymm4[1,1,3,3,5,5,7,7]
2582; AVX512F-NEXT:    vpermpd {{.*#+}} ymm13 = ymm13[2,2,2,2]
2583; AVX512F-NEXT:    vmulps %ymm13, %ymm9, %ymm13
2584; AVX512F-NEXT:    vaddps %ymm13, %ymm12, %ymm12
2585; AVX512F-NEXT:    vshufps {{.*#+}} ymm13 = ymm4[2,2,2,2,6,6,6,6]
2586; AVX512F-NEXT:    vpermpd {{.*#+}} ymm13 = ymm13[2,2,2,2]
2587; AVX512F-NEXT:    vmulps %ymm3, %ymm13, %ymm13
2588; AVX512F-NEXT:    vaddps %ymm13, %ymm12, %ymm12
2589; AVX512F-NEXT:    vshufps {{.*#+}} ymm13 = ymm4[3,3,3,3,7,7,7,7]
2590; AVX512F-NEXT:    vpermpd {{.*#+}} ymm13 = ymm13[2,2,2,2]
2591; AVX512F-NEXT:    vmulps %ymm13, %ymm8, %ymm13
2592; AVX512F-NEXT:    vaddps %ymm13, %ymm12, %ymm12
2593; AVX512F-NEXT:    vmovups %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
2594; AVX512F-NEXT:    vextractf64x4 $1, %zmm4, %ymm13
2595; AVX512F-NEXT:    vextractf32x4 $2, %zmm4, %xmm14
2596; AVX512F-NEXT:    vbroadcastss %xmm14, %ymm14
2597; AVX512F-NEXT:    vmulps %ymm0, %ymm14, %ymm14
2598; AVX512F-NEXT:    vmovshdup {{.*#+}} xmm15 = xmm13[1,1,3,3]
2599; AVX512F-NEXT:    vbroadcastsd %xmm15, %ymm15
2600; AVX512F-NEXT:    vmulps %ymm15, %ymm11, %ymm15
2601; AVX512F-NEXT:    vaddps %ymm15, %ymm14, %ymm14
2602; AVX512F-NEXT:    vshufps {{.*#+}} xmm15 = xmm13[2,2,2,2]
2603; AVX512F-NEXT:    vbroadcastsd %xmm15, %ymm15
2604; AVX512F-NEXT:    vmulps %ymm1, %ymm15, %ymm15
2605; AVX512F-NEXT:    vaddps %ymm15, %ymm14, %ymm14
2606; AVX512F-NEXT:    vshufps {{.*#+}} xmm15 = xmm13[3,3,3,3]
2607; AVX512F-NEXT:    vbroadcastsd %xmm15, %ymm15
2608; AVX512F-NEXT:    vmulps %ymm15, %ymm10, %ymm15
2609; AVX512F-NEXT:    vaddps %ymm15, %ymm14, %ymm14
2610; AVX512F-NEXT:    vextractf32x4 $3, %zmm4, %xmm4
2611; AVX512F-NEXT:    vbroadcastss %xmm4, %ymm4
2612; AVX512F-NEXT:    vmulps %ymm4, %ymm2, %ymm4
2613; AVX512F-NEXT:    vaddps %ymm4, %ymm14, %ymm4
2614; AVX512F-NEXT:    vmovshdup {{.*#+}} ymm14 = ymm13[1,1,3,3,5,5,7,7]
2615; AVX512F-NEXT:    vpermpd {{.*#+}} ymm14 = ymm14[2,2,2,2]
2616; AVX512F-NEXT:    vmulps %ymm14, %ymm9, %ymm14
2617; AVX512F-NEXT:    vaddps %ymm4, %ymm14, %ymm4
2618; AVX512F-NEXT:    vshufps {{.*#+}} ymm14 = ymm13[2,2,2,2,6,6,6,6]
2619; AVX512F-NEXT:    vpermpd {{.*#+}} ymm14 = ymm14[2,2,2,2]
2620; AVX512F-NEXT:    vmulps %ymm3, %ymm14, %ymm14
2621; AVX512F-NEXT:    vaddps %ymm4, %ymm14, %ymm4
2622; AVX512F-NEXT:    vshufps {{.*#+}} ymm13 = ymm13[3,3,3,3,7,7,7,7]
2623; AVX512F-NEXT:    vpermpd {{.*#+}} ymm13 = ymm13[2,2,2,2]
2624; AVX512F-NEXT:    vmulps %ymm13, %ymm8, %ymm13
2625; AVX512F-NEXT:    vaddps %ymm4, %ymm13, %ymm4
2626; AVX512F-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2627; AVX512F-NEXT:    vbroadcastss %xmm5, %ymm13
2628; AVX512F-NEXT:    vmulps %ymm0, %ymm13, %ymm13
2629; AVX512F-NEXT:    vmovshdup {{.*#+}} xmm14 = xmm5[1,1,3,3]
2630; AVX512F-NEXT:    vbroadcastss %xmm14, %ymm14
2631; AVX512F-NEXT:    vmulps %ymm14, %ymm11, %ymm14
2632; AVX512F-NEXT:    vaddps %ymm14, %ymm13, %ymm13
2633; AVX512F-NEXT:    vshufpd {{.*#+}} xmm14 = xmm5[1,0]
2634; AVX512F-NEXT:    vbroadcastss %xmm14, %ymm14
2635; AVX512F-NEXT:    vmulps %ymm1, %ymm14, %ymm14
2636; AVX512F-NEXT:    vaddps %ymm14, %ymm13, %ymm13
2637; AVX512F-NEXT:    vshufps {{.*#+}} xmm14 = xmm5[3,3,3,3]
2638; AVX512F-NEXT:    vbroadcastss %xmm14, %ymm14
2639; AVX512F-NEXT:    vmulps %ymm14, %ymm10, %ymm14
2640; AVX512F-NEXT:    vaddps %ymm14, %ymm13, %ymm13
2641; AVX512F-NEXT:    vextractf128 $1, %ymm5, %xmm14
2642; AVX512F-NEXT:    vbroadcastss %xmm14, %ymm14
2643; AVX512F-NEXT:    vmulps %ymm2, %ymm14, %ymm14
2644; AVX512F-NEXT:    vaddps %ymm14, %ymm13, %ymm13
2645; AVX512F-NEXT:    vmovshdup {{.*#+}} ymm14 = ymm5[1,1,3,3,5,5,7,7]
2646; AVX512F-NEXT:    vpermpd {{.*#+}} ymm14 = ymm14[2,2,2,2]
2647; AVX512F-NEXT:    vmulps %ymm14, %ymm9, %ymm14
2648; AVX512F-NEXT:    vaddps %ymm14, %ymm13, %ymm13
2649; AVX512F-NEXT:    vshufps {{.*#+}} ymm14 = ymm5[2,2,2,2,6,6,6,6]
2650; AVX512F-NEXT:    vpermpd {{.*#+}} ymm14 = ymm14[2,2,2,2]
2651; AVX512F-NEXT:    vmulps %ymm3, %ymm14, %ymm14
2652; AVX512F-NEXT:    vaddps %ymm14, %ymm13, %ymm13
2653; AVX512F-NEXT:    vshufps {{.*#+}} ymm14 = ymm5[3,3,3,3,7,7,7,7]
2654; AVX512F-NEXT:    vpermpd {{.*#+}} ymm14 = ymm14[2,2,2,2]
2655; AVX512F-NEXT:    vmulps %ymm14, %ymm8, %ymm14
2656; AVX512F-NEXT:    vaddps %ymm14, %ymm13, %ymm13
2657; AVX512F-NEXT:    vextractf64x4 $1, %zmm5, %ymm14
2658; AVX512F-NEXT:    vextractf32x4 $2, %zmm5, %xmm15
2659; AVX512F-NEXT:    vbroadcastss %xmm15, %ymm15
2660; AVX512F-NEXT:    vmulps %ymm0, %ymm15, %ymm15
2661; AVX512F-NEXT:    vmovshdup {{.*#+}} xmm12 = xmm14[1,1,3,3]
2662; AVX512F-NEXT:    vbroadcastsd %xmm12, %ymm12
2663; AVX512F-NEXT:    vmulps %ymm12, %ymm11, %ymm12
2664; AVX512F-NEXT:    vaddps %ymm12, %ymm15, %ymm12
2665; AVX512F-NEXT:    vshufps {{.*#+}} xmm15 = xmm14[2,2,2,2]
2666; AVX512F-NEXT:    vbroadcastsd %xmm15, %ymm15
2667; AVX512F-NEXT:    vmulps %ymm1, %ymm15, %ymm15
2668; AVX512F-NEXT:    vaddps %ymm15, %ymm12, %ymm12
2669; AVX512F-NEXT:    vshufps {{.*#+}} xmm15 = xmm14[3,3,3,3]
2670; AVX512F-NEXT:    vbroadcastsd %xmm15, %ymm15
2671; AVX512F-NEXT:    vmulps %ymm15, %ymm10, %ymm15
2672; AVX512F-NEXT:    vaddps %ymm15, %ymm12, %ymm12
2673; AVX512F-NEXT:    vextractf32x4 $3, %zmm5, %xmm5
2674; AVX512F-NEXT:    vbroadcastss %xmm5, %ymm5
2675; AVX512F-NEXT:    vmulps %ymm5, %ymm2, %ymm5
2676; AVX512F-NEXT:    vaddps %ymm5, %ymm12, %ymm5
2677; AVX512F-NEXT:    vmovshdup {{.*#+}} ymm12 = ymm14[1,1,3,3,5,5,7,7]
2678; AVX512F-NEXT:    vpermpd {{.*#+}} ymm12 = ymm12[2,2,2,2]
2679; AVX512F-NEXT:    vmulps %ymm12, %ymm9, %ymm12
2680; AVX512F-NEXT:    vaddps %ymm5, %ymm12, %ymm5
2681; AVX512F-NEXT:    vshufps {{.*#+}} ymm12 = ymm14[2,2,2,2,6,6,6,6]
2682; AVX512F-NEXT:    vpermpd {{.*#+}} ymm12 = ymm12[2,2,2,2]
2683; AVX512F-NEXT:    vmulps %ymm3, %ymm12, %ymm12
2684; AVX512F-NEXT:    vaddps %ymm5, %ymm12, %ymm5
2685; AVX512F-NEXT:    vshufps {{.*#+}} ymm12 = ymm14[3,3,3,3,7,7,7,7]
2686; AVX512F-NEXT:    vpermpd {{.*#+}} ymm12 = ymm12[2,2,2,2]
2687; AVX512F-NEXT:    vmulps %ymm12, %ymm8, %ymm12
2688; AVX512F-NEXT:    vaddps %ymm5, %ymm12, %ymm5
2689; AVX512F-NEXT:    vbroadcastss %xmm6, %ymm12
2690; AVX512F-NEXT:    vmulps %ymm0, %ymm12, %ymm12
2691; AVX512F-NEXT:    vmovshdup {{.*#+}} xmm14 = xmm6[1,1,3,3]
2692; AVX512F-NEXT:    vbroadcastss %xmm14, %ymm14
2693; AVX512F-NEXT:    vmulps %ymm14, %ymm11, %ymm14
2694; AVX512F-NEXT:    vaddps %ymm14, %ymm12, %ymm12
2695; AVX512F-NEXT:    vshufpd {{.*#+}} xmm14 = xmm6[1,0]
2696; AVX512F-NEXT:    vbroadcastss %xmm14, %ymm14
2697; AVX512F-NEXT:    vmulps %ymm1, %ymm14, %ymm14
2698; AVX512F-NEXT:    vaddps %ymm14, %ymm12, %ymm12
2699; AVX512F-NEXT:    vshufps {{.*#+}} xmm14 = xmm6[3,3,3,3]
2700; AVX512F-NEXT:    vbroadcastss %xmm14, %ymm14
2701; AVX512F-NEXT:    vmulps %ymm14, %ymm10, %ymm14
2702; AVX512F-NEXT:    vaddps %ymm14, %ymm12, %ymm12
2703; AVX512F-NEXT:    vextractf128 $1, %ymm6, %xmm14
2704; AVX512F-NEXT:    vbroadcastss %xmm14, %ymm14
2705; AVX512F-NEXT:    vmulps %ymm2, %ymm14, %ymm14
2706; AVX512F-NEXT:    vaddps %ymm14, %ymm12, %ymm12
2707; AVX512F-NEXT:    vmovshdup {{.*#+}} ymm14 = ymm6[1,1,3,3,5,5,7,7]
2708; AVX512F-NEXT:    vpermpd {{.*#+}} ymm14 = ymm14[2,2,2,2]
2709; AVX512F-NEXT:    vmulps %ymm14, %ymm9, %ymm14
2710; AVX512F-NEXT:    vaddps %ymm14, %ymm12, %ymm12
2711; AVX512F-NEXT:    vshufps {{.*#+}} ymm14 = ymm6[2,2,2,2,6,6,6,6]
2712; AVX512F-NEXT:    vpermpd {{.*#+}} ymm14 = ymm14[2,2,2,2]
2713; AVX512F-NEXT:    vmulps %ymm3, %ymm14, %ymm14
2714; AVX512F-NEXT:    vaddps %ymm14, %ymm12, %ymm12
2715; AVX512F-NEXT:    vshufps {{.*#+}} ymm14 = ymm6[3,3,3,3,7,7,7,7]
2716; AVX512F-NEXT:    vpermpd {{.*#+}} ymm14 = ymm14[2,2,2,2]
2717; AVX512F-NEXT:    vmulps %ymm14, %ymm8, %ymm14
2718; AVX512F-NEXT:    vaddps %ymm14, %ymm12, %ymm14
2719; AVX512F-NEXT:    vextractf32x4 $2, %zmm6, %xmm12
2720; AVX512F-NEXT:    vbroadcastss %xmm12, %ymm12
2721; AVX512F-NEXT:    vmulps %ymm0, %ymm12, %ymm12
2722; AVX512F-NEXT:    vextractf64x4 $1, %zmm6, %ymm15
2723; AVX512F-NEXT:    vmovshdup {{.*#+}} xmm4 = xmm15[1,1,3,3]
2724; AVX512F-NEXT:    vbroadcastsd %xmm4, %ymm4
2725; AVX512F-NEXT:    vmulps %ymm4, %ymm11, %ymm4
2726; AVX512F-NEXT:    vaddps %ymm4, %ymm12, %ymm4
2727; AVX512F-NEXT:    vshufps {{.*#+}} xmm12 = xmm15[2,2,2,2]
2728; AVX512F-NEXT:    vbroadcastsd %xmm12, %ymm12
2729; AVX512F-NEXT:    vmulps %ymm1, %ymm12, %ymm12
2730; AVX512F-NEXT:    vaddps %ymm4, %ymm12, %ymm4
2731; AVX512F-NEXT:    vshufps {{.*#+}} xmm12 = xmm15[3,3,3,3]
2732; AVX512F-NEXT:    vbroadcastsd %xmm12, %ymm12
2733; AVX512F-NEXT:    vmulps %ymm12, %ymm10, %ymm12
2734; AVX512F-NEXT:    vaddps %ymm4, %ymm12, %ymm4
2735; AVX512F-NEXT:    vextractf32x4 $3, %zmm6, %xmm6
2736; AVX512F-NEXT:    vbroadcastss %xmm6, %ymm6
2737; AVX512F-NEXT:    vmulps %ymm6, %ymm2, %ymm6
2738; AVX512F-NEXT:    vaddps %ymm6, %ymm4, %ymm4
2739; AVX512F-NEXT:    vmovshdup {{.*#+}} ymm6 = ymm15[1,1,3,3,5,5,7,7]
2740; AVX512F-NEXT:    vpermpd {{.*#+}} ymm6 = ymm6[2,2,2,2]
2741; AVX512F-NEXT:    vmulps %ymm6, %ymm9, %ymm6
2742; AVX512F-NEXT:    vaddps %ymm6, %ymm4, %ymm4
2743; AVX512F-NEXT:    vshufps {{.*#+}} ymm6 = ymm15[2,2,2,2,6,6,6,6]
2744; AVX512F-NEXT:    vpermpd {{.*#+}} ymm6 = ymm6[2,2,2,2]
2745; AVX512F-NEXT:    vmulps %ymm6, %ymm3, %ymm6
2746; AVX512F-NEXT:    vaddps %ymm6, %ymm4, %ymm4
2747; AVX512F-NEXT:    vshufps {{.*#+}} ymm6 = ymm15[3,3,3,3,7,7,7,7]
2748; AVX512F-NEXT:    vpermpd {{.*#+}} ymm6 = ymm6[2,2,2,2]
2749; AVX512F-NEXT:    vmulps %ymm6, %ymm8, %ymm6
2750; AVX512F-NEXT:    vaddps %ymm6, %ymm4, %ymm6
2751; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
2752; AVX512F-NEXT:    vinsertf64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 32-byte Folded Reload
2753; AVX512F-NEXT:    vbroadcastss %xmm7, %ymm12
2754; AVX512F-NEXT:    vmulps %ymm0, %ymm12, %ymm12
2755; AVX512F-NEXT:    vmovshdup {{.*#+}} xmm15 = xmm7[1,1,3,3]
2756; AVX512F-NEXT:    vbroadcastss %xmm15, %ymm15
2757; AVX512F-NEXT:    vmulps %ymm15, %ymm11, %ymm15
2758; AVX512F-NEXT:    vaddps %ymm15, %ymm12, %ymm12
2759; AVX512F-NEXT:    vshufpd {{.*#+}} xmm15 = xmm7[1,0]
2760; AVX512F-NEXT:    vbroadcastss %xmm15, %ymm15
2761; AVX512F-NEXT:    vmulps %ymm1, %ymm15, %ymm15
2762; AVX512F-NEXT:    vaddps %ymm15, %ymm12, %ymm12
2763; AVX512F-NEXT:    vshufps {{.*#+}} xmm15 = xmm7[3,3,3,3]
2764; AVX512F-NEXT:    vbroadcastss %xmm15, %ymm15
2765; AVX512F-NEXT:    vmulps %ymm15, %ymm10, %ymm15
2766; AVX512F-NEXT:    vaddps %ymm15, %ymm12, %ymm12
2767; AVX512F-NEXT:    vextractf128 $1, %ymm7, %xmm15
2768; AVX512F-NEXT:    vbroadcastss %xmm15, %ymm15
2769; AVX512F-NEXT:    vmulps %ymm2, %ymm15, %ymm15
2770; AVX512F-NEXT:    vaddps %ymm15, %ymm12, %ymm12
2771; AVX512F-NEXT:    vmovshdup {{.*#+}} ymm15 = ymm7[1,1,3,3,5,5,7,7]
2772; AVX512F-NEXT:    vpermpd {{.*#+}} ymm15 = ymm15[2,2,2,2]
2773; AVX512F-NEXT:    vmulps %ymm15, %ymm9, %ymm15
2774; AVX512F-NEXT:    vaddps %ymm15, %ymm12, %ymm12
2775; AVX512F-NEXT:    vshufps {{.*#+}} ymm15 = ymm7[2,2,2,2,6,6,6,6]
2776; AVX512F-NEXT:    vpermpd {{.*#+}} ymm15 = ymm15[2,2,2,2]
2777; AVX512F-NEXT:    vmulps %ymm3, %ymm15, %ymm15
2778; AVX512F-NEXT:    vaddps %ymm15, %ymm12, %ymm12
2779; AVX512F-NEXT:    vshufps {{.*#+}} ymm15 = ymm7[3,3,3,3,7,7,7,7]
2780; AVX512F-NEXT:    vpermpd {{.*#+}} ymm15 = ymm15[2,2,2,2]
2781; AVX512F-NEXT:    vmulps %ymm15, %ymm8, %ymm15
2782; AVX512F-NEXT:    vaddps %ymm15, %ymm12, %ymm12
2783; AVX512F-NEXT:    vinsertf64x4 $1, %ymm5, %zmm13, %zmm5
2784; AVX512F-NEXT:    vextractf64x4 $1, %zmm7, %ymm13
2785; AVX512F-NEXT:    vextractf32x4 $2, %zmm7, %xmm15
2786; AVX512F-NEXT:    vbroadcastss %xmm15, %ymm15
2787; AVX512F-NEXT:    vmulps %ymm0, %ymm15, %ymm0
2788; AVX512F-NEXT:    vmovshdup {{.*#+}} xmm15 = xmm13[1,1,3,3]
2789; AVX512F-NEXT:    vbroadcastsd %xmm15, %ymm15
2790; AVX512F-NEXT:    vmulps %ymm15, %ymm11, %ymm11
2791; AVX512F-NEXT:    vaddps %ymm0, %ymm11, %ymm0
2792; AVX512F-NEXT:    vshufps {{.*#+}} xmm11 = xmm13[2,2,2,2]
2793; AVX512F-NEXT:    vbroadcastsd %xmm11, %ymm11
2794; AVX512F-NEXT:    vmulps %ymm1, %ymm11, %ymm1
2795; AVX512F-NEXT:    vaddps %ymm1, %ymm0, %ymm0
2796; AVX512F-NEXT:    vshufps {{.*#+}} xmm1 = xmm13[3,3,3,3]
2797; AVX512F-NEXT:    vbroadcastsd %xmm1, %ymm1
2798; AVX512F-NEXT:    vmulps %ymm1, %ymm10, %ymm1
2799; AVX512F-NEXT:    vaddps %ymm1, %ymm0, %ymm0
2800; AVX512F-NEXT:    vextractf32x4 $3, %zmm7, %xmm1
2801; AVX512F-NEXT:    vbroadcastss %xmm1, %ymm1
2802; AVX512F-NEXT:    vmulps %ymm1, %ymm2, %ymm1
2803; AVX512F-NEXT:    vaddps %ymm1, %ymm0, %ymm0
2804; AVX512F-NEXT:    vmovshdup {{.*#+}} ymm1 = ymm13[1,1,3,3,5,5,7,7]
2805; AVX512F-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,2]
2806; AVX512F-NEXT:    vmulps %ymm1, %ymm9, %ymm1
2807; AVX512F-NEXT:    vaddps %ymm1, %ymm0, %ymm0
2808; AVX512F-NEXT:    vshufps {{.*#+}} ymm1 = ymm13[2,2,2,2,6,6,6,6]
2809; AVX512F-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,2]
2810; AVX512F-NEXT:    vmulps %ymm1, %ymm3, %ymm1
2811; AVX512F-NEXT:    vaddps %ymm1, %ymm0, %ymm0
2812; AVX512F-NEXT:    vshufps {{.*#+}} ymm1 = ymm13[3,3,3,3,7,7,7,7]
2813; AVX512F-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,2]
2814; AVX512F-NEXT:    vmulps %ymm1, %ymm8, %ymm1
2815; AVX512F-NEXT:    vaddps %ymm1, %ymm0, %ymm0
2816; AVX512F-NEXT:    vinsertf64x4 $1, %ymm6, %zmm14, %zmm2
2817; AVX512F-NEXT:    vinsertf64x4 $1, %ymm0, %zmm12, %zmm3
2818; AVX512F-NEXT:    vmovaps %zmm4, %zmm0
2819; AVX512F-NEXT:    vmovaps %zmm5, %zmm1
2820; AVX512F-NEXT:    retq
2821;
2822; AVX512VL-LABEL: test_mul8x8_f32:
2823; AVX512VL:       # %bb.0: # %entry
2824; AVX512VL-NEXT:    vextractf64x4 $1, %zmm0, %ymm11
2825; AVX512VL-NEXT:    vextractf64x4 $1, %zmm1, %ymm10
2826; AVX512VL-NEXT:    vextractf64x4 $1, %zmm2, %ymm9
2827; AVX512VL-NEXT:    vextractf64x4 $1, %zmm3, %ymm8
2828; AVX512VL-NEXT:    vbroadcastss %xmm4, %ymm12
2829; AVX512VL-NEXT:    vmulps %ymm0, %ymm12, %ymm12
2830; AVX512VL-NEXT:    vmovshdup {{.*#+}} xmm13 = xmm4[1,1,3,3]
2831; AVX512VL-NEXT:    vbroadcastss %xmm13, %ymm13
2832; AVX512VL-NEXT:    vmulps %ymm13, %ymm11, %ymm13
2833; AVX512VL-NEXT:    vaddps %ymm13, %ymm12, %ymm12
2834; AVX512VL-NEXT:    vshufpd {{.*#+}} xmm13 = xmm4[1,0]
2835; AVX512VL-NEXT:    vbroadcastss %xmm13, %ymm13
2836; AVX512VL-NEXT:    vmulps %ymm1, %ymm13, %ymm13
2837; AVX512VL-NEXT:    vaddps %ymm13, %ymm12, %ymm12
2838; AVX512VL-NEXT:    vshufps {{.*#+}} xmm13 = xmm4[3,3,3,3]
2839; AVX512VL-NEXT:    vbroadcastss %xmm13, %ymm13
2840; AVX512VL-NEXT:    vmulps %ymm13, %ymm10, %ymm13
2841; AVX512VL-NEXT:    vaddps %ymm13, %ymm12, %ymm12
2842; AVX512VL-NEXT:    vextractf128 $1, %ymm4, %xmm13
2843; AVX512VL-NEXT:    vbroadcastss %xmm13, %ymm13
2844; AVX512VL-NEXT:    vmulps %ymm2, %ymm13, %ymm13
2845; AVX512VL-NEXT:    vaddps %ymm13, %ymm12, %ymm12
2846; AVX512VL-NEXT:    vmovshdup {{.*#+}} ymm13 = ymm4[1,1,3,3,5,5,7,7]
2847; AVX512VL-NEXT:    vpermpd {{.*#+}} ymm13 = ymm13[2,2,2,2]
2848; AVX512VL-NEXT:    vmulps %ymm13, %ymm9, %ymm13
2849; AVX512VL-NEXT:    vaddps %ymm13, %ymm12, %ymm12
2850; AVX512VL-NEXT:    vshufps {{.*#+}} ymm13 = ymm4[2,2,2,2,6,6,6,6]
2851; AVX512VL-NEXT:    vpermpd {{.*#+}} ymm13 = ymm13[2,2,2,2]
2852; AVX512VL-NEXT:    vmulps %ymm3, %ymm13, %ymm13
2853; AVX512VL-NEXT:    vaddps %ymm13, %ymm12, %ymm12
2854; AVX512VL-NEXT:    vshufps {{.*#+}} ymm13 = ymm4[3,3,3,3,7,7,7,7]
2855; AVX512VL-NEXT:    vpermpd {{.*#+}} ymm13 = ymm13[2,2,2,2]
2856; AVX512VL-NEXT:    vmulps %ymm13, %ymm8, %ymm13
2857; AVX512VL-NEXT:    vaddps %ymm13, %ymm12, %ymm12
2858; AVX512VL-NEXT:    vextractf64x4 $1, %zmm4, %ymm13
2859; AVX512VL-NEXT:    vextractf32x4 $2, %zmm4, %xmm14
2860; AVX512VL-NEXT:    vbroadcastss %xmm14, %ymm14
2861; AVX512VL-NEXT:    vmulps %ymm0, %ymm14, %ymm14
2862; AVX512VL-NEXT:    vmovshdup {{.*#+}} xmm15 = xmm13[1,1,3,3]
2863; AVX512VL-NEXT:    vbroadcastsd %xmm15, %ymm15
2864; AVX512VL-NEXT:    vmulps %ymm15, %ymm11, %ymm15
2865; AVX512VL-NEXT:    vaddps %ymm15, %ymm14, %ymm14
2866; AVX512VL-NEXT:    vshufps {{.*#+}} xmm15 = xmm13[2,2,2,2]
2867; AVX512VL-NEXT:    vbroadcastsd %xmm15, %ymm15
2868; AVX512VL-NEXT:    vmulps %ymm1, %ymm15, %ymm15
2869; AVX512VL-NEXT:    vaddps %ymm15, %ymm14, %ymm14
2870; AVX512VL-NEXT:    vshufps {{.*#+}} xmm15 = xmm13[3,3,3,3]
2871; AVX512VL-NEXT:    vbroadcastsd %xmm15, %ymm15
2872; AVX512VL-NEXT:    vmulps %ymm15, %ymm10, %ymm15
2873; AVX512VL-NEXT:    vaddps %ymm15, %ymm14, %ymm14
2874; AVX512VL-NEXT:    vextractf32x4 $3, %zmm4, %xmm4
2875; AVX512VL-NEXT:    vbroadcastss %xmm4, %ymm4
2876; AVX512VL-NEXT:    vmulps %ymm4, %ymm2, %ymm4
2877; AVX512VL-NEXT:    vaddps %ymm4, %ymm14, %ymm4
2878; AVX512VL-NEXT:    vmovshdup {{.*#+}} ymm14 = ymm13[1,1,3,3,5,5,7,7]
2879; AVX512VL-NEXT:    vpermpd {{.*#+}} ymm14 = ymm14[2,2,2,2]
2880; AVX512VL-NEXT:    vmulps %ymm14, %ymm9, %ymm14
2881; AVX512VL-NEXT:    vaddps %ymm4, %ymm14, %ymm4
2882; AVX512VL-NEXT:    vshufps {{.*#+}} ymm14 = ymm13[2,2,2,2,6,6,6,6]
2883; AVX512VL-NEXT:    vpermpd {{.*#+}} ymm14 = ymm14[2,2,2,2]
2884; AVX512VL-NEXT:    vmulps %ymm3, %ymm14, %ymm14
2885; AVX512VL-NEXT:    vaddps %ymm4, %ymm14, %ymm4
2886; AVX512VL-NEXT:    vshufps {{.*#+}} ymm13 = ymm13[3,3,3,3,7,7,7,7]
2887; AVX512VL-NEXT:    vpermpd {{.*#+}} ymm13 = ymm13[2,2,2,2]
2888; AVX512VL-NEXT:    vmulps %ymm13, %ymm8, %ymm13
2889; AVX512VL-NEXT:    vaddps %ymm4, %ymm13, %ymm4
2890; AVX512VL-NEXT:    vbroadcastss %xmm5, %ymm13
2891; AVX512VL-NEXT:    vmulps %ymm0, %ymm13, %ymm13
2892; AVX512VL-NEXT:    vmovshdup {{.*#+}} xmm14 = xmm5[1,1,3,3]
2893; AVX512VL-NEXT:    vbroadcastss %xmm14, %ymm14
2894; AVX512VL-NEXT:    vmulps %ymm14, %ymm11, %ymm14
2895; AVX512VL-NEXT:    vaddps %ymm14, %ymm13, %ymm13
2896; AVX512VL-NEXT:    vshufpd {{.*#+}} xmm14 = xmm5[1,0]
2897; AVX512VL-NEXT:    vbroadcastss %xmm14, %ymm14
2898; AVX512VL-NEXT:    vmulps %ymm1, %ymm14, %ymm14
2899; AVX512VL-NEXT:    vaddps %ymm14, %ymm13, %ymm13
2900; AVX512VL-NEXT:    vshufps {{.*#+}} xmm14 = xmm5[3,3,3,3]
2901; AVX512VL-NEXT:    vbroadcastss %xmm14, %ymm14
2902; AVX512VL-NEXT:    vmulps %ymm14, %ymm10, %ymm14
2903; AVX512VL-NEXT:    vaddps %ymm14, %ymm13, %ymm13
2904; AVX512VL-NEXT:    vextractf128 $1, %ymm5, %xmm14
2905; AVX512VL-NEXT:    vbroadcastss %xmm14, %ymm14
2906; AVX512VL-NEXT:    vmulps %ymm2, %ymm14, %ymm14
2907; AVX512VL-NEXT:    vaddps %ymm14, %ymm13, %ymm13
2908; AVX512VL-NEXT:    vmovshdup {{.*#+}} ymm14 = ymm5[1,1,3,3,5,5,7,7]
2909; AVX512VL-NEXT:    vpermpd {{.*#+}} ymm14 = ymm14[2,2,2,2]
2910; AVX512VL-NEXT:    vmulps %ymm14, %ymm9, %ymm14
2911; AVX512VL-NEXT:    vaddps %ymm14, %ymm13, %ymm13
2912; AVX512VL-NEXT:    vshufps {{.*#+}} ymm14 = ymm5[2,2,2,2,6,6,6,6]
2913; AVX512VL-NEXT:    vpermpd {{.*#+}} ymm14 = ymm14[2,2,2,2]
2914; AVX512VL-NEXT:    vmulps %ymm3, %ymm14, %ymm14
2915; AVX512VL-NEXT:    vaddps %ymm14, %ymm13, %ymm13
2916; AVX512VL-NEXT:    vshufps {{.*#+}} ymm14 = ymm5[3,3,3,3,7,7,7,7]
2917; AVX512VL-NEXT:    vpermpd {{.*#+}} ymm14 = ymm14[2,2,2,2]
2918; AVX512VL-NEXT:    vmulps %ymm14, %ymm8, %ymm14
2919; AVX512VL-NEXT:    vaddps %ymm14, %ymm13, %ymm13
2920; AVX512VL-NEXT:    vextractf64x4 $1, %zmm5, %ymm14
2921; AVX512VL-NEXT:    vextractf32x4 $2, %zmm5, %xmm15
2922; AVX512VL-NEXT:    vbroadcastss %xmm15, %ymm15
2923; AVX512VL-NEXT:    vmulps %ymm0, %ymm15, %ymm15
2924; AVX512VL-NEXT:    vmovshdup {{.*#+}} xmm16 = xmm14[1,1,3,3]
2925; AVX512VL-NEXT:    vbroadcastsd %xmm16, %ymm16
2926; AVX512VL-NEXT:    vmulps %ymm16, %ymm11, %ymm16
2927; AVX512VL-NEXT:    vaddps %ymm16, %ymm15, %ymm15
2928; AVX512VL-NEXT:    vshufps {{.*#+}} xmm16 = xmm14[2,2,2,2]
2929; AVX512VL-NEXT:    vbroadcastsd %xmm16, %ymm16
2930; AVX512VL-NEXT:    vmulps %ymm16, %ymm1, %ymm16
2931; AVX512VL-NEXT:    vaddps %ymm16, %ymm15, %ymm15
2932; AVX512VL-NEXT:    vshufps {{.*#+}} xmm16 = xmm14[3,3,3,3]
2933; AVX512VL-NEXT:    vbroadcastsd %xmm16, %ymm16
2934; AVX512VL-NEXT:    vmulps %ymm16, %ymm10, %ymm16
2935; AVX512VL-NEXT:    vaddps %ymm16, %ymm15, %ymm15
2936; AVX512VL-NEXT:    vextractf32x4 $3, %zmm5, %xmm5
2937; AVX512VL-NEXT:    vbroadcastss %xmm5, %ymm5
2938; AVX512VL-NEXT:    vmulps %ymm5, %ymm2, %ymm5
2939; AVX512VL-NEXT:    vaddps %ymm5, %ymm15, %ymm5
2940; AVX512VL-NEXT:    vmovshdup {{.*#+}} ymm15 = ymm14[1,1,3,3,5,5,7,7]
2941; AVX512VL-NEXT:    vpermpd {{.*#+}} ymm15 = ymm15[2,2,2,2]
2942; AVX512VL-NEXT:    vmulps %ymm15, %ymm9, %ymm15
2943; AVX512VL-NEXT:    vaddps %ymm5, %ymm15, %ymm5
2944; AVX512VL-NEXT:    vshufps {{.*#+}} ymm15 = ymm14[2,2,2,2,6,6,6,6]
2945; AVX512VL-NEXT:    vpermpd {{.*#+}} ymm15 = ymm15[2,2,2,2]
2946; AVX512VL-NEXT:    vmulps %ymm3, %ymm15, %ymm15
2947; AVX512VL-NEXT:    vaddps %ymm5, %ymm15, %ymm5
2948; AVX512VL-NEXT:    vshufps {{.*#+}} ymm14 = ymm14[3,3,3,3,7,7,7,7]
2949; AVX512VL-NEXT:    vpermpd {{.*#+}} ymm14 = ymm14[2,2,2,2]
2950; AVX512VL-NEXT:    vmulps %ymm14, %ymm8, %ymm14
2951; AVX512VL-NEXT:    vaddps %ymm5, %ymm14, %ymm5
2952; AVX512VL-NEXT:    vbroadcastss %xmm6, %ymm14
2953; AVX512VL-NEXT:    vmulps %ymm0, %ymm14, %ymm14
2954; AVX512VL-NEXT:    vmovshdup {{.*#+}} xmm15 = xmm6[1,1,3,3]
2955; AVX512VL-NEXT:    vbroadcastss %xmm15, %ymm15
2956; AVX512VL-NEXT:    vmulps %ymm15, %ymm11, %ymm15
2957; AVX512VL-NEXT:    vaddps %ymm15, %ymm14, %ymm14
2958; AVX512VL-NEXT:    vshufpd {{.*#+}} xmm15 = xmm6[1,0]
2959; AVX512VL-NEXT:    vbroadcastss %xmm15, %ymm15
2960; AVX512VL-NEXT:    vmulps %ymm1, %ymm15, %ymm15
2961; AVX512VL-NEXT:    vaddps %ymm15, %ymm14, %ymm14
2962; AVX512VL-NEXT:    vshufps {{.*#+}} xmm15 = xmm6[3,3,3,3]
2963; AVX512VL-NEXT:    vbroadcastss %xmm15, %ymm15
2964; AVX512VL-NEXT:    vmulps %ymm15, %ymm10, %ymm15
2965; AVX512VL-NEXT:    vaddps %ymm15, %ymm14, %ymm14
2966; AVX512VL-NEXT:    vextractf128 $1, %ymm6, %xmm15
2967; AVX512VL-NEXT:    vbroadcastss %xmm15, %ymm15
2968; AVX512VL-NEXT:    vmulps %ymm2, %ymm15, %ymm15
2969; AVX512VL-NEXT:    vaddps %ymm15, %ymm14, %ymm14
2970; AVX512VL-NEXT:    vmovshdup {{.*#+}} ymm15 = ymm6[1,1,3,3,5,5,7,7]
2971; AVX512VL-NEXT:    vpermpd {{.*#+}} ymm15 = ymm15[2,2,2,2]
2972; AVX512VL-NEXT:    vmulps %ymm15, %ymm9, %ymm15
2973; AVX512VL-NEXT:    vaddps %ymm15, %ymm14, %ymm14
2974; AVX512VL-NEXT:    vshufps {{.*#+}} ymm15 = ymm6[2,2,2,2,6,6,6,6]
2975; AVX512VL-NEXT:    vpermpd {{.*#+}} ymm15 = ymm15[2,2,2,2]
2976; AVX512VL-NEXT:    vmulps %ymm3, %ymm15, %ymm15
2977; AVX512VL-NEXT:    vaddps %ymm15, %ymm14, %ymm14
2978; AVX512VL-NEXT:    vshufps {{.*#+}} ymm15 = ymm6[3,3,3,3,7,7,7,7]
2979; AVX512VL-NEXT:    vpermpd {{.*#+}} ymm15 = ymm15[2,2,2,2]
2980; AVX512VL-NEXT:    vmulps %ymm15, %ymm8, %ymm15
2981; AVX512VL-NEXT:    vaddps %ymm15, %ymm14, %ymm14
2982; AVX512VL-NEXT:    vextractf64x4 $1, %zmm6, %ymm15
2983; AVX512VL-NEXT:    vextractf32x4 $2, %zmm6, %xmm16
2984; AVX512VL-NEXT:    vbroadcastss %xmm16, %ymm16
2985; AVX512VL-NEXT:    vmulps %ymm16, %ymm0, %ymm16
2986; AVX512VL-NEXT:    vmovshdup {{.*#+}} xmm17 = xmm15[1,1,3,3]
2987; AVX512VL-NEXT:    vbroadcastsd %xmm17, %ymm17
2988; AVX512VL-NEXT:    vmulps %ymm17, %ymm11, %ymm17
2989; AVX512VL-NEXT:    vaddps %ymm17, %ymm16, %ymm16
2990; AVX512VL-NEXT:    vshufps {{.*#+}} xmm17 = xmm15[2,2,2,2]
2991; AVX512VL-NEXT:    vbroadcastsd %xmm17, %ymm17
2992; AVX512VL-NEXT:    vmulps %ymm17, %ymm1, %ymm17
2993; AVX512VL-NEXT:    vaddps %ymm17, %ymm16, %ymm16
2994; AVX512VL-NEXT:    vshufps {{.*#+}} xmm17 = xmm15[3,3,3,3]
2995; AVX512VL-NEXT:    vbroadcastsd %xmm17, %ymm17
2996; AVX512VL-NEXT:    vmulps %ymm17, %ymm10, %ymm17
2997; AVX512VL-NEXT:    vaddps %ymm17, %ymm16, %ymm16
2998; AVX512VL-NEXT:    vextractf32x4 $3, %zmm6, %xmm6
2999; AVX512VL-NEXT:    vbroadcastss %xmm6, %ymm6
3000; AVX512VL-NEXT:    vmulps %ymm6, %ymm2, %ymm6
3001; AVX512VL-NEXT:    vaddps %ymm6, %ymm16, %ymm6
3002; AVX512VL-NEXT:    vmovshdup {{.*#+}} ymm16 = ymm15[1,1,3,3,5,5,7,7]
3003; AVX512VL-NEXT:    vpermpd {{.*#+}} ymm16 = ymm16[2,2,2,2]
3004; AVX512VL-NEXT:    vmulps %ymm16, %ymm9, %ymm16
3005; AVX512VL-NEXT:    vaddps %ymm16, %ymm6, %ymm6
3006; AVX512VL-NEXT:    vshufps {{.*#+}} ymm16 = ymm15[2,2,2,2,6,6,6,6]
3007; AVX512VL-NEXT:    vpermpd {{.*#+}} ymm16 = ymm16[2,2,2,2]
3008; AVX512VL-NEXT:    vmulps %ymm16, %ymm3, %ymm16
3009; AVX512VL-NEXT:    vaddps %ymm16, %ymm6, %ymm6
3010; AVX512VL-NEXT:    vshufps {{.*#+}} ymm15 = ymm15[3,3,3,3,7,7,7,7]
3011; AVX512VL-NEXT:    vpermpd {{.*#+}} ymm15 = ymm15[2,2,2,2]
3012; AVX512VL-NEXT:    vmulps %ymm15, %ymm8, %ymm15
3013; AVX512VL-NEXT:    vaddps %ymm6, %ymm15, %ymm6
3014; AVX512VL-NEXT:    vbroadcastss %xmm7, %ymm15
3015; AVX512VL-NEXT:    vmulps %ymm0, %ymm15, %ymm15
3016; AVX512VL-NEXT:    vmovshdup {{.*#+}} xmm16 = xmm7[1,1,3,3]
3017; AVX512VL-NEXT:    vbroadcastss %xmm16, %ymm16
3018; AVX512VL-NEXT:    vmulps %ymm16, %ymm11, %ymm16
3019; AVX512VL-NEXT:    vaddps %ymm16, %ymm15, %ymm15
3020; AVX512VL-NEXT:    vshufpd {{.*#+}} xmm16 = xmm7[1,0]
3021; AVX512VL-NEXT:    vbroadcastss %xmm16, %ymm16
3022; AVX512VL-NEXT:    vmulps %ymm16, %ymm1, %ymm16
3023; AVX512VL-NEXT:    vaddps %ymm16, %ymm15, %ymm15
3024; AVX512VL-NEXT:    vshufps {{.*#+}} xmm16 = xmm7[3,3,3,3]
3025; AVX512VL-NEXT:    vbroadcastss %xmm16, %ymm16
3026; AVX512VL-NEXT:    vmulps %ymm16, %ymm10, %ymm16
3027; AVX512VL-NEXT:    vaddps %ymm16, %ymm15, %ymm15
3028; AVX512VL-NEXT:    vextractf32x4 $1, %ymm7, %xmm16
3029; AVX512VL-NEXT:    vbroadcastss %xmm16, %ymm16
3030; AVX512VL-NEXT:    vmulps %ymm16, %ymm2, %ymm16
3031; AVX512VL-NEXT:    vaddps %ymm16, %ymm15, %ymm15
3032; AVX512VL-NEXT:    vmovshdup {{.*#+}} ymm16 = ymm7[1,1,3,3,5,5,7,7]
3033; AVX512VL-NEXT:    vpermpd {{.*#+}} ymm16 = ymm16[2,2,2,2]
3034; AVX512VL-NEXT:    vmulps %ymm16, %ymm9, %ymm16
3035; AVX512VL-NEXT:    vaddps %ymm16, %ymm15, %ymm15
3036; AVX512VL-NEXT:    vshufps {{.*#+}} ymm16 = ymm7[2,2,2,2,6,6,6,6]
3037; AVX512VL-NEXT:    vpermpd {{.*#+}} ymm16 = ymm16[2,2,2,2]
3038; AVX512VL-NEXT:    vmulps %ymm16, %ymm3, %ymm16
3039; AVX512VL-NEXT:    vaddps %ymm16, %ymm15, %ymm15
3040; AVX512VL-NEXT:    vshufps {{.*#+}} ymm16 = ymm7[3,3,3,3,7,7,7,7]
3041; AVX512VL-NEXT:    vpermpd {{.*#+}} ymm16 = ymm16[2,2,2,2]
3042; AVX512VL-NEXT:    vmulps %ymm16, %ymm8, %ymm16
3043; AVX512VL-NEXT:    vaddps %ymm16, %ymm15, %ymm15
3044; AVX512VL-NEXT:    vextractf64x4 $1, %zmm7, %ymm16
3045; AVX512VL-NEXT:    vextractf32x4 $2, %zmm7, %xmm17
3046; AVX512VL-NEXT:    vbroadcastss %xmm17, %ymm17
3047; AVX512VL-NEXT:    vmulps %ymm17, %ymm0, %ymm0
3048; AVX512VL-NEXT:    vmovshdup {{.*#+}} xmm17 = xmm16[1,1,3,3]
3049; AVX512VL-NEXT:    vbroadcastsd %xmm17, %ymm17
3050; AVX512VL-NEXT:    vmulps %ymm17, %ymm11, %ymm11
3051; AVX512VL-NEXT:    vaddps %ymm0, %ymm11, %ymm0
3052; AVX512VL-NEXT:    vshufps {{.*#+}} xmm11 = xmm16[2,2,2,2]
3053; AVX512VL-NEXT:    vbroadcastsd %xmm11, %ymm11
3054; AVX512VL-NEXT:    vmulps %ymm1, %ymm11, %ymm1
3055; AVX512VL-NEXT:    vaddps %ymm1, %ymm0, %ymm0
3056; AVX512VL-NEXT:    vshufps {{.*#+}} xmm1 = xmm16[3,3,3,3]
3057; AVX512VL-NEXT:    vbroadcastsd %xmm1, %ymm1
3058; AVX512VL-NEXT:    vmulps %ymm1, %ymm10, %ymm1
3059; AVX512VL-NEXT:    vaddps %ymm1, %ymm0, %ymm0
3060; AVX512VL-NEXT:    vextractf32x4 $3, %zmm7, %xmm1
3061; AVX512VL-NEXT:    vbroadcastss %xmm1, %ymm1
3062; AVX512VL-NEXT:    vmulps %ymm1, %ymm2, %ymm1
3063; AVX512VL-NEXT:    vaddps %ymm1, %ymm0, %ymm0
3064; AVX512VL-NEXT:    vmovshdup {{.*#+}} ymm1 = ymm16[1,1,3,3,5,5,7,7]
3065; AVX512VL-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,2]
3066; AVX512VL-NEXT:    vmulps %ymm1, %ymm9, %ymm1
3067; AVX512VL-NEXT:    vaddps %ymm1, %ymm0, %ymm0
3068; AVX512VL-NEXT:    vshufps {{.*#+}} ymm1 = ymm16[2,2,2,2,6,6,6,6]
3069; AVX512VL-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,2]
3070; AVX512VL-NEXT:    vmulps %ymm1, %ymm3, %ymm1
3071; AVX512VL-NEXT:    vaddps %ymm1, %ymm0, %ymm0
3072; AVX512VL-NEXT:    vshufps {{.*#+}} ymm1 = ymm16[3,3,3,3,7,7,7,7]
3073; AVX512VL-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,2]
3074; AVX512VL-NEXT:    vmulps %ymm1, %ymm8, %ymm1
3075; AVX512VL-NEXT:    vaddps %ymm1, %ymm0, %ymm3
3076; AVX512VL-NEXT:    vinsertf64x4 $1, %ymm4, %zmm12, %zmm0
3077; AVX512VL-NEXT:    vinsertf64x4 $1, %ymm5, %zmm13, %zmm1
3078; AVX512VL-NEXT:    vinsertf64x4 $1, %ymm6, %zmm14, %zmm2
3079; AVX512VL-NEXT:    vinsertf64x4 $1, %ymm3, %zmm15, %zmm3
3080; AVX512VL-NEXT:    retq
3081entry:
3082  %split = shufflevector <64 x float> %a0, <64 x float> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
3083  %split1 = shufflevector <64 x float> %a0, <64 x float> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
3084  %split2 = shufflevector <64 x float> %a0, <64 x float> poison, <8 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
3085  %split3 = shufflevector <64 x float> %a0, <64 x float> poison, <8 x i32> <i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
3086  %split4 = shufflevector <64 x float> %a0, <64 x float> poison, <8 x i32> <i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39>
3087  %split5 = shufflevector <64 x float> %a0, <64 x float> poison, <8 x i32> <i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47>
3088  %split6 = shufflevector <64 x float> %a0, <64 x float> poison, <8 x i32> <i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55>
3089  %split7 = shufflevector <64 x float> %a0, <64 x float> poison, <8 x i32> <i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
3090  %splat.splat = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> zeroinitializer
3091  %0 = fmul <8 x float> %split, %splat.splat
3092  %splat.splat18 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
3093  %1 = fmul <8 x float> %split1, %splat.splat18
3094  %2 = fadd <8 x float> %0, %1
3095  %splat.splat21 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
3096  %3 = fmul <8 x float> %split2, %splat.splat21
3097  %4 = fadd <8 x float> %2, %3
3098  %splat.splat24 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
3099  %5 = fmul <8 x float> %split3, %splat.splat24
3100  %6 = fadd <8 x float> %4, %5
3101  %splat.splat27 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>
3102  %7 = fmul <8 x float> %split4, %splat.splat27
3103  %8 = fadd <8 x float> %6, %7
3104  %splat.splat30 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
3105  %9 = fmul <8 x float> %split5, %splat.splat30
3106  %10 = fadd <8 x float> %8, %9
3107  %splat.splat33 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> <i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6>
3108  %11 = fmul <8 x float> %split6, %splat.splat33
3109  %12 = fadd <8 x float> %10, %11
3110  %splat.splat36 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
3111  %13 = fmul <8 x float> %split7, %splat.splat36
3112  %14 = fadd <8 x float> %12, %13
3113  %splat.splat39 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> <i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
3114  %15 = fmul <8 x float> %split, %splat.splat39
3115  %splat.splat42 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> <i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9>
3116  %16 = fmul <8 x float> %split1, %splat.splat42
3117  %17 = fadd <8 x float> %15, %16
3118  %splat.splat45 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> <i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10>
3119  %18 = fmul <8 x float> %split2, %splat.splat45
3120  %19 = fadd <8 x float> %17, %18
3121  %splat.splat48 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> <i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11>
3122  %20 = fmul <8 x float> %split3, %splat.splat48
3123  %21 = fadd <8 x float> %19, %20
3124  %splat.splat51 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> <i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12>
3125  %22 = fmul <8 x float> %split4, %splat.splat51
3126  %23 = fadd <8 x float> %21, %22
3127  %splat.splat54 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> <i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13>
3128  %24 = fmul <8 x float> %split5, %splat.splat54
3129  %25 = fadd <8 x float> %23, %24
3130  %splat.splat57 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> <i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14>
3131  %26 = fmul <8 x float> %split6, %splat.splat57
3132  %27 = fadd <8 x float> %25, %26
3133  %splat.splat60 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> <i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
3134  %28 = fmul <8 x float> %split7, %splat.splat60
3135  %29 = fadd <8 x float> %27, %28
3136  %splat.splat63 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
3137  %30 = fmul <8 x float> %split, %splat.splat63
3138  %splat.splat66 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> <i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17>
3139  %31 = fmul <8 x float> %split1, %splat.splat66
3140  %32 = fadd <8 x float> %30, %31
3141  %splat.splat69 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> <i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18>
3142  %33 = fmul <8 x float> %split2, %splat.splat69
3143  %34 = fadd <8 x float> %32, %33
3144  %splat.splat72 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> <i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19>
3145  %35 = fmul <8 x float> %split3, %splat.splat72
3146  %36 = fadd <8 x float> %34, %35
3147  %splat.splat75 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> <i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20>
3148  %37 = fmul <8 x float> %split4, %splat.splat75
3149  %38 = fadd <8 x float> %36, %37
3150  %splat.splat78 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> <i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21>
3151  %39 = fmul <8 x float> %split5, %splat.splat78
3152  %40 = fadd <8 x float> %38, %39
3153  %splat.splat81 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> <i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22>
3154  %41 = fmul <8 x float> %split6, %splat.splat81
3155  %42 = fadd <8 x float> %40, %41
3156  %splat.splat84 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> <i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23>
3157  %43 = fmul <8 x float> %split7, %splat.splat84
3158  %44 = fadd <8 x float> %42, %43
3159  %splat.splat87 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> <i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24>
3160  %45 = fmul <8 x float> %split, %splat.splat87
3161  %splat.splat90 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> <i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25>
3162  %46 = fmul <8 x float> %split1, %splat.splat90
3163  %47 = fadd <8 x float> %45, %46
3164  %splat.splat93 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> <i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26>
3165  %48 = fmul <8 x float> %split2, %splat.splat93
3166  %49 = fadd <8 x float> %47, %48
3167  %splat.splat96 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> <i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27>
3168  %50 = fmul <8 x float> %split3, %splat.splat96
3169  %51 = fadd <8 x float> %49, %50
3170  %splat.splat99 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> <i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28>
3171  %52 = fmul <8 x float> %split4, %splat.splat99
3172  %53 = fadd <8 x float> %51, %52
3173  %splat.splat102 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> <i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29>
3174  %54 = fmul <8 x float> %split5, %splat.splat102
3175  %55 = fadd <8 x float> %53, %54
3176  %splat.splat105 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> <i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30>
3177  %56 = fmul <8 x float> %split6, %splat.splat105
3178  %57 = fadd <8 x float> %55, %56
3179  %splat.splat108 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> <i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
3180  %58 = fmul <8 x float> %split7, %splat.splat108
3181  %59 = fadd <8 x float> %57, %58
3182  %splat.splat111 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> <i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32>
3183  %60 = fmul <8 x float> %split, %splat.splat111
3184  %splat.splat114 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> <i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33>
3185  %61 = fmul <8 x float> %split1, %splat.splat114
3186  %62 = fadd <8 x float> %60, %61
3187  %splat.splat117 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> <i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34>
3188  %63 = fmul <8 x float> %split2, %splat.splat117
3189  %64 = fadd <8 x float> %62, %63
3190  %splat.splat120 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> <i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35>
3191  %65 = fmul <8 x float> %split3, %splat.splat120
3192  %66 = fadd <8 x float> %64, %65
3193  %splat.splat123 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> <i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36>
3194  %67 = fmul <8 x float> %split4, %splat.splat123
3195  %68 = fadd <8 x float> %66, %67
3196  %splat.splat126 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> <i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37>
3197  %69 = fmul <8 x float> %split5, %splat.splat126
3198  %70 = fadd <8 x float> %68, %69
3199  %splat.splat129 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> <i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38>
3200  %71 = fmul <8 x float> %split6, %splat.splat129
3201  %72 = fadd <8 x float> %70, %71
3202  %splat.splat132 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> <i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39>
3203  %73 = fmul <8 x float> %split7, %splat.splat132
3204  %74 = fadd <8 x float> %72, %73
3205  %splat.splat135 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> <i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40>
3206  %75 = fmul <8 x float> %split, %splat.splat135
3207  %splat.splat138 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> <i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41>
3208  %76 = fmul <8 x float> %split1, %splat.splat138
3209  %77 = fadd <8 x float> %75, %76
3210  %splat.splat141 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> <i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42>
3211  %78 = fmul <8 x float> %split2, %splat.splat141
3212  %79 = fadd <8 x float> %77, %78
3213  %splat.splat144 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> <i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43>
3214  %80 = fmul <8 x float> %split3, %splat.splat144
3215  %81 = fadd <8 x float> %79, %80
3216  %splat.splat147 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> <i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44>
3217  %82 = fmul <8 x float> %split4, %splat.splat147
3218  %83 = fadd <8 x float> %81, %82
3219  %splat.splat150 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> <i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45>
3220  %84 = fmul <8 x float> %split5, %splat.splat150
3221  %85 = fadd <8 x float> %83, %84
3222  %splat.splat153 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> <i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46>
3223  %86 = fmul <8 x float> %split6, %splat.splat153
3224  %87 = fadd <8 x float> %85, %86
3225  %splat.splat156 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> <i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47>
3226  %88 = fmul <8 x float> %split7, %splat.splat156
3227  %89 = fadd <8 x float> %87, %88
3228  %splat.splat159 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> <i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48>
3229  %90 = fmul <8 x float> %split, %splat.splat159
3230  %splat.splat162 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> <i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49>
3231  %91 = fmul <8 x float> %split1, %splat.splat162
3232  %92 = fadd <8 x float> %90, %91
3233  %splat.splat165 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> <i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50>
3234  %93 = fmul <8 x float> %split2, %splat.splat165
3235  %94 = fadd <8 x float> %92, %93
3236  %splat.splat168 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> <i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51>
3237  %95 = fmul <8 x float> %split3, %splat.splat168
3238  %96 = fadd <8 x float> %94, %95
3239  %splat.splat171 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> <i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52>
3240  %97 = fmul <8 x float> %split4, %splat.splat171
3241  %98 = fadd <8 x float> %96, %97
3242  %splat.splat174 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> <i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53>
3243  %99 = fmul <8 x float> %split5, %splat.splat174
3244  %100 = fadd <8 x float> %98, %99
3245  %splat.splat177 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> <i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54>
3246  %101 = fmul <8 x float> %split6, %splat.splat177
3247  %102 = fadd <8 x float> %100, %101
3248  %splat.splat180 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> <i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55>
3249  %103 = fmul <8 x float> %split7, %splat.splat180
3250  %104 = fadd <8 x float> %102, %103
3251  %splat.splat183 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> <i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56>
3252  %105 = fmul <8 x float> %split, %splat.splat183
3253  %splat.splat186 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> <i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57>
3254  %106 = fmul <8 x float> %split1, %splat.splat186
3255  %107 = fadd <8 x float> %105, %106
3256  %splat.splat189 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> <i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58>
3257  %108 = fmul <8 x float> %split2, %splat.splat189
3258  %109 = fadd <8 x float> %107, %108
3259  %splat.splat192 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> <i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59>
3260  %110 = fmul <8 x float> %split3, %splat.splat192
3261  %111 = fadd <8 x float> %109, %110
3262  %splat.splat195 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> <i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60>
3263  %112 = fmul <8 x float> %split4, %splat.splat195
3264  %113 = fadd <8 x float> %111, %112
3265  %splat.splat198 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> <i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61>
3266  %114 = fmul <8 x float> %split5, %splat.splat198
3267  %115 = fadd <8 x float> %113, %114
3268  %splat.splat201 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> <i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62>
3269  %116 = fmul <8 x float> %split6, %splat.splat201
3270  %117 = fadd <8 x float> %115, %116
3271  %splat.splat204 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> <i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
3272  %118 = fmul <8 x float> %split7, %splat.splat204
3273  %119 = fadd <8 x float> %117, %118
3274  %120 = shufflevector <8 x float> %14, <8 x float> %29, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
3275  %121 = shufflevector <8 x float> %44, <8 x float> %59, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
3276  %122 = shufflevector <8 x float> %74, <8 x float> %89, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
3277  %123 = shufflevector <8 x float> %104, <8 x float> %119, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
3278  %124 = shufflevector <16 x float> %120, <16 x float> %121, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
3279  %125 = shufflevector <16 x float> %122, <16 x float> %123, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
3280  %126 = shufflevector <32 x float> %124, <32 x float> %125, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
3281  ret <64 x float> %126
3282}
3283
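; The 8x8 double-precision test below follows the same source pattern as the
; smaller cases. A hedged sketch of the assumed C source (the type alias name
; is illustrative only, not taken from the test generator):
;
; using matrix8x8_ty = double __attribute__((matrix_type(8,8)));
; matrix8x8_ty test_mul8x8_f64(matrix8x8_ty a0, matrix8x8_ty a1) nounwind {
;     return a0 * a1;
; }
;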
3284define <64 x double> @test_mul8x8_f64(<64 x double> %a0, <64 x double> %a1) nounwind {
3285; SSE-LABEL: test_mul8x8_f64:
3286; SSE:       # %bb.0: # %entry
3287; SSE-NEXT:    subq $328, %rsp # imm = 0x148
3288; SSE-NEXT:    movapd %xmm7, %xmm15
3289; SSE-NEXT:    movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3290; SSE-NEXT:    movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3291; SSE-NEXT:    movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3292; SSE-NEXT:    movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3293; SSE-NEXT:    movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3294; SSE-NEXT:    movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3295; SSE-NEXT:    movq %rdi, %rax
3296; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm14
3297; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm11
3298; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm13
3299; SSE-NEXT:    movapd %xmm13, %xmm12
3300; SSE-NEXT:    unpcklpd {{.*#+}} xmm12 = xmm12[0],xmm13[0]
3301; SSE-NEXT:    movapd %xmm3, %xmm10
3302; SSE-NEXT:    mulpd %xmm12, %xmm10
3303; SSE-NEXT:    movapd %xmm2, %xmm8
3304; SSE-NEXT:    mulpd %xmm12, %xmm8
3305; SSE-NEXT:    movapd %xmm1, %xmm9
3306; SSE-NEXT:    mulpd %xmm12, %xmm9
3307; SSE-NEXT:    mulpd %xmm0, %xmm12
3308; SSE-NEXT:    unpckhpd {{.*#+}} xmm13 = xmm13[1,1]
3309; SSE-NEXT:    movapd %xmm7, %xmm2
3310; SSE-NEXT:    mulpd %xmm13, %xmm2
3311; SSE-NEXT:    addpd %xmm10, %xmm2
3312; SSE-NEXT:    movapd %xmm6, %xmm7
3313; SSE-NEXT:    movapd %xmm6, %xmm10
3314; SSE-NEXT:    mulpd %xmm13, %xmm7
3315; SSE-NEXT:    addpd %xmm8, %xmm7
3316; SSE-NEXT:    movapd %xmm5, %xmm8
3317; SSE-NEXT:    mulpd %xmm13, %xmm8
3318; SSE-NEXT:    addpd %xmm9, %xmm8
3319; SSE-NEXT:    mulpd %xmm4, %xmm13
3320; SSE-NEXT:    addpd %xmm12, %xmm13
3321; SSE-NEXT:    movapd %xmm11, %xmm6
3322; SSE-NEXT:    unpcklpd {{.*#+}} xmm6 = xmm6[0],xmm11[0]
3323; SSE-NEXT:    movapd %xmm14, %xmm1
3324; SSE-NEXT:    mulpd %xmm6, %xmm1
3325; SSE-NEXT:    addpd %xmm13, %xmm1
3326; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm3
3327; SSE-NEXT:    mulpd %xmm6, %xmm3
3328; SSE-NEXT:    addpd %xmm8, %xmm3
3329; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm5
3330; SSE-NEXT:    mulpd %xmm6, %xmm5
3331; SSE-NEXT:    addpd %xmm7, %xmm5
3332; SSE-NEXT:    mulpd {{[0-9]+}}(%rsp), %xmm6
3333; SSE-NEXT:    addpd %xmm2, %xmm6
3334; SSE-NEXT:    unpckhpd {{.*#+}} xmm11 = xmm11[1,1]
3335; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm2
3336; SSE-NEXT:    mulpd %xmm11, %xmm2
3337; SSE-NEXT:    addpd %xmm6, %xmm2
3338; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm4
3339; SSE-NEXT:    mulpd %xmm11, %xmm4
3340; SSE-NEXT:    addpd %xmm5, %xmm4
3341; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm5
3342; SSE-NEXT:    mulpd %xmm11, %xmm5
3343; SSE-NEXT:    addpd %xmm3, %xmm5
3344; SSE-NEXT:    mulpd {{[0-9]+}}(%rsp), %xmm11
3345; SSE-NEXT:    addpd %xmm1, %xmm11
3346; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm1
3347; SSE-NEXT:    movapd %xmm1, %xmm6
3348; SSE-NEXT:    unpcklpd {{.*#+}} xmm6 = xmm6[0],xmm1[0]
3349; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm3
3350; SSE-NEXT:    mulpd %xmm6, %xmm3
3351; SSE-NEXT:    addpd %xmm11, %xmm3
3352; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm7
3353; SSE-NEXT:    mulpd %xmm6, %xmm7
3354; SSE-NEXT:    addpd %xmm5, %xmm7
3355; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm5
3356; SSE-NEXT:    mulpd %xmm6, %xmm5
3357; SSE-NEXT:    addpd %xmm4, %xmm5
3358; SSE-NEXT:    mulpd {{[0-9]+}}(%rsp), %xmm6
3359; SSE-NEXT:    addpd %xmm2, %xmm6
3360; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1,1]
3361; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm0
3362; SSE-NEXT:    mulpd %xmm1, %xmm0
3363; SSE-NEXT:    addpd %xmm6, %xmm0
3364; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm4
3365; SSE-NEXT:    mulpd %xmm1, %xmm4
3366; SSE-NEXT:    addpd %xmm5, %xmm4
3367; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm5
3368; SSE-NEXT:    mulpd %xmm1, %xmm5
3369; SSE-NEXT:    addpd %xmm7, %xmm5
3370; SSE-NEXT:    mulpd {{[0-9]+}}(%rsp), %xmm1
3371; SSE-NEXT:    addpd %xmm3, %xmm1
3372; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm6
3373; SSE-NEXT:    movapd %xmm6, %xmm3
3374; SSE-NEXT:    unpcklpd {{.*#+}} xmm3 = xmm3[0],xmm6[0]
3375; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm2
3376; SSE-NEXT:    mulpd %xmm3, %xmm2
3377; SSE-NEXT:    addpd %xmm1, %xmm2
3378; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm1
3379; SSE-NEXT:    mulpd %xmm3, %xmm1
3380; SSE-NEXT:    addpd %xmm5, %xmm1
3381; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm5
3382; SSE-NEXT:    mulpd %xmm3, %xmm5
3383; SSE-NEXT:    addpd %xmm4, %xmm5
3384; SSE-NEXT:    mulpd {{[0-9]+}}(%rsp), %xmm3
3385; SSE-NEXT:    addpd %xmm0, %xmm3
3386; SSE-NEXT:    unpckhpd {{.*#+}} xmm6 = xmm6[1,1]
3387; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm0
3388; SSE-NEXT:    mulpd %xmm6, %xmm0
3389; SSE-NEXT:    addpd %xmm3, %xmm0
3390; SSE-NEXT:    movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3391; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm0
3392; SSE-NEXT:    mulpd %xmm6, %xmm0
3393; SSE-NEXT:    addpd %xmm5, %xmm0
3394; SSE-NEXT:    movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3395; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm0
3396; SSE-NEXT:    mulpd %xmm6, %xmm0
3397; SSE-NEXT:    addpd %xmm1, %xmm0
3398; SSE-NEXT:    movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3399; SSE-NEXT:    mulpd {{[0-9]+}}(%rsp), %xmm6
3400; SSE-NEXT:    addpd %xmm2, %xmm6
3401; SSE-NEXT:    movapd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3402; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm1
3403; SSE-NEXT:    movapd %xmm1, %xmm0
3404; SSE-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
3405; SSE-NEXT:    movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
3406; SSE-NEXT:    movapd %xmm11, %xmm3
3407; SSE-NEXT:    mulpd %xmm0, %xmm3
3408; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1,1]
3409; SSE-NEXT:    movapd %xmm15, %xmm8
3410; SSE-NEXT:    movapd %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3411; SSE-NEXT:    movapd %xmm15, %xmm2
3412; SSE-NEXT:    mulpd %xmm1, %xmm2
3413; SSE-NEXT:    addpd %xmm3, %xmm2
3414; SSE-NEXT:    movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
3415; SSE-NEXT:    movapd %xmm9, %xmm3
3416; SSE-NEXT:    mulpd %xmm0, %xmm3
3417; SSE-NEXT:    movapd %xmm10, %xmm15
3418; SSE-NEXT:    movapd %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3419; SSE-NEXT:    movapd %xmm10, %xmm4
3420; SSE-NEXT:    mulpd %xmm1, %xmm4
3421; SSE-NEXT:    addpd %xmm3, %xmm4
3422; SSE-NEXT:    movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
3423; SSE-NEXT:    movapd %xmm13, %xmm3
3424; SSE-NEXT:    mulpd %xmm0, %xmm3
3425; SSE-NEXT:    movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
3426; SSE-NEXT:    movapd %xmm10, %xmm5
3427; SSE-NEXT:    mulpd %xmm1, %xmm5
3428; SSE-NEXT:    addpd %xmm3, %xmm5
3429; SSE-NEXT:    movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
3430; SSE-NEXT:    mulpd %xmm12, %xmm0
3431; SSE-NEXT:    movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
3432; SSE-NEXT:    mulpd %xmm14, %xmm1
3433; SSE-NEXT:    addpd %xmm0, %xmm1
3434; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm0
3435; SSE-NEXT:    movapd %xmm0, %xmm6
3436; SSE-NEXT:    unpcklpd {{.*#+}} xmm6 = xmm6[0],xmm0[0]
3437; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm3
3438; SSE-NEXT:    mulpd %xmm6, %xmm3
3439; SSE-NEXT:    addpd %xmm1, %xmm3
3440; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm1
3441; SSE-NEXT:    mulpd %xmm6, %xmm1
3442; SSE-NEXT:    addpd %xmm5, %xmm1
3443; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm5
3444; SSE-NEXT:    mulpd %xmm6, %xmm5
3445; SSE-NEXT:    addpd %xmm4, %xmm5
3446; SSE-NEXT:    mulpd {{[0-9]+}}(%rsp), %xmm6
3447; SSE-NEXT:    addpd %xmm2, %xmm6
3448; SSE-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1,1]
3449; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm2
3450; SSE-NEXT:    mulpd %xmm0, %xmm2
3451; SSE-NEXT:    addpd %xmm6, %xmm2
3452; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm4
3453; SSE-NEXT:    mulpd %xmm0, %xmm4
3454; SSE-NEXT:    addpd %xmm5, %xmm4
3455; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm5
3456; SSE-NEXT:    mulpd %xmm0, %xmm5
3457; SSE-NEXT:    addpd %xmm1, %xmm5
3458; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm1
3459; SSE-NEXT:    mulpd %xmm1, %xmm0
3460; SSE-NEXT:    addpd %xmm3, %xmm0
3461; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm1
3462; SSE-NEXT:    movapd %xmm1, %xmm6
3463; SSE-NEXT:    unpcklpd {{.*#+}} xmm6 = xmm6[0],xmm1[0]
3464; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm3
3465; SSE-NEXT:    mulpd %xmm6, %xmm3
3466; SSE-NEXT:    addpd %xmm0, %xmm3
3467; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm7
3468; SSE-NEXT:    mulpd %xmm6, %xmm7
3469; SSE-NEXT:    addpd %xmm5, %xmm7
3470; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm5
3471; SSE-NEXT:    mulpd %xmm6, %xmm5
3472; SSE-NEXT:    addpd %xmm4, %xmm5
3473; SSE-NEXT:    mulpd {{[0-9]+}}(%rsp), %xmm6
3474; SSE-NEXT:    addpd %xmm2, %xmm6
3475; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1,1]
3476; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm0
3477; SSE-NEXT:    mulpd %xmm1, %xmm0
3478; SSE-NEXT:    addpd %xmm6, %xmm0
3479; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm4
3480; SSE-NEXT:    mulpd %xmm1, %xmm4
3481; SSE-NEXT:    addpd %xmm5, %xmm4
3482; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm5
3483; SSE-NEXT:    mulpd %xmm1, %xmm5
3484; SSE-NEXT:    addpd %xmm7, %xmm5
3485; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm2
3486; SSE-NEXT:    mulpd %xmm2, %xmm1
3487; SSE-NEXT:    addpd %xmm3, %xmm1
3488; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm6
3489; SSE-NEXT:    movapd %xmm6, %xmm3
3490; SSE-NEXT:    unpcklpd {{.*#+}} xmm3 = xmm3[0],xmm6[0]
3491; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm2
3492; SSE-NEXT:    mulpd %xmm3, %xmm2
3493; SSE-NEXT:    addpd %xmm1, %xmm2
3494; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm1
3495; SSE-NEXT:    mulpd %xmm3, %xmm1
3496; SSE-NEXT:    addpd %xmm5, %xmm1
3497; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm5
3498; SSE-NEXT:    mulpd %xmm3, %xmm5
3499; SSE-NEXT:    addpd %xmm4, %xmm5
3500; SSE-NEXT:    mulpd {{[0-9]+}}(%rsp), %xmm3
3501; SSE-NEXT:    addpd %xmm0, %xmm3
3502; SSE-NEXT:    unpckhpd {{.*#+}} xmm6 = xmm6[1,1]
3503; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm0
3504; SSE-NEXT:    mulpd %xmm6, %xmm0
3505; SSE-NEXT:    addpd %xmm3, %xmm0
3506; SSE-NEXT:    movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3507; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm0
3508; SSE-NEXT:    mulpd %xmm6, %xmm0
3509; SSE-NEXT:    addpd %xmm5, %xmm0
3510; SSE-NEXT:    movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3511; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm0
3512; SSE-NEXT:    mulpd %xmm6, %xmm0
3513; SSE-NEXT:    addpd %xmm1, %xmm0
3514; SSE-NEXT:    movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3515; SSE-NEXT:    mulpd {{[0-9]+}}(%rsp), %xmm6
3516; SSE-NEXT:    addpd %xmm2, %xmm6
3517; SSE-NEXT:    movapd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3518; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm1
3519; SSE-NEXT:    movapd %xmm1, %xmm0
3520; SSE-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
3521; SSE-NEXT:    movapd %xmm11, %xmm3
3522; SSE-NEXT:    mulpd %xmm0, %xmm3
3523; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1,1]
3524; SSE-NEXT:    movapd %xmm8, %xmm2
3525; SSE-NEXT:    mulpd %xmm1, %xmm2
3526; SSE-NEXT:    addpd %xmm3, %xmm2
3527; SSE-NEXT:    movapd %xmm9, %xmm3
3528; SSE-NEXT:    mulpd %xmm0, %xmm3
3529; SSE-NEXT:    movapd %xmm15, %xmm4
3530; SSE-NEXT:    mulpd %xmm1, %xmm4
3531; SSE-NEXT:    addpd %xmm3, %xmm4
3532; SSE-NEXT:    movapd %xmm13, %xmm8
3533; SSE-NEXT:    movapd %xmm13, %xmm3
3534; SSE-NEXT:    mulpd %xmm0, %xmm3
3535; SSE-NEXT:    movapd %xmm10, %xmm5
3536; SSE-NEXT:    movapd %xmm10, %xmm15
3537; SSE-NEXT:    mulpd %xmm1, %xmm5
3538; SSE-NEXT:    addpd %xmm3, %xmm5
3539; SSE-NEXT:    movapd %xmm12, %xmm10
3540; SSE-NEXT:    mulpd %xmm12, %xmm0
3541; SSE-NEXT:    movapd %xmm14, %xmm9
3542; SSE-NEXT:    mulpd %xmm14, %xmm1
3543; SSE-NEXT:    addpd %xmm0, %xmm1
3544; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm0
3545; SSE-NEXT:    movapd %xmm0, %xmm6
3546; SSE-NEXT:    unpcklpd {{.*#+}} xmm6 = xmm6[0],xmm0[0]
3547; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm3
3548; SSE-NEXT:    mulpd %xmm6, %xmm3
3549; SSE-NEXT:    addpd %xmm1, %xmm3
3550; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm1
3551; SSE-NEXT:    mulpd %xmm6, %xmm1
3552; SSE-NEXT:    addpd %xmm5, %xmm1
3553; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm5
3554; SSE-NEXT:    mulpd %xmm6, %xmm5
3555; SSE-NEXT:    addpd %xmm4, %xmm5
3556; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm4
3557; SSE-NEXT:    mulpd %xmm4, %xmm6
3558; SSE-NEXT:    addpd %xmm2, %xmm6
3559; SSE-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1,1]
3560; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm2
3561; SSE-NEXT:    mulpd %xmm0, %xmm2
3562; SSE-NEXT:    addpd %xmm6, %xmm2
3563; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm4
3564; SSE-NEXT:    mulpd %xmm0, %xmm4
3565; SSE-NEXT:    addpd %xmm5, %xmm4
3566; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm5
3567; SSE-NEXT:    mulpd %xmm0, %xmm5
3568; SSE-NEXT:    addpd %xmm1, %xmm5
3569; SSE-NEXT:    mulpd {{[0-9]+}}(%rsp), %xmm0
3570; SSE-NEXT:    addpd %xmm3, %xmm0
3571; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm1
3572; SSE-NEXT:    movapd %xmm1, %xmm6
3573; SSE-NEXT:    unpcklpd {{.*#+}} xmm6 = xmm6[0],xmm1[0]
3574; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm3
3575; SSE-NEXT:    mulpd %xmm6, %xmm3
3576; SSE-NEXT:    addpd %xmm0, %xmm3
3577; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm7
3578; SSE-NEXT:    mulpd %xmm6, %xmm7
3579; SSE-NEXT:    addpd %xmm5, %xmm7
3580; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm5
3581; SSE-NEXT:    mulpd %xmm6, %xmm5
3582; SSE-NEXT:    addpd %xmm4, %xmm5
3583; SSE-NEXT:    mulpd {{[0-9]+}}(%rsp), %xmm6
3584; SSE-NEXT:    addpd %xmm2, %xmm6
3585; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1,1]
3586; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm0
3587; SSE-NEXT:    mulpd %xmm1, %xmm0
3588; SSE-NEXT:    addpd %xmm6, %xmm0
3589; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm4
3590; SSE-NEXT:    mulpd %xmm1, %xmm4
3591; SSE-NEXT:    addpd %xmm5, %xmm4
3592; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm5
3593; SSE-NEXT:    mulpd %xmm1, %xmm5
3594; SSE-NEXT:    addpd %xmm7, %xmm5
3595; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm2
3596; SSE-NEXT:    mulpd %xmm2, %xmm1
3597; SSE-NEXT:    addpd %xmm3, %xmm1
3598; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm7
3599; SSE-NEXT:    movapd %xmm7, %xmm3
3600; SSE-NEXT:    unpcklpd {{.*#+}} xmm3 = xmm3[0],xmm7[0]
3601; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm2
3602; SSE-NEXT:    mulpd %xmm3, %xmm2
3603; SSE-NEXT:    addpd %xmm1, %xmm2
3604; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm1
3605; SSE-NEXT:    mulpd %xmm3, %xmm1
3606; SSE-NEXT:    addpd %xmm5, %xmm1
3607; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm5
3608; SSE-NEXT:    mulpd %xmm3, %xmm5
3609; SSE-NEXT:    addpd %xmm4, %xmm5
3610; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm4
3611; SSE-NEXT:    mulpd %xmm4, %xmm3
3612; SSE-NEXT:    addpd %xmm0, %xmm3
3613; SSE-NEXT:    unpckhpd {{.*#+}} xmm7 = xmm7[1,1]
3614; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm0
3615; SSE-NEXT:    mulpd %xmm7, %xmm0
3616; SSE-NEXT:    addpd %xmm3, %xmm0
3617; SSE-NEXT:    movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3618; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm0
3619; SSE-NEXT:    mulpd %xmm7, %xmm0
3620; SSE-NEXT:    addpd %xmm5, %xmm0
3621; SSE-NEXT:    movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3622; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm0
3623; SSE-NEXT:    mulpd %xmm7, %xmm0
3624; SSE-NEXT:    addpd %xmm1, %xmm0
3625; SSE-NEXT:    movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3626; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm0
3627; SSE-NEXT:    mulpd %xmm0, %xmm7
3628; SSE-NEXT:    addpd %xmm2, %xmm7
3629; SSE-NEXT:    movapd %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3630; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm1
3631; SSE-NEXT:    movapd %xmm1, %xmm0
3632; SSE-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
3633; SSE-NEXT:    movapd %xmm11, %xmm3
3634; SSE-NEXT:    movapd %xmm11, %xmm12
3635; SSE-NEXT:    mulpd %xmm0, %xmm3
3636; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1,1]
3637; SSE-NEXT:    movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
3638; SSE-NEXT:    movapd %xmm6, %xmm2
3639; SSE-NEXT:    mulpd %xmm1, %xmm2
3640; SSE-NEXT:    addpd %xmm3, %xmm2
3641; SSE-NEXT:    movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
3642; SSE-NEXT:    movapd %xmm11, %xmm3
3643; SSE-NEXT:    mulpd %xmm0, %xmm3
3644; SSE-NEXT:    movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
3645; SSE-NEXT:    movapd %xmm13, %xmm4
3646; SSE-NEXT:    mulpd %xmm1, %xmm4
3647; SSE-NEXT:    addpd %xmm3, %xmm4
3648; SSE-NEXT:    movapd %xmm8, %xmm3
3649; SSE-NEXT:    movapd %xmm8, %xmm14
3650; SSE-NEXT:    mulpd %xmm0, %xmm3
3651; SSE-NEXT:    movapd %xmm15, %xmm8
3652; SSE-NEXT:    movapd %xmm15, %xmm5
3653; SSE-NEXT:    mulpd %xmm1, %xmm5
3654; SSE-NEXT:    addpd %xmm3, %xmm5
3655; SSE-NEXT:    mulpd %xmm10, %xmm0
3656; SSE-NEXT:    mulpd %xmm9, %xmm1
3657; SSE-NEXT:    movapd %xmm9, %xmm10
3658; SSE-NEXT:    addpd %xmm0, %xmm1
3659; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm0
3660; SSE-NEXT:    movapd %xmm0, %xmm7
3661; SSE-NEXT:    unpcklpd {{.*#+}} xmm7 = xmm7[0],xmm0[0]
3662; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm3
3663; SSE-NEXT:    mulpd %xmm7, %xmm3
3664; SSE-NEXT:    addpd %xmm1, %xmm3
3665; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm1
3666; SSE-NEXT:    mulpd %xmm7, %xmm1
3667; SSE-NEXT:    addpd %xmm5, %xmm1
3668; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm5
3669; SSE-NEXT:    mulpd %xmm7, %xmm5
3670; SSE-NEXT:    addpd %xmm4, %xmm5
3671; SSE-NEXT:    mulpd {{[0-9]+}}(%rsp), %xmm7
3672; SSE-NEXT:    addpd %xmm2, %xmm7
3673; SSE-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1,1]
3674; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm2
3675; SSE-NEXT:    mulpd %xmm0, %xmm2
3676; SSE-NEXT:    addpd %xmm7, %xmm2
3677; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm4
3678; SSE-NEXT:    mulpd %xmm0, %xmm4
3679; SSE-NEXT:    addpd %xmm5, %xmm4
3680; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm5
3681; SSE-NEXT:    mulpd %xmm0, %xmm5
3682; SSE-NEXT:    addpd %xmm1, %xmm5
3683; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm1
3684; SSE-NEXT:    mulpd %xmm1, %xmm0
3685; SSE-NEXT:    addpd %xmm3, %xmm0
3686; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm1
3687; SSE-NEXT:    movapd %xmm1, %xmm7
3688; SSE-NEXT:    unpcklpd {{.*#+}} xmm7 = xmm7[0],xmm1[0]
3689; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm3
3690; SSE-NEXT:    mulpd %xmm7, %xmm3
3691; SSE-NEXT:    addpd %xmm0, %xmm3
3692; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm9
3693; SSE-NEXT:    mulpd %xmm7, %xmm9
3694; SSE-NEXT:    addpd %xmm5, %xmm9
3695; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm5
3696; SSE-NEXT:    mulpd %xmm7, %xmm5
3697; SSE-NEXT:    addpd %xmm4, %xmm5
3698; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm0
3699; SSE-NEXT:    mulpd %xmm0, %xmm7
3700; SSE-NEXT:    addpd %xmm2, %xmm7
3701; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1,1]
3702; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm0
3703; SSE-NEXT:    mulpd %xmm1, %xmm0
3704; SSE-NEXT:    addpd %xmm7, %xmm0
3705; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm4
3706; SSE-NEXT:    mulpd %xmm1, %xmm4
3707; SSE-NEXT:    addpd %xmm5, %xmm4
3708; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm7
3709; SSE-NEXT:    mulpd %xmm1, %xmm7
3710; SSE-NEXT:    addpd %xmm9, %xmm7
3711; SSE-NEXT:    mulpd {{[0-9]+}}(%rsp), %xmm1
3712; SSE-NEXT:    addpd %xmm3, %xmm1
3713; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm15
3714; SSE-NEXT:    movapd %xmm15, %xmm3
3715; SSE-NEXT:    unpcklpd {{.*#+}} xmm3 = xmm3[0],xmm15[0]
3716; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm2
3717; SSE-NEXT:    mulpd %xmm3, %xmm2
3718; SSE-NEXT:    addpd %xmm1, %xmm2
3719; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm1
3720; SSE-NEXT:    mulpd %xmm3, %xmm1
3721; SSE-NEXT:    addpd %xmm7, %xmm1
3722; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm7
3723; SSE-NEXT:    mulpd %xmm3, %xmm7
3724; SSE-NEXT:    addpd %xmm4, %xmm7
3725; SSE-NEXT:    mulpd {{[0-9]+}}(%rsp), %xmm3
3726; SSE-NEXT:    addpd %xmm0, %xmm3
3727; SSE-NEXT:    unpckhpd {{.*#+}} xmm15 = xmm15[1,1]
3728; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm0
3729; SSE-NEXT:    mulpd %xmm15, %xmm0
3730; SSE-NEXT:    addpd %xmm3, %xmm0
3731; SSE-NEXT:    movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3732; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm0
3733; SSE-NEXT:    mulpd %xmm15, %xmm0
3734; SSE-NEXT:    addpd %xmm7, %xmm0
3735; SSE-NEXT:    movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3736; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm0
3737; SSE-NEXT:    mulpd %xmm15, %xmm0
3738; SSE-NEXT:    addpd %xmm1, %xmm0
3739; SSE-NEXT:    movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3740; SSE-NEXT:    mulpd {{[0-9]+}}(%rsp), %xmm15
3741; SSE-NEXT:    addpd %xmm2, %xmm15
3742; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm1
3743; SSE-NEXT:    movapd %xmm1, %xmm0
3744; SSE-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
3745; SSE-NEXT:    movapd %xmm12, %xmm3
3746; SSE-NEXT:    mulpd %xmm0, %xmm3
3747; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1,1]
3748; SSE-NEXT:    movapd %xmm6, %xmm2
3749; SSE-NEXT:    movapd %xmm6, %xmm12
3750; SSE-NEXT:    mulpd %xmm1, %xmm2
3751; SSE-NEXT:    addpd %xmm3, %xmm2
3752; SSE-NEXT:    mulpd %xmm0, %xmm11
3753; SSE-NEXT:    movapd %xmm13, %xmm6
3754; SSE-NEXT:    movapd %xmm13, %xmm4
3755; SSE-NEXT:    mulpd %xmm1, %xmm4
3756; SSE-NEXT:    addpd %xmm11, %xmm4
3757; SSE-NEXT:    mulpd %xmm0, %xmm14
3758; SSE-NEXT:    movapd %xmm8, %xmm7
3759; SSE-NEXT:    mulpd %xmm1, %xmm7
3760; SSE-NEXT:    addpd %xmm14, %xmm7
3761; SSE-NEXT:    movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
3762; SSE-NEXT:    mulpd %xmm8, %xmm0
3763; SSE-NEXT:    movapd %xmm10, %xmm5
3764; SSE-NEXT:    mulpd %xmm10, %xmm1
3765; SSE-NEXT:    addpd %xmm0, %xmm1
3766; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm0
3767; SSE-NEXT:    movapd %xmm0, %xmm9
3768; SSE-NEXT:    unpcklpd {{.*#+}} xmm9 = xmm9[0],xmm0[0]
3769; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm3
3770; SSE-NEXT:    mulpd %xmm9, %xmm3
3771; SSE-NEXT:    addpd %xmm1, %xmm3
3772; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm1
3773; SSE-NEXT:    mulpd %xmm9, %xmm1
3774; SSE-NEXT:    addpd %xmm7, %xmm1
3775; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm7
3776; SSE-NEXT:    mulpd %xmm9, %xmm7
3777; SSE-NEXT:    addpd %xmm4, %xmm7
3778; SSE-NEXT:    mulpd {{[0-9]+}}(%rsp), %xmm9
3779; SSE-NEXT:    addpd %xmm2, %xmm9
3780; SSE-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1,1]
3781; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm2
3782; SSE-NEXT:    mulpd %xmm0, %xmm2
3783; SSE-NEXT:    addpd %xmm9, %xmm2
3784; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm4
3785; SSE-NEXT:    mulpd %xmm0, %xmm4
3786; SSE-NEXT:    addpd %xmm7, %xmm4
3787; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm7
3788; SSE-NEXT:    mulpd %xmm0, %xmm7
3789; SSE-NEXT:    addpd %xmm1, %xmm7
3790; SSE-NEXT:    mulpd {{[0-9]+}}(%rsp), %xmm0
3791; SSE-NEXT:    addpd %xmm3, %xmm0
3792; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm1
3793; SSE-NEXT:    movapd %xmm1, %xmm9
3794; SSE-NEXT:    unpcklpd {{.*#+}} xmm9 = xmm9[0],xmm1[0]
3795; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm3
3796; SSE-NEXT:    mulpd %xmm9, %xmm3
3797; SSE-NEXT:    addpd %xmm0, %xmm3
3798; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm10
3799; SSE-NEXT:    mulpd %xmm9, %xmm10
3800; SSE-NEXT:    addpd %xmm7, %xmm10
3801; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm7
3802; SSE-NEXT:    mulpd %xmm9, %xmm7
3803; SSE-NEXT:    addpd %xmm4, %xmm7
3804; SSE-NEXT:    mulpd {{[0-9]+}}(%rsp), %xmm9
3805; SSE-NEXT:    addpd %xmm2, %xmm9
3806; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1,1]
3807; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm0
3808; SSE-NEXT:    mulpd %xmm1, %xmm0
3809; SSE-NEXT:    addpd %xmm9, %xmm0
3810; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm9
3811; SSE-NEXT:    mulpd %xmm1, %xmm9
3812; SSE-NEXT:    addpd %xmm7, %xmm9
3813; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm7
3814; SSE-NEXT:    mulpd %xmm1, %xmm7
3815; SSE-NEXT:    addpd %xmm10, %xmm7
3816; SSE-NEXT:    mulpd {{[0-9]+}}(%rsp), %xmm1
3817; SSE-NEXT:    addpd %xmm3, %xmm1
3818; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm11
3819; SSE-NEXT:    movapd %xmm11, %xmm3
3820; SSE-NEXT:    unpcklpd {{.*#+}} xmm3 = xmm3[0],xmm11[0]
3821; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm2
3822; SSE-NEXT:    mulpd %xmm3, %xmm2
3823; SSE-NEXT:    addpd %xmm1, %xmm2
3824; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm1
3825; SSE-NEXT:    mulpd %xmm3, %xmm1
3826; SSE-NEXT:    addpd %xmm7, %xmm1
3827; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm7
3828; SSE-NEXT:    mulpd %xmm3, %xmm7
3829; SSE-NEXT:    addpd %xmm9, %xmm7
3830; SSE-NEXT:    mulpd {{[0-9]+}}(%rsp), %xmm3
3831; SSE-NEXT:    addpd %xmm0, %xmm3
3832; SSE-NEXT:    unpckhpd {{.*#+}} xmm11 = xmm11[1,1]
3833; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm0
3834; SSE-NEXT:    mulpd %xmm11, %xmm0
3835; SSE-NEXT:    addpd %xmm3, %xmm0
3836; SSE-NEXT:    movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3837; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm0
3838; SSE-NEXT:    mulpd %xmm11, %xmm0
3839; SSE-NEXT:    addpd %xmm7, %xmm0
3840; SSE-NEXT:    movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3841; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm0
3842; SSE-NEXT:    mulpd %xmm11, %xmm0
3843; SSE-NEXT:    addpd %xmm1, %xmm0
3844; SSE-NEXT:    movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3845; SSE-NEXT:    mulpd {{[0-9]+}}(%rsp), %xmm11
3846; SSE-NEXT:    addpd %xmm2, %xmm11
3847; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm1
3848; SSE-NEXT:    movapd %xmm1, %xmm0
3849; SSE-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
3850; SSE-NEXT:    movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
3851; SSE-NEXT:    movapd %xmm13, %xmm3
3852; SSE-NEXT:    mulpd %xmm0, %xmm3
3853; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1,1]
3854; SSE-NEXT:    movapd %xmm12, %xmm2
3855; SSE-NEXT:    mulpd %xmm1, %xmm2
3856; SSE-NEXT:    addpd %xmm3, %xmm2
3857; SSE-NEXT:    movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
3858; SSE-NEXT:    movapd %xmm14, %xmm3
3859; SSE-NEXT:    mulpd %xmm0, %xmm3
3860; SSE-NEXT:    movapd %xmm6, %xmm7
3861; SSE-NEXT:    mulpd %xmm1, %xmm7
3862; SSE-NEXT:    addpd %xmm3, %xmm7
3863; SSE-NEXT:    movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
3864; SSE-NEXT:    movapd %xmm4, %xmm3
3865; SSE-NEXT:    mulpd %xmm0, %xmm3
3866; SSE-NEXT:    movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
3867; SSE-NEXT:    movapd %xmm6, %xmm9
3868; SSE-NEXT:    mulpd %xmm1, %xmm9
3869; SSE-NEXT:    addpd %xmm3, %xmm9
3870; SSE-NEXT:    mulpd %xmm8, %xmm0
3871; SSE-NEXT:    mulpd %xmm5, %xmm1
3872; SSE-NEXT:    addpd %xmm0, %xmm1
3873; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm0
3874; SSE-NEXT:    movapd %xmm0, %xmm10
3875; SSE-NEXT:    unpcklpd {{.*#+}} xmm10 = xmm10[0],xmm0[0]
3876; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm3
3877; SSE-NEXT:    mulpd %xmm10, %xmm3
3878; SSE-NEXT:    addpd %xmm1, %xmm3
3879; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm12
3880; SSE-NEXT:    mulpd %xmm10, %xmm12
3881; SSE-NEXT:    addpd %xmm9, %xmm12
3882; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm9
3883; SSE-NEXT:    mulpd %xmm10, %xmm9
3884; SSE-NEXT:    addpd %xmm7, %xmm9
3885; SSE-NEXT:    mulpd {{[0-9]+}}(%rsp), %xmm10
3886; SSE-NEXT:    addpd %xmm2, %xmm10
3887; SSE-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1,1]
3888; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm1
3889; SSE-NEXT:    mulpd %xmm0, %xmm1
3890; SSE-NEXT:    addpd %xmm10, %xmm1
3891; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm10
3892; SSE-NEXT:    mulpd %xmm0, %xmm10
3893; SSE-NEXT:    addpd %xmm9, %xmm10
3894; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm9
3895; SSE-NEXT:    mulpd %xmm0, %xmm9
3896; SSE-NEXT:    addpd %xmm12, %xmm9
3897; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm2
3898; SSE-NEXT:    mulpd %xmm2, %xmm0
3899; SSE-NEXT:    addpd %xmm3, %xmm0
3900; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm7
3901; SSE-NEXT:    movapd %xmm7, %xmm3
3902; SSE-NEXT:    unpcklpd {{.*#+}} xmm3 = xmm3[0],xmm7[0]
3903; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm2
3904; SSE-NEXT:    mulpd %xmm3, %xmm2
3905; SSE-NEXT:    addpd %xmm0, %xmm2
3906; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm12
3907; SSE-NEXT:    mulpd %xmm3, %xmm12
3908; SSE-NEXT:    addpd %xmm9, %xmm12
3909; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm9
3910; SSE-NEXT:    mulpd %xmm3, %xmm9
3911; SSE-NEXT:    addpd %xmm10, %xmm9
3912; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm0
3913; SSE-NEXT:    mulpd %xmm0, %xmm3
3914; SSE-NEXT:    addpd %xmm1, %xmm3
3915; SSE-NEXT:    unpckhpd {{.*#+}} xmm7 = xmm7[1,1]
3916; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm0
3917; SSE-NEXT:    mulpd %xmm7, %xmm0
3918; SSE-NEXT:    addpd %xmm3, %xmm0
3919; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm10
3920; SSE-NEXT:    mulpd %xmm7, %xmm10
3921; SSE-NEXT:    addpd %xmm9, %xmm10
3922; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm9
3923; SSE-NEXT:    mulpd %xmm7, %xmm9
3924; SSE-NEXT:    addpd %xmm12, %xmm9
3925; SSE-NEXT:    mulpd {{[0-9]+}}(%rsp), %xmm7
3926; SSE-NEXT:    addpd %xmm2, %xmm7
3927; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm8
3928; SSE-NEXT:    movapd %xmm8, %xmm2
3929; SSE-NEXT:    unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm8[0]
3930; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm1
3931; SSE-NEXT:    mulpd %xmm2, %xmm1
3932; SSE-NEXT:    addpd %xmm7, %xmm1
3933; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm12
3934; SSE-NEXT:    mulpd %xmm2, %xmm12
3935; SSE-NEXT:    addpd %xmm9, %xmm12
3936; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm7
3937; SSE-NEXT:    mulpd %xmm2, %xmm7
3938; SSE-NEXT:    addpd %xmm10, %xmm7
3939; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm3
3940; SSE-NEXT:    mulpd %xmm3, %xmm2
3941; SSE-NEXT:    addpd %xmm0, %xmm2
3942; SSE-NEXT:    unpckhpd {{.*#+}} xmm8 = xmm8[1,1]
3943; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm0
3944; SSE-NEXT:    mulpd %xmm8, %xmm0
3945; SSE-NEXT:    addpd %xmm2, %xmm0
3946; SSE-NEXT:    movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3947; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm0
3948; SSE-NEXT:    mulpd %xmm8, %xmm0
3949; SSE-NEXT:    addpd %xmm7, %xmm0
3950; SSE-NEXT:    movapd %xmm0, (%rsp) # 16-byte Spill
3951; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm9
3952; SSE-NEXT:    mulpd %xmm8, %xmm9
3953; SSE-NEXT:    addpd %xmm12, %xmm9
3954; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm0
3955; SSE-NEXT:    mulpd %xmm0, %xmm8
3956; SSE-NEXT:    addpd %xmm1, %xmm8
3957; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm1
3958; SSE-NEXT:    movapd %xmm1, %xmm0
3959; SSE-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
3960; SSE-NEXT:    movapd %xmm13, %xmm12
3961; SSE-NEXT:    mulpd %xmm0, %xmm12
3962; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1,1]
3963; SSE-NEXT:    movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
3964; SSE-NEXT:    mulpd %xmm1, %xmm3
3965; SSE-NEXT:    addpd %xmm12, %xmm3
3966; SSE-NEXT:    movapd %xmm14, %xmm12
3967; SSE-NEXT:    movapd %xmm14, %xmm5
3968; SSE-NEXT:    mulpd %xmm0, %xmm12
3969; SSE-NEXT:    movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
3970; SSE-NEXT:    mulpd %xmm1, %xmm13
3971; SSE-NEXT:    addpd %xmm12, %xmm13
3972; SSE-NEXT:    mulpd %xmm0, %xmm4
3973; SSE-NEXT:    movapd %xmm6, %xmm14
3974; SSE-NEXT:    mulpd %xmm1, %xmm14
3975; SSE-NEXT:    addpd %xmm4, %xmm14
3976; SSE-NEXT:    movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
3977; SSE-NEXT:    mulpd %xmm6, %xmm0
3978; SSE-NEXT:    movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
3979; SSE-NEXT:    mulpd %xmm10, %xmm1
3980; SSE-NEXT:    addpd %xmm0, %xmm1
3981; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm2
3982; SSE-NEXT:    movapd %xmm2, %xmm0
3983; SSE-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0]
3984; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm12
3985; SSE-NEXT:    mulpd %xmm0, %xmm12
3986; SSE-NEXT:    addpd %xmm1, %xmm12
3987; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm1
3988; SSE-NEXT:    mulpd %xmm0, %xmm1
3989; SSE-NEXT:    addpd %xmm14, %xmm1
3990; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm14
3991; SSE-NEXT:    mulpd %xmm0, %xmm14
3992; SSE-NEXT:    addpd %xmm13, %xmm14
3993; SSE-NEXT:    mulpd {{[0-9]+}}(%rsp), %xmm0
3994; SSE-NEXT:    addpd %xmm3, %xmm0
3995; SSE-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1,1]
3996; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm13
3997; SSE-NEXT:    mulpd %xmm2, %xmm13
3998; SSE-NEXT:    addpd %xmm0, %xmm13
3999; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm0
4000; SSE-NEXT:    mulpd %xmm2, %xmm0
4001; SSE-NEXT:    addpd %xmm14, %xmm0
4002; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm14
4003; SSE-NEXT:    mulpd %xmm2, %xmm14
4004; SSE-NEXT:    addpd %xmm1, %xmm14
4005; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm1
4006; SSE-NEXT:    mulpd %xmm1, %xmm2
4007; SSE-NEXT:    addpd %xmm12, %xmm2
4008; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm12
4009; SSE-NEXT:    movapd %xmm12, %xmm1
4010; SSE-NEXT:    unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm12[0]
4011; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm3
4012; SSE-NEXT:    mulpd %xmm1, %xmm3
4013; SSE-NEXT:    addpd %xmm2, %xmm3
4014; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm2
4015; SSE-NEXT:    mulpd %xmm1, %xmm2
4016; SSE-NEXT:    addpd %xmm14, %xmm2
4017; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm14
4018; SSE-NEXT:    mulpd %xmm1, %xmm14
4019; SSE-NEXT:    addpd %xmm0, %xmm14
4020; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm0
4021; SSE-NEXT:    mulpd %xmm0, %xmm1
4022; SSE-NEXT:    addpd %xmm13, %xmm1
4023; SSE-NEXT:    unpckhpd {{.*#+}} xmm12 = xmm12[1,1]
4024; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm4
4025; SSE-NEXT:    mulpd %xmm12, %xmm4
4026; SSE-NEXT:    addpd %xmm1, %xmm4
4027; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm13
4028; SSE-NEXT:    mulpd %xmm12, %xmm13
4029; SSE-NEXT:    addpd %xmm14, %xmm13
4030; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm14
4031; SSE-NEXT:    mulpd %xmm12, %xmm14
4032; SSE-NEXT:    addpd %xmm2, %xmm14
4033; SSE-NEXT:    mulpd {{[0-9]+}}(%rsp), %xmm12
4034; SSE-NEXT:    addpd %xmm3, %xmm12
4035; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm2
4036; SSE-NEXT:    movapd %xmm2, %xmm3
4037; SSE-NEXT:    unpcklpd {{.*#+}} xmm3 = xmm3[0],xmm2[0]
4038; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm1
4039; SSE-NEXT:    mulpd %xmm3, %xmm1
4040; SSE-NEXT:    addpd %xmm12, %xmm1
4041; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm12
4042; SSE-NEXT:    mulpd %xmm3, %xmm12
4043; SSE-NEXT:    addpd %xmm14, %xmm12
4044; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm0
4045; SSE-NEXT:    mulpd %xmm3, %xmm0
4046; SSE-NEXT:    addpd %xmm13, %xmm0
4047; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm7
4048; SSE-NEXT:    mulpd %xmm7, %xmm3
4049; SSE-NEXT:    addpd %xmm4, %xmm3
4050; SSE-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1,1]
4051; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm14
4052; SSE-NEXT:    mulpd %xmm2, %xmm14
4053; SSE-NEXT:    addpd %xmm3, %xmm14
4054; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm13
4055; SSE-NEXT:    mulpd %xmm2, %xmm13
4056; SSE-NEXT:    addpd %xmm0, %xmm13
4057; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm7
4058; SSE-NEXT:    mulpd %xmm2, %xmm7
4059; SSE-NEXT:    addpd %xmm12, %xmm7
4060; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm0
4061; SSE-NEXT:    mulpd %xmm0, %xmm2
4062; SSE-NEXT:    addpd %xmm1, %xmm2
4063; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm1
4064; SSE-NEXT:    movapd %xmm1, %xmm0
4065; SSE-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
4066; SSE-NEXT:    movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
4067; SSE-NEXT:    mulpd %xmm0, %xmm12
4068; SSE-NEXT:    mulpd %xmm0, %xmm5
4069; SSE-NEXT:    movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
4070; SSE-NEXT:    mulpd %xmm0, %xmm3
4071; SSE-NEXT:    mulpd %xmm6, %xmm0
4072; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1,1]
4073; SSE-NEXT:    movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
4074; SSE-NEXT:    mulpd %xmm1, %xmm4
4075; SSE-NEXT:    addpd %xmm12, %xmm4
4076; SSE-NEXT:    movapd %xmm4, %xmm12
4077; SSE-NEXT:    movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
4078; SSE-NEXT:    mulpd %xmm1, %xmm4
4079; SSE-NEXT:    addpd %xmm5, %xmm4
4080; SSE-NEXT:    movapd %xmm4, %xmm5
4081; SSE-NEXT:    movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
4082; SSE-NEXT:    mulpd %xmm1, %xmm4
4083; SSE-NEXT:    addpd %xmm3, %xmm4
4084; SSE-NEXT:    movapd %xmm4, %xmm3
4085; SSE-NEXT:    mulpd %xmm10, %xmm1
4086; SSE-NEXT:    addpd %xmm0, %xmm1
4087; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm0
4088; SSE-NEXT:    movapd %xmm0, %xmm4
4089; SSE-NEXT:    unpcklpd {{.*#+}} xmm4 = xmm4[0],xmm0[0]
4090; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm10
4091; SSE-NEXT:    mulpd %xmm4, %xmm10
4092; SSE-NEXT:    addpd %xmm1, %xmm10
4093; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm1
4094; SSE-NEXT:    mulpd %xmm4, %xmm1
4095; SSE-NEXT:    addpd %xmm3, %xmm1
4096; SSE-NEXT:    movapd %xmm1, %xmm3
4097; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm1
4098; SSE-NEXT:    mulpd %xmm4, %xmm1
4099; SSE-NEXT:    addpd %xmm5, %xmm1
4100; SSE-NEXT:    movapd %xmm1, %xmm5
4101; SSE-NEXT:    mulpd {{[0-9]+}}(%rsp), %xmm4
4102; SSE-NEXT:    addpd %xmm12, %xmm4
4103; SSE-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1,1]
4104; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm1
4105; SSE-NEXT:    mulpd %xmm0, %xmm1
4106; SSE-NEXT:    addpd %xmm4, %xmm1
4107; SSE-NEXT:    movapd %xmm1, %xmm12
4108; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm6
4109; SSE-NEXT:    mulpd %xmm0, %xmm6
4110; SSE-NEXT:    addpd %xmm5, %xmm6
4111; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm1
4112; SSE-NEXT:    mulpd %xmm0, %xmm1
4113; SSE-NEXT:    addpd %xmm3, %xmm1
4114; SSE-NEXT:    movapd %xmm1, %xmm3
4115; SSE-NEXT:    mulpd {{[0-9]+}}(%rsp), %xmm0
4116; SSE-NEXT:    addpd %xmm10, %xmm0
4117; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm1
4118; SSE-NEXT:    movapd %xmm1, %xmm4
4119; SSE-NEXT:    unpcklpd {{.*#+}} xmm4 = xmm4[0],xmm1[0]
4120; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm5
4121; SSE-NEXT:    mulpd %xmm4, %xmm5
4122; SSE-NEXT:    addpd %xmm0, %xmm5
4123; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm0
4124; SSE-NEXT:    mulpd %xmm4, %xmm0
4125; SSE-NEXT:    addpd %xmm3, %xmm0
4126; SSE-NEXT:    movapd %xmm0, %xmm10
4127; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm0
4128; SSE-NEXT:    mulpd %xmm4, %xmm0
4129; SSE-NEXT:    addpd %xmm6, %xmm0
4130; SSE-NEXT:    movapd %xmm0, %xmm6
4131; SSE-NEXT:    mulpd {{[0-9]+}}(%rsp), %xmm4
4132; SSE-NEXT:    addpd %xmm12, %xmm4
4133; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1,1]
4134; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm0
4135; SSE-NEXT:    mulpd %xmm1, %xmm0
4136; SSE-NEXT:    addpd %xmm4, %xmm0
4137; SSE-NEXT:    movapd %xmm0, %xmm3
4138; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm0
4139; SSE-NEXT:    mulpd %xmm1, %xmm0
4140; SSE-NEXT:    addpd %xmm6, %xmm0
4141; SSE-NEXT:    movapd %xmm0, %xmm6
4142; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm0
4143; SSE-NEXT:    mulpd %xmm1, %xmm0
4144; SSE-NEXT:    addpd %xmm10, %xmm0
4145; SSE-NEXT:    movapd %xmm0, %xmm10
4146; SSE-NEXT:    mulpd {{[0-9]+}}(%rsp), %xmm1
4147; SSE-NEXT:    addpd %xmm5, %xmm1
4148; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm0
4149; SSE-NEXT:    movapd %xmm0, %xmm4
4150; SSE-NEXT:    unpcklpd {{.*#+}} xmm4 = xmm4[0],xmm0[0]
4151; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm5
4152; SSE-NEXT:    mulpd %xmm4, %xmm5
4153; SSE-NEXT:    addpd %xmm1, %xmm5
4154; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm1
4155; SSE-NEXT:    mulpd %xmm4, %xmm1
4156; SSE-NEXT:    addpd %xmm10, %xmm1
4157; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm10
4158; SSE-NEXT:    mulpd %xmm4, %xmm10
4159; SSE-NEXT:    addpd %xmm6, %xmm10
4160; SSE-NEXT:    mulpd {{[0-9]+}}(%rsp), %xmm4
4161; SSE-NEXT:    addpd %xmm3, %xmm4
4162; SSE-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1,1]
4163; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm3
4164; SSE-NEXT:    mulpd %xmm0, %xmm3
4165; SSE-NEXT:    addpd %xmm4, %xmm3
4166; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm4
4167; SSE-NEXT:    mulpd %xmm0, %xmm4
4168; SSE-NEXT:    addpd %xmm10, %xmm4
4169; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm6
4170; SSE-NEXT:    mulpd %xmm0, %xmm6
4171; SSE-NEXT:    addpd %xmm1, %xmm6
4172; SSE-NEXT:    mulpd {{[0-9]+}}(%rsp), %xmm0
4173; SSE-NEXT:    addpd %xmm5, %xmm0
4174; SSE-NEXT:    movapd %xmm3, 496(%rdi)
4175; SSE-NEXT:    movapd %xmm4, 480(%rdi)
4176; SSE-NEXT:    movapd %xmm6, 464(%rdi)
4177; SSE-NEXT:    movapd %xmm0, 448(%rdi)
4178; SSE-NEXT:    movapd %xmm14, 432(%rdi)
4179; SSE-NEXT:    movapd %xmm13, 416(%rdi)
4180; SSE-NEXT:    movapd %xmm7, 400(%rdi)
4181; SSE-NEXT:    movapd %xmm2, 384(%rdi)
4182; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4183; SSE-NEXT:    movaps %xmm0, 368(%rdi)
4184; SSE-NEXT:    movaps (%rsp), %xmm0 # 16-byte Reload
4185; SSE-NEXT:    movaps %xmm0, 352(%rdi)
4186; SSE-NEXT:    movapd %xmm9, 336(%rdi)
4187; SSE-NEXT:    movapd %xmm8, 320(%rdi)
4188; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4189; SSE-NEXT:    movaps %xmm0, 304(%rdi)
4190; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4191; SSE-NEXT:    movaps %xmm0, 288(%rdi)
4192; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4193; SSE-NEXT:    movaps %xmm0, 272(%rdi)
4194; SSE-NEXT:    movapd %xmm11, 256(%rdi)
4195; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4196; SSE-NEXT:    movaps %xmm0, 240(%rdi)
4197; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4198; SSE-NEXT:    movaps %xmm0, 224(%rdi)
4199; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4200; SSE-NEXT:    movaps %xmm0, 208(%rdi)
4201; SSE-NEXT:    movapd %xmm15, 192(%rdi)
4202; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4203; SSE-NEXT:    movaps %xmm0, 176(%rdi)
4204; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4205; SSE-NEXT:    movaps %xmm0, 160(%rdi)
4206; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4207; SSE-NEXT:    movaps %xmm0, 144(%rdi)
4208; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4209; SSE-NEXT:    movaps %xmm0, 128(%rdi)
4210; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4211; SSE-NEXT:    movaps %xmm0, 112(%rdi)
4212; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4213; SSE-NEXT:    movaps %xmm0, 96(%rdi)
4214; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4215; SSE-NEXT:    movaps %xmm0, 80(%rdi)
4216; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4217; SSE-NEXT:    movaps %xmm0, 64(%rdi)
4218; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4219; SSE-NEXT:    movaps %xmm0, 48(%rdi)
4220; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4221; SSE-NEXT:    movaps %xmm0, 32(%rdi)
4222; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4223; SSE-NEXT:    movaps %xmm0, 16(%rdi)
4224; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4225; SSE-NEXT:    movaps %xmm0, (%rdi)
4226; SSE-NEXT:    addq $328, %rsp # imm = 0x148
4227; SSE-NEXT:    retq
4228;
4229; AVX1-LABEL: test_mul8x8_f64:
4230; AVX1:       # %bb.0: # %entry
4231; AVX1-NEXT:    pushq %rbp
4232; AVX1-NEXT:    movq %rsp, %rbp
4233; AVX1-NEXT:    andq $-32, %rsp
4234; AVX1-NEXT:    subq $448, %rsp # imm = 0x1C0
4235; AVX1-NEXT:    vmovapd %ymm2, %ymm12
4236; AVX1-NEXT:    vmovapd %ymm0, (%rsp) # 32-byte Spill
4237; AVX1-NEXT:    movq %rdi, %rax
4238; AVX1-NEXT:    vmovapd 144(%rbp), %ymm13
4239; AVX1-NEXT:    vmovapd 112(%rbp), %ymm14
4240; AVX1-NEXT:    vbroadcastsd 272(%rbp), %ymm10
4241; AVX1-NEXT:    vmulpd %ymm1, %ymm10, %ymm8
4242; AVX1-NEXT:    vmovapd %ymm1, %ymm9
4243; AVX1-NEXT:    vmulpd %ymm0, %ymm10, %ymm0
4244; AVX1-NEXT:    vbroadcastsd 280(%rbp), %ymm10
4245; AVX1-NEXT:    vmulpd %ymm3, %ymm10, %ymm11
4246; AVX1-NEXT:    vaddpd %ymm11, %ymm8, %ymm1
4247; AVX1-NEXT:    vmulpd %ymm2, %ymm10, %ymm10
4248; AVX1-NEXT:    vaddpd %ymm0, %ymm10, %ymm0
4249; AVX1-NEXT:    vbroadcastsd 288(%rbp), %ymm10
4250; AVX1-NEXT:    vmulpd %ymm4, %ymm10, %ymm11
4251; AVX1-NEXT:    vaddpd %ymm0, %ymm11, %ymm0
4252; AVX1-NEXT:    vmulpd %ymm5, %ymm10, %ymm10
4253; AVX1-NEXT:    vaddpd %ymm1, %ymm10, %ymm1
4254; AVX1-NEXT:    vbroadcastsd 296(%rbp), %ymm10
4255; AVX1-NEXT:    vmulpd %ymm7, %ymm10, %ymm11
4256; AVX1-NEXT:    vaddpd %ymm1, %ymm11, %ymm1
4257; AVX1-NEXT:    vmulpd %ymm6, %ymm10, %ymm10
4258; AVX1-NEXT:    vaddpd %ymm0, %ymm10, %ymm0
4259; AVX1-NEXT:    vbroadcastsd 304(%rbp), %ymm10
4260; AVX1-NEXT:    vmulpd 16(%rbp), %ymm10, %ymm11
4261; AVX1-NEXT:    vaddpd %ymm0, %ymm11, %ymm0
4262; AVX1-NEXT:    vmulpd 48(%rbp), %ymm10, %ymm10
4263; AVX1-NEXT:    vaddpd %ymm1, %ymm10, %ymm1
4264; AVX1-NEXT:    vbroadcastsd 312(%rbp), %ymm10
4265; AVX1-NEXT:    vmulpd %ymm10, %ymm14, %ymm11
4266; AVX1-NEXT:    vaddpd %ymm1, %ymm11, %ymm1
4267; AVX1-NEXT:    vmulpd 80(%rbp), %ymm10, %ymm10
4268; AVX1-NEXT:    vaddpd %ymm0, %ymm10, %ymm0
4269; AVX1-NEXT:    vbroadcastsd 320(%rbp), %ymm10
4270; AVX1-NEXT:    vmulpd %ymm10, %ymm13, %ymm11
4271; AVX1-NEXT:    vaddpd %ymm0, %ymm11, %ymm0
4272; AVX1-NEXT:    vmulpd 176(%rbp), %ymm10, %ymm10
4273; AVX1-NEXT:    vaddpd %ymm1, %ymm10, %ymm1
4274; AVX1-NEXT:    vbroadcastsd 328(%rbp), %ymm10
4275; AVX1-NEXT:    vmulpd 240(%rbp), %ymm10, %ymm11
4276; AVX1-NEXT:    vaddpd %ymm1, %ymm11, %ymm1
4277; AVX1-NEXT:    vmovapd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4278; AVX1-NEXT:    vmulpd 208(%rbp), %ymm10, %ymm1
4279; AVX1-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
4280; AVX1-NEXT:    vmovapd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4281; AVX1-NEXT:    vbroadcastsd 336(%rbp), %ymm0
4282; AVX1-NEXT:    vmulpd %ymm0, %ymm9, %ymm1
4283; AVX1-NEXT:    vbroadcastsd 344(%rbp), %ymm10
4284; AVX1-NEXT:    vmulpd %ymm3, %ymm10, %ymm11
4285; AVX1-NEXT:    vmovapd %ymm3, %ymm8
4286; AVX1-NEXT:    vaddpd %ymm1, %ymm11, %ymm1
4287; AVX1-NEXT:    vmovapd (%rsp), %ymm15 # 32-byte Reload
4288; AVX1-NEXT:    vmulpd %ymm0, %ymm15, %ymm0
4289; AVX1-NEXT:    vmulpd %ymm2, %ymm10, %ymm10
4290; AVX1-NEXT:    vaddpd %ymm0, %ymm10, %ymm0
4291; AVX1-NEXT:    vbroadcastsd 352(%rbp), %ymm10
4292; AVX1-NEXT:    vmulpd %ymm4, %ymm10, %ymm11
4293; AVX1-NEXT:    vaddpd %ymm0, %ymm11, %ymm0
4294; AVX1-NEXT:    vmulpd %ymm5, %ymm10, %ymm10
4295; AVX1-NEXT:    vmovapd %ymm5, %ymm3
4296; AVX1-NEXT:    vaddpd %ymm1, %ymm10, %ymm1
4297; AVX1-NEXT:    vbroadcastsd 360(%rbp), %ymm10
4298; AVX1-NEXT:    vmulpd %ymm7, %ymm10, %ymm11
4299; AVX1-NEXT:    vaddpd %ymm1, %ymm11, %ymm1
4300; AVX1-NEXT:    vmulpd %ymm6, %ymm10, %ymm10
4301; AVX1-NEXT:    vaddpd %ymm0, %ymm10, %ymm0
4302; AVX1-NEXT:    vbroadcastsd 368(%rbp), %ymm10
4303; AVX1-NEXT:    vmovapd 16(%rbp), %ymm2
4304; AVX1-NEXT:    vmulpd %ymm2, %ymm10, %ymm11
4305; AVX1-NEXT:    vaddpd %ymm0, %ymm11, %ymm0
4306; AVX1-NEXT:    vmulpd 48(%rbp), %ymm10, %ymm10
4307; AVX1-NEXT:    vaddpd %ymm1, %ymm10, %ymm1
4308; AVX1-NEXT:    vbroadcastsd 376(%rbp), %ymm10
4309; AVX1-NEXT:    vmulpd %ymm10, %ymm14, %ymm11
4310; AVX1-NEXT:    vaddpd %ymm1, %ymm11, %ymm1
4311; AVX1-NEXT:    vmovapd 80(%rbp), %ymm2
4312; AVX1-NEXT:    vmulpd %ymm2, %ymm10, %ymm10
4313; AVX1-NEXT:    vaddpd %ymm0, %ymm10, %ymm0
4314; AVX1-NEXT:    vbroadcastsd 384(%rbp), %ymm10
4315; AVX1-NEXT:    vmulpd %ymm10, %ymm13, %ymm11
4316; AVX1-NEXT:    vaddpd %ymm0, %ymm11, %ymm0
4317; AVX1-NEXT:    vmovapd 176(%rbp), %ymm14
4318; AVX1-NEXT:    vmulpd %ymm10, %ymm14, %ymm10
4319; AVX1-NEXT:    vaddpd %ymm1, %ymm10, %ymm1
4320; AVX1-NEXT:    vbroadcastsd 392(%rbp), %ymm10
4321; AVX1-NEXT:    vmovapd 240(%rbp), %ymm2
4322; AVX1-NEXT:    vmulpd %ymm2, %ymm10, %ymm11
4323; AVX1-NEXT:    vaddpd %ymm1, %ymm11, %ymm1
4324; AVX1-NEXT:    vmovapd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4325; AVX1-NEXT:    vmulpd 208(%rbp), %ymm10, %ymm1
4326; AVX1-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
4327; AVX1-NEXT:    vmovapd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4328; AVX1-NEXT:    vbroadcastsd 400(%rbp), %ymm0
4329; AVX1-NEXT:    vmulpd %ymm0, %ymm9, %ymm1
4330; AVX1-NEXT:    vbroadcastsd 408(%rbp), %ymm10
4331; AVX1-NEXT:    vmovapd %ymm8, %ymm5
4332; AVX1-NEXT:    vmulpd %ymm10, %ymm8, %ymm11
4333; AVX1-NEXT:    vaddpd %ymm1, %ymm11, %ymm1
4334; AVX1-NEXT:    vmulpd %ymm0, %ymm15, %ymm0
4335; AVX1-NEXT:    vmulpd %ymm10, %ymm12, %ymm10
4336; AVX1-NEXT:    vaddpd %ymm0, %ymm10, %ymm0
4337; AVX1-NEXT:    vbroadcastsd 416(%rbp), %ymm10
4338; AVX1-NEXT:    vmulpd %ymm4, %ymm10, %ymm11
4339; AVX1-NEXT:    vaddpd %ymm0, %ymm11, %ymm0
4340; AVX1-NEXT:    vmulpd %ymm3, %ymm10, %ymm10
4341; AVX1-NEXT:    vmovapd %ymm3, %ymm2
4342; AVX1-NEXT:    vaddpd %ymm1, %ymm10, %ymm1
4343; AVX1-NEXT:    vbroadcastsd 424(%rbp), %ymm10
4344; AVX1-NEXT:    vmulpd %ymm7, %ymm10, %ymm11
4345; AVX1-NEXT:    vaddpd %ymm1, %ymm11, %ymm1
4346; AVX1-NEXT:    vmulpd %ymm6, %ymm10, %ymm10
4347; AVX1-NEXT:    vaddpd %ymm0, %ymm10, %ymm0
4348; AVX1-NEXT:    vbroadcastsd 432(%rbp), %ymm10
4349; AVX1-NEXT:    vmulpd 16(%rbp), %ymm10, %ymm11
4350; AVX1-NEXT:    vaddpd %ymm0, %ymm11, %ymm0
4351; AVX1-NEXT:    vmulpd 48(%rbp), %ymm10, %ymm10
4352; AVX1-NEXT:    vaddpd %ymm1, %ymm10, %ymm1
4353; AVX1-NEXT:    vbroadcastsd 440(%rbp), %ymm10
4354; AVX1-NEXT:    vmulpd 112(%rbp), %ymm10, %ymm11
4355; AVX1-NEXT:    vaddpd %ymm1, %ymm11, %ymm1
4356; AVX1-NEXT:    vmulpd 80(%rbp), %ymm10, %ymm10
4357; AVX1-NEXT:    vaddpd %ymm0, %ymm10, %ymm0
4358; AVX1-NEXT:    vbroadcastsd 448(%rbp), %ymm10
4359; AVX1-NEXT:    vmulpd %ymm10, %ymm13, %ymm11
4360; AVX1-NEXT:    vaddpd %ymm0, %ymm11, %ymm0
4361; AVX1-NEXT:    vmulpd %ymm10, %ymm14, %ymm10
4362; AVX1-NEXT:    vaddpd %ymm1, %ymm10, %ymm1
4363; AVX1-NEXT:    vbroadcastsd 456(%rbp), %ymm10
4364; AVX1-NEXT:    vmulpd 240(%rbp), %ymm10, %ymm11
4365; AVX1-NEXT:    vaddpd %ymm1, %ymm11, %ymm1
4366; AVX1-NEXT:    vmovapd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4367; AVX1-NEXT:    vmulpd 208(%rbp), %ymm10, %ymm1
4368; AVX1-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
4369; AVX1-NEXT:    vmovapd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4370; AVX1-NEXT:    vbroadcastsd 464(%rbp), %ymm0
4371; AVX1-NEXT:    vmulpd %ymm0, %ymm9, %ymm1
4372; AVX1-NEXT:    vmovapd %ymm9, %ymm13
4373; AVX1-NEXT:    vbroadcastsd 472(%rbp), %ymm10
4374; AVX1-NEXT:    vmulpd %ymm10, %ymm8, %ymm11
4375; AVX1-NEXT:    vaddpd %ymm1, %ymm11, %ymm1
4376; AVX1-NEXT:    vmulpd %ymm0, %ymm15, %ymm0
4377; AVX1-NEXT:    vmovapd %ymm15, %ymm9
4378; AVX1-NEXT:    vmulpd %ymm10, %ymm12, %ymm10
4379; AVX1-NEXT:    vaddpd %ymm0, %ymm10, %ymm0
4380; AVX1-NEXT:    vbroadcastsd 480(%rbp), %ymm10
4381; AVX1-NEXT:    vmulpd %ymm4, %ymm10, %ymm11
4382; AVX1-NEXT:    vmovapd %ymm4, %ymm3
4383; AVX1-NEXT:    vaddpd %ymm0, %ymm11, %ymm0
4384; AVX1-NEXT:    vmovapd %ymm2, %ymm15
4385; AVX1-NEXT:    vmulpd %ymm2, %ymm10, %ymm10
4386; AVX1-NEXT:    vaddpd %ymm1, %ymm10, %ymm1
4387; AVX1-NEXT:    vbroadcastsd 488(%rbp), %ymm10
4388; AVX1-NEXT:    vmovapd %ymm7, %ymm8
4389; AVX1-NEXT:    vmulpd %ymm7, %ymm10, %ymm11
4390; AVX1-NEXT:    vaddpd %ymm1, %ymm11, %ymm1
4391; AVX1-NEXT:    vmovapd %ymm6, %ymm7
4392; AVX1-NEXT:    vmulpd %ymm6, %ymm10, %ymm10
4393; AVX1-NEXT:    vaddpd %ymm0, %ymm10, %ymm0
4394; AVX1-NEXT:    vbroadcastsd 496(%rbp), %ymm10
4395; AVX1-NEXT:    vmulpd 16(%rbp), %ymm10, %ymm11
4396; AVX1-NEXT:    vaddpd %ymm0, %ymm11, %ymm0
4397; AVX1-NEXT:    vmovapd 48(%rbp), %ymm4
4398; AVX1-NEXT:    vmulpd %ymm4, %ymm10, %ymm10
4399; AVX1-NEXT:    vaddpd %ymm1, %ymm10, %ymm1
4400; AVX1-NEXT:    vbroadcastsd 504(%rbp), %ymm10
4401; AVX1-NEXT:    vmovapd 112(%rbp), %ymm2
4402; AVX1-NEXT:    vmulpd %ymm2, %ymm10, %ymm11
4403; AVX1-NEXT:    vaddpd %ymm1, %ymm11, %ymm1
4404; AVX1-NEXT:    vmovapd 80(%rbp), %ymm14
4405; AVX1-NEXT:    vmulpd %ymm10, %ymm14, %ymm10
4406; AVX1-NEXT:    vaddpd %ymm0, %ymm10, %ymm0
4407; AVX1-NEXT:    vbroadcastsd 512(%rbp), %ymm10
4408; AVX1-NEXT:    vmulpd 144(%rbp), %ymm10, %ymm11
4409; AVX1-NEXT:    vaddpd %ymm0, %ymm11, %ymm0
4410; AVX1-NEXT:    vmovapd 176(%rbp), %ymm2
4411; AVX1-NEXT:    vmulpd %ymm2, %ymm10, %ymm10
4412; AVX1-NEXT:    vaddpd %ymm1, %ymm10, %ymm1
4413; AVX1-NEXT:    vbroadcastsd 520(%rbp), %ymm10
4414; AVX1-NEXT:    vmulpd 240(%rbp), %ymm10, %ymm11
4415; AVX1-NEXT:    vaddpd %ymm1, %ymm11, %ymm1
4416; AVX1-NEXT:    vmovapd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4417; AVX1-NEXT:    vmulpd 208(%rbp), %ymm10, %ymm1
4418; AVX1-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
4419; AVX1-NEXT:    vmovapd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4420; AVX1-NEXT:    vbroadcastsd 528(%rbp), %ymm0
4421; AVX1-NEXT:    vmulpd %ymm0, %ymm13, %ymm1
4422; AVX1-NEXT:    vbroadcastsd 536(%rbp), %ymm10
4423; AVX1-NEXT:    vmulpd %ymm5, %ymm10, %ymm11
4424; AVX1-NEXT:    vmovapd %ymm5, %ymm6
4425; AVX1-NEXT:    vaddpd %ymm1, %ymm11, %ymm1
4426; AVX1-NEXT:    vmulpd %ymm0, %ymm9, %ymm0
4427; AVX1-NEXT:    vmulpd %ymm10, %ymm12, %ymm10
4428; AVX1-NEXT:    vmovapd %ymm12, %ymm5
4429; AVX1-NEXT:    vaddpd %ymm0, %ymm10, %ymm0
4430; AVX1-NEXT:    vbroadcastsd 544(%rbp), %ymm10
4431; AVX1-NEXT:    vmulpd %ymm3, %ymm10, %ymm11
4432; AVX1-NEXT:    vmovapd %ymm3, %ymm12
4433; AVX1-NEXT:    vaddpd %ymm0, %ymm11, %ymm0
4434; AVX1-NEXT:    vmulpd %ymm10, %ymm15, %ymm10
4435; AVX1-NEXT:    vaddpd %ymm1, %ymm10, %ymm1
4436; AVX1-NEXT:    vbroadcastsd 552(%rbp), %ymm10
4437; AVX1-NEXT:    vmulpd %ymm10, %ymm8, %ymm11
4438; AVX1-NEXT:    vaddpd %ymm1, %ymm11, %ymm1
4439; AVX1-NEXT:    vmulpd %ymm7, %ymm10, %ymm10
4440; AVX1-NEXT:    vaddpd %ymm0, %ymm10, %ymm0
4441; AVX1-NEXT:    vbroadcastsd 560(%rbp), %ymm10
4442; AVX1-NEXT:    vmulpd 16(%rbp), %ymm10, %ymm11
4443; AVX1-NEXT:    vaddpd %ymm0, %ymm11, %ymm0
4444; AVX1-NEXT:    vmulpd %ymm4, %ymm10, %ymm10
4445; AVX1-NEXT:    vmovapd %ymm4, %ymm3
4446; AVX1-NEXT:    vaddpd %ymm1, %ymm10, %ymm1
4447; AVX1-NEXT:    vbroadcastsd 568(%rbp), %ymm10
4448; AVX1-NEXT:    vmulpd 112(%rbp), %ymm10, %ymm11
4449; AVX1-NEXT:    vaddpd %ymm1, %ymm11, %ymm1
4450; AVX1-NEXT:    vmulpd %ymm10, %ymm14, %ymm10
4451; AVX1-NEXT:    vaddpd %ymm0, %ymm10, %ymm0
4452; AVX1-NEXT:    vbroadcastsd 576(%rbp), %ymm10
4453; AVX1-NEXT:    vmovapd 144(%rbp), %ymm4
4454; AVX1-NEXT:    vmulpd %ymm4, %ymm10, %ymm11
4455; AVX1-NEXT:    vaddpd %ymm0, %ymm11, %ymm0
4456; AVX1-NEXT:    vmulpd %ymm2, %ymm10, %ymm10
4457; AVX1-NEXT:    vaddpd %ymm1, %ymm10, %ymm1
4458; AVX1-NEXT:    vbroadcastsd 584(%rbp), %ymm10
4459; AVX1-NEXT:    vmovapd 240(%rbp), %ymm14
4460; AVX1-NEXT:    vmulpd %ymm10, %ymm14, %ymm11
4461; AVX1-NEXT:    vaddpd %ymm1, %ymm11, %ymm1
4462; AVX1-NEXT:    vmovapd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4463; AVX1-NEXT:    vmovapd 208(%rbp), %ymm2
4464; AVX1-NEXT:    vmulpd %ymm2, %ymm10, %ymm1
4465; AVX1-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
4466; AVX1-NEXT:    vmovapd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4467; AVX1-NEXT:    vbroadcastsd 592(%rbp), %ymm0
4468; AVX1-NEXT:    vmulpd %ymm0, %ymm13, %ymm1
4469; AVX1-NEXT:    vbroadcastsd 600(%rbp), %ymm10
4470; AVX1-NEXT:    vmulpd %ymm6, %ymm10, %ymm11
4471; AVX1-NEXT:    vaddpd %ymm1, %ymm11, %ymm1
4472; AVX1-NEXT:    vmulpd %ymm0, %ymm9, %ymm0
4473; AVX1-NEXT:    vmulpd %ymm5, %ymm10, %ymm10
4474; AVX1-NEXT:    vaddpd %ymm0, %ymm10, %ymm0
4475; AVX1-NEXT:    vbroadcastsd 608(%rbp), %ymm10
4476; AVX1-NEXT:    vmulpd %ymm10, %ymm12, %ymm11
4477; AVX1-NEXT:    vaddpd %ymm0, %ymm11, %ymm0
4478; AVX1-NEXT:    vmulpd %ymm10, %ymm15, %ymm10
4479; AVX1-NEXT:    vaddpd %ymm1, %ymm10, %ymm1
4480; AVX1-NEXT:    vbroadcastsd 616(%rbp), %ymm10
4481; AVX1-NEXT:    vmulpd %ymm10, %ymm8, %ymm11
4482; AVX1-NEXT:    vaddpd %ymm1, %ymm11, %ymm1
4483; AVX1-NEXT:    vmulpd %ymm7, %ymm10, %ymm10
4484; AVX1-NEXT:    vaddpd %ymm0, %ymm10, %ymm0
4485; AVX1-NEXT:    vbroadcastsd 624(%rbp), %ymm10
4486; AVX1-NEXT:    vmulpd 16(%rbp), %ymm10, %ymm11
4487; AVX1-NEXT:    vaddpd %ymm0, %ymm11, %ymm0
4488; AVX1-NEXT:    vmulpd %ymm3, %ymm10, %ymm10
4489; AVX1-NEXT:    vaddpd %ymm1, %ymm10, %ymm1
4490; AVX1-NEXT:    vbroadcastsd 632(%rbp), %ymm10
4491; AVX1-NEXT:    vmovapd 112(%rbp), %ymm3
4492; AVX1-NEXT:    vmulpd %ymm3, %ymm10, %ymm11
4493; AVX1-NEXT:    vaddpd %ymm1, %ymm11, %ymm1
4494; AVX1-NEXT:    vmovapd 80(%rbp), %ymm3
4495; AVX1-NEXT:    vmulpd %ymm3, %ymm10, %ymm10
4496; AVX1-NEXT:    vaddpd %ymm0, %ymm10, %ymm0
4497; AVX1-NEXT:    vbroadcastsd 640(%rbp), %ymm10
4498; AVX1-NEXT:    vmulpd %ymm4, %ymm10, %ymm11
4499; AVX1-NEXT:    vaddpd %ymm0, %ymm11, %ymm0
4500; AVX1-NEXT:    vmovapd 176(%rbp), %ymm3
4501; AVX1-NEXT:    vmulpd %ymm3, %ymm10, %ymm10
4502; AVX1-NEXT:    vaddpd %ymm1, %ymm10, %ymm1
4503; AVX1-NEXT:    vbroadcastsd 648(%rbp), %ymm10
4504; AVX1-NEXT:    vmovapd %ymm14, %ymm4
4505; AVX1-NEXT:    vmulpd %ymm10, %ymm14, %ymm11
4506; AVX1-NEXT:    vaddpd %ymm1, %ymm11, %ymm1
4507; AVX1-NEXT:    vmovapd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4508; AVX1-NEXT:    vmulpd %ymm2, %ymm10, %ymm1
4509; AVX1-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
4510; AVX1-NEXT:    vmovapd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4511; AVX1-NEXT:    vbroadcastsd 656(%rbp), %ymm2
4512; AVX1-NEXT:    vmovapd %ymm13, %ymm3
4513; AVX1-NEXT:    vmulpd %ymm2, %ymm13, %ymm1
4514; AVX1-NEXT:    vbroadcastsd 664(%rbp), %ymm0
4515; AVX1-NEXT:    vmulpd %ymm0, %ymm6, %ymm14
4516; AVX1-NEXT:    vmovapd %ymm6, %ymm10
4517; AVX1-NEXT:    vaddpd %ymm1, %ymm14, %ymm1
4518; AVX1-NEXT:    vmulpd %ymm2, %ymm9, %ymm2
4519; AVX1-NEXT:    vmulpd %ymm0, %ymm5, %ymm0
4520; AVX1-NEXT:    vmovapd %ymm5, %ymm6
4521; AVX1-NEXT:    vaddpd %ymm0, %ymm2, %ymm0
4522; AVX1-NEXT:    vbroadcastsd 672(%rbp), %ymm2
4523; AVX1-NEXT:    vmulpd %ymm2, %ymm12, %ymm14
4524; AVX1-NEXT:    vaddpd %ymm0, %ymm14, %ymm0
4525; AVX1-NEXT:    vmulpd %ymm2, %ymm15, %ymm2
4526; AVX1-NEXT:    vaddpd %ymm2, %ymm1, %ymm1
4527; AVX1-NEXT:    vbroadcastsd 680(%rbp), %ymm2
4528; AVX1-NEXT:    vmulpd %ymm2, %ymm8, %ymm14
4529; AVX1-NEXT:    vaddpd %ymm1, %ymm14, %ymm1
4530; AVX1-NEXT:    vmulpd %ymm2, %ymm7, %ymm2
4531; AVX1-NEXT:    vaddpd %ymm2, %ymm0, %ymm0
4532; AVX1-NEXT:    vbroadcastsd 688(%rbp), %ymm2
4533; AVX1-NEXT:    vmovapd 16(%rbp), %ymm11
4534; AVX1-NEXT:    vmulpd %ymm2, %ymm11, %ymm14
4535; AVX1-NEXT:    vaddpd %ymm0, %ymm14, %ymm0
4536; AVX1-NEXT:    vmulpd 48(%rbp), %ymm2, %ymm2
4537; AVX1-NEXT:    vaddpd %ymm2, %ymm1, %ymm1
4538; AVX1-NEXT:    vbroadcastsd 696(%rbp), %ymm2
4539; AVX1-NEXT:    vmovapd 112(%rbp), %ymm5
4540; AVX1-NEXT:    vmulpd %ymm2, %ymm5, %ymm14
4541; AVX1-NEXT:    vaddpd %ymm1, %ymm14, %ymm1
4542; AVX1-NEXT:    vmovapd 80(%rbp), %ymm5
4543; AVX1-NEXT:    vmulpd %ymm2, %ymm5, %ymm2
4544; AVX1-NEXT:    vaddpd %ymm2, %ymm0, %ymm0
4545; AVX1-NEXT:    vbroadcastsd 704(%rbp), %ymm2
4546; AVX1-NEXT:    vmulpd 144(%rbp), %ymm2, %ymm14
4547; AVX1-NEXT:    vaddpd %ymm0, %ymm14, %ymm0
4548; AVX1-NEXT:    vmovapd 176(%rbp), %ymm13
4549; AVX1-NEXT:    vmulpd %ymm2, %ymm13, %ymm2
4550; AVX1-NEXT:    vaddpd %ymm2, %ymm1, %ymm1
4551; AVX1-NEXT:    vbroadcastsd 712(%rbp), %ymm2
4552; AVX1-NEXT:    vmulpd %ymm2, %ymm4, %ymm14
4553; AVX1-NEXT:    vaddpd %ymm1, %ymm14, %ymm1
4554; AVX1-NEXT:    vmovapd 208(%rbp), %ymm14
4555; AVX1-NEXT:    vmulpd %ymm2, %ymm14, %ymm2
4556; AVX1-NEXT:    vaddpd %ymm2, %ymm0, %ymm0
4557; AVX1-NEXT:    vbroadcastsd 720(%rbp), %ymm2
4558; AVX1-NEXT:    vmulpd %ymm2, %ymm3, %ymm3
4559; AVX1-NEXT:    vmulpd %ymm2, %ymm9, %ymm2
4560; AVX1-NEXT:    vbroadcastsd 728(%rbp), %ymm4
4561; AVX1-NEXT:    vmulpd %ymm4, %ymm10, %ymm5
4562; AVX1-NEXT:    vaddpd %ymm5, %ymm3, %ymm3
4563; AVX1-NEXT:    vmulpd %ymm4, %ymm6, %ymm4
4564; AVX1-NEXT:    vaddpd %ymm4, %ymm2, %ymm2
4565; AVX1-NEXT:    vbroadcastsd 736(%rbp), %ymm4
4566; AVX1-NEXT:    vmulpd %ymm4, %ymm12, %ymm5
4567; AVX1-NEXT:    vaddpd %ymm5, %ymm2, %ymm2
4568; AVX1-NEXT:    vmulpd %ymm4, %ymm15, %ymm4
4569; AVX1-NEXT:    vaddpd %ymm4, %ymm3, %ymm3
4570; AVX1-NEXT:    vbroadcastsd 744(%rbp), %ymm4
4571; AVX1-NEXT:    vmulpd %ymm4, %ymm8, %ymm5
4572; AVX1-NEXT:    vaddpd %ymm5, %ymm3, %ymm3
4573; AVX1-NEXT:    vmulpd %ymm4, %ymm7, %ymm4
4574; AVX1-NEXT:    vaddpd %ymm4, %ymm2, %ymm2
4575; AVX1-NEXT:    vbroadcastsd 752(%rbp), %ymm4
4576; AVX1-NEXT:    vmulpd %ymm4, %ymm11, %ymm5
4577; AVX1-NEXT:    vaddpd %ymm5, %ymm2, %ymm2
4578; AVX1-NEXT:    vmulpd 48(%rbp), %ymm4, %ymm4
4579; AVX1-NEXT:    vaddpd %ymm4, %ymm3, %ymm3
4580; AVX1-NEXT:    vbroadcastsd 760(%rbp), %ymm4
4581; AVX1-NEXT:    vmulpd 112(%rbp), %ymm4, %ymm5
4582; AVX1-NEXT:    vaddpd %ymm5, %ymm3, %ymm3
4583; AVX1-NEXT:    vmulpd 80(%rbp), %ymm4, %ymm4
4584; AVX1-NEXT:    vaddpd %ymm4, %ymm2, %ymm2
4585; AVX1-NEXT:    vbroadcastsd 768(%rbp), %ymm4
4586; AVX1-NEXT:    vmulpd 144(%rbp), %ymm4, %ymm5
4587; AVX1-NEXT:    vaddpd %ymm5, %ymm2, %ymm2
4588; AVX1-NEXT:    vmulpd %ymm4, %ymm13, %ymm4
4589; AVX1-NEXT:    vaddpd %ymm4, %ymm3, %ymm3
4590; AVX1-NEXT:    vbroadcastsd 776(%rbp), %ymm4
4591; AVX1-NEXT:    vmulpd 240(%rbp), %ymm4, %ymm5
4592; AVX1-NEXT:    vaddpd %ymm5, %ymm3, %ymm3
4593; AVX1-NEXT:    vmulpd %ymm4, %ymm14, %ymm4
4594; AVX1-NEXT:    vaddpd %ymm4, %ymm2, %ymm2
4595; AVX1-NEXT:    vmovapd %ymm3, 480(%rdi)
4596; AVX1-NEXT:    vmovapd %ymm2, 448(%rdi)
4597; AVX1-NEXT:    vmovapd %ymm1, 416(%rdi)
4598; AVX1-NEXT:    vmovapd %ymm0, 384(%rdi)
4599; AVX1-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
4600; AVX1-NEXT:    vmovaps %ymm0, 352(%rdi)
4601; AVX1-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
4602; AVX1-NEXT:    vmovaps %ymm0, 320(%rdi)
4603; AVX1-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
4604; AVX1-NEXT:    vmovaps %ymm0, 288(%rdi)
4605; AVX1-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
4606; AVX1-NEXT:    vmovaps %ymm0, 256(%rdi)
4607; AVX1-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
4608; AVX1-NEXT:    vmovaps %ymm0, 224(%rdi)
4609; AVX1-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
4610; AVX1-NEXT:    vmovaps %ymm0, 192(%rdi)
4611; AVX1-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
4612; AVX1-NEXT:    vmovaps %ymm0, 160(%rdi)
4613; AVX1-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
4614; AVX1-NEXT:    vmovaps %ymm0, 128(%rdi)
4615; AVX1-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
4616; AVX1-NEXT:    vmovaps %ymm0, 96(%rdi)
4617; AVX1-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
4618; AVX1-NEXT:    vmovaps %ymm0, 64(%rdi)
4619; AVX1-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
4620; AVX1-NEXT:    vmovaps %ymm0, 32(%rdi)
4621; AVX1-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
4622; AVX1-NEXT:    vmovaps %ymm0, (%rdi)
4623; AVX1-NEXT:    movq %rbp, %rsp
4624; AVX1-NEXT:    popq %rbp
4625; AVX1-NEXT:    vzeroupper
4626; AVX1-NEXT:    retq
4627;
4628; AVX2-LABEL: test_mul8x8_f64:
4629; AVX2:       # %bb.0: # %entry
4630; AVX2-NEXT:    pushq %rbp
4631; AVX2-NEXT:    movq %rsp, %rbp
4632; AVX2-NEXT:    andq $-32, %rsp
4633; AVX2-NEXT:    subq $448, %rsp # imm = 0x1C0
4634; AVX2-NEXT:    vmovapd %ymm2, %ymm12
4635; AVX2-NEXT:    vmovapd %ymm0, (%rsp) # 32-byte Spill
4636; AVX2-NEXT:    movq %rdi, %rax
4637; AVX2-NEXT:    vmovapd 144(%rbp), %ymm13
4638; AVX2-NEXT:    vmovapd 112(%rbp), %ymm14
4639; AVX2-NEXT:    vbroadcastsd 272(%rbp), %ymm10
4640; AVX2-NEXT:    vmulpd %ymm1, %ymm10, %ymm8
4641; AVX2-NEXT:    vmovapd %ymm1, %ymm9
4642; AVX2-NEXT:    vmulpd %ymm0, %ymm10, %ymm0
4643; AVX2-NEXT:    vbroadcastsd 280(%rbp), %ymm10
4644; AVX2-NEXT:    vmulpd %ymm3, %ymm10, %ymm11
4645; AVX2-NEXT:    vaddpd %ymm11, %ymm8, %ymm1
4646; AVX2-NEXT:    vmulpd %ymm2, %ymm10, %ymm10
4647; AVX2-NEXT:    vaddpd %ymm0, %ymm10, %ymm0
4648; AVX2-NEXT:    vbroadcastsd 288(%rbp), %ymm10
4649; AVX2-NEXT:    vmulpd %ymm4, %ymm10, %ymm11
4650; AVX2-NEXT:    vaddpd %ymm0, %ymm11, %ymm0
4651; AVX2-NEXT:    vmulpd %ymm5, %ymm10, %ymm10
4652; AVX2-NEXT:    vaddpd %ymm1, %ymm10, %ymm1
4653; AVX2-NEXT:    vbroadcastsd 296(%rbp), %ymm10
4654; AVX2-NEXT:    vmulpd %ymm7, %ymm10, %ymm11
4655; AVX2-NEXT:    vaddpd %ymm1, %ymm11, %ymm1
4656; AVX2-NEXT:    vmulpd %ymm6, %ymm10, %ymm10
4657; AVX2-NEXT:    vaddpd %ymm0, %ymm10, %ymm0
4658; AVX2-NEXT:    vbroadcastsd 304(%rbp), %ymm10
4659; AVX2-NEXT:    vmulpd 16(%rbp), %ymm10, %ymm11
4660; AVX2-NEXT:    vaddpd %ymm0, %ymm11, %ymm0
4661; AVX2-NEXT:    vmulpd 48(%rbp), %ymm10, %ymm10
4662; AVX2-NEXT:    vaddpd %ymm1, %ymm10, %ymm1
4663; AVX2-NEXT:    vbroadcastsd 312(%rbp), %ymm10
4664; AVX2-NEXT:    vmulpd %ymm10, %ymm14, %ymm11
4665; AVX2-NEXT:    vaddpd %ymm1, %ymm11, %ymm1
4666; AVX2-NEXT:    vmulpd 80(%rbp), %ymm10, %ymm10
4667; AVX2-NEXT:    vaddpd %ymm0, %ymm10, %ymm0
4668; AVX2-NEXT:    vbroadcastsd 320(%rbp), %ymm10
4669; AVX2-NEXT:    vmulpd %ymm10, %ymm13, %ymm11
4670; AVX2-NEXT:    vaddpd %ymm0, %ymm11, %ymm0
4671; AVX2-NEXT:    vmulpd 176(%rbp), %ymm10, %ymm10
4672; AVX2-NEXT:    vaddpd %ymm1, %ymm10, %ymm1
4673; AVX2-NEXT:    vbroadcastsd 328(%rbp), %ymm10
4674; AVX2-NEXT:    vmulpd 240(%rbp), %ymm10, %ymm11
4675; AVX2-NEXT:    vaddpd %ymm1, %ymm11, %ymm1
4676; AVX2-NEXT:    vmovapd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4677; AVX2-NEXT:    vmulpd 208(%rbp), %ymm10, %ymm1
4678; AVX2-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
4679; AVX2-NEXT:    vmovapd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4680; AVX2-NEXT:    vbroadcastsd 336(%rbp), %ymm0
4681; AVX2-NEXT:    vmulpd %ymm0, %ymm9, %ymm1
4682; AVX2-NEXT:    vbroadcastsd 344(%rbp), %ymm10
4683; AVX2-NEXT:    vmulpd %ymm3, %ymm10, %ymm11
4684; AVX2-NEXT:    vmovapd %ymm3, %ymm8
4685; AVX2-NEXT:    vaddpd %ymm1, %ymm11, %ymm1
4686; AVX2-NEXT:    vmovapd (%rsp), %ymm15 # 32-byte Reload
4687; AVX2-NEXT:    vmulpd %ymm0, %ymm15, %ymm0
4688; AVX2-NEXT:    vmulpd %ymm2, %ymm10, %ymm10
4689; AVX2-NEXT:    vaddpd %ymm0, %ymm10, %ymm0
4690; AVX2-NEXT:    vbroadcastsd 352(%rbp), %ymm10
4691; AVX2-NEXT:    vmulpd %ymm4, %ymm10, %ymm11
4692; AVX2-NEXT:    vaddpd %ymm0, %ymm11, %ymm0
4693; AVX2-NEXT:    vmulpd %ymm5, %ymm10, %ymm10
4694; AVX2-NEXT:    vmovapd %ymm5, %ymm3
4695; AVX2-NEXT:    vaddpd %ymm1, %ymm10, %ymm1
4696; AVX2-NEXT:    vbroadcastsd 360(%rbp), %ymm10
4697; AVX2-NEXT:    vmulpd %ymm7, %ymm10, %ymm11
4698; AVX2-NEXT:    vaddpd %ymm1, %ymm11, %ymm1
4699; AVX2-NEXT:    vmulpd %ymm6, %ymm10, %ymm10
4700; AVX2-NEXT:    vaddpd %ymm0, %ymm10, %ymm0
4701; AVX2-NEXT:    vbroadcastsd 368(%rbp), %ymm10
4702; AVX2-NEXT:    vmovapd 16(%rbp), %ymm2
4703; AVX2-NEXT:    vmulpd %ymm2, %ymm10, %ymm11
4704; AVX2-NEXT:    vaddpd %ymm0, %ymm11, %ymm0
4705; AVX2-NEXT:    vmulpd 48(%rbp), %ymm10, %ymm10
4706; AVX2-NEXT:    vaddpd %ymm1, %ymm10, %ymm1
4707; AVX2-NEXT:    vbroadcastsd 376(%rbp), %ymm10
4708; AVX2-NEXT:    vmulpd %ymm10, %ymm14, %ymm11
4709; AVX2-NEXT:    vaddpd %ymm1, %ymm11, %ymm1
4710; AVX2-NEXT:    vmovapd 80(%rbp), %ymm2
4711; AVX2-NEXT:    vmulpd %ymm2, %ymm10, %ymm10
4712; AVX2-NEXT:    vaddpd %ymm0, %ymm10, %ymm0
4713; AVX2-NEXT:    vbroadcastsd 384(%rbp), %ymm10
4714; AVX2-NEXT:    vmulpd %ymm10, %ymm13, %ymm11
4715; AVX2-NEXT:    vaddpd %ymm0, %ymm11, %ymm0
4716; AVX2-NEXT:    vmovapd 176(%rbp), %ymm14
4717; AVX2-NEXT:    vmulpd %ymm10, %ymm14, %ymm10
4718; AVX2-NEXT:    vaddpd %ymm1, %ymm10, %ymm1
4719; AVX2-NEXT:    vbroadcastsd 392(%rbp), %ymm10
4720; AVX2-NEXT:    vmovapd 240(%rbp), %ymm2
4721; AVX2-NEXT:    vmulpd %ymm2, %ymm10, %ymm11
4722; AVX2-NEXT:    vaddpd %ymm1, %ymm11, %ymm1
4723; AVX2-NEXT:    vmovapd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4724; AVX2-NEXT:    vmulpd 208(%rbp), %ymm10, %ymm1
4725; AVX2-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
4726; AVX2-NEXT:    vmovapd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4727; AVX2-NEXT:    vbroadcastsd 400(%rbp), %ymm0
4728; AVX2-NEXT:    vmulpd %ymm0, %ymm9, %ymm1
4729; AVX2-NEXT:    vbroadcastsd 408(%rbp), %ymm10
4730; AVX2-NEXT:    vmovapd %ymm8, %ymm5
4731; AVX2-NEXT:    vmulpd %ymm10, %ymm8, %ymm11
4732; AVX2-NEXT:    vaddpd %ymm1, %ymm11, %ymm1
4733; AVX2-NEXT:    vmulpd %ymm0, %ymm15, %ymm0
4734; AVX2-NEXT:    vmulpd %ymm10, %ymm12, %ymm10
4735; AVX2-NEXT:    vaddpd %ymm0, %ymm10, %ymm0
4736; AVX2-NEXT:    vbroadcastsd 416(%rbp), %ymm10
4737; AVX2-NEXT:    vmulpd %ymm4, %ymm10, %ymm11
4738; AVX2-NEXT:    vaddpd %ymm0, %ymm11, %ymm0
4739; AVX2-NEXT:    vmulpd %ymm3, %ymm10, %ymm10
4740; AVX2-NEXT:    vmovapd %ymm3, %ymm2
4741; AVX2-NEXT:    vaddpd %ymm1, %ymm10, %ymm1
4742; AVX2-NEXT:    vbroadcastsd 424(%rbp), %ymm10
4743; AVX2-NEXT:    vmulpd %ymm7, %ymm10, %ymm11
4744; AVX2-NEXT:    vaddpd %ymm1, %ymm11, %ymm1
4745; AVX2-NEXT:    vmulpd %ymm6, %ymm10, %ymm10
4746; AVX2-NEXT:    vaddpd %ymm0, %ymm10, %ymm0
4747; AVX2-NEXT:    vbroadcastsd 432(%rbp), %ymm10
4748; AVX2-NEXT:    vmulpd 16(%rbp), %ymm10, %ymm11
4749; AVX2-NEXT:    vaddpd %ymm0, %ymm11, %ymm0
4750; AVX2-NEXT:    vmulpd 48(%rbp), %ymm10, %ymm10
4751; AVX2-NEXT:    vaddpd %ymm1, %ymm10, %ymm1
4752; AVX2-NEXT:    vbroadcastsd 440(%rbp), %ymm10
4753; AVX2-NEXT:    vmulpd 112(%rbp), %ymm10, %ymm11
4754; AVX2-NEXT:    vaddpd %ymm1, %ymm11, %ymm1
4755; AVX2-NEXT:    vmulpd 80(%rbp), %ymm10, %ymm10
4756; AVX2-NEXT:    vaddpd %ymm0, %ymm10, %ymm0
4757; AVX2-NEXT:    vbroadcastsd 448(%rbp), %ymm10
4758; AVX2-NEXT:    vmulpd %ymm10, %ymm13, %ymm11
4759; AVX2-NEXT:    vaddpd %ymm0, %ymm11, %ymm0
4760; AVX2-NEXT:    vmulpd %ymm10, %ymm14, %ymm10
4761; AVX2-NEXT:    vaddpd %ymm1, %ymm10, %ymm1
4762; AVX2-NEXT:    vbroadcastsd 456(%rbp), %ymm10
4763; AVX2-NEXT:    vmulpd 240(%rbp), %ymm10, %ymm11
4764; AVX2-NEXT:    vaddpd %ymm1, %ymm11, %ymm1
4765; AVX2-NEXT:    vmovapd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4766; AVX2-NEXT:    vmulpd 208(%rbp), %ymm10, %ymm1
4767; AVX2-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
4768; AVX2-NEXT:    vmovapd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4769; AVX2-NEXT:    vbroadcastsd 464(%rbp), %ymm0
4770; AVX2-NEXT:    vmulpd %ymm0, %ymm9, %ymm1
4771; AVX2-NEXT:    vmovapd %ymm9, %ymm13
4772; AVX2-NEXT:    vbroadcastsd 472(%rbp), %ymm10
4773; AVX2-NEXT:    vmulpd %ymm10, %ymm8, %ymm11
4774; AVX2-NEXT:    vaddpd %ymm1, %ymm11, %ymm1
4775; AVX2-NEXT:    vmulpd %ymm0, %ymm15, %ymm0
4776; AVX2-NEXT:    vmovapd %ymm15, %ymm9
4777; AVX2-NEXT:    vmulpd %ymm10, %ymm12, %ymm10
4778; AVX2-NEXT:    vaddpd %ymm0, %ymm10, %ymm0
4779; AVX2-NEXT:    vbroadcastsd 480(%rbp), %ymm10
4780; AVX2-NEXT:    vmulpd %ymm4, %ymm10, %ymm11
4781; AVX2-NEXT:    vmovapd %ymm4, %ymm3
4782; AVX2-NEXT:    vaddpd %ymm0, %ymm11, %ymm0
4783; AVX2-NEXT:    vmovapd %ymm2, %ymm15
4784; AVX2-NEXT:    vmulpd %ymm2, %ymm10, %ymm10
4785; AVX2-NEXT:    vaddpd %ymm1, %ymm10, %ymm1
4786; AVX2-NEXT:    vbroadcastsd 488(%rbp), %ymm10
4787; AVX2-NEXT:    vmovapd %ymm7, %ymm8
4788; AVX2-NEXT:    vmulpd %ymm7, %ymm10, %ymm11
4789; AVX2-NEXT:    vaddpd %ymm1, %ymm11, %ymm1
4790; AVX2-NEXT:    vmovapd %ymm6, %ymm7
4791; AVX2-NEXT:    vmulpd %ymm6, %ymm10, %ymm10
4792; AVX2-NEXT:    vaddpd %ymm0, %ymm10, %ymm0
4793; AVX2-NEXT:    vbroadcastsd 496(%rbp), %ymm10
4794; AVX2-NEXT:    vmulpd 16(%rbp), %ymm10, %ymm11
4795; AVX2-NEXT:    vaddpd %ymm0, %ymm11, %ymm0
4796; AVX2-NEXT:    vmovapd 48(%rbp), %ymm4
4797; AVX2-NEXT:    vmulpd %ymm4, %ymm10, %ymm10
4798; AVX2-NEXT:    vaddpd %ymm1, %ymm10, %ymm1
4799; AVX2-NEXT:    vbroadcastsd 504(%rbp), %ymm10
4800; AVX2-NEXT:    vmovapd 112(%rbp), %ymm2
4801; AVX2-NEXT:    vmulpd %ymm2, %ymm10, %ymm11
4802; AVX2-NEXT:    vaddpd %ymm1, %ymm11, %ymm1
4803; AVX2-NEXT:    vmovapd 80(%rbp), %ymm14
4804; AVX2-NEXT:    vmulpd %ymm10, %ymm14, %ymm10
4805; AVX2-NEXT:    vaddpd %ymm0, %ymm10, %ymm0
4806; AVX2-NEXT:    vbroadcastsd 512(%rbp), %ymm10
4807; AVX2-NEXT:    vmulpd 144(%rbp), %ymm10, %ymm11
4808; AVX2-NEXT:    vaddpd %ymm0, %ymm11, %ymm0
4809; AVX2-NEXT:    vmovapd 176(%rbp), %ymm2
4810; AVX2-NEXT:    vmulpd %ymm2, %ymm10, %ymm10
4811; AVX2-NEXT:    vaddpd %ymm1, %ymm10, %ymm1
4812; AVX2-NEXT:    vbroadcastsd 520(%rbp), %ymm10
4813; AVX2-NEXT:    vmulpd 240(%rbp), %ymm10, %ymm11
4814; AVX2-NEXT:    vaddpd %ymm1, %ymm11, %ymm1
4815; AVX2-NEXT:    vmovapd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4816; AVX2-NEXT:    vmulpd 208(%rbp), %ymm10, %ymm1
4817; AVX2-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
4818; AVX2-NEXT:    vmovapd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4819; AVX2-NEXT:    vbroadcastsd 528(%rbp), %ymm0
4820; AVX2-NEXT:    vmulpd %ymm0, %ymm13, %ymm1
4821; AVX2-NEXT:    vbroadcastsd 536(%rbp), %ymm10
4822; AVX2-NEXT:    vmulpd %ymm5, %ymm10, %ymm11
4823; AVX2-NEXT:    vmovapd %ymm5, %ymm6
4824; AVX2-NEXT:    vaddpd %ymm1, %ymm11, %ymm1
4825; AVX2-NEXT:    vmulpd %ymm0, %ymm9, %ymm0
4826; AVX2-NEXT:    vmulpd %ymm10, %ymm12, %ymm10
4827; AVX2-NEXT:    vmovapd %ymm12, %ymm5
4828; AVX2-NEXT:    vaddpd %ymm0, %ymm10, %ymm0
4829; AVX2-NEXT:    vbroadcastsd 544(%rbp), %ymm10
4830; AVX2-NEXT:    vmulpd %ymm3, %ymm10, %ymm11
4831; AVX2-NEXT:    vmovapd %ymm3, %ymm12
4832; AVX2-NEXT:    vaddpd %ymm0, %ymm11, %ymm0
4833; AVX2-NEXT:    vmulpd %ymm10, %ymm15, %ymm10
4834; AVX2-NEXT:    vaddpd %ymm1, %ymm10, %ymm1
4835; AVX2-NEXT:    vbroadcastsd 552(%rbp), %ymm10
4836; AVX2-NEXT:    vmulpd %ymm10, %ymm8, %ymm11
4837; AVX2-NEXT:    vaddpd %ymm1, %ymm11, %ymm1
4838; AVX2-NEXT:    vmulpd %ymm7, %ymm10, %ymm10
4839; AVX2-NEXT:    vaddpd %ymm0, %ymm10, %ymm0
4840; AVX2-NEXT:    vbroadcastsd 560(%rbp), %ymm10
4841; AVX2-NEXT:    vmulpd 16(%rbp), %ymm10, %ymm11
4842; AVX2-NEXT:    vaddpd %ymm0, %ymm11, %ymm0
4843; AVX2-NEXT:    vmulpd %ymm4, %ymm10, %ymm10
4844; AVX2-NEXT:    vmovapd %ymm4, %ymm3
4845; AVX2-NEXT:    vaddpd %ymm1, %ymm10, %ymm1
4846; AVX2-NEXT:    vbroadcastsd 568(%rbp), %ymm10
4847; AVX2-NEXT:    vmulpd 112(%rbp), %ymm10, %ymm11
4848; AVX2-NEXT:    vaddpd %ymm1, %ymm11, %ymm1
4849; AVX2-NEXT:    vmulpd %ymm10, %ymm14, %ymm10
4850; AVX2-NEXT:    vaddpd %ymm0, %ymm10, %ymm0
4851; AVX2-NEXT:    vbroadcastsd 576(%rbp), %ymm10
4852; AVX2-NEXT:    vmovapd 144(%rbp), %ymm4
4853; AVX2-NEXT:    vmulpd %ymm4, %ymm10, %ymm11
4854; AVX2-NEXT:    vaddpd %ymm0, %ymm11, %ymm0
4855; AVX2-NEXT:    vmulpd %ymm2, %ymm10, %ymm10
4856; AVX2-NEXT:    vaddpd %ymm1, %ymm10, %ymm1
4857; AVX2-NEXT:    vbroadcastsd 584(%rbp), %ymm10
4858; AVX2-NEXT:    vmovapd 240(%rbp), %ymm14
4859; AVX2-NEXT:    vmulpd %ymm10, %ymm14, %ymm11
4860; AVX2-NEXT:    vaddpd %ymm1, %ymm11, %ymm1
4861; AVX2-NEXT:    vmovapd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4862; AVX2-NEXT:    vmovapd 208(%rbp), %ymm2
4863; AVX2-NEXT:    vmulpd %ymm2, %ymm10, %ymm1
4864; AVX2-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
4865; AVX2-NEXT:    vmovapd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4866; AVX2-NEXT:    vbroadcastsd 592(%rbp), %ymm0
4867; AVX2-NEXT:    vmulpd %ymm0, %ymm13, %ymm1
4868; AVX2-NEXT:    vbroadcastsd 600(%rbp), %ymm10
4869; AVX2-NEXT:    vmulpd %ymm6, %ymm10, %ymm11
4870; AVX2-NEXT:    vaddpd %ymm1, %ymm11, %ymm1
4871; AVX2-NEXT:    vmulpd %ymm0, %ymm9, %ymm0
4872; AVX2-NEXT:    vmulpd %ymm5, %ymm10, %ymm10
4873; AVX2-NEXT:    vaddpd %ymm0, %ymm10, %ymm0
4874; AVX2-NEXT:    vbroadcastsd 608(%rbp), %ymm10
4875; AVX2-NEXT:    vmulpd %ymm10, %ymm12, %ymm11
4876; AVX2-NEXT:    vaddpd %ymm0, %ymm11, %ymm0
4877; AVX2-NEXT:    vmulpd %ymm10, %ymm15, %ymm10
4878; AVX2-NEXT:    vaddpd %ymm1, %ymm10, %ymm1
4879; AVX2-NEXT:    vbroadcastsd 616(%rbp), %ymm10
4880; AVX2-NEXT:    vmulpd %ymm10, %ymm8, %ymm11
4881; AVX2-NEXT:    vaddpd %ymm1, %ymm11, %ymm1
4882; AVX2-NEXT:    vmulpd %ymm7, %ymm10, %ymm10
4883; AVX2-NEXT:    vaddpd %ymm0, %ymm10, %ymm0
4884; AVX2-NEXT:    vbroadcastsd 624(%rbp), %ymm10
4885; AVX2-NEXT:    vmulpd 16(%rbp), %ymm10, %ymm11
4886; AVX2-NEXT:    vaddpd %ymm0, %ymm11, %ymm0
4887; AVX2-NEXT:    vmulpd %ymm3, %ymm10, %ymm10
4888; AVX2-NEXT:    vaddpd %ymm1, %ymm10, %ymm1
4889; AVX2-NEXT:    vbroadcastsd 632(%rbp), %ymm10
4890; AVX2-NEXT:    vmovapd 112(%rbp), %ymm3
4891; AVX2-NEXT:    vmulpd %ymm3, %ymm10, %ymm11
4892; AVX2-NEXT:    vaddpd %ymm1, %ymm11, %ymm1
4893; AVX2-NEXT:    vmovapd 80(%rbp), %ymm3
4894; AVX2-NEXT:    vmulpd %ymm3, %ymm10, %ymm10
4895; AVX2-NEXT:    vaddpd %ymm0, %ymm10, %ymm0
4896; AVX2-NEXT:    vbroadcastsd 640(%rbp), %ymm10
4897; AVX2-NEXT:    vmulpd %ymm4, %ymm10, %ymm11
4898; AVX2-NEXT:    vaddpd %ymm0, %ymm11, %ymm0
4899; AVX2-NEXT:    vmovapd 176(%rbp), %ymm3
4900; AVX2-NEXT:    vmulpd %ymm3, %ymm10, %ymm10
4901; AVX2-NEXT:    vaddpd %ymm1, %ymm10, %ymm1
4902; AVX2-NEXT:    vbroadcastsd 648(%rbp), %ymm10
4903; AVX2-NEXT:    vmovapd %ymm14, %ymm4
4904; AVX2-NEXT:    vmulpd %ymm10, %ymm14, %ymm11
4905; AVX2-NEXT:    vaddpd %ymm1, %ymm11, %ymm1
4906; AVX2-NEXT:    vmovapd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4907; AVX2-NEXT:    vmulpd %ymm2, %ymm10, %ymm1
4908; AVX2-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
4909; AVX2-NEXT:    vmovapd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4910; AVX2-NEXT:    vbroadcastsd 656(%rbp), %ymm2
4911; AVX2-NEXT:    vmovapd %ymm13, %ymm3
4912; AVX2-NEXT:    vmulpd %ymm2, %ymm13, %ymm1
4913; AVX2-NEXT:    vbroadcastsd 664(%rbp), %ymm0
4914; AVX2-NEXT:    vmulpd %ymm0, %ymm6, %ymm14
4915; AVX2-NEXT:    vmovapd %ymm6, %ymm10
4916; AVX2-NEXT:    vaddpd %ymm1, %ymm14, %ymm1
4917; AVX2-NEXT:    vmulpd %ymm2, %ymm9, %ymm2
4918; AVX2-NEXT:    vmulpd %ymm0, %ymm5, %ymm0
4919; AVX2-NEXT:    vmovapd %ymm5, %ymm6
4920; AVX2-NEXT:    vaddpd %ymm0, %ymm2, %ymm0
4921; AVX2-NEXT:    vbroadcastsd 672(%rbp), %ymm2
4922; AVX2-NEXT:    vmulpd %ymm2, %ymm12, %ymm14
4923; AVX2-NEXT:    vaddpd %ymm0, %ymm14, %ymm0
4924; AVX2-NEXT:    vmulpd %ymm2, %ymm15, %ymm2
4925; AVX2-NEXT:    vaddpd %ymm2, %ymm1, %ymm1
4926; AVX2-NEXT:    vbroadcastsd 680(%rbp), %ymm2
4927; AVX2-NEXT:    vmulpd %ymm2, %ymm8, %ymm14
4928; AVX2-NEXT:    vaddpd %ymm1, %ymm14, %ymm1
4929; AVX2-NEXT:    vmulpd %ymm2, %ymm7, %ymm2
4930; AVX2-NEXT:    vaddpd %ymm2, %ymm0, %ymm0
4931; AVX2-NEXT:    vbroadcastsd 688(%rbp), %ymm2
4932; AVX2-NEXT:    vmovapd 16(%rbp), %ymm11
4933; AVX2-NEXT:    vmulpd %ymm2, %ymm11, %ymm14
4934; AVX2-NEXT:    vaddpd %ymm0, %ymm14, %ymm0
4935; AVX2-NEXT:    vmulpd 48(%rbp), %ymm2, %ymm2
4936; AVX2-NEXT:    vaddpd %ymm2, %ymm1, %ymm1
4937; AVX2-NEXT:    vbroadcastsd 696(%rbp), %ymm2
4938; AVX2-NEXT:    vmovapd 112(%rbp), %ymm5
4939; AVX2-NEXT:    vmulpd %ymm2, %ymm5, %ymm14
4940; AVX2-NEXT:    vaddpd %ymm1, %ymm14, %ymm1
4941; AVX2-NEXT:    vmovapd 80(%rbp), %ymm5
4942; AVX2-NEXT:    vmulpd %ymm2, %ymm5, %ymm2
4943; AVX2-NEXT:    vaddpd %ymm2, %ymm0, %ymm0
4944; AVX2-NEXT:    vbroadcastsd 704(%rbp), %ymm2
4945; AVX2-NEXT:    vmulpd 144(%rbp), %ymm2, %ymm14
4946; AVX2-NEXT:    vaddpd %ymm0, %ymm14, %ymm0
4947; AVX2-NEXT:    vmovapd 176(%rbp), %ymm13
4948; AVX2-NEXT:    vmulpd %ymm2, %ymm13, %ymm2
4949; AVX2-NEXT:    vaddpd %ymm2, %ymm1, %ymm1
4950; AVX2-NEXT:    vbroadcastsd 712(%rbp), %ymm2
4951; AVX2-NEXT:    vmulpd %ymm2, %ymm4, %ymm14
4952; AVX2-NEXT:    vaddpd %ymm1, %ymm14, %ymm1
4953; AVX2-NEXT:    vmovapd 208(%rbp), %ymm14
4954; AVX2-NEXT:    vmulpd %ymm2, %ymm14, %ymm2
4955; AVX2-NEXT:    vaddpd %ymm2, %ymm0, %ymm0
4956; AVX2-NEXT:    vbroadcastsd 720(%rbp), %ymm2
4957; AVX2-NEXT:    vmulpd %ymm2, %ymm3, %ymm3
4958; AVX2-NEXT:    vmulpd %ymm2, %ymm9, %ymm2
4959; AVX2-NEXT:    vbroadcastsd 728(%rbp), %ymm4
4960; AVX2-NEXT:    vmulpd %ymm4, %ymm10, %ymm5
4961; AVX2-NEXT:    vaddpd %ymm5, %ymm3, %ymm3
4962; AVX2-NEXT:    vmulpd %ymm4, %ymm6, %ymm4
4963; AVX2-NEXT:    vaddpd %ymm4, %ymm2, %ymm2
4964; AVX2-NEXT:    vbroadcastsd 736(%rbp), %ymm4
4965; AVX2-NEXT:    vmulpd %ymm4, %ymm12, %ymm5
4966; AVX2-NEXT:    vaddpd %ymm5, %ymm2, %ymm2
4967; AVX2-NEXT:    vmulpd %ymm4, %ymm15, %ymm4
4968; AVX2-NEXT:    vaddpd %ymm4, %ymm3, %ymm3
4969; AVX2-NEXT:    vbroadcastsd 744(%rbp), %ymm4
4970; AVX2-NEXT:    vmulpd %ymm4, %ymm8, %ymm5
4971; AVX2-NEXT:    vaddpd %ymm5, %ymm3, %ymm3
4972; AVX2-NEXT:    vmulpd %ymm4, %ymm7, %ymm4
4973; AVX2-NEXT:    vaddpd %ymm4, %ymm2, %ymm2
4974; AVX2-NEXT:    vbroadcastsd 752(%rbp), %ymm4
4975; AVX2-NEXT:    vmulpd %ymm4, %ymm11, %ymm5
4976; AVX2-NEXT:    vaddpd %ymm5, %ymm2, %ymm2
4977; AVX2-NEXT:    vmulpd 48(%rbp), %ymm4, %ymm4
4978; AVX2-NEXT:    vaddpd %ymm4, %ymm3, %ymm3
4979; AVX2-NEXT:    vbroadcastsd 760(%rbp), %ymm4
4980; AVX2-NEXT:    vmulpd 112(%rbp), %ymm4, %ymm5
4981; AVX2-NEXT:    vaddpd %ymm5, %ymm3, %ymm3
4982; AVX2-NEXT:    vmulpd 80(%rbp), %ymm4, %ymm4
4983; AVX2-NEXT:    vaddpd %ymm4, %ymm2, %ymm2
4984; AVX2-NEXT:    vbroadcastsd 768(%rbp), %ymm4
4985; AVX2-NEXT:    vmulpd 144(%rbp), %ymm4, %ymm5
4986; AVX2-NEXT:    vaddpd %ymm5, %ymm2, %ymm2
4987; AVX2-NEXT:    vmulpd %ymm4, %ymm13, %ymm4
4988; AVX2-NEXT:    vaddpd %ymm4, %ymm3, %ymm3
4989; AVX2-NEXT:    vbroadcastsd 776(%rbp), %ymm4
4990; AVX2-NEXT:    vmulpd 240(%rbp), %ymm4, %ymm5
4991; AVX2-NEXT:    vaddpd %ymm5, %ymm3, %ymm3
4992; AVX2-NEXT:    vmulpd %ymm4, %ymm14, %ymm4
4993; AVX2-NEXT:    vaddpd %ymm4, %ymm2, %ymm2
4994; AVX2-NEXT:    vmovapd %ymm3, 480(%rdi)
4995; AVX2-NEXT:    vmovapd %ymm2, 448(%rdi)
4996; AVX2-NEXT:    vmovapd %ymm1, 416(%rdi)
4997; AVX2-NEXT:    vmovapd %ymm0, 384(%rdi)
4998; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
4999; AVX2-NEXT:    vmovaps %ymm0, 352(%rdi)
5000; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
5001; AVX2-NEXT:    vmovaps %ymm0, 320(%rdi)
5002; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
5003; AVX2-NEXT:    vmovaps %ymm0, 288(%rdi)
5004; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
5005; AVX2-NEXT:    vmovaps %ymm0, 256(%rdi)
5006; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
5007; AVX2-NEXT:    vmovaps %ymm0, 224(%rdi)
5008; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
5009; AVX2-NEXT:    vmovaps %ymm0, 192(%rdi)
5010; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
5011; AVX2-NEXT:    vmovaps %ymm0, 160(%rdi)
5012; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
5013; AVX2-NEXT:    vmovaps %ymm0, 128(%rdi)
5014; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
5015; AVX2-NEXT:    vmovaps %ymm0, 96(%rdi)
5016; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
5017; AVX2-NEXT:    vmovaps %ymm0, 64(%rdi)
5018; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
5019; AVX2-NEXT:    vmovaps %ymm0, 32(%rdi)
5020; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
5021; AVX2-NEXT:    vmovaps %ymm0, (%rdi)
5022; AVX2-NEXT:    movq %rbp, %rsp
5023; AVX2-NEXT:    popq %rbp
5024; AVX2-NEXT:    vzeroupper
5025; AVX2-NEXT:    retq
5026;
5027; AVX512-LABEL: test_mul8x8_f64:
5028; AVX512:       # %bb.0: # %entry
5029; AVX512-NEXT:    pushq %rbp
5030; AVX512-NEXT:    movq %rsp, %rbp
5031; AVX512-NEXT:    andq $-64, %rsp
5032; AVX512-NEXT:    subq $64, %rsp
5033; AVX512-NEXT:    movq %rdi, %rax
5034; AVX512-NEXT:    vmulpd 16(%rbp){1to8}, %zmm0, %zmm8
5035; AVX512-NEXT:    vmulpd 24(%rbp){1to8}, %zmm1, %zmm9
5036; AVX512-NEXT:    vaddpd %zmm9, %zmm8, %zmm8
5037; AVX512-NEXT:    vmulpd 32(%rbp){1to8}, %zmm2, %zmm9
5038; AVX512-NEXT:    vaddpd %zmm9, %zmm8, %zmm8
5039; AVX512-NEXT:    vmulpd 40(%rbp){1to8}, %zmm3, %zmm9
5040; AVX512-NEXT:    vaddpd %zmm9, %zmm8, %zmm8
5041; AVX512-NEXT:    vmulpd 48(%rbp){1to8}, %zmm4, %zmm9
5042; AVX512-NEXT:    vaddpd %zmm9, %zmm8, %zmm8
5043; AVX512-NEXT:    vmulpd 56(%rbp){1to8}, %zmm5, %zmm9
5044; AVX512-NEXT:    vaddpd %zmm9, %zmm8, %zmm8
5045; AVX512-NEXT:    vmulpd 64(%rbp){1to8}, %zmm6, %zmm9
5046; AVX512-NEXT:    vaddpd %zmm9, %zmm8, %zmm8
5047; AVX512-NEXT:    vmulpd 72(%rbp){1to8}, %zmm7, %zmm9
5048; AVX512-NEXT:    vaddpd %zmm9, %zmm8, %zmm8
5049; AVX512-NEXT:    vmulpd 80(%rbp){1to8}, %zmm0, %zmm9
5050; AVX512-NEXT:    vmulpd 88(%rbp){1to8}, %zmm1, %zmm10
5051; AVX512-NEXT:    vaddpd %zmm10, %zmm9, %zmm9
5052; AVX512-NEXT:    vmulpd 96(%rbp){1to8}, %zmm2, %zmm10
5053; AVX512-NEXT:    vaddpd %zmm10, %zmm9, %zmm9
5054; AVX512-NEXT:    vmulpd 104(%rbp){1to8}, %zmm3, %zmm10
5055; AVX512-NEXT:    vaddpd %zmm10, %zmm9, %zmm9
5056; AVX512-NEXT:    vmulpd 112(%rbp){1to8}, %zmm4, %zmm10
5057; AVX512-NEXT:    vaddpd %zmm10, %zmm9, %zmm9
5058; AVX512-NEXT:    vmulpd 120(%rbp){1to8}, %zmm5, %zmm10
5059; AVX512-NEXT:    vaddpd %zmm10, %zmm9, %zmm9
5060; AVX512-NEXT:    vmulpd 128(%rbp){1to8}, %zmm6, %zmm10
5061; AVX512-NEXT:    vaddpd %zmm10, %zmm9, %zmm9
5062; AVX512-NEXT:    vmulpd 136(%rbp){1to8}, %zmm7, %zmm10
5063; AVX512-NEXT:    vaddpd %zmm10, %zmm9, %zmm9
5064; AVX512-NEXT:    vmulpd 144(%rbp){1to8}, %zmm0, %zmm10
5065; AVX512-NEXT:    vmulpd 152(%rbp){1to8}, %zmm1, %zmm11
5066; AVX512-NEXT:    vaddpd %zmm11, %zmm10, %zmm10
5067; AVX512-NEXT:    vmulpd 160(%rbp){1to8}, %zmm2, %zmm11
5068; AVX512-NEXT:    vaddpd %zmm11, %zmm10, %zmm10
5069; AVX512-NEXT:    vmulpd 168(%rbp){1to8}, %zmm3, %zmm11
5070; AVX512-NEXT:    vaddpd %zmm11, %zmm10, %zmm10
5071; AVX512-NEXT:    vmulpd 176(%rbp){1to8}, %zmm4, %zmm11
5072; AVX512-NEXT:    vaddpd %zmm11, %zmm10, %zmm10
5073; AVX512-NEXT:    vmulpd 184(%rbp){1to8}, %zmm5, %zmm11
5074; AVX512-NEXT:    vaddpd %zmm11, %zmm10, %zmm10
5075; AVX512-NEXT:    vmulpd 192(%rbp){1to8}, %zmm6, %zmm11
5076; AVX512-NEXT:    vaddpd %zmm11, %zmm10, %zmm10
5077; AVX512-NEXT:    vmulpd 200(%rbp){1to8}, %zmm7, %zmm11
5078; AVX512-NEXT:    vaddpd %zmm11, %zmm10, %zmm10
5079; AVX512-NEXT:    vmulpd 208(%rbp){1to8}, %zmm0, %zmm11
5080; AVX512-NEXT:    vmulpd 216(%rbp){1to8}, %zmm1, %zmm12
5081; AVX512-NEXT:    vaddpd %zmm12, %zmm11, %zmm11
5082; AVX512-NEXT:    vmulpd 224(%rbp){1to8}, %zmm2, %zmm12
5083; AVX512-NEXT:    vaddpd %zmm12, %zmm11, %zmm11
5084; AVX512-NEXT:    vmulpd 232(%rbp){1to8}, %zmm3, %zmm12
5085; AVX512-NEXT:    vaddpd %zmm12, %zmm11, %zmm11
5086; AVX512-NEXT:    vmulpd 240(%rbp){1to8}, %zmm4, %zmm12
5087; AVX512-NEXT:    vaddpd %zmm12, %zmm11, %zmm11
5088; AVX512-NEXT:    vmulpd 248(%rbp){1to8}, %zmm5, %zmm12
5089; AVX512-NEXT:    vaddpd %zmm12, %zmm11, %zmm11
5090; AVX512-NEXT:    vmulpd 256(%rbp){1to8}, %zmm6, %zmm12
5091; AVX512-NEXT:    vaddpd %zmm12, %zmm11, %zmm11
5092; AVX512-NEXT:    vmulpd 264(%rbp){1to8}, %zmm7, %zmm12
5093; AVX512-NEXT:    vaddpd %zmm12, %zmm11, %zmm11
5094; AVX512-NEXT:    vmulpd 272(%rbp){1to8}, %zmm0, %zmm12
5095; AVX512-NEXT:    vmulpd 280(%rbp){1to8}, %zmm1, %zmm13
5096; AVX512-NEXT:    vaddpd %zmm13, %zmm12, %zmm12
5097; AVX512-NEXT:    vmulpd 288(%rbp){1to8}, %zmm2, %zmm13
5098; AVX512-NEXT:    vaddpd %zmm13, %zmm12, %zmm12
5099; AVX512-NEXT:    vmulpd 296(%rbp){1to8}, %zmm3, %zmm13
5100; AVX512-NEXT:    vaddpd %zmm13, %zmm12, %zmm12
5101; AVX512-NEXT:    vmulpd 304(%rbp){1to8}, %zmm4, %zmm13
5102; AVX512-NEXT:    vaddpd %zmm13, %zmm12, %zmm12
5103; AVX512-NEXT:    vmulpd 312(%rbp){1to8}, %zmm5, %zmm13
5104; AVX512-NEXT:    vaddpd %zmm13, %zmm12, %zmm12
5105; AVX512-NEXT:    vmulpd 320(%rbp){1to8}, %zmm6, %zmm13
5106; AVX512-NEXT:    vaddpd %zmm13, %zmm12, %zmm12
5107; AVX512-NEXT:    vmulpd 328(%rbp){1to8}, %zmm7, %zmm13
5108; AVX512-NEXT:    vaddpd %zmm13, %zmm12, %zmm12
5109; AVX512-NEXT:    vmulpd 336(%rbp){1to8}, %zmm0, %zmm13
5110; AVX512-NEXT:    vmulpd 344(%rbp){1to8}, %zmm1, %zmm14
5111; AVX512-NEXT:    vaddpd %zmm14, %zmm13, %zmm13
5112; AVX512-NEXT:    vmulpd 352(%rbp){1to8}, %zmm2, %zmm14
5113; AVX512-NEXT:    vaddpd %zmm14, %zmm13, %zmm13
5114; AVX512-NEXT:    vmulpd 360(%rbp){1to8}, %zmm3, %zmm14
5115; AVX512-NEXT:    vaddpd %zmm14, %zmm13, %zmm13
5116; AVX512-NEXT:    vmulpd 368(%rbp){1to8}, %zmm4, %zmm14
5117; AVX512-NEXT:    vaddpd %zmm14, %zmm13, %zmm13
5118; AVX512-NEXT:    vmulpd 376(%rbp){1to8}, %zmm5, %zmm14
5119; AVX512-NEXT:    vaddpd %zmm14, %zmm13, %zmm13
5120; AVX512-NEXT:    vmulpd 384(%rbp){1to8}, %zmm6, %zmm14
5121; AVX512-NEXT:    vaddpd %zmm14, %zmm13, %zmm13
5122; AVX512-NEXT:    vmulpd 392(%rbp){1to8}, %zmm7, %zmm14
5123; AVX512-NEXT:    vaddpd %zmm14, %zmm13, %zmm13
5124; AVX512-NEXT:    vmulpd 400(%rbp){1to8}, %zmm0, %zmm14
5125; AVX512-NEXT:    vmulpd 408(%rbp){1to8}, %zmm1, %zmm15
5126; AVX512-NEXT:    vaddpd %zmm15, %zmm14, %zmm14
5127; AVX512-NEXT:    vmulpd 416(%rbp){1to8}, %zmm2, %zmm15
5128; AVX512-NEXT:    vaddpd %zmm15, %zmm14, %zmm14
5129; AVX512-NEXT:    vmulpd 424(%rbp){1to8}, %zmm3, %zmm15
5130; AVX512-NEXT:    vaddpd %zmm15, %zmm14, %zmm14
5131; AVX512-NEXT:    vmulpd 432(%rbp){1to8}, %zmm4, %zmm15
5132; AVX512-NEXT:    vaddpd %zmm15, %zmm14, %zmm14
5133; AVX512-NEXT:    vmulpd 440(%rbp){1to8}, %zmm5, %zmm15
5134; AVX512-NEXT:    vaddpd %zmm15, %zmm14, %zmm14
5135; AVX512-NEXT:    vmulpd 448(%rbp){1to8}, %zmm6, %zmm15
5136; AVX512-NEXT:    vaddpd %zmm15, %zmm14, %zmm14
5137; AVX512-NEXT:    vmulpd 456(%rbp){1to8}, %zmm7, %zmm15
5138; AVX512-NEXT:    vaddpd %zmm15, %zmm14, %zmm14
5139; AVX512-NEXT:    vmulpd 464(%rbp){1to8}, %zmm0, %zmm0
5140; AVX512-NEXT:    vmulpd 472(%rbp){1to8}, %zmm1, %zmm1
5141; AVX512-NEXT:    vaddpd %zmm1, %zmm0, %zmm0
5142; AVX512-NEXT:    vmulpd 480(%rbp){1to8}, %zmm2, %zmm1
5143; AVX512-NEXT:    vaddpd %zmm1, %zmm0, %zmm0
5144; AVX512-NEXT:    vmulpd 488(%rbp){1to8}, %zmm3, %zmm1
5145; AVX512-NEXT:    vaddpd %zmm1, %zmm0, %zmm0
5146; AVX512-NEXT:    vmulpd 496(%rbp){1to8}, %zmm4, %zmm1
5147; AVX512-NEXT:    vaddpd %zmm1, %zmm0, %zmm0
5148; AVX512-NEXT:    vmulpd 504(%rbp){1to8}, %zmm5, %zmm1
5149; AVX512-NEXT:    vaddpd %zmm1, %zmm0, %zmm0
5150; AVX512-NEXT:    vmulpd 512(%rbp){1to8}, %zmm6, %zmm1
5151; AVX512-NEXT:    vaddpd %zmm1, %zmm0, %zmm0
5152; AVX512-NEXT:    vmulpd 520(%rbp){1to8}, %zmm7, %zmm1
5153; AVX512-NEXT:    vaddpd %zmm1, %zmm0, %zmm0
5154; AVX512-NEXT:    vmovapd %zmm0, 448(%rdi)
5155; AVX512-NEXT:    vmovapd %zmm14, 384(%rdi)
5156; AVX512-NEXT:    vmovapd %zmm13, 320(%rdi)
5157; AVX512-NEXT:    vmovapd %zmm12, 256(%rdi)
5158; AVX512-NEXT:    vmovapd %zmm11, 192(%rdi)
5159; AVX512-NEXT:    vmovapd %zmm10, 128(%rdi)
5160; AVX512-NEXT:    vmovapd %zmm9, 64(%rdi)
5161; AVX512-NEXT:    vmovapd %zmm8, (%rdi)
5162; AVX512-NEXT:    movq %rbp, %rsp
5163; AVX512-NEXT:    popq %rbp
5164; AVX512-NEXT:    vzeroupper
5165; AVX512-NEXT:    retq
5166entry:
5167  %split = shufflevector <64 x double> %a0, <64 x double> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
5168  %split1 = shufflevector <64 x double> %a0, <64 x double> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
5169  %split2 = shufflevector <64 x double> %a0, <64 x double> poison, <8 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
5170  %split3 = shufflevector <64 x double> %a0, <64 x double> poison, <8 x i32> <i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
5171  %split4 = shufflevector <64 x double> %a0, <64 x double> poison, <8 x i32> <i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39>
5172  %split5 = shufflevector <64 x double> %a0, <64 x double> poison, <8 x i32> <i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47>
5173  %split6 = shufflevector <64 x double> %a0, <64 x double> poison, <8 x i32> <i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55>
5174  %split7 = shufflevector <64 x double> %a0, <64 x double> poison, <8 x i32> <i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
5175  %splat.splat = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> zeroinitializer
5176  %0 = fmul <8 x double> %split, %splat.splat
5177  %splat.splat18 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
5178  %1 = fmul <8 x double> %split1, %splat.splat18
5179  %2 = fadd <8 x double> %0, %1
5180  %splat.splat21 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
5181  %3 = fmul <8 x double> %split2, %splat.splat21
5182  %4 = fadd <8 x double> %2, %3
5183  %splat.splat24 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
5184  %5 = fmul <8 x double> %split3, %splat.splat24
5185  %6 = fadd <8 x double> %4, %5
5186  %splat.splat27 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>
5187  %7 = fmul <8 x double> %split4, %splat.splat27
5188  %8 = fadd <8 x double> %6, %7
5189  %splat.splat30 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
5190  %9 = fmul <8 x double> %split5, %splat.splat30
5191  %10 = fadd <8 x double> %8, %9
5192  %splat.splat33 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> <i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6>
5193  %11 = fmul <8 x double> %split6, %splat.splat33
5194  %12 = fadd <8 x double> %10, %11
5195  %splat.splat36 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
5196  %13 = fmul <8 x double> %split7, %splat.splat36
5197  %14 = fadd <8 x double> %12, %13
5198  %splat.splat39 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> <i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
5199  %15 = fmul <8 x double> %split, %splat.splat39
5200  %splat.splat42 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> <i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9>
5201  %16 = fmul <8 x double> %split1, %splat.splat42
5202  %17 = fadd <8 x double> %15, %16
5203  %splat.splat45 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> <i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10>
5204  %18 = fmul <8 x double> %split2, %splat.splat45
5205  %19 = fadd <8 x double> %17, %18
5206  %splat.splat48 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> <i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11>
5207  %20 = fmul <8 x double> %split3, %splat.splat48
5208  %21 = fadd <8 x double> %19, %20
5209  %splat.splat51 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> <i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12>
5210  %22 = fmul <8 x double> %split4, %splat.splat51
5211  %23 = fadd <8 x double> %21, %22
5212  %splat.splat54 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> <i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13>
5213  %24 = fmul <8 x double> %split5, %splat.splat54
5214  %25 = fadd <8 x double> %23, %24
5215  %splat.splat57 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> <i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14>
5216  %26 = fmul <8 x double> %split6, %splat.splat57
5217  %27 = fadd <8 x double> %25, %26
5218  %splat.splat60 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> <i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
5219  %28 = fmul <8 x double> %split7, %splat.splat60
5220  %29 = fadd <8 x double> %27, %28
5221  %splat.splat63 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
5222  %30 = fmul <8 x double> %split, %splat.splat63
5223  %splat.splat66 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> <i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17>
5224  %31 = fmul <8 x double> %split1, %splat.splat66
5225  %32 = fadd <8 x double> %30, %31
5226  %splat.splat69 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> <i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18>
5227  %33 = fmul <8 x double> %split2, %splat.splat69
5228  %34 = fadd <8 x double> %32, %33
5229  %splat.splat72 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> <i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19>
5230  %35 = fmul <8 x double> %split3, %splat.splat72
5231  %36 = fadd <8 x double> %34, %35
5232  %splat.splat75 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> <i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20>
5233  %37 = fmul <8 x double> %split4, %splat.splat75
5234  %38 = fadd <8 x double> %36, %37
5235  %splat.splat78 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> <i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21>
5236  %39 = fmul <8 x double> %split5, %splat.splat78
5237  %40 = fadd <8 x double> %38, %39
5238  %splat.splat81 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> <i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22>
5239  %41 = fmul <8 x double> %split6, %splat.splat81
5240  %42 = fadd <8 x double> %40, %41
5241  %splat.splat84 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> <i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23>
5242  %43 = fmul <8 x double> %split7, %splat.splat84
5243  %44 = fadd <8 x double> %42, %43
5244  %splat.splat87 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> <i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24>
5245  %45 = fmul <8 x double> %split, %splat.splat87
5246  %splat.splat90 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> <i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25>
5247  %46 = fmul <8 x double> %split1, %splat.splat90
5248  %47 = fadd <8 x double> %45, %46
5249  %splat.splat93 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> <i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26>
5250  %48 = fmul <8 x double> %split2, %splat.splat93
5251  %49 = fadd <8 x double> %47, %48
5252  %splat.splat96 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> <i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27>
5253  %50 = fmul <8 x double> %split3, %splat.splat96
5254  %51 = fadd <8 x double> %49, %50
5255  %splat.splat99 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> <i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28>
5256  %52 = fmul <8 x double> %split4, %splat.splat99
5257  %53 = fadd <8 x double> %51, %52
5258  %splat.splat102 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> <i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29>
5259  %54 = fmul <8 x double> %split5, %splat.splat102
5260  %55 = fadd <8 x double> %53, %54
5261  %splat.splat105 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> <i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30>
5262  %56 = fmul <8 x double> %split6, %splat.splat105
5263  %57 = fadd <8 x double> %55, %56
5264  %splat.splat108 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> <i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
5265  %58 = fmul <8 x double> %split7, %splat.splat108
5266  %59 = fadd <8 x double> %57, %58
5267  %splat.splat111 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> <i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32>
5268  %60 = fmul <8 x double> %split, %splat.splat111
5269  %splat.splat114 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> <i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33>
5270  %61 = fmul <8 x double> %split1, %splat.splat114
5271  %62 = fadd <8 x double> %60, %61
5272  %splat.splat117 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> <i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34>
5273  %63 = fmul <8 x double> %split2, %splat.splat117
5274  %64 = fadd <8 x double> %62, %63
5275  %splat.splat120 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> <i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35>
5276  %65 = fmul <8 x double> %split3, %splat.splat120
5277  %66 = fadd <8 x double> %64, %65
5278  %splat.splat123 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> <i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36>
5279  %67 = fmul <8 x double> %split4, %splat.splat123
5280  %68 = fadd <8 x double> %66, %67
5281  %splat.splat126 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> <i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37>
5282  %69 = fmul <8 x double> %split5, %splat.splat126
5283  %70 = fadd <8 x double> %68, %69
5284  %splat.splat129 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> <i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38>
5285  %71 = fmul <8 x double> %split6, %splat.splat129
5286  %72 = fadd <8 x double> %70, %71
5287  %splat.splat132 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> <i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39>
5288  %73 = fmul <8 x double> %split7, %splat.splat132
5289  %74 = fadd <8 x double> %72, %73
5290  %splat.splat135 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> <i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40>
5291  %75 = fmul <8 x double> %split, %splat.splat135
5292  %splat.splat138 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> <i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41>
5293  %76 = fmul <8 x double> %split1, %splat.splat138
5294  %77 = fadd <8 x double> %75, %76
5295  %splat.splat141 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> <i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42>
5296  %78 = fmul <8 x double> %split2, %splat.splat141
5297  %79 = fadd <8 x double> %77, %78
5298  %splat.splat144 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> <i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43>
5299  %80 = fmul <8 x double> %split3, %splat.splat144
5300  %81 = fadd <8 x double> %79, %80
5301  %splat.splat147 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> <i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44>
5302  %82 = fmul <8 x double> %split4, %splat.splat147
5303  %83 = fadd <8 x double> %81, %82
5304  %splat.splat150 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> <i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45>
5305  %84 = fmul <8 x double> %split5, %splat.splat150
5306  %85 = fadd <8 x double> %83, %84
5307  %splat.splat153 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> <i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46>
5308  %86 = fmul <8 x double> %split6, %splat.splat153
5309  %87 = fadd <8 x double> %85, %86
5310  %splat.splat156 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> <i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47>
5311  %88 = fmul <8 x double> %split7, %splat.splat156
5312  %89 = fadd <8 x double> %87, %88
5313  %splat.splat159 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> <i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48>
5314  %90 = fmul <8 x double> %split, %splat.splat159
5315  %splat.splat162 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> <i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49>
5316  %91 = fmul <8 x double> %split1, %splat.splat162
5317  %92 = fadd <8 x double> %90, %91
5318  %splat.splat165 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> <i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50>
5319  %93 = fmul <8 x double> %split2, %splat.splat165
5320  %94 = fadd <8 x double> %92, %93
5321  %splat.splat168 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> <i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51>
5322  %95 = fmul <8 x double> %split3, %splat.splat168
5323  %96 = fadd <8 x double> %94, %95
5324  %splat.splat171 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> <i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52>
5325  %97 = fmul <8 x double> %split4, %splat.splat171
5326  %98 = fadd <8 x double> %96, %97
5327  %splat.splat174 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> <i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53>
5328  %99 = fmul <8 x double> %split5, %splat.splat174
5329  %100 = fadd <8 x double> %98, %99
5330  %splat.splat177 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> <i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54>
5331  %101 = fmul <8 x double> %split6, %splat.splat177
5332  %102 = fadd <8 x double> %100, %101
5333  %splat.splat180 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> <i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55>
5334  %103 = fmul <8 x double> %split7, %splat.splat180
5335  %104 = fadd <8 x double> %102, %103
5336  %splat.splat183 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> <i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56>
5337  %105 = fmul <8 x double> %split, %splat.splat183
5338  %splat.splat186 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> <i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57>
5339  %106 = fmul <8 x double> %split1, %splat.splat186
5340  %107 = fadd <8 x double> %105, %106
5341  %splat.splat189 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> <i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58>
5342  %108 = fmul <8 x double> %split2, %splat.splat189
5343  %109 = fadd <8 x double> %107, %108
5344  %splat.splat192 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> <i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59>
5345  %110 = fmul <8 x double> %split3, %splat.splat192
5346  %111 = fadd <8 x double> %109, %110
5347  %splat.splat195 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> <i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60>
5348  %112 = fmul <8 x double> %split4, %splat.splat195
5349  %113 = fadd <8 x double> %111, %112
5350  %splat.splat198 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> <i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61>
5351  %114 = fmul <8 x double> %split5, %splat.splat198
5352  %115 = fadd <8 x double> %113, %114
5353  %splat.splat201 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> <i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62>
5354  %116 = fmul <8 x double> %split6, %splat.splat201
5355  %117 = fadd <8 x double> %115, %116
5356  %splat.splat204 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> <i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
5357  %118 = fmul <8 x double> %split7, %splat.splat204
5358  %119 = fadd <8 x double> %117, %118
5359  %120 = shufflevector <8 x double> %14, <8 x double> %29, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
5360  %121 = shufflevector <8 x double> %44, <8 x double> %59, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
5361  %122 = shufflevector <8 x double> %74, <8 x double> %89, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
5362  %123 = shufflevector <8 x double> %104, <8 x double> %119, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
5363  %124 = shufflevector <16 x double> %120, <16 x double> %121, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
5364  %125 = shufflevector <16 x double> %122, <16 x double> %123, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
5365  %126 = shufflevector <32 x double> %124, <32 x double> %125, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
5366  ret <64 x double> %126
5367}
5368