; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefixes=SSSE3
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX

; Partial load dot product patterns based off PR51075

;
; dot3(ptr x, ptr y) - ((x[0]*y[0])+(x[1]*y[1])+(x[2]*y[2]))
;

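; A C source producing this kind of pattern might look like the sketch below
; (an assumed form for illustration, not the verbatim PR51075 reproducer):
;
;   float dot3(const float *x, const float *y) {
;     return x[0] * y[0] + x[1] * y[1] + x[2] * y[2];
;   }
;
; dot3_float4 computes this from full <4 x float> loads of both inputs,
; summing only the low three lanes of the product.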
define float @dot3_float4(ptr dereferenceable(16) %a0, ptr dereferenceable(16) %a1) {
; SSE2-LABEL: dot3_float4:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movups (%rdi), %xmm0
; SSE2-NEXT:    movups (%rsi), %xmm1
; SSE2-NEXT:    mulps %xmm0, %xmm1
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[1,1]
; SSE2-NEXT:    addss %xmm1, %xmm0
; SSE2-NEXT:    movhlps {{.*#+}} xmm1 = xmm1[1,1]
; SSE2-NEXT:    addss %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: dot3_float4:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movups (%rdi), %xmm0
; SSSE3-NEXT:    movups (%rsi), %xmm1
; SSSE3-NEXT:    mulps %xmm0, %xmm1
; SSSE3-NEXT:    movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
; SSSE3-NEXT:    addss %xmm1, %xmm0
; SSSE3-NEXT:    movhlps {{.*#+}} xmm1 = xmm1[1,1]
; SSSE3-NEXT:    addss %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: dot3_float4:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movups (%rdi), %xmm0
; SSE41-NEXT:    movups (%rsi), %xmm1
; SSE41-NEXT:    mulps %xmm0, %xmm1
; SSE41-NEXT:    movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
; SSE41-NEXT:    addss %xmm1, %xmm0
; SSE41-NEXT:    movhlps {{.*#+}} xmm1 = xmm1[1,1]
; SSE41-NEXT:    addss %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: dot3_float4:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovups (%rdi), %xmm0
; AVX-NEXT:    vmulps (%rsi), %xmm0, %xmm0
; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-NEXT:    vshufpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vaddss %xmm2, %xmm0, %xmm0
; AVX-NEXT:    retq
  %x0123 = load <4 x float>, ptr %a0, align 4
  %y0123 = load <4 x float>, ptr %a1, align 4
  %mul0123 = fmul <4 x float> %x0123, %y0123
  %mul0 = extractelement <4 x float> %mul0123, i32 0
  %mul1 = extractelement <4 x float> %mul0123, i32 1
  %mul2 = extractelement <4 x float> %mul0123, i32 2
  %dot01 = fadd float %mul0, %mul1
  %dot012 = fadd float %dot01, %mul2
  ret float %dot012
}

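; Same computation as above, but the <4 x float> loads are shuffled down to
; <3 x float> operands before the multiply.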
define float @dot3_float4_as_float3(ptr dereferenceable(16) %a0, ptr dereferenceable(16) %a1) {
; SSE2-LABEL: dot3_float4_as_float3:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movups (%rdi), %xmm0
; SSE2-NEXT:    movups (%rsi), %xmm1
; SSE2-NEXT:    mulps %xmm0, %xmm1
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[1,1]
; SSE2-NEXT:    addss %xmm1, %xmm0
; SSE2-NEXT:    movhlps {{.*#+}} xmm1 = xmm1[1,1]
; SSE2-NEXT:    addss %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: dot3_float4_as_float3:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movups (%rdi), %xmm0
; SSSE3-NEXT:    movups (%rsi), %xmm1
; SSSE3-NEXT:    mulps %xmm0, %xmm1
; SSSE3-NEXT:    movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
; SSSE3-NEXT:    addss %xmm1, %xmm0
; SSSE3-NEXT:    movhlps {{.*#+}} xmm1 = xmm1[1,1]
; SSSE3-NEXT:    addss %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: dot3_float4_as_float3:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movups (%rdi), %xmm0
; SSE41-NEXT:    movups (%rsi), %xmm1
; SSE41-NEXT:    mulps %xmm0, %xmm1
; SSE41-NEXT:    movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
; SSE41-NEXT:    addss %xmm1, %xmm0
; SSE41-NEXT:    movhlps {{.*#+}} xmm1 = xmm1[1,1]
; SSE41-NEXT:    addss %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: dot3_float4_as_float3:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovups (%rdi), %xmm0
; AVX-NEXT:    vmulps (%rsi), %xmm0, %xmm0
; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-NEXT:    vshufpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vaddss %xmm2, %xmm0, %xmm0
; AVX-NEXT:    retq
  %x0123 = load <4 x float>, ptr %a0, align 4
  %y0123 = load <4 x float>, ptr %a1, align 4
  %x012 = shufflevector <4 x float> %x0123, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2>
  %y012 = shufflevector <4 x float> %y0123, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2>
  %mul012 = fmul <3 x float> %x012, %y012
  %mul0 = extractelement <3 x float> %mul012, i32 0
  %mul1 = extractelement <3 x float> %mul012, i32 1
  %mul2 = extractelement <3 x float> %mul012, i32 2
  %dot01 = fadd float %mul0, %mul1
  %dot012 = fadd float %dot01, %mul2
  ret float %dot012
}

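; dot3 computed directly from <3 x float> loads.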
define float @dot3_float3(ptr dereferenceable(16) %a0, ptr dereferenceable(16) %a1) {
; SSE2-LABEL: dot3_float3:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
; SSE2-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0]
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
; SSE2-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
; SSE2-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,0],xmm1[3,0]
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0,2]
; SSE2-NEXT:    mulps %xmm0, %xmm1
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[1,1]
; SSE2-NEXT:    addss %xmm1, %xmm0
; SSE2-NEXT:    movhlps {{.*#+}} xmm1 = xmm1[1,1]
; SSE2-NEXT:    addss %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: dot3_float3:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
; SSSE3-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0]
; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
; SSSE3-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
; SSSE3-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSSE3-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,0],xmm1[3,0]
; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0,2]
; SSSE3-NEXT:    mulps %xmm0, %xmm1
; SSSE3-NEXT:    movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
; SSSE3-NEXT:    addss %xmm1, %xmm0
; SSSE3-NEXT:    movhlps {{.*#+}} xmm1 = xmm1[1,1]
; SSSE3-NEXT:    addss %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: dot3_float3:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
; SSE41-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
; SSE41-NEXT:    insertps {{.*#+}} xmm1 = xmm1[0,1],mem[0],xmm1[3]
; SSE41-NEXT:    mulps %xmm0, %xmm1
; SSE41-NEXT:    movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
; SSE41-NEXT:    addss %xmm1, %xmm0
; SSE41-NEXT:    movhlps {{.*#+}} xmm1 = xmm1[1,1]
; SSE41-NEXT:    addss %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: dot3_float3:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
; AVX-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
; AVX-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],mem[0],xmm1[3]
; AVX-NEXT:    vmulps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-NEXT:    vshufpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vaddss %xmm2, %xmm0, %xmm0
; AVX-NEXT:    retq
  %x012 = load <3 x float>, ptr %a0, align 4
  %y012 = load <3 x float>, ptr %a1, align 4
  %mul012 = fmul <3 x float> %x012, %y012
  %mul0 = extractelement <3 x float> %mul012, i32 0
  %mul1 = extractelement <3 x float> %mul012, i32 1
  %mul2 = extractelement <3 x float> %mul012, i32 2
  %dot01 = fadd float %mul0, %mul1
  %dot012 = fadd float %dot01, %mul2
  ret float %dot012
}

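; dot3 with the loads split into a <2 x float> load of elements 0-1 and a
; scalar load of element 2.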
define float @dot3_float2_float(ptr dereferenceable(16) %a0, ptr dereferenceable(16) %a1) {
; SSE2-LABEL: dot3_float2_float:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
; SSE2-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
; SSE2-NEXT:    mulps %xmm0, %xmm1
; SSE2-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSE2-NEXT:    mulss 8(%rsi), %xmm2
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[1,1]
; SSE2-NEXT:    addss %xmm1, %xmm0
; SSE2-NEXT:    addss %xmm2, %xmm0
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: dot3_float2_float:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
; SSSE3-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
; SSSE3-NEXT:    mulps %xmm0, %xmm1
; SSSE3-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSSE3-NEXT:    mulss 8(%rsi), %xmm2
; SSSE3-NEXT:    movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
; SSSE3-NEXT:    addss %xmm1, %xmm0
; SSSE3-NEXT:    addss %xmm2, %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: dot3_float2_float:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
; SSE41-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
; SSE41-NEXT:    mulps %xmm0, %xmm1
; SSE41-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSE41-NEXT:    mulss 8(%rsi), %xmm2
; SSE41-NEXT:    movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
; SSE41-NEXT:    addss %xmm1, %xmm0
; SSE41-NEXT:    addss %xmm2, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: dot3_float2_float:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
; AVX-NEXT:    vmulps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; AVX-NEXT:    vmulss 8(%rsi), %xmm1, %xmm1
; AVX-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
; AVX-NEXT:    vaddss %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %x01 = load <2 x float>, ptr %a0, align 4
  %y01 = load <2 x float>, ptr %a1, align 4
  %ptrx2 = getelementptr inbounds float, ptr %a0, i64 2
  %ptry2 = getelementptr inbounds float, ptr %a1, i64 2
  %x2 = load float, ptr %ptrx2, align 4
  %y2 = load float, ptr %ptry2, align 4
  %mul01 = fmul <2 x float> %x01, %y01
  %mul2 = fmul float %x2, %y2
  %mul0 = extractelement <2 x float> %mul01, i32 0
  %mul1 = extractelement <2 x float> %mul01, i32 1
  %dot01 = fadd float %mul0, %mul1
  %dot012 = fadd float %dot01, %mul2
  ret float %dot012
}

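; dot3 with the loads split the other way: a scalar load of element 0 and a
; <2 x float> load of elements 1-2.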
define float @dot3_float_float2(ptr dereferenceable(16) %a0, ptr dereferenceable(16) %a1) {
; SSE2-LABEL: dot3_float_float2:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE2-NEXT:    movsd {{.*#+}} xmm2 = mem[0],zero
; SSE2-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
; SSE2-NEXT:    mulps %xmm2, %xmm0
; SSE2-NEXT:    mulss (%rsi), %xmm1
; SSE2-NEXT:    addss %xmm0, %xmm1
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
; SSE2-NEXT:    addss %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: dot3_float_float2:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSSE3-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
; SSSE3-NEXT:    movsd {{.*#+}} xmm2 = mem[0],zero
; SSSE3-NEXT:    mulps %xmm1, %xmm2
; SSSE3-NEXT:    mulss (%rsi), %xmm0
; SSSE3-NEXT:    movshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
; SSSE3-NEXT:    addss %xmm2, %xmm0
; SSSE3-NEXT:    addss %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: dot3_float_float2:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE41-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
; SSE41-NEXT:    movsd {{.*#+}} xmm2 = mem[0],zero
; SSE41-NEXT:    mulps %xmm1, %xmm2
; SSE41-NEXT:    mulss (%rsi), %xmm0
; SSE41-NEXT:    movshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
; SSE41-NEXT:    addss %xmm2, %xmm0
; SSE41-NEXT:    addss %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: dot3_float_float2:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
; AVX-NEXT:    vmovsd {{.*#+}} xmm2 = mem[0],zero
; AVX-NEXT:    vmulps %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vmulss (%rsi), %xmm0, %xmm0
; AVX-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vaddss %xmm2, %xmm0, %xmm0
; AVX-NEXT:    retq
  %x0 = load float, ptr %a0, align 4
  %y0 = load float, ptr %a1, align 4
  %ptrx12 = getelementptr inbounds float, ptr %a0, i64 1
  %ptry12 = getelementptr inbounds float, ptr %a1, i64 1
  %x12 = load <2 x float>, ptr %ptrx12, align 4
  %y12 = load <2 x float>, ptr %ptry12, align 4
  %mul0 = fmul float %x0, %y0
  %mul12 = fmul <2 x float> %x12, %y12
  %mul1 = extractelement <2 x float> %mul12, i32 0
  %mul2 = extractelement <2 x float> %mul12, i32 1
  %dot01 = fadd float %mul0, %mul1
  %dot012 = fadd float %dot01, %mul2
  ret float %dot012
}

;
; dot2(ptr x, ptr y) - ((x[0]*y[0])+(x[1]*y[1]))
;

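; A matching C sketch (again an assumed form, not verbatim source):
;
;   float dot2(const float *x, const float *y) {
;     return x[0] * y[0] + x[1] * y[1];
;   }
;
; dot2_float4 computes this from full <4 x float> loads, summing only the
; low two lanes of the product.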
define float @dot2_float4(ptr dereferenceable(16) %a0, ptr dereferenceable(16) %a1) {
; SSE2-LABEL: dot2_float4:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movups (%rdi), %xmm0
; SSE2-NEXT:    movups (%rsi), %xmm1
; SSE2-NEXT:    mulps %xmm0, %xmm1
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[1,1]
; SSE2-NEXT:    addss %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: dot2_float4:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movups (%rdi), %xmm0
; SSSE3-NEXT:    movups (%rsi), %xmm1
; SSSE3-NEXT:    mulps %xmm0, %xmm1
; SSSE3-NEXT:    movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
; SSSE3-NEXT:    addss %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: dot2_float4:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movups (%rdi), %xmm0
; SSE41-NEXT:    movups (%rsi), %xmm1
; SSE41-NEXT:    mulps %xmm0, %xmm1
; SSE41-NEXT:    movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
; SSE41-NEXT:    addss %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: dot2_float4:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovups (%rdi), %xmm0
; AVX-NEXT:    vmulps (%rsi), %xmm0, %xmm0
; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %x0123 = load <4 x float>, ptr %a0, align 4
  %y0123 = load <4 x float>, ptr %a1, align 4
  %mul0123 = fmul <4 x float> %x0123, %y0123
  %mul0 = extractelement <4 x float> %mul0123, i32 0
  %mul1 = extractelement <4 x float> %mul0123, i32 1
  %dot01 = fadd float %mul0, %mul1
  ret float %dot01
}

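; dot2 computed directly from <2 x float> loads.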
define float @dot2_float2(ptr dereferenceable(16) %a0, ptr dereferenceable(16) %a1) {
; SSE2-LABEL: dot2_float2:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
; SSE2-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
; SSE2-NEXT:    mulps %xmm0, %xmm1
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[1,1]
; SSE2-NEXT:    addss %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: dot2_float2:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
; SSSE3-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
; SSSE3-NEXT:    mulps %xmm0, %xmm1
; SSSE3-NEXT:    movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
; SSSE3-NEXT:    addss %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: dot2_float2:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
; SSE41-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
; SSE41-NEXT:    mulps %xmm0, %xmm1
; SSE41-NEXT:    movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
; SSE41-NEXT:    addss %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: dot2_float2:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
; AVX-NEXT:    vmulps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %x01 = load <2 x float>, ptr %a0, align 4
  %y01 = load <2 x float>, ptr %a1, align 4
  %mul01 = fmul <2 x float> %x01, %y01
  %mul0 = extractelement <2 x float> %mul01, i32 0
  %mul1 = extractelement <2 x float> %mul01, i32 1
  %dot01 = fadd float %mul0, %mul1
  ret float %dot01
}