xref: /llvm-project/llvm/test/CodeGen/X86/horizontal-reduce-fadd.ll (revision e9f9467da063875bd684e46660e2ff36ba4f55e2)
; NOTE(review): this copy was scraped from an xref viewer — each following line begins with an embedded source line number that is NOT part of the checked-in test file. Strip those prefixes before running this through llc/FileCheck.
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse2            | FileCheck %s --check-prefix=SSE2
3; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3           | FileCheck %s --check-prefix=SSSE3-SLOW
4; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3,fast-hops | FileCheck %s --check-prefix=SSSE3-FAST
5; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx             | FileCheck %s --check-prefix=AVX1-SLOW
6; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx,fast-hops   | FileCheck %s --check-prefix=AVX1-FAST
7; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2            | FileCheck %s --check-prefix=AVX2
8
9; PR37890 - subvector reduction followed by shuffle reduction
10
; Reduce <4 x float> %a with fast-math fadds (PR37890 pattern): split into
; low/high <2 x float> halves, add them, then fold lane 1 into lane 0 and
; extract lane 0. The autogenerated CHECK lines pin the expected lowering per
; RUN prefix: plain SSE2/SSSE3/AVX use shuffle+add pairs, while the fast-hops
; variants (SSSE3-FAST / AVX1-FAST) are expected to fold the whole reduction
; into two (v)haddps. Do not hand-edit CHECK lines; rerun
; utils/update_llc_test_checks.py after any change.
11define float @PR37890_v4f32(<4 x float> %a)  {
12; SSE2-LABEL: PR37890_v4f32:
13; SSE2:       # %bb.0:
14; SSE2-NEXT:    movaps %xmm0, %xmm1
15; SSE2-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
16; SSE2-NEXT:    addps %xmm1, %xmm0
17; SSE2-NEXT:    movaps %xmm0, %xmm1
18; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1]
19; SSE2-NEXT:    addss %xmm1, %xmm0
20; SSE2-NEXT:    retq
21;
22; SSSE3-SLOW-LABEL: PR37890_v4f32:
23; SSSE3-SLOW:       # %bb.0:
24; SSSE3-SLOW-NEXT:    movaps %xmm0, %xmm1
25; SSSE3-SLOW-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
26; SSSE3-SLOW-NEXT:    addps %xmm1, %xmm0
27; SSSE3-SLOW-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
28; SSSE3-SLOW-NEXT:    addss %xmm1, %xmm0
29; SSSE3-SLOW-NEXT:    retq
30;
31; SSSE3-FAST-LABEL: PR37890_v4f32:
32; SSSE3-FAST:       # %bb.0:
33; SSSE3-FAST-NEXT:    haddps %xmm0, %xmm0
34; SSSE3-FAST-NEXT:    haddps %xmm0, %xmm0
35; SSSE3-FAST-NEXT:    retq
36;
37; AVX1-SLOW-LABEL: PR37890_v4f32:
38; AVX1-SLOW:       # %bb.0:
39; AVX1-SLOW-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
40; AVX1-SLOW-NEXT:    vaddps %xmm1, %xmm0, %xmm0
41; AVX1-SLOW-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
42; AVX1-SLOW-NEXT:    vaddss %xmm1, %xmm0, %xmm0
43; AVX1-SLOW-NEXT:    retq
44;
45; AVX1-FAST-LABEL: PR37890_v4f32:
46; AVX1-FAST:       # %bb.0:
47; AVX1-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
48; AVX1-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
49; AVX1-FAST-NEXT:    retq
50;
51; AVX2-LABEL: PR37890_v4f32:
52; AVX2:       # %bb.0:
53; AVX2-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
54; AVX2-NEXT:    vaddps %xmm1, %xmm0, %xmm0
55; AVX2-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
56; AVX2-NEXT:    vaddss %xmm1, %xmm0, %xmm0
57; AVX2-NEXT:    retq
58  %hi0 = shufflevector <4 x float> %a, <4 x float> undef, <2 x i32> <i32 2, i32 3>
59  %lo0 = shufflevector <4 x float> %a, <4 x float> undef, <2 x i32> <i32 0, i32 1>
60  %sum0 = fadd fast <2 x float> %lo0, %hi0
61  %hi1 = shufflevector <2 x float> %sum0, <2 x float> undef, <2 x i32> <i32 1, i32 undef>
62  %sum1 = fadd fast <2 x float> %sum0, %hi1
63  %e = extractelement <2 x float> %sum1, i32 0
64  ret float %e
65}
66
; Reduce <4 x double> %a with fast-math fadds: add the two <2 x double>
; halves, then fold the high lane into the low lane and extract it.
; Under SSE the argument arrives split across xmm0/xmm1, so the first addpd
; already combines the halves; under AVX it arrives in ymm0, hence the
; vextractf128 and the trailing vzeroupper before returning. fast-hops
; variants replace the final shuffle+addsd with a single (v)haddpd.
; CHECK lines are autogenerated — rerun utils/update_llc_test_checks.py.
67define double @PR37890_v4f64(<4 x double> %a)  {
68; SSE2-LABEL: PR37890_v4f64:
69; SSE2:       # %bb.0:
70; SSE2-NEXT:    addpd %xmm1, %xmm0
71; SSE2-NEXT:    movapd %xmm0, %xmm1
72; SSE2-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
73; SSE2-NEXT:    addsd %xmm1, %xmm0
74; SSE2-NEXT:    retq
75;
76; SSSE3-SLOW-LABEL: PR37890_v4f64:
77; SSSE3-SLOW:       # %bb.0:
78; SSSE3-SLOW-NEXT:    addpd %xmm1, %xmm0
79; SSSE3-SLOW-NEXT:    movapd %xmm0, %xmm1
80; SSSE3-SLOW-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
81; SSSE3-SLOW-NEXT:    addsd %xmm1, %xmm0
82; SSSE3-SLOW-NEXT:    retq
83;
84; SSSE3-FAST-LABEL: PR37890_v4f64:
85; SSSE3-FAST:       # %bb.0:
86; SSSE3-FAST-NEXT:    addpd %xmm1, %xmm0
87; SSSE3-FAST-NEXT:    haddpd %xmm0, %xmm0
88; SSSE3-FAST-NEXT:    retq
89;
90; AVX1-SLOW-LABEL: PR37890_v4f64:
91; AVX1-SLOW:       # %bb.0:
92; AVX1-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm1
93; AVX1-SLOW-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
94; AVX1-SLOW-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
95; AVX1-SLOW-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
96; AVX1-SLOW-NEXT:    vzeroupper
97; AVX1-SLOW-NEXT:    retq
98;
99; AVX1-FAST-LABEL: PR37890_v4f64:
100; AVX1-FAST:       # %bb.0:
101; AVX1-FAST-NEXT:    vextractf128 $1, %ymm0, %xmm1
102; AVX1-FAST-NEXT:    vhaddpd %xmm0, %xmm1, %xmm0
103; AVX1-FAST-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
104; AVX1-FAST-NEXT:    vzeroupper
105; AVX1-FAST-NEXT:    retq
106;
107; AVX2-LABEL: PR37890_v4f64:
108; AVX2:       # %bb.0:
109; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm1
110; AVX2-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
111; AVX2-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
112; AVX2-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
113; AVX2-NEXT:    vzeroupper
114; AVX2-NEXT:    retq
115  %hi0 = shufflevector <4 x double> %a, <4 x double> undef, <2 x i32> <i32 2, i32 3>
116  %lo0 = shufflevector <4 x double> %a, <4 x double> undef, <2 x i32> <i32 0, i32 1>
117  %sum0 = fadd fast <2 x double> %lo0, %hi0
118  %hi1 = shufflevector <2 x double> %sum0, <2 x double> undef, <2 x i32> <i32 1, i32 undef>
119  %sum1 = fadd fast <2 x double> %sum0, %hi1
120  %e = extractelement <2 x double> %sum1, i32 0
121  ret double %e
122}
123
; Reduce <8 x float> %a with fast-math fadds in three halving steps
; (8 -> 4 -> 2 -> scalar). SSE receives the vector split across xmm0/xmm1;
; AVX receives it in ymm0 (hence vextractf128 + vzeroupper).
; NOTE(review): the AVX1-FAST expectation contains three vhaddps — after the
; cross-half vhaddps, two more follow; one looks redundant. Presumably this
; captures a known missed optimization rather than intended-optimal output —
; confirm against the PR37890 discussion before "fixing" the CHECK lines.
; CHECK lines are autogenerated — rerun utils/update_llc_test_checks.py.
124define float @PR37890_v8f32(<8 x float> %a)  {
125; SSE2-LABEL: PR37890_v8f32:
126; SSE2:       # %bb.0:
127; SSE2-NEXT:    addps %xmm1, %xmm0
128; SSE2-NEXT:    movaps %xmm0, %xmm1
129; SSE2-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
130; SSE2-NEXT:    addps %xmm1, %xmm0
131; SSE2-NEXT:    movaps %xmm0, %xmm1
132; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1]
133; SSE2-NEXT:    addss %xmm1, %xmm0
134; SSE2-NEXT:    retq
135;
136; SSSE3-SLOW-LABEL: PR37890_v8f32:
137; SSSE3-SLOW:       # %bb.0:
138; SSSE3-SLOW-NEXT:    addps %xmm1, %xmm0
139; SSSE3-SLOW-NEXT:    movaps %xmm0, %xmm1
140; SSSE3-SLOW-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
141; SSSE3-SLOW-NEXT:    addps %xmm1, %xmm0
142; SSSE3-SLOW-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
143; SSSE3-SLOW-NEXT:    addss %xmm1, %xmm0
144; SSSE3-SLOW-NEXT:    retq
145;
146; SSSE3-FAST-LABEL: PR37890_v8f32:
147; SSSE3-FAST:       # %bb.0:
148; SSSE3-FAST-NEXT:    addps %xmm1, %xmm0
149; SSSE3-FAST-NEXT:    haddps %xmm0, %xmm0
150; SSSE3-FAST-NEXT:    haddps %xmm0, %xmm0
151; SSSE3-FAST-NEXT:    retq
152;
153; AVX1-SLOW-LABEL: PR37890_v8f32:
154; AVX1-SLOW:       # %bb.0:
155; AVX1-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm1
156; AVX1-SLOW-NEXT:    vaddps %xmm1, %xmm0, %xmm0
157; AVX1-SLOW-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
158; AVX1-SLOW-NEXT:    vaddps %xmm1, %xmm0, %xmm0
159; AVX1-SLOW-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
160; AVX1-SLOW-NEXT:    vaddss %xmm1, %xmm0, %xmm0
161; AVX1-SLOW-NEXT:    vzeroupper
162; AVX1-SLOW-NEXT:    retq
163;
164; AVX1-FAST-LABEL: PR37890_v8f32:
165; AVX1-FAST:       # %bb.0:
166; AVX1-FAST-NEXT:    vextractf128 $1, %ymm0, %xmm1
167; AVX1-FAST-NEXT:    vhaddps %xmm0, %xmm1, %xmm0
168; AVX1-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
169; AVX1-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
170; AVX1-FAST-NEXT:    vzeroupper
171; AVX1-FAST-NEXT:    retq
172;
173; AVX2-LABEL: PR37890_v8f32:
174; AVX2:       # %bb.0:
175; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm1
176; AVX2-NEXT:    vaddps %xmm1, %xmm0, %xmm0
177; AVX2-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
178; AVX2-NEXT:    vaddps %xmm1, %xmm0, %xmm0
179; AVX2-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
180; AVX2-NEXT:    vaddss %xmm1, %xmm0, %xmm0
181; AVX2-NEXT:    vzeroupper
182; AVX2-NEXT:    retq
183  %hi0 = shufflevector <8 x float> %a, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
184  %lo0 = shufflevector <8 x float> %a, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
185  %sum0 = fadd fast <4 x float> %lo0, %hi0
186  %hi1 = shufflevector <4 x float> %sum0, <4 x float> undef, <2 x i32> <i32 2, i32 3>
187  %lo1 = shufflevector <4 x float> %sum0, <4 x float> undef, <2 x i32> <i32 0, i32 1>
188  %sum1 = fadd fast <2 x float> %lo1, %hi1
189  %hi2 = shufflevector <2 x float> %sum1, <2 x float> undef, <2 x i32> <i32 1, i32 undef>
190  %sum2 = fadd fast <2 x float> %sum1, %hi2
191  %e = extractelement <2 x float> %sum2, i32 0
192  ret float %e
193}
194
; Reduce <8 x double> %a with fast-math fadds in three halving steps
; (8 -> 4 -> 2 -> scalar). SSE receives the vector split across xmm0..xmm3,
; so two addpd combine the quarters before the lane fold; AVX receives it in
; ymm0/ymm1, hence a single 256-bit vaddpd, then vextractf128 and vzeroupper.
; fast-hops variants end in (v)haddpd instead of shuffle+addsd.
; CHECK lines are autogenerated — rerun utils/update_llc_test_checks.py.
195define double @PR37890_v8f64(<8 x double> %a)  {
196; SSE2-LABEL: PR37890_v8f64:
197; SSE2:       # %bb.0:
198; SSE2-NEXT:    addpd %xmm3, %xmm1
199; SSE2-NEXT:    addpd %xmm2, %xmm0
200; SSE2-NEXT:    addpd %xmm1, %xmm0
201; SSE2-NEXT:    movapd %xmm0, %xmm1
202; SSE2-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
203; SSE2-NEXT:    addsd %xmm1, %xmm0
204; SSE2-NEXT:    retq
205;
206; SSSE3-SLOW-LABEL: PR37890_v8f64:
207; SSSE3-SLOW:       # %bb.0:
208; SSSE3-SLOW-NEXT:    addpd %xmm3, %xmm1
209; SSSE3-SLOW-NEXT:    addpd %xmm2, %xmm0
210; SSSE3-SLOW-NEXT:    addpd %xmm1, %xmm0
211; SSSE3-SLOW-NEXT:    movapd %xmm0, %xmm1
212; SSSE3-SLOW-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
213; SSSE3-SLOW-NEXT:    addsd %xmm1, %xmm0
214; SSSE3-SLOW-NEXT:    retq
215;
216; SSSE3-FAST-LABEL: PR37890_v8f64:
217; SSSE3-FAST:       # %bb.0:
218; SSSE3-FAST-NEXT:    addpd %xmm3, %xmm1
219; SSSE3-FAST-NEXT:    addpd %xmm2, %xmm0
220; SSSE3-FAST-NEXT:    addpd %xmm1, %xmm0
221; SSSE3-FAST-NEXT:    haddpd %xmm0, %xmm0
222; SSSE3-FAST-NEXT:    retq
223;
224; AVX1-SLOW-LABEL: PR37890_v8f64:
225; AVX1-SLOW:       # %bb.0:
226; AVX1-SLOW-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
227; AVX1-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm1
228; AVX1-SLOW-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
229; AVX1-SLOW-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
230; AVX1-SLOW-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
231; AVX1-SLOW-NEXT:    vzeroupper
232; AVX1-SLOW-NEXT:    retq
233;
234; AVX1-FAST-LABEL: PR37890_v8f64:
235; AVX1-FAST:       # %bb.0:
236; AVX1-FAST-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
237; AVX1-FAST-NEXT:    vextractf128 $1, %ymm0, %xmm1
238; AVX1-FAST-NEXT:    vhaddpd %xmm0, %xmm1, %xmm0
239; AVX1-FAST-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
240; AVX1-FAST-NEXT:    vzeroupper
241; AVX1-FAST-NEXT:    retq
242;
243; AVX2-LABEL: PR37890_v8f64:
244; AVX2:       # %bb.0:
245; AVX2-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
246; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm1
247; AVX2-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
248; AVX2-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
249; AVX2-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
250; AVX2-NEXT:    vzeroupper
251; AVX2-NEXT:    retq
252  %hi0 = shufflevector <8 x double> %a, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
253  %lo0 = shufflevector <8 x double> %a, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
254  %sum0 = fadd fast <4 x double> %lo0, %hi0
255  %hi1 = shufflevector <4 x double> %sum0, <4 x double> undef, <2 x i32> <i32 2, i32 3>
256  %lo1 = shufflevector <4 x double> %sum0, <4 x double> undef, <2 x i32> <i32 0, i32 1>
257  %sum1 = fadd fast <2 x double> %lo1, %hi1
258  %hi2 = shufflevector <2 x double> %sum1, <2 x double> undef, <2 x i32> <i32 1, i32 undef>
259  %sum2 = fadd fast <2 x double> %sum1, %hi2
260  %e = extractelement <2 x double> %sum2, i32 0
261  ret double %e
262}
263
; Reduce <16 x float> %a with fast-math fadds in four halving steps
; (16 -> 8 -> 4 -> 2 -> scalar). SSE receives the vector split across
; xmm0..xmm3; AVX receives it in ymm0/ymm1 (one 256-bit vaddps, then
; vextractf128 + vzeroupper). SSSE3-FAST is expected to use haddps only for
; the final fold; AVX1-FAST expects three vhaddps — NOTE(review): as in
; PR37890_v8f32, one of those looks redundant; presumably a known missed
; optimization captured by the test — confirm before changing expectations.
; CHECK lines are autogenerated — rerun utils/update_llc_test_checks.py.
264define float @PR37890_v16f32(<16 x float> %a)  {
265; SSE2-LABEL: PR37890_v16f32:
266; SSE2:       # %bb.0:
267; SSE2-NEXT:    addps %xmm3, %xmm1
268; SSE2-NEXT:    addps %xmm2, %xmm0
269; SSE2-NEXT:    addps %xmm1, %xmm0
270; SSE2-NEXT:    movaps %xmm0, %xmm1
271; SSE2-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
272; SSE2-NEXT:    addps %xmm1, %xmm0
273; SSE2-NEXT:    movaps %xmm0, %xmm1
274; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1]
275; SSE2-NEXT:    addss %xmm1, %xmm0
276; SSE2-NEXT:    retq
277;
278; SSSE3-SLOW-LABEL: PR37890_v16f32:
279; SSSE3-SLOW:       # %bb.0:
280; SSSE3-SLOW-NEXT:    addps %xmm3, %xmm1
281; SSSE3-SLOW-NEXT:    addps %xmm2, %xmm0
282; SSSE3-SLOW-NEXT:    addps %xmm1, %xmm0
283; SSSE3-SLOW-NEXT:    movaps %xmm0, %xmm1
284; SSSE3-SLOW-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
285; SSSE3-SLOW-NEXT:    addps %xmm1, %xmm0
286; SSSE3-SLOW-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
287; SSSE3-SLOW-NEXT:    addss %xmm1, %xmm0
288; SSSE3-SLOW-NEXT:    retq
289;
290; SSSE3-FAST-LABEL: PR37890_v16f32:
291; SSSE3-FAST:       # %bb.0:
292; SSSE3-FAST-NEXT:    addps %xmm3, %xmm1
293; SSSE3-FAST-NEXT:    addps %xmm2, %xmm0
294; SSSE3-FAST-NEXT:    addps %xmm1, %xmm0
295; SSSE3-FAST-NEXT:    movaps %xmm0, %xmm1
296; SSSE3-FAST-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
297; SSSE3-FAST-NEXT:    addps %xmm1, %xmm0
298; SSSE3-FAST-NEXT:    haddps %xmm0, %xmm0
299; SSSE3-FAST-NEXT:    retq
300;
301; AVX1-SLOW-LABEL: PR37890_v16f32:
302; AVX1-SLOW:       # %bb.0:
303; AVX1-SLOW-NEXT:    vaddps %ymm1, %ymm0, %ymm0
304; AVX1-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm1
305; AVX1-SLOW-NEXT:    vaddps %xmm1, %xmm0, %xmm0
306; AVX1-SLOW-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
307; AVX1-SLOW-NEXT:    vaddps %xmm1, %xmm0, %xmm0
308; AVX1-SLOW-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
309; AVX1-SLOW-NEXT:    vaddss %xmm1, %xmm0, %xmm0
310; AVX1-SLOW-NEXT:    vzeroupper
311; AVX1-SLOW-NEXT:    retq
312;
313; AVX1-FAST-LABEL: PR37890_v16f32:
314; AVX1-FAST:       # %bb.0:
315; AVX1-FAST-NEXT:    vaddps %ymm1, %ymm0, %ymm0
316; AVX1-FAST-NEXT:    vextractf128 $1, %ymm0, %xmm1
317; AVX1-FAST-NEXT:    vhaddps %xmm0, %xmm1, %xmm0
318; AVX1-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
319; AVX1-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
320; AVX1-FAST-NEXT:    vzeroupper
321; AVX1-FAST-NEXT:    retq
322;
323; AVX2-LABEL: PR37890_v16f32:
324; AVX2:       # %bb.0:
325; AVX2-NEXT:    vaddps %ymm1, %ymm0, %ymm0
326; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm1
327; AVX2-NEXT:    vaddps %xmm1, %xmm0, %xmm0
328; AVX2-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
329; AVX2-NEXT:    vaddps %xmm1, %xmm0, %xmm0
330; AVX2-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
331; AVX2-NEXT:    vaddss %xmm1, %xmm0, %xmm0
332; AVX2-NEXT:    vzeroupper
333; AVX2-NEXT:    retq
334  %hi0 = shufflevector <16 x float> %a, <16 x float> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
335  %lo0 = shufflevector <16 x float> %a, <16 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
336  %sum0 = fadd fast <8 x float> %lo0, %hi0
337  %hi1 = shufflevector <8 x float> %sum0, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
338  %lo1 = shufflevector <8 x float> %sum0, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
339  %sum1 = fadd fast <4 x float> %lo1, %hi1
340  %hi2 = shufflevector <4 x float> %sum1, <4 x float> undef, <2 x i32> <i32 2, i32 3>
341  %lo2 = shufflevector <4 x float> %sum1, <4 x float> undef, <2 x i32> <i32 0, i32 1>
342  %sum2 = fadd fast <2 x float> %lo2, %hi2
343  %hi3 = shufflevector <2 x float> %sum2, <2 x float> undef, <2 x i32> <i32 1, i32 undef>
344  %sum3 = fadd fast <2 x float> %sum2, %hi3
345  %e = extractelement <2 x float> %sum3, i32 0
346  ret float %e
347}
348