xref: /llvm-project/llvm/test/Transforms/SLPVectorizer/X86/hadd.ll (revision ebfdd38228d4e21597642301fb75f5b02ff3ee06)
1; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
2; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,SSE
3; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,SLM
4; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX
5; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX
6; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX
7; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX
8
9;
10; 128-bit vectors
11;
12
; Horizontal add on <2 x double>: result lane 0 is a[0]+a[1] and lane 1 is
; b[0]+b[1], written as scalar extractelement / fadd / insertelement.
; Every RUN configuration shares one expectation here: two shufflevectors
; selecting elements (0, 2) and (1, 3) of the a/b pair, then one vector fadd.
13define <2 x double> @test_v2f64(<2 x double> %a, <2 x double> %b) {
14; CHECK-LABEL: @test_v2f64(
15; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x double> [[A:%.*]], <2 x double> [[B:%.*]], <2 x i32> <i32 0, i32 2>
16; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <2 x double> [[A]], <2 x double> [[B]], <2 x i32> <i32 1, i32 3>
17; CHECK-NEXT:    [[TMP3:%.*]] = fadd <2 x double> [[TMP1]], [[TMP2]]
18; CHECK-NEXT:    ret <2 x double> [[TMP3]]
19;
20  %a0 = extractelement <2 x double> %a, i32 0
21  %a1 = extractelement <2 x double> %a, i32 1
22  %b0 = extractelement <2 x double> %b, i32 0
23  %b1 = extractelement <2 x double> %b, i32 1
24  %r0 = fadd double %a0, %a1
25  %r1 = fadd double %b0, %b1
26  %r00 = insertelement <2 x double> zeroinitializer, double %r0, i32 0
27  %r01 = insertelement <2 x double>  %r00, double %r1, i32 1
28  ret <2 x double> %r01
29}
30
; Horizontal add on <4 x float>: lanes 0-1 hold the adjacent-pair sums of %a
; (a0+a1, a2+a3) and lanes 2-3 hold the adjacent-pair sums of %b.
; Every RUN configuration shares one expectation: shufflevectors gathering the
; even-indexed and odd-indexed elements of the a/b pair, then one vector fadd.
31define <4 x float> @test_v4f32(<4 x float> %a, <4 x float> %b) {
32; CHECK-LABEL: @test_v4f32(
33; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 4, i32 6>
34; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> <i32 1, i32 3, i32 5, i32 7>
35; CHECK-NEXT:    [[TMP3:%.*]] = fadd <4 x float> [[TMP1]], [[TMP2]]
36; CHECK-NEXT:    ret <4 x float> [[TMP3]]
37;
38  %a0 = extractelement <4 x float> %a, i32 0
39  %a1 = extractelement <4 x float> %a, i32 1
40  %a2 = extractelement <4 x float> %a, i32 2
41  %a3 = extractelement <4 x float> %a, i32 3
42  %b0 = extractelement <4 x float> %b, i32 0
43  %b1 = extractelement <4 x float> %b, i32 1
44  %b2 = extractelement <4 x float> %b, i32 2
45  %b3 = extractelement <4 x float> %b, i32 3
46  %r0 = fadd float %a0, %a1
47  %r1 = fadd float %a2, %a3
48  %r2 = fadd float %b0, %b1
49  %r3 = fadd float %b2, %b3
50  %r00 = insertelement <4 x float> zeroinitializer, float %r0, i32 0
51  %r01 = insertelement <4 x float>  %r00, float %r1, i32 1
52  %r02 = insertelement <4 x float>  %r01, float %r2, i32 2
53  %r03 = insertelement <4 x float>  %r02, float %r3, i32 3
54  ret <4 x float> %r03
55}
56
; Integer horizontal add on <2 x i64>: result lane 0 is a[0]+a[1] and lane 1
; is b[0]+b[1], written as scalar extractelement / add / insertelement.
; Every RUN configuration shares one expectation: two shufflevectors selecting
; elements (0, 2) and (1, 3) of the a/b pair, then one vector add.
57define <2 x i64> @test_v2i64(<2 x i64> %a, <2 x i64> %b) {
58; CHECK-LABEL: @test_v2i64(
59; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], <2 x i32> <i32 0, i32 2>
60; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <2 x i64> [[A]], <2 x i64> [[B]], <2 x i32> <i32 1, i32 3>
61; CHECK-NEXT:    [[TMP3:%.*]] = add <2 x i64> [[TMP1]], [[TMP2]]
62; CHECK-NEXT:    ret <2 x i64> [[TMP3]]
63;
64  %a0 = extractelement <2 x i64> %a, i32 0
65  %a1 = extractelement <2 x i64> %a, i32 1
66  %b0 = extractelement <2 x i64> %b, i32 0
67  %b1 = extractelement <2 x i64> %b, i32 1
68  %r0 = add i64 %a0, %a1
69  %r1 = add i64 %b0, %b1
70  %r00 = insertelement <2 x i64> zeroinitializer, i64 %r0, i32 0
71  %r01 = insertelement <2 x i64>  %r00, i64 %r1, i32 1
72  ret <2 x i64> %r01
73}
74
; Integer horizontal add on <4 x i32>: lanes 0-1 hold the adjacent-pair sums
; of %a (a0+a1, a2+a3) and lanes 2-3 hold the adjacent-pair sums of %b.
; Every RUN configuration shares one expectation: shufflevectors gathering the
; even-indexed and odd-indexed elements of the a/b pair, then one vector add.
75define <4 x i32> @test_v4i32(<4 x i32> %a, <4 x i32> %b) {
76; CHECK-LABEL: @test_v4i32(
77; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 4, i32 6>
78; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> <i32 1, i32 3, i32 5, i32 7>
79; CHECK-NEXT:    [[TMP3:%.*]] = add <4 x i32> [[TMP1]], [[TMP2]]
80; CHECK-NEXT:    ret <4 x i32> [[TMP3]]
81;
82  %a0 = extractelement <4 x i32> %a, i32 0
83  %a1 = extractelement <4 x i32> %a, i32 1
84  %a2 = extractelement <4 x i32> %a, i32 2
85  %a3 = extractelement <4 x i32> %a, i32 3
86  %b0 = extractelement <4 x i32> %b, i32 0
87  %b1 = extractelement <4 x i32> %b, i32 1
88  %b2 = extractelement <4 x i32> %b, i32 2
89  %b3 = extractelement <4 x i32> %b, i32 3
90  %r0 = add i32 %a0, %a1
91  %r1 = add i32 %a2, %a3
92  %r2 = add i32 %b0, %b1
93  %r3 = add i32 %b2, %b3
94  %r00 = insertelement <4 x i32> zeroinitializer, i32 %r0, i32 0
95  %r01 = insertelement <4 x i32>  %r00, i32 %r1, i32 1
96  %r02 = insertelement <4 x i32>  %r01, i32 %r2, i32 2
97  %r03 = insertelement <4 x i32>  %r02, i32 %r3, i32 3
98  ret <4 x i32> %r03
99}
100
; Integer horizontal add on <8 x i16>: lanes 0-3 hold the four adjacent-pair
; sums of %a and lanes 4-7 hold the four adjacent-pair sums of %b.
; Every RUN configuration shares one expectation: shufflevectors gathering the
; even-indexed and odd-indexed elements of the a/b pair, then one vector add.
101define <8 x i16> @test_v8i16(<8 x i16> %a, <8 x i16> %b) {
102; CHECK-LABEL: @test_v8i16(
103; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
104; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
105; CHECK-NEXT:    [[TMP3:%.*]] = add <8 x i16> [[TMP1]], [[TMP2]]
106; CHECK-NEXT:    ret <8 x i16> [[TMP3]]
107;
108  %a0 = extractelement <8 x i16> %a, i32 0
109  %a1 = extractelement <8 x i16> %a, i32 1
110  %a2 = extractelement <8 x i16> %a, i32 2
111  %a3 = extractelement <8 x i16> %a, i32 3
112  %a4 = extractelement <8 x i16> %a, i32 4
113  %a5 = extractelement <8 x i16> %a, i32 5
114  %a6 = extractelement <8 x i16> %a, i32 6
115  %a7 = extractelement <8 x i16> %a, i32 7
116  %b0 = extractelement <8 x i16> %b, i32 0
117  %b1 = extractelement <8 x i16> %b, i32 1
118  %b2 = extractelement <8 x i16> %b, i32 2
119  %b3 = extractelement <8 x i16> %b, i32 3
120  %b4 = extractelement <8 x i16> %b, i32 4
121  %b5 = extractelement <8 x i16> %b, i32 5
122  %b6 = extractelement <8 x i16> %b, i32 6
123  %b7 = extractelement <8 x i16> %b, i32 7
124  %r0 = add i16 %a0, %a1
125  %r1 = add i16 %a2, %a3
126  %r2 = add i16 %a4, %a5
127  %r3 = add i16 %a6, %a7
128  %r4 = add i16 %b0, %b1
129  %r5 = add i16 %b2, %b3
130  %r6 = add i16 %b4, %b5
131  %r7 = add i16 %b6, %b7
132  %r00 = insertelement <8 x i16> zeroinitializer, i16 %r0, i32 0
133  %r01 = insertelement <8 x i16>  %r00, i16 %r1, i32 1
134  %r02 = insertelement <8 x i16>  %r01, i16 %r2, i32 2
135  %r03 = insertelement <8 x i16>  %r02, i16 %r3, i32 3
136  %r04 = insertelement <8 x i16>  %r03, i16 %r4, i32 4
137  %r05 = insertelement <8 x i16>  %r04, i16 %r5, i32 5
138  %r06 = insertelement <8 x i16>  %r05, i16 %r6, i32 6
139  %r07 = insertelement <8 x i16>  %r06, i16 %r7, i32 7
140  ret <8 x i16> %r07
141}
142
143; PR41892
; Single-source horizontal add feeding stores: f[0]+f[1] is stored to %p and
; f[2]+f[3] to the next float slot (%p + 1 element).
; Every RUN configuration shares one expectation: two <2 x float> shuffles of
; %f with masks (1, 2) and (0, 3), one vector fadd, and a single
; <2 x float> store in place of the two scalar stores.
144define void @test_v4f32_v2f32_store(<4 x float> %f, ptr %p){
145; CHECK-LABEL: @test_v4f32_v2f32_store(
146; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> [[F:%.*]], <4 x float> poison, <2 x i32> <i32 1, i32 2>
147; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[F]], <4 x float> poison, <2 x i32> <i32 0, i32 3>
148; CHECK-NEXT:    [[TMP3:%.*]] = fadd <2 x float> [[TMP1]], [[TMP2]]
149; CHECK-NEXT:    store <2 x float> [[TMP3]], ptr [[P:%.*]], align 4
150; CHECK-NEXT:    ret void
151;
152  %x0 = extractelement <4 x float> %f, i64 0
153  %x1 = extractelement <4 x float> %f, i64 1
154  %add01 = fadd float %x0, %x1
155  store float %add01, ptr %p, align 4
156  %x2 = extractelement <4 x float> %f, i64 2
157  %x3 = extractelement <4 x float> %f, i64 3
158  %add23 = fadd float %x2, %x3
159  %p23 = getelementptr inbounds float, ptr %p, i64 1
160  store float %add23, ptr %p23, align 4
161  ret void
162}
163
164;
165; 256-bit vectors
166;
167
; Horizontal add on <4 x double> with the a/b sums interleaved per 128-bit
; half: lane 0 = a0+a1, lane 1 = b0+b1, lane 2 = a2+a3, lane 3 = b2+b3.
; Expectations differ by RUN configuration: the SSE and SLM prefixes expect
; two <2 x double> fadds whose results are concatenated by a final shuffle,
; while the AVX prefix expects a single <4 x double> fadd.
168define <4 x double> @test_v4f64(<4 x double> %a, <4 x double> %b) {
169; SSE-LABEL: @test_v4f64(
170; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <2 x i32> <i32 0, i32 4>
171; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 2, i32 6>
172; SSE-NEXT:    [[TMP3:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 1, i32 5>
173; SSE-NEXT:    [[TMP4:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 3, i32 7>
174; SSE-NEXT:    [[TMP5:%.*]] = fadd <2 x double> [[TMP1]], [[TMP3]]
175; SSE-NEXT:    [[TMP6:%.*]] = fadd <2 x double> [[TMP2]], [[TMP4]]
176; SSE-NEXT:    [[TMP7:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
177; SSE-NEXT:    ret <4 x double> [[TMP7]]
178;
179; SLM-LABEL: @test_v4f64(
180; SLM-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <2 x i32> <i32 0, i32 4>
181; SLM-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 2, i32 6>
182; SLM-NEXT:    [[TMP3:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 1, i32 5>
183; SLM-NEXT:    [[TMP4:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 3, i32 7>
184; SLM-NEXT:    [[TMP5:%.*]] = fadd <2 x double> [[TMP1]], [[TMP3]]
185; SLM-NEXT:    [[TMP6:%.*]] = fadd <2 x double> [[TMP2]], [[TMP4]]
186; SLM-NEXT:    [[TMP7:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
187; SLM-NEXT:    ret <4 x double> [[TMP7]]
188;
189; AVX-LABEL: @test_v4f64(
190; AVX-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> <i32 0, i32 4, i32 2, i32 6>
191; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> <i32 1, i32 5, i32 3, i32 7>
192; AVX-NEXT:    [[TMP3:%.*]] = fadd <4 x double> [[TMP1]], [[TMP2]]
193; AVX-NEXT:    ret <4 x double> [[TMP3]]
194;
195  %a0 = extractelement <4 x double> %a, i32 0
196  %a1 = extractelement <4 x double> %a, i32 1
197  %a2 = extractelement <4 x double> %a, i32 2
198  %a3 = extractelement <4 x double> %a, i32 3
199  %b0 = extractelement <4 x double> %b, i32 0
200  %b1 = extractelement <4 x double> %b, i32 1
201  %b2 = extractelement <4 x double> %b, i32 2
202  %b3 = extractelement <4 x double> %b, i32 3
203  %r0 = fadd double %a0, %a1
204  %r1 = fadd double %b0, %b1
205  %r2 = fadd double %a2, %a3
206  %r3 = fadd double %b2, %b3
207  %r00 = insertelement <4 x double> zeroinitializer, double %r0, i32 0
208  %r01 = insertelement <4 x double>  %r00, double %r1, i32 1
209  %r02 = insertelement <4 x double>  %r01, double %r2, i32 2
210  %r03 = insertelement <4 x double>  %r02, double %r3, i32 3
211  ret <4 x double> %r03
212}
213
214; PR50392
; Partial horizontal add: only lanes 0, 2 and 3 of the result are written
; (a0+a1, b0+b1, b2+b3); lane 1 keeps the 0.0 from the zeroinitializer base.
; The SSE and AVX prefixes expect (a0+a1, b0+b1) to be combined into one
; <2 x double> fadd with b2+b3 left scalar; the SLM prefix instead expects
; a0+a1 left scalar and the two %b sums combined into one <2 x double> fadd.
; NOTE(review): the extractelement indices mix i64 (a0, a1, b0, b1) and i32
; (b2, b3); presumably carried over from the original reproducer - confirm
; before normalizing.
215define <4 x double> @test_v4f64_partial_swizzle(<4 x double> %a, <4 x double> %b) {
216; SSE-LABEL: @test_v4f64_partial_swizzle(
217; SSE-NEXT:    [[B2:%.*]] = extractelement <4 x double> [[B:%.*]], i64 2
218; SSE-NEXT:    [[B3:%.*]] = extractelement <4 x double> [[B]], i64 3
219; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B]], <2 x i32> <i32 0, i32 4>
220; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 1, i32 5>
221; SSE-NEXT:    [[TMP3:%.*]] = fadd <2 x double> [[TMP1]], [[TMP2]]
222; SSE-NEXT:    [[R3:%.*]] = fadd double [[B2]], [[B3]]
223; SSE-NEXT:    [[TMP4:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <4 x i32> <i32 0, i32 poison, i32 1, i32 poison>
224; SSE-NEXT:    [[R0212:%.*]] = insertelement <4 x double> [[TMP4]], double 0.000000e+00, i64 1
225; SSE-NEXT:    [[R03:%.*]] = insertelement <4 x double> [[R0212]], double [[R3]], i64 3
226; SSE-NEXT:    ret <4 x double> [[R03]]
227;
228; SLM-LABEL: @test_v4f64_partial_swizzle(
229; SLM-NEXT:    [[A0:%.*]] = extractelement <4 x double> [[A:%.*]], i64 0
230; SLM-NEXT:    [[A1:%.*]] = extractelement <4 x double> [[A]], i64 1
231; SLM-NEXT:    [[R0:%.*]] = fadd double [[A0]], [[A1]]
232; SLM-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[B:%.*]], <4 x double> poison, <2 x i32> <i32 1, i32 2>
233; SLM-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[B]], <4 x double> poison, <2 x i32> <i32 0, i32 3>
234; SLM-NEXT:    [[TMP3:%.*]] = fadd <2 x double> [[TMP1]], [[TMP2]]
235; SLM-NEXT:    [[R00:%.*]] = insertelement <4 x double> <double poison, double 0.000000e+00, double poison, double poison>, double [[R0]], i64 0
236; SLM-NEXT:    [[TMP4:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
237; SLM-NEXT:    [[R031:%.*]] = shufflevector <4 x double> [[R00]], <4 x double> [[TMP4]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
238; SLM-NEXT:    ret <4 x double> [[R031]]
239;
240; AVX-LABEL: @test_v4f64_partial_swizzle(
241; AVX-NEXT:    [[B2:%.*]] = extractelement <4 x double> [[B:%.*]], i64 2
242; AVX-NEXT:    [[B3:%.*]] = extractelement <4 x double> [[B]], i64 3
243; AVX-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B]], <2 x i32> <i32 0, i32 4>
244; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 1, i32 5>
245; AVX-NEXT:    [[TMP3:%.*]] = fadd <2 x double> [[TMP1]], [[TMP2]]
246; AVX-NEXT:    [[R3:%.*]] = fadd double [[B2]], [[B3]]
247; AVX-NEXT:    [[TMP4:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <4 x i32> <i32 0, i32 poison, i32 1, i32 poison>
248; AVX-NEXT:    [[R0212:%.*]] = insertelement <4 x double> [[TMP4]], double 0.000000e+00, i64 1
249; AVX-NEXT:    [[R03:%.*]] = insertelement <4 x double> [[R0212]], double [[R3]], i64 3
250; AVX-NEXT:    ret <4 x double> [[R03]]
251;
252  %a0 = extractelement <4 x double> %a, i64 0
253  %a1 = extractelement <4 x double> %a, i64 1
254  %b0 = extractelement <4 x double> %b, i64 0
255  %b1 = extractelement <4 x double> %b, i64 1
256  %b2 = extractelement <4 x double> %b, i32 2
257  %b3 = extractelement <4 x double> %b, i32 3
258  %r0 = fadd double %a0, %a1
259  %r2 = fadd double %b0, %b1
260  %r3 = fadd double %b2, %b3
261  %r00 = insertelement <4 x double> zeroinitializer, double %r0, i32 0
262  %r02 = insertelement <4 x double>  %r00, double %r2, i32 2
263  %r03 = insertelement <4 x double>  %r02, double %r3, i32 3
264  ret <4 x double> %r03
265}
266
; Horizontal add on <8 x float> with the a/b sums grouped per 128-bit half:
; lanes 0-1 = pair sums of a[0..3], lanes 2-3 = pair sums of b[0..3],
; lanes 4-5 = pair sums of a[4..7], lanes 6-7 = pair sums of b[4..7].
; Expectations differ by RUN configuration: the SSE and SLM prefixes expect
; two <4 x float> fadds whose results are concatenated by a final shuffle,
; while the AVX prefix expects a single <8 x float> fadd.
267define <8 x float> @test_v8f32(<8 x float> %a, <8 x float> %b) {
268; SSE-LABEL: @test_v8f32(
269; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 8, i32 10>
270; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> <i32 4, i32 6, i32 12, i32 14>
271; SSE-NEXT:    [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> <i32 1, i32 3, i32 9, i32 11>
272; SSE-NEXT:    [[TMP4:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> <i32 5, i32 7, i32 13, i32 15>
273; SSE-NEXT:    [[TMP5:%.*]] = fadd <4 x float> [[TMP1]], [[TMP3]]
274; SSE-NEXT:    [[TMP6:%.*]] = fadd <4 x float> [[TMP2]], [[TMP4]]
275; SSE-NEXT:    [[TMP7:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
276; SSE-NEXT:    ret <8 x float> [[TMP7]]
277;
278; SLM-LABEL: @test_v8f32(
279; SLM-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 8, i32 10>
280; SLM-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> <i32 4, i32 6, i32 12, i32 14>
281; SLM-NEXT:    [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> <i32 1, i32 3, i32 9, i32 11>
282; SLM-NEXT:    [[TMP4:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> <i32 5, i32 7, i32 13, i32 15>
283; SLM-NEXT:    [[TMP5:%.*]] = fadd <4 x float> [[TMP1]], [[TMP3]]
284; SLM-NEXT:    [[TMP6:%.*]] = fadd <4 x float> [[TMP2]], [[TMP4]]
285; SLM-NEXT:    [[TMP7:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
286; SLM-NEXT:    ret <8 x float> [[TMP7]]
287;
288; AVX-LABEL: @test_v8f32(
289; AVX-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 6, i32 12, i32 14>
290; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 5, i32 7, i32 13, i32 15>
291; AVX-NEXT:    [[TMP3:%.*]] = fadd <8 x float> [[TMP1]], [[TMP2]]
292; AVX-NEXT:    ret <8 x float> [[TMP3]]
293;
294  %a0 = extractelement <8 x float> %a, i32 0
295  %a1 = extractelement <8 x float> %a, i32 1
296  %a2 = extractelement <8 x float> %a, i32 2
297  %a3 = extractelement <8 x float> %a, i32 3
298  %a4 = extractelement <8 x float> %a, i32 4
299  %a5 = extractelement <8 x float> %a, i32 5
300  %a6 = extractelement <8 x float> %a, i32 6
301  %a7 = extractelement <8 x float> %a, i32 7
302  %b0 = extractelement <8 x float> %b, i32 0
303  %b1 = extractelement <8 x float> %b, i32 1
304  %b2 = extractelement <8 x float> %b, i32 2
305  %b3 = extractelement <8 x float> %b, i32 3
306  %b4 = extractelement <8 x float> %b, i32 4
307  %b5 = extractelement <8 x float> %b, i32 5
308  %b6 = extractelement <8 x float> %b, i32 6
309  %b7 = extractelement <8 x float> %b, i32 7
310  %r0 = fadd float %a0, %a1
311  %r1 = fadd float %a2, %a3
312  %r2 = fadd float %b0, %b1
313  %r3 = fadd float %b2, %b3
314  %r4 = fadd float %a4, %a5
315  %r5 = fadd float %a6, %a7
316  %r6 = fadd float %b4, %b5
317  %r7 = fadd float %b6, %b7
318  %r00 = insertelement <8 x float> zeroinitializer, float %r0, i32 0
319  %r01 = insertelement <8 x float>  %r00, float %r1, i32 1
320  %r02 = insertelement <8 x float>  %r01, float %r2, i32 2
321  %r03 = insertelement <8 x float>  %r02, float %r3, i32 3
322  %r04 = insertelement <8 x float>  %r03, float %r4, i32 4
323  %r05 = insertelement <8 x float>  %r04, float %r5, i32 5
324  %r06 = insertelement <8 x float>  %r05, float %r6, i32 6
325  %r07 = insertelement <8 x float>  %r06, float %r7, i32 7
326  ret <8 x float> %r07
327}
328
; Integer horizontal add on <4 x i64> with the a/b sums interleaved per
; 128-bit half: lane 0 = a0+a1, lane 1 = b0+b1, lane 2 = a2+a3,
; lane 3 = b2+b3.
; Expectations differ by RUN configuration: the SSE and SLM prefixes expect
; two <2 x i64> adds whose results are concatenated by a final shuffle,
; while the AVX prefix expects a single <4 x i64> add.
329define <4 x i64> @test_v4i64(<4 x i64> %a, <4 x i64> %b) {
330; SSE-LABEL: @test_v4i64(
331; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i64> [[A:%.*]], <4 x i64> [[B:%.*]], <2 x i32> <i32 0, i32 4>
332; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> <i32 2, i32 6>
333; SSE-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> <i32 1, i32 5>
334; SSE-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> <i32 3, i32 7>
335; SSE-NEXT:    [[TMP5:%.*]] = add <2 x i64> [[TMP1]], [[TMP3]]
336; SSE-NEXT:    [[TMP6:%.*]] = add <2 x i64> [[TMP2]], [[TMP4]]
337; SSE-NEXT:    [[TMP7:%.*]] = shufflevector <2 x i64> [[TMP5]], <2 x i64> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
338; SSE-NEXT:    ret <4 x i64> [[TMP7]]
339;
340; SLM-LABEL: @test_v4i64(
341; SLM-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i64> [[A:%.*]], <4 x i64> [[B:%.*]], <2 x i32> <i32 0, i32 4>
342; SLM-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> <i32 2, i32 6>
343; SLM-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> <i32 1, i32 5>
344; SLM-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> <i32 3, i32 7>
345; SLM-NEXT:    [[TMP5:%.*]] = add <2 x i64> [[TMP1]], [[TMP3]]
346; SLM-NEXT:    [[TMP6:%.*]] = add <2 x i64> [[TMP2]], [[TMP4]]
347; SLM-NEXT:    [[TMP7:%.*]] = shufflevector <2 x i64> [[TMP5]], <2 x i64> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
348; SLM-NEXT:    ret <4 x i64> [[TMP7]]
349;
350; AVX-LABEL: @test_v4i64(
351; AVX-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i64> [[A:%.*]], <4 x i64> [[B:%.*]], <4 x i32> <i32 0, i32 4, i32 2, i32 6>
352; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <4 x i32> <i32 1, i32 5, i32 3, i32 7>
353; AVX-NEXT:    [[TMP3:%.*]] = add <4 x i64> [[TMP1]], [[TMP2]]
354; AVX-NEXT:    ret <4 x i64> [[TMP3]]
355;
356  %a0 = extractelement <4 x i64> %a, i32 0
357  %a1 = extractelement <4 x i64> %a, i32 1
358  %a2 = extractelement <4 x i64> %a, i32 2
359  %a3 = extractelement <4 x i64> %a, i32 3
360  %b0 = extractelement <4 x i64> %b, i32 0
361  %b1 = extractelement <4 x i64> %b, i32 1
362  %b2 = extractelement <4 x i64> %b, i32 2
363  %b3 = extractelement <4 x i64> %b, i32 3
364  %r0 = add i64 %a0, %a1
365  %r1 = add i64 %b0, %b1
366  %r2 = add i64 %a2, %a3
367  %r3 = add i64 %b2, %b3
368  %r00 = insertelement <4 x i64> zeroinitializer, i64 %r0, i32 0
369  %r01 = insertelement <4 x i64>  %r00, i64 %r1, i32 1
370  %r02 = insertelement <4 x i64>  %r01, i64 %r2, i32 2
371  %r03 = insertelement <4 x i64>  %r02, i64 %r3, i32 3
372  ret <4 x i64> %r03
373}
374
; Integer horizontal add on <8 x i32> with the a/b sums grouped per 128-bit
; half: lanes 0-1 = pair sums of a[0..3], lanes 2-3 = pair sums of b[0..3],
; lanes 4-5 = pair sums of a[4..7], lanes 6-7 = pair sums of b[4..7].
; Expectations differ by RUN configuration: the SSE and SLM prefixes expect
; two <4 x i32> adds whose results are concatenated by a final shuffle,
; while the AVX prefix expects a single <8 x i32> add.
375define <8 x i32> @test_v8i32(<8 x i32> %a, <8 x i32> %b) {
376; SSE-LABEL: @test_v8i32(
377; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 8, i32 10>
378; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> <i32 4, i32 6, i32 12, i32 14>
379; SSE-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> <i32 1, i32 3, i32 9, i32 11>
380; SSE-NEXT:    [[TMP4:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> <i32 5, i32 7, i32 13, i32 15>
381; SSE-NEXT:    [[TMP5:%.*]] = add <4 x i32> [[TMP1]], [[TMP3]]
382; SSE-NEXT:    [[TMP6:%.*]] = add <4 x i32> [[TMP2]], [[TMP4]]
383; SSE-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
384; SSE-NEXT:    ret <8 x i32> [[TMP7]]
385;
386; SLM-LABEL: @test_v8i32(
387; SLM-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 8, i32 10>
388; SLM-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> <i32 4, i32 6, i32 12, i32 14>
389; SLM-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> <i32 1, i32 3, i32 9, i32 11>
390; SLM-NEXT:    [[TMP4:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> <i32 5, i32 7, i32 13, i32 15>
391; SLM-NEXT:    [[TMP5:%.*]] = add <4 x i32> [[TMP1]], [[TMP3]]
392; SLM-NEXT:    [[TMP6:%.*]] = add <4 x i32> [[TMP2]], [[TMP4]]
393; SLM-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
394; SLM-NEXT:    ret <8 x i32> [[TMP7]]
395;
396; AVX-LABEL: @test_v8i32(
397; AVX-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 6, i32 12, i32 14>
398; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 5, i32 7, i32 13, i32 15>
399; AVX-NEXT:    [[TMP3:%.*]] = add <8 x i32> [[TMP1]], [[TMP2]]
400; AVX-NEXT:    ret <8 x i32> [[TMP3]]
401;
402  %a0 = extractelement <8 x i32> %a, i32 0
403  %a1 = extractelement <8 x i32> %a, i32 1
404  %a2 = extractelement <8 x i32> %a, i32 2
405  %a3 = extractelement <8 x i32> %a, i32 3
406  %a4 = extractelement <8 x i32> %a, i32 4
407  %a5 = extractelement <8 x i32> %a, i32 5
408  %a6 = extractelement <8 x i32> %a, i32 6
409  %a7 = extractelement <8 x i32> %a, i32 7
410  %b0 = extractelement <8 x i32> %b, i32 0
411  %b1 = extractelement <8 x i32> %b, i32 1
412  %b2 = extractelement <8 x i32> %b, i32 2
413  %b3 = extractelement <8 x i32> %b, i32 3
414  %b4 = extractelement <8 x i32> %b, i32 4
415  %b5 = extractelement <8 x i32> %b, i32 5
416  %b6 = extractelement <8 x i32> %b, i32 6
417  %b7 = extractelement <8 x i32> %b, i32 7
418  %r0 = add i32 %a0, %a1
419  %r1 = add i32 %a2, %a3
420  %r2 = add i32 %b0, %b1
421  %r3 = add i32 %b2, %b3
422  %r4 = add i32 %a4, %a5
423  %r5 = add i32 %a6, %a7
424  %r6 = add i32 %b4, %b5
425  %r7 = add i32 %b6, %b7
426  %r00 = insertelement <8 x i32> zeroinitializer, i32 %r0, i32 0
427  %r01 = insertelement <8 x i32>  %r00, i32 %r1, i32 1
428  %r02 = insertelement <8 x i32>  %r01, i32 %r2, i32 2
429  %r03 = insertelement <8 x i32>  %r02, i32 %r3, i32 3
430  %r04 = insertelement <8 x i32>  %r03, i32 %r4, i32 4
431  %r05 = insertelement <8 x i32>  %r04, i32 %r5, i32 5
432  %r06 = insertelement <8 x i32>  %r05, i32 %r6, i32 6
433  %r07 = insertelement <8 x i32>  %r06, i32 %r7, i32 7
434  ret <8 x i32> %r07
435}
436
; Integer horizontal add on <16 x i16> with the a/b sums grouped per 128-bit
; half: lanes 0-3 = pair sums of a[0..7], lanes 4-7 = pair sums of b[0..7],
; lanes 8-11 = pair sums of a[8..15], lanes 12-15 = pair sums of b[8..15].
; Expectations differ by RUN configuration: the SSE and SLM prefixes expect
; two <8 x i16> adds whose results are concatenated by a final shuffle,
; while the AVX prefix expects a single <16 x i16> add.
437define <16 x i16> @test_v16i16(<16 x i16> %a, <16 x i16> %b) {
438; SSE-LABEL: @test_v16i16(
439; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22>
440; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> <i32 8, i32 10, i32 12, i32 14, i32 24, i32 26, i32 28, i32 30>
441; SSE-NEXT:    [[TMP3:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 17, i32 19, i32 21, i32 23>
442; SSE-NEXT:    [[TMP4:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> <i32 9, i32 11, i32 13, i32 15, i32 25, i32 27, i32 29, i32 31>
443; SSE-NEXT:    [[TMP5:%.*]] = add <8 x i16> [[TMP1]], [[TMP3]]
444; SSE-NEXT:    [[TMP6:%.*]] = add <8 x i16> [[TMP2]], [[TMP4]]
445; SSE-NEXT:    [[TMP7:%.*]] = shufflevector <8 x i16> [[TMP5]], <8 x i16> [[TMP6]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
446; SSE-NEXT:    ret <16 x i16> [[TMP7]]
447;
448; SLM-LABEL: @test_v16i16(
449; SLM-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22>
450; SLM-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> <i32 8, i32 10, i32 12, i32 14, i32 24, i32 26, i32 28, i32 30>
451; SLM-NEXT:    [[TMP3:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 17, i32 19, i32 21, i32 23>
452; SLM-NEXT:    [[TMP4:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> <i32 9, i32 11, i32 13, i32 15, i32 25, i32 27, i32 29, i32 31>
453; SLM-NEXT:    [[TMP5:%.*]] = add <8 x i16> [[TMP1]], [[TMP3]]
454; SLM-NEXT:    [[TMP6:%.*]] = add <8 x i16> [[TMP2]], [[TMP4]]
455; SLM-NEXT:    [[TMP7:%.*]] = shufflevector <8 x i16> [[TMP5]], <8 x i16> [[TMP6]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
456; SLM-NEXT:    ret <16 x i16> [[TMP7]]
457;
458; AVX-LABEL: @test_v16i16(
459; AVX-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22, i32 8, i32 10, i32 12, i32 14, i32 24, i32 26, i32 28, i32 30>
460; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 17, i32 19, i32 21, i32 23, i32 9, i32 11, i32 13, i32 15, i32 25, i32 27, i32 29, i32 31>
461; AVX-NEXT:    [[TMP3:%.*]] = add <16 x i16> [[TMP1]], [[TMP2]]
462; AVX-NEXT:    ret <16 x i16> [[TMP3]]
463;
464  %a0  = extractelement <16 x i16> %a, i32 0
465  %a1  = extractelement <16 x i16> %a, i32 1
466  %a2  = extractelement <16 x i16> %a, i32 2
467  %a3  = extractelement <16 x i16> %a, i32 3
468  %a4  = extractelement <16 x i16> %a, i32 4
469  %a5  = extractelement <16 x i16> %a, i32 5
470  %a6  = extractelement <16 x i16> %a, i32 6
471  %a7  = extractelement <16 x i16> %a, i32 7
472  %a8  = extractelement <16 x i16> %a, i32 8
473  %a9  = extractelement <16 x i16> %a, i32 9
474  %a10 = extractelement <16 x i16> %a, i32 10
475  %a11 = extractelement <16 x i16> %a, i32 11
476  %a12 = extractelement <16 x i16> %a, i32 12
477  %a13 = extractelement <16 x i16> %a, i32 13
478  %a14 = extractelement <16 x i16> %a, i32 14
479  %a15 = extractelement <16 x i16> %a, i32 15
480  %b0  = extractelement <16 x i16> %b, i32 0
481  %b1  = extractelement <16 x i16> %b, i32 1
482  %b2  = extractelement <16 x i16> %b, i32 2
483  %b3  = extractelement <16 x i16> %b, i32 3
484  %b4  = extractelement <16 x i16> %b, i32 4
485  %b5  = extractelement <16 x i16> %b, i32 5
486  %b6  = extractelement <16 x i16> %b, i32 6
487  %b7  = extractelement <16 x i16> %b, i32 7
488  %b8  = extractelement <16 x i16> %b, i32 8
489  %b9  = extractelement <16 x i16> %b, i32 9
490  %b10 = extractelement <16 x i16> %b, i32 10
491  %b11 = extractelement <16 x i16> %b, i32 11
492  %b12 = extractelement <16 x i16> %b, i32 12
493  %b13 = extractelement <16 x i16> %b, i32 13
494  %b14 = extractelement <16 x i16> %b, i32 14
495  %b15 = extractelement <16 x i16> %b, i32 15
496  %r0  = add i16 %a0 , %a1
497  %r1  = add i16 %a2 , %a3
498  %r2  = add i16 %a4 , %a5
499  %r3  = add i16 %a6 , %a7
500  %r4  = add i16 %b0 , %b1
501  %r5  = add i16 %b2 , %b3
502  %r6  = add i16 %b4 , %b5
503  %r7  = add i16 %b6 , %b7
504  %r8  = add i16 %a8 , %a9
505  %r9  = add i16 %a10, %a11
506  %r10 = add i16 %a12, %a13
507  %r11 = add i16 %a14, %a15
508  %r12 = add i16 %b8 , %b9
509  %r13 = add i16 %b10, %b11
510  %r14 = add i16 %b12, %b13
511  %r15 = add i16 %b14, %b15
512  %rv0  = insertelement <16 x i16> zeroinitializer, i16 %r0 , i32 0
513  %rv1  = insertelement <16 x i16> %rv0 , i16 %r1 , i32 1
514  %rv2  = insertelement <16 x i16> %rv1 , i16 %r2 , i32 2
515  %rv3  = insertelement <16 x i16> %rv2 , i16 %r3 , i32 3
516  %rv4  = insertelement <16 x i16> %rv3 , i16 %r4 , i32 4
517  %rv5  = insertelement <16 x i16> %rv4 , i16 %r5 , i32 5
518  %rv6  = insertelement <16 x i16> %rv5 , i16 %r6 , i32 6
519  %rv7  = insertelement <16 x i16> %rv6 , i16 %r7 , i32 7
520  %rv8  = insertelement <16 x i16> %rv7 , i16 %r8 , i32 8
521  %rv9  = insertelement <16 x i16> %rv8 , i16 %r9 , i32 9
522  %rv10 = insertelement <16 x i16> %rv9 , i16 %r10, i32 10
523  %rv11 = insertelement <16 x i16> %rv10, i16 %r11, i32 11
524  %rv12 = insertelement <16 x i16> %rv11, i16 %r12, i32 12
525  %rv13 = insertelement <16 x i16> %rv12, i16 %r13, i32 13
526  %rv14 = insertelement <16 x i16> %rv13, i16 %r14, i32 14
527  %rv15 = insertelement <16 x i16> %rv14, i16 %r15, i32 15
528  ret <16 x i16> %rv15
529}
530