xref: /llvm-project/llvm/test/Transforms/PhaseOrdering/X86/hadd.ll (revision 88e00141f81c4dfd48bca58fda15d078a138b586)
1; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
2; RUN: opt < %s -O3 -S -mtriple=x86_64-- -mcpu=x86-64    | FileCheck %s --check-prefixes=CHECK,SSE2
3; RUN: opt < %s -O3 -S -mtriple=x86_64-- -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=CHECK,SSE4
4; RUN: opt < %s -O3 -S -mtriple=x86_64-- -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2
5; RUN: opt < %s -O3 -S -mtriple=x86_64-- -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=CHECK,AVX,AVX512
6; RUN: opt < %s -passes="default<O3>" -S -mtriple=x86_64-- -mcpu=x86-64    | FileCheck %s --check-prefixes=CHECK,SSE2
7; RUN: opt < %s -passes="default<O3>" -S -mtriple=x86_64-- -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=CHECK,SSE4
8; RUN: opt < %s -passes="default<O3>" -S -mtriple=x86_64-- -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2
9; RUN: opt < %s -passes="default<O3>" -S -mtriple=x86_64-- -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=CHECK,AVX,AVX512
10
11; PR34072 - failure to canonicalize to (add (shuffle a, b),(shuffle a, b)) for optimal horizontal add patterns (with undemanded elements)
12
13;
14; v8i16
15;
16
; Baseline v8i16 case: every result lane is demanded. Each adjacent pair of
; lanes of %a and of %b is summed as a scalar, then the four %a sums are placed
; in lanes 0-3 and the four %b sums in lanes 4-7. All run lines share one
; expected output: an even-lane shuffle and an odd-lane shuffle of %a/%b feeding
; a single vector add.
17define <8 x i16> @add_v8i16_01234567(<8 x i16> %a, <8 x i16> %b) {
18; CHECK-LABEL: @add_v8i16_01234567(
19; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
20; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
21; CHECK-NEXT:    [[TMP3:%.*]] = add <8 x i16> [[TMP1]], [[TMP2]]
22; CHECK-NEXT:    ret <8 x i16> [[TMP3]]
23;
  ; Scalarized adjacent-pair sums of %a.
24  %a0 = extractelement <8 x i16> %a, i32 0
25  %a1 = extractelement <8 x i16> %a, i32 1
26  %a2 = extractelement <8 x i16> %a, i32 2
27  %a3 = extractelement <8 x i16> %a, i32 3
28  %a4 = extractelement <8 x i16> %a, i32 4
29  %a5 = extractelement <8 x i16> %a, i32 5
30  %a6 = extractelement <8 x i16> %a, i32 6
31  %a7 = extractelement <8 x i16> %a, i32 7
32  %a01 = add i16 %a0, %a1
33  %a23 = add i16 %a2, %a3
34  %a45 = add i16 %a4, %a5
35  %a67 = add i16 %a6, %a7
  ; Scalarized adjacent-pair sums of %b.
36  %b0 = extractelement <8 x i16> %b, i32 0
37  %b1 = extractelement <8 x i16> %b, i32 1
38  %b2 = extractelement <8 x i16> %b, i32 2
39  %b3 = extractelement <8 x i16> %b, i32 3
40  %b4 = extractelement <8 x i16> %b, i32 4
41  %b5 = extractelement <8 x i16> %b, i32 5
42  %b6 = extractelement <8 x i16> %b, i32 6
43  %b7 = extractelement <8 x i16> %b, i32 7
44  %b01 = add i16 %b0, %b1
45  %b23 = add i16 %b2, %b3
46  %b45 = add i16 %b4, %b5
47  %b67 = add i16 %b6, %b7
  ; Rebuild the vector: %a sums in lanes 0-3, %b sums in lanes 4-7.
48  %hadd0 = insertelement <8 x i16> poison, i16 %a01, i32 0
49  %hadd1 = insertelement <8 x i16> %hadd0, i16 %a23, i32 1
50  %hadd2 = insertelement <8 x i16> %hadd1, i16 %a45, i32 2
51  %hadd3 = insertelement <8 x i16> %hadd2, i16 %a67, i32 3
52  %hadd4 = insertelement <8 x i16> %hadd3, i16 %b01, i32 4
53  %hadd5 = insertelement <8 x i16> %hadd4, i16 %b23, i32 5
54  %hadd6 = insertelement <8 x i16> %hadd5, i16 %b45, i32 6
55  %hadd7 = insertelement <8 x i16> %hadd6, i16 %b67, i32 7
  ; Identity mask: all indices are below 8, so only %hadd7 is read and the
  ; second operand (%a) is never selected.
56  %result = shufflevector <8 x i16> %hadd7, <8 x i16> %a, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
57  ret <8 x i16> %result
58}
59
; Same scalarized pairwise-sum pattern as the fully demanded v8i16 case, but
; the final shuffle mask has poison in lane 0, so the %a01 sum is not demanded.
; The recorded expectations differ per prefix: the SSE4 and AVX expectations are
; two shuffles plus one vector add, while the SSE2 expectation is a longer
; sequence with three vector adds combined by extra shuffles.
60define <8 x i16> @add_v8i16_u1234567(<8 x i16> %a, <8 x i16> %b) {
61; SSE2-LABEL: @add_v8i16_u1234567(
62; SSE2-NEXT:    [[SHIFT3:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> poison, <8 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 7, i32 poison>
63; SSE2-NEXT:    [[TMP6:%.*]] = add <8 x i16> [[A]], [[SHIFT3]]
64; SSE2-NEXT:    [[TMP7:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <8 x i32> <i32 poison, i32 3, i32 4, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
65; SSE2-NEXT:    [[TMP4:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <8 x i32> <i32 poison, i32 2, i32 5, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
66; SSE2-NEXT:    [[HADD1:%.*]] = add <8 x i16> [[TMP7]], [[TMP4]]
67; SSE2-NEXT:    [[HADD3:%.*]] = shufflevector <8 x i16> [[HADD1]], <8 x i16> [[TMP6]], <8 x i32> <i32 poison, i32 1, i32 2, i32 14, i32 poison, i32 poison, i32 poison, i32 poison>
68; SSE2-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 poison, i32 poison, i32 poison, i32 poison>
69; SSE2-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i16> [[B]], <8 x i16> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 poison, i32 poison, i32 poison, i32 poison>
70; SSE2-NEXT:    [[TMP3:%.*]] = add <8 x i16> [[TMP1]], [[TMP2]]
71; SSE2-NEXT:    [[RESULT:%.*]] = shufflevector <8 x i16> [[HADD3]], <8 x i16> [[TMP3]], <8 x i32> <i32 poison, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
72; SSE2-NEXT:    ret <8 x i16> [[RESULT]]
73;
74; SSE4-LABEL: @add_v8i16_u1234567(
75; SSE4-NEXT:    [[TMP5:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i32> <i32 poison, i32 2, i32 5, i32 6, i32 8, i32 10, i32 12, i32 14>
76; SSE4-NEXT:    [[TMP6:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> <i32 poison, i32 3, i32 4, i32 7, i32 9, i32 11, i32 13, i32 15>
77; SSE4-NEXT:    [[TMP7:%.*]] = add <8 x i16> [[TMP5]], [[TMP6]]
78; SSE4-NEXT:    ret <8 x i16> [[TMP7]]
79;
80; AVX-LABEL: @add_v8i16_u1234567(
81; AVX-NEXT:    [[TMP5:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i32> <i32 poison, i32 2, i32 5, i32 6, i32 8, i32 10, i32 12, i32 14>
82; AVX-NEXT:    [[TMP6:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> <i32 poison, i32 3, i32 4, i32 7, i32 9, i32 11, i32 13, i32 15>
83; AVX-NEXT:    [[TMP7:%.*]] = add <8 x i16> [[TMP5]], [[TMP6]]
84; AVX-NEXT:    ret <8 x i16> [[TMP7]]
85;
  ; Scalarized adjacent-pair sums of %a.
86  %a0 = extractelement <8 x i16> %a, i32 0
87  %a1 = extractelement <8 x i16> %a, i32 1
88  %a2 = extractelement <8 x i16> %a, i32 2
89  %a3 = extractelement <8 x i16> %a, i32 3
90  %a4 = extractelement <8 x i16> %a, i32 4
91  %a5 = extractelement <8 x i16> %a, i32 5
92  %a6 = extractelement <8 x i16> %a, i32 6
93  %a7 = extractelement <8 x i16> %a, i32 7
94  %a01 = add i16 %a0, %a1
95  %a23 = add i16 %a2, %a3
96  %a45 = add i16 %a4, %a5
97  %a67 = add i16 %a6, %a7
  ; Scalarized adjacent-pair sums of %b.
98  %b0 = extractelement <8 x i16> %b, i32 0
99  %b1 = extractelement <8 x i16> %b, i32 1
100  %b2 = extractelement <8 x i16> %b, i32 2
101  %b3 = extractelement <8 x i16> %b, i32 3
102  %b4 = extractelement <8 x i16> %b, i32 4
103  %b5 = extractelement <8 x i16> %b, i32 5
104  %b6 = extractelement <8 x i16> %b, i32 6
105  %b7 = extractelement <8 x i16> %b, i32 7
106  %b01 = add i16 %b0, %b1
107  %b23 = add i16 %b2, %b3
108  %b45 = add i16 %b4, %b5
109  %b67 = add i16 %b6, %b7
  ; Rebuild the vector: %a sums in lanes 0-3, %b sums in lanes 4-7.
110  %hadd0 = insertelement <8 x i16> poison, i16 %a01, i32 0
111  %hadd1 = insertelement <8 x i16> %hadd0, i16 %a23, i32 1
112  %hadd2 = insertelement <8 x i16> %hadd1, i16 %a45, i32 2
113  %hadd3 = insertelement <8 x i16> %hadd2, i16 %a67, i32 3
114  %hadd4 = insertelement <8 x i16> %hadd3, i16 %b01, i32 4
115  %hadd5 = insertelement <8 x i16> %hadd4, i16 %b23, i32 5
116  %hadd6 = insertelement <8 x i16> %hadd5, i16 %b45, i32 6
117  %hadd7 = insertelement <8 x i16> %hadd6, i16 %b67, i32 7
  ; Lane 0 of the mask is poison, which makes %a01 dead; lanes 1-7 pass
  ; through unchanged from %hadd7 and the %a operand is never selected.
118  %result = shufflevector <8 x i16> %hadd7, <8 x i16> %a, <8 x i32> <i32 poison, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
119  ret <8 x i16> %result
120}
121
; Same scalarized pairwise-sum pattern, but the final shuffle reverses the lane
; order and leaves result lane 2 as poison (the lane that would otherwise read
; source lane 5, i.e. %b23). None of the recorded expectations is a single
; shuffle/shuffle/add triple: the SSE2 expectation contains four vector adds and
; the SSE4 and AVX expectations contain two, followed by a reversing shuffle.
122define <8 x i16> @add_v8i16_76u43210(<8 x i16> %a, <8 x i16> %b) {
123; SSE2-LABEL: @add_v8i16_76u43210(
124; SSE2-NEXT:    [[SHIFT:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> poison, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
125; SSE2-NEXT:    [[TMP1:%.*]] = add <8 x i16> [[A]], [[SHIFT]]
126; SSE2-NEXT:    [[SHIFT2:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> poison, <8 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 5, i32 poison, i32 poison, i32 poison>
127; SSE2-NEXT:    [[TMP2:%.*]] = add <8 x i16> [[B]], [[SHIFT2]]
128; SSE2-NEXT:    [[SHIFT3:%.*]] = shufflevector <8 x i16> [[B]], <8 x i16> poison, <8 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 6>
129; SSE2-NEXT:    [[TMP3:%.*]] = add <8 x i16> [[SHIFT3]], [[B]]
130; SSE2-NEXT:    [[TMP4:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> <i32 2, i32 4, i32 6, i32 8, i32 poison, i32 poison, i32 poison, i32 poison>
131; SSE2-NEXT:    [[TMP5:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> <i32 3, i32 5, i32 7, i32 9, i32 poison, i32 poison, i32 poison, i32 poison>
132; SSE2-NEXT:    [[TMP6:%.*]] = add <8 x i16> [[TMP4]], [[TMP5]]
133; SSE2-NEXT:    [[HADD41:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP6]], <8 x i32> <i32 0, i32 8, i32 9, i32 10, i32 11, i32 poison, i32 poison, i32 poison>
134; SSE2-NEXT:    [[HADD6:%.*]] = shufflevector <8 x i16> [[HADD41]], <8 x i16> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 poison, i32 12, i32 poison>
135; SSE2-NEXT:    [[RESULT:%.*]] = shufflevector <8 x i16> [[TMP3]], <8 x i16> [[HADD6]], <8 x i32> <i32 7, i32 14, i32 poison, i32 12, i32 11, i32 10, i32 9, i32 8>
136; SSE2-NEXT:    ret <8 x i16> [[RESULT]]
137;
138; SSE4-LABEL: @add_v8i16_76u43210(
139; SSE4-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> poison, <8 x i32> <i32 0, i32 3, i32 4, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
140; SSE4-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <8 x i32> <i32 1, i32 2, i32 5, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
141; SSE4-NEXT:    [[HADD22:%.*]] = add <8 x i16> [[TMP1]], [[TMP2]]
142; SSE4-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B:%.*]], <8 x i32> <i32 6, i32 8, i32 poison, i32 12, i32 14, i32 poison, i32 poison, i32 poison>
143; SSE4-NEXT:    [[TMP4:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> <i32 7, i32 9, i32 poison, i32 13, i32 15, i32 poison, i32 poison, i32 poison>
144; SSE4-NEXT:    [[TMP5:%.*]] = add <8 x i16> [[TMP3]], [[TMP4]]
145; SSE4-NEXT:    [[RESULT:%.*]] = shufflevector <8 x i16> [[TMP5]], <8 x i16> [[HADD22]], <8 x i32> <i32 4, i32 3, i32 poison, i32 1, i32 0, i32 10, i32 9, i32 8>
146; SSE4-NEXT:    ret <8 x i16> [[RESULT]]
147;
148; AVX-LABEL: @add_v8i16_76u43210(
149; AVX-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> poison, <8 x i32> <i32 0, i32 3, i32 4, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
150; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <8 x i32> <i32 1, i32 2, i32 5, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
151; AVX-NEXT:    [[HADD22:%.*]] = add <8 x i16> [[TMP1]], [[TMP2]]
152; AVX-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B:%.*]], <8 x i32> <i32 6, i32 8, i32 poison, i32 12, i32 14, i32 poison, i32 poison, i32 poison>
153; AVX-NEXT:    [[TMP4:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> <i32 7, i32 9, i32 poison, i32 13, i32 15, i32 poison, i32 poison, i32 poison>
154; AVX-NEXT:    [[TMP5:%.*]] = add <8 x i16> [[TMP3]], [[TMP4]]
155; AVX-NEXT:    [[RESULT:%.*]] = shufflevector <8 x i16> [[TMP5]], <8 x i16> [[HADD22]], <8 x i32> <i32 4, i32 3, i32 poison, i32 1, i32 0, i32 10, i32 9, i32 8>
156; AVX-NEXT:    ret <8 x i16> [[RESULT]]
157;
  ; Scalarized adjacent-pair sums of %a.
158  %a0 = extractelement <8 x i16> %a, i32 0
159  %a1 = extractelement <8 x i16> %a, i32 1
160  %a2 = extractelement <8 x i16> %a, i32 2
161  %a3 = extractelement <8 x i16> %a, i32 3
162  %a4 = extractelement <8 x i16> %a, i32 4
163  %a5 = extractelement <8 x i16> %a, i32 5
164  %a6 = extractelement <8 x i16> %a, i32 6
165  %a7 = extractelement <8 x i16> %a, i32 7
166  %a01 = add i16 %a0, %a1
167  %a23 = add i16 %a2, %a3
168  %a45 = add i16 %a4, %a5
169  %a67 = add i16 %a6, %a7
  ; Scalarized adjacent-pair sums of %b.
170  %b0 = extractelement <8 x i16> %b, i32 0
171  %b1 = extractelement <8 x i16> %b, i32 1
172  %b2 = extractelement <8 x i16> %b, i32 2
173  %b3 = extractelement <8 x i16> %b, i32 3
174  %b4 = extractelement <8 x i16> %b, i32 4
175  %b5 = extractelement <8 x i16> %b, i32 5
176  %b6 = extractelement <8 x i16> %b, i32 6
177  %b7 = extractelement <8 x i16> %b, i32 7
178  %b01 = add i16 %b0, %b1
179  %b23 = add i16 %b2, %b3
180  %b45 = add i16 %b4, %b5
181  %b67 = add i16 %b6, %b7
  ; Rebuild the vector: %a sums in lanes 0-3, %b sums in lanes 4-7.
182  %hadd0 = insertelement <8 x i16> poison, i16 %a01, i32 0
183  %hadd1 = insertelement <8 x i16> %hadd0, i16 %a23, i32 1
184  %hadd2 = insertelement <8 x i16> %hadd1, i16 %a45, i32 2
185  %hadd3 = insertelement <8 x i16> %hadd2, i16 %a67, i32 3
186  %hadd4 = insertelement <8 x i16> %hadd3, i16 %b01, i32 4
187  %hadd5 = insertelement <8 x i16> %hadd4, i16 %b23, i32 5
188  %hadd6 = insertelement <8 x i16> %hadd5, i16 %b45, i32 6
189  %hadd7 = insertelement <8 x i16> %hadd6, i16 %b67, i32 7
  ; Reverse the lanes of %hadd7; result lane 2 is poison, so source lane 5
  ; (%b23) is never read. The %a operand is never selected.
190  %result = shufflevector <8 x i16> %hadd7, <8 x i16> %a, <8 x i32> <i32 7, i32 6, i32 poison, i32 4, i32 3, i32 2, i32 1, i32 0>
191  ret <8 x i16> %result
192}
193
194;
195; v16i16
196;
197
; Baseline v16i16 case: every result lane is demanded. Adjacent lane pairs of
; %a and %b are summed as scalars and reassembled in groups of four: %a lanes
; 0-7 sums go to result lanes 0-3, %b lanes 0-7 sums to lanes 4-7, %a lanes 8-15
; sums to lanes 8-11 and %b lanes 8-15 sums to lanes 12-15. All run lines share
; one expected output: two shuffles of %a/%b feeding a single vector add.
198define <16 x i16> @add_v16i16_0123456789ABCDEF(<16 x i16> %a, <16 x i16> %b) {
199; CHECK-LABEL: @add_v16i16_0123456789ABCDEF(
200; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22, i32 8, i32 10, i32 12, i32 14, i32 24, i32 26, i32 28, i32 30>
201; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 17, i32 19, i32 21, i32 23, i32 9, i32 11, i32 13, i32 15, i32 25, i32 27, i32 29, i32 31>
202; CHECK-NEXT:    [[TMP3:%.*]] = add <16 x i16> [[TMP1]], [[TMP2]]
203; CHECK-NEXT:    ret <16 x i16> [[TMP3]]
204;
  ; Scalarized adjacent-pair sums of %a (lanes named 0-9, A-F in hex).
205  %a0 = extractelement <16 x i16> %a, i32 0
206  %a1 = extractelement <16 x i16> %a, i32 1
207  %a2 = extractelement <16 x i16> %a, i32 2
208  %a3 = extractelement <16 x i16> %a, i32 3
209  %a4 = extractelement <16 x i16> %a, i32 4
210  %a5 = extractelement <16 x i16> %a, i32 5
211  %a6 = extractelement <16 x i16> %a, i32 6
212  %a7 = extractelement <16 x i16> %a, i32 7
213  %a8 = extractelement <16 x i16> %a, i32 8
214  %a9 = extractelement <16 x i16> %a, i32 9
215  %aA = extractelement <16 x i16> %a, i32 10
216  %aB = extractelement <16 x i16> %a, i32 11
217  %aC = extractelement <16 x i16> %a, i32 12
218  %aD = extractelement <16 x i16> %a, i32 13
219  %aE = extractelement <16 x i16> %a, i32 14
220  %aF = extractelement <16 x i16> %a, i32 15
221  %a01 = add i16 %a0, %a1
222  %a23 = add i16 %a2, %a3
223  %a45 = add i16 %a4, %a5
224  %a67 = add i16 %a6, %a7
225  %a89 = add i16 %a8, %a9
226  %aAB = add i16 %aA, %aB
227  %aCD = add i16 %aC, %aD
228  %aEF = add i16 %aE, %aF
  ; Scalarized adjacent-pair sums of %b.
229  %b0 = extractelement <16 x i16> %b, i32 0
230  %b1 = extractelement <16 x i16> %b, i32 1
231  %b2 = extractelement <16 x i16> %b, i32 2
232  %b3 = extractelement <16 x i16> %b, i32 3
233  %b4 = extractelement <16 x i16> %b, i32 4
234  %b5 = extractelement <16 x i16> %b, i32 5
235  %b6 = extractelement <16 x i16> %b, i32 6
236  %b7 = extractelement <16 x i16> %b, i32 7
237  %b8 = extractelement <16 x i16> %b, i32 8
238  %b9 = extractelement <16 x i16> %b, i32 9
239  %bA = extractelement <16 x i16> %b, i32 10
240  %bB = extractelement <16 x i16> %b, i32 11
241  %bC = extractelement <16 x i16> %b, i32 12
242  %bD = extractelement <16 x i16> %b, i32 13
243  %bE = extractelement <16 x i16> %b, i32 14
244  %bF = extractelement <16 x i16> %b, i32 15
245  %b01 = add i16 %b0, %b1
246  %b23 = add i16 %b2, %b3
247  %b45 = add i16 %b4, %b5
248  %b67 = add i16 %b6, %b7
249  %b89 = add i16 %b8, %b9
250  %bAB = add i16 %bA, %bB
251  %bCD = add i16 %bC, %bD
252  %bEF = add i16 %bE, %bF
  ; Rebuild the vector in groups of four sums: a-low, b-low, a-high, b-high.
253  %hadd0 = insertelement <16 x i16> poison, i16 %a01, i32 0
254  %hadd1 = insertelement <16 x i16> %hadd0, i16 %a23, i32 1
255  %hadd2 = insertelement <16 x i16> %hadd1, i16 %a45, i32 2
256  %hadd3 = insertelement <16 x i16> %hadd2, i16 %a67, i32 3
257  %hadd4 = insertelement <16 x i16> %hadd3, i16 %b01, i32 4
258  %hadd5 = insertelement <16 x i16> %hadd4, i16 %b23, i32 5
259  %hadd6 = insertelement <16 x i16> %hadd5, i16 %b45, i32 6
260  %hadd7 = insertelement <16 x i16> %hadd6, i16 %b67, i32 7
261  %hadd8 = insertelement <16 x i16> %hadd7, i16 %a89, i32 8
262  %hadd9 = insertelement <16 x i16> %hadd8, i16 %aAB, i32 9
263  %haddA = insertelement <16 x i16> %hadd9, i16 %aCD, i32 10
264  %haddB = insertelement <16 x i16> %haddA, i16 %aEF, i32 11
265  %haddC = insertelement <16 x i16> %haddB, i16 %b89, i32 12
266  %haddD = insertelement <16 x i16> %haddC, i16 %bAB, i32 13
267  %haddE = insertelement <16 x i16> %haddD, i16 %bCD, i32 14
268  %haddF = insertelement <16 x i16> %haddE, i16 %bEF, i32 15
  ; Identity mask: all indices are below 16, so only %haddF is read and the
  ; second operand (%a) is never selected.
269  %result = shufflevector <16 x i16> %haddF, <16 x i16> %a, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
270  ret <16 x i16> %result
271}
272
; Same scalarized v16i16 pairwise-sum pattern as the fully demanded case, but
; the final shuffle mask has poison in lanes 4 and 10, so the %b01 and %aCD sums
; are not demanded. The recorded expectations differ per prefix: the AVX2 and
; AVX512 expectations are two shuffles plus one vector add (they differ only in
; which shuffle supplies %a lane 8 versus lane 9), while the SSE2 and SSE4
; expectations still contain scalar extractelement/add/insertelement leftovers
; around partially vectorized adds.
273define <16 x i16> @add_v16i16_0123u56789uBCDEF(<16 x i16> %a, <16 x i16> %b) {
274; SSE2-LABEL: @add_v16i16_0123u56789uBCDEF(
275; SSE2-NEXT:    [[A8:%.*]] = extractelement <16 x i16> [[A:%.*]], i64 8
276; SSE2-NEXT:    [[A9:%.*]] = extractelement <16 x i16> [[A]], i64 9
277; SSE2-NEXT:    [[A89:%.*]] = add i16 [[A8]], [[A9]]
278; SSE2-NEXT:    [[BC:%.*]] = extractelement <16 x i16> [[B:%.*]], i64 12
279; SSE2-NEXT:    [[BD:%.*]] = extractelement <16 x i16> [[B]], i64 13
280; SSE2-NEXT:    [[BE:%.*]] = extractelement <16 x i16> [[B]], i64 14
281; SSE2-NEXT:    [[BF:%.*]] = extractelement <16 x i16> [[B]], i64 15
282; SSE2-NEXT:    [[BCD:%.*]] = add i16 [[BC]], [[BD]]
283; SSE2-NEXT:    [[BEF:%.*]] = add i16 [[BE]], [[BF]]
284; SSE2-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 poison, i32 18, i32 20, i32 22, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
285; SSE2-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 poison, i32 19, i32 21, i32 23, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
286; SSE2-NEXT:    [[TMP3:%.*]] = add <16 x i16> [[TMP1]], [[TMP2]]
287; SSE2-NEXT:    [[HADD8:%.*]] = insertelement <16 x i16> [[TMP3]], i16 [[A89]], i64 8
288; SSE2-NEXT:    [[TMP4:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32> <i32 10, i32 poison, i32 14, i32 24, i32 26, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
289; SSE2-NEXT:    [[TMP5:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32> <i32 11, i32 poison, i32 15, i32 25, i32 27, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
290; SSE2-NEXT:    [[TMP6:%.*]] = add <16 x i16> [[TMP4]], [[TMP5]]
291; SSE2-NEXT:    [[HADDD1:%.*]] = shufflevector <16 x i16> [[HADD8]], <16 x i16> [[TMP6]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 5, i32 6, i32 7, i32 8, i32 16, i32 poison, i32 18, i32 19, i32 20, i32 poison, i32 poison>
292; SSE2-NEXT:    [[HADDE:%.*]] = insertelement <16 x i16> [[HADDD1]], i16 [[BCD]], i64 14
293; SSE2-NEXT:    [[HADDF:%.*]] = insertelement <16 x i16> [[HADDE]], i16 [[BEF]], i64 15
294; SSE2-NEXT:    [[RESULT:%.*]] = shufflevector <16 x i16> [[HADDF]], <16 x i16> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 5, i32 6, i32 7, i32 8, i32 9, i32 poison, i32 11, i32 12, i32 13, i32 14, i32 15>
295; SSE2-NEXT:    ret <16 x i16> [[RESULT]]
296;
297; SSE4-LABEL: @add_v16i16_0123u56789uBCDEF(
298; SSE4-NEXT:    [[A8:%.*]] = extractelement <16 x i16> [[A:%.*]], i64 8
299; SSE4-NEXT:    [[A9:%.*]] = extractelement <16 x i16> [[A]], i64 9
300; SSE4-NEXT:    [[A89:%.*]] = add i16 [[A8]], [[A9]]
301; SSE4-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B:%.*]], <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 poison, i32 18, i32 20, i32 22, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
302; SSE4-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 poison, i32 19, i32 21, i32 23, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
303; SSE4-NEXT:    [[TMP3:%.*]] = add <16 x i16> [[TMP1]], [[TMP2]]
304; SSE4-NEXT:    [[HADD8:%.*]] = insertelement <16 x i16> [[TMP3]], i16 [[A89]], i64 8
305; SSE4-NEXT:    [[TMP4:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> poison, <16 x i32> <i32 10, i32 poison, i32 14, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
306; SSE4-NEXT:    [[TMP5:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> poison, <16 x i32> <i32 11, i32 poison, i32 15, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
307; SSE4-NEXT:    [[TMP6:%.*]] = add <16 x i16> [[TMP4]], [[TMP5]]
308; SSE4-NEXT:    [[HADDB2:%.*]] = shufflevector <16 x i16> [[HADD8]], <16 x i16> [[TMP6]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 5, i32 6, i32 7, i32 8, i32 16, i32 poison, i32 18, i32 poison, i32 poison, i32 poison, i32 poison>
309; SSE4-NEXT:    [[TMP7:%.*]] = shufflevector <16 x i16> [[B]], <16 x i16> poison, <16 x i32> <i32 8, i32 10, i32 12, i32 14, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
310; SSE4-NEXT:    [[TMP8:%.*]] = shufflevector <16 x i16> [[B]], <16 x i16> poison, <16 x i32> <i32 9, i32 11, i32 13, i32 15, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
311; SSE4-NEXT:    [[TMP9:%.*]] = add <16 x i16> [[TMP7]], [[TMP8]]
312; SSE4-NEXT:    [[RESULT:%.*]] = shufflevector <16 x i16> [[HADDB2]], <16 x i16> [[TMP9]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 5, i32 6, i32 7, i32 8, i32 9, i32 poison, i32 11, i32 16, i32 17, i32 18, i32 19>
313; SSE4-NEXT:    ret <16 x i16> [[RESULT]]
314;
315; AVX2-LABEL: @add_v16i16_0123u56789uBCDEF(
316; AVX2-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 poison, i32 18, i32 20, i32 22, i32 9, i32 10, i32 poison, i32 14, i32 24, i32 26, i32 28, i32 30>
317; AVX2-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 poison, i32 19, i32 21, i32 23, i32 8, i32 11, i32 poison, i32 15, i32 25, i32 27, i32 29, i32 31>
318; AVX2-NEXT:    [[RESULT:%.*]] = add <16 x i16> [[TMP1]], [[TMP2]]
319; AVX2-NEXT:    ret <16 x i16> [[RESULT]]
320;
321; AVX512-LABEL: @add_v16i16_0123u56789uBCDEF(
322; AVX512-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 poison, i32 18, i32 20, i32 22, i32 8, i32 10, i32 poison, i32 14, i32 24, i32 26, i32 28, i32 30>
323; AVX512-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 poison, i32 19, i32 21, i32 23, i32 9, i32 11, i32 poison, i32 15, i32 25, i32 27, i32 29, i32 31>
324; AVX512-NEXT:    [[RESULT:%.*]] = add <16 x i16> [[TMP1]], [[TMP2]]
325; AVX512-NEXT:    ret <16 x i16> [[RESULT]]
326;
  ; Scalarized adjacent-pair sums of %a (lanes named 0-9, A-F in hex).
327  %a0 = extractelement <16 x i16> %a, i32 0
328  %a1 = extractelement <16 x i16> %a, i32 1
329  %a2 = extractelement <16 x i16> %a, i32 2
330  %a3 = extractelement <16 x i16> %a, i32 3
331  %a4 = extractelement <16 x i16> %a, i32 4
332  %a5 = extractelement <16 x i16> %a, i32 5
333  %a6 = extractelement <16 x i16> %a, i32 6
334  %a7 = extractelement <16 x i16> %a, i32 7
335  %a8 = extractelement <16 x i16> %a, i32 8
336  %a9 = extractelement <16 x i16> %a, i32 9
337  %aA = extractelement <16 x i16> %a, i32 10
338  %aB = extractelement <16 x i16> %a, i32 11
339  %aC = extractelement <16 x i16> %a, i32 12
340  %aD = extractelement <16 x i16> %a, i32 13
341  %aE = extractelement <16 x i16> %a, i32 14
342  %aF = extractelement <16 x i16> %a, i32 15
343  %a01 = add i16 %a0, %a1
344  %a23 = add i16 %a2, %a3
345  %a45 = add i16 %a4, %a5
346  %a67 = add i16 %a6, %a7
347  %a89 = add i16 %a8, %a9
348  %aAB = add i16 %aA, %aB
349  %aCD = add i16 %aC, %aD
350  %aEF = add i16 %aE, %aF
  ; Scalarized adjacent-pair sums of %b.
351  %b0 = extractelement <16 x i16> %b, i32 0
352  %b1 = extractelement <16 x i16> %b, i32 1
353  %b2 = extractelement <16 x i16> %b, i32 2
354  %b3 = extractelement <16 x i16> %b, i32 3
355  %b4 = extractelement <16 x i16> %b, i32 4
356  %b5 = extractelement <16 x i16> %b, i32 5
357  %b6 = extractelement <16 x i16> %b, i32 6
358  %b7 = extractelement <16 x i16> %b, i32 7
359  %b8 = extractelement <16 x i16> %b, i32 8
360  %b9 = extractelement <16 x i16> %b, i32 9
361  %bA = extractelement <16 x i16> %b, i32 10
362  %bB = extractelement <16 x i16> %b, i32 11
363  %bC = extractelement <16 x i16> %b, i32 12
364  %bD = extractelement <16 x i16> %b, i32 13
365  %bE = extractelement <16 x i16> %b, i32 14
366  %bF = extractelement <16 x i16> %b, i32 15
367  %b01 = add i16 %b0, %b1
368  %b23 = add i16 %b2, %b3
369  %b45 = add i16 %b4, %b5
370  %b67 = add i16 %b6, %b7
371  %b89 = add i16 %b8, %b9
372  %bAB = add i16 %bA, %bB
373  %bCD = add i16 %bC, %bD
374  %bEF = add i16 %bE, %bF
  ; Rebuild the vector in groups of four sums: a-low, b-low, a-high, b-high.
375  %hadd0 = insertelement <16 x i16> poison, i16 %a01, i32 0
376  %hadd1 = insertelement <16 x i16> %hadd0, i16 %a23, i32 1
377  %hadd2 = insertelement <16 x i16> %hadd1, i16 %a45, i32 2
378  %hadd3 = insertelement <16 x i16> %hadd2, i16 %a67, i32 3
379  %hadd4 = insertelement <16 x i16> %hadd3, i16 %b01, i32 4
380  %hadd5 = insertelement <16 x i16> %hadd4, i16 %b23, i32 5
381  %hadd6 = insertelement <16 x i16> %hadd5, i16 %b45, i32 6
382  %hadd7 = insertelement <16 x i16> %hadd6, i16 %b67, i32 7
383  %hadd8 = insertelement <16 x i16> %hadd7, i16 %a89, i32 8
384  %hadd9 = insertelement <16 x i16> %hadd8, i16 %aAB, i32 9
385  %haddA = insertelement <16 x i16> %hadd9, i16 %aCD, i32 10
386  %haddB = insertelement <16 x i16> %haddA, i16 %aEF, i32 11
387  %haddC = insertelement <16 x i16> %haddB, i16 %b89, i32 12
388  %haddD = insertelement <16 x i16> %haddC, i16 %bAB, i32 13
389  %haddE = insertelement <16 x i16> %haddD, i16 %bCD, i32 14
390  %haddF = insertelement <16 x i16> %haddE, i16 %bEF, i32 15
  ; Lanes 4 and 10 of the mask are poison, which makes %b01 and %aCD dead; the
  ; remaining lanes pass through from %haddF and %a is never selected.
391  %result = shufflevector <16 x i16> %haddF, <16 x i16> %a, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 5, i32 6, i32 7, i32 8, i32 9, i32 poison, i32 11, i32 12, i32 13, i32 14, i32 15>
392  ret <16 x i16> %result
393}
394
395define <16 x i16> @add_v16i16_FEuCBA98765432u0(<16 x i16> %a, <16 x i16> %b) {
396; SSE2-LABEL: @add_v16i16_FEuCBA98765432u0(
397; SSE2-NEXT:    [[A8:%.*]] = extractelement <16 x i16> [[A:%.*]], i64 8
398; SSE2-NEXT:    [[A9:%.*]] = extractelement <16 x i16> [[A]], i64 9
399; SSE2-NEXT:    [[A89:%.*]] = add i16 [[A8]], [[A9]]
400; SSE2-NEXT:    [[BC:%.*]] = extractelement <16 x i16> [[B:%.*]], i64 12
401; SSE2-NEXT:    [[BD:%.*]] = extractelement <16 x i16> [[B]], i64 13
402; SSE2-NEXT:    [[BE:%.*]] = extractelement <16 x i16> [[B]], i64 14
403; SSE2-NEXT:    [[BF:%.*]] = extractelement <16 x i16> [[B]], i64 15
404; SSE2-NEXT:    [[BCD:%.*]] = add i16 [[BC]], [[BD]]
405; SSE2-NEXT:    [[BEF:%.*]] = add i16 [[BE]], [[BF]]
406; SSE2-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32> <i32 0, i32 poison, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
407; SSE2-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32> <i32 1, i32 poison, i32 5, i32 7, i32 17, i32 19, i32 21, i32 23, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
408; SSE2-NEXT:    [[TMP3:%.*]] = add <16 x i16> [[TMP1]], [[TMP2]]
409; SSE2-NEXT:    [[HADD8:%.*]] = insertelement <16 x i16> [[TMP3]], i16 [[A89]], i64 8
410; SSE2-NEXT:    [[TMP4:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32> <i32 10, i32 12, i32 14, i32 24, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
411; SSE2-NEXT:    [[TMP5:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32> <i32 11, i32 13, i32 15, i32 25, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
412; SSE2-NEXT:    [[TMP6:%.*]] = add <16 x i16> [[TMP4]], [[TMP5]]
413; SSE2-NEXT:    [[HADDC1:%.*]] = shufflevector <16 x i16> [[HADD8]], <16 x i16> [[TMP6]], <16 x i32> <i32 0, i32 poison, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 16, i32 17, i32 18, i32 19, i32 poison, i32 poison, i32 poison>
414; SSE2-NEXT:    [[HADDE:%.*]] = insertelement <16 x i16> [[HADDC1]], i16 [[BCD]], i64 14
415; SSE2-NEXT:    [[HADDF:%.*]] = insertelement <16 x i16> [[HADDE]], i16 [[BEF]], i64 15
416; SSE2-NEXT:    [[RESULT:%.*]] = shufflevector <16 x i16> [[HADDF]], <16 x i16> poison, <16 x i32> <i32 15, i32 14, i32 poison, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 poison, i32 0>
417; SSE2-NEXT:    ret <16 x i16> [[RESULT]]
418;
419; SSE4-LABEL: @add_v16i16_FEuCBA98765432u0(
420; SSE4-NEXT:    [[A8:%.*]] = extractelement <16 x i16> [[A:%.*]], i64 8
421; SSE4-NEXT:    [[A9:%.*]] = extractelement <16 x i16> [[A]], i64 9
422; SSE4-NEXT:    [[A89:%.*]] = add i16 [[A8]], [[A9]]
423; SSE4-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B:%.*]], <16 x i32> <i32 0, i32 poison, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
424; SSE4-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32> <i32 1, i32 poison, i32 5, i32 7, i32 17, i32 19, i32 21, i32 23, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
425; SSE4-NEXT:    [[TMP3:%.*]] = add <16 x i16> [[TMP1]], [[TMP2]]
426; SSE4-NEXT:    [[HADD8:%.*]] = insertelement <16 x i16> [[TMP3]], i16 [[A89]], i64 8
427; SSE4-NEXT:    [[TMP4:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> poison, <16 x i32> <i32 11, i32 12, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
428; SSE4-NEXT:    [[TMP5:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> poison, <16 x i32> <i32 10, i32 13, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
429; SSE4-NEXT:    [[TMP6:%.*]] = add <16 x i16> [[TMP4]], [[TMP5]]
430; SSE4-NEXT:    [[HADDA2:%.*]] = shufflevector <16 x i16> [[HADD8]], <16 x i16> [[TMP6]], <16 x i32> <i32 0, i32 poison, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 16, i32 17, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
431; SSE4-NEXT:    [[TMP7:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32> <i32 14, i32 24, i32 poison, i32 28, i32 30, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
432; SSE4-NEXT:    [[TMP8:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32> <i32 15, i32 25, i32 poison, i32 29, i32 31, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
433; SSE4-NEXT:    [[TMP9:%.*]] = add <16 x i16> [[TMP7]], [[TMP8]]
434; SSE4-NEXT:    [[RESULT:%.*]] = shufflevector <16 x i16> [[TMP9]], <16 x i16> [[HADDA2]], <16 x i32> <i32 4, i32 3, i32 poison, i32 1, i32 0, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 poison, i32 16>
435; SSE4-NEXT:    ret <16 x i16> [[RESULT]]
436;
437; AVX2-LABEL: @add_v16i16_FEuCBA98765432u0(
438; AVX2-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <16 x i32> <i32 0, i32 poison, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22, i32 9, i32 10, i32 12, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
439; AVX2-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32> <i32 1, i32 poison, i32 5, i32 7, i32 17, i32 19, i32 21, i32 23, i32 8, i32 11, i32 13, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
440; AVX2-NEXT:    [[HADDA:%.*]] = add <16 x i16> [[TMP1]], [[TMP2]]
441; AVX2-NEXT:    [[TMP3:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32> <i32 14, i32 24, i32 poison, i32 28, i32 30, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
442; AVX2-NEXT:    [[TMP4:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32> <i32 15, i32 25, i32 poison, i32 29, i32 31, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
443; AVX2-NEXT:    [[TMP5:%.*]] = add <16 x i16> [[TMP3]], [[TMP4]]
444; AVX2-NEXT:    [[RESULT:%.*]] = shufflevector <16 x i16> [[TMP5]], <16 x i16> [[HADDA]], <16 x i32> <i32 4, i32 3, i32 poison, i32 1, i32 0, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 poison, i32 16>
445; AVX2-NEXT:    ret <16 x i16> [[RESULT]]
446;
447; AVX512-LABEL: @add_v16i16_FEuCBA98765432u0(
448; AVX512-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <16 x i32> <i32 0, i32 poison, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22, i32 8, i32 11, i32 12, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
449; AVX512-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32> <i32 1, i32 poison, i32 5, i32 7, i32 17, i32 19, i32 21, i32 23, i32 9, i32 10, i32 13, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
450; AVX512-NEXT:    [[HADDA2:%.*]] = add <16 x i16> [[TMP1]], [[TMP2]]
451; AVX512-NEXT:    [[TMP3:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32> <i32 14, i32 24, i32 poison, i32 28, i32 30, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
452; AVX512-NEXT:    [[TMP4:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32> <i32 15, i32 25, i32 poison, i32 29, i32 31, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
453; AVX512-NEXT:    [[TMP5:%.*]] = add <16 x i16> [[TMP3]], [[TMP4]]
454; AVX512-NEXT:    [[RESULT:%.*]] = shufflevector <16 x i16> [[TMP5]], <16 x i16> [[HADDA2]], <16 x i32> <i32 4, i32 3, i32 poison, i32 1, i32 0, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 poison, i32 16>
455; AVX512-NEXT:    ret <16 x i16> [[RESULT]]
456;
457  %a0 = extractelement <16 x i16> %a, i32 0
458  %a1 = extractelement <16 x i16> %a, i32 1
459  %a2 = extractelement <16 x i16> %a, i32 2
460  %a3 = extractelement <16 x i16> %a, i32 3
461  %a4 = extractelement <16 x i16> %a, i32 4
462  %a5 = extractelement <16 x i16> %a, i32 5
463  %a6 = extractelement <16 x i16> %a, i32 6
464  %a7 = extractelement <16 x i16> %a, i32 7
465  %a8 = extractelement <16 x i16> %a, i32 8
466  %a9 = extractelement <16 x i16> %a, i32 9
467  %aA = extractelement <16 x i16> %a, i32 10
468  %aB = extractelement <16 x i16> %a, i32 11
469  %aC = extractelement <16 x i16> %a, i32 12
470  %aD = extractelement <16 x i16> %a, i32 13
471  %aE = extractelement <16 x i16> %a, i32 14
472  %aF = extractelement <16 x i16> %a, i32 15
473  %a01 = add i16 %a0, %a1
474  %a23 = add i16 %a2, %a3
475  %a45 = add i16 %a4, %a5
476  %a67 = add i16 %a6, %a7
477  %a89 = add i16 %a8, %a9
478  %aAB = add i16 %aA, %aB
479  %aCD = add i16 %aC, %aD
480  %aEF = add i16 %aE, %aF
481  %b0 = extractelement <16 x i16> %b, i32 0
482  %b1 = extractelement <16 x i16> %b, i32 1
483  %b2 = extractelement <16 x i16> %b, i32 2
484  %b3 = extractelement <16 x i16> %b, i32 3
485  %b4 = extractelement <16 x i16> %b, i32 4
486  %b5 = extractelement <16 x i16> %b, i32 5
487  %b6 = extractelement <16 x i16> %b, i32 6
488  %b7 = extractelement <16 x i16> %b, i32 7
489  %b8 = extractelement <16 x i16> %b, i32 8
490  %b9 = extractelement <16 x i16> %b, i32 9
491  %bA = extractelement <16 x i16> %b, i32 10
492  %bB = extractelement <16 x i16> %b, i32 11
493  %bC = extractelement <16 x i16> %b, i32 12
494  %bD = extractelement <16 x i16> %b, i32 13
495  %bE = extractelement <16 x i16> %b, i32 14
496  %bF = extractelement <16 x i16> %b, i32 15
497  %b01 = add i16 %b0, %b1
498  %b23 = add i16 %b2, %b3
499  %b45 = add i16 %b4, %b5
500  %b67 = add i16 %b6, %b7
501  %b89 = add i16 %b8, %b9
502  %bAB = add i16 %bA, %bB
503  %bCD = add i16 %bC, %bD
504  %bEF = add i16 %bE, %bF
505  %hadd0 = insertelement <16 x i16> poison, i16 %a01, i32 0
506  %hadd1 = insertelement <16 x i16> %hadd0, i16 %a23, i32 1
507  %hadd2 = insertelement <16 x i16> %hadd1, i16 %a45, i32 2
508  %hadd3 = insertelement <16 x i16> %hadd2, i16 %a67, i32 3
509  %hadd4 = insertelement <16 x i16> %hadd3, i16 %b01, i32 4
510  %hadd5 = insertelement <16 x i16> %hadd4, i16 %b23, i32 5
511  %hadd6 = insertelement <16 x i16> %hadd5, i16 %b45, i32 6
512  %hadd7 = insertelement <16 x i16> %hadd6, i16 %b67, i32 7
513  %hadd8 = insertelement <16 x i16> %hadd7, i16 %a89, i32 8
514  %hadd9 = insertelement <16 x i16> %hadd8, i16 %aAB, i32 9
515  %haddA = insertelement <16 x i16> %hadd9, i16 %aCD, i32 10
516  %haddB = insertelement <16 x i16> %haddA, i16 %aEF, i32 11
517  %haddC = insertelement <16 x i16> %haddB, i16 %b89, i32 12
518  %haddD = insertelement <16 x i16> %haddC, i16 %bAB, i32 13
519  %haddE = insertelement <16 x i16> %haddD, i16 %bCD, i32 14
520  %haddF = insertelement <16 x i16> %haddE, i16 %bEF, i32 15
521  %result = shufflevector <16 x i16> %haddF, <16 x i16> %a, <16 x i32> <i32 15, i32 14, i32 poison, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 poison, i32 0>
522  ret <16 x i16> %result
523}
524
525;
526; v4i32
527;
528
; Baseline v4i32 case: every result lane is demanded (identity shuffle mask).
; The scalar extract/add/insert chain builds <a0+a1, a2+a3, b0+b1, b2+b3>; the
; assertions expect it to become one even-lane shuffle, one odd-lane shuffle
; and a single vector add.
529define <4 x i32> @add_v4i32_0123(<4 x i32> %a, <4 x i32> %b) {
530; CHECK-LABEL: @add_v4i32_0123(
531; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 4, i32 6>
532; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> <i32 1, i32 3, i32 5, i32 7>
533; CHECK-NEXT:    [[TMP3:%.*]] = add <4 x i32> [[TMP1]], [[TMP2]]
534; CHECK-NEXT:    ret <4 x i32> [[TMP3]]
535;
536  %a0 = extractelement <4 x i32> %a, i32 0
537  %a1 = extractelement <4 x i32> %a, i32 1
538  %a2 = extractelement <4 x i32> %a, i32 2
539  %a3 = extractelement <4 x i32> %a, i32 3
540  %a01 = add i32 %a0, %a1
541  %a23 = add i32 %a2, %a3
542  %b0 = extractelement <4 x i32> %b, i32 0
543  %b1 = extractelement <4 x i32> %b, i32 1
544  %b2 = extractelement <4 x i32> %b, i32 2
545  %b3 = extractelement <4 x i32> %b, i32 3
546  %b01 = add i32 %b0, %b1
547  %b23 = add i32 %b2, %b3
548  %hadd0 = insertelement <4 x i32> poison, i32 %a01, i32 0
549  %hadd1 = insertelement <4 x i32> %hadd0, i32 %a23, i32 1
550  %hadd2 = insertelement <4 x i32> %hadd1, i32 %b01, i32 2
551  %hadd3 = insertelement <4 x i32> %hadd2, i32 %b23, i32 3
  ; Identity mask: all indices are below 4, so the second operand %a is never selected.
552  %result = shufflevector <4 x i32> %hadd3, <4 x i32> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
553  ret <4 x i32> %result
554}
555
; Same horizontal-add chain as add_v4i32_0123, but result lane 0 is undemanded
; (poison in the final mask). The expected masks keep lane 0 poison; lane 2
; pairs b1 with b0 (indices 5 and 4), i.e. the operands of that commutative add
; appear in the opposite order from the source.
556define <4 x i32> @add_v4i32_u123(<4 x i32> %a, <4 x i32> %b) {
557; CHECK-LABEL: @add_v4i32_u123(
558; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> <i32 poison, i32 2, i32 5, i32 6>
559; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> <i32 poison, i32 3, i32 4, i32 7>
560; CHECK-NEXT:    [[TMP4:%.*]] = add <4 x i32> [[TMP2]], [[TMP3]]
561; CHECK-NEXT:    ret <4 x i32> [[TMP4]]
562;
563  %a0 = extractelement <4 x i32> %a, i32 0
564  %a1 = extractelement <4 x i32> %a, i32 1
565  %a2 = extractelement <4 x i32> %a, i32 2
566  %a3 = extractelement <4 x i32> %a, i32 3
567  %a01 = add i32 %a0, %a1
568  %a23 = add i32 %a2, %a3
569  %b0 = extractelement <4 x i32> %b, i32 0
570  %b1 = extractelement <4 x i32> %b, i32 1
571  %b2 = extractelement <4 x i32> %b, i32 2
572  %b3 = extractelement <4 x i32> %b, i32 3
573  %b01 = add i32 %b0, %b1
574  %b23 = add i32 %b2, %b3
575  %hadd0 = insertelement <4 x i32> poison, i32 %a01, i32 0
576  %hadd1 = insertelement <4 x i32> %hadd0, i32 %a23, i32 1
577  %hadd2 = insertelement <4 x i32> %hadd1, i32 %b01, i32 2
578  %hadd3 = insertelement <4 x i32> %hadd2, i32 %b23, i32 3
  ; Lane 0 is poison, so the a0+a1 sum inserted at index 0 is not demanded.
579  %result = shufflevector <4 x i32> %hadd3, <4 x i32> %a, <4 x i32> <i32 poison, i32 1, i32 2, i32 3>
580  ret <4 x i32> %result
581}
582
; Same horizontal-add chain as add_v4i32_0123, but result lane 1 is undemanded
; (poison in the final mask). The expected output is still two shuffles of
; %a/%b feeding one vector add, with lane 1 poison in both masks and lane 2
; pairing b1 with b0 (indices 5 and 4).
583define <4 x i32> @add_v4i32_0u23(<4 x i32> %a, <4 x i32> %b) {
584; CHECK-LABEL: @add_v4i32_0u23(
585; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> <i32 0, i32 poison, i32 5, i32 6>
586; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> <i32 1, i32 poison, i32 4, i32 7>
587; CHECK-NEXT:    [[TMP4:%.*]] = add <4 x i32> [[TMP2]], [[TMP3]]
588; CHECK-NEXT:    ret <4 x i32> [[TMP4]]
589;
590  %a0 = extractelement <4 x i32> %a, i32 0
591  %a1 = extractelement <4 x i32> %a, i32 1
592  %a2 = extractelement <4 x i32> %a, i32 2
593  %a3 = extractelement <4 x i32> %a, i32 3
594  %a01 = add i32 %a0, %a1
595  %a23 = add i32 %a2, %a3
596  %b0 = extractelement <4 x i32> %b, i32 0
597  %b1 = extractelement <4 x i32> %b, i32 1
598  %b2 = extractelement <4 x i32> %b, i32 2
599  %b3 = extractelement <4 x i32> %b, i32 3
600  %b01 = add i32 %b0, %b1
601  %b23 = add i32 %b2, %b3
602  %hadd0 = insertelement <4 x i32> poison, i32 %a01, i32 0
603  %hadd1 = insertelement <4 x i32> %hadd0, i32 %a23, i32 1
604  %hadd2 = insertelement <4 x i32> %hadd1, i32 %b01, i32 2
605  %hadd3 = insertelement <4 x i32> %hadd2, i32 %b23, i32 3
  ; Lane 1 is poison, so the a2+a3 sum inserted at index 1 is not demanded.
606  %result = shufflevector <4 x i32> %hadd3, <4 x i32> %a, <4 x i32> <i32 0, i32 poison, i32 2, i32 3>
607  ret <4 x i32> %result
608}
609
; Same horizontal-add chain as add_v4i32_0123, but result lane 2 is undemanded
; (poison in the final mask). The expectations are split per CPU level: the
; SSE2 and AVX512 runs expect the plain even/odd masks, while the SSE4 and AVX2
; runs expect lane 0 to pair a1 with a0 (indices 1 and 0). The add is
; commutative, so both forms compute the same lanes.
610define <4 x i32> @add_v4i32_01u3(<4 x i32> %a, <4 x i32> %b) {
611; SSE2-LABEL: @add_v4i32_01u3(
612; SSE2-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 poison, i32 6>
613; SSE2-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> <i32 1, i32 3, i32 poison, i32 7>
614; SSE2-NEXT:    [[TMP4:%.*]] = add <4 x i32> [[TMP2]], [[TMP3]]
615; SSE2-NEXT:    ret <4 x i32> [[TMP4]]
616;
617; SSE4-LABEL: @add_v4i32_01u3(
618; SSE4-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> <i32 1, i32 2, i32 poison, i32 6>
619; SSE4-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> <i32 0, i32 3, i32 poison, i32 7>
620; SSE4-NEXT:    [[TMP4:%.*]] = add <4 x i32> [[TMP2]], [[TMP3]]
621; SSE4-NEXT:    ret <4 x i32> [[TMP4]]
622;
623; AVX2-LABEL: @add_v4i32_01u3(
624; AVX2-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> <i32 1, i32 2, i32 poison, i32 6>
625; AVX2-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> <i32 0, i32 3, i32 poison, i32 7>
626; AVX2-NEXT:    [[TMP4:%.*]] = add <4 x i32> [[TMP2]], [[TMP3]]
627; AVX2-NEXT:    ret <4 x i32> [[TMP4]]
628;
629; AVX512-LABEL: @add_v4i32_01u3(
630; AVX512-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 poison, i32 6>
631; AVX512-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> <i32 1, i32 3, i32 poison, i32 7>
632; AVX512-NEXT:    [[TMP4:%.*]] = add <4 x i32> [[TMP2]], [[TMP3]]
633; AVX512-NEXT:    ret <4 x i32> [[TMP4]]
634;
635  %a0 = extractelement <4 x i32> %a, i32 0
636  %a1 = extractelement <4 x i32> %a, i32 1
637  %a2 = extractelement <4 x i32> %a, i32 2
638  %a3 = extractelement <4 x i32> %a, i32 3
639  %a01 = add i32 %a0, %a1
640  %a23 = add i32 %a2, %a3
641  %b0 = extractelement <4 x i32> %b, i32 0
642  %b1 = extractelement <4 x i32> %b, i32 1
643  %b2 = extractelement <4 x i32> %b, i32 2
644  %b3 = extractelement <4 x i32> %b, i32 3
645  %b01 = add i32 %b0, %b1
646  %b23 = add i32 %b2, %b3
647  %hadd0 = insertelement <4 x i32> poison, i32 %a01, i32 0
648  %hadd1 = insertelement <4 x i32> %hadd0, i32 %a23, i32 1
649  %hadd2 = insertelement <4 x i32> %hadd1, i32 %b01, i32 2
650  %hadd3 = insertelement <4 x i32> %hadd2, i32 %b23, i32 3
  ; Lane 2 is poison, so the b0+b1 sum inserted at index 2 is not demanded.
651  %result = shufflevector <4 x i32> %hadd3, <4 x i32> %a, <4 x i32> <i32 0, i32 1, i32 poison, i32 3>
652  ret <4 x i32> %result
653}
654
; Same horizontal-add chain as add_v4i32_0123, but result lane 3 is undemanded
; (poison in the final mask). All run lines share one expectation: even-lane
; and odd-lane shuffles with lane 3 poison, followed by a single vector add.
655define <4 x i32> @add_v4i32_012u(<4 x i32> %a, <4 x i32> %b) {
656; CHECK-LABEL: @add_v4i32_012u(
657; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 4, i32 poison>
658; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> <i32 1, i32 3, i32 5, i32 poison>
659; CHECK-NEXT:    [[TMP4:%.*]] = add <4 x i32> [[TMP2]], [[TMP3]]
660; CHECK-NEXT:    ret <4 x i32> [[TMP4]]
661;
662  %a0 = extractelement <4 x i32> %a, i32 0
663  %a1 = extractelement <4 x i32> %a, i32 1
664  %a2 = extractelement <4 x i32> %a, i32 2
665  %a3 = extractelement <4 x i32> %a, i32 3
666  %a01 = add i32 %a0, %a1
667  %a23 = add i32 %a2, %a3
668  %b0 = extractelement <4 x i32> %b, i32 0
669  %b1 = extractelement <4 x i32> %b, i32 1
670  %b2 = extractelement <4 x i32> %b, i32 2
671  %b3 = extractelement <4 x i32> %b, i32 3
672  %b01 = add i32 %b0, %b1
673  %b23 = add i32 %b2, %b3
674  %hadd0 = insertelement <4 x i32> poison, i32 %a01, i32 0
675  %hadd1 = insertelement <4 x i32> %hadd0, i32 %a23, i32 1
676  %hadd2 = insertelement <4 x i32> %hadd1, i32 %b01, i32 2
677  %hadd3 = insertelement <4 x i32> %hadd2, i32 %b23, i32 3
  ; Lane 3 is poison, so the b2+b3 sum inserted at index 3 is not demanded.
678  %result = shufflevector <4 x i32> %hadd3, <4 x i32> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
679  ret <4 x i32> %result
680}
681
; Same horizontal-add chain as add_v4i32_0123, but result lanes 0 and 1 are
; undemanded. Only the sums taken from %b remain live, so the expected output
; shuffles %b alone (second shuffle operand poison) and never references %a.
; Lane 2 pairs b1 with b0 (indices 1 and 0); lane 3 pairs b2 with b3.
682define <4 x i32> @add_v4i32_uu23(<4 x i32> %a, <4 x i32> %b) {
683; CHECK-LABEL: @add_v4i32_uu23(
684; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> poison, <4 x i32> <i32 poison, i32 poison, i32 1, i32 2>
685; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[B]], <4 x i32> poison, <4 x i32> <i32 poison, i32 poison, i32 0, i32 3>
686; CHECK-NEXT:    [[RESULT1:%.*]] = add <4 x i32> [[TMP1]], [[TMP2]]
687; CHECK-NEXT:    ret <4 x i32> [[RESULT1]]
688;
689  %a0 = extractelement <4 x i32> %a, i32 0
690  %a1 = extractelement <4 x i32> %a, i32 1
691  %a2 = extractelement <4 x i32> %a, i32 2
692  %a3 = extractelement <4 x i32> %a, i32 3
693  %a01 = add i32 %a0, %a1
694  %a23 = add i32 %a2, %a3
695  %b0 = extractelement <4 x i32> %b, i32 0
696  %b1 = extractelement <4 x i32> %b, i32 1
697  %b2 = extractelement <4 x i32> %b, i32 2
698  %b3 = extractelement <4 x i32> %b, i32 3
699  %b01 = add i32 %b0, %b1
700  %b23 = add i32 %b2, %b3
701  %hadd0 = insertelement <4 x i32> poison, i32 %a01, i32 0
702  %hadd1 = insertelement <4 x i32> %hadd0, i32 %a23, i32 1
703  %hadd2 = insertelement <4 x i32> %hadd1, i32 %b01, i32 2
704  %hadd3 = insertelement <4 x i32> %hadd2, i32 %b23, i32 3
  ; Lanes 0 and 1 are poison, so neither sum computed from %a is demanded.
705  %result = shufflevector <4 x i32> %hadd3, <4 x i32> %a, <4 x i32> <i32 poison, i32 poison, i32 2, i32 3>
706  ret <4 x i32> %result
707}
708
; Same horizontal-add chain as add_v4i32_0123, but result lanes 2 and 3 are
; undemanded. Only the sums taken from %a remain live, so the expected output
; shuffles %a alone (second shuffle operand poison) and never references %b.
; Lane 0 pairs a1 with a0 (indices 1 and 0); lane 1 pairs a2 with a3.
709define <4 x i32> @add_v4i32_01uu(<4 x i32> %a, <4 x i32> %b) {
710; CHECK-LABEL: @add_v4i32_01uu(
711; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> poison, <4 x i32> <i32 1, i32 2, i32 poison, i32 poison>
712; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> poison, <4 x i32> <i32 0, i32 3, i32 poison, i32 poison>
713; CHECK-NEXT:    [[TMP3:%.*]] = add <4 x i32> [[TMP1]], [[TMP2]]
714; CHECK-NEXT:    ret <4 x i32> [[TMP3]]
715;
716  %a0 = extractelement <4 x i32> %a, i32 0
717  %a1 = extractelement <4 x i32> %a, i32 1
718  %a2 = extractelement <4 x i32> %a, i32 2
719  %a3 = extractelement <4 x i32> %a, i32 3
720  %a01 = add i32 %a0, %a1
721  %a23 = add i32 %a2, %a3
722  %b0 = extractelement <4 x i32> %b, i32 0
723  %b1 = extractelement <4 x i32> %b, i32 1
724  %b2 = extractelement <4 x i32> %b, i32 2
725  %b3 = extractelement <4 x i32> %b, i32 3
726  %b01 = add i32 %b0, %b1
727  %b23 = add i32 %b2, %b3
728  %hadd0 = insertelement <4 x i32> poison, i32 %a01, i32 0
729  %hadd1 = insertelement <4 x i32> %hadd0, i32 %a23, i32 1
730  %hadd2 = insertelement <4 x i32> %hadd1, i32 %b01, i32 2
731  %hadd3 = insertelement <4 x i32> %hadd2, i32 %b23, i32 3
  ; Lanes 2 and 3 are poison, so neither sum computed from %b is demanded.
732  %result = shufflevector <4 x i32> %hadd3, <4 x i32> %a, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
733  ret <4 x i32> %result
734}
735
; Reversed-order variant: the final mask <3, 2, poison, 0> yields
; <b2+b3, b0+b1, poison, a0+a1>. The expected output folds the reversal into
; the two input shuffles, which take %b as first operand and %a as second, so
; no shuffle follows the vector add. All four per-CPU expectations use the same
; masks; they differ only in the name captured for the add result.
736define <4 x i32> @add_v4i32_32u0(<4 x i32> %a, <4 x i32> %b) {
737; SSE2-LABEL: @add_v4i32_32u0(
738; SSE2-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[A:%.*]], <4 x i32> <i32 2, i32 0, i32 poison, i32 4>
739; SSE2-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[B]], <4 x i32> [[A]], <4 x i32> <i32 3, i32 1, i32 poison, i32 5>
740; SSE2-NEXT:    [[RESULT1:%.*]] = add <4 x i32> [[TMP1]], [[TMP2]]
741; SSE2-NEXT:    ret <4 x i32> [[RESULT1]]
742;
743; SSE4-LABEL: @add_v4i32_32u0(
744; SSE4-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[A:%.*]], <4 x i32> <i32 2, i32 0, i32 poison, i32 4>
745; SSE4-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[B]], <4 x i32> [[A]], <4 x i32> <i32 3, i32 1, i32 poison, i32 5>
746; SSE4-NEXT:    [[RESULT:%.*]] = add <4 x i32> [[TMP1]], [[TMP2]]
747; SSE4-NEXT:    ret <4 x i32> [[RESULT]]
748;
749; AVX2-LABEL: @add_v4i32_32u0(
750; AVX2-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[A:%.*]], <4 x i32> <i32 2, i32 0, i32 poison, i32 4>
751; AVX2-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[B]], <4 x i32> [[A]], <4 x i32> <i32 3, i32 1, i32 poison, i32 5>
752; AVX2-NEXT:    [[RESULT:%.*]] = add <4 x i32> [[TMP1]], [[TMP2]]
753; AVX2-NEXT:    ret <4 x i32> [[RESULT]]
754;
755; AVX512-LABEL: @add_v4i32_32u0(
756; AVX512-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[A:%.*]], <4 x i32> <i32 2, i32 0, i32 poison, i32 4>
757; AVX512-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[B]], <4 x i32> [[A]], <4 x i32> <i32 3, i32 1, i32 poison, i32 5>
758; AVX512-NEXT:    [[RESULT1:%.*]] = add <4 x i32> [[TMP1]], [[TMP2]]
759; AVX512-NEXT:    ret <4 x i32> [[RESULT1]]
760;
761  %a0 = extractelement <4 x i32> %a, i32 0
762  %a1 = extractelement <4 x i32> %a, i32 1
763  %a2 = extractelement <4 x i32> %a, i32 2
764  %a3 = extractelement <4 x i32> %a, i32 3
765  %a01 = add i32 %a0, %a1
766  %a23 = add i32 %a2, %a3
767  %b0 = extractelement <4 x i32> %b, i32 0
768  %b1 = extractelement <4 x i32> %b, i32 1
769  %b2 = extractelement <4 x i32> %b, i32 2
770  %b3 = extractelement <4 x i32> %b, i32 3
771  %b01 = add i32 %b0, %b1
772  %b23 = add i32 %b2, %b3
773  %hadd0 = insertelement <4 x i32> poison, i32 %a01, i32 0
774  %hadd1 = insertelement <4 x i32> %hadd0, i32 %a23, i32 1
775  %hadd2 = insertelement <4 x i32> %hadd1, i32 %b01, i32 2
776  %hadd3 = insertelement <4 x i32> %hadd2, i32 %b23, i32 3
  ; Reverses the lanes; index 1 (a2+a3) is not selected and result lane 2 is poison.
777  %result = shufflevector <4 x i32> %hadd3, <4 x i32> %a, <4 x i32> <i32 3, i32 2, i32 poison, i32 0>
778  ret <4 x i32> %result
779}
780
781;
782; v8i32
783;
784
; Baseline v8i32 case: every result lane is demanded. The pairwise sums are
; inserted per 4-element half: lanes 0-3 hold the sums of elements 0-3 of %a
; then %b, lanes 4-7 hold the sums of elements 4-7 of %a then %b. The expected
; output is one even-element shuffle, one odd-element shuffle and one vector add.
785define <8 x i32> @add_v8i32_01234567(<8 x i32> %a, <8 x i32> %b) {
786; CHECK-LABEL: @add_v8i32_01234567(
787; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 6, i32 12, i32 14>
788; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 5, i32 7, i32 13, i32 15>
789; CHECK-NEXT:    [[TMP3:%.*]] = add <8 x i32> [[TMP1]], [[TMP2]]
790; CHECK-NEXT:    ret <8 x i32> [[TMP3]]
791;
792  %a0 = extractelement <8 x i32> %a, i32 0
793  %a1 = extractelement <8 x i32> %a, i32 1
794  %a2 = extractelement <8 x i32> %a, i32 2
795  %a3 = extractelement <8 x i32> %a, i32 3
796  %a4 = extractelement <8 x i32> %a, i32 4
797  %a5 = extractelement <8 x i32> %a, i32 5
798  %a6 = extractelement <8 x i32> %a, i32 6
799  %a7 = extractelement <8 x i32> %a, i32 7
800  %a01 = add i32 %a0, %a1
801  %a23 = add i32 %a2, %a3
802  %a45 = add i32 %a4, %a5
803  %a67 = add i32 %a6, %a7
804  %b0 = extractelement <8 x i32> %b, i32 0
805  %b1 = extractelement <8 x i32> %b, i32 1
806  %b2 = extractelement <8 x i32> %b, i32 2
807  %b3 = extractelement <8 x i32> %b, i32 3
808  %b4 = extractelement <8 x i32> %b, i32 4
809  %b5 = extractelement <8 x i32> %b, i32 5
810  %b6 = extractelement <8 x i32> %b, i32 6
811  %b7 = extractelement <8 x i32> %b, i32 7
812  %b01 = add i32 %b0, %b1
813  %b23 = add i32 %b2, %b3
814  %b45 = add i32 %b4, %b5
815  %b67 = add i32 %b6, %b7
  ; Insert order is a01, a23, b01, b23, then a45, a67, b45, b67.
816  %hadd0 = insertelement <8 x i32> poison, i32 %a01, i32 0
817  %hadd1 = insertelement <8 x i32> %hadd0, i32 %a23, i32 1
818  %hadd2 = insertelement <8 x i32> %hadd1, i32 %b01, i32 2
819  %hadd3 = insertelement <8 x i32> %hadd2, i32 %b23, i32 3
820  %hadd4 = insertelement <8 x i32> %hadd3, i32 %a45, i32 4
821  %hadd5 = insertelement <8 x i32> %hadd4, i32 %a67, i32 5
822  %hadd6 = insertelement <8 x i32> %hadd5, i32 %b45, i32 6
823  %hadd7 = insertelement <8 x i32> %hadd6, i32 %b67, i32 7
  ; Identity mask: all indices are below 8, so the second operand %a is never selected.
824  %result = shufflevector <8 x i32> %hadd7, <8 x i32> %a, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
825  ret <8 x i32> %result
826}
827
; Same chain as add_v8i32_01234567, but result lane 5 is undemanded (poison in
; the final mask). The SSE2 and AVX expectations are a single shuffle pair
; feeding one vector add, with lane 6 pairing b5 with b4 (indices 13 and 12).
; The SSE4 expectation records a less folded form: a4+a5 stays a scalar
; extract/add/insert, and the b4..b7 sums are computed by a separate partial
; vector add that is blended in by a final shuffle.
828define <8 x i32> @add_v8i32_01234u67(<8 x i32> %a, <8 x i32> %b) {
829; SSE2-LABEL: @add_v8i32_01234u67(
830; SSE2-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 poison, i32 13, i32 14>
831; SSE2-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 5, i32 poison, i32 12, i32 15>
832; SSE2-NEXT:    [[RESULT:%.*]] = add <8 x i32> [[TMP1]], [[TMP2]]
833; SSE2-NEXT:    ret <8 x i32> [[RESULT]]
834;
835; SSE4-LABEL: @add_v8i32_01234u67(
836; SSE4-NEXT:    [[A4:%.*]] = extractelement <8 x i32> [[A:%.*]], i64 4
837; SSE4-NEXT:    [[A5:%.*]] = extractelement <8 x i32> [[A]], i64 5
838; SSE4-NEXT:    [[A45:%.*]] = add i32 [[A4]], [[A5]]
839; SSE4-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 poison, i32 poison, i32 poison, i32 poison>
840; SSE4-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 poison, i32 poison, i32 poison, i32 poison>
841; SSE4-NEXT:    [[TMP3:%.*]] = add <8 x i32> [[TMP1]], [[TMP2]]
842; SSE4-NEXT:    [[HADD4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[A45]], i64 4
843; SSE4-NEXT:    [[TMP4:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> poison, <8 x i32> <i32 5, i32 6, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
844; SSE4-NEXT:    [[TMP5:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> poison, <8 x i32> <i32 4, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
845; SSE4-NEXT:    [[TMP6:%.*]] = add <8 x i32> [[TMP4]], [[TMP5]]
846; SSE4-NEXT:    [[RESULT:%.*]] = shufflevector <8 x i32> [[HADD4]], <8 x i32> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 poison, i32 8, i32 9>
847; SSE4-NEXT:    ret <8 x i32> [[RESULT]]
848;
849; AVX-LABEL: @add_v8i32_01234u67(
850; AVX-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 poison, i32 13, i32 14>
851; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 5, i32 poison, i32 12, i32 15>
852; AVX-NEXT:    [[RESULT:%.*]] = add <8 x i32> [[TMP1]], [[TMP2]]
853; AVX-NEXT:    ret <8 x i32> [[RESULT]]
854;
855  %a0 = extractelement <8 x i32> %a, i32 0
856  %a1 = extractelement <8 x i32> %a, i32 1
857  %a2 = extractelement <8 x i32> %a, i32 2
858  %a3 = extractelement <8 x i32> %a, i32 3
859  %a4 = extractelement <8 x i32> %a, i32 4
860  %a5 = extractelement <8 x i32> %a, i32 5
861  %a6 = extractelement <8 x i32> %a, i32 6
862  %a7 = extractelement <8 x i32> %a, i32 7
863  %a01 = add i32 %a0, %a1
864  %a23 = add i32 %a2, %a3
865  %a45 = add i32 %a4, %a5
866  %a67 = add i32 %a6, %a7
867  %b0 = extractelement <8 x i32> %b, i32 0
868  %b1 = extractelement <8 x i32> %b, i32 1
869  %b2 = extractelement <8 x i32> %b, i32 2
870  %b3 = extractelement <8 x i32> %b, i32 3
871  %b4 = extractelement <8 x i32> %b, i32 4
872  %b5 = extractelement <8 x i32> %b, i32 5
873  %b6 = extractelement <8 x i32> %b, i32 6
874  %b7 = extractelement <8 x i32> %b, i32 7
875  %b01 = add i32 %b0, %b1
876  %b23 = add i32 %b2, %b3
877  %b45 = add i32 %b4, %b5
878  %b67 = add i32 %b6, %b7
  ; Insert order is a01, a23, b01, b23, then a45, a67, b45, b67.
879  %hadd0 = insertelement <8 x i32> poison, i32 %a01, i32 0
880  %hadd1 = insertelement <8 x i32> %hadd0, i32 %a23, i32 1
881  %hadd2 = insertelement <8 x i32> %hadd1, i32 %b01, i32 2
882  %hadd3 = insertelement <8 x i32> %hadd2, i32 %b23, i32 3
883  %hadd4 = insertelement <8 x i32> %hadd3, i32 %a45, i32 4
884  %hadd5 = insertelement <8 x i32> %hadd4, i32 %a67, i32 5
885  %hadd6 = insertelement <8 x i32> %hadd5, i32 %b45, i32 6
886  %hadd7 = insertelement <8 x i32> %hadd6, i32 %b67, i32 7
  ; Lane 5 is poison, so the a6+a7 sum inserted at index 5 is not demanded.
887  %result = shufflevector <8 x i32> %hadd7, <8 x i32> %a, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 poison, i32 6, i32 7>
888  ret <8 x i32> %result
889}
890
891;
892; v4f32
893;
894
; Baseline v4f32 case: every result lane is demanded (identity shuffle mask).
; The scalar fadd chain (no fast-math flags) builds <a0+a1, a2+a3, b0+b1, b2+b3>;
; the assertions expect one even-lane shuffle, one odd-lane shuffle and a
; single vector fadd.
895define <4 x float> @add_v4f32_0123(<4 x float> %a, <4 x float> %b) {
896; CHECK-LABEL: @add_v4f32_0123(
897; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 4, i32 6>
898; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> <i32 1, i32 3, i32 5, i32 7>
899; CHECK-NEXT:    [[TMP3:%.*]] = fadd <4 x float> [[TMP1]], [[TMP2]]
900; CHECK-NEXT:    ret <4 x float> [[TMP3]]
901;
902  %a0 = extractelement <4 x float> %a, i32 0
903  %a1 = extractelement <4 x float> %a, i32 1
904  %a2 = extractelement <4 x float> %a, i32 2
905  %a3 = extractelement <4 x float> %a, i32 3
906  %a01 = fadd float %a0, %a1
907  %a23 = fadd float %a2, %a3
908  %b0 = extractelement <4 x float> %b, i32 0
909  %b1 = extractelement <4 x float> %b, i32 1
910  %b2 = extractelement <4 x float> %b, i32 2
911  %b3 = extractelement <4 x float> %b, i32 3
912  %b01 = fadd float %b0, %b1
913  %b23 = fadd float %b2, %b3
914  %hadd0 = insertelement <4 x float> poison, float %a01, i32 0
915  %hadd1 = insertelement <4 x float> %hadd0, float %a23, i32 1
916  %hadd2 = insertelement <4 x float> %hadd1, float %b01, i32 2
917  %hadd3 = insertelement <4 x float> %hadd2, float %b23, i32 3
  ; Identity mask: all indices are below 4, so the second operand %a is never selected.
918  %result = shufflevector <4 x float> %hadd3, <4 x float> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
919  ret <4 x float> %result
920}
921
; Float counterpart of add_v4i32_u123: result lane 0 is undemanded (poison in
; the final mask). The expected masks keep lane 0 poison, and lane 2 pairs b1
; with b0 (indices 5 and 4) ahead of the single vector fadd.
922define <4 x float> @add_v4f32_u123(<4 x float> %a, <4 x float> %b) {
923; CHECK-LABEL: @add_v4f32_u123(
924; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x i32> <i32 poison, i32 2, i32 5, i32 6>
925; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> <i32 poison, i32 3, i32 4, i32 7>
926; CHECK-NEXT:    [[TMP4:%.*]] = fadd <4 x float> [[TMP2]], [[TMP3]]
927; CHECK-NEXT:    ret <4 x float> [[TMP4]]
928;
929  %a0 = extractelement <4 x float> %a, i32 0
930  %a1 = extractelement <4 x float> %a, i32 1
931  %a2 = extractelement <4 x float> %a, i32 2
932  %a3 = extractelement <4 x float> %a, i32 3
933  %a01 = fadd float %a0, %a1
934  %a23 = fadd float %a2, %a3
935  %b0 = extractelement <4 x float> %b, i32 0
936  %b1 = extractelement <4 x float> %b, i32 1
937  %b2 = extractelement <4 x float> %b, i32 2
938  %b3 = extractelement <4 x float> %b, i32 3
939  %b01 = fadd float %b0, %b1
940  %b23 = fadd float %b2, %b3
941  %hadd0 = insertelement <4 x float> poison, float %a01, i32 0
942  %hadd1 = insertelement <4 x float> %hadd0, float %a23, i32 1
943  %hadd2 = insertelement <4 x float> %hadd1, float %b01, i32 2
944  %hadd3 = insertelement <4 x float> %hadd2, float %b23, i32 3
  ; Lane 0 is poison, so the a0+a1 sum inserted at index 0 is not demanded.
945  %result = shufflevector <4 x float> %hadd3, <4 x float> %a, <4 x i32> <i32 poison, i32 1, i32 2, i32 3>
946  ret <4 x float> %result
947}
948
; Float counterpart of add_v4i32_0u23: result lane 1 is undemanded (poison in
; the final mask). The expected output is two shuffles of %a/%b feeding one
; vector fadd, with lane 1 poison in both masks and lane 2 pairing b1 with b0
; (indices 5 and 4).
949define <4 x float> @add_v4f32_0u23(<4 x float> %a, <4 x float> %b) {
950; CHECK-LABEL: @add_v4f32_0u23(
951; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x i32> <i32 0, i32 poison, i32 5, i32 6>
952; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> <i32 1, i32 poison, i32 4, i32 7>
953; CHECK-NEXT:    [[TMP4:%.*]] = fadd <4 x float> [[TMP2]], [[TMP3]]
954; CHECK-NEXT:    ret <4 x float> [[TMP4]]
955;
956  %a0 = extractelement <4 x float> %a, i32 0
957  %a1 = extractelement <4 x float> %a, i32 1
958  %a2 = extractelement <4 x float> %a, i32 2
959  %a3 = extractelement <4 x float> %a, i32 3
960  %a01 = fadd float %a0, %a1
961  %a23 = fadd float %a2, %a3
962  %b0 = extractelement <4 x float> %b, i32 0
963  %b1 = extractelement <4 x float> %b, i32 1
964  %b2 = extractelement <4 x float> %b, i32 2
965  %b3 = extractelement <4 x float> %b, i32 3
966  %b01 = fadd float %b0, %b1
967  %b23 = fadd float %b2, %b3
968  %hadd0 = insertelement <4 x float> poison, float %a01, i32 0
969  %hadd1 = insertelement <4 x float> %hadd0, float %a23, i32 1
970  %hadd2 = insertelement <4 x float> %hadd1, float %b01, i32 2
971  %hadd3 = insertelement <4 x float> %hadd2, float %b23, i32 3
  ; Lane 1 is poison, so the a2+a3 sum inserted at index 1 is not demanded.
972  %result = shufflevector <4 x float> %hadd3, <4 x float> %a, <4 x i32> <i32 0, i32 poison, i32 2, i32 3>
973  ret <4 x float> %result
974}
975
; Float counterpart of add_v4i32_01u3: result lane 2 is undemanded (poison in
; the final mask). Unlike the integer version, all run lines share a single
; expectation: plain even/odd masks with lane 2 poison, then one vector fadd.
976define <4 x float> @add_v4f32_01u3(<4 x float> %a, <4 x float> %b) {
977; CHECK-LABEL: @add_v4f32_01u3(
978; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 poison, i32 6>
979; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> <i32 1, i32 3, i32 poison, i32 7>
980; CHECK-NEXT:    [[RESULT1:%.*]] = fadd <4 x float> [[TMP1]], [[TMP2]]
981; CHECK-NEXT:    ret <4 x float> [[RESULT1]]
982;
983  %a0 = extractelement <4 x float> %a, i32 0
984  %a1 = extractelement <4 x float> %a, i32 1
985  %a2 = extractelement <4 x float> %a, i32 2
986  %a3 = extractelement <4 x float> %a, i32 3
987  %a01 = fadd float %a0, %a1
988  %a23 = fadd float %a2, %a3
989  %b0 = extractelement <4 x float> %b, i32 0
990  %b1 = extractelement <4 x float> %b, i32 1
991  %b2 = extractelement <4 x float> %b, i32 2
992  %b3 = extractelement <4 x float> %b, i32 3
993  %b01 = fadd float %b0, %b1
994  %b23 = fadd float %b2, %b3
995  %hadd0 = insertelement <4 x float> poison, float %a01, i32 0
996  %hadd1 = insertelement <4 x float> %hadd0, float %a23, i32 1
997  %hadd2 = insertelement <4 x float> %hadd1, float %b01, i32 2
998  %hadd3 = insertelement <4 x float> %hadd2, float %b23, i32 3
  ; Lane 2 is poison, so the b0+b1 sum inserted at index 2 is not demanded.
999  %result = shufflevector <4 x float> %hadd3, <4 x float> %a, <4 x i32> <i32 0, i32 1, i32 poison, i32 3>
1000  ret <4 x float> %result
1001}
1002
; v4f32 horizontal add with result lane 3 undemanded.
; The scalar input builds [a0+a1, a2+a3, b0+b1, b2+b3] and the final shuffle
; mask <0, 1, 2, poison> drops lane 3. The expected output is always two
; shuffles of %a/%b plus one vector fadd, but the masks differ per prefix:
; the SSE2 and AVX512 prefixes pair even elements with odd elements, while the
; SSE4 and AVX2 prefixes expect lane 0 with its operands swapped (a1 on the
; left-hand shuffle, a0 on the right-hand one).
1003define <4 x float> @add_v4f32_012u(<4 x float> %a, <4 x float> %b) {
1004; SSE2-LABEL: @add_v4f32_012u(
1005; SSE2-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 4, i32 poison>
1006; SSE2-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> <i32 1, i32 3, i32 5, i32 poison>
1007; SSE2-NEXT:    [[RESULT1:%.*]] = fadd <4 x float> [[TMP1]], [[TMP2]]
1008; SSE2-NEXT:    ret <4 x float> [[RESULT1]]
1009;
1010; SSE4-LABEL: @add_v4f32_012u(
1011; SSE4-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x i32> <i32 1, i32 2, i32 4, i32 poison>
1012; SSE4-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> <i32 0, i32 3, i32 5, i32 poison>
1013; SSE4-NEXT:    [[TMP4:%.*]] = fadd <4 x float> [[TMP2]], [[TMP3]]
1014; SSE4-NEXT:    ret <4 x float> [[TMP4]]
1015;
1016; AVX2-LABEL: @add_v4f32_012u(
1017; AVX2-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x i32> <i32 1, i32 2, i32 4, i32 poison>
1018; AVX2-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> <i32 0, i32 3, i32 5, i32 poison>
1019; AVX2-NEXT:    [[TMP4:%.*]] = fadd <4 x float> [[TMP2]], [[TMP3]]
1020; AVX2-NEXT:    ret <4 x float> [[TMP4]]
1021;
1022; AVX512-LABEL: @add_v4f32_012u(
1023; AVX512-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 4, i32 poison>
1024; AVX512-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> <i32 1, i32 3, i32 5, i32 poison>
1025; AVX512-NEXT:    [[RESULT1:%.*]] = fadd <4 x float> [[TMP1]], [[TMP2]]
1026; AVX512-NEXT:    ret <4 x float> [[RESULT1]]
1027;
1028  %a0 = extractelement <4 x float> %a, i32 0
1029  %a1 = extractelement <4 x float> %a, i32 1
1030  %a2 = extractelement <4 x float> %a, i32 2
1031  %a3 = extractelement <4 x float> %a, i32 3
1032  %a01 = fadd float %a0, %a1
1033  %a23 = fadd float %a2, %a3
1034  %b0 = extractelement <4 x float> %b, i32 0
1035  %b1 = extractelement <4 x float> %b, i32 1
1036  %b2 = extractelement <4 x float> %b, i32 2
1037  %b3 = extractelement <4 x float> %b, i32 3
1038  %b01 = fadd float %b0, %b1
1039  %b23 = fadd float %b2, %b3
1040  %hadd0 = insertelement <4 x float> poison, float %a01, i32 0
1041  %hadd1 = insertelement <4 x float> %hadd0, float %a23, i32 1
1042  %hadd2 = insertelement <4 x float> %hadd1, float %b01, i32 2
1043  %hadd3 = insertelement <4 x float> %hadd2, float %b23, i32 3
; All non-poison mask indices are < 4, so only %hadd3 is read; lane 3 (b2+b3) is dropped.
1044  %result = shufflevector <4 x float> %hadd3, <4 x float> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
1045  ret <4 x float> %result
1046}
1047
; v4f32 horizontal add with result lanes 0 and 1 undemanded.
; The final shuffle mask <poison, poison, 2, 3> keeps only the two sums taken
; from %b (b0+b1 and b2+b3). The expected output, common to all run
; configurations, therefore shuffles %b alone (second operand poison) into
; lanes 2 and 3 and performs one vector fadd; lane 2 is expected in the
; operand order b1 + b0.
1048define <4 x float> @add_v4f32_uu23(<4 x float> %a, <4 x float> %b) {
1049; CHECK-LABEL: @add_v4f32_uu23(
1050; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[B:%.*]], <4 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 1, i32 2>
1051; CHECK-NEXT:    [[RESULT1:%.*]] = shufflevector <4 x float> [[B]], <4 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 0, i32 3>
1052; CHECK-NEXT:    [[RESULT2:%.*]] = fadd <4 x float> [[TMP2]], [[RESULT1]]
1053; CHECK-NEXT:    ret <4 x float> [[RESULT2]]
1054;
1055  %a0 = extractelement <4 x float> %a, i32 0
1056  %a1 = extractelement <4 x float> %a, i32 1
1057  %a2 = extractelement <4 x float> %a, i32 2
1058  %a3 = extractelement <4 x float> %a, i32 3
1059  %a01 = fadd float %a0, %a1
1060  %a23 = fadd float %a2, %a3
1061  %b0 = extractelement <4 x float> %b, i32 0
1062  %b1 = extractelement <4 x float> %b, i32 1
1063  %b2 = extractelement <4 x float> %b, i32 2
1064  %b3 = extractelement <4 x float> %b, i32 3
1065  %b01 = fadd float %b0, %b1
1066  %b23 = fadd float %b2, %b3
1067  %hadd0 = insertelement <4 x float> poison, float %a01, i32 0
1068  %hadd1 = insertelement <4 x float> %hadd0, float %a23, i32 1
1069  %hadd2 = insertelement <4 x float> %hadd1, float %b01, i32 2
1070  %hadd3 = insertelement <4 x float> %hadd2, float %b23, i32 3
; Only lanes 2 and 3 of %hadd3 are selected, so the %a-derived sums are never used.
1071  %result = shufflevector <4 x float> %hadd3, <4 x float> %a, <4 x i32> <i32 poison, i32 poison, i32 2, i32 3>
1072  ret <4 x float> %result
1073}
1074
; v4f32 horizontal add with result lanes 2 and 3 undemanded.
; The final shuffle mask <0, 1, poison, poison> keeps only the two sums taken
; from %a (a0+a1 and a2+a3). The expected output, common to all run
; configurations, shuffles %a alone (second operand poison) into lanes 0 and 1
; and performs one vector fadd; lane 0 is expected in the operand order a1 + a0.
1075define <4 x float> @add_v4f32_01uu(<4 x float> %a, <4 x float> %b) {
1076; CHECK-LABEL: @add_v4f32_01uu(
1077; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> poison, <4 x i32> <i32 1, i32 2, i32 poison, i32 poison>
1078; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[A]], <4 x float> poison, <4 x i32> <i32 0, i32 3, i32 poison, i32 poison>
1079; CHECK-NEXT:    [[TMP4:%.*]] = fadd <4 x float> [[TMP2]], [[TMP3]]
1080; CHECK-NEXT:    ret <4 x float> [[TMP4]]
1081;
1082  %a0 = extractelement <4 x float> %a, i32 0
1083  %a1 = extractelement <4 x float> %a, i32 1
1084  %a2 = extractelement <4 x float> %a, i32 2
1085  %a3 = extractelement <4 x float> %a, i32 3
1086  %a01 = fadd float %a0, %a1
1087  %a23 = fadd float %a2, %a3
1088  %b0 = extractelement <4 x float> %b, i32 0
1089  %b1 = extractelement <4 x float> %b, i32 1
1090  %b2 = extractelement <4 x float> %b, i32 2
1091  %b3 = extractelement <4 x float> %b, i32 3
1092  %b01 = fadd float %b0, %b1
1093  %b23 = fadd float %b2, %b3
1094  %hadd0 = insertelement <4 x float> poison, float %a01, i32 0
1095  %hadd1 = insertelement <4 x float> %hadd0, float %a23, i32 1
1096  %hadd2 = insertelement <4 x float> %hadd1, float %b01, i32 2
1097  %hadd3 = insertelement <4 x float> %hadd2, float %b23, i32 3
; Only lanes 0 and 1 of %hadd3 are selected, so the %b-derived sums are never used.
1098  %result = shufflevector <4 x float> %hadd3, <4 x float> %a, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
1099  ret <4 x float> %result
1100}
1101
1102;
1103; v8f32
1104;
1105
; v8f32 horizontal add with every result lane demanded.
; The scalar input builds
;   [a0+a1, a2+a3, b0+b1, b2+b3, a4+a5, a6+a7, b4+b5, b6+b7]
; and the final shuffle is an identity mask. The expected output, common to
; all run configurations, is two 8-wide shuffles of %a/%b (even elements and
; odd elements, in that same lane order) feeding a single vector fadd.
; NOTE(review): the a/b interleaving in groups of two sums presumably mirrors a
; per-128-bit-lane hardware hadd layout - not stated in this file, confirm.
1106define <8 x float> @add_v8f32_01234567(<8 x float> %a, <8 x float> %b) {
1107; CHECK-LABEL: @add_v8f32_01234567(
1108; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 6, i32 12, i32 14>
1109; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 5, i32 7, i32 13, i32 15>
1110; CHECK-NEXT:    [[TMP3:%.*]] = fadd <8 x float> [[TMP1]], [[TMP2]]
1111; CHECK-NEXT:    ret <8 x float> [[TMP3]]
1112;
1113  %a0 = extractelement <8 x float> %a, i32 0
1114  %a1 = extractelement <8 x float> %a, i32 1
1115  %a2 = extractelement <8 x float> %a, i32 2
1116  %a3 = extractelement <8 x float> %a, i32 3
1117  %a4 = extractelement <8 x float> %a, i32 4
1118  %a5 = extractelement <8 x float> %a, i32 5
1119  %a6 = extractelement <8 x float> %a, i32 6
1120  %a7 = extractelement <8 x float> %a, i32 7
1121  %a01 = fadd float %a0, %a1
1122  %a23 = fadd float %a2, %a3
1123  %a45 = fadd float %a4, %a5
1124  %a67 = fadd float %a6, %a7
1125  %b0 = extractelement <8 x float> %b, i32 0
1126  %b1 = extractelement <8 x float> %b, i32 1
1127  %b2 = extractelement <8 x float> %b, i32 2
1128  %b3 = extractelement <8 x float> %b, i32 3
1129  %b4 = extractelement <8 x float> %b, i32 4
1130  %b5 = extractelement <8 x float> %b, i32 5
1131  %b6 = extractelement <8 x float> %b, i32 6
1132  %b7 = extractelement <8 x float> %b, i32 7
1133  %b01 = fadd float %b0, %b1
1134  %b23 = fadd float %b2, %b3
1135  %b45 = fadd float %b4, %b5
1136  %b67 = fadd float %b6, %b7
1137  %hadd0 = insertelement <8 x float> poison, float %a01, i32 0
1138  %hadd1 = insertelement <8 x float> %hadd0, float %a23, i32 1
1139  %hadd2 = insertelement <8 x float> %hadd1, float %b01, i32 2
1140  %hadd3 = insertelement <8 x float> %hadd2, float %b23, i32 3
1141  %hadd4 = insertelement <8 x float> %hadd3, float %a45, i32 4
1142  %hadd5 = insertelement <8 x float> %hadd4, float %a67, i32 5
1143  %hadd6 = insertelement <8 x float> %hadd5, float %b45, i32 6
1144  %hadd7 = insertelement <8 x float> %hadd6, float %b67, i32 7
; Identity mask over %hadd7 (all indices < 8); the %a operand is never read.
1145  %result = shufflevector <8 x float> %hadd7, <8 x float> %a, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
1146  ret <8 x float> %result
1147}
1148
; v8f32 horizontal add with result lane 3 undemanded.
; The scalar input builds
;   [a0+a1, a2+a3, b0+b1, b2+b3, a4+a5, a6+a7, b4+b5, b6+b7]
; and the final shuffle mask <0, 1, 2, poison, 4, 5, 6, 7> drops lane 3.
; Expected output per prefix:
;  - SSE2 and SSE4 prefixes: only partially vectorized. A scalar a6+a7 remains
;    and is inserted at lane 5, the b4..b7 sums come from a separate fadd
;    (<2 x float> under SSE2, <8 x float> under SSE4), and a final shuffle
;    stitches the pieces together.
;  - AVX prefix: two 8-wide shuffles of %a/%b plus a single vector fadd, with
;    lane 6 expected in the operand order b5 + b4.
1149define <8 x float> @add_v8f32_012u4567(<8 x float> %a, <8 x float> %b) {
1150; SSE2-LABEL: @add_v8f32_012u4567(
1151; SSE2-NEXT:    [[A6:%.*]] = extractelement <8 x float> [[A:%.*]], i64 6
1152; SSE2-NEXT:    [[A7:%.*]] = extractelement <8 x float> [[A]], i64 7
1153; SSE2-NEXT:    [[A67:%.*]] = fadd float [[A6]], [[A7]]
1154; SSE2-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[B:%.*]], <8 x float> poison, <2 x i32> <i32 5, i32 6>
1155; SSE2-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[B]], <8 x float> poison, <2 x i32> <i32 4, i32 7>
1156; SSE2-NEXT:    [[TMP3:%.*]] = fadd <2 x float> [[TMP1]], [[TMP2]]
1157; SSE2-NEXT:    [[TMP5:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <8 x i32> <i32 0, i32 2, i32 8, i32 poison, i32 4, i32 poison, i32 poison, i32 poison>
1158; SSE2-NEXT:    [[TMP8:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 poison, i32 5, i32 poison, i32 poison, i32 poison>
1159; SSE2-NEXT:    [[TMP6:%.*]] = fadd <8 x float> [[TMP5]], [[TMP8]]
1160; SSE2-NEXT:    [[HADD5:%.*]] = insertelement <8 x float> [[TMP6]], float [[A67]], i64 5
1161; SSE2-NEXT:    [[TMP7:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
1162; SSE2-NEXT:    [[RESULT:%.*]] = shufflevector <8 x float> [[HADD5]], <8 x float> [[TMP7]], <8 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 4, i32 5, i32 8, i32 9>
1163; SSE2-NEXT:    ret <8 x float> [[RESULT]]
1164;
1165; SSE4-LABEL: @add_v8f32_012u4567(
1166; SSE4-NEXT:    [[A6:%.*]] = extractelement <8 x float> [[A:%.*]], i64 6
1167; SSE4-NEXT:    [[A7:%.*]] = extractelement <8 x float> [[A]], i64 7
1168; SSE4-NEXT:    [[A67:%.*]] = fadd float [[A6]], [[A7]]
1169; SSE4-NEXT:    [[TMP4:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 8, i32 poison, i32 4, i32 poison, i32 poison, i32 poison>
1170; SSE4-NEXT:    [[TMP7:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 poison, i32 5, i32 poison, i32 poison, i32 poison>
1171; SSE4-NEXT:    [[TMP6:%.*]] = fadd <8 x float> [[TMP4]], [[TMP7]]
1172; SSE4-NEXT:    [[HADD5:%.*]] = insertelement <8 x float> [[TMP6]], float [[A67]], i64 5
1173; SSE4-NEXT:    [[TMP8:%.*]] = shufflevector <8 x float> [[B]], <8 x float> poison, <8 x i32> <i32 5, i32 6, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
1174; SSE4-NEXT:    [[TMP5:%.*]] = shufflevector <8 x float> [[B]], <8 x float> poison, <8 x i32> <i32 4, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
1175; SSE4-NEXT:    [[TMP9:%.*]] = fadd <8 x float> [[TMP8]], [[TMP5]]
1176; SSE4-NEXT:    [[RESULT:%.*]] = shufflevector <8 x float> [[HADD5]], <8 x float> [[TMP9]], <8 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 4, i32 5, i32 8, i32 9>
1177; SSE4-NEXT:    ret <8 x float> [[RESULT]]
1178;
1179; AVX-LABEL: @add_v8f32_012u4567(
1180; AVX-NEXT:    [[TMP5:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 8, i32 poison, i32 4, i32 6, i32 13, i32 14>
1181; AVX-NEXT:    [[TMP6:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 poison, i32 5, i32 7, i32 12, i32 15>
1182; AVX-NEXT:    [[TMP7:%.*]] = fadd <8 x float> [[TMP5]], [[TMP6]]
1183; AVX-NEXT:    ret <8 x float> [[TMP7]]
1184;
1185  %a0 = extractelement <8 x float> %a, i32 0
1186  %a1 = extractelement <8 x float> %a, i32 1
1187  %a2 = extractelement <8 x float> %a, i32 2
1188  %a3 = extractelement <8 x float> %a, i32 3
1189  %a4 = extractelement <8 x float> %a, i32 4
1190  %a5 = extractelement <8 x float> %a, i32 5
1191  %a6 = extractelement <8 x float> %a, i32 6
1192  %a7 = extractelement <8 x float> %a, i32 7
1193  %a01 = fadd float %a0, %a1
1194  %a23 = fadd float %a2, %a3
1195  %a45 = fadd float %a4, %a5
1196  %a67 = fadd float %a6, %a7
1197  %b0 = extractelement <8 x float> %b, i32 0
1198  %b1 = extractelement <8 x float> %b, i32 1
1199  %b2 = extractelement <8 x float> %b, i32 2
1200  %b3 = extractelement <8 x float> %b, i32 3
1201  %b4 = extractelement <8 x float> %b, i32 4
1202  %b5 = extractelement <8 x float> %b, i32 5
1203  %b6 = extractelement <8 x float> %b, i32 6
1204  %b7 = extractelement <8 x float> %b, i32 7
1205  %b01 = fadd float %b0, %b1
1206  %b23 = fadd float %b2, %b3
1207  %b45 = fadd float %b4, %b5
1208  %b67 = fadd float %b6, %b7
1209  %hadd0 = insertelement <8 x float> poison, float %a01, i32 0
1210  %hadd1 = insertelement <8 x float> %hadd0, float %a23, i32 1
1211  %hadd2 = insertelement <8 x float> %hadd1, float %b01, i32 2
1212  %hadd3 = insertelement <8 x float> %hadd2, float %b23, i32 3
1213  %hadd4 = insertelement <8 x float> %hadd3, float %a45, i32 4
1214  %hadd5 = insertelement <8 x float> %hadd4, float %a67, i32 5
1215  %hadd6 = insertelement <8 x float> %hadd5, float %b45, i32 6
1216  %hadd7 = insertelement <8 x float> %hadd6, float %b67, i32 7
; All non-poison mask indices are < 8, so only %hadd7 is read; lane 3 (b2+b3) is dropped.
1217  %result = shufflevector <8 x float> %hadd7, <8 x float> %a, <8 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 4, i32 5, i32 6, i32 7>
1218  ret <8 x float> %result
1219}
1220
; v8f32 horizontal add whose result lanes are reversed, with one lane undemanded.
; Unlike the other v8f32 tests in this section, the sums are inserted as
;   [a0+a1, a2+a3, a4+a5, a6+a7, b0+b1, b2+b3, b4+b5, b6+b7]
; and the final shuffle mask <7, 6, poison, 4, 3, 2, 1, 0> reverses them,
; leaving result lane 2 (which would hold b2+b3) as poison.
; Expected output per prefix:
;  - SSE2 and SSE4 prefixes: only partially vectorized. A scalar b0+b1 remains
;    and is inserted at lane 4, the %a sums and the b4..b7 sums come from
;    separate vector fadds, and a final shuffle performs the reversal.
;  - AVX prefix: two 8-wide shuffles of %b/%a plus a single vector fadd.
1221define <8 x float> @add_v8f32_76u43210(<8 x float> %a, <8 x float> %b) {
1222; SSE2-LABEL: @add_v8f32_76u43210(
1223; SSE2-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
1224; SSE2-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
1225; SSE2-NEXT:    [[TMP3:%.*]] = fadd <4 x float> [[TMP1]], [[TMP2]]
1226; SSE2-NEXT:    [[B0:%.*]] = extractelement <8 x float> [[B:%.*]], i64 0
1227; SSE2-NEXT:    [[B1:%.*]] = extractelement <8 x float> [[B]], i64 1
1228; SSE2-NEXT:    [[B01:%.*]] = fadd float [[B0]], [[B1]]
1229; SSE2-NEXT:    [[TMP4:%.*]] = shufflevector <8 x float> [[B]], <8 x float> poison, <2 x i32> <i32 5, i32 6>
1230; SSE2-NEXT:    [[TMP5:%.*]] = shufflevector <8 x float> [[B]], <8 x float> poison, <2 x i32> <i32 4, i32 7>
1231; SSE2-NEXT:    [[TMP6:%.*]] = fadd <2 x float> [[TMP4]], [[TMP5]]
1232; SSE2-NEXT:    [[TMP7:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
1233; SSE2-NEXT:    [[HADD4:%.*]] = insertelement <8 x float> [[TMP7]], float [[B01]], i64 4
1234; SSE2-NEXT:    [[TMP8:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
1235; SSE2-NEXT:    [[RESULT:%.*]] = shufflevector <8 x float> [[TMP8]], <8 x float> [[HADD4]], <8 x i32> <i32 1, i32 0, i32 poison, i32 12, i32 11, i32 10, i32 9, i32 8>
1236; SSE2-NEXT:    ret <8 x float> [[RESULT]]
1237;
1238; SSE4-LABEL: @add_v8f32_76u43210(
1239; SSE4-NEXT:    [[B0:%.*]] = extractelement <8 x float> [[B:%.*]], i64 0
1240; SSE4-NEXT:    [[B1:%.*]] = extractelement <8 x float> [[B]], i64 1
1241; SSE4-NEXT:    [[B01:%.*]] = fadd float [[B0]], [[B1]]
1242; SSE4-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 poison, i32 poison, i32 poison, i32 poison>
1243; SSE4-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 poison, i32 poison, i32 poison, i32 poison>
1244; SSE4-NEXT:    [[RESULT:%.*]] = fadd <8 x float> [[TMP1]], [[TMP2]]
1245; SSE4-NEXT:    [[HADD4:%.*]] = insertelement <8 x float> [[RESULT]], float [[B01]], i64 4
1246; SSE4-NEXT:    [[TMP4:%.*]] = shufflevector <8 x float> [[B]], <8 x float> poison, <8 x i32> <i32 5, i32 6, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
1247; SSE4-NEXT:    [[TMP5:%.*]] = shufflevector <8 x float> [[B]], <8 x float> poison, <8 x i32> <i32 4, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
1248; SSE4-NEXT:    [[TMP6:%.*]] = fadd <8 x float> [[TMP4]], [[TMP5]]
1249; SSE4-NEXT:    [[RESULT1:%.*]] = shufflevector <8 x float> [[TMP6]], <8 x float> [[HADD4]], <8 x i32> <i32 1, i32 0, i32 poison, i32 12, i32 11, i32 10, i32 9, i32 8>
1250; SSE4-NEXT:    ret <8 x float> [[RESULT1]]
1251;
1252; AVX-LABEL: @add_v8f32_76u43210(
1253; AVX-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[B:%.*]], <8 x float> [[A:%.*]], <8 x i32> <i32 6, i32 5, i32 poison, i32 0, i32 14, i32 12, i32 10, i32 8>
1254; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[B]], <8 x float> [[A]], <8 x i32> <i32 7, i32 4, i32 poison, i32 1, i32 15, i32 13, i32 11, i32 9>
1255; AVX-NEXT:    [[RESULT:%.*]] = fadd <8 x float> [[TMP1]], [[TMP2]]
1256; AVX-NEXT:    ret <8 x float> [[RESULT]]
1257;
1258  %a0 = extractelement <8 x float> %a, i32 0
1259  %a1 = extractelement <8 x float> %a, i32 1
1260  %a2 = extractelement <8 x float> %a, i32 2
1261  %a3 = extractelement <8 x float> %a, i32 3
1262  %a4 = extractelement <8 x float> %a, i32 4
1263  %a5 = extractelement <8 x float> %a, i32 5
1264  %a6 = extractelement <8 x float> %a, i32 6
1265  %a7 = extractelement <8 x float> %a, i32 7
1266  %a01 = fadd float %a0, %a1
1267  %a23 = fadd float %a2, %a3
1268  %a45 = fadd float %a4, %a5
1269  %a67 = fadd float %a6, %a7
1270  %b0 = extractelement <8 x float> %b, i32 0
1271  %b1 = extractelement <8 x float> %b, i32 1
1272  %b2 = extractelement <8 x float> %b, i32 2
1273  %b3 = extractelement <8 x float> %b, i32 3
1274  %b4 = extractelement <8 x float> %b, i32 4
1275  %b5 = extractelement <8 x float> %b, i32 5
1276  %b6 = extractelement <8 x float> %b, i32 6
1277  %b7 = extractelement <8 x float> %b, i32 7
1278  %b01 = fadd float %b0, %b1
1279  %b23 = fadd float %b2, %b3
1280  %b45 = fadd float %b4, %b5
1281  %b67 = fadd float %b6, %b7
1282  %hadd0 = insertelement <8 x float> poison, float %a01, i32 0
1283  %hadd1 = insertelement <8 x float> %hadd0, float %a23, i32 1
1284  %hadd2 = insertelement <8 x float> %hadd1, float %a45, i32 2
1285  %hadd3 = insertelement <8 x float> %hadd2, float %a67, i32 3
1286  %hadd4 = insertelement <8 x float> %hadd3, float %b01, i32 4
1287  %hadd5 = insertelement <8 x float> %hadd4, float %b23, i32 5
1288  %hadd6 = insertelement <8 x float> %hadd5, float %b45, i32 6
1289  %hadd7 = insertelement <8 x float> %hadd6, float %b67, i32 7
; Reversing mask over %hadd7 only (all indices < 8); source lane 5 (b2+b3) is never selected.
1290  %result = shufflevector <8 x float> %hadd7, <8 x float> %a, <8 x i32> <i32 7, i32 6, i32 poison, i32 4, i32 3, i32 2, i32 1, i32 0>
1291  ret <8 x float> %result
1292}
1293
1294;
1295; v2f64
1296;
1297
; v2f64 horizontal add with both result lanes demanded.
; The scalar input builds [a0+a1, b0+b1] and applies an identity shuffle. The
; expected output, common to all run configurations, is a shuffle selecting
; <a0, b0>, a shuffle selecting <a1, b1>, and a single vector fadd.
1298define <2 x double> @add_v2f64_01(<2 x double> %a, <2 x double> %b) {
1299; CHECK-LABEL: @add_v2f64_01(
1300; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x double> [[A:%.*]], <2 x double> [[B:%.*]], <2 x i32> <i32 0, i32 2>
1301; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <2 x double> [[A]], <2 x double> [[B]], <2 x i32> <i32 1, i32 3>
1302; CHECK-NEXT:    [[TMP3:%.*]] = fadd <2 x double> [[TMP1]], [[TMP2]]
1303; CHECK-NEXT:    ret <2 x double> [[TMP3]]
1304;
1305  %a0 = extractelement <2 x double> %a, i32 0
1306  %a1 = extractelement <2 x double> %a, i32 1
1307  %a01 = fadd double %a0, %a1
1308  %b0 = extractelement <2 x double> %b, i32 0
1309  %b1 = extractelement <2 x double> %b, i32 1
1310  %b01 = fadd double %b0, %b1
1311  %hadd0 = insertelement <2 x double> poison, double %a01, i32 0
1312  %hadd1 = insertelement <2 x double> %hadd0, double %b01, i32 1
; Identity mask over %hadd1 (indices 0 and 1); the %a operand is never read.
1313  %result = shufflevector <2 x double> %hadd1, <2 x double> %a, <2 x i32> <i32 0, i32 1>
1314  ret <2 x double> %result
1315}
1316
; v2f64 horizontal add with result lane 0 undemanded.
; The final shuffle mask <poison, 1> keeps only b0+b1. The expected output,
; common to all run configurations, moves b0 into lane 1 with one shuffle of
; %b and adds that to %b itself, so lane 1 holds b0 + b1.
1317define <2 x double> @add_v2f64_u1(<2 x double> %a, <2 x double> %b) {
1318; CHECK-LABEL: @add_v2f64_u1(
1319; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x double> [[B:%.*]], <2 x double> poison, <2 x i32> <i32 poison, i32 0>
1320; CHECK-NEXT:    [[RESULT1:%.*]] = fadd <2 x double> [[TMP1]], [[B]]
1321; CHECK-NEXT:    ret <2 x double> [[RESULT1]]
1322;
1323  %a0 = extractelement <2 x double> %a, i32 0
1324  %a1 = extractelement <2 x double> %a, i32 1
1325  %a01 = fadd double %a0, %a1
1326  %b0 = extractelement <2 x double> %b, i32 0
1327  %b1 = extractelement <2 x double> %b, i32 1
1328  %b01 = fadd double %b0, %b1
1329  %hadd0 = insertelement <2 x double> poison, double %a01, i32 0
1330  %hadd1 = insertelement <2 x double> %hadd0, double %b01, i32 1
; Only lane 1 of %hadd1 is selected, so a0+a1 is never used.
1331  %result = shufflevector <2 x double> %hadd1, <2 x double> %a, <2 x i32> <i32 poison, i32 1>
1332  ret <2 x double> %result
1333}
1334
; v2f64 horizontal add with result lane 1 undemanded.
; The final shuffle mask <0, poison> keeps only a0+a1. The expected output,
; common to all run configurations, moves element 1 of a single input vector
; (captured as TMP1 in the assertions) into lane 0 with one shuffle and adds
; that to the same vector, so lane 0 holds the sum of its two elements.
1335define <2 x double> @add_v2f64_0u(<2 x double> %a, <2 x double> %b) {
1336; CHECK-LABEL: @add_v2f64_0u(
1337; CHECK-NEXT:    [[RESULT:%.*]] = shufflevector <2 x double> [[TMP1:%.*]], <2 x double> poison, <2 x i32> <i32 1, i32 poison>
1338; CHECK-NEXT:    [[RESULT1:%.*]] = fadd <2 x double> [[TMP1]], [[RESULT]]
1339; CHECK-NEXT:    ret <2 x double> [[RESULT1]]
1340;
1341  %a0 = extractelement <2 x double> %a, i32 0
1342  %a1 = extractelement <2 x double> %a, i32 1
1343  %a01 = fadd double %a0, %a1
1344  %b0 = extractelement <2 x double> %b, i32 0
1345  %b1 = extractelement <2 x double> %b, i32 1
1346  %b01 = fadd double %b0, %b1
1347  %hadd0 = insertelement <2 x double> poison, double %a01, i32 0
1348  %hadd1 = insertelement <2 x double> %hadd0, double %b01, i32 1
; Only lane 0 of %hadd1 is selected, so b0+b1 is never used.
1349  %result = shufflevector <2 x double> %hadd1, <2 x double> %a, <2 x i32> <i32 0, i32 poison>
1350  ret <2 x double> %result
1351}
1352
1353;
1354; v4f64
1355;
1356
; v4f64 horizontal add with every result lane demanded.
; The scalar input builds [a0+a1, b0+b1, a2+a3, b2+b3] (sums from %a and %b
; alternate) and applies an identity shuffle. The expected output, common to
; all run configurations, is two 4-wide shuffles of %a/%b (<0, 4, 2, 6> and
; <1, 5, 3, 7>) feeding a single vector fadd.
1357define <4 x double> @add_v4f64_0123(<4 x double> %a, <4 x double> %b) {
1358; CHECK-LABEL: @add_v4f64_0123(
1359; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> <i32 0, i32 4, i32 2, i32 6>
1360; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> <i32 1, i32 5, i32 3, i32 7>
1361; CHECK-NEXT:    [[TMP3:%.*]] = fadd <4 x double> [[TMP1]], [[TMP2]]
1362; CHECK-NEXT:    ret <4 x double> [[TMP3]]
1363;
1364  %a0 = extractelement <4 x double> %a, i32 0
1365  %a1 = extractelement <4 x double> %a, i32 1
1366  %a2 = extractelement <4 x double> %a, i32 2
1367  %a3 = extractelement <4 x double> %a, i32 3
1368  %a01 = fadd double %a0, %a1
1369  %a23 = fadd double %a2, %a3
1370  %b0 = extractelement <4 x double> %b, i32 0
1371  %b1 = extractelement <4 x double> %b, i32 1
1372  %b2 = extractelement <4 x double> %b, i32 2
1373  %b3 = extractelement <4 x double> %b, i32 3
1374  %b01 = fadd double %b0, %b1
1375  %b23 = fadd double %b2, %b3
1376  %hadd0 = insertelement <4 x double> poison, double %a01, i32 0
1377  %hadd1 = insertelement <4 x double> %hadd0, double %b01, i32 1
1378  %hadd2 = insertelement <4 x double> %hadd1, double %a23, i32 2
1379  %hadd3 = insertelement <4 x double> %hadd2, double %b23, i32 3
; Identity mask over %hadd3 (all indices < 4); the %a operand is never read.
1380  %result = shufflevector <4 x double> %hadd3, <4 x double> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
1381  ret <4 x double> %result
1382}
1383
; v4f64 horizontal add with result lane 0 undemanded.
; The scalar input builds [a0+a1, b0+b1, a2+a3, b2+b3] and the final shuffle
; mask <poison, 1, 2, 3> drops lane 0.
; Expected output per prefix:
;  - SSE2 and SSE4 prefixes: only partially vectorized. Lanes 1 and 2 come from
;    a vector fadd (<2 x double> under SSE2, <4 x double> under SSE4) and a
;    scalar b2+b3 is inserted at lane 3.
;  - AVX prefix: two 4-wide shuffles of %b/%a plus a single vector fadd.
1384define <4 x double> @add_v4f64_u123(<4 x double> %a, <4 x double> %b) {
1385; SSE2-LABEL: @add_v4f64_u123(
1386; SSE2-NEXT:    [[B2:%.*]] = extractelement <4 x double> [[B:%.*]], i64 2
1387; SSE2-NEXT:    [[B3:%.*]] = extractelement <4 x double> [[B]], i64 3
1388; SSE2-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[B]], <4 x double> [[A:%.*]], <2 x i32> <i32 0, i32 6>
1389; SSE2-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[B]], <4 x double> [[A]], <2 x i32> <i32 1, i32 7>
1390; SSE2-NEXT:    [[TMP3:%.*]] = fadd <2 x double> [[TMP1]], [[TMP2]]
1391; SSE2-NEXT:    [[B23:%.*]] = fadd double [[B2]], [[B3]]
1392; SSE2-NEXT:    [[TMP4:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <4 x i32> <i32 poison, i32 0, i32 1, i32 poison>
1393; SSE2-NEXT:    [[RESULT:%.*]] = insertelement <4 x double> [[TMP4]], double [[B23]], i64 3
1394; SSE2-NEXT:    ret <4 x double> [[RESULT]]
1395;
1396; SSE4-LABEL: @add_v4f64_u123(
1397; SSE4-NEXT:    [[B2:%.*]] = extractelement <4 x double> [[B:%.*]], i64 2
1398; SSE4-NEXT:    [[B3:%.*]] = extractelement <4 x double> [[B]], i64 3
1399; SSE4-NEXT:    [[B23:%.*]] = fadd double [[B2]], [[B3]]
1400; SSE4-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[B]], <4 x double> [[A:%.*]], <4 x i32> <i32 poison, i32 0, i32 6, i32 poison>
1401; SSE4-NEXT:    [[TMP4:%.*]] = shufflevector <4 x double> [[B]], <4 x double> [[A]], <4 x i32> <i32 poison, i32 1, i32 7, i32 poison>
1402; SSE4-NEXT:    [[TMP3:%.*]] = fadd <4 x double> [[TMP2]], [[TMP4]]
1403; SSE4-NEXT:    [[RESULT:%.*]] = insertelement <4 x double> [[TMP3]], double [[B23]], i64 3
1404; SSE4-NEXT:    ret <4 x double> [[RESULT]]
1405;
1406; AVX-LABEL: @add_v4f64_u123(
1407; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[B:%.*]], <4 x double> [[A:%.*]], <4 x i32> <i32 poison, i32 0, i32 6, i32 2>
1408; AVX-NEXT:    [[TMP3:%.*]] = shufflevector <4 x double> [[B]], <4 x double> [[A]], <4 x i32> <i32 poison, i32 1, i32 7, i32 3>
1409; AVX-NEXT:    [[TMP4:%.*]] = fadd <4 x double> [[TMP2]], [[TMP3]]
1410; AVX-NEXT:    ret <4 x double> [[TMP4]]
1411;
1412  %a0 = extractelement <4 x double> %a, i32 0
1413  %a1 = extractelement <4 x double> %a, i32 1
1414  %a2 = extractelement <4 x double> %a, i32 2
1415  %a3 = extractelement <4 x double> %a, i32 3
1416  %a01 = fadd double %a0, %a1
1417  %a23 = fadd double %a2, %a3
1418  %b0 = extractelement <4 x double> %b, i32 0
1419  %b1 = extractelement <4 x double> %b, i32 1
1420  %b2 = extractelement <4 x double> %b, i32 2
1421  %b3 = extractelement <4 x double> %b, i32 3
1422  %b01 = fadd double %b0, %b1
1423  %b23 = fadd double %b2, %b3
1424  %hadd0 = insertelement <4 x double> poison, double %a01, i32 0
1425  %hadd1 = insertelement <4 x double> %hadd0, double %b01, i32 1
1426  %hadd2 = insertelement <4 x double> %hadd1, double %a23, i32 2
1427  %hadd3 = insertelement <4 x double> %hadd2, double %b23, i32 3
; All non-poison mask indices are < 4, so only %hadd3 is read; lane 0 (a0+a1) is dropped.
1428  %result = shufflevector <4 x double> %hadd3, <4 x double> %a, <4 x i32> <i32 poison, i32 1, i32 2, i32 3>
1429  ret <4 x double> %result
1430}
1431
; v4f64 horizontal add with result lane 1 undemanded.
; The scalar input builds [a0+a1, b0+b1, a2+a3, b2+b3] and the final shuffle
; mask <0, poison, 2, 3> drops lane 1.
; Expected output per prefix:
;  - SSE2 and SSE4 prefixes: only partially vectorized. The two %a sums come
;    from a vector fadd (<2 x double> under SSE2, <4 x double> under SSE4) and
;    a scalar b2+b3 is inserted at lane 3.
;  - AVX prefix: two 4-wide shuffles of %a/%b plus a single vector fadd, with
;    lane 0 expected in the operand order a1 + a0.
1432define <4 x double> @add_v4f64_0u23(<4 x double> %a, <4 x double> %b) {
1433; SSE2-LABEL: @add_v4f64_0u23(
1434; SSE2-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> poison, <2 x i32> <i32 1, i32 2>
1435; SSE2-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> poison, <2 x i32> <i32 0, i32 3>
1436; SSE2-NEXT:    [[TMP3:%.*]] = fadd <2 x double> [[TMP1]], [[TMP2]]
1437; SSE2-NEXT:    [[B2:%.*]] = extractelement <4 x double> [[B:%.*]], i64 2
1438; SSE2-NEXT:    [[B3:%.*]] = extractelement <4 x double> [[B]], i64 3
1439; SSE2-NEXT:    [[B23:%.*]] = fadd double [[B2]], [[B3]]
1440; SSE2-NEXT:    [[TMP4:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <4 x i32> <i32 0, i32 poison, i32 1, i32 poison>
1441; SSE2-NEXT:    [[RESULT:%.*]] = insertelement <4 x double> [[TMP4]], double [[B23]], i64 3
1442; SSE2-NEXT:    ret <4 x double> [[RESULT]]
1443;
1444; SSE4-LABEL: @add_v4f64_0u23(
1445; SSE4-NEXT:    [[B2:%.*]] = extractelement <4 x double> [[B:%.*]], i64 2
1446; SSE4-NEXT:    [[B3:%.*]] = extractelement <4 x double> [[B]], i64 3
1447; SSE4-NEXT:    [[B23:%.*]] = fadd double [[B2]], [[B3]]
1448; SSE4-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> poison, <4 x i32> <i32 1, i32 poison, i32 2, i32 poison>
1449; SSE4-NEXT:    [[TMP3:%.*]] = shufflevector <4 x double> [[A]], <4 x double> poison, <4 x i32> <i32 0, i32 poison, i32 3, i32 poison>
1450; SSE4-NEXT:    [[TMP4:%.*]] = fadd <4 x double> [[TMP2]], [[TMP3]]
1451; SSE4-NEXT:    [[RESULT:%.*]] = insertelement <4 x double> [[TMP4]], double [[B23]], i64 3
1452; SSE4-NEXT:    ret <4 x double> [[RESULT]]
1453;
1454; AVX-LABEL: @add_v4f64_0u23(
1455; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> <i32 1, i32 poison, i32 2, i32 6>
1456; AVX-NEXT:    [[TMP3:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> <i32 0, i32 poison, i32 3, i32 7>
1457; AVX-NEXT:    [[TMP4:%.*]] = fadd <4 x double> [[TMP2]], [[TMP3]]
1458; AVX-NEXT:    ret <4 x double> [[TMP4]]
1459;
1460  %a0 = extractelement <4 x double> %a, i32 0
1461  %a1 = extractelement <4 x double> %a, i32 1
1462  %a2 = extractelement <4 x double> %a, i32 2
1463  %a3 = extractelement <4 x double> %a, i32 3
1464  %a01 = fadd double %a0, %a1
1465  %a23 = fadd double %a2, %a3
1466  %b0 = extractelement <4 x double> %b, i32 0
1467  %b1 = extractelement <4 x double> %b, i32 1
1468  %b2 = extractelement <4 x double> %b, i32 2
1469  %b3 = extractelement <4 x double> %b, i32 3
1470  %b01 = fadd double %b0, %b1
1471  %b23 = fadd double %b2, %b3
1472  %hadd0 = insertelement <4 x double> poison, double %a01, i32 0
1473  %hadd1 = insertelement <4 x double> %hadd0, double %b01, i32 1
1474  %hadd2 = insertelement <4 x double> %hadd1, double %a23, i32 2
1475  %hadd3 = insertelement <4 x double> %hadd2, double %b23, i32 3
; All non-poison mask indices are < 4, so only %hadd3 is read; lane 1 (b0+b1) is dropped.
1476  %result = shufflevector <4 x double> %hadd3, <4 x double> %a, <4 x i32> <i32 0, i32 poison, i32 2, i32 3>
1477  ret <4 x double> %result
1478}
1479
; v4f64 horizontal add with result lane 2 undemanded.
; The scalar input builds [a0+a1, b0+b1, a2+a3, b2+b3] and the final shuffle
; mask <0, 1, poison, 3> drops lane 2.
; Expected output per prefix:
;  - SSE2 and SSE4 prefixes: only partially vectorized. Lanes 0 and 1 come from
;    a vector fadd (<2 x double> under SSE2, <4 x double> under SSE4) and a
;    scalar b2+b3 is inserted at lane 3.
;  - AVX prefix: two 4-wide shuffles of %a/%b plus a single vector fadd.
1480define <4 x double> @add_v4f64_01u3(<4 x double> %a, <4 x double> %b) {
1481; SSE2-LABEL: @add_v4f64_01u3(
1482; SSE2-NEXT:    [[B2:%.*]] = extractelement <4 x double> [[B:%.*]], i64 2
1483; SSE2-NEXT:    [[B3:%.*]] = extractelement <4 x double> [[B]], i64 3
1484; SSE2-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B]], <2 x i32> <i32 0, i32 4>
1485; SSE2-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 1, i32 5>
1486; SSE2-NEXT:    [[TMP3:%.*]] = fadd <2 x double> [[TMP1]], [[TMP2]]
1487; SSE2-NEXT:    [[B23:%.*]] = fadd double [[B2]], [[B3]]
1488; SSE2-NEXT:    [[TMP4:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
1489; SSE2-NEXT:    [[RESULT:%.*]] = insertelement <4 x double> [[TMP4]], double [[B23]], i64 3
1490; SSE2-NEXT:    ret <4 x double> [[RESULT]]
1491;
1492; SSE4-LABEL: @add_v4f64_01u3(
1493; SSE4-NEXT:    [[B2:%.*]] = extractelement <4 x double> [[B:%.*]], i64 2
1494; SSE4-NEXT:    [[B3:%.*]] = extractelement <4 x double> [[B]], i64 3
1495; SSE4-NEXT:    [[B23:%.*]] = fadd double [[B2]], [[B3]]
1496; SSE4-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B]], <4 x i32> <i32 0, i32 4, i32 poison, i32 poison>
1497; SSE4-NEXT:    [[TMP4:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> <i32 1, i32 5, i32 poison, i32 poison>
1498; SSE4-NEXT:    [[TMP3:%.*]] = fadd <4 x double> [[TMP2]], [[TMP4]]
1499; SSE4-NEXT:    [[RESULT:%.*]] = insertelement <4 x double> [[TMP3]], double [[B23]], i64 3
1500; SSE4-NEXT:    ret <4 x double> [[RESULT]]
1501;
1502; AVX-LABEL: @add_v4f64_01u3(
1503; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> <i32 0, i32 4, i32 poison, i32 6>
1504; AVX-NEXT:    [[TMP3:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> <i32 1, i32 5, i32 poison, i32 7>
1505; AVX-NEXT:    [[TMP4:%.*]] = fadd <4 x double> [[TMP2]], [[TMP3]]
1506; AVX-NEXT:    ret <4 x double> [[TMP4]]
1507;
1508  %a0 = extractelement <4 x double> %a, i32 0
1509  %a1 = extractelement <4 x double> %a, i32 1
1510  %a2 = extractelement <4 x double> %a, i32 2
1511  %a3 = extractelement <4 x double> %a, i32 3
1512  %a01 = fadd double %a0, %a1
1513  %a23 = fadd double %a2, %a3
1514  %b0 = extractelement <4 x double> %b, i32 0
1515  %b1 = extractelement <4 x double> %b, i32 1
1516  %b2 = extractelement <4 x double> %b, i32 2
1517  %b3 = extractelement <4 x double> %b, i32 3
1518  %b01 = fadd double %b0, %b1
1519  %b23 = fadd double %b2, %b3
1520  %hadd0 = insertelement <4 x double> poison, double %a01, i32 0
1521  %hadd1 = insertelement <4 x double> %hadd0, double %b01, i32 1
1522  %hadd2 = insertelement <4 x double> %hadd1, double %a23, i32 2
1523  %hadd3 = insertelement <4 x double> %hadd2, double %b23, i32 3
; All non-poison mask indices are < 4, so only %hadd3 is read; lane 2 (a2+a3) is dropped.
1524  %result = shufflevector <4 x double> %hadd3, <4 x double> %a, <4 x i32> <i32 0, i32 1, i32 poison, i32 3>
1525  ret <4 x double> %result
1526}
1527
; add_v4f64_012u - horizontal fadd with lane 3 undemanded.
; The scalar code builds <a0+a1, b0+b1, a2+a3, b2+b3> via extractelement/fadd/
; insertelement, then the final shuffle keeps lanes 0, 1 and 2 and makes lane 3
; poison, so the b2+b3 sum is never observed.
; Expected output per run line group (see the autogenerated checks below):
;  - SSE2 keeps a2+a3 as a scalar fadd, does a <2 x double> fadd for lanes 0-1,
;    widens it and inserts the scalar into lane 2.
;  - SSE4 keeps a2+a3 as a scalar fadd, does a <4 x double> fadd on two shuffles
;    and inserts the scalar into lane 2.
;  - AVX folds everything into two shuffles of (a, b) feeding one <4 x double> fadd.
1528define <4 x double> @add_v4f64_012u(<4 x double> %a, <4 x double> %b) {
1529; SSE2-LABEL: @add_v4f64_012u(
1530; SSE2-NEXT:    [[A2:%.*]] = extractelement <4 x double> [[A:%.*]], i64 2
1531; SSE2-NEXT:    [[A3:%.*]] = extractelement <4 x double> [[A]], i64 3
1532; SSE2-NEXT:    [[A23:%.*]] = fadd double [[A2]], [[A3]]
1533; SSE2-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B:%.*]], <2 x i32> <i32 0, i32 4>
1534; SSE2-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 1, i32 5>
1535; SSE2-NEXT:    [[TMP3:%.*]] = fadd <2 x double> [[TMP1]], [[TMP2]]
1536; SSE2-NEXT:    [[TMP4:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
1537; SSE2-NEXT:    [[RESULT:%.*]] = insertelement <4 x double> [[TMP4]], double [[A23]], i64 2
1538; SSE2-NEXT:    ret <4 x double> [[RESULT]]
1539;
1540; SSE4-LABEL: @add_v4f64_012u(
1541; SSE4-NEXT:    [[A2:%.*]] = extractelement <4 x double> [[A:%.*]], i64 2
1542; SSE4-NEXT:    [[A3:%.*]] = extractelement <4 x double> [[A]], i64 3
1543; SSE4-NEXT:    [[A23:%.*]] = fadd double [[A2]], [[A3]]
1544; SSE4-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B:%.*]], <4 x i32> <i32 0, i32 4, i32 poison, i32 poison>
1545; SSE4-NEXT:    [[TMP4:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> <i32 1, i32 5, i32 poison, i32 poison>
1546; SSE4-NEXT:    [[TMP3:%.*]] = fadd <4 x double> [[TMP2]], [[TMP4]]
1547; SSE4-NEXT:    [[RESULT:%.*]] = insertelement <4 x double> [[TMP3]], double [[A23]], i64 2
1548; SSE4-NEXT:    ret <4 x double> [[RESULT]]
1549;
1550; AVX-LABEL: @add_v4f64_012u(
1551; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> <i32 0, i32 4, i32 2, i32 poison>
1552; AVX-NEXT:    [[TMP3:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> <i32 1, i32 5, i32 3, i32 poison>
1553; AVX-NEXT:    [[TMP4:%.*]] = fadd <4 x double> [[TMP2]], [[TMP3]]
1554; AVX-NEXT:    ret <4 x double> [[TMP4]]
1555;
1556  %a0 = extractelement <4 x double> %a, i32 0
1557  %a1 = extractelement <4 x double> %a, i32 1
1558  %a2 = extractelement <4 x double> %a, i32 2
1559  %a3 = extractelement <4 x double> %a, i32 3
1560  %a01 = fadd double %a0, %a1
1561  %a23 = fadd double %a2, %a3
1562  %b0 = extractelement <4 x double> %b, i32 0
1563  %b1 = extractelement <4 x double> %b, i32 1
1564  %b2 = extractelement <4 x double> %b, i32 2
1565  %b3 = extractelement <4 x double> %b, i32 3
1566  %b01 = fadd double %b0, %b1
1567  %b23 = fadd double %b2, %b3
1568  %hadd0 = insertelement <4 x double> poison, double %a01, i32 0
1569  %hadd1 = insertelement <4 x double> %hadd0, double %b01, i32 1
1570  %hadd2 = insertelement <4 x double> %hadd1, double %a23, i32 2
1571  %hadd3 = insertelement <4 x double> %hadd2, double %b23, i32 3
  ; All mask indices are < 4, so only %hadd3 is read; the %a operand is unused.
1572  %result = shufflevector <4 x double> %hadd3, <4 x double> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
1573  ret <4 x double> %result
1574}
1575
; add_v4f64_uu23 - horizontal fadd with lanes 0 and 1 undemanded.
; The scalar code builds <a0+a1, b0+b1, a2+a3, b2+b3>; the final shuffle keeps
; only lanes 2 and 3 (a2+a3 and b2+b3) and makes lanes 0 and 1 poison.
; Expected output per run line group (see the autogenerated checks below):
;  - SSE2 does a <2 x double> fadd of elements (2,6) and (3,7) of the a/b pair,
;    then widens the result into the upper two lanes.
;  - SSE4 and AVX do one <4 x double> fadd on two shuffles whose low lanes are poison.
1576define <4 x double> @add_v4f64_uu23(<4 x double> %a, <4 x double> %b) {
1577; SSE2-LABEL: @add_v4f64_uu23(
1578; SSE2-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <2 x i32> <i32 2, i32 6>
1579; SSE2-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 3, i32 7>
1580; SSE2-NEXT:    [[TMP3:%.*]] = fadd <2 x double> [[TMP1]], [[TMP2]]
1581; SSE2-NEXT:    [[RESULT1:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <4 x i32> <i32 poison, i32 poison, i32 0, i32 1>
1582; SSE2-NEXT:    ret <4 x double> [[RESULT1]]
1583;
1584; SSE4-LABEL: @add_v4f64_uu23(
1585; SSE4-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> <i32 poison, i32 poison, i32 2, i32 6>
1586; SSE4-NEXT:    [[TMP3:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> <i32 poison, i32 poison, i32 3, i32 7>
1587; SSE4-NEXT:    [[RESULT1:%.*]] = fadd <4 x double> [[TMP2]], [[TMP3]]
1588; SSE4-NEXT:    ret <4 x double> [[RESULT1]]
1589;
1590; AVX-LABEL: @add_v4f64_uu23(
1591; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> <i32 poison, i32 poison, i32 2, i32 6>
1592; AVX-NEXT:    [[TMP3:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> <i32 poison, i32 poison, i32 3, i32 7>
1593; AVX-NEXT:    [[RESULT1:%.*]] = fadd <4 x double> [[TMP2]], [[TMP3]]
1594; AVX-NEXT:    ret <4 x double> [[RESULT1]]
1595;
1596  %a0 = extractelement <4 x double> %a, i32 0
1597  %a1 = extractelement <4 x double> %a, i32 1
1598  %a2 = extractelement <4 x double> %a, i32 2
1599  %a3 = extractelement <4 x double> %a, i32 3
1600  %a01 = fadd double %a0, %a1
1601  %a23 = fadd double %a2, %a3
1602  %b0 = extractelement <4 x double> %b, i32 0
1603  %b1 = extractelement <4 x double> %b, i32 1
1604  %b2 = extractelement <4 x double> %b, i32 2
1605  %b3 = extractelement <4 x double> %b, i32 3
1606  %b01 = fadd double %b0, %b1
1607  %b23 = fadd double %b2, %b3
1608  %hadd0 = insertelement <4 x double> poison, double %a01, i32 0
1609  %hadd1 = insertelement <4 x double> %hadd0, double %b01, i32 1
1610  %hadd2 = insertelement <4 x double> %hadd1, double %a23, i32 2
1611  %hadd3 = insertelement <4 x double> %hadd2, double %b23, i32 3
  ; All mask indices are < 4, so only %hadd3 is read; the %a operand is unused.
1612  %result = shufflevector <4 x double> %hadd3, <4 x double> %a, <4 x i32> <i32 poison, i32 poison, i32 2, i32 3>
1613  ret <4 x double> %result
1614}
1615
; add_v4f64_01uu - horizontal fadd with lanes 2 and 3 undemanded.
; The scalar code builds <a0+a1, b0+b1, a2+a3, b2+b3>; the final shuffle keeps
; only lanes 0 and 1 (a0+a1 and b0+b1) and makes lanes 2 and 3 poison.
; Expected output per run line group (see the autogenerated checks below):
;  - SSE2 does a <2 x double> fadd of elements (0,4) and (1,5) of the a/b pair,
;    then widens the result into the lower two lanes.
;  - SSE4 and AVX do one <4 x double> fadd on two shuffles whose high lanes are poison.
1616define <4 x double> @add_v4f64_01uu(<4 x double> %a, <4 x double> %b) {
1617; SSE2-LABEL: @add_v4f64_01uu(
1618; SSE2-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <2 x i32> <i32 0, i32 4>
1619; SSE2-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 1, i32 5>
1620; SSE2-NEXT:    [[TMP3:%.*]] = fadd <2 x double> [[TMP1]], [[TMP2]]
1621; SSE2-NEXT:    [[TMP4:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
1622; SSE2-NEXT:    ret <4 x double> [[TMP4]]
1623;
1624; SSE4-LABEL: @add_v4f64_01uu(
1625; SSE4-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> <i32 0, i32 4, i32 poison, i32 poison>
1626; SSE4-NEXT:    [[TMP4:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> <i32 1, i32 5, i32 poison, i32 poison>
1627; SSE4-NEXT:    [[TMP3:%.*]] = fadd <4 x double> [[TMP2]], [[TMP4]]
1628; SSE4-NEXT:    ret <4 x double> [[TMP3]]
1629;
1630; AVX-LABEL: @add_v4f64_01uu(
1631; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> <i32 0, i32 4, i32 poison, i32 poison>
1632; AVX-NEXT:    [[TMP4:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> <i32 1, i32 5, i32 poison, i32 poison>
1633; AVX-NEXT:    [[TMP3:%.*]] = fadd <4 x double> [[TMP2]], [[TMP4]]
1634; AVX-NEXT:    ret <4 x double> [[TMP3]]
1635;
1636  %a0 = extractelement <4 x double> %a, i32 0
1637  %a1 = extractelement <4 x double> %a, i32 1
1638  %a2 = extractelement <4 x double> %a, i32 2
1639  %a3 = extractelement <4 x double> %a, i32 3
1640  %a01 = fadd double %a0, %a1
1641  %a23 = fadd double %a2, %a3
1642  %b0 = extractelement <4 x double> %b, i32 0
1643  %b1 = extractelement <4 x double> %b, i32 1
1644  %b2 = extractelement <4 x double> %b, i32 2
1645  %b3 = extractelement <4 x double> %b, i32 3
1646  %b01 = fadd double %b0, %b1
1647  %b23 = fadd double %b2, %b3
1648  %hadd0 = insertelement <4 x double> poison, double %a01, i32 0
1649  %hadd1 = insertelement <4 x double> %hadd0, double %b01, i32 1
1650  %hadd2 = insertelement <4 x double> %hadd1, double %a23, i32 2
1651  %hadd3 = insertelement <4 x double> %hadd2, double %b23, i32 3
  ; All mask indices are < 4, so only %hadd3 is read; the %a operand is unused.
1652  %result = shufflevector <4 x double> %hadd3, <4 x double> %a, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
1653  ret <4 x double> %result
1654}
1655
; add_v4f64_32u0 - horizontal fadd with permuted result lanes and lane 2 undemanded.
; Unlike the neighbouring tests, the scalar sums are inserted in the order
; <a0+a1, a2+a3, b0+b1, b2+b3>. The final shuffle mask <3, 2, poison, 0> then
; yields <b2+b3, b0+b1, poison, a0+a1>; the a2+a3 sum is never observed.
; Expected output per run line group (see the autogenerated checks below):
;  - SSE2 keeps a0+a1 as a scalar fadd, does a <2 x double> fadd on two shuffles
;    of b, widens it and inserts the scalar into lane 3.
;  - SSE4 keeps a0+a1 as a scalar fadd, does a <4 x double> fadd on two shuffles
;    of b and inserts the scalar into lane 3.
;  - AVX folds everything into two shuffles of (b, a) feeding one <4 x double> fadd.
1656define <4 x double> @add_v4f64_32u0(<4 x double> %a, <4 x double> %b) {
1657; SSE2-LABEL: @add_v4f64_32u0(
1658; SSE2-NEXT:    [[A0:%.*]] = extractelement <4 x double> [[A:%.*]], i64 0
1659; SSE2-NEXT:    [[A1:%.*]] = extractelement <4 x double> [[A]], i64 1
1660; SSE2-NEXT:    [[A01:%.*]] = fadd double [[A0]], [[A1]]
1661; SSE2-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[B:%.*]], <4 x double> poison, <2 x i32> <i32 2, i32 0>
1662; SSE2-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[B]], <4 x double> poison, <2 x i32> <i32 3, i32 1>
1663; SSE2-NEXT:    [[TMP3:%.*]] = fadd <2 x double> [[TMP1]], [[TMP2]]
1664; SSE2-NEXT:    [[TMP4:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
1665; SSE2-NEXT:    [[RESULT:%.*]] = insertelement <4 x double> [[TMP4]], double [[A01]], i64 3
1666; SSE2-NEXT:    ret <4 x double> [[RESULT]]
1667;
1668; SSE4-LABEL: @add_v4f64_32u0(
1669; SSE4-NEXT:    [[A0:%.*]] = extractelement <4 x double> [[A:%.*]], i64 0
1670; SSE4-NEXT:    [[A1:%.*]] = extractelement <4 x double> [[A]], i64 1
1671; SSE4-NEXT:    [[A01:%.*]] = fadd double [[A0]], [[A1]]
1672; SSE4-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[B:%.*]], <4 x double> poison, <4 x i32> <i32 2, i32 0, i32 poison, i32 poison>
1673; SSE4-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[B]], <4 x double> poison, <4 x i32> <i32 3, i32 1, i32 poison, i32 poison>
1674; SSE4-NEXT:    [[RESULT:%.*]] = fadd <4 x double> [[TMP1]], [[TMP2]]
1675; SSE4-NEXT:    [[RESULT1:%.*]] = insertelement <4 x double> [[RESULT]], double [[A01]], i64 3
1676; SSE4-NEXT:    ret <4 x double> [[RESULT1]]
1677;
1678; AVX-LABEL: @add_v4f64_32u0(
1679; AVX-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[B:%.*]], <4 x double> [[A:%.*]], <4 x i32> <i32 2, i32 0, i32 poison, i32 4>
1680; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[B]], <4 x double> [[A]], <4 x i32> <i32 3, i32 1, i32 poison, i32 5>
1681; AVX-NEXT:    [[RESULT:%.*]] = fadd <4 x double> [[TMP1]], [[TMP2]]
1682; AVX-NEXT:    ret <4 x double> [[RESULT]]
1683;
1684  %a0 = extractelement <4 x double> %a, i32 0
1685  %a1 = extractelement <4 x double> %a, i32 1
1686  %a2 = extractelement <4 x double> %a, i32 2
1687  %a3 = extractelement <4 x double> %a, i32 3
1688  %a01 = fadd double %a0, %a1
1689  %a23 = fadd double %a2, %a3
1690  %b0 = extractelement <4 x double> %b, i32 0
1691  %b1 = extractelement <4 x double> %b, i32 1
1692  %b2 = extractelement <4 x double> %b, i32 2
1693  %b3 = extractelement <4 x double> %b, i32 3
1694  %b01 = fadd double %b0, %b1
1695  %b23 = fadd double %b2, %b3
  ; Insert order here is a01, a23, b01, b23 (both a-sums first, then both b-sums).
1696  %hadd0 = insertelement <4 x double> poison, double %a01, i32 0
1697  %hadd1 = insertelement <4 x double> %hadd0, double %a23, i32 1
1698  %hadd2 = insertelement <4 x double> %hadd1, double %b01, i32 2
1699  %hadd3 = insertelement <4 x double> %hadd2, double %b23, i32 3
  ; All mask indices are < 4, so only %hadd3 is read; the %a operand is unused.
1700  %result = shufflevector <4 x double> %hadd3, <4 x double> %a, <4 x i32> <i32 3, i32 2, i32 poison, i32 0>
1701  ret <4 x double> %result
1702}
1703