xref: /llvm-project/llvm/test/Transforms/PhaseOrdering/X86/hsub.ll (revision 88e00141f81c4dfd48bca58fda15d078a138b586)
1; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
2; RUN: opt < %s -O3 -S -mtriple=x86_64-- -mcpu=x86-64    | FileCheck %s --check-prefixes=CHECK,SSE2
3; RUN: opt < %s -O3 -S -mtriple=x86_64-- -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=CHECK,SSE4
4; RUN: opt < %s -O3 -S -mtriple=x86_64-- -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2
5; RUN: opt < %s -O3 -S -mtriple=x86_64-- -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=CHECK,AVX,AVX512
6; RUN: opt < %s -passes="default<O3>" -S -mtriple=x86_64-- -mcpu=x86-64    | FileCheck %s --check-prefixes=CHECK,SSE2
7; RUN: opt < %s -passes="default<O3>" -S -mtriple=x86_64-- -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=CHECK,SSE4
8; RUN: opt < %s -passes="default<O3>" -S -mtriple=x86_64-- -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2
9; RUN: opt < %s -passes="default<O3>" -S -mtriple=x86_64-- -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=CHECK,AVX,AVX512
10
11; PR34072 - failure to canonicalize to (sub (shuffle a, b),(shuffle a, b)) for optimal horizontal sub patterns (with undemanded elements)
12
13;
14; v8i16
15;
16
; Horizontal-sub pattern on <8 x i16> with every result lane demanded.
; Both inputs are fully scalarized, adjacent lanes are subtracted (even lane
; minus odd lane), and the differences are reassembled with %a's pairs in
; lanes 0-3 and %b's pairs in lanes 4-7.
; The shared CHECK prefix expects the whole sequence to become two
; shufflevectors (even lanes / odd lanes of %a,%b) feeding one vector sub.
17define <8 x i16> @sub_v8i16_01234567(<8 x i16> %a, <8 x i16> %b) {
18; CHECK-LABEL: @sub_v8i16_01234567(
19; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
20; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
21; CHECK-NEXT:    [[TMP3:%.*]] = sub <8 x i16> [[TMP1]], [[TMP2]]
22; CHECK-NEXT:    ret <8 x i16> [[TMP3]]
23;
24  %a0 = extractelement <8 x i16> %a, i32 0
25  %a1 = extractelement <8 x i16> %a, i32 1
26  %a2 = extractelement <8 x i16> %a, i32 2
27  %a3 = extractelement <8 x i16> %a, i32 3
28  %a4 = extractelement <8 x i16> %a, i32 4
29  %a5 = extractelement <8 x i16> %a, i32 5
30  %a6 = extractelement <8 x i16> %a, i32 6
31  %a7 = extractelement <8 x i16> %a, i32 7
32  %a01 = sub i16 %a0, %a1
33  %a23 = sub i16 %a2, %a3
34  %a45 = sub i16 %a4, %a5
35  %a67 = sub i16 %a6, %a7
36  %b0 = extractelement <8 x i16> %b, i32 0
37  %b1 = extractelement <8 x i16> %b, i32 1
38  %b2 = extractelement <8 x i16> %b, i32 2
39  %b3 = extractelement <8 x i16> %b, i32 3
40  %b4 = extractelement <8 x i16> %b, i32 4
41  %b5 = extractelement <8 x i16> %b, i32 5
42  %b6 = extractelement <8 x i16> %b, i32 6
43  %b7 = extractelement <8 x i16> %b, i32 7
44  %b01 = sub i16 %b0, %b1
45  %b23 = sub i16 %b2, %b3
46  %b45 = sub i16 %b4, %b5
47  %b67 = sub i16 %b6, %b7
48  %hsub0 = insertelement <8 x i16> poison, i16 %a01, i32 0
49  %hsub1 = insertelement <8 x i16> %hsub0, i16 %a23, i32 1
50  %hsub2 = insertelement <8 x i16> %hsub1, i16 %a45, i32 2
51  %hsub3 = insertelement <8 x i16> %hsub2, i16 %a67, i32 3
52  %hsub4 = insertelement <8 x i16> %hsub3, i16 %b01, i32 4
53  %hsub5 = insertelement <8 x i16> %hsub4, i16 %b23, i32 5
54  %hsub6 = insertelement <8 x i16> %hsub5, i16 %b45, i32 6
55  %hsub7 = insertelement <8 x i16> %hsub6, i16 %b67, i32 7
  ; Identity mask: all eight lanes of %hsub7 are demanded; no index selects from %a.
56  %result = shufflevector <8 x i16> %hsub7, <8 x i16> %a, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
57  ret <8 x i16> %result
58}
59
; Same <8 x i16> horizontal-sub construction as the fully-demanded case, but
; the final shuffle leaves result lane 0 as poison (undemanded).
; Expected output differs per prefix: SSE4 and AVX expect a single pair of
; shufflevectors (lane 0 poison) feeding one vector sub, while the SSE2
; expectations record a longer sequence of partial shuffles and subs.
60define <8 x i16> @sub_v8i16_u1234567(<8 x i16> %a, <8 x i16> %b) {
61; SSE2-LABEL: @sub_v8i16_u1234567(
62; SSE2-NEXT:    [[SHIFT3:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> poison, <8 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 7, i32 poison>
63; SSE2-NEXT:    [[TMP6:%.*]] = sub <8 x i16> [[A]], [[SHIFT3]]
64; SSE2-NEXT:    [[TMP4:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <8 x i32> <i32 poison, i32 2, i32 4, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
65; SSE2-NEXT:    [[TMP5:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <8 x i32> <i32 poison, i32 3, i32 5, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
66; SSE2-NEXT:    [[HSUB22:%.*]] = sub <8 x i16> [[TMP4]], [[TMP5]]
67; SSE2-NEXT:    [[HSUB3:%.*]] = shufflevector <8 x i16> [[HSUB22]], <8 x i16> [[TMP6]], <8 x i32> <i32 poison, i32 1, i32 2, i32 14, i32 poison, i32 poison, i32 poison, i32 poison>
68; SSE2-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 poison, i32 poison, i32 poison, i32 poison>
69; SSE2-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i16> [[B]], <8 x i16> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 poison, i32 poison, i32 poison, i32 poison>
70; SSE2-NEXT:    [[TMP3:%.*]] = sub <8 x i16> [[TMP1]], [[TMP2]]
71; SSE2-NEXT:    [[RESULT:%.*]] = shufflevector <8 x i16> [[HSUB3]], <8 x i16> [[TMP3]], <8 x i32> <i32 poison, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
72; SSE2-NEXT:    ret <8 x i16> [[RESULT]]
73;
74; SSE4-LABEL: @sub_v8i16_u1234567(
75; SSE4-NEXT:    [[TMP5:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i32> <i32 poison, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
76; SSE4-NEXT:    [[TMP6:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> <i32 poison, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
77; SSE4-NEXT:    [[TMP7:%.*]] = sub <8 x i16> [[TMP5]], [[TMP6]]
78; SSE4-NEXT:    ret <8 x i16> [[TMP7]]
79;
80; AVX-LABEL: @sub_v8i16_u1234567(
81; AVX-NEXT:    [[TMP5:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i32> <i32 poison, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
82; AVX-NEXT:    [[TMP6:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> <i32 poison, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
83; AVX-NEXT:    [[TMP7:%.*]] = sub <8 x i16> [[TMP5]], [[TMP6]]
84; AVX-NEXT:    ret <8 x i16> [[TMP7]]
85;
86  %a0 = extractelement <8 x i16> %a, i32 0
87  %a1 = extractelement <8 x i16> %a, i32 1
88  %a2 = extractelement <8 x i16> %a, i32 2
89  %a3 = extractelement <8 x i16> %a, i32 3
90  %a4 = extractelement <8 x i16> %a, i32 4
91  %a5 = extractelement <8 x i16> %a, i32 5
92  %a6 = extractelement <8 x i16> %a, i32 6
93  %a7 = extractelement <8 x i16> %a, i32 7
94  %a01 = sub i16 %a0, %a1
95  %a23 = sub i16 %a2, %a3
96  %a45 = sub i16 %a4, %a5
97  %a67 = sub i16 %a6, %a7
98  %b0 = extractelement <8 x i16> %b, i32 0
99  %b1 = extractelement <8 x i16> %b, i32 1
100  %b2 = extractelement <8 x i16> %b, i32 2
101  %b3 = extractelement <8 x i16> %b, i32 3
102  %b4 = extractelement <8 x i16> %b, i32 4
103  %b5 = extractelement <8 x i16> %b, i32 5
104  %b6 = extractelement <8 x i16> %b, i32 6
105  %b7 = extractelement <8 x i16> %b, i32 7
106  %b01 = sub i16 %b0, %b1
107  %b23 = sub i16 %b2, %b3
108  %b45 = sub i16 %b4, %b5
109  %b67 = sub i16 %b6, %b7
110  %hsub0 = insertelement <8 x i16> poison, i16 %a01, i32 0
111  %hsub1 = insertelement <8 x i16> %hsub0, i16 %a23, i32 1
112  %hsub2 = insertelement <8 x i16> %hsub1, i16 %a45, i32 2
113  %hsub3 = insertelement <8 x i16> %hsub2, i16 %a67, i32 3
114  %hsub4 = insertelement <8 x i16> %hsub3, i16 %b01, i32 4
115  %hsub5 = insertelement <8 x i16> %hsub4, i16 %b23, i32 5
116  %hsub6 = insertelement <8 x i16> %hsub5, i16 %b45, i32 6
117  %hsub7 = insertelement <8 x i16> %hsub6, i16 %b67, i32 7
  ; Mask index 0 is poison, so %a01 (lane 0 of %hsub7) is never demanded by the result.
118  %result = shufflevector <8 x i16> %hsub7, <8 x i16> %a, <8 x i32> <i32 poison, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
119  ret <8 x i16> %result
120}
121
; <8 x i16> horizontal-sub construction whose final shuffle reverses the lane
; order and leaves result lane 2 as poison (mask <7, 6, poison, 4, 3, 2, 1, 0>).
; None of the checked prefixes expect a single vector sub here: the SSE2
; expectations contain four subs, and the SSE4 / AVX expectations contain two
; subs followed by a recombining shufflevector.
122define <8 x i16> @sub_v8i16_76u43210(<8 x i16> %a, <8 x i16> %b) {
123; SSE2-LABEL: @sub_v8i16_76u43210(
124; SSE2-NEXT:    [[SHIFT:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> poison, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
125; SSE2-NEXT:    [[TMP1:%.*]] = sub <8 x i16> [[A]], [[SHIFT]]
126; SSE2-NEXT:    [[SHIFT2:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> poison, <8 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 5, i32 poison, i32 poison, i32 poison>
127; SSE2-NEXT:    [[TMP2:%.*]] = sub <8 x i16> [[B]], [[SHIFT2]]
128; SSE2-NEXT:    [[SHIFT3:%.*]] = shufflevector <8 x i16> [[B]], <8 x i16> poison, <8 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 6>
129; SSE2-NEXT:    [[TMP3:%.*]] = sub <8 x i16> [[SHIFT3]], [[B]]
130; SSE2-NEXT:    [[TMP4:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> <i32 2, i32 4, i32 6, i32 8, i32 poison, i32 poison, i32 poison, i32 poison>
131; SSE2-NEXT:    [[TMP5:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> <i32 3, i32 5, i32 7, i32 9, i32 poison, i32 poison, i32 poison, i32 poison>
132; SSE2-NEXT:    [[TMP6:%.*]] = sub <8 x i16> [[TMP4]], [[TMP5]]
133; SSE2-NEXT:    [[HSUB41:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP6]], <8 x i32> <i32 0, i32 8, i32 9, i32 10, i32 11, i32 poison, i32 poison, i32 poison>
134; SSE2-NEXT:    [[HSUB6:%.*]] = shufflevector <8 x i16> [[HSUB41]], <8 x i16> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 poison, i32 12, i32 poison>
135; SSE2-NEXT:    [[RESULT:%.*]] = shufflevector <8 x i16> [[TMP3]], <8 x i16> [[HSUB6]], <8 x i32> <i32 7, i32 14, i32 poison, i32 12, i32 11, i32 10, i32 9, i32 8>
136; SSE2-NEXT:    ret <8 x i16> [[RESULT]]
137;
138; SSE4-LABEL: @sub_v8i16_76u43210(
139; SSE4-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
140; SSE4-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
141; SSE4-NEXT:    [[HSUB22:%.*]] = sub <8 x i16> [[TMP1]], [[TMP2]]
142; SSE4-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B:%.*]], <8 x i32> <i32 6, i32 8, i32 poison, i32 12, i32 14, i32 poison, i32 poison, i32 poison>
143; SSE4-NEXT:    [[TMP4:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> <i32 7, i32 9, i32 poison, i32 13, i32 15, i32 poison, i32 poison, i32 poison>
144; SSE4-NEXT:    [[TMP5:%.*]] = sub <8 x i16> [[TMP3]], [[TMP4]]
145; SSE4-NEXT:    [[RESULT:%.*]] = shufflevector <8 x i16> [[TMP5]], <8 x i16> [[HSUB22]], <8 x i32> <i32 4, i32 3, i32 poison, i32 1, i32 0, i32 10, i32 9, i32 8>
146; SSE4-NEXT:    ret <8 x i16> [[RESULT]]
147;
148; AVX-LABEL: @sub_v8i16_76u43210(
149; AVX-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
150; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
151; AVX-NEXT:    [[HSUB22:%.*]] = sub <8 x i16> [[TMP1]], [[TMP2]]
152; AVX-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B:%.*]], <8 x i32> <i32 6, i32 8, i32 poison, i32 12, i32 14, i32 poison, i32 poison, i32 poison>
153; AVX-NEXT:    [[TMP4:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> <i32 7, i32 9, i32 poison, i32 13, i32 15, i32 poison, i32 poison, i32 poison>
154; AVX-NEXT:    [[TMP5:%.*]] = sub <8 x i16> [[TMP3]], [[TMP4]]
155; AVX-NEXT:    [[RESULT:%.*]] = shufflevector <8 x i16> [[TMP5]], <8 x i16> [[HSUB22]], <8 x i32> <i32 4, i32 3, i32 poison, i32 1, i32 0, i32 10, i32 9, i32 8>
156; AVX-NEXT:    ret <8 x i16> [[RESULT]]
157;
158  %a0 = extractelement <8 x i16> %a, i32 0
159  %a1 = extractelement <8 x i16> %a, i32 1
160  %a2 = extractelement <8 x i16> %a, i32 2
161  %a3 = extractelement <8 x i16> %a, i32 3
162  %a4 = extractelement <8 x i16> %a, i32 4
163  %a5 = extractelement <8 x i16> %a, i32 5
164  %a6 = extractelement <8 x i16> %a, i32 6
165  %a7 = extractelement <8 x i16> %a, i32 7
166  %a01 = sub i16 %a0, %a1
167  %a23 = sub i16 %a2, %a3
168  %a45 = sub i16 %a4, %a5
169  %a67 = sub i16 %a6, %a7
170  %b0 = extractelement <8 x i16> %b, i32 0
171  %b1 = extractelement <8 x i16> %b, i32 1
172  %b2 = extractelement <8 x i16> %b, i32 2
173  %b3 = extractelement <8 x i16> %b, i32 3
174  %b4 = extractelement <8 x i16> %b, i32 4
175  %b5 = extractelement <8 x i16> %b, i32 5
176  %b6 = extractelement <8 x i16> %b, i32 6
177  %b7 = extractelement <8 x i16> %b, i32 7
178  %b01 = sub i16 %b0, %b1
179  %b23 = sub i16 %b2, %b3
180  %b45 = sub i16 %b4, %b5
181  %b67 = sub i16 %b6, %b7
182  %hsub0 = insertelement <8 x i16> poison, i16 %a01, i32 0
183  %hsub1 = insertelement <8 x i16> %hsub0, i16 %a23, i32 1
184  %hsub2 = insertelement <8 x i16> %hsub1, i16 %a45, i32 2
185  %hsub3 = insertelement <8 x i16> %hsub2, i16 %a67, i32 3
186  %hsub4 = insertelement <8 x i16> %hsub3, i16 %b01, i32 4
187  %hsub5 = insertelement <8 x i16> %hsub4, i16 %b23, i32 5
188  %hsub6 = insertelement <8 x i16> %hsub5, i16 %b45, i32 6
189  %hsub7 = insertelement <8 x i16> %hsub6, i16 %b67, i32 7
  ; Reversing mask with index 2 poison: lane 5 of %hsub7 (%b23) is never selected.
190  %result = shufflevector <8 x i16> %hsub7, <8 x i16> %a, <8 x i32> <i32 7, i32 6, i32 poison, i32 4, i32 3, i32 2, i32 1, i32 0>
191  ret <8 x i16> %result
192}
193
194;
195; v16i16
196;
197
; Horizontal-sub pattern on <16 x i16> with every result lane demanded.
; Adjacent lanes of each input are subtracted (even lane minus odd lane) and
; the differences are placed as: lanes 0-3 from %a[0..7], lanes 4-7 from
; %b[0..7], lanes 8-11 from %a[8..15], lanes 12-15 from %b[8..15].
; The shared CHECK prefix expects two shufflevectors (even / odd source lanes
; in that same interleaved order) feeding one vector sub.
198define <16 x i16> @sub_v16i16_0123456789ABCDEF(<16 x i16> %a, <16 x i16> %b) {
199; CHECK-LABEL: @sub_v16i16_0123456789ABCDEF(
200; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22, i32 8, i32 10, i32 12, i32 14, i32 24, i32 26, i32 28, i32 30>
201; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 17, i32 19, i32 21, i32 23, i32 9, i32 11, i32 13, i32 15, i32 25, i32 27, i32 29, i32 31>
202; CHECK-NEXT:    [[TMP3:%.*]] = sub <16 x i16> [[TMP1]], [[TMP2]]
203; CHECK-NEXT:    ret <16 x i16> [[TMP3]]
204;
205  %a0 = extractelement <16 x i16> %a, i32 0
206  %a1 = extractelement <16 x i16> %a, i32 1
207  %a2 = extractelement <16 x i16> %a, i32 2
208  %a3 = extractelement <16 x i16> %a, i32 3
209  %a4 = extractelement <16 x i16> %a, i32 4
210  %a5 = extractelement <16 x i16> %a, i32 5
211  %a6 = extractelement <16 x i16> %a, i32 6
212  %a7 = extractelement <16 x i16> %a, i32 7
213  %a8 = extractelement <16 x i16> %a, i32 8
214  %a9 = extractelement <16 x i16> %a, i32 9
215  %aA = extractelement <16 x i16> %a, i32 10
216  %aB = extractelement <16 x i16> %a, i32 11
217  %aC = extractelement <16 x i16> %a, i32 12
218  %aD = extractelement <16 x i16> %a, i32 13
219  %aE = extractelement <16 x i16> %a, i32 14
220  %aF = extractelement <16 x i16> %a, i32 15
221  %a01 = sub i16 %a0, %a1
222  %a23 = sub i16 %a2, %a3
223  %a45 = sub i16 %a4, %a5
224  %a67 = sub i16 %a6, %a7
225  %a89 = sub i16 %a8, %a9
226  %aAB = sub i16 %aA, %aB
227  %aCD = sub i16 %aC, %aD
228  %aEF = sub i16 %aE, %aF
229  %b0 = extractelement <16 x i16> %b, i32 0
230  %b1 = extractelement <16 x i16> %b, i32 1
231  %b2 = extractelement <16 x i16> %b, i32 2
232  %b3 = extractelement <16 x i16> %b, i32 3
233  %b4 = extractelement <16 x i16> %b, i32 4
234  %b5 = extractelement <16 x i16> %b, i32 5
235  %b6 = extractelement <16 x i16> %b, i32 6
236  %b7 = extractelement <16 x i16> %b, i32 7
237  %b8 = extractelement <16 x i16> %b, i32 8
238  %b9 = extractelement <16 x i16> %b, i32 9
239  %bA = extractelement <16 x i16> %b, i32 10
240  %bB = extractelement <16 x i16> %b, i32 11
241  %bC = extractelement <16 x i16> %b, i32 12
242  %bD = extractelement <16 x i16> %b, i32 13
243  %bE = extractelement <16 x i16> %b, i32 14
244  %bF = extractelement <16 x i16> %b, i32 15
245  %b01 = sub i16 %b0, %b1
246  %b23 = sub i16 %b2, %b3
247  %b45 = sub i16 %b4, %b5
248  %b67 = sub i16 %b6, %b7
249  %b89 = sub i16 %b8, %b9
250  %bAB = sub i16 %bA, %bB
251  %bCD = sub i16 %bC, %bD
252  %bEF = sub i16 %bE, %bF
253  %hsub0 = insertelement <16 x i16> poison, i16 %a01, i32 0
254  %hsub1 = insertelement <16 x i16> %hsub0, i16 %a23, i32 1
255  %hsub2 = insertelement <16 x i16> %hsub1, i16 %a45, i32 2
256  %hsub3 = insertelement <16 x i16> %hsub2, i16 %a67, i32 3
257  %hsub4 = insertelement <16 x i16> %hsub3, i16 %b01, i32 4
258  %hsub5 = insertelement <16 x i16> %hsub4, i16 %b23, i32 5
259  %hsub6 = insertelement <16 x i16> %hsub5, i16 %b45, i32 6
260  %hsub7 = insertelement <16 x i16> %hsub6, i16 %b67, i32 7
261  %hsub8 = insertelement <16 x i16> %hsub7, i16 %a89, i32 8
262  %hsub9 = insertelement <16 x i16> %hsub8, i16 %aAB, i32 9
263  %hsubA = insertelement <16 x i16> %hsub9, i16 %aCD, i32 10
264  %hsubB = insertelement <16 x i16> %hsubA, i16 %aEF, i32 11
265  %hsubC = insertelement <16 x i16> %hsubB, i16 %b89, i32 12
266  %hsubD = insertelement <16 x i16> %hsubC, i16 %bAB, i32 13
267  %hsubE = insertelement <16 x i16> %hsubD, i16 %bCD, i32 14
268  %hsubF = insertelement <16 x i16> %hsubE, i16 %bEF, i32 15
  ; Identity mask: all sixteen lanes of %hsubF are demanded; no index selects from %a.
269  %result = shufflevector <16 x i16> %hsubF, <16 x i16> %a, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
270  ret <16 x i16> %result
271}
272
; Same <16 x i16> horizontal-sub construction as the fully-demanded case, but
; the final shuffle leaves result lanes 4 and 10 as poison (undemanded).
; Expected output differs per prefix: AVX expects one pair of shufflevectors
; (with poison at lanes 4 and 10) feeding a single vector sub, while the SSE2
; and SSE4 expectations still contain scalar extractelement / sub /
; insertelement sequences alongside partial vector subs.
273define <16 x i16> @sub_v16i16_0123u56789uBCDEF(<16 x i16> %a, <16 x i16> %b) {
274; SSE2-LABEL: @sub_v16i16_0123u56789uBCDEF(
275; SSE2-NEXT:    [[A8:%.*]] = extractelement <16 x i16> [[A:%.*]], i64 8
276; SSE2-NEXT:    [[A9:%.*]] = extractelement <16 x i16> [[A]], i64 9
277; SSE2-NEXT:    [[A89:%.*]] = sub i16 [[A8]], [[A9]]
278; SSE2-NEXT:    [[BC:%.*]] = extractelement <16 x i16> [[B:%.*]], i64 12
279; SSE2-NEXT:    [[BD:%.*]] = extractelement <16 x i16> [[B]], i64 13
280; SSE2-NEXT:    [[BE:%.*]] = extractelement <16 x i16> [[B]], i64 14
281; SSE2-NEXT:    [[BF:%.*]] = extractelement <16 x i16> [[B]], i64 15
282; SSE2-NEXT:    [[BCD:%.*]] = sub i16 [[BC]], [[BD]]
283; SSE2-NEXT:    [[BEF:%.*]] = sub i16 [[BE]], [[BF]]
284; SSE2-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 poison, i32 18, i32 20, i32 22, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
285; SSE2-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 poison, i32 19, i32 21, i32 23, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
286; SSE2-NEXT:    [[TMP3:%.*]] = sub <16 x i16> [[TMP1]], [[TMP2]]
287; SSE2-NEXT:    [[HSUB8:%.*]] = insertelement <16 x i16> [[TMP3]], i16 [[A89]], i64 8
288; SSE2-NEXT:    [[TMP4:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32> <i32 10, i32 poison, i32 14, i32 24, i32 26, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
289; SSE2-NEXT:    [[TMP5:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32> <i32 11, i32 poison, i32 15, i32 25, i32 27, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
290; SSE2-NEXT:    [[TMP6:%.*]] = sub <16 x i16> [[TMP4]], [[TMP5]]
291; SSE2-NEXT:    [[HSUBD1:%.*]] = shufflevector <16 x i16> [[HSUB8]], <16 x i16> [[TMP6]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 5, i32 6, i32 7, i32 8, i32 16, i32 poison, i32 18, i32 19, i32 20, i32 poison, i32 poison>
292; SSE2-NEXT:    [[HSUBE:%.*]] = insertelement <16 x i16> [[HSUBD1]], i16 [[BCD]], i64 14
293; SSE2-NEXT:    [[HSUBF:%.*]] = insertelement <16 x i16> [[HSUBE]], i16 [[BEF]], i64 15
294; SSE2-NEXT:    [[RESULT:%.*]] = shufflevector <16 x i16> [[HSUBF]], <16 x i16> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 5, i32 6, i32 7, i32 8, i32 9, i32 poison, i32 11, i32 12, i32 13, i32 14, i32 15>
295; SSE2-NEXT:    ret <16 x i16> [[RESULT]]
296;
297; SSE4-LABEL: @sub_v16i16_0123u56789uBCDEF(
298; SSE4-NEXT:    [[A8:%.*]] = extractelement <16 x i16> [[A:%.*]], i64 8
299; SSE4-NEXT:    [[A9:%.*]] = extractelement <16 x i16> [[A]], i64 9
300; SSE4-NEXT:    [[A89:%.*]] = sub i16 [[A8]], [[A9]]
301; SSE4-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B:%.*]], <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 poison, i32 18, i32 20, i32 22, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
302; SSE4-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 poison, i32 19, i32 21, i32 23, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
303; SSE4-NEXT:    [[TMP3:%.*]] = sub <16 x i16> [[TMP1]], [[TMP2]]
304; SSE4-NEXT:    [[HSUB8:%.*]] = insertelement <16 x i16> [[TMP3]], i16 [[A89]], i64 8
305; SSE4-NEXT:    [[TMP4:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> poison, <16 x i32> <i32 10, i32 poison, i32 14, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
306; SSE4-NEXT:    [[TMP5:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> poison, <16 x i32> <i32 11, i32 poison, i32 15, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
307; SSE4-NEXT:    [[TMP6:%.*]] = sub <16 x i16> [[TMP4]], [[TMP5]]
308; SSE4-NEXT:    [[HSUBB2:%.*]] = shufflevector <16 x i16> [[HSUB8]], <16 x i16> [[TMP6]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 5, i32 6, i32 7, i32 8, i32 16, i32 poison, i32 18, i32 poison, i32 poison, i32 poison, i32 poison>
309; SSE4-NEXT:    [[TMP7:%.*]] = shufflevector <16 x i16> [[B]], <16 x i16> poison, <16 x i32> <i32 8, i32 10, i32 12, i32 14, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
310; SSE4-NEXT:    [[TMP8:%.*]] = shufflevector <16 x i16> [[B]], <16 x i16> poison, <16 x i32> <i32 9, i32 11, i32 13, i32 15, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
311; SSE4-NEXT:    [[TMP9:%.*]] = sub <16 x i16> [[TMP7]], [[TMP8]]
312; SSE4-NEXT:    [[RESULT:%.*]] = shufflevector <16 x i16> [[HSUBB2]], <16 x i16> [[TMP9]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 5, i32 6, i32 7, i32 8, i32 9, i32 poison, i32 11, i32 16, i32 17, i32 18, i32 19>
313; SSE4-NEXT:    ret <16 x i16> [[RESULT]]
314;
315; AVX-LABEL: @sub_v16i16_0123u56789uBCDEF(
316; AVX-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 poison, i32 18, i32 20, i32 22, i32 8, i32 10, i32 poison, i32 14, i32 24, i32 26, i32 28, i32 30>
317; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 poison, i32 19, i32 21, i32 23, i32 9, i32 11, i32 poison, i32 15, i32 25, i32 27, i32 29, i32 31>
318; AVX-NEXT:    [[RESULT:%.*]] = sub <16 x i16> [[TMP1]], [[TMP2]]
319; AVX-NEXT:    ret <16 x i16> [[RESULT]]
320;
321  %a0 = extractelement <16 x i16> %a, i32 0
322  %a1 = extractelement <16 x i16> %a, i32 1
323  %a2 = extractelement <16 x i16> %a, i32 2
324  %a3 = extractelement <16 x i16> %a, i32 3
325  %a4 = extractelement <16 x i16> %a, i32 4
326  %a5 = extractelement <16 x i16> %a, i32 5
327  %a6 = extractelement <16 x i16> %a, i32 6
328  %a7 = extractelement <16 x i16> %a, i32 7
329  %a8 = extractelement <16 x i16> %a, i32 8
330  %a9 = extractelement <16 x i16> %a, i32 9
331  %aA = extractelement <16 x i16> %a, i32 10
332  %aB = extractelement <16 x i16> %a, i32 11
333  %aC = extractelement <16 x i16> %a, i32 12
334  %aD = extractelement <16 x i16> %a, i32 13
335  %aE = extractelement <16 x i16> %a, i32 14
336  %aF = extractelement <16 x i16> %a, i32 15
337  %a01 = sub i16 %a0, %a1
338  %a23 = sub i16 %a2, %a3
339  %a45 = sub i16 %a4, %a5
340  %a67 = sub i16 %a6, %a7
341  %a89 = sub i16 %a8, %a9
342  %aAB = sub i16 %aA, %aB
343  %aCD = sub i16 %aC, %aD
344  %aEF = sub i16 %aE, %aF
345  %b0 = extractelement <16 x i16> %b, i32 0
346  %b1 = extractelement <16 x i16> %b, i32 1
347  %b2 = extractelement <16 x i16> %b, i32 2
348  %b3 = extractelement <16 x i16> %b, i32 3
349  %b4 = extractelement <16 x i16> %b, i32 4
350  %b5 = extractelement <16 x i16> %b, i32 5
351  %b6 = extractelement <16 x i16> %b, i32 6
352  %b7 = extractelement <16 x i16> %b, i32 7
353  %b8 = extractelement <16 x i16> %b, i32 8
354  %b9 = extractelement <16 x i16> %b, i32 9
355  %bA = extractelement <16 x i16> %b, i32 10
356  %bB = extractelement <16 x i16> %b, i32 11
357  %bC = extractelement <16 x i16> %b, i32 12
358  %bD = extractelement <16 x i16> %b, i32 13
359  %bE = extractelement <16 x i16> %b, i32 14
360  %bF = extractelement <16 x i16> %b, i32 15
361  %b01 = sub i16 %b0, %b1
362  %b23 = sub i16 %b2, %b3
363  %b45 = sub i16 %b4, %b5
364  %b67 = sub i16 %b6, %b7
365  %b89 = sub i16 %b8, %b9
366  %bAB = sub i16 %bA, %bB
367  %bCD = sub i16 %bC, %bD
368  %bEF = sub i16 %bE, %bF
369  %hsub0 = insertelement <16 x i16> poison, i16 %a01, i32 0
370  %hsub1 = insertelement <16 x i16> %hsub0, i16 %a23, i32 1
371  %hsub2 = insertelement <16 x i16> %hsub1, i16 %a45, i32 2
372  %hsub3 = insertelement <16 x i16> %hsub2, i16 %a67, i32 3
373  %hsub4 = insertelement <16 x i16> %hsub3, i16 %b01, i32 4
374  %hsub5 = insertelement <16 x i16> %hsub4, i16 %b23, i32 5
375  %hsub6 = insertelement <16 x i16> %hsub5, i16 %b45, i32 6
376  %hsub7 = insertelement <16 x i16> %hsub6, i16 %b67, i32 7
377  %hsub8 = insertelement <16 x i16> %hsub7, i16 %a89, i32 8
378  %hsub9 = insertelement <16 x i16> %hsub8, i16 %aAB, i32 9
379  %hsubA = insertelement <16 x i16> %hsub9, i16 %aCD, i32 10
380  %hsubB = insertelement <16 x i16> %hsubA, i16 %aEF, i32 11
381  %hsubC = insertelement <16 x i16> %hsubB, i16 %b89, i32 12
382  %hsubD = insertelement <16 x i16> %hsubC, i16 %bAB, i32 13
383  %hsubE = insertelement <16 x i16> %hsubD, i16 %bCD, i32 14
384  %hsubF = insertelement <16 x i16> %hsubE, i16 %bEF, i32 15
  ; Mask indices 4 and 10 are poison, so %b01 and %aCD are never demanded by the result.
385  %result = shufflevector <16 x i16> %hsubF, <16 x i16> %a, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 5, i32 6, i32 7, i32 8, i32 9, i32 poison, i32 11, i32 12, i32 13, i32 14, i32 15>
386  ret <16 x i16> %result
387}
388
389define <16 x i16> @sub_v16i16_FEuCBA98765432u0(<16 x i16> %a, <16 x i16> %b) {
390; SSE2-LABEL: @sub_v16i16_FEuCBA98765432u0(
391; SSE2-NEXT:    [[A8:%.*]] = extractelement <16 x i16> [[A:%.*]], i64 8
392; SSE2-NEXT:    [[A9:%.*]] = extractelement <16 x i16> [[A]], i64 9
393; SSE2-NEXT:    [[A89:%.*]] = sub i16 [[A8]], [[A9]]
394; SSE2-NEXT:    [[BC:%.*]] = extractelement <16 x i16> [[B:%.*]], i64 12
395; SSE2-NEXT:    [[BD:%.*]] = extractelement <16 x i16> [[B]], i64 13
396; SSE2-NEXT:    [[BE:%.*]] = extractelement <16 x i16> [[B]], i64 14
397; SSE2-NEXT:    [[BF:%.*]] = extractelement <16 x i16> [[B]], i64 15
398; SSE2-NEXT:    [[BCD:%.*]] = sub i16 [[BC]], [[BD]]
399; SSE2-NEXT:    [[BEF:%.*]] = sub i16 [[BE]], [[BF]]
400; SSE2-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32> <i32 0, i32 poison, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
401; SSE2-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32> <i32 1, i32 poison, i32 5, i32 7, i32 17, i32 19, i32 21, i32 23, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
402; SSE2-NEXT:    [[TMP3:%.*]] = sub <16 x i16> [[TMP1]], [[TMP2]]
403; SSE2-NEXT:    [[HSUB8:%.*]] = insertelement <16 x i16> [[TMP3]], i16 [[A89]], i64 8
404; SSE2-NEXT:    [[TMP4:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32> <i32 10, i32 12, i32 14, i32 24, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
405; SSE2-NEXT:    [[TMP5:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32> <i32 11, i32 13, i32 15, i32 25, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
406; SSE2-NEXT:    [[TMP6:%.*]] = sub <16 x i16> [[TMP4]], [[TMP5]]
407; SSE2-NEXT:    [[HSUBC1:%.*]] = shufflevector <16 x i16> [[HSUB8]], <16 x i16> [[TMP6]], <16 x i32> <i32 0, i32 poison, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 16, i32 17, i32 18, i32 19, i32 poison, i32 poison, i32 poison>
408; SSE2-NEXT:    [[HSUBE:%.*]] = insertelement <16 x i16> [[HSUBC1]], i16 [[BCD]], i64 14
409; SSE2-NEXT:    [[HSUBF:%.*]] = insertelement <16 x i16> [[HSUBE]], i16 [[BEF]], i64 15
410; SSE2-NEXT:    [[RESULT:%.*]] = shufflevector <16 x i16> [[HSUBF]], <16 x i16> poison, <16 x i32> <i32 15, i32 14, i32 poison, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 poison, i32 0>
411; SSE2-NEXT:    ret <16 x i16> [[RESULT]]
412;
413; SSE4-LABEL: @sub_v16i16_FEuCBA98765432u0(
414; SSE4-NEXT:    [[A8:%.*]] = extractelement <16 x i16> [[A:%.*]], i64 8
415; SSE4-NEXT:    [[A9:%.*]] = extractelement <16 x i16> [[A]], i64 9
416; SSE4-NEXT:    [[A89:%.*]] = sub i16 [[A8]], [[A9]]
417; SSE4-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B:%.*]], <16 x i32> <i32 0, i32 poison, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
418; SSE4-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32> <i32 1, i32 poison, i32 5, i32 7, i32 17, i32 19, i32 21, i32 23, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
419; SSE4-NEXT:    [[TMP3:%.*]] = sub <16 x i16> [[TMP1]], [[TMP2]]
420; SSE4-NEXT:    [[HSUB8:%.*]] = insertelement <16 x i16> [[TMP3]], i16 [[A89]], i64 8
421; SSE4-NEXT:    [[TMP4:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> poison, <16 x i32> <i32 10, i32 12, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
422; SSE4-NEXT:    [[TMP5:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> poison, <16 x i32> <i32 11, i32 13, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
423; SSE4-NEXT:    [[TMP6:%.*]] = sub <16 x i16> [[TMP4]], [[TMP5]]
424; SSE4-NEXT:    [[HSUBA2:%.*]] = shufflevector <16 x i16> [[HSUB8]], <16 x i16> [[TMP6]], <16 x i32> <i32 0, i32 poison, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 16, i32 17, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
425; SSE4-NEXT:    [[TMP7:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32> <i32 14, i32 24, i32 poison, i32 28, i32 30, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
426; SSE4-NEXT:    [[TMP8:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32> <i32 15, i32 25, i32 poison, i32 29, i32 31, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
427; SSE4-NEXT:    [[TMP9:%.*]] = sub <16 x i16> [[TMP7]], [[TMP8]]
428; SSE4-NEXT:    [[RESULT:%.*]] = shufflevector <16 x i16> [[TMP9]], <16 x i16> [[HSUBA2]], <16 x i32> <i32 4, i32 3, i32 poison, i32 1, i32 0, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 poison, i32 16>
429; SSE4-NEXT:    ret <16 x i16> [[RESULT]]
430;
431; AVX2-LABEL: @sub_v16i16_FEuCBA98765432u0(
432; AVX2-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <16 x i32> <i32 0, i32 poison, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22, i32 8, i32 10, i32 12, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
433; AVX2-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32> <i32 1, i32 poison, i32 5, i32 7, i32 17, i32 19, i32 21, i32 23, i32 9, i32 11, i32 13, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
434; AVX2-NEXT:    [[HSUBA:%.*]] = sub <16 x i16> [[TMP1]], [[TMP2]]
435; AVX2-NEXT:    [[TMP3:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32> <i32 14, i32 24, i32 poison, i32 28, i32 30, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
436; AVX2-NEXT:    [[TMP4:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32> <i32 15, i32 25, i32 poison, i32 29, i32 31, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
437; AVX2-NEXT:    [[TMP5:%.*]] = sub <16 x i16> [[TMP3]], [[TMP4]]
438; AVX2-NEXT:    [[RESULT:%.*]] = shufflevector <16 x i16> [[TMP5]], <16 x i16> [[HSUBA]], <16 x i32> <i32 4, i32 3, i32 poison, i32 1, i32 0, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 poison, i32 16>
439; AVX2-NEXT:    ret <16 x i16> [[RESULT]]
440;
441; AVX512-LABEL: @sub_v16i16_FEuCBA98765432u0(
442; AVX512-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <16 x i32> <i32 0, i32 poison, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22, i32 8, i32 10, i32 12, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
443; AVX512-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32> <i32 1, i32 poison, i32 5, i32 7, i32 17, i32 19, i32 21, i32 23, i32 9, i32 11, i32 13, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
444; AVX512-NEXT:    [[HSUBA2:%.*]] = sub <16 x i16> [[TMP1]], [[TMP2]]
445; AVX512-NEXT:    [[TMP3:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32> <i32 14, i32 24, i32 poison, i32 28, i32 30, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
446; AVX512-NEXT:    [[TMP4:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32> <i32 15, i32 25, i32 poison, i32 29, i32 31, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
447; AVX512-NEXT:    [[TMP5:%.*]] = sub <16 x i16> [[TMP3]], [[TMP4]]
448; AVX512-NEXT:    [[RESULT:%.*]] = shufflevector <16 x i16> [[TMP5]], <16 x i16> [[HSUBA2]], <16 x i32> <i32 4, i32 3, i32 poison, i32 1, i32 0, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 poison, i32 16>
449; AVX512-NEXT:    ret <16 x i16> [[RESULT]]
450;
451  %a0 = extractelement <16 x i16> %a, i32 0
452  %a1 = extractelement <16 x i16> %a, i32 1
453  %a2 = extractelement <16 x i16> %a, i32 2
454  %a3 = extractelement <16 x i16> %a, i32 3
455  %a4 = extractelement <16 x i16> %a, i32 4
456  %a5 = extractelement <16 x i16> %a, i32 5
457  %a6 = extractelement <16 x i16> %a, i32 6
458  %a7 = extractelement <16 x i16> %a, i32 7
459  %a8 = extractelement <16 x i16> %a, i32 8
460  %a9 = extractelement <16 x i16> %a, i32 9
461  %aA = extractelement <16 x i16> %a, i32 10
462  %aB = extractelement <16 x i16> %a, i32 11
463  %aC = extractelement <16 x i16> %a, i32 12
464  %aD = extractelement <16 x i16> %a, i32 13
465  %aE = extractelement <16 x i16> %a, i32 14
466  %aF = extractelement <16 x i16> %a, i32 15
467  %a01 = sub i16 %a0, %a1
468  %a23 = sub i16 %a2, %a3
469  %a45 = sub i16 %a4, %a5
470  %a67 = sub i16 %a6, %a7
471  %a89 = sub i16 %a8, %a9
472  %aAB = sub i16 %aA, %aB
473  %aCD = sub i16 %aC, %aD
474  %aEF = sub i16 %aE, %aF
475  %b0 = extractelement <16 x i16> %b, i32 0
476  %b1 = extractelement <16 x i16> %b, i32 1
477  %b2 = extractelement <16 x i16> %b, i32 2
478  %b3 = extractelement <16 x i16> %b, i32 3
479  %b4 = extractelement <16 x i16> %b, i32 4
480  %b5 = extractelement <16 x i16> %b, i32 5
481  %b6 = extractelement <16 x i16> %b, i32 6
482  %b7 = extractelement <16 x i16> %b, i32 7
483  %b8 = extractelement <16 x i16> %b, i32 8
484  %b9 = extractelement <16 x i16> %b, i32 9
485  %bA = extractelement <16 x i16> %b, i32 10
486  %bB = extractelement <16 x i16> %b, i32 11
487  %bC = extractelement <16 x i16> %b, i32 12
488  %bD = extractelement <16 x i16> %b, i32 13
489  %bE = extractelement <16 x i16> %b, i32 14
490  %bF = extractelement <16 x i16> %b, i32 15
491  %b01 = sub i16 %b0, %b1
492  %b23 = sub i16 %b2, %b3
493  %b45 = sub i16 %b4, %b5
494  %b67 = sub i16 %b6, %b7
495  %b89 = sub i16 %b8, %b9
496  %bAB = sub i16 %bA, %bB
497  %bCD = sub i16 %bC, %bD
498  %bEF = sub i16 %bE, %bF
499  %hsub0 = insertelement <16 x i16> poison, i16 %a01, i32 0
500  %hsub1 = insertelement <16 x i16> %hsub0, i16 %a23, i32 1
501  %hsub2 = insertelement <16 x i16> %hsub1, i16 %a45, i32 2
502  %hsub3 = insertelement <16 x i16> %hsub2, i16 %a67, i32 3
503  %hsub4 = insertelement <16 x i16> %hsub3, i16 %b01, i32 4
504  %hsub5 = insertelement <16 x i16> %hsub4, i16 %b23, i32 5
505  %hsub6 = insertelement <16 x i16> %hsub5, i16 %b45, i32 6
506  %hsub7 = insertelement <16 x i16> %hsub6, i16 %b67, i32 7
507  %hsub8 = insertelement <16 x i16> %hsub7, i16 %a89, i32 8
508  %hsub9 = insertelement <16 x i16> %hsub8, i16 %aAB, i32 9
509  %hsubA = insertelement <16 x i16> %hsub9, i16 %aCD, i32 10
510  %hsubB = insertelement <16 x i16> %hsubA, i16 %aEF, i32 11
511  %hsubC = insertelement <16 x i16> %hsubB, i16 %b89, i32 12
512  %hsubD = insertelement <16 x i16> %hsubC, i16 %bAB, i32 13
513  %hsubE = insertelement <16 x i16> %hsubD, i16 %bCD, i32 14
514  %hsubF = insertelement <16 x i16> %hsubE, i16 %bEF, i32 15
515  %result = shufflevector <16 x i16> %hsubF, <16 x i16> %a, <16 x i32> <i32 15, i32 14, i32 poison, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 poison, i32 0>
516  ret <16 x i16> %result
517}
518
519;
520; v4i32
521;
522
; Baseline <4 x i32> horizontal subtract with every result lane demanded.
; The scalar input builds <a0-a1, a2-a3, b0-b1, b2-b3>; under the common prefix
; (shared by all RUN lines) the expected output is an even-element shuffle, an
; odd-element shuffle of %a/%b, and a single vector sub.
523define <4 x i32> @sub_v4i32_0123(<4 x i32> %a, <4 x i32> %b) {
524; CHECK-LABEL: @sub_v4i32_0123(
525; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 4, i32 6>
526; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> <i32 1, i32 3, i32 5, i32 7>
527; CHECK-NEXT:    [[TMP3:%.*]] = sub <4 x i32> [[TMP1]], [[TMP2]]
528; CHECK-NEXT:    ret <4 x i32> [[TMP3]]
529;
530  %a0 = extractelement <4 x i32> %a, i32 0
531  %a1 = extractelement <4 x i32> %a, i32 1
532  %a2 = extractelement <4 x i32> %a, i32 2
533  %a3 = extractelement <4 x i32> %a, i32 3
534  %a01 = sub i32 %a0, %a1
535  %a23 = sub i32 %a2, %a3
536  %b0 = extractelement <4 x i32> %b, i32 0
537  %b1 = extractelement <4 x i32> %b, i32 1
538  %b2 = extractelement <4 x i32> %b, i32 2
539  %b3 = extractelement <4 x i32> %b, i32 3
540  %b01 = sub i32 %b0, %b1
541  %b23 = sub i32 %b2, %b3
542  %hsub0 = insertelement <4 x i32> poison, i32 %a01, i32 0
543  %hsub1 = insertelement <4 x i32> %hsub0, i32 %a23, i32 1
544  %hsub2 = insertelement <4 x i32> %hsub1, i32 %b01, i32 2
545  %hsub3 = insertelement <4 x i32> %hsub2, i32 %b23, i32 3
  ; Identity mask over %hsub3: the second operand %a is never selected.
546  %result = shufflevector <4 x i32> %hsub3, <4 x i32> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
547  ret <4 x i32> %result
548}
549
; <4 x i32> horizontal subtract where result lane 0 is undemanded (the final
; shuffle mask has poison in element 0). The expected even/odd shuffles carry
; poison in that same position and still feed a single vector sub.
550define <4 x i32> @sub_v4i32_u123(<4 x i32> %a, <4 x i32> %b) {
551; CHECK-LABEL: @sub_v4i32_u123(
552; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> <i32 poison, i32 2, i32 4, i32 6>
553; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> <i32 poison, i32 3, i32 5, i32 7>
554; CHECK-NEXT:    [[TMP4:%.*]] = sub <4 x i32> [[TMP2]], [[TMP3]]
555; CHECK-NEXT:    ret <4 x i32> [[TMP4]]
556;
557  %a0 = extractelement <4 x i32> %a, i32 0
558  %a1 = extractelement <4 x i32> %a, i32 1
559  %a2 = extractelement <4 x i32> %a, i32 2
560  %a3 = extractelement <4 x i32> %a, i32 3
561  %a01 = sub i32 %a0, %a1
562  %a23 = sub i32 %a2, %a3
563  %b0 = extractelement <4 x i32> %b, i32 0
564  %b1 = extractelement <4 x i32> %b, i32 1
565  %b2 = extractelement <4 x i32> %b, i32 2
566  %b3 = extractelement <4 x i32> %b, i32 3
567  %b01 = sub i32 %b0, %b1
568  %b23 = sub i32 %b2, %b3
569  %hsub0 = insertelement <4 x i32> poison, i32 %a01, i32 0
570  %hsub1 = insertelement <4 x i32> %hsub0, i32 %a23, i32 1
571  %hsub2 = insertelement <4 x i32> %hsub1, i32 %b01, i32 2
572  %hsub3 = insertelement <4 x i32> %hsub2, i32 %b23, i32 3
  ; Lane 0 (%a01) is dropped here; lanes 1-3 pass through unchanged.
573  %result = shufflevector <4 x i32> %hsub3, <4 x i32> %a, <4 x i32> <i32 poison, i32 1, i32 2, i32 3>
574  ret <4 x i32> %result
575}
576
; <4 x i32> horizontal subtract where result lane 1 is undemanded (poison in
; element 1 of the final shuffle mask). The expected even/odd shuffles have
; poison in element 1 and are followed by one vector sub.
577define <4 x i32> @sub_v4i32_0u23(<4 x i32> %a, <4 x i32> %b) {
578; CHECK-LABEL: @sub_v4i32_0u23(
579; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> <i32 0, i32 poison, i32 4, i32 6>
580; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> <i32 1, i32 poison, i32 5, i32 7>
581; CHECK-NEXT:    [[TMP4:%.*]] = sub <4 x i32> [[TMP2]], [[TMP3]]
582; CHECK-NEXT:    ret <4 x i32> [[TMP4]]
583;
584  %a0 = extractelement <4 x i32> %a, i32 0
585  %a1 = extractelement <4 x i32> %a, i32 1
586  %a2 = extractelement <4 x i32> %a, i32 2
587  %a3 = extractelement <4 x i32> %a, i32 3
588  %a01 = sub i32 %a0, %a1
589  %a23 = sub i32 %a2, %a3
590  %b0 = extractelement <4 x i32> %b, i32 0
591  %b1 = extractelement <4 x i32> %b, i32 1
592  %b2 = extractelement <4 x i32> %b, i32 2
593  %b3 = extractelement <4 x i32> %b, i32 3
594  %b01 = sub i32 %b0, %b1
595  %b23 = sub i32 %b2, %b3
596  %hsub0 = insertelement <4 x i32> poison, i32 %a01, i32 0
597  %hsub1 = insertelement <4 x i32> %hsub0, i32 %a23, i32 1
598  %hsub2 = insertelement <4 x i32> %hsub1, i32 %b01, i32 2
599  %hsub3 = insertelement <4 x i32> %hsub2, i32 %b23, i32 3
  ; Lane 1 (%a23) is dropped here; lanes 0, 2 and 3 pass through unchanged.
600  %result = shufflevector <4 x i32> %hsub3, <4 x i32> %a, <4 x i32> <i32 0, i32 poison, i32 2, i32 3>
601  ret <4 x i32> %result
602}
603
; <4 x i32> horizontal subtract where result lane 2 is undemanded (poison in
; element 2 of the final shuffle mask). Expectations are listed separately for
; the SSE2, SSE4, AVX2 and AVX512 prefixes, but all four groups below are
; textually the same shuffle/shuffle/sub sequence with poison in element 2.
604define <4 x i32> @sub_v4i32_01u3(<4 x i32> %a, <4 x i32> %b) {
605; SSE2-LABEL: @sub_v4i32_01u3(
606; SSE2-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 poison, i32 6>
607; SSE2-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> <i32 1, i32 3, i32 poison, i32 7>
608; SSE2-NEXT:    [[TMP4:%.*]] = sub <4 x i32> [[TMP2]], [[TMP3]]
609; SSE2-NEXT:    ret <4 x i32> [[TMP4]]
610;
611; SSE4-LABEL: @sub_v4i32_01u3(
612; SSE4-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 poison, i32 6>
613; SSE4-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> <i32 1, i32 3, i32 poison, i32 7>
614; SSE4-NEXT:    [[TMP4:%.*]] = sub <4 x i32> [[TMP2]], [[TMP3]]
615; SSE4-NEXT:    ret <4 x i32> [[TMP4]]
616;
617; AVX2-LABEL: @sub_v4i32_01u3(
618; AVX2-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 poison, i32 6>
619; AVX2-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> <i32 1, i32 3, i32 poison, i32 7>
620; AVX2-NEXT:    [[TMP4:%.*]] = sub <4 x i32> [[TMP2]], [[TMP3]]
621; AVX2-NEXT:    ret <4 x i32> [[TMP4]]
622;
623; AVX512-LABEL: @sub_v4i32_01u3(
624; AVX512-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 poison, i32 6>
625; AVX512-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> <i32 1, i32 3, i32 poison, i32 7>
626; AVX512-NEXT:    [[TMP4:%.*]] = sub <4 x i32> [[TMP2]], [[TMP3]]
627; AVX512-NEXT:    ret <4 x i32> [[TMP4]]
628;
629  %a0 = extractelement <4 x i32> %a, i32 0
630  %a1 = extractelement <4 x i32> %a, i32 1
631  %a2 = extractelement <4 x i32> %a, i32 2
632  %a3 = extractelement <4 x i32> %a, i32 3
633  %a01 = sub i32 %a0, %a1
634  %a23 = sub i32 %a2, %a3
635  %b0 = extractelement <4 x i32> %b, i32 0
636  %b1 = extractelement <4 x i32> %b, i32 1
637  %b2 = extractelement <4 x i32> %b, i32 2
638  %b3 = extractelement <4 x i32> %b, i32 3
639  %b01 = sub i32 %b0, %b1
640  %b23 = sub i32 %b2, %b3
641  %hsub0 = insertelement <4 x i32> poison, i32 %a01, i32 0
642  %hsub1 = insertelement <4 x i32> %hsub0, i32 %a23, i32 1
643  %hsub2 = insertelement <4 x i32> %hsub1, i32 %b01, i32 2
644  %hsub3 = insertelement <4 x i32> %hsub2, i32 %b23, i32 3
  ; Lane 2 (%b01) is dropped here; lanes 0, 1 and 3 pass through unchanged.
645  %result = shufflevector <4 x i32> %hsub3, <4 x i32> %a, <4 x i32> <i32 0, i32 1, i32 poison, i32 3>
646  ret <4 x i32> %result
647}
648
; <4 x i32> horizontal subtract where result lane 3 is undemanded (poison in
; element 3 of the final shuffle mask). The expected even/odd shuffles have
; poison in element 3 and are followed by one vector sub.
649define <4 x i32> @sub_v4i32_012u(<4 x i32> %a, <4 x i32> %b) {
650; CHECK-LABEL: @sub_v4i32_012u(
651; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 4, i32 poison>
652; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> <i32 1, i32 3, i32 5, i32 poison>
653; CHECK-NEXT:    [[TMP4:%.*]] = sub <4 x i32> [[TMP2]], [[TMP3]]
654; CHECK-NEXT:    ret <4 x i32> [[TMP4]]
655;
656  %a0 = extractelement <4 x i32> %a, i32 0
657  %a1 = extractelement <4 x i32> %a, i32 1
658  %a2 = extractelement <4 x i32> %a, i32 2
659  %a3 = extractelement <4 x i32> %a, i32 3
660  %a01 = sub i32 %a0, %a1
661  %a23 = sub i32 %a2, %a3
662  %b0 = extractelement <4 x i32> %b, i32 0
663  %b1 = extractelement <4 x i32> %b, i32 1
664  %b2 = extractelement <4 x i32> %b, i32 2
665  %b3 = extractelement <4 x i32> %b, i32 3
666  %b01 = sub i32 %b0, %b1
667  %b23 = sub i32 %b2, %b3
668  %hsub0 = insertelement <4 x i32> poison, i32 %a01, i32 0
669  %hsub1 = insertelement <4 x i32> %hsub0, i32 %a23, i32 1
670  %hsub2 = insertelement <4 x i32> %hsub1, i32 %b01, i32 2
671  %hsub3 = insertelement <4 x i32> %hsub2, i32 %b23, i32 3
  ; Lane 3 (%b23) is dropped here; lanes 0-2 pass through unchanged.
672  %result = shufflevector <4 x i32> %hsub3, <4 x i32> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
673  ret <4 x i32> %result
674}
675
; <4 x i32> horizontal subtract where result lanes 0 and 1 are undemanded.
; Only %b01 and %b23 reach the result, so the expected output shuffles %b
; alone (second shuffle operand is poison) before the single vector sub.
676define <4 x i32> @sub_v4i32_uu23(<4 x i32> %a, <4 x i32> %b) {
677; CHECK-LABEL: @sub_v4i32_uu23(
678; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> poison, <4 x i32> <i32 poison, i32 poison, i32 0, i32 2>
679; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[B]], <4 x i32> poison, <4 x i32> <i32 poison, i32 poison, i32 1, i32 3>
680; CHECK-NEXT:    [[RESULT1:%.*]] = sub <4 x i32> [[TMP1]], [[TMP2]]
681; CHECK-NEXT:    ret <4 x i32> [[RESULT1]]
682;
683  %a0 = extractelement <4 x i32> %a, i32 0
684  %a1 = extractelement <4 x i32> %a, i32 1
685  %a2 = extractelement <4 x i32> %a, i32 2
686  %a3 = extractelement <4 x i32> %a, i32 3
687  %a01 = sub i32 %a0, %a1
688  %a23 = sub i32 %a2, %a3
689  %b0 = extractelement <4 x i32> %b, i32 0
690  %b1 = extractelement <4 x i32> %b, i32 1
691  %b2 = extractelement <4 x i32> %b, i32 2
692  %b3 = extractelement <4 x i32> %b, i32 3
693  %b01 = sub i32 %b0, %b1
694  %b23 = sub i32 %b2, %b3
695  %hsub0 = insertelement <4 x i32> poison, i32 %a01, i32 0
696  %hsub1 = insertelement <4 x i32> %hsub0, i32 %a23, i32 1
697  %hsub2 = insertelement <4 x i32> %hsub1, i32 %b01, i32 2
698  %hsub3 = insertelement <4 x i32> %hsub2, i32 %b23, i32 3
  ; Lanes 0 and 1 (%a01, %a23) are dropped; only the %b differences survive.
699  %result = shufflevector <4 x i32> %hsub3, <4 x i32> %a, <4 x i32> <i32 poison, i32 poison, i32 2, i32 3>
700  ret <4 x i32> %result
701}
702
; <4 x i32> horizontal subtract where result lanes 2 and 3 are undemanded.
; Only %a01 and %a23 reach the result, so the expected output shuffles %a
; alone (second shuffle operand is poison) before the single vector sub.
703define <4 x i32> @sub_v4i32_01uu(<4 x i32> %a, <4 x i32> %b) {
704; CHECK-LABEL: @sub_v4i32_01uu(
705; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> poison, <4 x i32> <i32 0, i32 2, i32 poison, i32 poison>
706; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> poison, <4 x i32> <i32 1, i32 3, i32 poison, i32 poison>
707; CHECK-NEXT:    [[TMP3:%.*]] = sub <4 x i32> [[TMP1]], [[TMP2]]
708; CHECK-NEXT:    ret <4 x i32> [[TMP3]]
709;
710  %a0 = extractelement <4 x i32> %a, i32 0
711  %a1 = extractelement <4 x i32> %a, i32 1
712  %a2 = extractelement <4 x i32> %a, i32 2
713  %a3 = extractelement <4 x i32> %a, i32 3
714  %a01 = sub i32 %a0, %a1
715  %a23 = sub i32 %a2, %a3
716  %b0 = extractelement <4 x i32> %b, i32 0
717  %b1 = extractelement <4 x i32> %b, i32 1
718  %b2 = extractelement <4 x i32> %b, i32 2
719  %b3 = extractelement <4 x i32> %b, i32 3
720  %b01 = sub i32 %b0, %b1
721  %b23 = sub i32 %b2, %b3
722  %hsub0 = insertelement <4 x i32> poison, i32 %a01, i32 0
723  %hsub1 = insertelement <4 x i32> %hsub0, i32 %a23, i32 1
724  %hsub2 = insertelement <4 x i32> %hsub1, i32 %b01, i32 2
725  %hsub3 = insertelement <4 x i32> %hsub2, i32 %b23, i32 3
  ; Lanes 2 and 3 (%b01, %b23) are dropped; only the %a differences survive.
726  %result = shufflevector <4 x i32> %hsub3, <4 x i32> %a, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
727  ret <4 x i32> %result
728}
729
; <4 x i32> horizontal subtract whose final shuffle reverses the lane order and
; leaves result lane 2 undemanded: the result is <b23, b01, poison, a01>.
; The expected output folds the reversal into the even/odd shuffles, taking %b
; as the first operand and %a as the second. The four per-prefix groups differ
; only in the name captured for the final sub (RESULT1 versus RESULT).
730define <4 x i32> @sub_v4i32_32u0(<4 x i32> %a, <4 x i32> %b) {
731; SSE2-LABEL: @sub_v4i32_32u0(
732; SSE2-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[A:%.*]], <4 x i32> <i32 2, i32 0, i32 poison, i32 4>
733; SSE2-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[B]], <4 x i32> [[A]], <4 x i32> <i32 3, i32 1, i32 poison, i32 5>
734; SSE2-NEXT:    [[RESULT1:%.*]] = sub <4 x i32> [[TMP1]], [[TMP2]]
735; SSE2-NEXT:    ret <4 x i32> [[RESULT1]]
736;
737; SSE4-LABEL: @sub_v4i32_32u0(
738; SSE4-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[A:%.*]], <4 x i32> <i32 2, i32 0, i32 poison, i32 4>
739; SSE4-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[B]], <4 x i32> [[A]], <4 x i32> <i32 3, i32 1, i32 poison, i32 5>
740; SSE4-NEXT:    [[RESULT:%.*]] = sub <4 x i32> [[TMP1]], [[TMP2]]
741; SSE4-NEXT:    ret <4 x i32> [[RESULT]]
742;
743; AVX2-LABEL: @sub_v4i32_32u0(
744; AVX2-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[A:%.*]], <4 x i32> <i32 2, i32 0, i32 poison, i32 4>
745; AVX2-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[B]], <4 x i32> [[A]], <4 x i32> <i32 3, i32 1, i32 poison, i32 5>
746; AVX2-NEXT:    [[RESULT:%.*]] = sub <4 x i32> [[TMP1]], [[TMP2]]
747; AVX2-NEXT:    ret <4 x i32> [[RESULT]]
748;
749; AVX512-LABEL: @sub_v4i32_32u0(
750; AVX512-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[A:%.*]], <4 x i32> <i32 2, i32 0, i32 poison, i32 4>
751; AVX512-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[B]], <4 x i32> [[A]], <4 x i32> <i32 3, i32 1, i32 poison, i32 5>
752; AVX512-NEXT:    [[RESULT1:%.*]] = sub <4 x i32> [[TMP1]], [[TMP2]]
753; AVX512-NEXT:    ret <4 x i32> [[RESULT1]]
754;
755  %a0 = extractelement <4 x i32> %a, i32 0
756  %a1 = extractelement <4 x i32> %a, i32 1
757  %a2 = extractelement <4 x i32> %a, i32 2
758  %a3 = extractelement <4 x i32> %a, i32 3
759  %a01 = sub i32 %a0, %a1
760  %a23 = sub i32 %a2, %a3
761  %b0 = extractelement <4 x i32> %b, i32 0
762  %b1 = extractelement <4 x i32> %b, i32 1
763  %b2 = extractelement <4 x i32> %b, i32 2
764  %b3 = extractelement <4 x i32> %b, i32 3
765  %b01 = sub i32 %b0, %b1
766  %b23 = sub i32 %b2, %b3
767  %hsub0 = insertelement <4 x i32> poison, i32 %a01, i32 0
768  %hsub1 = insertelement <4 x i32> %hsub0, i32 %a23, i32 1
769  %hsub2 = insertelement <4 x i32> %hsub1, i32 %b01, i32 2
770  %hsub3 = insertelement <4 x i32> %hsub2, i32 %b23, i32 3
  ; Reversed selection: lanes 3, 2, (undemanded), 0 of %hsub3; %a23 is unused.
771  %result = shufflevector <4 x i32> %hsub3, <4 x i32> %a, <4 x i32> <i32 3, i32 2, i32 poison, i32 0>
772  ret <4 x i32> %result
773}
774
775;
776; v8i32
777;
778
; Baseline <8 x i32> horizontal subtract with every result lane demanded.
; The scalar input places a01, a23, b01, b23 in lanes 0-3 and a45, a67, b45,
; b67 in lanes 4-7. Under the common prefix (shared by all RUN lines) the
; expected output is one even-element shuffle, one odd-element shuffle with the
; same per-half interleaving of %a and %b, and a single vector sub.
779define <8 x i32> @sub_v8i32_01234567(<8 x i32> %a, <8 x i32> %b) {
780; CHECK-LABEL: @sub_v8i32_01234567(
781; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 6, i32 12, i32 14>
782; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 5, i32 7, i32 13, i32 15>
783; CHECK-NEXT:    [[TMP3:%.*]] = sub <8 x i32> [[TMP1]], [[TMP2]]
784; CHECK-NEXT:    ret <8 x i32> [[TMP3]]
785;
786  %a0 = extractelement <8 x i32> %a, i32 0
787  %a1 = extractelement <8 x i32> %a, i32 1
788  %a2 = extractelement <8 x i32> %a, i32 2
789  %a3 = extractelement <8 x i32> %a, i32 3
790  %a4 = extractelement <8 x i32> %a, i32 4
791  %a5 = extractelement <8 x i32> %a, i32 5
792  %a6 = extractelement <8 x i32> %a, i32 6
793  %a7 = extractelement <8 x i32> %a, i32 7
794  %a01 = sub i32 %a0, %a1
795  %a23 = sub i32 %a2, %a3
796  %a45 = sub i32 %a4, %a5
797  %a67 = sub i32 %a6, %a7
798  %b0 = extractelement <8 x i32> %b, i32 0
799  %b1 = extractelement <8 x i32> %b, i32 1
800  %b2 = extractelement <8 x i32> %b, i32 2
801  %b3 = extractelement <8 x i32> %b, i32 3
802  %b4 = extractelement <8 x i32> %b, i32 4
803  %b5 = extractelement <8 x i32> %b, i32 5
804  %b6 = extractelement <8 x i32> %b, i32 6
805  %b7 = extractelement <8 x i32> %b, i32 7
806  %b01 = sub i32 %b0, %b1
807  %b23 = sub i32 %b2, %b3
808  %b45 = sub i32 %b4, %b5
809  %b67 = sub i32 %b6, %b7
810  %hsub0 = insertelement <8 x i32> poison, i32 %a01, i32 0
811  %hsub1 = insertelement <8 x i32> %hsub0, i32 %a23, i32 1
812  %hsub2 = insertelement <8 x i32> %hsub1, i32 %b01, i32 2
813  %hsub3 = insertelement <8 x i32> %hsub2, i32 %b23, i32 3
814  %hsub4 = insertelement <8 x i32> %hsub3, i32 %a45, i32 4
815  %hsub5 = insertelement <8 x i32> %hsub4, i32 %a67, i32 5
816  %hsub6 = insertelement <8 x i32> %hsub5, i32 %b45, i32 6
817  %hsub7 = insertelement <8 x i32> %hsub6, i32 %b67, i32 7
  ; Identity mask over %hsub7: the second operand %a is never selected.
818  %result = shufflevector <8 x i32> %hsub7, <8 x i32> %a, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
819  ret <8 x i32> %result
820}
821
; <8 x i32> horizontal subtract where result lane 5 is undemanded (poison in
; element 5 of the final shuffle mask).
; The SSE2 and AVX groups expect the compact form: even/odd shuffles with
; poison in element 5 feeding one vector sub.
; The SSE4 group records a less compact result: a4-a5 stays scalar and is
; inserted into lane 4, lanes 0-3 and the %b lanes 6-7 are computed by two
; separate shuffle/shuffle/sub groups, and a final shuffle combines them.
822define <8 x i32> @sub_v8i32_01234u67(<8 x i32> %a, <8 x i32> %b) {
823; SSE2-LABEL: @sub_v8i32_01234u67(
824; SSE2-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 poison, i32 12, i32 14>
825; SSE2-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 5, i32 poison, i32 13, i32 15>
826; SSE2-NEXT:    [[RESULT:%.*]] = sub <8 x i32> [[TMP1]], [[TMP2]]
827; SSE2-NEXT:    ret <8 x i32> [[RESULT]]
828;
829; SSE4-LABEL: @sub_v8i32_01234u67(
830; SSE4-NEXT:    [[A4:%.*]] = extractelement <8 x i32> [[A:%.*]], i64 4
831; SSE4-NEXT:    [[A5:%.*]] = extractelement <8 x i32> [[A]], i64 5
832; SSE4-NEXT:    [[A45:%.*]] = sub i32 [[A4]], [[A5]]
833; SSE4-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 poison, i32 poison, i32 poison, i32 poison>
834; SSE4-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 poison, i32 poison, i32 poison, i32 poison>
835; SSE4-NEXT:    [[TMP3:%.*]] = sub <8 x i32> [[TMP1]], [[TMP2]]
836; SSE4-NEXT:    [[HSUB4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[A45]], i64 4
837; SSE4-NEXT:    [[TMP4:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> poison, <8 x i32> <i32 4, i32 6, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
838; SSE4-NEXT:    [[TMP5:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> poison, <8 x i32> <i32 5, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
839; SSE4-NEXT:    [[TMP6:%.*]] = sub <8 x i32> [[TMP4]], [[TMP5]]
840; SSE4-NEXT:    [[RESULT:%.*]] = shufflevector <8 x i32> [[HSUB4]], <8 x i32> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 poison, i32 8, i32 9>
841; SSE4-NEXT:    ret <8 x i32> [[RESULT]]
842;
843; AVX-LABEL: @sub_v8i32_01234u67(
844; AVX-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 poison, i32 12, i32 14>
845; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 5, i32 poison, i32 13, i32 15>
846; AVX-NEXT:    [[RESULT:%.*]] = sub <8 x i32> [[TMP1]], [[TMP2]]
847; AVX-NEXT:    ret <8 x i32> [[RESULT]]
848;
849  %a0 = extractelement <8 x i32> %a, i32 0
850  %a1 = extractelement <8 x i32> %a, i32 1
851  %a2 = extractelement <8 x i32> %a, i32 2
852  %a3 = extractelement <8 x i32> %a, i32 3
853  %a4 = extractelement <8 x i32> %a, i32 4
854  %a5 = extractelement <8 x i32> %a, i32 5
855  %a6 = extractelement <8 x i32> %a, i32 6
856  %a7 = extractelement <8 x i32> %a, i32 7
857  %a01 = sub i32 %a0, %a1
858  %a23 = sub i32 %a2, %a3
859  %a45 = sub i32 %a4, %a5
860  %a67 = sub i32 %a6, %a7
861  %b0 = extractelement <8 x i32> %b, i32 0
862  %b1 = extractelement <8 x i32> %b, i32 1
863  %b2 = extractelement <8 x i32> %b, i32 2
864  %b3 = extractelement <8 x i32> %b, i32 3
865  %b4 = extractelement <8 x i32> %b, i32 4
866  %b5 = extractelement <8 x i32> %b, i32 5
867  %b6 = extractelement <8 x i32> %b, i32 6
868  %b7 = extractelement <8 x i32> %b, i32 7
869  %b01 = sub i32 %b0, %b1
870  %b23 = sub i32 %b2, %b3
871  %b45 = sub i32 %b4, %b5
872  %b67 = sub i32 %b6, %b7
873  %hsub0 = insertelement <8 x i32> poison, i32 %a01, i32 0
874  %hsub1 = insertelement <8 x i32> %hsub0, i32 %a23, i32 1
875  %hsub2 = insertelement <8 x i32> %hsub1, i32 %b01, i32 2
876  %hsub3 = insertelement <8 x i32> %hsub2, i32 %b23, i32 3
877  %hsub4 = insertelement <8 x i32> %hsub3, i32 %a45, i32 4
878  %hsub5 = insertelement <8 x i32> %hsub4, i32 %a67, i32 5
879  %hsub6 = insertelement <8 x i32> %hsub5, i32 %b45, i32 6
880  %hsub7 = insertelement <8 x i32> %hsub6, i32 %b67, i32 7
  ; Lane 5 (%a67) is dropped here; every other lane passes through unchanged.
881  %result = shufflevector <8 x i32> %hsub7, <8 x i32> %a, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 poison, i32 6, i32 7>
882  ret <8 x i32> %result
883}
884
885;
886; v4f32
887;
888
; Baseline <4 x float> horizontal subtract with every result lane demanded.
; The scalar input builds <a0-a1, a2-a3, b0-b1, b2-b3> with fsub; under the
; common prefix (shared by all RUN lines) the expected output is an
; even-element shuffle, an odd-element shuffle, and a single vector fsub.
889define <4 x float> @sub_v4f32_0123(<4 x float> %a, <4 x float> %b) {
890; CHECK-LABEL: @sub_v4f32_0123(
891; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 4, i32 6>
892; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> <i32 1, i32 3, i32 5, i32 7>
893; CHECK-NEXT:    [[TMP3:%.*]] = fsub <4 x float> [[TMP1]], [[TMP2]]
894; CHECK-NEXT:    ret <4 x float> [[TMP3]]
895;
896  %a0 = extractelement <4 x float> %a, i32 0
897  %a1 = extractelement <4 x float> %a, i32 1
898  %a2 = extractelement <4 x float> %a, i32 2
899  %a3 = extractelement <4 x float> %a, i32 3
900  %a01 = fsub float %a0, %a1
901  %a23 = fsub float %a2, %a3
902  %b0 = extractelement <4 x float> %b, i32 0
903  %b1 = extractelement <4 x float> %b, i32 1
904  %b2 = extractelement <4 x float> %b, i32 2
905  %b3 = extractelement <4 x float> %b, i32 3
906  %b01 = fsub float %b0, %b1
907  %b23 = fsub float %b2, %b3
908  %hsub0 = insertelement <4 x float> poison, float %a01, i32 0
909  %hsub1 = insertelement <4 x float> %hsub0, float %a23, i32 1
910  %hsub2 = insertelement <4 x float> %hsub1, float %b01, i32 2
911  %hsub3 = insertelement <4 x float> %hsub2, float %b23, i32 3
  ; Identity mask over %hsub3: the second operand %a is never selected.
912  %result = shufflevector <4 x float> %hsub3, <4 x float> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
913  ret <4 x float> %result
914}
915
; <4 x float> horizontal subtract where result lane 0 is undemanded (poison in
; element 0 of the final shuffle mask). The expected even/odd shuffles have
; poison in element 0 and are followed by one vector fsub.
916define <4 x float> @sub_v4f32_u123(<4 x float> %a, <4 x float> %b) {
917; CHECK-LABEL: @sub_v4f32_u123(
918; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x i32> <i32 poison, i32 2, i32 4, i32 6>
919; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> <i32 poison, i32 3, i32 5, i32 7>
920; CHECK-NEXT:    [[TMP4:%.*]] = fsub <4 x float> [[TMP2]], [[TMP3]]
921; CHECK-NEXT:    ret <4 x float> [[TMP4]]
922;
923  %a0 = extractelement <4 x float> %a, i32 0
924  %a1 = extractelement <4 x float> %a, i32 1
925  %a2 = extractelement <4 x float> %a, i32 2
926  %a3 = extractelement <4 x float> %a, i32 3
927  %a01 = fsub float %a0, %a1
928  %a23 = fsub float %a2, %a3
929  %b0 = extractelement <4 x float> %b, i32 0
930  %b1 = extractelement <4 x float> %b, i32 1
931  %b2 = extractelement <4 x float> %b, i32 2
932  %b3 = extractelement <4 x float> %b, i32 3
933  %b01 = fsub float %b0, %b1
934  %b23 = fsub float %b2, %b3
935  %hsub0 = insertelement <4 x float> poison, float %a01, i32 0
936  %hsub1 = insertelement <4 x float> %hsub0, float %a23, i32 1
937  %hsub2 = insertelement <4 x float> %hsub1, float %b01, i32 2
938  %hsub3 = insertelement <4 x float> %hsub2, float %b23, i32 3
  ; Lane 0 (%a01) is dropped here; lanes 1-3 pass through unchanged.
939  %result = shufflevector <4 x float> %hsub3, <4 x float> %a, <4 x i32> <i32 poison, i32 1, i32 2, i32 3>
940  ret <4 x float> %result
941}
942
; <4 x float> horizontal subtract where result lane 1 is undemanded (poison in
; element 1 of the final shuffle mask). The expected even/odd shuffles have
; poison in element 1 and are followed by one vector fsub.
943define <4 x float> @sub_v4f32_0u23(<4 x float> %a, <4 x float> %b) {
944; CHECK-LABEL: @sub_v4f32_0u23(
945; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x i32> <i32 0, i32 poison, i32 4, i32 6>
946; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> <i32 1, i32 poison, i32 5, i32 7>
947; CHECK-NEXT:    [[TMP4:%.*]] = fsub <4 x float> [[TMP2]], [[TMP3]]
948; CHECK-NEXT:    ret <4 x float> [[TMP4]]
949;
950  %a0 = extractelement <4 x float> %a, i32 0
951  %a1 = extractelement <4 x float> %a, i32 1
952  %a2 = extractelement <4 x float> %a, i32 2
953  %a3 = extractelement <4 x float> %a, i32 3
954  %a01 = fsub float %a0, %a1
955  %a23 = fsub float %a2, %a3
956  %b0 = extractelement <4 x float> %b, i32 0
957  %b1 = extractelement <4 x float> %b, i32 1
958  %b2 = extractelement <4 x float> %b, i32 2
959  %b3 = extractelement <4 x float> %b, i32 3
960  %b01 = fsub float %b0, %b1
961  %b23 = fsub float %b2, %b3
962  %hsub0 = insertelement <4 x float> poison, float %a01, i32 0
963  %hsub1 = insertelement <4 x float> %hsub0, float %a23, i32 1
964  %hsub2 = insertelement <4 x float> %hsub1, float %b01, i32 2
965  %hsub3 = insertelement <4 x float> %hsub2, float %b23, i32 3
  ; Lane 1 (%a23) is dropped here; lanes 0, 2 and 3 pass through unchanged.
966  %result = shufflevector <4 x float> %hsub3, <4 x float> %a, <4 x i32> <i32 0, i32 poison, i32 2, i32 3>
967  ret <4 x float> %result
968}
969
; <4 x float> horizontal subtract where result lane 2 is undemanded (poison in
; element 2 of the final shuffle mask). Under the common prefix the expected
; output is even/odd shuffles with poison in element 2 and one vector fsub.
970define <4 x float> @sub_v4f32_01u3(<4 x float> %a, <4 x float> %b) {
971; CHECK-LABEL: @sub_v4f32_01u3(
972; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 poison, i32 6>
973; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> <i32 1, i32 3, i32 poison, i32 7>
974; CHECK-NEXT:    [[RESULT1:%.*]] = fsub <4 x float> [[TMP1]], [[TMP2]]
975; CHECK-NEXT:    ret <4 x float> [[RESULT1]]
976;
977  %a0 = extractelement <4 x float> %a, i32 0
978  %a1 = extractelement <4 x float> %a, i32 1
979  %a2 = extractelement <4 x float> %a, i32 2
980  %a3 = extractelement <4 x float> %a, i32 3
981  %a01 = fsub float %a0, %a1
982  %a23 = fsub float %a2, %a3
983  %b0 = extractelement <4 x float> %b, i32 0
984  %b1 = extractelement <4 x float> %b, i32 1
985  %b2 = extractelement <4 x float> %b, i32 2
986  %b3 = extractelement <4 x float> %b, i32 3
987  %b01 = fsub float %b0, %b1
988  %b23 = fsub float %b2, %b3
989  %hsub0 = insertelement <4 x float> poison, float %a01, i32 0
990  %hsub1 = insertelement <4 x float> %hsub0, float %a23, i32 1
991  %hsub2 = insertelement <4 x float> %hsub1, float %b01, i32 2
992  %hsub3 = insertelement <4 x float> %hsub2, float %b23, i32 3
  ; Lane 2 (%b01) is dropped here; lanes 0, 1 and 3 pass through unchanged.
993  %result = shufflevector <4 x float> %hsub3, <4 x float> %a, <4 x i32> <i32 0, i32 1, i32 poison, i32 3>
994  ret <4 x float> %result
995}
996
; <4 x float> horizontal subtract where result lane 3 is undemanded (poison in
; element 3 of the final shuffle mask). Expectations are listed per prefix;
; all four groups describe the same shuffle/shuffle/fsub sequence with poison
; in element 3 and differ only in the captured value names.
997define <4 x float> @sub_v4f32_012u(<4 x float> %a, <4 x float> %b) {
998; SSE2-LABEL: @sub_v4f32_012u(
999; SSE2-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 4, i32 poison>
1000; SSE2-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> <i32 1, i32 3, i32 5, i32 poison>
1001; SSE2-NEXT:    [[RESULT1:%.*]] = fsub <4 x float> [[TMP1]], [[TMP2]]
1002; SSE2-NEXT:    ret <4 x float> [[RESULT1]]
1003;
1004; SSE4-LABEL: @sub_v4f32_012u(
1005; SSE4-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 4, i32 poison>
1006; SSE4-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> <i32 1, i32 3, i32 5, i32 poison>
1007; SSE4-NEXT:    [[TMP4:%.*]] = fsub <4 x float> [[TMP2]], [[TMP3]]
1008; SSE4-NEXT:    ret <4 x float> [[TMP4]]
1009;
1010; AVX2-LABEL: @sub_v4f32_012u(
1011; AVX2-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 4, i32 poison>
1012; AVX2-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> <i32 1, i32 3, i32 5, i32 poison>
1013; AVX2-NEXT:    [[TMP4:%.*]] = fsub <4 x float> [[TMP2]], [[TMP3]]
1014; AVX2-NEXT:    ret <4 x float> [[TMP4]]
1015;
1016; AVX512-LABEL: @sub_v4f32_012u(
1017; AVX512-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 4, i32 poison>
1018; AVX512-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> <i32 1, i32 3, i32 5, i32 poison>
1019; AVX512-NEXT:    [[RESULT1:%.*]] = fsub <4 x float> [[TMP1]], [[TMP2]]
1020; AVX512-NEXT:    ret <4 x float> [[RESULT1]]
1021;
1022  %a0 = extractelement <4 x float> %a, i32 0
1023  %a1 = extractelement <4 x float> %a, i32 1
1024  %a2 = extractelement <4 x float> %a, i32 2
1025  %a3 = extractelement <4 x float> %a, i32 3
1026  %a01 = fsub float %a0, %a1
1027  %a23 = fsub float %a2, %a3
1028  %b0 = extractelement <4 x float> %b, i32 0
1029  %b1 = extractelement <4 x float> %b, i32 1
1030  %b2 = extractelement <4 x float> %b, i32 2
1031  %b3 = extractelement <4 x float> %b, i32 3
1032  %b01 = fsub float %b0, %b1
1033  %b23 = fsub float %b2, %b3
1034  %hsub0 = insertelement <4 x float> poison, float %a01, i32 0
1035  %hsub1 = insertelement <4 x float> %hsub0, float %a23, i32 1
1036  %hsub2 = insertelement <4 x float> %hsub1, float %b01, i32 2
1037  %hsub3 = insertelement <4 x float> %hsub2, float %b23, i32 3
  ; Lane 3 (%b23) is dropped here; lanes 0-2 pass through unchanged.
1038  %result = shufflevector <4 x float> %hsub3, <4 x float> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
1039  ret <4 x float> %result
1040}
1041
; Horizontal-subtract test on <4 x float> where only the upper half is
; demanded. The lanes are built as {a0-a1, a2-a3, b0-b1, b2-b3}, but the final
; shufflevector keeps only lanes 2 and 3, so both %a differences are dead.
; The expected output references only %b: two single-source shuffles
; (elements 0,2 and elements 1,3 placed in lanes 2,3) and one vector fsub.
1042define <4 x float> @sub_v4f32_uu23(<4 x float> %a, <4 x float> %b) {
1043; CHECK-LABEL: @sub_v4f32_uu23(
1044; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[B:%.*]], <4 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 0, i32 2>
1045; CHECK-NEXT:    [[RESULT1:%.*]] = shufflevector <4 x float> [[B]], <4 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 1, i32 3>
1046; CHECK-NEXT:    [[RESULT2:%.*]] = fsub <4 x float> [[TMP2]], [[RESULT1]]
1047; CHECK-NEXT:    ret <4 x float> [[RESULT2]]
1048;
1049  %a0 = extractelement <4 x float> %a, i32 0
1050  %a1 = extractelement <4 x float> %a, i32 1
1051  %a2 = extractelement <4 x float> %a, i32 2
1052  %a3 = extractelement <4 x float> %a, i32 3
1053  %a01 = fsub float %a0, %a1
1054  %a23 = fsub float %a2, %a3
1055  %b0 = extractelement <4 x float> %b, i32 0
1056  %b1 = extractelement <4 x float> %b, i32 1
1057  %b2 = extractelement <4 x float> %b, i32 2
1058  %b3 = extractelement <4 x float> %b, i32 3
1059  %b01 = fsub float %b0, %b1
1060  %b23 = fsub float %b2, %b3
1061  %hsub0 = insertelement <4 x float> poison, float %a01, i32 0
1062  %hsub1 = insertelement <4 x float> %hsub0, float %a23, i32 1
1063  %hsub2 = insertelement <4 x float> %hsub1, float %b01, i32 2
1064  %hsub3 = insertelement <4 x float> %hsub2, float %b23, i32 3
1065  %result = shufflevector <4 x float> %hsub3, <4 x float> %a, <4 x i32> <i32 poison, i32 poison, i32 2, i32 3>
1066  ret <4 x float> %result
1067}
1068
; Horizontal-subtract test on <4 x float> where only the lower half is
; demanded. The lanes are built as {a0-a1, a2-a3, b0-b1, b2-b3}, but the final
; shufflevector keeps only lanes 0 and 1, so both %b differences are dead.
; The expected output references only %a: two single-source shuffles
; (elements 0,2 and elements 1,3) and one vector fsub.
1069define <4 x float> @sub_v4f32_01uu(<4 x float> %a, <4 x float> %b) {
1070; CHECK-LABEL: @sub_v4f32_01uu(
1071; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> poison, <4 x i32> <i32 0, i32 2, i32 poison, i32 poison>
1072; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[A]], <4 x float> poison, <4 x i32> <i32 1, i32 3, i32 poison, i32 poison>
1073; CHECK-NEXT:    [[TMP4:%.*]] = fsub <4 x float> [[TMP2]], [[TMP3]]
1074; CHECK-NEXT:    ret <4 x float> [[TMP4]]
1075;
1076  %a0 = extractelement <4 x float> %a, i32 0
1077  %a1 = extractelement <4 x float> %a, i32 1
1078  %a2 = extractelement <4 x float> %a, i32 2
1079  %a3 = extractelement <4 x float> %a, i32 3
1080  %a01 = fsub float %a0, %a1
1081  %a23 = fsub float %a2, %a3
1082  %b0 = extractelement <4 x float> %b, i32 0
1083  %b1 = extractelement <4 x float> %b, i32 1
1084  %b2 = extractelement <4 x float> %b, i32 2
1085  %b3 = extractelement <4 x float> %b, i32 3
1086  %b01 = fsub float %b0, %b1
1087  %b23 = fsub float %b2, %b3
1088  %hsub0 = insertelement <4 x float> poison, float %a01, i32 0
1089  %hsub1 = insertelement <4 x float> %hsub0, float %a23, i32 1
1090  %hsub2 = insertelement <4 x float> %hsub1, float %b01, i32 2
1091  %hsub3 = insertelement <4 x float> %hsub2, float %b23, i32 3
1092  %result = shufflevector <4 x float> %hsub3, <4 x float> %a, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
1093  ret <4 x float> %result
1094}
1095
1096;
1097; v8f32
1098;
1099
; Horizontal-subtract test on <8 x float> with every lane demanded. The
; scalar IR builds the lanes in the per-128-bit-half order
; {a0-a1, a2-a3, b0-b1, b2-b3, a4-a5, a6-a7, b4-b5, b6-b7}; the final
; shufflevector is an identity mask. All RUN configurations share one
; expectation: an even-element shuffle and an odd-element shuffle of %a/%b
; feeding a single <8 x float> fsub.
1100define <8 x float> @sub_v8f32_01234567(<8 x float> %a, <8 x float> %b) {
1101; CHECK-LABEL: @sub_v8f32_01234567(
1102; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 6, i32 12, i32 14>
1103; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 5, i32 7, i32 13, i32 15>
1104; CHECK-NEXT:    [[TMP3:%.*]] = fsub <8 x float> [[TMP1]], [[TMP2]]
1105; CHECK-NEXT:    ret <8 x float> [[TMP3]]
1106;
1107  %a0 = extractelement <8 x float> %a, i32 0
1108  %a1 = extractelement <8 x float> %a, i32 1
1109  %a2 = extractelement <8 x float> %a, i32 2
1110  %a3 = extractelement <8 x float> %a, i32 3
1111  %a4 = extractelement <8 x float> %a, i32 4
1112  %a5 = extractelement <8 x float> %a, i32 5
1113  %a6 = extractelement <8 x float> %a, i32 6
1114  %a7 = extractelement <8 x float> %a, i32 7
1115  %a01 = fsub float %a0, %a1
1116  %a23 = fsub float %a2, %a3
1117  %a45 = fsub float %a4, %a5
1118  %a67 = fsub float %a6, %a7
1119  %b0 = extractelement <8 x float> %b, i32 0
1120  %b1 = extractelement <8 x float> %b, i32 1
1121  %b2 = extractelement <8 x float> %b, i32 2
1122  %b3 = extractelement <8 x float> %b, i32 3
1123  %b4 = extractelement <8 x float> %b, i32 4
1124  %b5 = extractelement <8 x float> %b, i32 5
1125  %b6 = extractelement <8 x float> %b, i32 6
1126  %b7 = extractelement <8 x float> %b, i32 7
1127  %b01 = fsub float %b0, %b1
1128  %b23 = fsub float %b2, %b3
1129  %b45 = fsub float %b4, %b5
1130  %b67 = fsub float %b6, %b7
1131  %hsub0 = insertelement <8 x float> poison, float %a01, i32 0
1132  %hsub1 = insertelement <8 x float> %hsub0, float %a23, i32 1
1133  %hsub2 = insertelement <8 x float> %hsub1, float %b01, i32 2
1134  %hsub3 = insertelement <8 x float> %hsub2, float %b23, i32 3
1135  %hsub4 = insertelement <8 x float> %hsub3, float %a45, i32 4
1136  %hsub5 = insertelement <8 x float> %hsub4, float %a67, i32 5
1137  %hsub6 = insertelement <8 x float> %hsub5, float %b45, i32 6
1138  %hsub7 = insertelement <8 x float> %hsub6, float %b67, i32 7
1139  %result = shufflevector <8 x float> %hsub7, <8 x float> %a, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
1140  ret <8 x float> %result
1141}
1142
; Horizontal-subtract test on <8 x float> with one undemanded lane. The lanes
; are built as {a0-a1, a2-a3, b0-b1, b2-b3, a4-a5, a6-a7, b4-b5, b6-b7}; the
; final shufflevector leaves lane 3 (b2-b3) poison.
; Expected output differs by target level:
;  - the SSE2 and SSE4 expectations keep a6-a7 as a scalar fsub inserted into
;    lane 5, compute b4-b5/b6-b7 in a separate vector fsub, and merge the
;    pieces with a final shuffle;
;  - the shared AVX expectation is a single <8 x float> fsub of two shuffles.
1143define <8 x float> @sub_v8f32_012u4567(<8 x float> %a, <8 x float> %b) {
1144; SSE2-LABEL: @sub_v8f32_012u4567(
1145; SSE2-NEXT:    [[A6:%.*]] = extractelement <8 x float> [[A:%.*]], i64 6
1146; SSE2-NEXT:    [[A7:%.*]] = extractelement <8 x float> [[A]], i64 7
1147; SSE2-NEXT:    [[A67:%.*]] = fsub float [[A6]], [[A7]]
1148; SSE2-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[B:%.*]], <8 x float> poison, <2 x i32> <i32 4, i32 6>
1149; SSE2-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[B]], <8 x float> poison, <2 x i32> <i32 5, i32 7>
1150; SSE2-NEXT:    [[TMP3:%.*]] = fsub <2 x float> [[TMP1]], [[TMP2]]
1151; SSE2-NEXT:    [[TMP5:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <8 x i32> <i32 0, i32 2, i32 8, i32 poison, i32 4, i32 poison, i32 poison, i32 poison>
1152; SSE2-NEXT:    [[TMP8:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 poison, i32 5, i32 poison, i32 poison, i32 poison>
1153; SSE2-NEXT:    [[TMP6:%.*]] = fsub <8 x float> [[TMP5]], [[TMP8]]
1154; SSE2-NEXT:    [[HSUB5:%.*]] = insertelement <8 x float> [[TMP6]], float [[A67]], i64 5
1155; SSE2-NEXT:    [[TMP7:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
1156; SSE2-NEXT:    [[RESULT:%.*]] = shufflevector <8 x float> [[HSUB5]], <8 x float> [[TMP7]], <8 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 4, i32 5, i32 8, i32 9>
1157; SSE2-NEXT:    ret <8 x float> [[RESULT]]
1158;
1159; SSE4-LABEL: @sub_v8f32_012u4567(
1160; SSE4-NEXT:    [[A6:%.*]] = extractelement <8 x float> [[A:%.*]], i64 6
1161; SSE4-NEXT:    [[A7:%.*]] = extractelement <8 x float> [[A]], i64 7
1162; SSE4-NEXT:    [[A67:%.*]] = fsub float [[A6]], [[A7]]
1163; SSE4-NEXT:    [[TMP4:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 8, i32 poison, i32 4, i32 poison, i32 poison, i32 poison>
1164; SSE4-NEXT:    [[TMP7:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 poison, i32 5, i32 poison, i32 poison, i32 poison>
1165; SSE4-NEXT:    [[TMP6:%.*]] = fsub <8 x float> [[TMP4]], [[TMP7]]
1166; SSE4-NEXT:    [[HSUB5:%.*]] = insertelement <8 x float> [[TMP6]], float [[A67]], i64 5
1167; SSE4-NEXT:    [[TMP8:%.*]] = shufflevector <8 x float> [[B]], <8 x float> poison, <8 x i32> <i32 4, i32 6, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
1168; SSE4-NEXT:    [[TMP5:%.*]] = shufflevector <8 x float> [[B]], <8 x float> poison, <8 x i32> <i32 5, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
1169; SSE4-NEXT:    [[TMP9:%.*]] = fsub <8 x float> [[TMP8]], [[TMP5]]
1170; SSE4-NEXT:    [[RESULT:%.*]] = shufflevector <8 x float> [[HSUB5]], <8 x float> [[TMP9]], <8 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 4, i32 5, i32 8, i32 9>
1171; SSE4-NEXT:    ret <8 x float> [[RESULT]]
1172;
1173; AVX-LABEL: @sub_v8f32_012u4567(
1174; AVX-NEXT:    [[TMP5:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 8, i32 poison, i32 4, i32 6, i32 12, i32 14>
1175; AVX-NEXT:    [[TMP6:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 poison, i32 5, i32 7, i32 13, i32 15>
1176; AVX-NEXT:    [[TMP7:%.*]] = fsub <8 x float> [[TMP5]], [[TMP6]]
1177; AVX-NEXT:    ret <8 x float> [[TMP7]]
1178;
1179  %a0 = extractelement <8 x float> %a, i32 0
1180  %a1 = extractelement <8 x float> %a, i32 1
1181  %a2 = extractelement <8 x float> %a, i32 2
1182  %a3 = extractelement <8 x float> %a, i32 3
1183  %a4 = extractelement <8 x float> %a, i32 4
1184  %a5 = extractelement <8 x float> %a, i32 5
1185  %a6 = extractelement <8 x float> %a, i32 6
1186  %a7 = extractelement <8 x float> %a, i32 7
1187  %a01 = fsub float %a0, %a1
1188  %a23 = fsub float %a2, %a3
1189  %a45 = fsub float %a4, %a5
1190  %a67 = fsub float %a6, %a7
1191  %b0 = extractelement <8 x float> %b, i32 0
1192  %b1 = extractelement <8 x float> %b, i32 1
1193  %b2 = extractelement <8 x float> %b, i32 2
1194  %b3 = extractelement <8 x float> %b, i32 3
1195  %b4 = extractelement <8 x float> %b, i32 4
1196  %b5 = extractelement <8 x float> %b, i32 5
1197  %b6 = extractelement <8 x float> %b, i32 6
1198  %b7 = extractelement <8 x float> %b, i32 7
1199  %b01 = fsub float %b0, %b1
1200  %b23 = fsub float %b2, %b3
1201  %b45 = fsub float %b4, %b5
1202  %b67 = fsub float %b6, %b7
1203  %hsub0 = insertelement <8 x float> poison, float %a01, i32 0
1204  %hsub1 = insertelement <8 x float> %hsub0, float %a23, i32 1
1205  %hsub2 = insertelement <8 x float> %hsub1, float %b01, i32 2
1206  %hsub3 = insertelement <8 x float> %hsub2, float %b23, i32 3
1207  %hsub4 = insertelement <8 x float> %hsub3, float %a45, i32 4
1208  %hsub5 = insertelement <8 x float> %hsub4, float %a67, i32 5
1209  %hsub6 = insertelement <8 x float> %hsub5, float %b45, i32 6
1210  %hsub7 = insertelement <8 x float> %hsub6, float %b67, i32 7
1211  %result = shufflevector <8 x float> %hsub7, <8 x float> %a, <8 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 4, i32 5, i32 6, i32 7>
1212  ret <8 x float> %result
1213}
1214
; Horizontal-subtract test on <8 x float> with a reversing final shuffle.
; Unlike the other v8f32 tests, the lanes here are built in the linear order
; {a0-a1, a2-a3, a4-a5, a6-a7, b0-b1, b2-b3, b4-b5, b6-b7}. The final mask
; <7, 6, poison, 4, 3, 2, 1, 0> reverses the vector and leaves result lane 2
; poison, so b2-b3 (built in lane 5) is undemanded.
; Expected output differs by target level:
;  - the SSE2 and SSE4 expectations keep b0-b1 as a scalar fsub inserted into
;    lane 4 and merge partial vector results with a final reversing shuffle;
;  - the shared AVX expectation folds the reversal into the two operand
;    shuffles of a single <8 x float> fsub.
1215define <8 x float> @sub_v8f32_76u43210(<8 x float> %a, <8 x float> %b) {
1216; SSE2-LABEL: @sub_v8f32_76u43210(
1217; SSE2-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
1218; SSE2-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
1219; SSE2-NEXT:    [[TMP3:%.*]] = fsub <4 x float> [[TMP1]], [[TMP2]]
1220; SSE2-NEXT:    [[B0:%.*]] = extractelement <8 x float> [[B:%.*]], i64 0
1221; SSE2-NEXT:    [[B1:%.*]] = extractelement <8 x float> [[B]], i64 1
1222; SSE2-NEXT:    [[B01:%.*]] = fsub float [[B0]], [[B1]]
1223; SSE2-NEXT:    [[TMP4:%.*]] = shufflevector <8 x float> [[B]], <8 x float> poison, <2 x i32> <i32 4, i32 6>
1224; SSE2-NEXT:    [[TMP5:%.*]] = shufflevector <8 x float> [[B]], <8 x float> poison, <2 x i32> <i32 5, i32 7>
1225; SSE2-NEXT:    [[TMP6:%.*]] = fsub <2 x float> [[TMP4]], [[TMP5]]
1226; SSE2-NEXT:    [[TMP7:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
1227; SSE2-NEXT:    [[HSUB4:%.*]] = insertelement <8 x float> [[TMP7]], float [[B01]], i64 4
1228; SSE2-NEXT:    [[TMP8:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
1229; SSE2-NEXT:    [[RESULT:%.*]] = shufflevector <8 x float> [[TMP8]], <8 x float> [[HSUB4]], <8 x i32> <i32 1, i32 0, i32 poison, i32 12, i32 11, i32 10, i32 9, i32 8>
1230; SSE2-NEXT:    ret <8 x float> [[RESULT]]
1231;
1232; SSE4-LABEL: @sub_v8f32_76u43210(
1233; SSE4-NEXT:    [[B0:%.*]] = extractelement <8 x float> [[B:%.*]], i64 0
1234; SSE4-NEXT:    [[B1:%.*]] = extractelement <8 x float> [[B]], i64 1
1235; SSE4-NEXT:    [[B01:%.*]] = fsub float [[B0]], [[B1]]
1236; SSE4-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 poison, i32 poison, i32 poison, i32 poison>
1237; SSE4-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 poison, i32 poison, i32 poison, i32 poison>
1238; SSE4-NEXT:    [[RESULT:%.*]] = fsub <8 x float> [[TMP1]], [[TMP2]]
1239; SSE4-NEXT:    [[HSUB4:%.*]] = insertelement <8 x float> [[RESULT]], float [[B01]], i64 4
1240; SSE4-NEXT:    [[TMP4:%.*]] = shufflevector <8 x float> [[B]], <8 x float> poison, <8 x i32> <i32 4, i32 6, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
1241; SSE4-NEXT:    [[TMP5:%.*]] = shufflevector <8 x float> [[B]], <8 x float> poison, <8 x i32> <i32 5, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
1242; SSE4-NEXT:    [[TMP6:%.*]] = fsub <8 x float> [[TMP4]], [[TMP5]]
1243; SSE4-NEXT:    [[RESULT1:%.*]] = shufflevector <8 x float> [[TMP6]], <8 x float> [[HSUB4]], <8 x i32> <i32 1, i32 0, i32 poison, i32 12, i32 11, i32 10, i32 9, i32 8>
1244; SSE4-NEXT:    ret <8 x float> [[RESULT1]]
1245;
1246; AVX-LABEL: @sub_v8f32_76u43210(
1247; AVX-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[B:%.*]], <8 x float> [[A:%.*]], <8 x i32> <i32 6, i32 4, i32 poison, i32 0, i32 14, i32 12, i32 10, i32 8>
1248; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[B]], <8 x float> [[A]], <8 x i32> <i32 7, i32 5, i32 poison, i32 1, i32 15, i32 13, i32 11, i32 9>
1249; AVX-NEXT:    [[RESULT:%.*]] = fsub <8 x float> [[TMP1]], [[TMP2]]
1250; AVX-NEXT:    ret <8 x float> [[RESULT]]
1251;
1252  %a0 = extractelement <8 x float> %a, i32 0
1253  %a1 = extractelement <8 x float> %a, i32 1
1254  %a2 = extractelement <8 x float> %a, i32 2
1255  %a3 = extractelement <8 x float> %a, i32 3
1256  %a4 = extractelement <8 x float> %a, i32 4
1257  %a5 = extractelement <8 x float> %a, i32 5
1258  %a6 = extractelement <8 x float> %a, i32 6
1259  %a7 = extractelement <8 x float> %a, i32 7
1260  %a01 = fsub float %a0, %a1
1261  %a23 = fsub float %a2, %a3
1262  %a45 = fsub float %a4, %a5
1263  %a67 = fsub float %a6, %a7
1264  %b0 = extractelement <8 x float> %b, i32 0
1265  %b1 = extractelement <8 x float> %b, i32 1
1266  %b2 = extractelement <8 x float> %b, i32 2
1267  %b3 = extractelement <8 x float> %b, i32 3
1268  %b4 = extractelement <8 x float> %b, i32 4
1269  %b5 = extractelement <8 x float> %b, i32 5
1270  %b6 = extractelement <8 x float> %b, i32 6
1271  %b7 = extractelement <8 x float> %b, i32 7
1272  %b01 = fsub float %b0, %b1
1273  %b23 = fsub float %b2, %b3
1274  %b45 = fsub float %b4, %b5
1275  %b67 = fsub float %b6, %b7
1276  %hsub0 = insertelement <8 x float> poison, float %a01, i32 0
1277  %hsub1 = insertelement <8 x float> %hsub0, float %a23, i32 1
1278  %hsub2 = insertelement <8 x float> %hsub1, float %a45, i32 2
1279  %hsub3 = insertelement <8 x float> %hsub2, float %a67, i32 3
1280  %hsub4 = insertelement <8 x float> %hsub3, float %b01, i32 4
1281  %hsub5 = insertelement <8 x float> %hsub4, float %b23, i32 5
1282  %hsub6 = insertelement <8 x float> %hsub5, float %b45, i32 6
1283  %hsub7 = insertelement <8 x float> %hsub6, float %b67, i32 7
1284  %result = shufflevector <8 x float> %hsub7, <8 x float> %a, <8 x i32> <i32 7, i32 6, i32 poison, i32 4, i32 3, i32 2, i32 1, i32 0>
1285  ret <8 x float> %result
1286}
1287
1288;
1289; v2f64
1290;
1291
; Horizontal-subtract test on <2 x double> with both lanes demanded. The
; scalar IR builds {a0-a1, b0-b1} and passes it through an identity
; shufflevector. All RUN configurations expect a shuffle of the element-0
; values, a shuffle of the element-1 values, and one vector fsub.
1292define <2 x double> @sub_v2f64_01(<2 x double> %a, <2 x double> %b) {
1293; CHECK-LABEL: @sub_v2f64_01(
1294; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x double> [[A:%.*]], <2 x double> [[B:%.*]], <2 x i32> <i32 0, i32 2>
1295; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <2 x double> [[A]], <2 x double> [[B]], <2 x i32> <i32 1, i32 3>
1296; CHECK-NEXT:    [[TMP3:%.*]] = fsub <2 x double> [[TMP1]], [[TMP2]]
1297; CHECK-NEXT:    ret <2 x double> [[TMP3]]
1298;
1299  %a0 = extractelement <2 x double> %a, i32 0
1300  %a1 = extractelement <2 x double> %a, i32 1
1301  %a01 = fsub double %a0, %a1
1302  %b0 = extractelement <2 x double> %b, i32 0
1303  %b1 = extractelement <2 x double> %b, i32 1
1304  %b01 = fsub double %b0, %b1
1305  %hsub0 = insertelement <2 x double> poison, double %a01, i32 0
1306  %hsub1 = insertelement <2 x double> %hsub0, double %b01, i32 1
1307  %result = shufflevector <2 x double> %hsub1, <2 x double> %a, <2 x i32> <i32 0, i32 1>
1308  ret <2 x double> %result
1309}
1310
; Horizontal-subtract test on <2 x double> where only lane 1 (b0-b1) is
; demanded; the final shufflevector leaves lane 0 poison, so a0-a1 is dead.
; The expected output references only %b: one shuffle moves element 0 of %b
; into lane 1, and the vector fsub uses %b itself as the right-hand operand,
; giving b0-b1 in lane 1.
1311define <2 x double> @sub_v2f64_u1(<2 x double> %a, <2 x double> %b) {
1312; CHECK-LABEL: @sub_v2f64_u1(
1313; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x double> [[B:%.*]], <2 x double> poison, <2 x i32> <i32 poison, i32 0>
1314; CHECK-NEXT:    [[RESULT1:%.*]] = fsub <2 x double> [[TMP1]], [[B]]
1315; CHECK-NEXT:    ret <2 x double> [[RESULT1]]
1316;
1317  %a0 = extractelement <2 x double> %a, i32 0
1318  %a1 = extractelement <2 x double> %a, i32 1
1319  %a01 = fsub double %a0, %a1
1320  %b0 = extractelement <2 x double> %b, i32 0
1321  %b1 = extractelement <2 x double> %b, i32 1
1322  %b01 = fsub double %b0, %b1
1323  %hsub0 = insertelement <2 x double> poison, double %a01, i32 0
1324  %hsub1 = insertelement <2 x double> %hsub0, double %b01, i32 1
1325  %result = shufflevector <2 x double> %hsub1, <2 x double> %a, <2 x i32> <i32 poison, i32 1>
1326  ret <2 x double> %result
1327}
1328
; Horizontal-subtract test on <2 x double> where only lane 0 (a0-a1) is
; demanded; the final shufflevector leaves lane 1 poison, so b0-b1 is dead.
; The expected output has one shuffle that moves element 1 of the first
; captured operand into lane 0, and a vector fsub with that operand itself on
; the left-hand side. The autogenerated capture for that operand is named
; TMP1; presumably it binds to %a, since only %a feeds the demanded lane.
1329define <2 x double> @sub_v2f64_0u(<2 x double> %a, <2 x double> %b) {
1330; CHECK-LABEL: @sub_v2f64_0u(
1331; CHECK-NEXT:    [[RESULT:%.*]] = shufflevector <2 x double> [[TMP1:%.*]], <2 x double> poison, <2 x i32> <i32 1, i32 poison>
1332; CHECK-NEXT:    [[RESULT1:%.*]] = fsub <2 x double> [[TMP1]], [[RESULT]]
1333; CHECK-NEXT:    ret <2 x double> [[RESULT1]]
1334;
1335  %a0 = extractelement <2 x double> %a, i32 0
1336  %a1 = extractelement <2 x double> %a, i32 1
1337  %a01 = fsub double %a0, %a1
1338  %b0 = extractelement <2 x double> %b, i32 0
1339  %b1 = extractelement <2 x double> %b, i32 1
1340  %b01 = fsub double %b0, %b1
1341  %hsub0 = insertelement <2 x double> poison, double %a01, i32 0
1342  %hsub1 = insertelement <2 x double> %hsub0, double %b01, i32 1
1343  %result = shufflevector <2 x double> %hsub1, <2 x double> %a, <2 x i32> <i32 0, i32 poison>
1344  ret <2 x double> %result
1345}
1346
1347;
1348; v4f64
1349;
1350
; Horizontal-subtract test on <4 x double> with every lane demanded. The
; scalar IR builds the lanes in the per-128-bit-half order
; {a0-a1, b0-b1, a2-a3, b2-b3} and passes them through an identity
; shufflevector. All RUN configurations expect two interleaving shuffles of
; %a/%b (even elements, odd elements) feeding one <4 x double> fsub.
1351define <4 x double> @sub_v4f64_0123(<4 x double> %a, <4 x double> %b) {
1352; CHECK-LABEL: @sub_v4f64_0123(
1353; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> <i32 0, i32 4, i32 2, i32 6>
1354; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> <i32 1, i32 5, i32 3, i32 7>
1355; CHECK-NEXT:    [[TMP3:%.*]] = fsub <4 x double> [[TMP1]], [[TMP2]]
1356; CHECK-NEXT:    ret <4 x double> [[TMP3]]
1357;
1358  %a0 = extractelement <4 x double> %a, i32 0
1359  %a1 = extractelement <4 x double> %a, i32 1
1360  %a2 = extractelement <4 x double> %a, i32 2
1361  %a3 = extractelement <4 x double> %a, i32 3
1362  %a01 = fsub double %a0, %a1
1363  %a23 = fsub double %a2, %a3
1364  %b0 = extractelement <4 x double> %b, i32 0
1365  %b1 = extractelement <4 x double> %b, i32 1
1366  %b2 = extractelement <4 x double> %b, i32 2
1367  %b3 = extractelement <4 x double> %b, i32 3
1368  %b01 = fsub double %b0, %b1
1369  %b23 = fsub double %b2, %b3
1370  %hsub0 = insertelement <4 x double> poison, double %a01, i32 0
1371  %hsub1 = insertelement <4 x double> %hsub0, double %b01, i32 1
1372  %hsub2 = insertelement <4 x double> %hsub1, double %a23, i32 2
1373  %hsub3 = insertelement <4 x double> %hsub2, double %b23, i32 3
1374  %result = shufflevector <4 x double> %hsub3, <4 x double> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
1375  ret <4 x double> %result
1376}
1377
; Horizontal-subtract test on <4 x double> with lane 0 undemanded. The lanes
; are built as {a0-a1, b0-b1, a2-a3, b2-b3}; the final shufflevector leaves
; lane 0 poison, so a0-a1 is dead.
; Expected output differs by target level:
;  - the SSE2 and SSE4 expectations compute b0-b1 and a2-a3 with a vector
;    fsub and keep b2-b3 as a scalar fsub inserted into lane 3;
;  - the shared AVX expectation is a single <4 x double> fsub of two shuffles
;    that cover lanes 1-3.
1378define <4 x double> @sub_v4f64_u123(<4 x double> %a, <4 x double> %b) {
1379; SSE2-LABEL: @sub_v4f64_u123(
1380; SSE2-NEXT:    [[B2:%.*]] = extractelement <4 x double> [[B:%.*]], i64 2
1381; SSE2-NEXT:    [[B3:%.*]] = extractelement <4 x double> [[B]], i64 3
1382; SSE2-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[B]], <4 x double> [[A:%.*]], <2 x i32> <i32 0, i32 6>
1383; SSE2-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[B]], <4 x double> [[A]], <2 x i32> <i32 1, i32 7>
1384; SSE2-NEXT:    [[TMP3:%.*]] = fsub <2 x double> [[TMP1]], [[TMP2]]
1385; SSE2-NEXT:    [[B23:%.*]] = fsub double [[B2]], [[B3]]
1386; SSE2-NEXT:    [[TMP4:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <4 x i32> <i32 poison, i32 0, i32 1, i32 poison>
1387; SSE2-NEXT:    [[RESULT:%.*]] = insertelement <4 x double> [[TMP4]], double [[B23]], i64 3
1388; SSE2-NEXT:    ret <4 x double> [[RESULT]]
1389;
1390; SSE4-LABEL: @sub_v4f64_u123(
1391; SSE4-NEXT:    [[B2:%.*]] = extractelement <4 x double> [[B:%.*]], i64 2
1392; SSE4-NEXT:    [[B3:%.*]] = extractelement <4 x double> [[B]], i64 3
1393; SSE4-NEXT:    [[B23:%.*]] = fsub double [[B2]], [[B3]]
1394; SSE4-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[B]], <4 x double> [[A:%.*]], <4 x i32> <i32 poison, i32 0, i32 6, i32 poison>
1395; SSE4-NEXT:    [[TMP4:%.*]] = shufflevector <4 x double> [[B]], <4 x double> [[A]], <4 x i32> <i32 poison, i32 1, i32 7, i32 poison>
1396; SSE4-NEXT:    [[TMP3:%.*]] = fsub <4 x double> [[TMP2]], [[TMP4]]
1397; SSE4-NEXT:    [[RESULT:%.*]] = insertelement <4 x double> [[TMP3]], double [[B23]], i64 3
1398; SSE4-NEXT:    ret <4 x double> [[RESULT]]
1399;
1400; AVX-LABEL: @sub_v4f64_u123(
1401; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[B:%.*]], <4 x double> [[A:%.*]], <4 x i32> <i32 poison, i32 0, i32 6, i32 2>
1402; AVX-NEXT:    [[TMP3:%.*]] = shufflevector <4 x double> [[B]], <4 x double> [[A]], <4 x i32> <i32 poison, i32 1, i32 7, i32 3>
1403; AVX-NEXT:    [[TMP4:%.*]] = fsub <4 x double> [[TMP2]], [[TMP3]]
1404; AVX-NEXT:    ret <4 x double> [[TMP4]]
1405;
1406  %a0 = extractelement <4 x double> %a, i32 0
1407  %a1 = extractelement <4 x double> %a, i32 1
1408  %a2 = extractelement <4 x double> %a, i32 2
1409  %a3 = extractelement <4 x double> %a, i32 3
1410  %a01 = fsub double %a0, %a1
1411  %a23 = fsub double %a2, %a3
1412  %b0 = extractelement <4 x double> %b, i32 0
1413  %b1 = extractelement <4 x double> %b, i32 1
1414  %b2 = extractelement <4 x double> %b, i32 2
1415  %b3 = extractelement <4 x double> %b, i32 3
1416  %b01 = fsub double %b0, %b1
1417  %b23 = fsub double %b2, %b3
1418  %hsub0 = insertelement <4 x double> poison, double %a01, i32 0
1419  %hsub1 = insertelement <4 x double> %hsub0, double %b01, i32 1
1420  %hsub2 = insertelement <4 x double> %hsub1, double %a23, i32 2
1421  %hsub3 = insertelement <4 x double> %hsub2, double %b23, i32 3
1422  %result = shufflevector <4 x double> %hsub3, <4 x double> %a, <4 x i32> <i32 poison, i32 1, i32 2, i32 3>
1423  ret <4 x double> %result
1424}
1425
; Horizontal-subtract test on <4 x double> with lane 1 undemanded. The lanes
; are built as {a0-a1, b0-b1, a2-a3, b2-b3}; the final shufflevector leaves
; lane 1 poison, so b0-b1 is dead.
; Expected output differs by target level:
;  - the SSE2 and SSE4 expectations compute a0-a1 and a2-a3 with a vector
;    fsub and keep b2-b3 as a scalar fsub inserted into lane 3;
;  - the shared AVX expectation is a single <4 x double> fsub of two shuffles.
; NOTE(review): in the AVX expectation the first shuffle selects element 1 of
; the first operand for lane 1 while the second shuffle leaves lane 1 poison.
; Lane 1 is not demanded by the input's final shuffle. The assertions in this
; file are autogenerated, so regenerate them with utils/update_test_checks.py
; rather than hand-editing the mask if this asymmetry looks wrong.
1426define <4 x double> @sub_v4f64_0u23(<4 x double> %a, <4 x double> %b) {
1427; SSE2-LABEL: @sub_v4f64_0u23(
1428; SSE2-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> poison, <2 x i32> <i32 0, i32 2>
1429; SSE2-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> poison, <2 x i32> <i32 1, i32 3>
1430; SSE2-NEXT:    [[TMP3:%.*]] = fsub <2 x double> [[TMP1]], [[TMP2]]
1431; SSE2-NEXT:    [[B2:%.*]] = extractelement <4 x double> [[B:%.*]], i64 2
1432; SSE2-NEXT:    [[B3:%.*]] = extractelement <4 x double> [[B]], i64 3
1433; SSE2-NEXT:    [[B23:%.*]] = fsub double [[B2]], [[B3]]
1434; SSE2-NEXT:    [[TMP4:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <4 x i32> <i32 0, i32 poison, i32 1, i32 poison>
1435; SSE2-NEXT:    [[RESULT:%.*]] = insertelement <4 x double> [[TMP4]], double [[B23]], i64 3
1436; SSE2-NEXT:    ret <4 x double> [[RESULT]]
1437;
1438; SSE4-LABEL: @sub_v4f64_0u23(
1439; SSE4-NEXT:    [[B2:%.*]] = extractelement <4 x double> [[B:%.*]], i64 2
1440; SSE4-NEXT:    [[B3:%.*]] = extractelement <4 x double> [[B]], i64 3
1441; SSE4-NEXT:    [[B23:%.*]] = fsub double [[B2]], [[B3]]
1442; SSE4-NEXT:    [[TMP3:%.*]] = shufflevector <4 x double> [[TMP2:%.*]], <4 x double> poison, <4 x i32> <i32 1, i32 poison, i32 3, i32 poison>
1443; SSE4-NEXT:    [[TMP4:%.*]] = fsub <4 x double> [[TMP2]], [[TMP3]]
1444; SSE4-NEXT:    [[RESULT:%.*]] = insertelement <4 x double> [[TMP4]], double [[B23]], i64 3
1445; SSE4-NEXT:    ret <4 x double> [[RESULT]]
1446;
1447; AVX-LABEL: @sub_v4f64_0u23(
1448; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> <i32 0, i32 1, i32 2, i32 6>
1449; AVX-NEXT:    [[TMP3:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> <i32 1, i32 poison, i32 3, i32 7>
1450; AVX-NEXT:    [[TMP4:%.*]] = fsub <4 x double> [[TMP2]], [[TMP3]]
1451; AVX-NEXT:    ret <4 x double> [[TMP4]]
1452;
1453  %a0 = extractelement <4 x double> %a, i32 0
1454  %a1 = extractelement <4 x double> %a, i32 1
1455  %a2 = extractelement <4 x double> %a, i32 2
1456  %a3 = extractelement <4 x double> %a, i32 3
1457  %a01 = fsub double %a0, %a1
1458  %a23 = fsub double %a2, %a3
1459  %b0 = extractelement <4 x double> %b, i32 0
1460  %b1 = extractelement <4 x double> %b, i32 1
1461  %b2 = extractelement <4 x double> %b, i32 2
1462  %b3 = extractelement <4 x double> %b, i32 3
1463  %b01 = fsub double %b0, %b1
1464  %b23 = fsub double %b2, %b3
1465  %hsub0 = insertelement <4 x double> poison, double %a01, i32 0
1466  %hsub1 = insertelement <4 x double> %hsub0, double %b01, i32 1
1467  %hsub2 = insertelement <4 x double> %hsub1, double %a23, i32 2
1468  %hsub3 = insertelement <4 x double> %hsub2, double %b23, i32 3
1469  %result = shufflevector <4 x double> %hsub3, <4 x double> %a, <4 x i32> <i32 0, i32 poison, i32 2, i32 3>
1470  ret <4 x double> %result
1471}
1472
; Horizontal-subtract test on <4 x double> with lane 2 undemanded. The lanes
; are built as {a0-a1, b0-b1, a2-a3, b2-b3}; the final shufflevector leaves
; lane 2 poison, so a2-a3 is dead.
; Expected output differs by target level:
;  - the SSE2 and SSE4 expectations compute a0-a1 and b0-b1 with a vector
;    fsub and keep b2-b3 as a scalar fsub inserted into lane 3;
;  - the shared AVX expectation is a single <4 x double> fsub of two shuffles
;    that leave lane 2 poison.
1473define <4 x double> @sub_v4f64_01u3(<4 x double> %a, <4 x double> %b) {
1474; SSE2-LABEL: @sub_v4f64_01u3(
1475; SSE2-NEXT:    [[B2:%.*]] = extractelement <4 x double> [[B:%.*]], i64 2
1476; SSE2-NEXT:    [[B3:%.*]] = extractelement <4 x double> [[B]], i64 3
1477; SSE2-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B]], <2 x i32> <i32 0, i32 4>
1478; SSE2-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 1, i32 5>
1479; SSE2-NEXT:    [[TMP3:%.*]] = fsub <2 x double> [[TMP1]], [[TMP2]]
1480; SSE2-NEXT:    [[B23:%.*]] = fsub double [[B2]], [[B3]]
1481; SSE2-NEXT:    [[TMP4:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
1482; SSE2-NEXT:    [[RESULT:%.*]] = insertelement <4 x double> [[TMP4]], double [[B23]], i64 3
1483; SSE2-NEXT:    ret <4 x double> [[RESULT]]
1484;
1485; SSE4-LABEL: @sub_v4f64_01u3(
1486; SSE4-NEXT:    [[B2:%.*]] = extractelement <4 x double> [[B:%.*]], i64 2
1487; SSE4-NEXT:    [[B3:%.*]] = extractelement <4 x double> [[B]], i64 3
1488; SSE4-NEXT:    [[B23:%.*]] = fsub double [[B2]], [[B3]]
1489; SSE4-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B]], <4 x i32> <i32 0, i32 4, i32 poison, i32 poison>
1490; SSE4-NEXT:    [[TMP4:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> <i32 1, i32 5, i32 poison, i32 poison>
1491; SSE4-NEXT:    [[TMP3:%.*]] = fsub <4 x double> [[TMP2]], [[TMP4]]
1492; SSE4-NEXT:    [[RESULT:%.*]] = insertelement <4 x double> [[TMP3]], double [[B23]], i64 3
1493; SSE4-NEXT:    ret <4 x double> [[RESULT]]
1494;
1495; AVX-LABEL: @sub_v4f64_01u3(
1496; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> <i32 0, i32 4, i32 poison, i32 6>
1497; AVX-NEXT:    [[TMP3:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> <i32 1, i32 5, i32 poison, i32 7>
1498; AVX-NEXT:    [[TMP4:%.*]] = fsub <4 x double> [[TMP2]], [[TMP3]]
1499; AVX-NEXT:    ret <4 x double> [[TMP4]]
1500;
1501  %a0 = extractelement <4 x double> %a, i32 0
1502  %a1 = extractelement <4 x double> %a, i32 1
1503  %a2 = extractelement <4 x double> %a, i32 2
1504  %a3 = extractelement <4 x double> %a, i32 3
1505  %a01 = fsub double %a0, %a1
1506  %a23 = fsub double %a2, %a3
1507  %b0 = extractelement <4 x double> %b, i32 0
1508  %b1 = extractelement <4 x double> %b, i32 1
1509  %b2 = extractelement <4 x double> %b, i32 2
1510  %b3 = extractelement <4 x double> %b, i32 3
1511  %b01 = fsub double %b0, %b1
1512  %b23 = fsub double %b2, %b3
1513  %hsub0 = insertelement <4 x double> poison, double %a01, i32 0
1514  %hsub1 = insertelement <4 x double> %hsub0, double %b01, i32 1
1515  %hsub2 = insertelement <4 x double> %hsub1, double %a23, i32 2
1516  %hsub3 = insertelement <4 x double> %hsub2, double %b23, i32 3
1517  %result = shufflevector <4 x double> %hsub3, <4 x double> %a, <4 x i32> <i32 0, i32 1, i32 poison, i32 3>
1518  ret <4 x double> %result
1519}
1520
; sub_v4f64_012u: horizontal-sub pattern on <4 x double> where result lanes
; 0, 1 and 2 are demanded and lane 3 is left poison by the final shuffle.
; The input packs the pairwise differences as [a0-a1, b0-b1, a2-a3, b2-b3].
; Expected output per check prefix (prefixes map to the -mcpu RUN lines at the
; top of the file):
;   SSE2: 2-wide vector fsub for lanes 0-1, widened to 4 lanes, plus a scalar
;         a2-a3 inserted into lane 2.
;   SSE4: 4-wide vector fsub (lanes 2-3 poison) plus a scalar a2-a3 inserted
;         into lane 2.
;   AVX:  a single 4-wide fsub of two shuffles of %a and %b.
1521define <4 x double> @sub_v4f64_012u(<4 x double> %a, <4 x double> %b) {
1522; SSE2-LABEL: @sub_v4f64_012u(
1523; SSE2-NEXT:    [[A2:%.*]] = extractelement <4 x double> [[A:%.*]], i64 2
1524; SSE2-NEXT:    [[A3:%.*]] = extractelement <4 x double> [[A]], i64 3
1525; SSE2-NEXT:    [[A23:%.*]] = fsub double [[A2]], [[A3]]
1526; SSE2-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B:%.*]], <2 x i32> <i32 0, i32 4>
1527; SSE2-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 1, i32 5>
1528; SSE2-NEXT:    [[TMP3:%.*]] = fsub <2 x double> [[TMP1]], [[TMP2]]
1529; SSE2-NEXT:    [[TMP4:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
1530; SSE2-NEXT:    [[RESULT:%.*]] = insertelement <4 x double> [[TMP4]], double [[A23]], i64 2
1531; SSE2-NEXT:    ret <4 x double> [[RESULT]]
1532;
1533; SSE4-LABEL: @sub_v4f64_012u(
1534; SSE4-NEXT:    [[A2:%.*]] = extractelement <4 x double> [[A:%.*]], i64 2
1535; SSE4-NEXT:    [[A3:%.*]] = extractelement <4 x double> [[A]], i64 3
1536; SSE4-NEXT:    [[A23:%.*]] = fsub double [[A2]], [[A3]]
1537; SSE4-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B:%.*]], <4 x i32> <i32 0, i32 4, i32 poison, i32 poison>
1538; SSE4-NEXT:    [[TMP4:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> <i32 1, i32 5, i32 poison, i32 poison>
1539; SSE4-NEXT:    [[TMP3:%.*]] = fsub <4 x double> [[TMP2]], [[TMP4]]
1540; SSE4-NEXT:    [[RESULT:%.*]] = insertelement <4 x double> [[TMP3]], double [[A23]], i64 2
1541; SSE4-NEXT:    ret <4 x double> [[RESULT]]
1542;
1543; AVX-LABEL: @sub_v4f64_012u(
1544; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> <i32 0, i32 4, i32 2, i32 poison>
1545; AVX-NEXT:    [[TMP3:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> <i32 1, i32 5, i32 3, i32 poison>
1546; AVX-NEXT:    [[TMP4:%.*]] = fsub <4 x double> [[TMP2]], [[TMP3]]
1547; AVX-NEXT:    ret <4 x double> [[TMP4]]
1548;
  ; Scalar differences of adjacent element pairs of %a.
1549  %a0 = extractelement <4 x double> %a, i32 0
1550  %a1 = extractelement <4 x double> %a, i32 1
1551  %a2 = extractelement <4 x double> %a, i32 2
1552  %a3 = extractelement <4 x double> %a, i32 3
1553  %a01 = fsub double %a0, %a1
1554  %a23 = fsub double %a2, %a3
  ; Scalar differences of adjacent element pairs of %b.
1555  %b0 = extractelement <4 x double> %b, i32 0
1556  %b1 = extractelement <4 x double> %b, i32 1
1557  %b2 = extractelement <4 x double> %b, i32 2
1558  %b3 = extractelement <4 x double> %b, i32 3
1559  %b01 = fsub double %b0, %b1
1560  %b23 = fsub double %b2, %b3
  ; Pack the four differences as [a01, b01, a23, b23].
1561  %hsub0 = insertelement <4 x double> poison, double %a01, i32 0
1562  %hsub1 = insertelement <4 x double> %hsub0, double %b01, i32 1
1563  %hsub2 = insertelement <4 x double> %hsub1, double %a23, i32 2
1564  %hsub3 = insertelement <4 x double> %hsub2, double %b23, i32 3
  ; Keep lanes 0-2 and make lane 3 poison, so %b23 is never demanded. Every
  ; mask index is below 4, so the second operand %a is not actually selected.
1565  %result = shufflevector <4 x double> %hsub3, <4 x double> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
1566  ret <4 x double> %result
1567}
1568
; sub_v4f64_uu23: horizontal-sub pattern on <4 x double> where only result
; lanes 2 and 3 are demanded; lanes 0 and 1 are left poison by the final
; shuffle. The input packs the pairwise differences as
; [a0-a1, b0-b1, a2-a3, b2-b3], so the demanded lanes are a2-a3 and b2-b3.
; Expected output per check prefix (prefixes map to the -mcpu RUN lines at the
; top of the file):
;   SSE2:     2-wide vector fsub, then a shuffle placing its two lanes into
;             result lanes 2-3.
;   SSE4/AVX: a single 4-wide fsub of two shuffles with lanes 0-1 poison.
1569define <4 x double> @sub_v4f64_uu23(<4 x double> %a, <4 x double> %b) {
1570; SSE2-LABEL: @sub_v4f64_uu23(
1571; SSE2-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <2 x i32> <i32 2, i32 6>
1572; SSE2-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 3, i32 7>
1573; SSE2-NEXT:    [[TMP3:%.*]] = fsub <2 x double> [[TMP1]], [[TMP2]]
1574; SSE2-NEXT:    [[RESULT1:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <4 x i32> <i32 poison, i32 poison, i32 0, i32 1>
1575; SSE2-NEXT:    ret <4 x double> [[RESULT1]]
1576;
1577; SSE4-LABEL: @sub_v4f64_uu23(
1578; SSE4-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> <i32 poison, i32 poison, i32 2, i32 6>
1579; SSE4-NEXT:    [[TMP3:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> <i32 poison, i32 poison, i32 3, i32 7>
1580; SSE4-NEXT:    [[RESULT1:%.*]] = fsub <4 x double> [[TMP2]], [[TMP3]]
1581; SSE4-NEXT:    ret <4 x double> [[RESULT1]]
1582;
1583; AVX-LABEL: @sub_v4f64_uu23(
1584; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> <i32 poison, i32 poison, i32 2, i32 6>
1585; AVX-NEXT:    [[TMP3:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> <i32 poison, i32 poison, i32 3, i32 7>
1586; AVX-NEXT:    [[RESULT1:%.*]] = fsub <4 x double> [[TMP2]], [[TMP3]]
1587; AVX-NEXT:    ret <4 x double> [[RESULT1]]
1588;
  ; Scalar differences of adjacent element pairs of %a.
1589  %a0 = extractelement <4 x double> %a, i32 0
1590  %a1 = extractelement <4 x double> %a, i32 1
1591  %a2 = extractelement <4 x double> %a, i32 2
1592  %a3 = extractelement <4 x double> %a, i32 3
1593  %a01 = fsub double %a0, %a1
1594  %a23 = fsub double %a2, %a3
  ; Scalar differences of adjacent element pairs of %b.
1595  %b0 = extractelement <4 x double> %b, i32 0
1596  %b1 = extractelement <4 x double> %b, i32 1
1597  %b2 = extractelement <4 x double> %b, i32 2
1598  %b3 = extractelement <4 x double> %b, i32 3
1599  %b01 = fsub double %b0, %b1
1600  %b23 = fsub double %b2, %b3
  ; Pack the four differences as [a01, b01, a23, b23].
1601  %hsub0 = insertelement <4 x double> poison, double %a01, i32 0
1602  %hsub1 = insertelement <4 x double> %hsub0, double %b01, i32 1
1603  %hsub2 = insertelement <4 x double> %hsub1, double %a23, i32 2
1604  %hsub3 = insertelement <4 x double> %hsub2, double %b23, i32 3
  ; Keep only lanes 2-3; lanes 0-1 become poison, so %a01 and %b01 are never
  ; demanded. Every mask index is below 4, so the second operand %a is not
  ; actually selected.
1605  %result = shufflevector <4 x double> %hsub3, <4 x double> %a, <4 x i32> <i32 poison, i32 poison, i32 2, i32 3>
1606  ret <4 x double> %result
1607}
1608
; sub_v4f64_01uu: horizontal-sub pattern on <4 x double> where only result
; lanes 0 and 1 are demanded; lanes 2 and 3 are left poison by the final
; shuffle. The input packs the pairwise differences as
; [a0-a1, b0-b1, a2-a3, b2-b3], so the demanded lanes are a0-a1 and b0-b1.
; Expected output per check prefix (prefixes map to the -mcpu RUN lines at the
; top of the file):
;   SSE2:     2-wide vector fsub, then a widening shuffle to 4 lanes with
;             lanes 2-3 poison.
;   SSE4/AVX: a single 4-wide fsub of two shuffles with lanes 2-3 poison.
1609define <4 x double> @sub_v4f64_01uu(<4 x double> %a, <4 x double> %b) {
1610; SSE2-LABEL: @sub_v4f64_01uu(
1611; SSE2-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <2 x i32> <i32 0, i32 4>
1612; SSE2-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 1, i32 5>
1613; SSE2-NEXT:    [[TMP3:%.*]] = fsub <2 x double> [[TMP1]], [[TMP2]]
1614; SSE2-NEXT:    [[TMP4:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
1615; SSE2-NEXT:    ret <4 x double> [[TMP4]]
1616;
1617; SSE4-LABEL: @sub_v4f64_01uu(
1618; SSE4-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> <i32 0, i32 4, i32 poison, i32 poison>
1619; SSE4-NEXT:    [[TMP4:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> <i32 1, i32 5, i32 poison, i32 poison>
1620; SSE4-NEXT:    [[TMP3:%.*]] = fsub <4 x double> [[TMP2]], [[TMP4]]
1621; SSE4-NEXT:    ret <4 x double> [[TMP3]]
1622;
1623; AVX-LABEL: @sub_v4f64_01uu(
1624; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> <i32 0, i32 4, i32 poison, i32 poison>
1625; AVX-NEXT:    [[TMP4:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> <i32 1, i32 5, i32 poison, i32 poison>
1626; AVX-NEXT:    [[TMP3:%.*]] = fsub <4 x double> [[TMP2]], [[TMP4]]
1627; AVX-NEXT:    ret <4 x double> [[TMP3]]
1628;
  ; Scalar differences of adjacent element pairs of %a.
1629  %a0 = extractelement <4 x double> %a, i32 0
1630  %a1 = extractelement <4 x double> %a, i32 1
1631  %a2 = extractelement <4 x double> %a, i32 2
1632  %a3 = extractelement <4 x double> %a, i32 3
1633  %a01 = fsub double %a0, %a1
1634  %a23 = fsub double %a2, %a3
  ; Scalar differences of adjacent element pairs of %b.
1635  %b0 = extractelement <4 x double> %b, i32 0
1636  %b1 = extractelement <4 x double> %b, i32 1
1637  %b2 = extractelement <4 x double> %b, i32 2
1638  %b3 = extractelement <4 x double> %b, i32 3
1639  %b01 = fsub double %b0, %b1
1640  %b23 = fsub double %b2, %b3
  ; Pack the four differences as [a01, b01, a23, b23].
1641  %hsub0 = insertelement <4 x double> poison, double %a01, i32 0
1642  %hsub1 = insertelement <4 x double> %hsub0, double %b01, i32 1
1643  %hsub2 = insertelement <4 x double> %hsub1, double %a23, i32 2
1644  %hsub3 = insertelement <4 x double> %hsub2, double %b23, i32 3
  ; Keep only lanes 0-1; lanes 2-3 become poison, so %a23 and %b23 are never
  ; demanded. Every mask index is below 4, so the second operand %a is not
  ; actually selected.
1645  %result = shufflevector <4 x double> %hsub3, <4 x double> %a, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
1646  ret <4 x double> %result
1647}
1648
; sub_v4f64_32u0: horizontal-sub pattern on <4 x double> with a permuted,
; partially undemanded result. Unlike the neighbouring v4f64 tests, the input
; packs the pairwise differences as [a0-a1, a2-a3, b0-b1, b2-b3]; the final
; shuffle mask <3, 2, poison, 0> then yields [b2-b3, b0-b1, poison, a0-a1].
; Expected output per check prefix (prefixes map to the -mcpu RUN lines at the
; top of the file):
;   SSE2: 2-wide vector fsub on shuffles of %b for lanes 0-1, widened to
;         4 lanes, plus a scalar a0-a1 inserted into lane 3.
;   SSE4: 4-wide vector fsub on shuffles of %b (lanes 2-3 poison) plus a
;         scalar a0-a1 inserted into lane 3.
;   AVX:  a single 4-wide fsub of two shuffles of %b and %a.
1649define <4 x double> @sub_v4f64_32u0(<4 x double> %a, <4 x double> %b) {
1650; SSE2-LABEL: @sub_v4f64_32u0(
1651; SSE2-NEXT:    [[A0:%.*]] = extractelement <4 x double> [[A:%.*]], i64 0
1652; SSE2-NEXT:    [[A1:%.*]] = extractelement <4 x double> [[A]], i64 1
1653; SSE2-NEXT:    [[A01:%.*]] = fsub double [[A0]], [[A1]]
1654; SSE2-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[B:%.*]], <4 x double> poison, <2 x i32> <i32 2, i32 0>
1655; SSE2-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[B]], <4 x double> poison, <2 x i32> <i32 3, i32 1>
1656; SSE2-NEXT:    [[TMP3:%.*]] = fsub <2 x double> [[TMP1]], [[TMP2]]
1657; SSE2-NEXT:    [[TMP4:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
1658; SSE2-NEXT:    [[RESULT:%.*]] = insertelement <4 x double> [[TMP4]], double [[A01]], i64 3
1659; SSE2-NEXT:    ret <4 x double> [[RESULT]]
1660;
1661; SSE4-LABEL: @sub_v4f64_32u0(
1662; SSE4-NEXT:    [[A0:%.*]] = extractelement <4 x double> [[A:%.*]], i64 0
1663; SSE4-NEXT:    [[A1:%.*]] = extractelement <4 x double> [[A]], i64 1
1664; SSE4-NEXT:    [[A01:%.*]] = fsub double [[A0]], [[A1]]
1665; SSE4-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[B:%.*]], <4 x double> poison, <4 x i32> <i32 2, i32 0, i32 poison, i32 poison>
1666; SSE4-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[B]], <4 x double> poison, <4 x i32> <i32 3, i32 1, i32 poison, i32 poison>
1667; SSE4-NEXT:    [[RESULT:%.*]] = fsub <4 x double> [[TMP1]], [[TMP2]]
1668; SSE4-NEXT:    [[RESULT1:%.*]] = insertelement <4 x double> [[RESULT]], double [[A01]], i64 3
1669; SSE4-NEXT:    ret <4 x double> [[RESULT1]]
1670;
1671; AVX-LABEL: @sub_v4f64_32u0(
1672; AVX-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[B:%.*]], <4 x double> [[A:%.*]], <4 x i32> <i32 2, i32 0, i32 poison, i32 4>
1673; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[B]], <4 x double> [[A]], <4 x i32> <i32 3, i32 1, i32 poison, i32 5>
1674; AVX-NEXT:    [[RESULT:%.*]] = fsub <4 x double> [[TMP1]], [[TMP2]]
1675; AVX-NEXT:    ret <4 x double> [[RESULT]]
1676;
  ; Scalar differences of adjacent element pairs of %a.
1677  %a0 = extractelement <4 x double> %a, i32 0
1678  %a1 = extractelement <4 x double> %a, i32 1
1679  %a2 = extractelement <4 x double> %a, i32 2
1680  %a3 = extractelement <4 x double> %a, i32 3
1681  %a01 = fsub double %a0, %a1
1682  %a23 = fsub double %a2, %a3
  ; Scalar differences of adjacent element pairs of %b.
1683  %b0 = extractelement <4 x double> %b, i32 0
1684  %b1 = extractelement <4 x double> %b, i32 1
1685  %b2 = extractelement <4 x double> %b, i32 2
1686  %b3 = extractelement <4 x double> %b, i32 3
1687  %b01 = fsub double %b0, %b1
1688  %b23 = fsub double %b2, %b3
  ; Pack the four differences as [a01, a23, b01, b23] (both %a results first).
1689  %hsub0 = insertelement <4 x double> poison, double %a01, i32 0
1690  %hsub1 = insertelement <4 x double> %hsub0, double %a23, i32 1
1691  %hsub2 = insertelement <4 x double> %hsub1, double %b01, i32 2
1692  %hsub3 = insertelement <4 x double> %hsub2, double %b23, i32 3
  ; Select [b23, b01, poison, a01]; lane 1 of %hsub3 (%a23) is never demanded.
  ; Every mask index is below 4, so the second operand %a is not actually
  ; selected.
1693  %result = shufflevector <4 x double> %hsub3, <4 x double> %a, <4 x i32> <i32 3, i32 2, i32 poison, i32 0>
1694  ret <4 x double> %result
1695}
1696