xref: /llvm-project/llvm/test/CodeGen/X86/vector-shuffle-combining.ll (revision e6bf48d11047e970cb24554a01b65b566d6b5d22)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -mcpu=x86-64 -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE2
3; RUN: llc < %s -mcpu=x86-64 -mattr=+ssse3 | FileCheck %s --check-prefixes=CHECK,SSE,SSSE3
4; RUN: llc < %s -mcpu=x86-64 -mattr=+sse4.1 | FileCheck %s --check-prefixes=CHECK,SSE,SSE41
5; RUN: llc < %s -mcpu=x86-64 -mattr=+avx | FileCheck %s --check-prefixes=CHECK,AVX,AVX1
6; RUN: llc < %s -mcpu=x86-64 -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2,AVX2-SLOW
7; RUN: llc < %s -mcpu=x86-64 -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=CHECK,AVX,AVX2,AVX2-FAST,AVX2-FAST-ALL
8; RUN: llc < %s -mcpu=x86-64 -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=CHECK,AVX,AVX2,AVX2-FAST,AVX2-FAST-PERLANE
9;
10; Verify that the DAG combiner correctly folds bitwise operations across
11; shuffles, nested shuffles with undef, pairs of nested shuffles, and other
12; basic and always-safe patterns. Also test that the DAG combiner will combine
13; target-specific shuffle instructions where reasonable.
14
15target triple = "x86_64-unknown-unknown"
16
17declare <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32>, i8)
18declare <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16>, i8)
19declare <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16>, i8)
20
21define <4 x i32> @combine_pshufd1(<4 x i32> %a) {
22; CHECK-LABEL: combine_pshufd1:
23; CHECK:       # %bb.0: # %entry
24; CHECK-NEXT:    retq
25entry:
26  %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 27)
27  %c = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %b, i8 27)
28  ret <4 x i32> %c
29}
30
31define <4 x i32> @combine_pshufd2(<4 x i32> %a) {
32; CHECK-LABEL: combine_pshufd2:
33; CHECK:       # %bb.0: # %entry
34; CHECK-NEXT:    retq
35entry:
36  %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 27)
37  %b.cast = bitcast <4 x i32> %b to <8 x i16>
38  %c = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %b.cast, i8 -28)
39  %c.cast = bitcast <8 x i16> %c to <4 x i32>
40  %d = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %c.cast, i8 27)
41  ret <4 x i32> %d
42}
43
44define <4 x i32> @combine_pshufd3(<4 x i32> %a) {
45; CHECK-LABEL: combine_pshufd3:
46; CHECK:       # %bb.0: # %entry
47; CHECK-NEXT:    retq
48entry:
49  %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 27)
50  %b.cast = bitcast <4 x i32> %b to <8 x i16>
51  %c = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %b.cast, i8 -28)
52  %c.cast = bitcast <8 x i16> %c to <4 x i32>
53  %d = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %c.cast, i8 27)
54  ret <4 x i32> %d
55}
56
57define <4 x i32> @combine_pshufd4(<4 x i32> %a) {
58; SSE-LABEL: combine_pshufd4:
59; SSE:       # %bb.0: # %entry
60; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
61; SSE-NEXT:    retq
62;
63; AVX-LABEL: combine_pshufd4:
64; AVX:       # %bb.0: # %entry
65; AVX-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
66; AVX-NEXT:    retq
67entry:
68  %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 -31)
69  %b.cast = bitcast <4 x i32> %b to <8 x i16>
70  %c = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %b.cast, i8 27)
71  %c.cast = bitcast <8 x i16> %c to <4 x i32>
72  %d = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %c.cast, i8 -31)
73  ret <4 x i32> %d
74}
75
76define <4 x i32> @combine_pshufd5(<4 x i32> %a) {
77; SSE-LABEL: combine_pshufd5:
78; SSE:       # %bb.0: # %entry
79; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
80; SSE-NEXT:    retq
81;
82; AVX-LABEL: combine_pshufd5:
83; AVX:       # %bb.0: # %entry
84; AVX-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
85; AVX-NEXT:    retq
86entry:
87  %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 -76)
88  %b.cast = bitcast <4 x i32> %b to <8 x i16>
89  %c = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %b.cast, i8 27)
90  %c.cast = bitcast <8 x i16> %c to <4 x i32>
91  %d = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %c.cast, i8 -76)
92  ret <4 x i32> %d
93}
94
95define <4 x i32> @combine_pshufd6(<4 x i32> %a) {
96; SSE-LABEL: combine_pshufd6:
97; SSE:       # %bb.0: # %entry
98; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
99; SSE-NEXT:    retq
100;
101; AVX1-LABEL: combine_pshufd6:
102; AVX1:       # %bb.0: # %entry
103; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,0,0,0]
104; AVX1-NEXT:    retq
105;
106; AVX2-LABEL: combine_pshufd6:
107; AVX2:       # %bb.0: # %entry
108; AVX2-NEXT:    vbroadcastss %xmm0, %xmm0
109; AVX2-NEXT:    retq
110entry:
111  %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 0)
112  %c = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %b, i8 8)
113  ret <4 x i32> %c
114}
115
116define <8 x i16> @combine_pshuflw1(<8 x i16> %a) {
117; CHECK-LABEL: combine_pshuflw1:
118; CHECK:       # %bb.0: # %entry
119; CHECK-NEXT:    retq
120entry:
121  %b = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %a, i8 27)
122  %c = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %b, i8 27)
123  ret <8 x i16> %c
124}
125
126define <8 x i16> @combine_pshuflw2(<8 x i16> %a) {
127; CHECK-LABEL: combine_pshuflw2:
128; CHECK:       # %bb.0: # %entry
129; CHECK-NEXT:    retq
130entry:
131  %b = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %a, i8 27)
132  %c = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %b, i8 -28)
133  %d = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %c, i8 27)
134  ret <8 x i16> %d
135}
136
137define <8 x i16> @combine_pshuflw3(<8 x i16> %a) {
138; SSE-LABEL: combine_pshuflw3:
139; SSE:       # %bb.0: # %entry
140; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
141; SSE-NEXT:    retq
142;
143; AVX-LABEL: combine_pshuflw3:
144; AVX:       # %bb.0: # %entry
145; AVX-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
146; AVX-NEXT:    retq
147entry:
148  %b = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %a, i8 27)
149  %c = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %b, i8 27)
150  %d = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %c, i8 27)
151  ret <8 x i16> %d
152}
153
154define <8 x i16> @combine_pshufhw1(<8 x i16> %a) {
155; SSE-LABEL: combine_pshufhw1:
156; SSE:       # %bb.0: # %entry
157; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
158; SSE-NEXT:    retq
159;
160; AVX-LABEL: combine_pshufhw1:
161; AVX:       # %bb.0: # %entry
162; AVX-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
163; AVX-NEXT:    retq
164entry:
165  %b = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %a, i8 27)
166  %c = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %b, i8 27)
167  %d = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %c, i8 27)
168  ret <8 x i16> %d
169}
170
171define <4 x i32> @combine_bitwise_ops_test1(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
172; SSE-LABEL: combine_bitwise_ops_test1:
173; SSE:       # %bb.0:
174; SSE-NEXT:    pand %xmm1, %xmm0
175; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
176; SSE-NEXT:    retq
177;
178; AVX-LABEL: combine_bitwise_ops_test1:
179; AVX:       # %bb.0:
180; AVX-NEXT:    vandps %xmm1, %xmm0, %xmm0
181; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
182; AVX-NEXT:    retq
183  %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3>
184  %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3>
185  %and = and <4 x i32> %shuf1, %shuf2
186  ret <4 x i32> %and
187}
188
189define <4 x i32> @combine_bitwise_ops_test2(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
190; SSE-LABEL: combine_bitwise_ops_test2:
191; SSE:       # %bb.0:
192; SSE-NEXT:    por %xmm1, %xmm0
193; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
194; SSE-NEXT:    retq
195;
196; AVX-LABEL: combine_bitwise_ops_test2:
197; AVX:       # %bb.0:
198; AVX-NEXT:    vorps %xmm1, %xmm0, %xmm0
199; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
200; AVX-NEXT:    retq
201  %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3>
202  %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3>
203  %or = or <4 x i32> %shuf1, %shuf2
204  ret <4 x i32> %or
205}
206
207define <4 x i32> @combine_bitwise_ops_test3(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
208; SSE-LABEL: combine_bitwise_ops_test3:
209; SSE:       # %bb.0:
210; SSE-NEXT:    pxor %xmm1, %xmm0
211; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
212; SSE-NEXT:    retq
213;
214; AVX-LABEL: combine_bitwise_ops_test3:
215; AVX:       # %bb.0:
216; AVX-NEXT:    vxorps %xmm1, %xmm0, %xmm0
217; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
218; AVX-NEXT:    retq
219  %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3>
220  %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3>
221  %xor = xor <4 x i32> %shuf1, %shuf2
222  ret <4 x i32> %xor
223}
224
225define <4 x i32> @combine_bitwise_ops_test4(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
226; SSE-LABEL: combine_bitwise_ops_test4:
227; SSE:       # %bb.0:
228; SSE-NEXT:    pand %xmm1, %xmm0
229; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
230; SSE-NEXT:    retq
231;
232; AVX-LABEL: combine_bitwise_ops_test4:
233; AVX:       # %bb.0:
234; AVX-NEXT:    vandps %xmm1, %xmm0, %xmm0
235; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
236; AVX-NEXT:    retq
237  %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 4, i32 6, i32 5, i32 7>
238  %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 4, i32 6, i32 5, i32 7>
239  %and = and <4 x i32> %shuf1, %shuf2
240  ret <4 x i32> %and
241}
242
243define <4 x i32> @combine_bitwise_ops_test5(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
244; SSE-LABEL: combine_bitwise_ops_test5:
245; SSE:       # %bb.0:
246; SSE-NEXT:    por %xmm1, %xmm0
247; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
248; SSE-NEXT:    retq
249;
250; AVX-LABEL: combine_bitwise_ops_test5:
251; AVX:       # %bb.0:
252; AVX-NEXT:    vorps %xmm1, %xmm0, %xmm0
253; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
254; AVX-NEXT:    retq
255  %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 4, i32 6, i32 5, i32 7>
256  %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 4, i32 6, i32 5, i32 7>
257  %or = or <4 x i32> %shuf1, %shuf2
258  ret <4 x i32> %or
259}
260
261define <4 x i32> @combine_bitwise_ops_test6(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
262; SSE-LABEL: combine_bitwise_ops_test6:
263; SSE:       # %bb.0:
264; SSE-NEXT:    pxor %xmm1, %xmm0
265; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
266; SSE-NEXT:    retq
267;
268; AVX-LABEL: combine_bitwise_ops_test6:
269; AVX:       # %bb.0:
270; AVX-NEXT:    vxorps %xmm1, %xmm0, %xmm0
271; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
272; AVX-NEXT:    retq
273  %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 4, i32 6, i32 5, i32 7>
274  %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 4, i32 6, i32 5, i32 7>
275  %xor = xor <4 x i32> %shuf1, %shuf2
276  ret <4 x i32> %xor
277}
278
279
280; Verify that DAGCombiner moves the shuffle after the xor/and/or even if shuffles
281; are not performing a swizzle operations.
282
283define <4 x i32> @combine_bitwise_ops_test1b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
284; SSE2-LABEL: combine_bitwise_ops_test1b:
285; SSE2:       # %bb.0:
286; SSE2-NEXT:    pand %xmm1, %xmm0
287; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
288; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3]
289; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
290; SSE2-NEXT:    retq
291;
292; SSSE3-LABEL: combine_bitwise_ops_test1b:
293; SSSE3:       # %bb.0:
294; SSSE3-NEXT:    pand %xmm1, %xmm0
295; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
296; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3]
297; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
298; SSSE3-NEXT:    retq
299;
300; SSE41-LABEL: combine_bitwise_ops_test1b:
301; SSE41:       # %bb.0:
302; SSE41-NEXT:    andps %xmm1, %xmm0
303; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
304; SSE41-NEXT:    retq
305;
306; AVX-LABEL: combine_bitwise_ops_test1b:
307; AVX:       # %bb.0:
308; AVX-NEXT:    vandps %xmm1, %xmm0, %xmm0
309; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
310; AVX-NEXT:    retq
311  %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7>
312  %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7>
313  %and = and <4 x i32> %shuf1, %shuf2
314  ret <4 x i32> %and
315}
316
317define <4 x i32> @combine_bitwise_ops_test2b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
318; SSE2-LABEL: combine_bitwise_ops_test2b:
319; SSE2:       # %bb.0:
320; SSE2-NEXT:    por %xmm1, %xmm0
321; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
322; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3]
323; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
324; SSE2-NEXT:    retq
325;
326; SSSE3-LABEL: combine_bitwise_ops_test2b:
327; SSSE3:       # %bb.0:
328; SSSE3-NEXT:    por %xmm1, %xmm0
329; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
330; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3]
331; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
332; SSSE3-NEXT:    retq
333;
334; SSE41-LABEL: combine_bitwise_ops_test2b:
335; SSE41:       # %bb.0:
336; SSE41-NEXT:    orps %xmm1, %xmm0
337; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
338; SSE41-NEXT:    retq
339;
340; AVX-LABEL: combine_bitwise_ops_test2b:
341; AVX:       # %bb.0:
342; AVX-NEXT:    vorps %xmm1, %xmm0, %xmm0
343; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
344; AVX-NEXT:    retq
345  %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7>
346  %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7>
347  %or = or <4 x i32> %shuf1, %shuf2
348  ret <4 x i32> %or
349}
350
351define <4 x i32> @combine_bitwise_ops_test3b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
352; SSE2-LABEL: combine_bitwise_ops_test3b:
353; SSE2:       # %bb.0:
354; SSE2-NEXT:    xorps %xmm1, %xmm0
355; SSE2-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
356; SSE2-NEXT:    retq
357;
358; SSSE3-LABEL: combine_bitwise_ops_test3b:
359; SSSE3:       # %bb.0:
360; SSSE3-NEXT:    xorps %xmm1, %xmm0
361; SSSE3-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
362; SSSE3-NEXT:    retq
363;
364; SSE41-LABEL: combine_bitwise_ops_test3b:
365; SSE41:       # %bb.0:
366; SSE41-NEXT:    xorps %xmm1, %xmm0
367; SSE41-NEXT:    xorps %xmm1, %xmm1
368; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
369; SSE41-NEXT:    retq
370;
371; AVX-LABEL: combine_bitwise_ops_test3b:
372; AVX:       # %bb.0:
373; AVX-NEXT:    vxorps %xmm1, %xmm0, %xmm0
374; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
375; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
376; AVX-NEXT:    retq
377  %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7>
378  %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7>
379  %xor = xor <4 x i32> %shuf1, %shuf2
380  ret <4 x i32> %xor
381}
382
383define <4 x i32> @combine_bitwise_ops_test4b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
384; SSE2-LABEL: combine_bitwise_ops_test4b:
385; SSE2:       # %bb.0:
386; SSE2-NEXT:    pand %xmm1, %xmm0
387; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3]
388; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
389; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
390; SSE2-NEXT:    retq
391;
392; SSSE3-LABEL: combine_bitwise_ops_test4b:
393; SSSE3:       # %bb.0:
394; SSSE3-NEXT:    pand %xmm1, %xmm0
395; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3]
396; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
397; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
398; SSSE3-NEXT:    retq
399;
400; SSE41-LABEL: combine_bitwise_ops_test4b:
401; SSE41:       # %bb.0:
402; SSE41-NEXT:    andps %xmm1, %xmm0
403; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3]
404; SSE41-NEXT:    retq
405;
406; AVX-LABEL: combine_bitwise_ops_test4b:
407; AVX:       # %bb.0:
408; AVX-NEXT:    vandps %xmm1, %xmm0, %xmm0
409; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3]
410; AVX-NEXT:    retq
411  %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 5, i32 2, i32 7>
412  %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 5, i32 2, i32 7>
413  %and = and <4 x i32> %shuf1, %shuf2
414  ret <4 x i32> %and
415}
416
417define <4 x i32> @combine_bitwise_ops_test5b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
418; SSE2-LABEL: combine_bitwise_ops_test5b:
419; SSE2:       # %bb.0:
420; SSE2-NEXT:    por %xmm1, %xmm0
421; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3]
422; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
423; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
424; SSE2-NEXT:    retq
425;
426; SSSE3-LABEL: combine_bitwise_ops_test5b:
427; SSSE3:       # %bb.0:
428; SSSE3-NEXT:    por %xmm1, %xmm0
429; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3]
430; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
431; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
432; SSSE3-NEXT:    retq
433;
434; SSE41-LABEL: combine_bitwise_ops_test5b:
435; SSE41:       # %bb.0:
436; SSE41-NEXT:    orps %xmm1, %xmm0
437; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3]
438; SSE41-NEXT:    retq
439;
440; AVX-LABEL: combine_bitwise_ops_test5b:
441; AVX:       # %bb.0:
442; AVX-NEXT:    vorps %xmm1, %xmm0, %xmm0
443; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3]
444; AVX-NEXT:    retq
445  %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 5, i32 2, i32 7>
446  %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 5, i32 2, i32 7>
447  %or = or <4 x i32> %shuf1, %shuf2
448  ret <4 x i32> %or
449}
450
451define <4 x i32> @combine_bitwise_ops_test6b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
452; SSE2-LABEL: combine_bitwise_ops_test6b:
453; SSE2:       # %bb.0:
454; SSE2-NEXT:    xorps %xmm1, %xmm0
455; SSE2-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
456; SSE2-NEXT:    retq
457;
458; SSSE3-LABEL: combine_bitwise_ops_test6b:
459; SSSE3:       # %bb.0:
460; SSSE3-NEXT:    xorps %xmm1, %xmm0
461; SSSE3-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
462; SSSE3-NEXT:    retq
463;
464; SSE41-LABEL: combine_bitwise_ops_test6b:
465; SSE41:       # %bb.0:
466; SSE41-NEXT:    xorps %xmm1, %xmm0
467; SSE41-NEXT:    xorps %xmm1, %xmm1
468; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3]
469; SSE41-NEXT:    retq
470;
471; AVX-LABEL: combine_bitwise_ops_test6b:
472; AVX:       # %bb.0:
473; AVX-NEXT:    vxorps %xmm1, %xmm0, %xmm0
474; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
475; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3]
476; AVX-NEXT:    retq
477  %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 5, i32 2, i32 7>
478  %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 5, i32 2, i32 7>
479  %xor = xor <4 x i32> %shuf1, %shuf2
480  ret <4 x i32> %xor
481}
482
483define <4 x i32> @combine_bitwise_ops_test1c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
484; SSE-LABEL: combine_bitwise_ops_test1c:
485; SSE:       # %bb.0:
486; SSE-NEXT:    andps %xmm1, %xmm0
487; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[1,3]
488; SSE-NEXT:    retq
489;
490; AVX-LABEL: combine_bitwise_ops_test1c:
491; AVX:       # %bb.0:
492; AVX-NEXT:    vandps %xmm1, %xmm0, %xmm0
493; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[1,3]
494; AVX-NEXT:    retq
495  %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7>
496  %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7>
497  %and = and <4 x i32> %shuf1, %shuf2
498  ret <4 x i32> %and
499}
500
501define <4 x i32> @combine_bitwise_ops_test2c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
502; SSE-LABEL: combine_bitwise_ops_test2c:
503; SSE:       # %bb.0:
504; SSE-NEXT:    orps %xmm1, %xmm0
505; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[1,3]
506; SSE-NEXT:    retq
507;
508; AVX-LABEL: combine_bitwise_ops_test2c:
509; AVX:       # %bb.0:
510; AVX-NEXT:    vorps %xmm1, %xmm0, %xmm0
511; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[1,3]
512; AVX-NEXT:    retq
513  %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7>
514  %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7>
515  %or = or <4 x i32> %shuf1, %shuf2
516  ret <4 x i32> %or
517}
518
519define <4 x i32> @combine_bitwise_ops_test3c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
520; SSE2-LABEL: combine_bitwise_ops_test3c:
521; SSE2:       # %bb.0:
522; SSE2-NEXT:    xorps %xmm1, %xmm0
523; SSE2-NEXT:    xorps %xmm1, %xmm1
524; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3]
525; SSE2-NEXT:    retq
526;
527; SSSE3-LABEL: combine_bitwise_ops_test3c:
528; SSSE3:       # %bb.0:
529; SSSE3-NEXT:    xorps %xmm1, %xmm0
530; SSSE3-NEXT:    xorps %xmm1, %xmm1
531; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3]
532; SSSE3-NEXT:    retq
533;
534; SSE41-LABEL: combine_bitwise_ops_test3c:
535; SSE41:       # %bb.0:
536; SSE41-NEXT:    xorps %xmm1, %xmm0
537; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
538; SSE41-NEXT:    retq
539;
540; AVX-LABEL: combine_bitwise_ops_test3c:
541; AVX:       # %bb.0:
542; AVX-NEXT:    vxorps %xmm1, %xmm0, %xmm0
543; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
544; AVX-NEXT:    retq
545  %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7>
546  %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7>
547  %xor = xor <4 x i32> %shuf1, %shuf2
548  ret <4 x i32> %xor
549}
550
551define <4 x i32> @combine_bitwise_ops_test4c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
552; SSE-LABEL: combine_bitwise_ops_test4c:
553; SSE:       # %bb.0:
554; SSE-NEXT:    andps %xmm1, %xmm0
555; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2],xmm0[1,3]
556; SSE-NEXT:    movaps %xmm2, %xmm0
557; SSE-NEXT:    retq
558;
559; AVX-LABEL: combine_bitwise_ops_test4c:
560; AVX:       # %bb.0:
561; AVX-NEXT:    vandps %xmm1, %xmm0, %xmm0
562; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm2[0,2],xmm0[1,3]
563; AVX-NEXT:    retq
564  %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 2, i32 5, i32 7>
565  %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 2, i32 5, i32 7>
566  %and = and <4 x i32> %shuf1, %shuf2
567  ret <4 x i32> %and
568}
569
570define <4 x i32> @combine_bitwise_ops_test5c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
571; SSE-LABEL: combine_bitwise_ops_test5c:
572; SSE:       # %bb.0:
573; SSE-NEXT:    orps %xmm1, %xmm0
574; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2],xmm0[1,3]
575; SSE-NEXT:    movaps %xmm2, %xmm0
576; SSE-NEXT:    retq
577;
578; AVX-LABEL: combine_bitwise_ops_test5c:
579; AVX:       # %bb.0:
580; AVX-NEXT:    vorps %xmm1, %xmm0, %xmm0
581; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm2[0,2],xmm0[1,3]
582; AVX-NEXT:    retq
583  %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 2, i32 5, i32 7>
584  %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 2, i32 5, i32 7>
585  %or = or <4 x i32> %shuf1, %shuf2
586  ret <4 x i32> %or
587}
588
589define <4 x i32> @combine_bitwise_ops_test6c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
590; SSE2-LABEL: combine_bitwise_ops_test6c:
591; SSE2:       # %bb.0:
592; SSE2-NEXT:    xorps %xmm1, %xmm0
593; SSE2-NEXT:    xorps %xmm1, %xmm1
594; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[1,3]
595; SSE2-NEXT:    movaps %xmm1, %xmm0
596; SSE2-NEXT:    retq
597;
598; SSSE3-LABEL: combine_bitwise_ops_test6c:
599; SSSE3:       # %bb.0:
600; SSSE3-NEXT:    xorps %xmm1, %xmm0
601; SSSE3-NEXT:    xorps %xmm1, %xmm1
602; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[1,3]
603; SSSE3-NEXT:    movaps %xmm1, %xmm0
604; SSSE3-NEXT:    retq
605;
606; SSE41-LABEL: combine_bitwise_ops_test6c:
607; SSE41:       # %bb.0:
608; SSE41-NEXT:    xorps %xmm1, %xmm0
609; SSE41-NEXT:    insertps {{.*#+}} xmm0 = zero,zero,xmm0[1,3]
610; SSE41-NEXT:    retq
611;
612; AVX-LABEL: combine_bitwise_ops_test6c:
613; AVX:       # %bb.0:
614; AVX-NEXT:    vxorps %xmm1, %xmm0, %xmm0
615; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = zero,zero,xmm0[1,3]
616; AVX-NEXT:    retq
617  %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 2, i32 5, i32 7>
618  %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 2, i32 5, i32 7>
619  %xor = xor <4 x i32> %shuf1, %shuf2
620  ret <4 x i32> %xor
621}
622
623define <4 x i32> @combine_nested_undef_test1(<4 x i32> %A, <4 x i32> %B) {
624; SSE-LABEL: combine_nested_undef_test1:
625; SSE:       # %bb.0:
626; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,1,0,1]
627; SSE-NEXT:    retq
628;
629; AVX-LABEL: combine_nested_undef_test1:
630; AVX:       # %bb.0:
631; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[3,1,0,1]
632; AVX-NEXT:    retq
633  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 4, i32 3, i32 1>
634  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 4, i32 0, i32 3>
635  ret <4 x i32> %2
636}
637
638define <4 x i32> @combine_nested_undef_test2(<4 x i32> %A, <4 x i32> %B) {
639; SSE-LABEL: combine_nested_undef_test2:
640; SSE:       # %bb.0:
641; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,1,0,3]
642; SSE-NEXT:    retq
643;
644; AVX-LABEL: combine_nested_undef_test2:
645; AVX:       # %bb.0:
646; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[2,1,0,3]
647; AVX-NEXT:    retq
648  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 5, i32 2, i32 3>
649  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 4, i32 0, i32 3>
650  ret <4 x i32> %2
651}
652
653define <4 x i32> @combine_nested_undef_test3(<4 x i32> %A, <4 x i32> %B) {
654; SSE-LABEL: combine_nested_undef_test3:
655; SSE:       # %bb.0:
656; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,1,0,3]
657; SSE-NEXT:    retq
658;
659; AVX-LABEL: combine_nested_undef_test3:
660; AVX:       # %bb.0:
661; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[2,1,0,3]
662; AVX-NEXT:    retq
663  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 6, i32 2, i32 3>
664  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 4, i32 0, i32 3>
665  ret <4 x i32> %2
666}
667
668define <4 x i32> @combine_nested_undef_test4(<4 x i32> %A, <4 x i32> %B) {
669; SSE-LABEL: combine_nested_undef_test4:
670; SSE:       # %bb.0:
671; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
672; SSE-NEXT:    retq
673;
674; AVX1-LABEL: combine_nested_undef_test4:
675; AVX1:       # %bb.0:
676; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,1,0,1]
677; AVX1-NEXT:    retq
678;
679; AVX2-LABEL: combine_nested_undef_test4:
680; AVX2:       # %bb.0:
681; AVX2-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
682; AVX2-NEXT:    retq
683  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 4, i32 7, i32 1>
684  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 4, i32 4, i32 0, i32 3>
685  ret <4 x i32> %2
686}
687
688define <4 x i32> @combine_nested_undef_test5(<4 x i32> %A, <4 x i32> %B) {
689; SSE-LABEL: combine_nested_undef_test5:
690; SSE:       # %bb.0:
691; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
692; SSE-NEXT:    retq
693;
694; AVX-LABEL: combine_nested_undef_test5:
695; AVX:       # %bb.0:
696; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3]
697; AVX-NEXT:    retq
698  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 5, i32 5, i32 2, i32 3>
699  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 4, i32 4, i32 3>
700  ret <4 x i32> %2
701}
702
703define <4 x i32> @combine_nested_undef_test6(<4 x i32> %A, <4 x i32> %B) {
704; SSE-LABEL: combine_nested_undef_test6:
705; SSE:       # %bb.0:
706; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
707; SSE-NEXT:    retq
708;
709; AVX-LABEL: combine_nested_undef_test6:
710; AVX:       # %bb.0:
711; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[2,3,0,1]
712; AVX-NEXT:    retq
713  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 6, i32 2, i32 4>
714  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 4, i32 0, i32 4>
715  ret <4 x i32> %2
716}
717
718define <4 x i32> @combine_nested_undef_test7(<4 x i32> %A, <4 x i32> %B) {
719; SSE-LABEL: combine_nested_undef_test7:
720; SSE:       # %bb.0:
721; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,0,2]
722; SSE-NEXT:    retq
723;
724; AVX-LABEL: combine_nested_undef_test7:
725; AVX:       # %bb.0:
726; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2,0,2]
727; AVX-NEXT:    retq
728  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
729  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 2, i32 0, i32 2>
730  ret <4 x i32> %2
731}
732
733define <4 x i32> @combine_nested_undef_test8(<4 x i32> %A, <4 x i32> %B) {
734; SSE-LABEL: combine_nested_undef_test8:
735; SSE:       # %bb.0:
736; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
737; SSE-NEXT:    retq
738;
739; AVX-LABEL: combine_nested_undef_test8:
740; AVX:       # %bb.0:
741; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,1,3,3]
742; AVX-NEXT:    retq
743  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
744  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 1, i32 4, i32 3, i32 4>
745  ret <4 x i32> %2
746}
747
748define <4 x i32> @combine_nested_undef_test9(<4 x i32> %A, <4 x i32> %B) {
749; SSE-LABEL: combine_nested_undef_test9:
750; SSE:       # %bb.0:
751; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,3,2,2]
752; SSE-NEXT:    retq
753;
754; AVX-LABEL: combine_nested_undef_test9:
755; AVX:       # %bb.0:
756; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,3,2,2]
757; AVX-NEXT:    retq
758  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 1, i32 3, i32 2, i32 5>
759  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 4, i32 2>
760  ret <4 x i32> %2
761}
762
763define <4 x i32> @combine_nested_undef_test10(<4 x i32> %A, <4 x i32> %B) {
764; SSE-LABEL: combine_nested_undef_test10:
765; SSE:       # %bb.0:
766; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
767; SSE-NEXT:    retq
768;
769; AVX-LABEL: combine_nested_undef_test10:
770; AVX:       # %bb.0:
771; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
772; AVX-NEXT:    retq
773  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 1, i32 1, i32 5, i32 5>
774  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 4, i32 1, i32 4>
775  ret <4 x i32> %2
776}
777
778define <4 x i32> @combine_nested_undef_test11(<4 x i32> %A, <4 x i32> %B) {
779; SSE-LABEL: combine_nested_undef_test11:
780; SSE:       # %bb.0:
781; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,1]
782; SSE-NEXT:    retq
783;
784; AVX-LABEL: combine_nested_undef_test11:
785; AVX:       # %bb.0:
786; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,1,2,1]
787; AVX-NEXT:    retq
788  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 1, i32 2, i32 5, i32 4>
789  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 4, i32 1, i32 0>
790  ret <4 x i32> %2
791}
792
793define <4 x i32> @combine_nested_undef_test12(<4 x i32> %A, <4 x i32> %B) {
794; SSE-LABEL: combine_nested_undef_test12:
795; SSE:       # %bb.0:
796; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
797; SSE-NEXT:    retq
798;
799; AVX1-LABEL: combine_nested_undef_test12:
800; AVX1:       # %bb.0:
801; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,1,0,1]
802; AVX1-NEXT:    retq
803;
804; AVX2-LABEL: combine_nested_undef_test12:
805; AVX2:       # %bb.0:
806; AVX2-NEXT:    vbroadcastss %xmm0, %xmm0
807; AVX2-NEXT:    retq
808  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 0, i32 2, i32 4>
809  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 1, i32 4, i32 0, i32 4>
810  ret <4 x i32> %2
811}
812
813; The following pair of shuffles is folded into vector %A.
814define <4 x i32> @combine_nested_undef_test13(<4 x i32> %A, <4 x i32> %B) {
815; CHECK-LABEL: combine_nested_undef_test13:
816; CHECK:       # %bb.0:
817; CHECK-NEXT:    retq
818  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 1, i32 4, i32 2, i32 6>
819  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 4, i32 0, i32 2, i32 4>
820  ret <4 x i32> %2
821}
822
823; The following pair of shuffles is folded into vector %B.
824define <4 x i32> @combine_nested_undef_test14(<4 x i32> %A, <4 x i32> %B) {
825; SSE-LABEL: combine_nested_undef_test14:
826; SSE:       # %bb.0:
827; SSE-NEXT:    movaps %xmm1, %xmm0
828; SSE-NEXT:    retq
829;
830; AVX-LABEL: combine_nested_undef_test14:
831; AVX:       # %bb.0:
832; AVX-NEXT:    vmovaps %xmm1, %xmm0
833; AVX-NEXT:    retq
834  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 6, i32 2, i32 4>
835  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 3, i32 4, i32 1, i32 4>
836  ret <4 x i32> %2
837}
838
839
840; Verify that we don't optimize the following cases. We expect more than one shuffle.
841;
842; FIXME: Many of these already don't make sense, and the rest should stop
843; making sense with th enew vector shuffle lowering. Revisit at least testing for
844; it.
845
846define <4 x i32> @combine_nested_undef_test15(<4 x i32> %A, <4 x i32> %B) {
847; SSE2-LABEL: combine_nested_undef_test15:
848; SSE2:       # %bb.0:
849; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0]
850; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[0,1]
851; SSE2-NEXT:    movaps %xmm1, %xmm0
852; SSE2-NEXT:    retq
853;
854; SSSE3-LABEL: combine_nested_undef_test15:
855; SSSE3:       # %bb.0:
856; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0]
857; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[0,1]
858; SSSE3-NEXT:    movaps %xmm1, %xmm0
859; SSSE3-NEXT:    retq
860;
861; SSE41-LABEL: combine_nested_undef_test15:
862; SSE41:       # %bb.0:
863; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
864; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,1,0,1]
865; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
866; SSE41-NEXT:    retq
867;
868; AVX1-LABEL: combine_nested_undef_test15:
869; AVX1:       # %bb.0:
870; AVX1-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[0,0,1,1]
871; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[3,1,0,1]
872; AVX1-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
873; AVX1-NEXT:    retq
874;
875; AVX2-LABEL: combine_nested_undef_test15:
876; AVX2:       # %bb.0:
877; AVX2-NEXT:    vbroadcastss %xmm1, %xmm1
878; AVX2-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[3,1,0,1]
879; AVX2-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
880; AVX2-NEXT:    retq
881  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 4, i32 3, i32 1>
882  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 1, i32 0, i32 3>
883  ret <4 x i32> %2
884}
885
886define <4 x i32> @combine_nested_undef_test16(<4 x i32> %A, <4 x i32> %B) {
887; SSE2-LABEL: combine_nested_undef_test16:
888; SSE2:       # %bb.0:
889; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
890; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,0,2,3]
891; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
892; SSE2-NEXT:    retq
893;
894; SSSE3-LABEL: combine_nested_undef_test16:
895; SSSE3:       # %bb.0:
896; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
897; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,0,2,3]
898; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
899; SSSE3-NEXT:    retq
900;
901; SSE41-LABEL: combine_nested_undef_test16:
902; SSE41:       # %bb.0:
903; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
904; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
905; SSE41-NEXT:    retq
906;
907; AVX-LABEL: combine_nested_undef_test16:
908; AVX:       # %bb.0:
909; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[2,3,0,1]
910; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
911; AVX-NEXT:    retq
912  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
913  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 1, i32 0, i32 3>
914  ret <4 x i32> %2
915}
916
917define <4 x i32> @combine_nested_undef_test17(<4 x i32> %A, <4 x i32> %B) {
918; SSE2-LABEL: combine_nested_undef_test17:
919; SSE2:       # %bb.0:
920; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[1,0]
921; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,1],xmm1[0,2]
922; SSE2-NEXT:    retq
923;
924; SSSE3-LABEL: combine_nested_undef_test17:
925; SSSE3:       # %bb.0:
926; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[1,0]
927; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,1],xmm1[0,2]
928; SSSE3-NEXT:    retq
929;
930; SSE41-LABEL: combine_nested_undef_test17:
931; SSE41:       # %bb.0:
932; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5,6,7]
933; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,1,0,1]
934; SSE41-NEXT:    retq
935;
936; AVX-LABEL: combine_nested_undef_test17:
937; AVX:       # %bb.0:
938; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
939; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[3,1,0,1]
940; AVX-NEXT:    retq
941  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 1, i32 3, i32 1>
942  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 1, i32 0, i32 3>
943  ret <4 x i32> %2
944}
945
946define <4 x i32> @combine_nested_undef_test18(<4 x i32> %A, <4 x i32> %B) {
947; SSE-LABEL: combine_nested_undef_test18:
948; SSE:       # %bb.0:
949; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,0,3]
950; SSE-NEXT:    retq
951;
952; AVX-LABEL: combine_nested_undef_test18:
953; AVX:       # %bb.0:
954; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm1[1,1,0,3]
955; AVX-NEXT:    retq
956  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 5, i32 2, i32 7>
957  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 0, i32 3>
958  ret <4 x i32> %2
959}
960
961define <4 x i32> @combine_nested_undef_test19(<4 x i32> %A, <4 x i32> %B) {
962; SSE2-LABEL: combine_nested_undef_test19:
963; SSE2:       # %bb.0:
964; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
965; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,0,0,0]
966; SSE2-NEXT:    retq
967;
968; SSSE3-LABEL: combine_nested_undef_test19:
969; SSSE3:       # %bb.0:
970; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
971; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,0,0,0]
972; SSSE3-NEXT:    retq
973;
974; SSE41-LABEL: combine_nested_undef_test19:
975; SSE41:       # %bb.0:
976; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
977; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,0,0,0]
978; SSE41-NEXT:    retq
979;
980; AVX-LABEL: combine_nested_undef_test19:
981; AVX:       # %bb.0:
982; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
983; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,0,0,0]
984; AVX-NEXT:    retq
985  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 4, i32 5, i32 6>
986  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 0, i32 0, i32 0>
987  ret <4 x i32> %2
988}
989
990define <4 x i32> @combine_nested_undef_test20(<4 x i32> %A, <4 x i32> %B) {
991; SSE2-LABEL: combine_nested_undef_test20:
992; SSE2:       # %bb.0:
993; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,3]
994; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2,3,1]
995; SSE2-NEXT:    movaps %xmm1, %xmm0
996; SSE2-NEXT:    retq
997;
998; SSSE3-LABEL: combine_nested_undef_test20:
999; SSSE3:       # %bb.0:
1000; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,3]
1001; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2,3,1]
1002; SSSE3-NEXT:    movaps %xmm1, %xmm0
1003; SSSE3-NEXT:    retq
1004;
1005; SSE41-LABEL: combine_nested_undef_test20:
1006; SSE41:       # %bb.0:
1007; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
1008; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,3,0]
1009; SSE41-NEXT:    retq
1010;
1011; AVX-LABEL: combine_nested_undef_test20:
1012; AVX:       # %bb.0:
1013; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
1014; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2,3,0]
1015; AVX-NEXT:    retq
1016  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 3, i32 2, i32 4, i32 4>
1017  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 1, i32 0, i32 3>
1018  ret <4 x i32> %2
1019}
1020
1021define <4 x i32> @combine_nested_undef_test21(<4 x i32> %A, <4 x i32> %B) {
1022; SSE2-LABEL: combine_nested_undef_test21:
1023; SSE2:       # %bb.0:
1024; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
1025; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,3,0,3]
1026; SSE2-NEXT:    retq
1027;
1028; SSSE3-LABEL: combine_nested_undef_test21:
1029; SSSE3:       # %bb.0:
1030; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
1031; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,3,0,3]
1032; SSSE3-NEXT:    retq
1033;
1034; SSE41-LABEL: combine_nested_undef_test21:
1035; SSE41:       # %bb.0:
1036; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7]
1037; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
1038; SSE41-NEXT:    retq
1039;
1040; AVX1-LABEL: combine_nested_undef_test21:
1041; AVX1:       # %bb.0:
1042; AVX1-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
1043; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,1,0,1]
1044; AVX1-NEXT:    retq
1045;
1046; AVX2-LABEL: combine_nested_undef_test21:
1047; AVX2:       # %bb.0:
1048; AVX2-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
1049; AVX2-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
1050; AVX2-NEXT:    retq
1051  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 1, i32 3, i32 1>
1052  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 3>
1053  ret <4 x i32> %2
1054}
1055
1056
1057; Test that we correctly combine shuffles according to rule
1058;  shuffle(shuffle(x, y), undef) -> shuffle(y, undef)
1059
1060define <4 x i32> @combine_nested_undef_test22(<4 x i32> %A, <4 x i32> %B) {
1061; SSE-LABEL: combine_nested_undef_test22:
1062; SSE:       # %bb.0:
1063; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,3]
1064; SSE-NEXT:    retq
1065;
1066; AVX-LABEL: combine_nested_undef_test22:
1067; AVX:       # %bb.0:
1068; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm1[1,1,1,3]
1069; AVX-NEXT:    retq
1070  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 5, i32 2, i32 7>
1071  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 3>
1072  ret <4 x i32> %2
1073}
1074
1075define <4 x i32> @combine_nested_undef_test23(<4 x i32> %A, <4 x i32> %B) {
1076; SSE-LABEL: combine_nested_undef_test23:
1077; SSE:       # %bb.0:
1078; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,1,0,3]
1079; SSE-NEXT:    retq
1080;
1081; AVX-LABEL: combine_nested_undef_test23:
1082; AVX:       # %bb.0:
1083; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm1[0,1,0,3]
1084; AVX-NEXT:    retq
1085  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 5, i32 2, i32 7>
1086  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 3>
1087  ret <4 x i32> %2
1088}
1089
1090define <4 x i32> @combine_nested_undef_test24(<4 x i32> %A, <4 x i32> %B) {
1091; SSE-LABEL: combine_nested_undef_test24:
1092; SSE:       # %bb.0:
1093; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,3,2,3]
1094; SSE-NEXT:    retq
1095;
1096; AVX-LABEL: combine_nested_undef_test24:
1097; AVX:       # %bb.0:
1098; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm1[0,3,2,3]
1099; AVX-NEXT:    retq
1100  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 1, i32 6, i32 7>
1101  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 3, i32 2, i32 4>
1102  ret <4 x i32> %2
1103}
1104
1105define <4 x i32> @combine_nested_undef_test25(<4 x i32> %A, <4 x i32> %B) {
1106; SSE-LABEL: combine_nested_undef_test25:
1107; SSE:       # %bb.0:
1108; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
1109; SSE-NEXT:    retq
1110;
1111; AVX1-LABEL: combine_nested_undef_test25:
1112; AVX1:       # %bb.0:
1113; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,1,0,1]
1114; AVX1-NEXT:    retq
1115;
1116; AVX2-LABEL: combine_nested_undef_test25:
1117; AVX2:       # %bb.0:
1118; AVX2-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
1119; AVX2-NEXT:    retq
1120  %1 = shufflevector <4 x i32> %B, <4 x i32> %A, <4 x i32> <i32 1, i32 5, i32 2, i32 4>
1121  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 3, i32 1, i32 3, i32 1>
1122  ret <4 x i32> %2
1123}
1124
1125define <4 x i32> @combine_nested_undef_test26(<4 x i32> %A, <4 x i32> %B) {
1126; SSE-LABEL: combine_nested_undef_test26:
1127; SSE:       # %bb.0:
1128; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
1129; SSE-NEXT:    retq
1130;
1131; AVX-LABEL: combine_nested_undef_test26:
1132; AVX:       # %bb.0:
1133; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3]
1134; AVX-NEXT:    retq
1135  %1 = shufflevector <4 x i32> %B, <4 x i32> %A, <4 x i32> <i32 1, i32 2, i32 6, i32 7>
1136  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 2, i32 3>
1137  ret <4 x i32> %2
1138}
1139
1140define <4 x i32> @combine_nested_undef_test27(<4 x i32> %A, <4 x i32> %B) {
1141; SSE-LABEL: combine_nested_undef_test27:
1142; SSE:       # %bb.0:
1143; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
1144; SSE-NEXT:    retq
1145;
1146; AVX1-LABEL: combine_nested_undef_test27:
1147; AVX1:       # %bb.0:
1148; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,1,0,1]
1149; AVX1-NEXT:    retq
1150;
1151; AVX2-LABEL: combine_nested_undef_test27:
1152; AVX2:       # %bb.0:
1153; AVX2-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
1154; AVX2-NEXT:    retq
1155  %1 = shufflevector <4 x i32> %B, <4 x i32> %A, <4 x i32> <i32 2, i32 1, i32 5, i32 4>
1156  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 3, i32 2>
1157  ret <4 x i32> %2
1158}
1159
1160define <4 x i32> @combine_nested_undef_test28(<4 x i32> %A, <4 x i32> %B) {
1161; SSE-LABEL: combine_nested_undef_test28:
1162; SSE:       # %bb.0:
1163; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,1,0]
1164; SSE-NEXT:    retq
1165;
1166; AVX-LABEL: combine_nested_undef_test28:
1167; AVX:       # %bb.0:
1168; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,1,1,0]
1169; AVX-NEXT:    retq
1170  %1 = shufflevector <4 x i32> %B, <4 x i32> %A, <4 x i32> <i32 1, i32 2, i32 4, i32 5>
1171  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 3, i32 2>
1172  ret <4 x i32> %2
1173}
1174
1175define <4 x float> @combine_test1(<4 x float> %a, <4 x float> %b) {
1176; SSE-LABEL: combine_test1:
1177; SSE:       # %bb.0:
1178; SSE-NEXT:    movaps %xmm1, %xmm0
1179; SSE-NEXT:    retq
1180;
1181; AVX-LABEL: combine_test1:
1182; AVX:       # %bb.0:
1183; AVX-NEXT:    vmovaps %xmm1, %xmm0
1184; AVX-NEXT:    retq
1185  %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
1186  %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
1187  ret <4 x float> %2
1188}
1189
1190define <4 x float> @combine_test2(<4 x float> %a, <4 x float> %b) {
1191; SSE2-LABEL: combine_test2:
1192; SSE2:       # %bb.0:
1193; SSE2-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
1194; SSE2-NEXT:    movaps %xmm1, %xmm0
1195; SSE2-NEXT:    retq
1196;
1197; SSSE3-LABEL: combine_test2:
1198; SSSE3:       # %bb.0:
1199; SSSE3-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
1200; SSSE3-NEXT:    movaps %xmm1, %xmm0
1201; SSSE3-NEXT:    retq
1202;
1203; SSE41-LABEL: combine_test2:
1204; SSE41:       # %bb.0:
1205; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
1206; SSE41-NEXT:    retq
1207;
1208; AVX-LABEL: combine_test2:
1209; AVX:       # %bb.0:
1210; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
1211; AVX-NEXT:    retq
1212  %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
1213  %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 3>
1214  ret <4 x float> %2
1215}
1216
1217define <4 x float> @combine_test3(<4 x float> %a, <4 x float> %b) {
1218; SSE-LABEL: combine_test3:
1219; SSE:       # %bb.0:
1220; SSE-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1221; SSE-NEXT:    retq
1222;
1223; AVX-LABEL: combine_test3:
1224; AVX:       # %bb.0:
1225; AVX-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1226; AVX-NEXT:    retq
1227  %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 1, i32 7>
1228  %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 1>
1229  ret <4 x float> %2
1230}
1231
1232define <4 x float> @combine_test4(<4 x float> %a, <4 x float> %b) {
1233; SSE-LABEL: combine_test4:
1234; SSE:       # %bb.0:
1235; SSE-NEXT:    movhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1]
1236; SSE-NEXT:    retq
1237;
1238; AVX-LABEL: combine_test4:
1239; AVX:       # %bb.0:
1240; AVX-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
1241; AVX-NEXT:    retq
1242  %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 2, i32 3, i32 5, i32 5>
1243  %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
1244  ret <4 x float> %2
1245}
1246
1247define <4 x float> @combine_test5(<4 x float> %a, <4 x float> %b) {
1248; SSE2-LABEL: combine_test5:
1249; SSE2:       # %bb.0:
1250; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0]
1251; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
1252; SSE2-NEXT:    retq
1253;
1254; SSSE3-LABEL: combine_test5:
1255; SSSE3:       # %bb.0:
1256; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0]
1257; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
1258; SSSE3-NEXT:    retq
1259;
1260; SSE41-LABEL: combine_test5:
1261; SSE41:       # %bb.0:
1262; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
1263; SSE41-NEXT:    retq
1264;
1265; AVX-LABEL: combine_test5:
1266; AVX:       # %bb.0:
1267; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
1268; AVX-NEXT:    retq
1269  %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
1270  %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
1271  ret <4 x float> %2
1272}
1273
1274define <4 x i32> @combine_test6(<4 x i32> %a, <4 x i32> %b) {
1275; SSE-LABEL: combine_test6:
1276; SSE:       # %bb.0:
1277; SSE-NEXT:    movaps %xmm1, %xmm0
1278; SSE-NEXT:    retq
1279;
1280; AVX-LABEL: combine_test6:
1281; AVX:       # %bb.0:
1282; AVX-NEXT:    vmovaps %xmm1, %xmm0
1283; AVX-NEXT:    retq
1284  %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
1285  %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
1286  ret <4 x i32> %2
1287}
1288
1289define <4 x i32> @combine_test7(<4 x i32> %a, <4 x i32> %b) {
1290; SSE2-LABEL: combine_test7:
1291; SSE2:       # %bb.0:
1292; SSE2-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
1293; SSE2-NEXT:    movaps %xmm1, %xmm0
1294; SSE2-NEXT:    retq
1295;
1296; SSSE3-LABEL: combine_test7:
1297; SSSE3:       # %bb.0:
1298; SSSE3-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
1299; SSSE3-NEXT:    movaps %xmm1, %xmm0
1300; SSSE3-NEXT:    retq
1301;
1302; SSE41-LABEL: combine_test7:
1303; SSE41:       # %bb.0:
1304; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
1305; SSE41-NEXT:    retq
1306;
1307; AVX-LABEL: combine_test7:
1308; AVX:       # %bb.0:
1309; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
1310; AVX-NEXT:    retq
1311  %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
1312  %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 3>
1313  ret <4 x i32> %2
1314}
1315
1316define <4 x i32> @combine_test8(<4 x i32> %a, <4 x i32> %b) {
1317; SSE-LABEL: combine_test8:
1318; SSE:       # %bb.0:
1319; SSE-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1320; SSE-NEXT:    retq
1321;
1322; AVX-LABEL: combine_test8:
1323; AVX:       # %bb.0:
1324; AVX-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1325; AVX-NEXT:    retq
1326  %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 5, i32 1, i32 7>
1327  %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 1>
1328  ret <4 x i32> %2
1329}
1330
1331define <4 x i32> @combine_test9(<4 x i32> %a, <4 x i32> %b) {
1332; SSE-LABEL: combine_test9:
1333; SSE:       # %bb.0:
1334; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
1335; SSE-NEXT:    movaps %xmm1, %xmm0
1336; SSE-NEXT:    retq
1337;
1338; AVX-LABEL: combine_test9:
1339; AVX:       # %bb.0:
1340; AVX-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
1341; AVX-NEXT:    retq
1342  %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 3, i32 5, i32 5>
1343  %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
1344  ret <4 x i32> %2
1345}
1346
1347define <4 x i32> @combine_test10(<4 x i32> %a, <4 x i32> %b) {
1348; SSE2-LABEL: combine_test10:
1349; SSE2:       # %bb.0:
1350; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0]
1351; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
1352; SSE2-NEXT:    retq
1353;
1354; SSSE3-LABEL: combine_test10:
1355; SSSE3:       # %bb.0:
1356; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0]
1357; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
1358; SSSE3-NEXT:    retq
1359;
1360; SSE41-LABEL: combine_test10:
1361; SSE41:       # %bb.0:
1362; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
1363; SSE41-NEXT:    retq
1364;
1365; AVX-LABEL: combine_test10:
1366; AVX:       # %bb.0:
1367; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
1368; AVX-NEXT:    retq
1369  %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
1370  %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
1371  ret <4 x i32> %2
1372}
1373
1374define <4 x float> @combine_test11(<4 x float> %a, <4 x float> %b) {
1375; CHECK-LABEL: combine_test11:
1376; CHECK:       # %bb.0:
1377; CHECK-NEXT:    retq
1378  %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
1379  %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
1380  ret <4 x float> %2
1381}
1382
1383define <4 x float> @combine_test12(<4 x float> %a, <4 x float> %b) {
1384; SSE2-LABEL: combine_test12:
1385; SSE2:       # %bb.0:
1386; SSE2-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
1387; SSE2-NEXT:    movaps %xmm1, %xmm0
1388; SSE2-NEXT:    retq
1389;
1390; SSSE3-LABEL: combine_test12:
1391; SSSE3:       # %bb.0:
1392; SSSE3-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
1393; SSSE3-NEXT:    movaps %xmm1, %xmm0
1394; SSSE3-NEXT:    retq
1395;
1396; SSE41-LABEL: combine_test12:
1397; SSE41:       # %bb.0:
1398; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
1399; SSE41-NEXT:    retq
1400;
1401; AVX-LABEL: combine_test12:
1402; AVX:       # %bb.0:
1403; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
1404; AVX-NEXT:    retq
1405  %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
1406  %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
1407  ret <4 x float> %2
1408}
1409
1410define <4 x float> @combine_test13(<4 x float> %a, <4 x float> %b) {
1411; SSE-LABEL: combine_test13:
1412; SSE:       # %bb.0:
1413; SSE-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1414; SSE-NEXT:    retq
1415;
1416; AVX-LABEL: combine_test13:
1417; AVX:       # %bb.0:
1418; AVX-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1419; AVX-NEXT:    retq
1420  %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
1421  %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
1422  ret <4 x float> %2
1423}
1424
1425define <4 x float> @combine_test14(<4 x float> %a, <4 x float> %b) {
1426; SSE-LABEL: combine_test14:
1427; SSE:       # %bb.0:
1428; SSE-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
1429; SSE-NEXT:    retq
1430;
1431; AVX-LABEL: combine_test14:
1432; AVX:       # %bb.0:
1433; AVX-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
1434; AVX-NEXT:    retq
1435  %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 6, i32 7, i32 5, i32 5>
1436  %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
1437  ret <4 x float> %2
1438}
1439
1440define <4 x float> @combine_test15(<4 x float> %a, <4 x float> %b) {
1441; SSE2-LABEL: combine_test15:
1442; SSE2:       # %bb.0:
1443; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0]
1444; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
1445; SSE2-NEXT:    retq
1446;
1447; SSSE3-LABEL: combine_test15:
1448; SSSE3:       # %bb.0:
1449; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0]
1450; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
1451; SSSE3-NEXT:    retq
1452;
1453; SSE41-LABEL: combine_test15:
1454; SSE41:       # %bb.0:
1455; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
1456; SSE41-NEXT:    retq
1457;
1458; AVX-LABEL: combine_test15:
1459; AVX:       # %bb.0:
1460; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
1461; AVX-NEXT:    retq
1462  %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 7>
1463  %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 0, i32 5, i32 2, i32 3>
1464  ret <4 x float> %2
1465}
1466
1467define <4 x i32> @combine_test16(<4 x i32> %a, <4 x i32> %b) {
1468; CHECK-LABEL: combine_test16:
1469; CHECK:       # %bb.0:
1470; CHECK-NEXT:    retq
1471  %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
1472  %2 = shufflevector <4 x i32> %1, <4 x i32> %a, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
1473  ret <4 x i32> %2
1474}
1475
1476define <4 x i32> @combine_test17(<4 x i32> %a, <4 x i32> %b) {
1477; SSE2-LABEL: combine_test17:
1478; SSE2:       # %bb.0:
1479; SSE2-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
1480; SSE2-NEXT:    movaps %xmm1, %xmm0
1481; SSE2-NEXT:    retq
1482;
1483; SSSE3-LABEL: combine_test17:
1484; SSSE3:       # %bb.0:
1485; SSSE3-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
1486; SSSE3-NEXT:    movaps %xmm1, %xmm0
1487; SSSE3-NEXT:    retq
1488;
1489; SSE41-LABEL: combine_test17:
1490; SSE41:       # %bb.0:
1491; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
1492; SSE41-NEXT:    retq
1493;
1494; AVX-LABEL: combine_test17:
1495; AVX:       # %bb.0:
1496; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
1497; AVX-NEXT:    retq
1498  %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
1499  %2 = shufflevector <4 x i32> %1, <4 x i32> %a, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
1500  ret <4 x i32> %2
1501}
1502
1503define <4 x i32> @combine_test18(<4 x i32> %a, <4 x i32> %b) {
1504; SSE-LABEL: combine_test18:
1505; SSE:       # %bb.0:
1506; SSE-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1507; SSE-NEXT:    retq
1508;
1509; AVX-LABEL: combine_test18:
1510; AVX:       # %bb.0:
1511; AVX-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1512; AVX-NEXT:    retq
1513  %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
1514  %2 = shufflevector <4 x i32> %1, <4 x i32> %a, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
1515  ret <4 x i32> %2
1516}
1517
1518define <4 x i32> @combine_test19(<4 x i32> %a, <4 x i32> %b) {
1519; SSE-LABEL: combine_test19:
1520; SSE:       # %bb.0:
1521; SSE-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
1522; SSE-NEXT:    retq
1523;
1524; AVX-LABEL: combine_test19:
1525; AVX:       # %bb.0:
1526; AVX-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
1527; AVX-NEXT:    retq
1528  %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 6, i32 7, i32 5, i32 5>
1529  %2 = shufflevector <4 x i32> %1, <4 x i32> %a, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
1530  ret <4 x i32> %2
1531}
1532
1533define <4 x i32> @combine_test20(<4 x i32> %a, <4 x i32> %b) {
1534; SSE2-LABEL: combine_test20:
1535; SSE2:       # %bb.0:
1536; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0]
1537; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
1538; SSE2-NEXT:    retq
1539;
1540; SSSE3-LABEL: combine_test20:
1541; SSSE3:       # %bb.0:
1542; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0]
1543; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
1544; SSSE3-NEXT:    retq
1545;
1546; SSE41-LABEL: combine_test20:
1547; SSE41:       # %bb.0:
1548; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
1549; SSE41-NEXT:    retq
1550;
1551; AVX-LABEL: combine_test20:
1552; AVX:       # %bb.0:
1553; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
1554; AVX-NEXT:    retq
1555  %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 7>
1556  %2 = shufflevector <4 x i32> %1, <4 x i32> %a, <4 x i32> <i32 0, i32 5, i32 2, i32 3>
1557  ret <4 x i32> %2
1558}
1559
1560define <4 x i32> @combine_test21(<8 x i32> %a, ptr %ptr) {
1561; SSE-LABEL: combine_test21:
1562; SSE:       # %bb.0:
1563; SSE-NEXT:    movaps %xmm0, %xmm2
1564; SSE-NEXT:    movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0]
1565; SSE-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
1566; SSE-NEXT:    movaps %xmm2, (%rdi)
1567; SSE-NEXT:    retq
1568;
1569; AVX1-LABEL: combine_test21:
1570; AVX1:       # %bb.0:
1571; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
1572; AVX1-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm0[0],xmm1[0]
1573; AVX1-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
1574; AVX1-NEXT:    vmovaps %xmm2, (%rdi)
1575; AVX1-NEXT:    vzeroupper
1576; AVX1-NEXT:    retq
1577;
1578; AVX2-LABEL: combine_test21:
1579; AVX2:       # %bb.0:
1580; AVX2-NEXT:    vpermpd {{.*#+}} ymm1 = ymm0[1,3,2,3]
1581; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3]
1582; AVX2-NEXT:    vmovaps %xmm0, (%rdi)
1583; AVX2-NEXT:    vmovaps %xmm1, %xmm0
1584; AVX2-NEXT:    vzeroupper
1585; AVX2-NEXT:    retq
1586  %1 = shufflevector <8 x i32> %a, <8 x i32> %a, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
1587  %2 = shufflevector <8 x i32> %a, <8 x i32> %a, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
1588  store <4 x i32> %1, ptr %ptr, align 16
1589  ret <4 x i32> %2
1590}
1591
1592define <8 x float> @combine_test22(ptr %a, ptr %b) {
1593; SSE-LABEL: combine_test22:
1594; SSE:       # %bb.0:
1595; SSE-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
1596; SSE-NEXT:    movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
1597; SSE-NEXT:    retq
1598;
1599; AVX-LABEL: combine_test22:
1600; AVX:       # %bb.0:
1601; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
1602; AVX-NEXT:    vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
1603; AVX-NEXT:    retq
1604; Current AVX2 lowering of this is still awful, not adding a test case.
1605  %1 = load <2 x float>, ptr %a, align 8
1606  %2 = load <2 x float>, ptr %b, align 8
1607  %3 = shufflevector <2 x float> %1, <2 x float> %2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
1608  ret <8 x float> %3
1609}
1610
1611; PR22359
1612define void @combine_test23(<8 x float> %v, ptr %ptr) {
1613; SSE-LABEL: combine_test23:
1614; SSE:       # %bb.0:
1615; SSE-NEXT:    movups %xmm0, (%rdi)
1616; SSE-NEXT:    retq
1617;
1618; AVX-LABEL: combine_test23:
1619; AVX:       # %bb.0:
1620; AVX-NEXT:    vmovups %xmm0, (%rdi)
1621; AVX-NEXT:    vzeroupper
1622; AVX-NEXT:    retq
1623  %idx2 = getelementptr inbounds <2 x float>, ptr %ptr, i64 1
1624  %shuffle0 = shufflevector <8 x float> %v, <8 x float> undef, <2 x i32> <i32 0, i32 1>
1625  %shuffle1 = shufflevector <8 x float> %v, <8 x float> undef, <2 x i32> <i32 2, i32 3>
1626  store <2 x float> %shuffle0, ptr %ptr, align 8
1627  store <2 x float> %shuffle1, ptr %idx2, align 8
1628  ret void
1629}
1630
1631; Check some negative cases.
1632; FIXME: Do any of these really make sense? Are they redundant with the above tests?
1633
1634define <4 x float> @combine_test1b(<4 x float> %a, <4 x float> %b) {
1635; SSE-LABEL: combine_test1b:
1636; SSE:       # %bb.0:
1637; SSE-NEXT:    movaps %xmm1, %xmm0
1638; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
1639; SSE-NEXT:    retq
1640;
1641; AVX-LABEL: combine_test1b:
1642; AVX:       # %bb.0:
1643; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm1[0,1,2,0]
1644; AVX-NEXT:    retq
1645  %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
1646  %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 0>
1647  ret <4 x float> %2
1648}
1649
1650define <4 x float> @combine_test2b(<4 x float> %a, <4 x float> %b) {
1651; SSE2-LABEL: combine_test2b:
1652; SSE2:       # %bb.0:
1653; SSE2-NEXT:    movaps %xmm1, %xmm0
1654; SSE2-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1655; SSE2-NEXT:    retq
1656;
1657; SSSE3-LABEL: combine_test2b:
1658; SSSE3:       # %bb.0:
1659; SSSE3-NEXT:    movddup {{.*#+}} xmm0 = xmm1[0,0]
1660; SSSE3-NEXT:    retq
1661;
1662; SSE41-LABEL: combine_test2b:
1663; SSE41:       # %bb.0:
1664; SSE41-NEXT:    movddup {{.*#+}} xmm0 = xmm1[0,0]
1665; SSE41-NEXT:    retq
1666;
1667; AVX-LABEL: combine_test2b:
1668; AVX:       # %bb.0:
1669; AVX-NEXT:    vmovddup {{.*#+}} xmm0 = xmm1[0,0]
1670; AVX-NEXT:    retq
1671  %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
1672  %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 0, i32 5>
1673  ret <4 x float> %2
1674}
1675
1676define <4 x float> @combine_test3b(<4 x float> %a, <4 x float> %b) {
1677; SSE2-LABEL: combine_test3b:
1678; SSE2:       # %bb.0:
1679; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[3,0]
1680; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3]
1681; SSE2-NEXT:    retq
1682;
1683; SSSE3-LABEL: combine_test3b:
1684; SSSE3:       # %bb.0:
1685; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[3,0]
1686; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3]
1687; SSSE3-NEXT:    retq
1688;
1689; SSE41-LABEL: combine_test3b:
1690; SSE41:       # %bb.0:
1691; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
1692; SSE41-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,3,2,3]
1693; SSE41-NEXT:    retq
1694;
1695; AVX-LABEL: combine_test3b:
1696; AVX:       # %bb.0:
1697; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
1698; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,3,2,3]
1699; AVX-NEXT:    retq
1700  %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 0, i32 6, i32 3>
1701  %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 7, i32 2, i32 7>
1702  ret <4 x float> %2
1703}
1704
1705define <4 x float> @combine_test4b(<4 x float> %a, <4 x float> %b) {
1706; SSE-LABEL: combine_test4b:
1707; SSE:       # %bb.0:
1708; SSE-NEXT:    movaps %xmm1, %xmm0
1709; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[2,3]
1710; SSE-NEXT:    retq
1711;
1712; AVX-LABEL: combine_test4b:
1713; AVX:       # %bb.0:
1714; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm1[1,1,2,3]
1715; AVX-NEXT:    retq
1716  %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
1717  %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 5, i32 5, i32 2, i32 7>
1718  ret <4 x float> %2
1719}
1720
1721
1722; Verify that we correctly fold shuffles even when we use illegal vector types.
1723
1724define <4 x i8> @combine_test1c(ptr %a, ptr %b) {
1725; SSE2-LABEL: combine_test1c:
1726; SSE2:       # %bb.0:
1727; SSE2-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
1728; SSE2-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
1729; SSE2-NEXT:    movaps {{.*#+}} xmm0 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
1730; SSE2-NEXT:    andps %xmm0, %xmm2
1731; SSE2-NEXT:    andnps %xmm1, %xmm0
1732; SSE2-NEXT:    orps %xmm2, %xmm0
1733; SSE2-NEXT:    retq
1734;
1735; SSSE3-LABEL: combine_test1c:
1736; SSSE3:       # %bb.0:
1737; SSSE3-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
1738; SSSE3-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1739; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
1740; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[1,2,4,6,u,u,u,u,u,u,u,u,u,u,u,u]
1741; SSSE3-NEXT:    retq
1742;
1743; SSE41-LABEL: combine_test1c:
1744; SSE41:       # %bb.0:
1745; SSE41-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
1746; SSE41-NEXT:    movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
1747; SSE41-NEXT:    movss {{.*#+}} xmm0 = [0,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0]
1748; SSE41-NEXT:    pblendvb %xmm0, %xmm2, %xmm1
1749; SSE41-NEXT:    movdqa %xmm1, %xmm0
1750; SSE41-NEXT:    retq
1751;
1752; AVX-LABEL: combine_test1c:
1753; AVX:       # %bb.0:
1754; AVX-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1755; AVX-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
1756; AVX-NEXT:    vmovd {{.*#+}} xmm2 = [0,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0]
1757; AVX-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
1758; AVX-NEXT:    retq
1759  %A = load <4 x i8>, ptr %a
1760  %B = load <4 x i8>, ptr %b
1761  %1 = shufflevector <4 x i8> %A, <4 x i8> %B, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
1762  %2 = shufflevector <4 x i8> %1, <4 x i8> %B, <4 x i32> <i32 0, i32 1, i32 6, i32 3>
1763  ret <4 x i8> %2
1764}
1765
1766define <4 x i8> @combine_test2c(ptr %a, ptr %b) {
1767; SSE-LABEL: combine_test2c:
1768; SSE:       # %bb.0:
1769; SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1770; SSE-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
1771; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
1772; SSE-NEXT:    retq
1773;
1774; AVX-LABEL: combine_test2c:
1775; AVX:       # %bb.0:
1776; AVX-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1777; AVX-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
1778; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
1779; AVX-NEXT:    retq
1780  %A = load <4 x i8>, ptr %a
1781  %B = load <4 x i8>, ptr %b
1782  %1 = shufflevector <4 x i8> %A, <4 x i8> %B, <4 x i32> <i32 0, i32 5, i32 1, i32 5>
1783  %2 = shufflevector <4 x i8> %1, <4 x i8> %B, <4 x i32> <i32 0, i32 2, i32 4, i32 1>
1784  ret <4 x i8> %2
1785}
1786
1787define <4 x i8> @combine_test3c(ptr %a, ptr %b) {
1788; SSE-LABEL: combine_test3c:
1789; SSE:       # %bb.0:
1790; SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1791; SSE-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
1792; SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
1793; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
1794; SSE-NEXT:    retq
1795;
1796; AVX-LABEL: combine_test3c:
1797; AVX:       # %bb.0:
1798; AVX-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1799; AVX-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
1800; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
1801; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
1802; AVX-NEXT:    retq
1803  %A = load <4 x i8>, ptr %a
1804  %B = load <4 x i8>, ptr %b
1805  %1 = shufflevector <4 x i8> %A, <4 x i8> %B, <4 x i32> <i32 2, i32 3, i32 5, i32 5>
1806  %2 = shufflevector <4 x i8> %1, <4 x i8> %B, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
1807  ret <4 x i8> %2
1808}
1809
1810define <4 x i8> @combine_test4c(ptr %a, ptr %b) {
1811; SSE2-LABEL: combine_test4c:
1812; SSE2:       # %bb.0:
1813; SSE2-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
1814; SSE2-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
1815; SSE2-NEXT:    movaps {{.*#+}} xmm0 = [255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
1816; SSE2-NEXT:    andps %xmm0, %xmm2
1817; SSE2-NEXT:    andnps %xmm1, %xmm0
1818; SSE2-NEXT:    orps %xmm2, %xmm0
1819; SSE2-NEXT:    retq
1820;
1821; SSSE3-LABEL: combine_test4c:
1822; SSSE3:       # %bb.0:
1823; SSSE3-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
1824; SSSE3-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1825; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
1826; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,3,4,6,u,u,u,u,u,u,u,u,u,u,u,u]
1827; SSSE3-NEXT:    retq
1828;
1829; SSE41-LABEL: combine_test4c:
1830; SSE41:       # %bb.0:
1831; SSE41-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
1832; SSE41-NEXT:    movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
1833; SSE41-NEXT:    movss {{.*#+}} xmm0 = [255,0,255,255,0,0,0,0,0,0,0,0,0,0,0,0]
1834; SSE41-NEXT:    pblendvb %xmm0, %xmm2, %xmm1
1835; SSE41-NEXT:    movdqa %xmm1, %xmm0
1836; SSE41-NEXT:    retq
1837;
1838; AVX-LABEL: combine_test4c:
1839; AVX:       # %bb.0:
1840; AVX-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1841; AVX-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
1842; AVX-NEXT:    vmovd {{.*#+}} xmm2 = [255,0,255,255,0,0,0,0,0,0,0,0,0,0,0,0]
1843; AVX-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
1844; AVX-NEXT:    retq
1845  %A = load <4 x i8>, ptr %a
1846  %B = load <4 x i8>, ptr %b
1847  %1 = shufflevector <4 x i8> %A, <4 x i8> %B, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
1848  %2 = shufflevector <4 x i8> %1, <4 x i8> %B, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
1849  ret <4 x i8> %2
1850}
1851
1852
1853; The following test cases are generated from this C++ code
1854;
1855;__m128 blend_01(__m128 a, __m128 b)
1856;{
1857;  __m128 s = a;
1858;  s = _mm_blend_ps( s, b, 1<<0 );
1859;  s = _mm_blend_ps( s, b, 1<<1 );
1860;  return s;
1861;}
1862;
1863;__m128 blend_02(__m128 a, __m128 b)
1864;{
1865;  __m128 s = a;
1866;  s = _mm_blend_ps( s, b, 1<<0 );
1867;  s = _mm_blend_ps( s, b, 1<<2 );
1868;  return s;
1869;}
1870;
1871;__m128 blend_123(__m128 a, __m128 b)
1872;{
1873;  __m128 s = a;
1874;  s = _mm_blend_ps( s, b, 1<<1 );
1875;  s = _mm_blend_ps( s, b, 1<<2 );
1876;  s = _mm_blend_ps( s, b, 1<<3 );
1877;  return s;
1878;}
1879
1880; Ideally, we should collapse the following shuffles into a single one.
1881
1882define <4 x float> @combine_blend_01(<4 x float> %a, <4 x float> %b) {
1883; SSE2-LABEL: combine_blend_01:
1884; SSE2:       # %bb.0:
1885; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
1886; SSE2-NEXT:    retq
1887;
1888; SSSE3-LABEL: combine_blend_01:
1889; SSSE3:       # %bb.0:
1890; SSSE3-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
1891; SSSE3-NEXT:    retq
1892;
1893; SSE41-LABEL: combine_blend_01:
1894; SSE41:       # %bb.0:
1895; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
1896; SSE41-NEXT:    retq
1897;
1898; AVX-LABEL: combine_blend_01:
1899; AVX:       # %bb.0:
1900; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
1901; AVX-NEXT:    retq
1902  %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 undef, i32 2, i32 3>
1903  %shuffle6 = shufflevector <4 x float> %shuffle, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 3>
1904  ret <4 x float> %shuffle6
1905}
1906
1907define <4 x float> @combine_blend_02(<4 x float> %a, <4 x float> %b) {
1908; SSE2-LABEL: combine_blend_02:
1909; SSE2:       # %bb.0:
1910; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[1,3]
1911; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2,1,3]
1912; SSE2-NEXT:    movaps %xmm1, %xmm0
1913; SSE2-NEXT:    retq
1914;
1915; SSSE3-LABEL: combine_blend_02:
1916; SSSE3:       # %bb.0:
1917; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[1,3]
1918; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2,1,3]
1919; SSSE3-NEXT:    movaps %xmm1, %xmm0
1920; SSSE3-NEXT:    retq
1921;
1922; SSE41-LABEL: combine_blend_02:
1923; SSE41:       # %bb.0:
1924; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3]
1925; SSE41-NEXT:    retq
1926;
1927; AVX-LABEL: combine_blend_02:
1928; AVX:       # %bb.0:
1929; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3]
1930; AVX-NEXT:    retq
1931  %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 undef, i32 3>
1932  %shuffle6 = shufflevector <4 x float> %shuffle, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 3>
1933  ret <4 x float> %shuffle6
1934}
1935
1936define <4 x float> @combine_blend_123(<4 x float> %a, <4 x float> %b) {
1937; SSE2-LABEL: combine_blend_123:
1938; SSE2:       # %bb.0:
1939; SSE2-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
1940; SSE2-NEXT:    movaps %xmm1, %xmm0
1941; SSE2-NEXT:    retq
1942;
1943; SSSE3-LABEL: combine_blend_123:
1944; SSSE3:       # %bb.0:
1945; SSSE3-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
1946; SSSE3-NEXT:    movaps %xmm1, %xmm0
1947; SSSE3-NEXT:    retq
1948;
1949; SSE41-LABEL: combine_blend_123:
1950; SSE41:       # %bb.0:
1951; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
1952; SSE41-NEXT:    retq
1953;
1954; AVX-LABEL: combine_blend_123:
1955; AVX:       # %bb.0:
1956; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
1957; AVX-NEXT:    retq
1958  %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 undef, i32 undef>
1959  %shuffle6 = shufflevector <4 x float> %shuffle, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 undef>
1960  %shuffle12 = shufflevector <4 x float> %shuffle6, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
1961  ret <4 x float> %shuffle12
1962}
1963
1964define <4 x i32> @combine_test_movhl_1(<4 x i32> %a, <4 x i32> %b) {
1965; SSE-LABEL: combine_test_movhl_1:
1966; SSE:       # %bb.0:
1967; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
1968; SSE-NEXT:    movaps %xmm1, %xmm0
1969; SSE-NEXT:    retq
1970;
1971; AVX-LABEL: combine_test_movhl_1:
1972; AVX:       # %bb.0:
1973; AVX-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
1974; AVX-NEXT:    retq
1975  %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 7, i32 5, i32 3>
1976  %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 6, i32 1, i32 0, i32 3>
1977  ret <4 x i32> %2
1978}
1979
1980define <4 x i32> @combine_test_movhl_2(<4 x i32> %a, <4 x i32> %b) {
1981; SSE-LABEL: combine_test_movhl_2:
1982; SSE:       # %bb.0:
1983; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
1984; SSE-NEXT:    movaps %xmm1, %xmm0
1985; SSE-NEXT:    retq
1986;
1987; AVX-LABEL: combine_test_movhl_2:
1988; AVX:       # %bb.0:
1989; AVX-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
1990; AVX-NEXT:    retq
1991  %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 0, i32 3, i32 6>
1992  %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 3, i32 7, i32 0, i32 2>
1993  ret <4 x i32> %2
1994}
1995
1996define <4 x i32> @combine_test_movhl_3(<4 x i32> %a, <4 x i32> %b) {
1997; SSE-LABEL: combine_test_movhl_3:
1998; SSE:       # %bb.0:
1999; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
2000; SSE-NEXT:    movaps %xmm1, %xmm0
2001; SSE-NEXT:    retq
2002;
2003; AVX-LABEL: combine_test_movhl_3:
2004; AVX:       # %bb.0:
2005; AVX-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
2006; AVX-NEXT:    retq
2007  %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 7, i32 6, i32 3, i32 2>
2008  %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 6, i32 0, i32 3, i32 2>
2009  ret <4 x i32> %2
2010}
2011
2012define <16 x i8> @combine_and_or_shuffle(<16 x i8> %x, <16 x i8> %y) {
2013; SSE2-LABEL: combine_and_or_shuffle:
2014; SSE2:       # %bb.0:
2015; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3]
2016; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
2017; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,3,2,1]
2018; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,3,1,2,4,5,6,7]
2019; SSE2-NEXT:    pshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,6,5,7,7]
2020; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
2021; SSE2-NEXT:    pxor %xmm3, %xmm3
2022; SSE2-NEXT:    movdqa %xmm1, %xmm0
2023; SSE2-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm3[8],xmm0[9],xmm3[9],xmm0[10],xmm3[10],xmm0[11],xmm3[11],xmm0[12],xmm3[12],xmm0[13],xmm3[13],xmm0[14],xmm3[14],xmm0[15],xmm3[15]
2024; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,2,4,5,6,7]
2025; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[0,0,1,3]
2026; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [65535,65535,0,65535,0,0,65535,65535]
2027; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
2028; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3]
2029; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[3,0,2,1,4,5,6,7]
2030; SSE2-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,7,7,7]
2031; SSE2-NEXT:    pand %xmm0, %xmm1
2032; SSE2-NEXT:    pandn %xmm4, %xmm0
2033; SSE2-NEXT:    por %xmm1, %xmm0
2034; SSE2-NEXT:    packuswb %xmm0, %xmm0
2035; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
2036; SSE2-NEXT:    por %xmm2, %xmm0
2037; SSE2-NEXT:    retq
2038;
2039; SSSE3-LABEL: combine_and_or_shuffle:
2040; SSSE3:       # %bb.0:
2041; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = zero,xmm0[u],zero,xmm0[15],zero,xmm0[1],zero,xmm0[14],zero,xmm0[2],zero,xmm0[13],zero,xmm0[3],zero,zero
2042; SSSE3-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[7,u,0],zero,xmm1[8],zero,xmm1[1],zero,xmm1[9],zero,xmm1[10],zero,xmm1[7],zero,xmm1[7],zero
2043; SSSE3-NEXT:    por %xmm1, %xmm0
2044; SSSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
2045; SSSE3-NEXT:    retq
2046;
2047; SSE41-LABEL: combine_and_or_shuffle:
2048; SSE41:       # %bb.0:
2049; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = zero,xmm0[u],zero,xmm0[15],zero,xmm0[1],zero,xmm0[14],zero,xmm0[2],zero,xmm0[13],zero,xmm0[3],zero,zero
2050; SSE41-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[7,u,0],zero,xmm1[8],zero,xmm1[1],zero,xmm1[9],zero,xmm1[10],zero,xmm1[7],zero,xmm1[7],zero
2051; SSE41-NEXT:    por %xmm1, %xmm0
2052; SSE41-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
2053; SSE41-NEXT:    retq
2054;
2055; AVX-LABEL: combine_and_or_shuffle:
2056; AVX:       # %bb.0:
2057; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = zero,xmm0[u],zero,xmm0[15],zero,xmm0[1],zero,xmm0[14],zero,xmm0[2],zero,xmm0[13],zero,xmm0[3],zero,zero
2058; AVX-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[7,u,0],zero,xmm1[8],zero,xmm1[1],zero,xmm1[9],zero,xmm1[10],zero,xmm1[7],zero,xmm1[7],zero
2059; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
2060; AVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2061; AVX-NEXT:    retq
2062  %1 = shufflevector <16 x i8> %x, <16 x i8> zeroinitializer, <16 x i32> <i32 16, i32 16, i32 16, i32 15, i32 16, i32 1, i32 16, i32 14, i32 16, i32 2, i32 16, i32 13, i32 16, i32 3, i32 16, i32 16>
2063  %2 = shufflevector <16 x i8> %y, <16 x i8> zeroinitializer, <16 x i32> <i32 7, i32 16, i32 0, i32 16, i32 8, i32 16, i32 1, i32 16, i32 9, i32 16, i32 10, i32 16, i32 7, i32 16, i32 7, i32 16>
2064  %3 = or <16 x i8> %1, %2
2065  %4 = and <16 x i8> %3, <i8 -1, i8 0, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
2066  ret <16 x i8> %4
2067}
2068
2069; Verify that we fold shuffles according to rule:
2070;  (shuffle(shuffle A, Undef, M0), B, M1) -> (shuffle A, B, M2)
2071
2072define <4 x float> @combine_undef_input_test1(<4 x float> %a, <4 x float> %b) {
2073; SSE2-LABEL: combine_undef_input_test1:
2074; SSE2:       # %bb.0:
2075; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
2076; SSE2-NEXT:    retq
2077;
2078; SSSE3-LABEL: combine_undef_input_test1:
2079; SSSE3:       # %bb.0:
2080; SSSE3-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
2081; SSSE3-NEXT:    retq
2082;
2083; SSE41-LABEL: combine_undef_input_test1:
2084; SSE41:       # %bb.0:
2085; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
2086; SSE41-NEXT:    retq
2087;
2088; AVX-LABEL: combine_undef_input_test1:
2089; AVX:       # %bb.0:
2090; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
2091; AVX-NEXT:    retq
2092  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 4, i32 2, i32 3, i32 1>
2093  %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 4, i32 5, i32 1, i32 2>
2094  ret <4 x float> %2
2095}
2096
2097define <4 x float> @combine_undef_input_test2(<4 x float> %a, <4 x float> %b) {
2098; SSE-LABEL: combine_undef_input_test2:
2099; SSE:       # %bb.0:
2100; SSE-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2101; SSE-NEXT:    retq
2102;
2103; AVX-LABEL: combine_undef_input_test2:
2104; AVX:       # %bb.0:
2105; AVX-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2106; AVX-NEXT:    retq
2107  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 6, i32 0, i32 1, i32 7>
2108  %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 1, i32 2, i32 4, i32 5>
2109  ret <4 x float> %2
2110}
2111
2112define <4 x float> @combine_undef_input_test3(<4 x float> %a, <4 x float> %b) {
2113; SSE-LABEL: combine_undef_input_test3:
2114; SSE:       # %bb.0:
2115; SSE-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2116; SSE-NEXT:    retq
2117;
2118; AVX-LABEL: combine_undef_input_test3:
2119; AVX:       # %bb.0:
2120; AVX-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2121; AVX-NEXT:    retq
2122  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 5, i32 1, i32 7>
2123  %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 1>
2124  ret <4 x float> %2
2125}
2126
2127define <4 x float> @combine_undef_input_test4(<4 x float> %a, <4 x float> %b) {
2128; SSE-LABEL: combine_undef_input_test4:
2129; SSE:       # %bb.0:
2130; SSE-NEXT:    movhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1]
2131; SSE-NEXT:    retq
2132;
2133; AVX-LABEL: combine_undef_input_test4:
2134; AVX:       # %bb.0:
2135; AVX-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
2136; AVX-NEXT:    retq
2137  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 5, i32 5>
2138  %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
2139  ret <4 x float> %2
2140}
2141
2142define <4 x float> @combine_undef_input_test5(<4 x float> %a, <4 x float> %b) {
2143; SSE2-LABEL: combine_undef_input_test5:
2144; SSE2:       # %bb.0:
2145; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
2146; SSE2-NEXT:    retq
2147;
2148; SSSE3-LABEL: combine_undef_input_test5:
2149; SSSE3:       # %bb.0:
2150; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
2151; SSSE3-NEXT:    retq
2152;
2153; SSE41-LABEL: combine_undef_input_test5:
2154; SSE41:       # %bb.0:
2155; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
2156; SSE41-NEXT:    retq
2157;
2158; AVX-LABEL: combine_undef_input_test5:
2159; AVX:       # %bb.0:
2160; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
2161; AVX-NEXT:    retq
2162  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 4, i32 1, i32 3>
2163  %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 2, i32 6, i32 7>
2164  ret <4 x float> %2
2165}
2166
2167
2168; Verify that we fold shuffles according to rule:
2169;  (shuffle(shuffle A, Undef, M0), A, M1) -> (shuffle A, Undef, M2)
2170
2171define <4 x float> @combine_undef_input_test6(<4 x float> %a) {
2172; CHECK-LABEL: combine_undef_input_test6:
2173; CHECK:       # %bb.0:
2174; CHECK-NEXT:    retq
2175  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 4, i32 2, i32 3, i32 1>
2176  %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 4, i32 5, i32 1, i32 2>
2177  ret <4 x float> %2
2178}
2179
2180define <4 x float> @combine_undef_input_test7(<4 x float> %a) {
2181; SSE2-LABEL: combine_undef_input_test7:
2182; SSE2:       # %bb.0:
2183; SSE2-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0,0]
2184; SSE2-NEXT:    retq
2185;
2186; SSSE3-LABEL: combine_undef_input_test7:
2187; SSSE3:       # %bb.0:
2188; SSSE3-NEXT:    movddup {{.*#+}} xmm0 = xmm0[0,0]
2189; SSSE3-NEXT:    retq
2190;
2191; SSE41-LABEL: combine_undef_input_test7:
2192; SSE41:       # %bb.0:
2193; SSE41-NEXT:    movddup {{.*#+}} xmm0 = xmm0[0,0]
2194; SSE41-NEXT:    retq
2195;
2196; AVX-LABEL: combine_undef_input_test7:
2197; AVX:       # %bb.0:
2198; AVX-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
2199; AVX-NEXT:    retq
2200  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 6, i32 0, i32 1, i32 7>
2201  %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 1, i32 2, i32 4, i32 5>
2202  ret <4 x float> %2
2203}
2204
2205define <4 x float> @combine_undef_input_test8(<4 x float> %a) {
2206; SSE2-LABEL: combine_undef_input_test8:
2207; SSE2:       # %bb.0:
2208; SSE2-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0,0]
2209; SSE2-NEXT:    retq
2210;
2211; SSSE3-LABEL: combine_undef_input_test8:
2212; SSSE3:       # %bb.0:
2213; SSSE3-NEXT:    movddup {{.*#+}} xmm0 = xmm0[0,0]
2214; SSSE3-NEXT:    retq
2215;
2216; SSE41-LABEL: combine_undef_input_test8:
2217; SSE41:       # %bb.0:
2218; SSE41-NEXT:    movddup {{.*#+}} xmm0 = xmm0[0,0]
2219; SSE41-NEXT:    retq
2220;
2221; AVX-LABEL: combine_undef_input_test8:
2222; AVX:       # %bb.0:
2223; AVX-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
2224; AVX-NEXT:    retq
2225  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 5, i32 1, i32 7>
2226  %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 0, i32 2, i32 4, i32 1>
2227  ret <4 x float> %2
2228}
2229
2230define <4 x float> @combine_undef_input_test9(<4 x float> %a) {
2231; SSE-LABEL: combine_undef_input_test9:
2232; SSE:       # %bb.0:
2233; SSE-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
2234; SSE-NEXT:    retq
2235;
2236; AVX-LABEL: combine_undef_input_test9:
2237; AVX:       # %bb.0:
2238; AVX-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1,1]
2239; AVX-NEXT:    retq
2240  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 5, i32 5>
2241  %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
2242  ret <4 x float> %2
2243}
2244
2245define <4 x float> @combine_undef_input_test10(<4 x float> %a) {
2246; CHECK-LABEL: combine_undef_input_test10:
2247; CHECK:       # %bb.0:
2248; CHECK-NEXT:    retq
2249  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 4, i32 1, i32 3>
2250  %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 0, i32 2, i32 6, i32 7>
2251  ret <4 x float> %2
2252}
2253
2254define <4 x float> @combine_undef_input_test11(<4 x float> %a, <4 x float> %b) {
2255; SSE2-LABEL: combine_undef_input_test11:
2256; SSE2:       # %bb.0:
2257; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
2258; SSE2-NEXT:    retq
2259;
2260; SSSE3-LABEL: combine_undef_input_test11:
2261; SSSE3:       # %bb.0:
2262; SSSE3-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
2263; SSSE3-NEXT:    retq
2264;
2265; SSE41-LABEL: combine_undef_input_test11:
2266; SSE41:       # %bb.0:
2267; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
2268; SSE41-NEXT:    retq
2269;
2270; AVX-LABEL: combine_undef_input_test11:
2271; AVX:       # %bb.0:
2272; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
2273; AVX-NEXT:    retq
2274  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 4, i32 2, i32 3, i32 1>
2275  %2 = shufflevector <4 x float> %b, <4 x float> %1, <4 x i32> <i32 0, i32 1, i32 5, i32 6>
2276  ret <4 x float> %2
2277}
2278
2279define <4 x float> @combine_undef_input_test12(<4 x float> %a, <4 x float> %b) {
2280; SSE-LABEL: combine_undef_input_test12:
2281; SSE:       # %bb.0:
2282; SSE-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2283; SSE-NEXT:    retq
2284;
2285; AVX-LABEL: combine_undef_input_test12:
2286; AVX:       # %bb.0:
2287; AVX-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2288; AVX-NEXT:    retq
2289  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 6, i32 0, i32 1, i32 7>
2290  %2 = shufflevector <4 x float> %b, <4 x float> %1, <4 x i32> <i32 5, i32 6, i32 0, i32 1>
2291  ret <4 x float> %2
2292}
2293
2294define <4 x float> @combine_undef_input_test13(<4 x float> %a, <4 x float> %b) {
2295; SSE-LABEL: combine_undef_input_test13:
2296; SSE:       # %bb.0:
2297; SSE-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2298; SSE-NEXT:    retq
2299;
2300; AVX-LABEL: combine_undef_input_test13:
2301; AVX:       # %bb.0:
2302; AVX-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2303; AVX-NEXT:    retq
2304  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 5, i32 1, i32 7>
2305  %2 = shufflevector <4 x float> %b, <4 x float> %1, <4 x i32> <i32 4, i32 5, i32 0, i32 5>
2306  ret <4 x float> %2
2307}
2308
2309define <4 x float> @combine_undef_input_test14(<4 x float> %a, <4 x float> %b) {
2310; SSE-LABEL: combine_undef_input_test14:
2311; SSE:       # %bb.0:
2312; SSE-NEXT:    movhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1]
2313; SSE-NEXT:    retq
2314;
2315; AVX-LABEL: combine_undef_input_test14:
2316; AVX:       # %bb.0:
2317; AVX-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
2318; AVX-NEXT:    retq
2319  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 5, i32 5>
2320  %2 = shufflevector <4 x float> %b, <4 x float> %1, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
2321  ret <4 x float> %2
2322}
2323
2324define <4 x float> @combine_undef_input_test15(<4 x float> %a, <4 x float> %b) {
2325; SSE2-LABEL: combine_undef_input_test15:
2326; SSE2:       # %bb.0:
2327; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
2328; SSE2-NEXT:    retq
2329;
2330; SSSE3-LABEL: combine_undef_input_test15:
2331; SSSE3:       # %bb.0:
2332; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
2333; SSSE3-NEXT:    retq
2334;
2335; SSE41-LABEL: combine_undef_input_test15:
2336; SSE41:       # %bb.0:
2337; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
2338; SSE41-NEXT:    retq
2339;
2340; AVX-LABEL: combine_undef_input_test15:
2341; AVX:       # %bb.0:
2342; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
2343; AVX-NEXT:    retq
2344  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 4, i32 1, i32 3>
2345  %2 = shufflevector <4 x float> %b, <4 x float> %1, <4 x i32> <i32 4, i32 6, i32 2, i32 3>
2346  ret <4 x float> %2
2347}
2348
2349
2350; Verify that shuffles are canonicalized according to rules:
2351;  shuffle(B, shuffle(A, Undef)) -> shuffle(shuffle(A, Undef), B)
2352;
2353; This allows to trigger the following combine rule:
2354;  (shuffle(shuffle A, Undef, M0), A, M1) -> (shuffle A, Undef, M2)
2355;
2356; As a result, all the shuffle pairs in each function below should be
2357; combined into a single legal shuffle operation.
2358
2359define <4 x float> @combine_undef_input_test16(<4 x float> %a) {
2360; CHECK-LABEL: combine_undef_input_test16:
2361; CHECK:       # %bb.0:
2362; CHECK-NEXT:    retq
2363  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 4, i32 2, i32 3, i32 1>
2364  %2 = shufflevector <4 x float> %a, <4 x float> %1, <4 x i32> <i32 0, i32 1, i32 5, i32 3>
2365  ret <4 x float> %2
2366}
2367
2368define <4 x float> @combine_undef_input_test17(<4 x float> %a) {
2369; SSE2-LABEL: combine_undef_input_test17:
2370; SSE2:       # %bb.0:
2371; SSE2-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0,0]
2372; SSE2-NEXT:    retq
2373;
2374; SSSE3-LABEL: combine_undef_input_test17:
2375; SSSE3:       # %bb.0:
2376; SSSE3-NEXT:    movddup {{.*#+}} xmm0 = xmm0[0,0]
2377; SSSE3-NEXT:    retq
2378;
2379; SSE41-LABEL: combine_undef_input_test17:
2380; SSE41:       # %bb.0:
2381; SSE41-NEXT:    movddup {{.*#+}} xmm0 = xmm0[0,0]
2382; SSE41-NEXT:    retq
2383;
2384; AVX-LABEL: combine_undef_input_test17:
2385; AVX:       # %bb.0:
2386; AVX-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
2387; AVX-NEXT:    retq
2388  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 6, i32 0, i32 1, i32 7>
2389  %2 = shufflevector <4 x float> %a, <4 x float> %1, <4 x i32> <i32 5, i32 6, i32 0, i32 1>
2390  ret <4 x float> %2
2391}
2392
2393define <4 x float> @combine_undef_input_test18(<4 x float> %a) {
2394; SSE2-LABEL: combine_undef_input_test18:
2395; SSE2:       # %bb.0:
2396; SSE2-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0,0]
2397; SSE2-NEXT:    retq
2398;
2399; SSSE3-LABEL: combine_undef_input_test18:
2400; SSSE3:       # %bb.0:
2401; SSSE3-NEXT:    movddup {{.*#+}} xmm0 = xmm0[0,0]
2402; SSSE3-NEXT:    retq
2403;
2404; SSE41-LABEL: combine_undef_input_test18:
2405; SSE41:       # %bb.0:
2406; SSE41-NEXT:    movddup {{.*#+}} xmm0 = xmm0[0,0]
2407; SSE41-NEXT:    retq
2408;
2409; AVX-LABEL: combine_undef_input_test18:
2410; AVX:       # %bb.0:
2411; AVX-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
2412; AVX-NEXT:    retq
2413  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 5, i32 1, i32 7>
2414  %2 = shufflevector <4 x float> %a, <4 x float> %1, <4 x i32> <i32 4, i32 6, i32 0, i32 5>
2415  ret <4 x float> %2
2416}
2417
2418define <4 x float> @combine_undef_input_test19(<4 x float> %a) {
2419; SSE-LABEL: combine_undef_input_test19:
2420; SSE:       # %bb.0:
2421; SSE-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
2422; SSE-NEXT:    retq
2423;
2424; AVX-LABEL: combine_undef_input_test19:
2425; AVX:       # %bb.0:
2426; AVX-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1,1]
2427; AVX-NEXT:    retq
2428  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 5, i32 5>
2429  %2 = shufflevector <4 x float> %a, <4 x float> %1, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
2430  ret <4 x float> %2
2431}
2432
2433define <4 x float> @combine_undef_input_test20(<4 x float> %a) {
2434; CHECK-LABEL: combine_undef_input_test20:
2435; CHECK:       # %bb.0:
2436; CHECK-NEXT:    retq
2437  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 4, i32 1, i32 3>
2438  %2 = shufflevector <4 x float> %a, <4 x float> %1, <4 x i32> <i32 4, i32 6, i32 2, i32 3>
2439  ret <4 x float> %2
2440}
2441
2442; These tests are designed to test the ability to combine away unnecessary
2443; operations feeding into a shuffle. The AVX cases are the important ones as
2444; they leverage operations which cannot be done naturally on the entire vector
2445; and thus are decomposed into multiple smaller operations.
2446
2447define <8 x i32> @combine_unneeded_subvector1(<8 x i32> %a) {
2448; SSE-LABEL: combine_unneeded_subvector1:
2449; SSE:       # %bb.0:
2450; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[3,2,1,0]
2451; SSE-NEXT:    paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
2452; SSE-NEXT:    movdqa %xmm0, %xmm1
2453; SSE-NEXT:    retq
2454;
2455; AVX1-LABEL: combine_unneeded_subvector1:
2456; AVX1:       # %bb.0:
2457; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
2458; AVX1-NEXT:    vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2459; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
2460; AVX1-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
2461; AVX1-NEXT:    retq
2462;
2463; AVX2-SLOW-LABEL: combine_unneeded_subvector1:
2464; AVX2-SLOW:       # %bb.0:
2465; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
2466; AVX2-SLOW-NEXT:    vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
2467; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3]
2468; AVX2-SLOW-NEXT:    retq
2469;
2470; AVX2-FAST-ALL-LABEL: combine_unneeded_subvector1:
2471; AVX2-FAST-ALL:       # %bb.0:
2472; AVX2-FAST-ALL-NEXT:    vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
2473; AVX2-FAST-ALL-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [7,6,5,4,7,6,5,4]
2474; AVX2-FAST-ALL-NEXT:    # ymm1 = mem[0,1,0,1]
2475; AVX2-FAST-ALL-NEXT:    vpermd %ymm0, %ymm1, %ymm0
2476; AVX2-FAST-ALL-NEXT:    retq
2477;
2478; AVX2-FAST-PERLANE-LABEL: combine_unneeded_subvector1:
2479; AVX2-FAST-PERLANE:       # %bb.0:
2480; AVX2-FAST-PERLANE-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
2481; AVX2-FAST-PERLANE-NEXT:    vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
2482; AVX2-FAST-PERLANE-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3]
2483; AVX2-FAST-PERLANE-NEXT:    retq
2484  %b = add <8 x i32> %a, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
2485  %c = shufflevector <8 x i32> %b, <8 x i32> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 7, i32 6, i32 5, i32 4>
2486  ret <8 x i32> %c
2487}
2488
2489define <8 x i32> @combine_unneeded_subvector2(<8 x i32> %a, <8 x i32> %b) {
2490; SSE-LABEL: combine_unneeded_subvector2:
2491; SSE:       # %bb.0:
2492; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[3,2,1,0]
2493; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[3,2,1,0]
2494; SSE-NEXT:    paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
2495; SSE-NEXT:    retq
2496;
2497; AVX1-LABEL: combine_unneeded_subvector2:
2498; AVX1:       # %bb.0:
2499; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
2500; AVX1-NEXT:    vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2501; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
2502; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
2503; AVX1-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
2504; AVX1-NEXT:    retq
2505;
2506; AVX2-LABEL: combine_unneeded_subvector2:
2507; AVX2:       # %bb.0:
2508; AVX2-NEXT:    vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
2509; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
2510; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
2511; AVX2-NEXT:    retq
2512  %c = add <8 x i32> %a, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
2513  %d = shufflevector <8 x i32> %b, <8 x i32> %c, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 15, i32 14, i32 13, i32 12>
2514  ret <8 x i32> %d
2515}
2516
2517define <4 x float> @combine_insertps1(<4 x float> %a, <4 x float> %b) {
2518; SSE2-LABEL: combine_insertps1:
2519; SSE2:       # %bb.0:
2520; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[1,0]
2521; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[2,3]
2522; SSE2-NEXT:    movaps %xmm1, %xmm0
2523; SSE2-NEXT:    retq
2524;
2525; SSSE3-LABEL: combine_insertps1:
2526; SSSE3:       # %bb.0:
2527; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[1,0]
2528; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[2,3]
2529; SSSE3-NEXT:    movaps %xmm1, %xmm0
2530; SSSE3-NEXT:    retq
2531;
2532; SSE41-LABEL: combine_insertps1:
2533; SSE41:       # %bb.0:
2534; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm1[2],xmm0[1,2,3]
2535; SSE41-NEXT:    retq
2536;
2537; AVX-LABEL: combine_insertps1:
2538; AVX:       # %bb.0:
2539; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[2],xmm0[1,2,3]
2540; AVX-NEXT:    retq
2541
2542  %c = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32><i32 0, i32 6, i32 2, i32 4>
2543  %d = shufflevector <4 x float> %a, <4 x float> %c, <4 x i32> <i32 5, i32 1, i32 6, i32 3>
2544  ret <4 x float> %d
2545}
2546
2547define <4 x float> @combine_insertps2(<4 x float> %a, <4 x float> %b) {
2548; SSE2-LABEL: combine_insertps2:
2549; SSE2:       # %bb.0:
2550; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[0,0]
2551; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3]
2552; SSE2-NEXT:    movaps %xmm1, %xmm0
2553; SSE2-NEXT:    retq
2554;
2555; SSSE3-LABEL: combine_insertps2:
2556; SSSE3:       # %bb.0:
2557; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[0,0]
2558; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3]
2559; SSSE3-NEXT:    movaps %xmm1, %xmm0
2560; SSSE3-NEXT:    retq
2561;
2562; SSE41-LABEL: combine_insertps2:
2563; SSE41:       # %bb.0:
2564; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],xmm1[2],xmm0[2,3]
2565; SSE41-NEXT:    retq
2566;
2567; AVX-LABEL: combine_insertps2:
2568; AVX:       # %bb.0:
2569; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[2],xmm0[2,3]
2570; AVX-NEXT:    retq
2571
2572  %c = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32><i32 0, i32 1, i32 6, i32 7>
2573  %d = shufflevector <4 x float> %a, <4 x float> %c, <4 x i32> <i32 4, i32 6, i32 2, i32 3>
2574  ret <4 x float> %d
2575}
2576
2577define <4 x float> @combine_insertps3(<4 x float> %a, <4 x float> %b) {
2578; SSE2-LABEL: combine_insertps3:
2579; SSE2:       # %bb.0:
2580; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0]
2581; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
2582; SSE2-NEXT:    retq
2583;
2584; SSSE3-LABEL: combine_insertps3:
2585; SSSE3:       # %bb.0:
2586; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0]
2587; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
2588; SSSE3-NEXT:    retq
2589;
2590; SSE41-LABEL: combine_insertps3:
2591; SSE41:       # %bb.0:
2592; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
2593; SSE41-NEXT:    retq
2594;
2595; AVX-LABEL: combine_insertps3:
2596; AVX:       # %bb.0:
2597; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
2598; AVX-NEXT:    retq
2599
2600  %c = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32><i32 0, i32 4, i32 2, i32 5>
2601  %d = shufflevector <4 x float> %a, <4 x float> %c, <4 x i32><i32 4, i32 1, i32 5, i32 3>
2602  ret <4 x float> %d
2603}
2604
2605define <4 x float> @combine_insertps4(<4 x float> %a, <4 x float> %b) {
2606; SSE2-LABEL: combine_insertps4:
2607; SSE2:       # %bb.0:
2608; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
2609; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
2610; SSE2-NEXT:    retq
2611;
2612; SSSE3-LABEL: combine_insertps4:
2613; SSSE3:       # %bb.0:
2614; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
2615; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
2616; SSSE3-NEXT:    retq
2617;
2618; SSE41-LABEL: combine_insertps4:
2619; SSE41:       # %bb.0:
2620; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
2621; SSE41-NEXT:    retq
2622;
2623; AVX-LABEL: combine_insertps4:
2624; AVX:       # %bb.0:
2625; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
2626; AVX-NEXT:    retq
2627
2628  %c = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32><i32 0, i32 4, i32 2, i32 5>
2629  %d = shufflevector <4 x float> %a, <4 x float> %c, <4 x i32><i32 4, i32 1, i32 6, i32 5>
2630  ret <4 x float> %d
2631}
2632
2633define void @combine_scalar_load_with_blend_with_zero(ptr %a0, ptr %a1) {
2634; SSE-LABEL: combine_scalar_load_with_blend_with_zero:
2635; SSE:       # %bb.0:
2636; SSE-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
2637; SSE-NEXT:    movaps %xmm0, (%rsi)
2638; SSE-NEXT:    retq
2639;
2640; AVX-LABEL: combine_scalar_load_with_blend_with_zero:
2641; AVX:       # %bb.0:
2642; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
2643; AVX-NEXT:    vmovaps %xmm0, (%rsi)
2644; AVX-NEXT:    retq
2645  %1 = load double, ptr %a0, align 8
2646  %2 = insertelement <2 x double> undef, double %1, i32 0
2647  %3 = insertelement <2 x double> %2, double 0.000000e+00, i32 1
2648  %4 = bitcast <2 x double> %3 to <4 x float>
2649  %5 = shufflevector <4 x float> %4, <4 x float> <float 0.000000e+00, float undef, float undef, float undef>, <4 x i32> <i32 0, i32 1, i32 4, i32 3>
2650  store <4 x float> %5, ptr %a1, align 16
2651  ret void
2652}
2653
2654; PR30371
2655define <4 x float> @combine_constant_insertion_v4f32(float %f) {
2656; SSE2-LABEL: combine_constant_insertion_v4f32:
2657; SSE2:       # %bb.0:
2658; SSE2-NEXT:    movaps {{.*#+}} xmm1 = [u,4.0E+0,5.0E+0,3.0E+0]
2659; SSE2-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
2660; SSE2-NEXT:    movaps %xmm1, %xmm0
2661; SSE2-NEXT:    retq
2662;
2663; SSSE3-LABEL: combine_constant_insertion_v4f32:
2664; SSSE3:       # %bb.0:
2665; SSSE3-NEXT:    movaps {{.*#+}} xmm1 = [u,4.0E+0,5.0E+0,3.0E+0]
2666; SSSE3-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
2667; SSSE3-NEXT:    movaps %xmm1, %xmm0
2668; SSSE3-NEXT:    retq
2669;
2670; SSE41-LABEL: combine_constant_insertion_v4f32:
2671; SSE41:       # %bb.0:
2672; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],mem[1,2,3]
2673; SSE41-NEXT:    retq
2674;
2675; AVX-LABEL: combine_constant_insertion_v4f32:
2676; AVX:       # %bb.0:
2677; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],mem[1,2,3]
2678; AVX-NEXT:    retq
2679  %a0 = insertelement <4 x float> undef, float %f, i32 0
2680  %ret = shufflevector <4 x float> %a0, <4 x float> <float undef, float 4.0, float 5.0, float 3.0>, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
2681  ret <4 x float> %ret
2682}
2683
2684define <4 x i32> @combine_constant_insertion_v4i32(i32 %f) {
2685; SSE2-LABEL: combine_constant_insertion_v4i32:
2686; SSE2:       # %bb.0:
2687; SSE2-NEXT:    movd %edi, %xmm1
2688; SSE2-NEXT:    movaps {{.*#+}} xmm0 = [u,4,5,30]
2689; SSE2-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
2690; SSE2-NEXT:    retq
2691;
2692; SSSE3-LABEL: combine_constant_insertion_v4i32:
2693; SSSE3:       # %bb.0:
2694; SSSE3-NEXT:    movd %edi, %xmm1
2695; SSSE3-NEXT:    movaps {{.*#+}} xmm0 = [u,4,5,30]
2696; SSSE3-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
2697; SSSE3-NEXT:    retq
2698;
2699; SSE41-LABEL: combine_constant_insertion_v4i32:
2700; SSE41:       # %bb.0:
2701; SSE41-NEXT:    pmovsxbd {{.*#+}} xmm0 = [0,4,5,30]
2702; SSE41-NEXT:    pinsrd $0, %edi, %xmm0
2703; SSE41-NEXT:    retq
2704;
2705; AVX-LABEL: combine_constant_insertion_v4i32:
2706; AVX:       # %bb.0:
2707; AVX-NEXT:    vpmovsxbd {{.*#+}} xmm0 = [0,4,5,30]
2708; AVX-NEXT:    vpinsrd $0, %edi, %xmm0, %xmm0
2709; AVX-NEXT:    retq
2710  %a0 = insertelement <4 x i32> undef, i32 %f, i32 0
2711  %ret = shufflevector <4 x i32> %a0, <4 x i32> <i32 undef, i32 4, i32 5, i32 30>, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
2712  ret <4 x i32> %ret
2713}
2714
2715define <4 x float> @PR22377(<4 x float> %a, <4 x float> %b) {
2716; SSE2-LABEL: PR22377:
2717; SSE2:       # %bb.0: # %entry
2718; SSE2-NEXT:    movaps %xmm0, %xmm1
2719; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,3],xmm0[2,3]
2720; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2,0,2]
2721; SSE2-NEXT:    addps %xmm0, %xmm1
2722; SSE2-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
2723; SSE2-NEXT:    retq
2724;
2725; SSSE3-LABEL: PR22377:
2726; SSSE3:       # %bb.0: # %entry
2727; SSSE3-NEXT:    movaps %xmm0, %xmm1
2728; SSSE3-NEXT:    haddps %xmm0, %xmm1
2729; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,1]
2730; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
2731; SSSE3-NEXT:    retq
2732;
2733; SSE41-LABEL: PR22377:
2734; SSE41:       # %bb.0: # %entry
2735; SSE41-NEXT:    movaps %xmm0, %xmm1
2736; SSE41-NEXT:    haddps %xmm0, %xmm1
2737; SSE41-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,1]
2738; SSE41-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
2739; SSE41-NEXT:    retq
2740;
2741; AVX-LABEL: PR22377:
2742; AVX:       # %bb.0: # %entry
2743; AVX-NEXT:    vhaddps %xmm0, %xmm0, %xmm1
2744; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,1]
2745; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
2746; AVX-NEXT:    retq
2747entry:
2748  %s1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 1, i32 3, i32 1, i32 3>
2749  %s2 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 2, i32 0, i32 2>
2750  %r2 = fadd <4 x float> %s1, %s2
2751  %s3 = shufflevector <4 x float> %s2, <4 x float> %r2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
2752  ret <4 x float> %s3
2753}
2754
2755define <4 x float> @PR22390(<4 x float> %a, <4 x float> %b) {
2756; SSE2-LABEL: PR22390:
2757; SSE2:       # %bb.0: # %entry
2758; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,0,1,2]
2759; SSE2-NEXT:    movaps %xmm0, %xmm2
2760; SSE2-NEXT:    movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
2761; SSE2-NEXT:    addps %xmm2, %xmm0
2762; SSE2-NEXT:    retq
2763;
2764; SSSE3-LABEL: PR22390:
2765; SSSE3:       # %bb.0: # %entry
2766; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,0,1,2]
2767; SSSE3-NEXT:    movaps %xmm0, %xmm2
2768; SSSE3-NEXT:    movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
2769; SSSE3-NEXT:    addps %xmm2, %xmm0
2770; SSSE3-NEXT:    retq
2771;
2772; SSE41-LABEL: PR22390:
2773; SSE41:       # %bb.0: # %entry
2774; SSE41-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,0,1,2]
2775; SSE41-NEXT:    blendps {{.*#+}} xmm1 = xmm1[0],xmm0[1,2,3]
2776; SSE41-NEXT:    addps %xmm1, %xmm0
2777; SSE41-NEXT:    retq
2778;
2779; AVX-LABEL: PR22390:
2780; AVX:       # %bb.0: # %entry
2781; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[3,0,1,2]
2782; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0],xmm0[1,2,3]
2783; AVX-NEXT:    vaddps %xmm1, %xmm0, %xmm0
2784; AVX-NEXT:    retq
2785entry:
2786  %s1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 3, i32 0, i32 1, i32 2>
2787  %s2 = shufflevector <4 x float> %s1, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
2788  %r2 = fadd <4 x float> %s1, %s2
2789  ret <4 x float> %r2
2790}
2791
2792define <8 x float> @PR22412(<8 x float> %a, <8 x float> %b) {
2793; SSE-LABEL: PR22412:
2794; SSE:       # %bb.0: # %entry
2795; SSE-NEXT:    movaps %xmm3, %xmm1
2796; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm3[3,2]
2797; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,0],xmm2[3,2]
2798; SSE-NEXT:    retq
2799;
2800; AVX1-LABEL: PR22412:
2801; AVX1:       # %bb.0: # %entry
2802; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm1[2,3,0,1]
2803; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
2804; AVX1-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[1,0],ymm2[3,2],ymm0[5,4],ymm2[7,6]
2805; AVX1-NEXT:    retq
2806;
2807; AVX2-LABEL: PR22412:
2808; AVX2:       # %bb.0: # %entry
2809; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
2810; AVX2-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[2,3,0,1]
2811; AVX2-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[1,0],ymm1[3,2],ymm0[5,4],ymm1[7,6]
2812; AVX2-NEXT:    retq
2813entry:
2814  %s1 = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 1, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
2815  %s2 = shufflevector <8 x float> %s1, <8 x float> undef, <8 x i32> <i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2>
2816  ret <8 x float> %s2
2817}
2818
2819define <4 x float> @PR30264(<4 x float> %x) {
2820; SSE2-LABEL: PR30264:
2821; SSE2:       # %bb.0:
2822; SSE2-NEXT:    xorps %xmm1, %xmm1
2823; SSE2-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
2824; SSE2-NEXT:    shufpd {{.*#+}} xmm1 = xmm1[0],mem[1]
2825; SSE2-NEXT:    movapd %xmm1, %xmm0
2826; SSE2-NEXT:    retq
2827;
2828; SSSE3-LABEL: PR30264:
2829; SSSE3:       # %bb.0:
2830; SSSE3-NEXT:    xorps %xmm1, %xmm1
2831; SSSE3-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
2832; SSSE3-NEXT:    shufpd {{.*#+}} xmm1 = xmm1[0],mem[1]
2833; SSSE3-NEXT:    movapd %xmm1, %xmm0
2834; SSSE3-NEXT:    retq
2835;
2836; SSE41-LABEL: PR30264:
2837; SSE41:       # %bb.0:
2838; SSE41-NEXT:    movaps {{.*#+}} xmm1 = [u,u,4.0E+0,1.0E+0]
2839; SSE41-NEXT:    insertps {{.*#+}} xmm1 = xmm0[0],zero,xmm1[2,3]
2840; SSE41-NEXT:    movaps %xmm1, %xmm0
2841; SSE41-NEXT:    retq
2842;
2843; AVX-LABEL: PR30264:
2844; AVX:       # %bb.0:
2845; AVX-NEXT:    vmovddup {{.*#+}} xmm1 = [4.0E+0,1.0E+0,4.0E+0,1.0E+0]
2846; AVX-NEXT:    # xmm1 = mem[0,0]
2847; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],zero,xmm1[2,3]
2848; AVX-NEXT:    retq
2849  %shuf1 = shufflevector <4 x float> %x, <4 x float> <float undef, float 0.0, float undef, float undef>, <4 x i32> <i32 0, i32 5, i32 undef, i32 undef>
2850  %shuf2 = shufflevector <4 x float> %shuf1, <4 x float> <float undef, float undef, float 4.0, float 1.0>, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
2851  ret <4 x float> %shuf2
2852}
2853
2854define <8 x i16> @PR39549(<16 x i8> %x) {
2855; SSE-LABEL: PR39549:
2856; SSE:       # %bb.0:
2857; SSE-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
2858; SSE-NEXT:    psraw $8, %xmm0
2859; SSE-NEXT:    retq
2860;
2861; AVX-LABEL: PR39549:
2862; AVX:       # %bb.0:
2863; AVX-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
2864; AVX-NEXT:    vpsraw $8, %xmm0, %xmm0
2865; AVX-NEXT:    retq
2866  %a = shufflevector <16 x i8> %x, <16 x i8> undef, <16 x i32> <i32 8, i32 undef, i32 9, i32 undef, i32 10, i32 undef, i32 11, i32 undef, i32 12, i32 undef, i32 13, i32 undef, i32 14, i32 undef, i32 15, i32 undef>
2867  %b = bitcast <16 x i8> %a to <8 x i16>
2868  %c = shl <8 x i16> %b, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
2869  %d = ashr <8 x i16> %c, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
2870  ret <8 x i16> %d
2871}
2872
2873define <4 x i32> @PR41545(<4 x i32> %a0, <16 x i8> %a1) {
2874; SSE-LABEL: PR41545:
2875; SSE:       # %bb.0:
2876; SSE-NEXT:    paddd %xmm1, %xmm0
2877; SSE-NEXT:    retq
2878;
2879; AVX-LABEL: PR41545:
2880; AVX:       # %bb.0:
2881; AVX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
2882; AVX-NEXT:    retq
2883  %1  = shufflevector <16 x i8> %a1, <16 x i8> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
2884  %2  = shufflevector <16 x i8> %a1, <16 x i8> undef, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
2885  %3  = shufflevector <16 x i8> %a1, <16 x i8> undef, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
2886  %4  = shufflevector <16 x i8> %a1, <16 x i8> undef, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
2887  %5  = zext <4 x i8> %1 to <4 x i32>
2888  %6  = zext <4 x i8> %2 to <4 x i32>
2889  %7  = zext <4 x i8> %3 to <4 x i32>
2890  %8  = zext <4 x i8> %4 to <4 x i32>
2891  %9  = shl <4 x i32> %6, <i32 8, i32 8, i32 8, i32 8>
2892  %10 = shl <4 x i32> %7, <i32 16, i32 16, i32 16, i32 16>
2893  %11 = shl <4 x i32> %8, <i32 24, i32 24, i32 24, i32 24>
2894  %12 = or <4 x i32> %5, %9
2895  %13 = or <4 x i32> %12, %10
2896  %14 = or <4 x i32> %13, %11
2897  %15 = add <4 x i32> %a0, %14
2898  ret <4 x i32> %15
2899}
2900
2901define <8 x i16> @shuffle_extract_insert(<8 x i16> %a) {
2902; SSE-LABEL: shuffle_extract_insert:
2903; SSE:       # %bb.0:
2904; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[2,1,0,3,4,5,6,7]
2905; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,4,7]
2906; SSE-NEXT:    retq
2907;
2908; AVX1-LABEL: shuffle_extract_insert:
2909; AVX1:       # %bb.0:
2910; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[2,1,0,3,4,5,6,7]
2911; AVX1-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,4,7]
2912; AVX1-NEXT:    retq
2913;
2914; AVX2-SLOW-LABEL: shuffle_extract_insert:
2915; AVX2-SLOW:       # %bb.0:
2916; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[2,1,0,3,4,5,6,7]
2917; AVX2-SLOW-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,4,7]
2918; AVX2-SLOW-NEXT:    retq
2919;
2920; AVX2-FAST-LABEL: shuffle_extract_insert:
2921; AVX2-FAST:       # %bb.0:
2922; AVX2-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[4,5,2,3,0,1,6,7,12,13,10,11,8,9,14,15]
2923; AVX2-FAST-NEXT:    retq
2924  %a0 = extractelement <8 x i16> %a, i32 0
2925  %a1 = extractelement <8 x i16> %a, i32 1
2926  %a3 = extractelement <8 x i16> %a, i32 3
2927  %a4 = extractelement <8 x i16> %a, i32 4
2928  %a5 = extractelement <8 x i16> %a, i32 5
2929  %a6 = extractelement <8 x i16> %a, i32 6
2930  %a7 = extractelement <8 x i16> %a, i32 7
2931  %1 = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> <i32 2, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
2932  %2 = insertelement <8 x i16> %1, i16 %a1, i32 1
2933  %3 = insertelement <8 x i16> %2, i16 %a0, i32 2
2934  %4 = insertelement <8 x i16> %3, i16 %a3, i32 3
2935  %5 = insertelement <8 x i16> %4, i16 %a6, i32 4
2936  %6 = insertelement <8 x i16> %5, i16 %a5, i32 5
2937  %7 = insertelement <8 x i16> %6, i16 %a4, i32 6
2938  %8 = insertelement <8 x i16> %7, i16 %a7, i32 7
2939  ret <8 x i16> %8
2940}
2941
2942define <8 x i16> @shuffle_extract_insert_double(<8 x i16> %a, <8 x i16> %b) {
2943; SSE2-LABEL: shuffle_extract_insert_double:
2944; SSE2:       # %bb.0:
2945; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7]
2946; SSE2-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,6,7]
2947; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
2948; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,1,3,2,4,5,6,7]
2949; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
2950; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
2951; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
2952; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7]
2953; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
2954; SSE2-NEXT:    retq
2955;
2956; SSSE3-LABEL: shuffle_extract_insert_double:
2957; SSSE3:       # %bb.0:
2958; SSSE3-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[0,1,6,7,10,11,14,15,u,u,u,u,u,u,u,u]
2959; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u]
2960; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
2961; SSSE3-NEXT:    retq
2962;
2963; SSE41-LABEL: shuffle_extract_insert_double:
2964; SSE41:       # %bb.0:
2965; SSE41-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[0,1,6,7,10,11,14,15,u,u,u,u,u,u,u,u]
2966; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u]
2967; SSE41-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
2968; SSE41-NEXT:    retq
2969;
2970; AVX-LABEL: shuffle_extract_insert_double:
2971; AVX:       # %bb.0:
2972; AVX-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[0,1,6,7,10,11,14,15,u,u,u,u,u,u,u,u]
2973; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u]
2974; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
2975; AVX-NEXT:    retq
2976  %a0 = extractelement <8 x i16> %a, i32 0
2977  %a4 = extractelement <8 x i16> %a, i32 4
2978  %a6 = extractelement <8 x i16> %a, i32 6
2979  %b11 = extractelement <8 x i16> %b, i32 3
2980  %b13 = extractelement <8 x i16> %b, i32 5
2981  %b15 = extractelement <8 x i16> %b, i32 7
2982  %1 = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 2, i32 8, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
2983  %2 = insertelement <8 x i16> %1, i16 %a0, i32 2
2984  %3 = insertelement <8 x i16> %2, i16 %b11, i32 3
2985  %4 = insertelement <8 x i16> %3, i16 %a6, i32 4
2986  %5 = insertelement <8 x i16> %4, i16 %b13, i32 5
2987  %6 = insertelement <8 x i16> %5, i16 %a4, i32 6
2988  %7 = insertelement <8 x i16> %6, i16 %b15, i32 7
2989  ret <8 x i16> %7
2990}
2991
2992define <8 x i16> @shuffle_extract_concat_insert(<4 x i16> %lhsa, <4 x i16> %rhsa, <8 x i16> %b) {
2993; SSE2-LABEL: shuffle_extract_concat_insert:
2994; SSE2:       # %bb.0:
2995; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2996; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
2997; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
2998; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
2999; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7]
3000; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm2[0,3,2,3,4,5,6,7]
3001; SSE2-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,6,7]
3002; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
3003; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,1,3,2,4,5,6,7]
3004; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
3005; SSE2-NEXT:    retq
3006;
3007; SSSE3-LABEL: shuffle_extract_concat_insert:
3008; SSSE3:       # %bb.0:
3009; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
3010; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u]
3011; SSSE3-NEXT:    pshufb {{.*#+}} xmm2 = xmm2[0,1,6,7,10,11,14,15,u,u,u,u,u,u,u,u]
3012; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
3013; SSSE3-NEXT:    retq
3014;
3015; SSE41-LABEL: shuffle_extract_concat_insert:
3016; SSE41:       # %bb.0:
3017; SSE41-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
3018; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u]
3019; SSE41-NEXT:    pshufb {{.*#+}} xmm2 = xmm2[0,1,6,7,10,11,14,15,u,u,u,u,u,u,u,u]
3020; SSE41-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
3021; SSE41-NEXT:    retq
3022;
3023; AVX-LABEL: shuffle_extract_concat_insert:
3024; AVX:       # %bb.0:
3025; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
3026; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u]
3027; AVX-NEXT:    vpshufb {{.*#+}} xmm1 = xmm2[0,1,6,7,10,11,14,15,u,u,u,u,u,u,u,u]
3028; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
3029; AVX-NEXT:    retq
3030  %a = shufflevector <4 x i16> %lhsa, <4 x i16> %rhsa, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
3031  %a0 = extractelement <8 x i16> %a, i32 0
3032  %a4 = extractelement <8 x i16> %a, i32 4
3033  %a6 = extractelement <8 x i16> %a, i32 6
3034  %b11 = extractelement <8 x i16> %b, i32 3
3035  %b13 = extractelement <8 x i16> %b, i32 5
3036  %b15 = extractelement <8 x i16> %b, i32 7
3037  %1 = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 2, i32 8, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
3038  %2 = insertelement <8 x i16> %1, i16 %a0, i32 2
3039  %3 = insertelement <8 x i16> %2, i16 %b11, i32 3
3040  %4 = insertelement <8 x i16> %3, i16 %a6, i32 4
3041  %5 = insertelement <8 x i16> %4, i16 %b13, i32 5
3042  %6 = insertelement <8 x i16> %5, i16 %a4, i32 6
3043  %7 = insertelement <8 x i16> %6, i16 %b15, i32 7
3044  ret <8 x i16> %7
3045}
3046
3047define <8 x i16> @shuffle_scalar_to_vector_extract(ptr %p0, ptr %p1, ptr %p2) {
3048; SSE2-LABEL: shuffle_scalar_to_vector_extract:
3049; SSE2:       # %bb.0:
3050; SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
3051; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
3052; SSE2-NEXT:    psraw $8, %xmm1
3053; SSE2-NEXT:    pextrw $7, %xmm1, %eax
3054; SSE2-NEXT:    movd %eax, %xmm2
3055; SSE2-NEXT:    movsbl (%rsi), %eax
3056; SSE2-NEXT:    movd %eax, %xmm0
3057; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
3058; SSE2-NEXT:    movsbl (%rdx), %eax
3059; SSE2-NEXT:    movd %eax, %xmm0
3060; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
3061; SSE2-NEXT:    pxor %xmm0, %xmm0
3062; SSE2-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
3063; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
3064; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
3065; SSE2-NEXT:    retq
3066;
3067; SSSE3-LABEL: shuffle_scalar_to_vector_extract:
3068; SSSE3:       # %bb.0:
3069; SSSE3-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
3070; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
3071; SSSE3-NEXT:    psraw $8, %xmm1
3072; SSSE3-NEXT:    movsbl (%rsi), %eax
3073; SSSE3-NEXT:    movd %eax, %xmm2
3074; SSSE3-NEXT:    palignr {{.*#+}} xmm2 = xmm1[14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
3075; SSSE3-NEXT:    movsbl (%rdx), %eax
3076; SSSE3-NEXT:    movd %eax, %xmm0
3077; SSSE3-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
3078; SSSE3-NEXT:    pxor %xmm0, %xmm0
3079; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
3080; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
3081; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
3082; SSSE3-NEXT:    retq
3083;
3084; SSE41-LABEL: shuffle_scalar_to_vector_extract:
3085; SSE41:       # %bb.0:
3086; SSE41-NEXT:    pmovsxbw (%rdi), %xmm0
3087; SSE41-NEXT:    pextrw $4, %xmm0, %eax
3088; SSE41-NEXT:    pextrw $7, %xmm0, %ecx
3089; SSE41-NEXT:    pxor %xmm0, %xmm0
3090; SSE41-NEXT:    pinsrw $1, %eax, %xmm0
3091; SSE41-NEXT:    movl $65531, %eax # imm = 0xFFFB
3092; SSE41-NEXT:    pinsrw $2, %eax, %xmm0
3093; SSE41-NEXT:    pinsrw $4, %ecx, %xmm0
3094; SSE41-NEXT:    movsbl (%rsi), %eax
3095; SSE41-NEXT:    pinsrw $5, %eax, %xmm0
3096; SSE41-NEXT:    movsbl (%rdx), %eax
3097; SSE41-NEXT:    pinsrw $6, %eax, %xmm0
3098; SSE41-NEXT:    retq
3099;
3100; AVX-LABEL: shuffle_scalar_to_vector_extract:
3101; AVX:       # %bb.0:
3102; AVX-NEXT:    vpmovsxbw (%rdi), %xmm0
3103; AVX-NEXT:    vpextrw $4, %xmm0, %eax
3104; AVX-NEXT:    vpextrw $7, %xmm0, %ecx
3105; AVX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
3106; AVX-NEXT:    vpinsrw $1, %eax, %xmm0, %xmm0
3107; AVX-NEXT:    movl $65531, %eax # imm = 0xFFFB
3108; AVX-NEXT:    vpinsrw $2, %eax, %xmm0, %xmm0
3109; AVX-NEXT:    vpinsrw $4, %ecx, %xmm0, %xmm0
3110; AVX-NEXT:    movsbl (%rsi), %eax
3111; AVX-NEXT:    vpinsrw $5, %eax, %xmm0, %xmm0
3112; AVX-NEXT:    movsbl (%rdx), %eax
3113; AVX-NEXT:    vpinsrw $6, %eax, %xmm0, %xmm0
3114; AVX-NEXT:    retq
3115  %tmp = load <8 x i8>, ptr %p0, align 1
3116  %tmp1 = sext <8 x i8> %tmp to <8 x i16>
3117  %tmp2 = load i8, ptr %p1, align 1
3118  %cvt1 = sext i8 %tmp2 to i16
3119  %tmp3 = load i8, ptr %p2, align 1
3120  %cvt2 = sext i8 %tmp3 to i16
3121  %tmp4 = extractelement <8 x i16> %tmp1, i32 4
3122  %tmp5 = extractelement <8 x i16> %tmp1, i32 7
3123  %tmp6 = insertelement <8 x i16> <i16 undef, i16 undef, i16 -5, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef>, i16 undef, i32 0
3124  %tmp7 = insertelement <8 x i16> %tmp6, i16 %tmp4, i32 1
3125  %tmp8 = insertelement <8 x i16> %tmp7, i16 undef, i32 3
3126  %tmp9 = insertelement <8 x i16> %tmp8, i16 %tmp5, i32 4
3127  %tmp10 = insertelement <8 x i16> %tmp9, i16 %cvt1, i32 5
3128  %tmp11 = insertelement <8 x i16> %tmp10, i16 %cvt2, i32 6
3129  %tmp12 = insertelement <8 x i16> %tmp11, i16 undef, i32 7
3130  %tmp13 = shufflevector <8 x i16> %tmp12, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 10, i32 3, i32 4, i32 5, i32 6, i32 7>
3131  ret <8 x i16> %tmp13
3132}
3133
3134; Bug noticed in D96345
3135define i32 @shuffle_binops_with_undef() {
3136; SSE-LABEL: shuffle_binops_with_undef:
3137; SSE:       # %bb.0: # %entry
3138; SSE-NEXT:    movdqa (%rax), %xmm0
3139; SSE-NEXT:    paddw %xmm0, %xmm0
3140; SSE-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
3141; SSE-NEXT:    psrlw %xmm1, %xmm0
3142; SSE-NEXT:    movdqa %xmm0, (%rax)
3143; SSE-NEXT:    retq
3144;
3145; AVX-LABEL: shuffle_binops_with_undef:
3146; AVX:       # %bb.0: # %entry
3147; AVX-NEXT:    vmovdqa (%rax), %xmm0
3148; AVX-NEXT:    vpaddw %xmm0, %xmm0, %xmm0
3149; AVX-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
3150; AVX-NEXT:    vpsrlw %xmm1, %xmm0, %xmm0
3151; AVX-NEXT:    vmovdqa %xmm0, (%rax)
3152; AVX-NEXT:    retq
3153entry:
3154  %load0 = load <8 x i16>, ptr undef, align 16
3155  %load1 = load <8 x i16>, ptr undef, align 16
3156  %shuf0 = shufflevector <16 x i8> undef, <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison>, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
3157  %addi = add <8 x i16> %load0, %load1
3158  %bc0 = bitcast <8 x i16> %addi to <2 x i64>
3159  %bc1 = bitcast <16 x i8> %shuf0 to <8 x i16>
3160  %shuf1 = shufflevector <8 x i16> %load1, <8 x i16> poison, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
3161  %addi24 = add <8 x i16> %shuf1, %bc1
3162  %bc2 = bitcast <8 x i16> %addi24 to <2 x i64>
3163  %shuf2 = shufflevector <2 x i64> %bc0, <2 x i64> %bc2, <2 x i32> <i32 0, i32 2>
3164  %bc3 = bitcast <2 x i64> %shuf2 to <8 x i16>
3165  %psrli = call <8 x i16> @llvm.x86.sse2.psrli.w(<8 x i16> %bc3, i32 ptrtoint (ptr @shuffle_binops_with_undef to i32))
3166  store <8 x i16> %psrli, ptr undef, align 16
3167  ret i32 undef
3168}
3169declare <8 x i16> @llvm.x86.sse2.psrli.w(<8 x i16>, i32)
3170
3171define void @PR43024() {
3172; SSE2-LABEL: PR43024:
3173; SSE2:       # %bb.0:
3174; SSE2-NEXT:    movsd {{.*#+}} xmm0 = [NaN,NaN,0.0E+0,0.0E+0]
3175; SSE2-NEXT:    movaps %xmm0, (%rax)
3176; SSE2-NEXT:    addss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
3177; SSE2-NEXT:    xorps %xmm1, %xmm1
3178; SSE2-NEXT:    addss %xmm1, %xmm0
3179; SSE2-NEXT:    addss %xmm1, %xmm0
3180; SSE2-NEXT:    movss %xmm0, (%rax)
3181; SSE2-NEXT:    retq
3182;
3183; SSSE3-LABEL: PR43024:
3184; SSSE3:       # %bb.0:
3185; SSSE3-NEXT:    movsd {{.*#+}} xmm0 = [NaN,NaN,0.0E+0,0.0E+0]
3186; SSSE3-NEXT:    movaps %xmm0, (%rax)
3187; SSSE3-NEXT:    addss %xmm0, %xmm0
3188; SSSE3-NEXT:    xorps %xmm1, %xmm1
3189; SSSE3-NEXT:    addss %xmm1, %xmm0
3190; SSSE3-NEXT:    addss %xmm1, %xmm0
3191; SSSE3-NEXT:    movss %xmm0, (%rax)
3192; SSSE3-NEXT:    retq
3193;
3194; SSE41-LABEL: PR43024:
3195; SSE41:       # %bb.0:
3196; SSE41-NEXT:    movsd {{.*#+}} xmm0 = [NaN,NaN,0.0E+0,0.0E+0]
3197; SSE41-NEXT:    movaps %xmm0, (%rax)
3198; SSE41-NEXT:    addss %xmm0, %xmm0
3199; SSE41-NEXT:    xorps %xmm1, %xmm1
3200; SSE41-NEXT:    addss %xmm1, %xmm0
3201; SSE41-NEXT:    addss %xmm1, %xmm0
3202; SSE41-NEXT:    movss %xmm0, (%rax)
3203; SSE41-NEXT:    retq
3204;
3205; AVX-LABEL: PR43024:
3206; AVX:       # %bb.0:
3207; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = [NaN,NaN,0.0E+0,0.0E+0]
3208; AVX-NEXT:    vmovaps %xmm0, (%rax)
3209; AVX-NEXT:    vaddss {{\.?LCPI[0-9]+_[0-9]+}}+4(%rip), %xmm0, %xmm0
3210; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
3211; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0
3212; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0
3213; AVX-NEXT:    vmovss %xmm0, (%rax)
3214; AVX-NEXT:    retq
3215  store <4 x float> <float 0x7FF8000000000000, float 0x7FF8000000000000, float 0x0, float 0x0>, ptr undef, align 16
3216  %1 = load <4 x float>, ptr undef, align 16
3217  %2 = fmul <4 x float> %1, <float 0x0, float 0x0, float 0x0, float 0x0>
3218  %3 = shufflevector <4 x float> %2, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
3219  %4 = fadd <4 x float> %2, %3
3220  %5 = fadd <4 x float> zeroinitializer, %4
3221  %6 = shufflevector <4 x float> %2, <4 x float> undef, <4 x i32> <i32 3, i32 undef, i32 undef, i32 undef>
3222  %7 = fadd <4 x float> %6, %5
3223  %8 = extractelement <4 x float> %7, i32 0
3224  store float %8, ptr undef, align 8
3225  ret void
3226}
3227
3228declare <4 x float> @llvm.experimental.constrained.fadd.v4f32(<4 x float>, <4 x float>, metadata, metadata)
3229declare <4 x float> @llvm.experimental.constrained.fmul.v4f32(<4 x float>, <4 x float>, metadata, metadata)
3230
3231define void @PR43024_strictfp() strictfp {
3232; SSE2-LABEL: PR43024_strictfp:
3233; SSE2:       # %bb.0:
3234; SSE2-NEXT:    movsd {{.*#+}} xmm0 = [NaN,NaN,0.0E+0,0.0E+0]
3235; SSE2-NEXT:    movaps %xmm0, (%rax)
3236; SSE2-NEXT:    xorps %xmm1, %xmm1
3237; SSE2-NEXT:    mulps %xmm1, %xmm0
3238; SSE2-NEXT:    movaps %xmm0, %xmm2
3239; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,1],xmm0[1,1]
3240; SSE2-NEXT:    addps %xmm0, %xmm2
3241; SSE2-NEXT:    addps %xmm1, %xmm2
3242; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
3243; SSE2-NEXT:    addps %xmm2, %xmm0
3244; SSE2-NEXT:    movss %xmm0, (%rax)
3245; SSE2-NEXT:    retq
3246;
3247; SSSE3-LABEL: PR43024_strictfp:
3248; SSSE3:       # %bb.0:
3249; SSSE3-NEXT:    movsd {{.*#+}} xmm0 = [NaN,NaN,0.0E+0,0.0E+0]
3250; SSSE3-NEXT:    movaps %xmm0, (%rax)
3251; SSSE3-NEXT:    xorps %xmm1, %xmm1
3252; SSSE3-NEXT:    mulps %xmm1, %xmm0
3253; SSSE3-NEXT:    movshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
3254; SSSE3-NEXT:    addps %xmm0, %xmm2
3255; SSSE3-NEXT:    addps %xmm1, %xmm2
3256; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
3257; SSSE3-NEXT:    addps %xmm2, %xmm0
3258; SSSE3-NEXT:    movss %xmm0, (%rax)
3259; SSSE3-NEXT:    retq
3260;
3261; SSE41-LABEL: PR43024_strictfp:
3262; SSE41:       # %bb.0:
3263; SSE41-NEXT:    movsd {{.*#+}} xmm0 = [NaN,NaN,0.0E+0,0.0E+0]
3264; SSE41-NEXT:    movaps %xmm0, (%rax)
3265; SSE41-NEXT:    xorps %xmm1, %xmm1
3266; SSE41-NEXT:    mulps %xmm1, %xmm0
3267; SSE41-NEXT:    movshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
3268; SSE41-NEXT:    addps %xmm0, %xmm2
3269; SSE41-NEXT:    addps %xmm1, %xmm2
3270; SSE41-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
3271; SSE41-NEXT:    addps %xmm2, %xmm0
3272; SSE41-NEXT:    movss %xmm0, (%rax)
3273; SSE41-NEXT:    retq
3274;
3275; AVX-LABEL: PR43024_strictfp:
3276; AVX:       # %bb.0:
3277; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = [NaN,NaN,0.0E+0,0.0E+0]
3278; AVX-NEXT:    vmovaps %xmm0, (%rax)
3279; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
3280; AVX-NEXT:    vmulps %xmm1, %xmm0, %xmm0
3281; AVX-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
3282; AVX-NEXT:    vaddps %xmm2, %xmm0, %xmm2
3283; AVX-NEXT:    vaddps %xmm2, %xmm1, %xmm1
3284; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
3285; AVX-NEXT:    vaddps %xmm1, %xmm0, %xmm0
3286; AVX-NEXT:    vmovss %xmm0, (%rax)
3287; AVX-NEXT:    retq
3288  store <4 x float> <float 0x7FF8000000000000, float 0x7FF8000000000000, float 0x0, float 0x0>, ptr undef, align 16
3289  %1 = load <4 x float>, ptr undef, align 16
3290  %2 = call <4 x float> @llvm.experimental.constrained.fmul.v4f32(<4 x float> %1, <4 x float> zeroinitializer, metadata !"round.dynamic", metadata !"fpexcept.strict")
3291  %3 = shufflevector <4 x float> %2, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
3292  %4 = call <4 x float> @llvm.experimental.constrained.fadd.v4f32(<4 x float> %2, <4 x float> %3, metadata !"round.dynamic", metadata !"fpexcept.strict")
3293  %5 = call <4 x float> @llvm.experimental.constrained.fadd.v4f32(<4 x float> zeroinitializer, <4 x float> %4, metadata !"round.dynamic", metadata !"fpexcept.strict")
3294  %6 = shufflevector <4 x float> %2, <4 x float> undef, <4 x i32> <i32 3, i32 undef, i32 undef, i32 undef>
3295  %7 = call <4 x float> @llvm.experimental.constrained.fadd.v4f32(<4 x float> %6, <4 x float> %5, metadata !"round.dynamic", metadata !"fpexcept.strict")
3296  %8 = extractelement <4 x float> %7, i32 0
3297  store float %8, ptr undef, align 8
3298  ret void
3299}
3300
3301define void @PR45604(ptr %dst, ptr %src) {
3302; SSE2-LABEL: PR45604:
3303; SSE2:       # %bb.0:
3304; SSE2-NEXT:    movdqa (%rsi), %xmm0
3305; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,0,0]
3306; SSE2-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5]
3307; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [0,65535,65535,65535,0,65535,65535,65535]
3308; SSE2-NEXT:    movdqa %xmm2, %xmm3
3309; SSE2-NEXT:    pandn %xmm1, %xmm3
3310; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [0,0,0,0,11,0,0,0,0,0,0,0,11,0,0,0]
3311; SSE2-NEXT:    por %xmm1, %xmm3
3312; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[1,1,1,1]
3313; SSE2-NEXT:    pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,5,5,5]
3314; SSE2-NEXT:    movdqa %xmm2, %xmm5
3315; SSE2-NEXT:    pandn %xmm4, %xmm5
3316; SSE2-NEXT:    por %xmm1, %xmm5
3317; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[2,2,2,2]
3318; SSE2-NEXT:    pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,5,5,5]
3319; SSE2-NEXT:    movdqa %xmm2, %xmm6
3320; SSE2-NEXT:    pandn %xmm4, %xmm6
3321; SSE2-NEXT:    por %xmm1, %xmm6
3322; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
3323; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5]
3324; SSE2-NEXT:    pandn %xmm0, %xmm2
3325; SSE2-NEXT:    por %xmm1, %xmm2
3326; SSE2-NEXT:    movdqa %xmm2, 48(%rdi)
3327; SSE2-NEXT:    movdqa %xmm6, 32(%rdi)
3328; SSE2-NEXT:    movdqa %xmm5, 16(%rdi)
3329; SSE2-NEXT:    movdqa %xmm3, (%rdi)
3330; SSE2-NEXT:    retq
3331;
3332; SSSE3-LABEL: PR45604:
3333; SSSE3:       # %bb.0:
3334; SSSE3-NEXT:    movdqa (%rsi), %xmm0
3335; SSSE3-NEXT:    movdqa %xmm0, %xmm1
3336; SSSE3-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[0,1],zero,zero,zero,zero,zero,zero,xmm1[2,3],zero,zero,zero,zero,zero,zero
3337; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [0,0,0,0,11,0,0,0,0,0,0,0,11,0,0,0]
3338; SSSE3-NEXT:    por %xmm2, %xmm1
3339; SSSE3-NEXT:    movdqa %xmm0, %xmm3
3340; SSSE3-NEXT:    pshufb {{.*#+}} xmm3 = xmm3[4,5],zero,zero,zero,zero,zero,zero,xmm3[6,7],zero,zero,zero,zero,zero,zero
3341; SSSE3-NEXT:    por %xmm2, %xmm3
3342; SSSE3-NEXT:    movdqa %xmm0, %xmm4
3343; SSSE3-NEXT:    pshufb {{.*#+}} xmm4 = xmm4[8,9],zero,zero,zero,zero,zero,zero,xmm4[10,11],zero,zero,zero,zero,zero,zero
3344; SSSE3-NEXT:    por %xmm2, %xmm4
3345; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[12,13],zero,zero,zero,zero,zero,zero,xmm0[14,15],zero,zero,zero,zero,zero,zero
3346; SSSE3-NEXT:    por %xmm2, %xmm0
3347; SSSE3-NEXT:    movdqa %xmm0, 48(%rdi)
3348; SSSE3-NEXT:    movdqa %xmm4, 32(%rdi)
3349; SSSE3-NEXT:    movdqa %xmm3, 16(%rdi)
3350; SSSE3-NEXT:    movdqa %xmm1, (%rdi)
3351; SSSE3-NEXT:    retq
3352;
3353; SSE41-LABEL: PR45604:
3354; SSE41:       # %bb.0:
3355; SSE41-NEXT:    movdqa (%rsi), %xmm0
3356; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
3357; SSE41-NEXT:    pmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
3358; SSE41-NEXT:    pmovsxbd {{.*#+}} xmm2 = [0,11,0,11]
3359; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
3360; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
3361; SSE41-NEXT:    pmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero
3362; SSE41-NEXT:    pblendw {{.*#+}} xmm3 = xmm3[0],xmm2[1,2,3],xmm3[4],xmm2[5,6,7]
3363; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[3,3,3,3]
3364; SSE41-NEXT:    pmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
3365; SSE41-NEXT:    pblendw {{.*#+}} xmm4 = xmm4[0],xmm2[1,2,3],xmm4[4],xmm2[5,6,7]
3366; SSE41-NEXT:    pmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
3367; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7]
3368; SSE41-NEXT:    movdqa %xmm0, (%rdi)
3369; SSE41-NEXT:    movdqa %xmm4, 48(%rdi)
3370; SSE41-NEXT:    movdqa %xmm3, 32(%rdi)
3371; SSE41-NEXT:    movdqa %xmm1, 16(%rdi)
3372; SSE41-NEXT:    retq
3373;
3374; AVX1-LABEL: PR45604:
3375; AVX1:       # %bb.0:
3376; AVX1-NEXT:    vmovdqa (%rsi), %xmm0
3377; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[3,3,3,3]
3378; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
3379; AVX1-NEXT:    vmovddup {{.*#+}} xmm2 = [11,11,11,0,11,11,11,0]
3380; AVX1-NEXT:    # xmm2 = mem[0,0]
3381; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
3382; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
3383; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero
3384; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7]
3385; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm3, %ymm1
3386; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1]
3387; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero
3388; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7]
3389; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
3390; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
3391; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm0
3392; AVX1-NEXT:    vmovups %ymm0, (%rdi)
3393; AVX1-NEXT:    vmovups %ymm1, 32(%rdi)
3394; AVX1-NEXT:    vzeroupper
3395; AVX1-NEXT:    retq
3396;
3397; AVX2-LABEL: PR45604:
3398; AVX2:       # %bb.0:
3399; AVX2-NEXT:    vmovdqa (%rsi), %xmm0
3400; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[0,2,0,2]
3401; AVX2-NEXT:    vpmovsxdq {{.*#+}} ymm2 = [151519488,185205506,218891524,252577542]
3402; AVX2-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
3403; AVX2-NEXT:    vpbroadcastd {{.*#+}} ymm3 = [11,0,0,0,11,0,0,0,11,0,0,0,11,0,0,0,11,0,0,0,11,0,0,0,11,0,0,0,11,0,0,0]
3404; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2],ymm3[3],ymm1[4],ymm3[5],ymm1[6],ymm3[7]
3405; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[1,3,1,3]
3406; AVX2-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
3407; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2],ymm3[3],ymm0[4],ymm3[5],ymm0[6],ymm3[7]
3408; AVX2-NEXT:    vmovdqu %ymm0, 32(%rdi)
3409; AVX2-NEXT:    vmovdqu %ymm1, (%rdi)
3410; AVX2-NEXT:    vzeroupper
3411; AVX2-NEXT:    retq
3412  %v1 = load <8 x i16>, ptr %src, align 16
3413  %v2 = shufflevector <8 x i16> %v1, <8 x i16> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
3414  %v3 = shufflevector <16 x i16> %v2, <16 x i16> <i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, <32 x i32> <i32 0, i32 8, i32 16, i32 24, i32 1, i32 9, i32 17, i32 25, i32 2, i32 10, i32 18, i32 26, i32 3, i32 11, i32 19, i32 27, i32 4, i32 12, i32 20, i32 28, i32 5, i32 13, i32 21, i32 29, i32 6, i32 14, i32 22, i32 30, i32 7, i32 15, i32 23, i32 31>
3415  store <32 x i16> %v3, ptr %dst, align 16
3416  ret void
3417}
3418
3419; getFauxShuffle AND/ANDN decoding wrongly assumed an undef src always gives an undef dst.
3420define <2 x i64> @PR55157(ptr %0) {
3421; SSE-LABEL: PR55157:
3422; SSE:       # %bb.0:
3423; SSE-NEXT:    xorps %xmm0, %xmm0
3424; SSE-NEXT:    retq
3425;
3426; AVX-LABEL: PR55157:
3427; AVX:       # %bb.0:
3428; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
3429; AVX-NEXT:    retq
3430  %2 = load <16 x i8>, ptr %0, align 16
3431  %3 = icmp eq <16 x i8> %2, zeroinitializer
3432  %4 = tail call <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8> zeroinitializer, <16 x i8> zeroinitializer)
3433  %5 = select <16 x i1> %3, <16 x i8> <i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, <16 x i8> %4
3434  %6 = shufflevector <16 x i8> %5, <16 x i8> poison, <16 x i32> <i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15>
3435  %7 = bitcast <16 x i8> %6 to <2 x i64>
3436  ret <2 x i64> %7
3437}
3438declare <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8>, <16 x i8>)
3439
3440; SelectionDAG::isSplatValue - incorrect handling of undef sub-elements
3441define <2 x i64> @PR56520(<16 x i8> %0) {
3442; SSE-LABEL: PR56520:
3443; SSE:       # %bb.0:
3444; SSE-NEXT:    pxor %xmm1, %xmm1
3445; SSE-NEXT:    pcmpeqb %xmm0, %xmm1
3446; SSE-NEXT:    movd %xmm1, %eax
3447; SSE-NEXT:    movsbl %al, %eax
3448; SSE-NEXT:    movd %eax, %xmm0
3449; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
3450; SSE-NEXT:    retq
3451;
3452; AVX1-LABEL: PR56520:
3453; AVX1:       # %bb.0:
3454; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
3455; AVX1-NEXT:    vpcmpeqb %xmm1, %xmm0, %xmm0
3456; AVX1-NEXT:    vmovd %xmm0, %eax
3457; AVX1-NEXT:    movsbl %al, %eax
3458; AVX1-NEXT:    vmovd %eax, %xmm0
3459; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
3460; AVX1-NEXT:    retq
3461;
3462; AVX2-SLOW-LABEL: PR56520:
3463; AVX2-SLOW:       # %bb.0:
3464; AVX2-SLOW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
3465; AVX2-SLOW-NEXT:    vpcmpeqb %xmm1, %xmm0, %xmm0
3466; AVX2-SLOW-NEXT:    vmovd %xmm0, %eax
3467; AVX2-SLOW-NEXT:    movsbl %al, %eax
3468; AVX2-SLOW-NEXT:    vmovd %eax, %xmm0
3469; AVX2-SLOW-NEXT:    vpbroadcastq %xmm0, %xmm0
3470; AVX2-SLOW-NEXT:    retq
3471;
3472; AVX2-FAST-LABEL: PR56520:
3473; AVX2-FAST:       # %bb.0:
3474; AVX2-FAST-NEXT:    vpxor %xmm1, %xmm1, %xmm1
3475; AVX2-FAST-NEXT:    vpcmpeqb %xmm1, %xmm0, %xmm0
3476; AVX2-FAST-NEXT:    vmovd %xmm0, %eax
3477; AVX2-FAST-NEXT:    movsbl %al, %eax
3478; AVX2-FAST-NEXT:    vmovd %eax, %xmm0
3479; AVX2-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3],zero,zero,zero,zero,xmm0[0,1,2,3],zero,zero,zero,zero
3480; AVX2-FAST-NEXT:    retq
3481  %2 = icmp eq <16 x i8> zeroinitializer, %0
3482  %3 = extractelement <16 x i1> %2, i64 0
3483  %4 = sext i1 %3 to i32
3484  %5 = insertelement <2 x i32> zeroinitializer, i32 %4, i64 0
3485  %6 = zext <2 x i32> %5 to <2 x i64>
3486  %7 = shufflevector <2 x i64> %6, <2 x i64> zeroinitializer, <2 x i32> zeroinitializer
3487  ret <2 x i64> %7
3488}
3489
3490define <4 x i32> @PR63700(i128 %0) {
3491; SSE2-LABEL: PR63700:
3492; SSE2:       # %bb.0:
3493; SSE2-NEXT:    movd %edi, %xmm0
3494; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
3495; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
3496; SSE2-NEXT:    retq
3497;
3498; SSSE3-LABEL: PR63700:
3499; SSSE3:       # %bb.0:
3500; SSSE3-NEXT:    movd %edi, %xmm0
3501; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,1,2,3],zero,zero,zero,zero,xmm0[0,1,2,3],zero,zero,zero,zero
3502; SSSE3-NEXT:    retq
3503;
3504; SSE41-LABEL: PR63700:
3505; SSE41:       # %bb.0:
3506; SSE41-NEXT:    movd %edi, %xmm0
3507; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
3508; SSE41-NEXT:    pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
3509; SSE41-NEXT:    retq
3510;
3511; AVX1-LABEL: PR63700:
3512; AVX1:       # %bb.0:
3513; AVX1-NEXT:    vmovd %edi, %xmm0
3514; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
3515; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
3516; AVX1-NEXT:    retq
3517;
3518; AVX2-SLOW-LABEL: PR63700:
3519; AVX2-SLOW:       # %bb.0:
3520; AVX2-SLOW-NEXT:    vmovd %edi, %xmm0
3521; AVX2-SLOW-NEXT:    vpbroadcastd %xmm0, %xmm0
3522; AVX2-SLOW-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
3523; AVX2-SLOW-NEXT:    retq
3524;
3525; AVX2-FAST-LABEL: PR63700:
3526; AVX2-FAST:       # %bb.0:
3527; AVX2-FAST-NEXT:    vmovq %rdi, %xmm0
3528; AVX2-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3],zero,zero,zero,zero,xmm0[0,1,2,3],zero,zero,zero,zero
3529; AVX2-FAST-NEXT:    retq
3530  %vcmp = bitcast i128 %0 to <4 x i32>
3531  %shuffle.i = shufflevector <4 x i32> %vcmp, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 0, i32 undef, i32 undef>
3532  %shuffle.i11 = shufflevector <4 x i32> %shuffle.i, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
3533  ret <4 x i32> %shuffle.i11
3534}
3535
3536define <16 x i8> @PR107289(<16 x i8> %0) {
3537; SSE-LABEL: PR107289:
3538; SSE:       # %bb.0:
3539; SSE-NEXT:    pslldq {{.*#+}} xmm0 = zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
3540; SSE-NEXT:    retq
3541;
3542; AVX-LABEL: PR107289:
3543; AVX:       # %bb.0:
3544; AVX-NEXT:    vpslldq {{.*#+}} xmm0 = zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
3545; AVX-NEXT:    retq
3546  %src = bitcast <16 x i8> %0 to i128
3547  %shl = shl i128 %src, 8
3548  %res = bitcast i128 %shl to <16 x i8>
3549  ret <16 x i8> %res
3550}
3551
3552; Test case reported on D105827
3553define void @SpinningCube() {
3554; SSE2-LABEL: SpinningCube:
3555; SSE2:       # %bb.0: # %entry
3556; SSE2-NEXT:    movl $1065353216, (%rax) # imm = 0x3F800000
3557; SSE2-NEXT:    movaps {{.*#+}} xmm0 = [u,u,u,1.0E+0]
3558; SSE2-NEXT:    movss {{.*#+}} xmm1 = [NaN,0.0E+0,0.0E+0,0.0E+0]
3559; SSE2-NEXT:    movapd {{.*#+}} xmm2 = [u,u,-2.0E+0,u]
3560; SSE2-NEXT:    movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
3561; SSE2-NEXT:    xorps %xmm3, %xmm3
3562; SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,0]
3563; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,3]
3564; SSE2-NEXT:    addps %xmm3, %xmm1
3565; SSE2-NEXT:    movaps %xmm1, (%rax)
3566; SSE2-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
3567; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0,0,0]
3568; SSE2-NEXT:    mulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
3569; SSE2-NEXT:    addps %xmm0, %xmm1
3570; SSE2-NEXT:    movaps %xmm1, (%rax)
3571; SSE2-NEXT:    retq
3572;
3573; SSSE3-LABEL: SpinningCube:
3574; SSSE3:       # %bb.0: # %entry
3575; SSSE3-NEXT:    movl $1065353216, (%rax) # imm = 0x3F800000
3576; SSSE3-NEXT:    movaps {{.*#+}} xmm0 = [u,u,u,1.0E+0]
3577; SSSE3-NEXT:    movss {{.*#+}} xmm1 = [NaN,0.0E+0,0.0E+0,0.0E+0]
3578; SSSE3-NEXT:    movapd {{.*#+}} xmm2 = [u,u,-2.0E+0,u]
3579; SSSE3-NEXT:    movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
3580; SSSE3-NEXT:    xorps %xmm3, %xmm3
3581; SSSE3-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,0]
3582; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,3]
3583; SSSE3-NEXT:    addps %xmm3, %xmm1
3584; SSSE3-NEXT:    movaps %xmm1, (%rax)
3585; SSSE3-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
3586; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0,0,2]
3587; SSSE3-NEXT:    mulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
3588; SSSE3-NEXT:    addps %xmm0, %xmm1
3589; SSSE3-NEXT:    movaps %xmm1, (%rax)
3590; SSSE3-NEXT:    retq
3591;
3592; SSE41-LABEL: SpinningCube:
3593; SSE41:       # %bb.0: # %entry
3594; SSE41-NEXT:    movl $1065353216, (%rax) # imm = 0x3F800000
3595; SSE41-NEXT:    movaps {{.*#+}} xmm0 = [u,u,u,1.0E+0]
3596; SSE41-NEXT:    movaps {{.*#+}} xmm1 = [0.0E+0,0.0E+0,-2.0E+0,u]
3597; SSE41-NEXT:    movss {{.*#+}} xmm2 = [NaN,0.0E+0,0.0E+0,0.0E+0]
3598; SSE41-NEXT:    movaps %xmm1, %xmm3
3599; SSE41-NEXT:    insertps {{.*#+}} xmm3 = xmm3[0,1,2],xmm2[0]
3600; SSE41-NEXT:    movaps %xmm0, %xmm4
3601; SSE41-NEXT:    insertps {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[2,3]
3602; SSE41-NEXT:    addps %xmm3, %xmm4
3603; SSE41-NEXT:    movaps %xmm4, (%rax)
3604; SSE41-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
3605; SSE41-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,0,0,2]
3606; SSE41-NEXT:    mulps %xmm1, %xmm2
3607; SSE41-NEXT:    addps %xmm0, %xmm2
3608; SSE41-NEXT:    movaps %xmm2, (%rax)
3609; SSE41-NEXT:    retq
3610;
3611; AVX-LABEL: SpinningCube:
3612; AVX:       # %bb.0: # %entry
3613; AVX-NEXT:    movl $1065353216, (%rax) # imm = 0x3F800000
3614; AVX-NEXT:    vbroadcastss {{.*#+}} xmm0 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
3615; AVX-NEXT:    vmovaps {{.*#+}} xmm1 = [0.0E+0,0.0E+0,-2.0E+0,u]
3616; AVX-NEXT:    vmovss {{.*#+}} xmm2 = [NaN,0.0E+0,0.0E+0,0.0E+0]
3617; AVX-NEXT:    vinsertps {{.*#+}} xmm3 = xmm1[0,1,2],xmm2[0]
3618; AVX-NEXT:    vinsertps {{.*#+}} xmm2 = xmm0[0],xmm2[0],xmm0[2,3]
3619; AVX-NEXT:    vaddps %xmm2, %xmm3, %xmm2
3620; AVX-NEXT:    vmovaps %xmm2, (%rax)
3621; AVX-NEXT:    vbroadcastss (%rax), %xmm2
3622; AVX-NEXT:    vmulps %xmm1, %xmm2, %xmm1
3623; AVX-NEXT:    vaddps %xmm0, %xmm1, %xmm0
3624; AVX-NEXT:    vmovaps %xmm0, (%rax)
3625; AVX-NEXT:    retq
3626entry:
3627  store float 1.000000e+00, ptr undef, align 4
3628  %0 = load float, ptr undef, align 4
3629  %1 = fmul float undef, 0.000000e+00
3630  %2 = insertelement <4 x float> poison, float %0, i32 3
3631  %3 = load float, ptr undef, align 4
3632  %4 = insertelement <2 x float> poison, float %3, i32 0
3633  %5 = shufflevector <2 x float> %4, <2 x float> poison, <2 x i32> zeroinitializer
3634  %6 = fmul <2 x float> %5, <float 0.000000e+00, float -2.000000e+00>
3635  %7 = fadd float %1, undef
3636  %8 = shufflevector <2 x float> %6, <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
3637  %9 = shufflevector <4 x float> undef, <4 x float> %8, <4 x i32> <i32 0, i32 4, i32 5, i32 undef>
3638  %10 = insertelement <4 x float> %9, float %7, i32 3
3639  %11 = insertelement <4 x float> %2, float 0x7FF8000000000000, i32 1
3640  %12 = insertelement <4 x float> %11, float undef, i32 0
3641  %13 = insertelement <4 x float> %12, float undef, i32 2
3642  %14 = fadd <4 x float> %10, %13
3643  store <4 x float> %14, ptr undef, align 16
3644  %15 = load float, ptr undef, align 4
3645  %16 = insertelement <2 x float> poison, float %15, i32 0
3646  %17 = shufflevector <2 x float> %16, <2 x float> poison, <2 x i32> zeroinitializer
3647  %18 = fmul <2 x float> %17, <float 0.000000e+00, float -2.000000e+00>
3648  %19 = shufflevector <2 x float> %18, <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
3649  %20 = shufflevector <4 x float> undef, <4 x float> %19, <4 x i32> <i32 0, i32 4, i32 5, i32 undef>
3650  %21 = fadd <4 x float> %20, %2
3651  store <4 x float> %21, ptr undef, align 16
3652  ret void
3653}
3654
3655; Inifite loop test case reported on 5ca77541446d
3656define void @autogen_SD25931() {
3657; CHECK-LABEL: autogen_SD25931:
3658; CHECK:       # %bb.0: # %BB
3659; CHECK-NEXT:    .p2align 4
3660; CHECK-NEXT:  .LBB142_1: # %CF242
3661; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
3662; CHECK-NEXT:    jmp .LBB142_1
3663BB:
3664  %Cmp16 = icmp uge <2 x i1> zeroinitializer, zeroinitializer
3665  %Shuff19 = shufflevector <2 x i1> zeroinitializer, <2 x i1> %Cmp16, <2 x i32> <i32 3, i32 1>
3666  %Shuff33 = shufflevector <2 x i1> %Shuff19, <2 x i1> zeroinitializer, <2 x i32> <i32 0, i32 2>
3667  br label %CF250
3668
3669CF250:                                            ; preds = %CF250, %BB
3670  br i1 poison, label %CF250, label %CF259
3671
3672CF259:                                            ; preds = %CF250
3673  %Cmp83 = icmp ule <2 x i1> %Shuff19, zeroinitializer
3674  br label %CF242
3675
3676CF242:                                            ; preds = %CF242, %CF259
3677  %Shuff153 = shufflevector <2 x i1> %Shuff33, <2 x i1> poison, <2 x i32> <i32 3, i32 1>
3678  %Shuff161 = shufflevector <2 x i1> zeroinitializer, <2 x i1> %Cmp83, <2 x i32> <i32 1, i32 3>
3679  br label %CF242
3680}
3681