xref: /llvm-project/llvm/test/CodeGen/X86/unfold-masked-merge-vector-variablemask.ll (revision 7b3bbd83c0c24087072ec5b22a76799ab31f87d5)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=-sse,-sse2 < %s | FileCheck %s --check-prefixes=CHECK,CHECK-BASELINE
3; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+sse,-sse2 < %s | FileCheck %s --check-prefixes=CHECK,CHECK-SSE1
4; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+sse,+sse2 < %s | FileCheck %s --check-prefixes=CHECK,CHECK-SSE2
5; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+xop < %s | FileCheck %s --check-prefixes=CHECK,CHECK-XOP
6
7; https://bugs.llvm.org/show_bug.cgi?id=37104
8
9; All the advanced stuff (negative tests, commutativity) is handled in the
10; scalar version of the test only.
11
12; ============================================================================ ;
13; 8-bit vector width
14; ============================================================================ ;
15
; Masked merge on a one-lane i8 vector: r = (x & mask) | (y & ~mask).
; Every run line shares the same expected output for this function: a
; scalar and/not/and/or sequence on byte registers.
16define <1 x i8> @out_v1i8(<1 x i8> %x, <1 x i8> %y, <1 x i8> %mask) nounwind {
17; CHECK-LABEL: out_v1i8:
18; CHECK:       # %bb.0:
19; CHECK-NEXT:    movl %edx, %eax
20; CHECK-NEXT:    andl %edx, %edi
21; CHECK-NEXT:    notb %al
22; CHECK-NEXT:    andb %sil, %al
23; CHECK-NEXT:    orb %dil, %al
24; CHECK-NEXT:    # kill: def $al killed $al killed $eax
25; CHECK-NEXT:    retq
26  %mx = and <1 x i8> %x, %mask ; bits of %x where %mask is set
27  %notmask = xor <1 x i8> %mask, <i8 -1> ; bitwise NOT of %mask (xor with all-ones)
28  %my = and <1 x i8> %y, %notmask ; bits of %y where %mask is clear
29  %r = or <1 x i8> %mx, %my ; merge the two selections
30  ret <1 x i8> %r
31}
32
33; ============================================================================ ;
34; 16-bit vector width
35; ============================================================================ ;
36
; Masked merge on <2 x i8>: r = (x & mask) | (y & ~mask).
; Expected output per run: the baseline and SSE1 runs handle both lanes in
; scalar registers with and/not/and/or; the SSE2 run uses andps/andnps/orps;
; the XOP run uses a single vpcmov.
37define <2 x i8> @out_v2i8(<2 x i8> %x, <2 x i8> %y, <2 x i8> %mask) nounwind {
38; CHECK-BASELINE-LABEL: out_v2i8:
39; CHECK-BASELINE:       # %bb.0:
40; CHECK-BASELINE-NEXT:    movl %r8d, %eax
41; CHECK-BASELINE-NEXT:    andl %r9d, %esi
42; CHECK-BASELINE-NEXT:    andl %r8d, %edi
43; CHECK-BASELINE-NEXT:    notb %al
44; CHECK-BASELINE-NEXT:    notb %r9b
45; CHECK-BASELINE-NEXT:    andb %cl, %r9b
46; CHECK-BASELINE-NEXT:    andb %dl, %al
47; CHECK-BASELINE-NEXT:    orb %dil, %al
48; CHECK-BASELINE-NEXT:    orb %sil, %r9b
49; CHECK-BASELINE-NEXT:    # kill: def $al killed $al killed $eax
50; CHECK-BASELINE-NEXT:    movl %r9d, %edx
51; CHECK-BASELINE-NEXT:    retq
52;
53; CHECK-SSE1-LABEL: out_v2i8:
54; CHECK-SSE1:       # %bb.0:
55; CHECK-SSE1-NEXT:    movl %r8d, %eax
56; CHECK-SSE1-NEXT:    andl %r9d, %esi
57; CHECK-SSE1-NEXT:    andl %r8d, %edi
58; CHECK-SSE1-NEXT:    notb %al
59; CHECK-SSE1-NEXT:    notb %r9b
60; CHECK-SSE1-NEXT:    andb %cl, %r9b
61; CHECK-SSE1-NEXT:    andb %dl, %al
62; CHECK-SSE1-NEXT:    orb %dil, %al
63; CHECK-SSE1-NEXT:    orb %sil, %r9b
64; CHECK-SSE1-NEXT:    # kill: def $al killed $al killed $eax
65; CHECK-SSE1-NEXT:    movl %r9d, %edx
66; CHECK-SSE1-NEXT:    retq
67;
68; CHECK-SSE2-LABEL: out_v2i8:
69; CHECK-SSE2:       # %bb.0:
70; CHECK-SSE2-NEXT:    andps %xmm2, %xmm0
71; CHECK-SSE2-NEXT:    andnps %xmm1, %xmm2
72; CHECK-SSE2-NEXT:    orps %xmm2, %xmm0
73; CHECK-SSE2-NEXT:    retq
74;
75; CHECK-XOP-LABEL: out_v2i8:
76; CHECK-XOP:       # %bb.0:
77; CHECK-XOP-NEXT:    vpcmov %xmm2, %xmm1, %xmm0, %xmm0
78; CHECK-XOP-NEXT:    retq
79  %mx = and <2 x i8> %x, %mask ; bits of %x where %mask is set
80  %notmask = xor <2 x i8> %mask, <i8 -1, i8 -1> ; bitwise NOT of %mask (xor with all-ones)
81  %my = and <2 x i8> %y, %notmask ; bits of %y where %mask is clear
82  %r = or <2 x i8> %mx, %my ; merge the two selections
83  ret <2 x i8> %r
84}
85
; Masked merge on a one-lane i16 vector: r = (x & mask) | (y & ~mask).
; Every run line shares the same expected output: a scalar and/not/and/or
; sequence on 32-bit registers, with the result taken from $ax.
86define <1 x i16> @out_v1i16(<1 x i16> %x, <1 x i16> %y, <1 x i16> %mask) nounwind {
87; CHECK-LABEL: out_v1i16:
88; CHECK:       # %bb.0:
89; CHECK-NEXT:    movl %edx, %eax
90; CHECK-NEXT:    andl %edx, %edi
91; CHECK-NEXT:    notl %eax
92; CHECK-NEXT:    andl %esi, %eax
93; CHECK-NEXT:    orl %edi, %eax
94; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
95; CHECK-NEXT:    retq
96  %mx = and <1 x i16> %x, %mask ; bits of %x where %mask is set
97  %notmask = xor <1 x i16> %mask, <i16 -1> ; bitwise NOT of %mask (xor with all-ones)
98  %my = and <1 x i16> %y, %notmask ; bits of %y where %mask is clear
99  %r = or <1 x i16> %mx, %my ; merge the two selections
100  ret <1 x i16> %r
101}
102
103; ============================================================================ ;
104; 32-bit vector width
105; ============================================================================ ;
106
; Masked merge on <4 x i8>: r = (x & mask) | (y & ~mask).
; Expected output per run: the baseline and SSE1 runs compute each lane as
; an xor/and/xor triple and store the four result bytes at offsets 0-3 from
; %rax (copied from %rdi); the SSE2 run uses andps/andnps/orps; the XOP run
; uses a single vpcmov.
107define <4 x i8> @out_v4i8(<4 x i8> %x, <4 x i8> %y, <4 x i8> %mask) nounwind {
108; CHECK-BASELINE-LABEL: out_v4i8:
109; CHECK-BASELINE:       # %bb.0:
110; CHECK-BASELINE-NEXT:    movq %rdi, %rax
111; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %edi
112; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %r10d
113; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %r11d
114; CHECK-BASELINE-NEXT:    xorl %r9d, %esi
115; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %sil
116; CHECK-BASELINE-NEXT:    xorb %r9b, %sil
117; CHECK-BASELINE-NEXT:    xorb %r11b, %dl
118; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %dl
119; CHECK-BASELINE-NEXT:    xorb %r11b, %dl
120; CHECK-BASELINE-NEXT:    xorb %r10b, %cl
121; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %cl
122; CHECK-BASELINE-NEXT:    xorb %r10b, %cl
123; CHECK-BASELINE-NEXT:    xorb %dil, %r8b
124; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %r8b
125; CHECK-BASELINE-NEXT:    xorb %dil, %r8b
126; CHECK-BASELINE-NEXT:    movb %r8b, 3(%rax)
127; CHECK-BASELINE-NEXT:    movb %cl, 2(%rax)
128; CHECK-BASELINE-NEXT:    movb %dl, 1(%rax)
129; CHECK-BASELINE-NEXT:    movb %sil, (%rax)
130; CHECK-BASELINE-NEXT:    retq
131;
132; CHECK-SSE1-LABEL: out_v4i8:
133; CHECK-SSE1:       # %bb.0:
134; CHECK-SSE1-NEXT:    movq %rdi, %rax
135; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %edi
136; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %r10d
137; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %r11d
138; CHECK-SSE1-NEXT:    xorl %r9d, %esi
139; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %sil
140; CHECK-SSE1-NEXT:    xorb %r9b, %sil
141; CHECK-SSE1-NEXT:    xorb %r11b, %dl
142; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %dl
143; CHECK-SSE1-NEXT:    xorb %r11b, %dl
144; CHECK-SSE1-NEXT:    xorb %r10b, %cl
145; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %cl
146; CHECK-SSE1-NEXT:    xorb %r10b, %cl
147; CHECK-SSE1-NEXT:    xorb %dil, %r8b
148; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %r8b
149; CHECK-SSE1-NEXT:    xorb %dil, %r8b
150; CHECK-SSE1-NEXT:    movb %r8b, 3(%rax)
151; CHECK-SSE1-NEXT:    movb %cl, 2(%rax)
152; CHECK-SSE1-NEXT:    movb %dl, 1(%rax)
153; CHECK-SSE1-NEXT:    movb %sil, (%rax)
154; CHECK-SSE1-NEXT:    retq
155;
156; CHECK-SSE2-LABEL: out_v4i8:
157; CHECK-SSE2:       # %bb.0:
158; CHECK-SSE2-NEXT:    andps %xmm2, %xmm0
159; CHECK-SSE2-NEXT:    andnps %xmm1, %xmm2
160; CHECK-SSE2-NEXT:    orps %xmm2, %xmm0
161; CHECK-SSE2-NEXT:    retq
162;
163; CHECK-XOP-LABEL: out_v4i8:
164; CHECK-XOP:       # %bb.0:
165; CHECK-XOP-NEXT:    vpcmov %xmm2, %xmm1, %xmm0, %xmm0
166; CHECK-XOP-NEXT:    retq
167  %mx = and <4 x i8> %x, %mask ; bits of %x where %mask is set
168  %notmask = xor <4 x i8> %mask, <i8 -1, i8 -1, i8 -1, i8 -1> ; bitwise NOT of %mask (xor with all-ones)
169  %my = and <4 x i8> %y, %notmask ; bits of %y where %mask is clear
170  %r = or <4 x i8> %mx, %my ; merge the two selections
171  ret <4 x i8> %r
172}
173
; Same masked-merge pattern as the all-ones <4 x i8> form, except that lane 2
; of the inverting constant is undef instead of -1.
; Expected output per run: in the baseline and SSE1 runs the lane stored at
; offset 2 (%cl) gets only a single andb, while the other three lanes keep
; the xor/and/xor triple; the SSE2 run still uses andps/andnps/orps and the
; XOP run still uses a single vpcmov.
174define <4 x i8> @out_v4i8_undef(<4 x i8> %x, <4 x i8> %y, <4 x i8> %mask) nounwind {
175; CHECK-BASELINE-LABEL: out_v4i8_undef:
176; CHECK-BASELINE:       # %bb.0:
177; CHECK-BASELINE-NEXT:    movq %rdi, %rax
178; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %edi
179; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %r10d
180; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %cl
181; CHECK-BASELINE-NEXT:    xorl %r9d, %esi
182; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %sil
183; CHECK-BASELINE-NEXT:    xorb %r9b, %sil
184; CHECK-BASELINE-NEXT:    xorb %r10b, %dl
185; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %dl
186; CHECK-BASELINE-NEXT:    xorb %r10b, %dl
187; CHECK-BASELINE-NEXT:    xorb %dil, %r8b
188; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %r8b
189; CHECK-BASELINE-NEXT:    xorb %dil, %r8b
190; CHECK-BASELINE-NEXT:    movb %cl, 2(%rax)
191; CHECK-BASELINE-NEXT:    movb %r8b, 3(%rax)
192; CHECK-BASELINE-NEXT:    movb %dl, 1(%rax)
193; CHECK-BASELINE-NEXT:    movb %sil, (%rax)
194; CHECK-BASELINE-NEXT:    retq
195;
196; CHECK-SSE1-LABEL: out_v4i8_undef:
197; CHECK-SSE1:       # %bb.0:
198; CHECK-SSE1-NEXT:    movq %rdi, %rax
199; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %edi
200; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %r10d
201; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %cl
202; CHECK-SSE1-NEXT:    xorl %r9d, %esi
203; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %sil
204; CHECK-SSE1-NEXT:    xorb %r9b, %sil
205; CHECK-SSE1-NEXT:    xorb %r10b, %dl
206; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %dl
207; CHECK-SSE1-NEXT:    xorb %r10b, %dl
208; CHECK-SSE1-NEXT:    xorb %dil, %r8b
209; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %r8b
210; CHECK-SSE1-NEXT:    xorb %dil, %r8b
211; CHECK-SSE1-NEXT:    movb %cl, 2(%rax)
212; CHECK-SSE1-NEXT:    movb %r8b, 3(%rax)
213; CHECK-SSE1-NEXT:    movb %dl, 1(%rax)
214; CHECK-SSE1-NEXT:    movb %sil, (%rax)
215; CHECK-SSE1-NEXT:    retq
216;
217; CHECK-SSE2-LABEL: out_v4i8_undef:
218; CHECK-SSE2:       # %bb.0:
219; CHECK-SSE2-NEXT:    andps %xmm2, %xmm0
220; CHECK-SSE2-NEXT:    andnps %xmm1, %xmm2
221; CHECK-SSE2-NEXT:    orps %xmm2, %xmm0
222; CHECK-SSE2-NEXT:    retq
223;
224; CHECK-XOP-LABEL: out_v4i8_undef:
225; CHECK-XOP:       # %bb.0:
226; CHECK-XOP-NEXT:    vpcmov %xmm2, %xmm1, %xmm0, %xmm0
227; CHECK-XOP-NEXT:    retq
228  %mx = and <4 x i8> %x, %mask ; bits of %x where %mask is set
229  %notmask = xor <4 x i8> %mask, <i8 -1, i8 -1, i8 undef, i8 -1> ; NOT of %mask in lanes 0, 1, 3; lane 2 is xor'ed with undef
230  %my = and <4 x i8> %y, %notmask ; bits of %y selected by %notmask
231  %r = or <4 x i8> %mx, %my ; merge the two selections
232  ret <4 x i8> %r
233}
234
; Masked merge on <2 x i16>: r = (x & mask) | (y & ~mask).
; Expected output per run: the baseline and SSE1 runs handle both lanes in
; 32-bit scalar registers with and/not/and/or; the SSE2 run uses
; andps/andnps/orps; the XOP run uses a single vpcmov.
235define <2 x i16> @out_v2i16(<2 x i16> %x, <2 x i16> %y, <2 x i16> %mask) nounwind {
236; CHECK-BASELINE-LABEL: out_v2i16:
237; CHECK-BASELINE:       # %bb.0:
238; CHECK-BASELINE-NEXT:    movl %r8d, %eax
239; CHECK-BASELINE-NEXT:    andl %r9d, %esi
240; CHECK-BASELINE-NEXT:    andl %r8d, %edi
241; CHECK-BASELINE-NEXT:    notl %eax
242; CHECK-BASELINE-NEXT:    notl %r9d
243; CHECK-BASELINE-NEXT:    andl %ecx, %r9d
244; CHECK-BASELINE-NEXT:    orl %esi, %r9d
245; CHECK-BASELINE-NEXT:    andl %edx, %eax
246; CHECK-BASELINE-NEXT:    orl %edi, %eax
247; CHECK-BASELINE-NEXT:    # kill: def $ax killed $ax killed $eax
248; CHECK-BASELINE-NEXT:    movl %r9d, %edx
249; CHECK-BASELINE-NEXT:    retq
250;
251; CHECK-SSE1-LABEL: out_v2i16:
252; CHECK-SSE1:       # %bb.0:
253; CHECK-SSE1-NEXT:    movl %r8d, %eax
254; CHECK-SSE1-NEXT:    andl %r9d, %esi
255; CHECK-SSE1-NEXT:    andl %r8d, %edi
256; CHECK-SSE1-NEXT:    notl %eax
257; CHECK-SSE1-NEXT:    notl %r9d
258; CHECK-SSE1-NEXT:    andl %ecx, %r9d
259; CHECK-SSE1-NEXT:    orl %esi, %r9d
260; CHECK-SSE1-NEXT:    andl %edx, %eax
261; CHECK-SSE1-NEXT:    orl %edi, %eax
262; CHECK-SSE1-NEXT:    # kill: def $ax killed $ax killed $eax
263; CHECK-SSE1-NEXT:    movl %r9d, %edx
264; CHECK-SSE1-NEXT:    retq
265;
266; CHECK-SSE2-LABEL: out_v2i16:
267; CHECK-SSE2:       # %bb.0:
268; CHECK-SSE2-NEXT:    andps %xmm2, %xmm0
269; CHECK-SSE2-NEXT:    andnps %xmm1, %xmm2
270; CHECK-SSE2-NEXT:    orps %xmm2, %xmm0
271; CHECK-SSE2-NEXT:    retq
272;
273; CHECK-XOP-LABEL: out_v2i16:
274; CHECK-XOP:       # %bb.0:
275; CHECK-XOP-NEXT:    vpcmov %xmm2, %xmm1, %xmm0, %xmm0
276; CHECK-XOP-NEXT:    retq
277  %mx = and <2 x i16> %x, %mask ; bits of %x where %mask is set
278  %notmask = xor <2 x i16> %mask, <i16 -1, i16 -1> ; bitwise NOT of %mask (xor with all-ones)
279  %my = and <2 x i16> %y, %notmask ; bits of %y where %mask is clear
280  %r = or <2 x i16> %mx, %my ; merge the two selections
281  ret <2 x i16> %r
282}
283
; Masked merge on a one-lane i32 vector: r = (x & mask) | (y & ~mask).
; Every run line shares the same expected output, which uses the
; xor/and/xor form ((x ^ y) & mask) ^ y on 32-bit registers.
284define <1 x i32> @out_v1i32(<1 x i32> %x, <1 x i32> %y, <1 x i32> %mask) nounwind {
285; CHECK-LABEL: out_v1i32:
286; CHECK:       # %bb.0:
287; CHECK-NEXT:    movl %edi, %eax
288; CHECK-NEXT:    xorl %esi, %eax
289; CHECK-NEXT:    andl %edx, %eax
290; CHECK-NEXT:    xorl %esi, %eax
291; CHECK-NEXT:    retq
292  %mx = and <1 x i32> %x, %mask ; bits of %x where %mask is set
293  %notmask = xor <1 x i32> %mask, <i32 -1> ; bitwise NOT of %mask (xor with all-ones)
294  %my = and <1 x i32> %y, %notmask ; bits of %y where %mask is clear
295  %r = or <1 x i32> %mx, %my ; merge the two selections
296  ret <1 x i32> %r
297}
298
299; ============================================================================ ;
300; 64-bit vector width
301; ============================================================================ ;
302
; Masked merge on <8 x i8>: r = (x & mask) | (y & ~mask).
; Expected output per run: the baseline and SSE1 runs push/pop rbp, r15,
; r14, r12 and rbx, compute each lane as an xor/and/xor triple, and store
; the eight result bytes at offsets 0-7 from %rax (copied from %rdi); the
; SSE2 run uses andps/andnps/orps; the XOP run uses a single vpcmov.
303define <8 x i8> @out_v8i8(<8 x i8> %x, <8 x i8> %y, <8 x i8> %mask) nounwind {
304; CHECK-BASELINE-LABEL: out_v8i8:
305; CHECK-BASELINE:       # %bb.0:
306; CHECK-BASELINE-NEXT:    pushq %rbp
307; CHECK-BASELINE-NEXT:    pushq %r15
308; CHECK-BASELINE-NEXT:    pushq %r14
309; CHECK-BASELINE-NEXT:    pushq %r12
310; CHECK-BASELINE-NEXT:    pushq %rbx
311; CHECK-BASELINE-NEXT:    movq %rdi, %rax
312; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %edi
313; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %r10d
314; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %r11d
315; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %ebx
316; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %ebp
317; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %r14d
318; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %r15d
319; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %r12d
320; CHECK-BASELINE-NEXT:    xorb %r12b, %sil
321; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %sil
322; CHECK-BASELINE-NEXT:    xorb %r12b, %sil
323; CHECK-BASELINE-NEXT:    xorb %r15b, %dl
324; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %dl
325; CHECK-BASELINE-NEXT:    xorb %r15b, %dl
326; CHECK-BASELINE-NEXT:    xorb %r14b, %cl
327; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %cl
328; CHECK-BASELINE-NEXT:    xorb %r14b, %cl
329; CHECK-BASELINE-NEXT:    xorb %bpl, %r8b
330; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %r8b
331; CHECK-BASELINE-NEXT:    xorb %bpl, %r8b
332; CHECK-BASELINE-NEXT:    xorb %bl, %r9b
333; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %r9b
334; CHECK-BASELINE-NEXT:    xorb %bl, %r9b
335; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %ebx
336; CHECK-BASELINE-NEXT:    xorb %r11b, %bl
337; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %bl
338; CHECK-BASELINE-NEXT:    xorb %r11b, %bl
339; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %r11d
340; CHECK-BASELINE-NEXT:    xorb %r10b, %r11b
341; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %r11b
342; CHECK-BASELINE-NEXT:    xorb %r10b, %r11b
343; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %r10d
344; CHECK-BASELINE-NEXT:    xorb %dil, %r10b
345; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %r10b
346; CHECK-BASELINE-NEXT:    xorb %dil, %r10b
347; CHECK-BASELINE-NEXT:    movb %r10b, 7(%rax)
348; CHECK-BASELINE-NEXT:    movb %r11b, 6(%rax)
349; CHECK-BASELINE-NEXT:    movb %bl, 5(%rax)
350; CHECK-BASELINE-NEXT:    movb %r9b, 4(%rax)
351; CHECK-BASELINE-NEXT:    movb %r8b, 3(%rax)
352; CHECK-BASELINE-NEXT:    movb %cl, 2(%rax)
353; CHECK-BASELINE-NEXT:    movb %dl, 1(%rax)
354; CHECK-BASELINE-NEXT:    movb %sil, (%rax)
355; CHECK-BASELINE-NEXT:    popq %rbx
356; CHECK-BASELINE-NEXT:    popq %r12
357; CHECK-BASELINE-NEXT:    popq %r14
358; CHECK-BASELINE-NEXT:    popq %r15
359; CHECK-BASELINE-NEXT:    popq %rbp
360; CHECK-BASELINE-NEXT:    retq
361;
362; CHECK-SSE1-LABEL: out_v8i8:
363; CHECK-SSE1:       # %bb.0:
364; CHECK-SSE1-NEXT:    pushq %rbp
365; CHECK-SSE1-NEXT:    pushq %r15
366; CHECK-SSE1-NEXT:    pushq %r14
367; CHECK-SSE1-NEXT:    pushq %r12
368; CHECK-SSE1-NEXT:    pushq %rbx
369; CHECK-SSE1-NEXT:    movq %rdi, %rax
370; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %edi
371; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %r10d
372; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %r11d
373; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %ebx
374; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %ebp
375; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %r14d
376; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %r15d
377; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %r12d
378; CHECK-SSE1-NEXT:    xorb %r12b, %sil
379; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %sil
380; CHECK-SSE1-NEXT:    xorb %r12b, %sil
381; CHECK-SSE1-NEXT:    xorb %r15b, %dl
382; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %dl
383; CHECK-SSE1-NEXT:    xorb %r15b, %dl
384; CHECK-SSE1-NEXT:    xorb %r14b, %cl
385; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %cl
386; CHECK-SSE1-NEXT:    xorb %r14b, %cl
387; CHECK-SSE1-NEXT:    xorb %bpl, %r8b
388; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %r8b
389; CHECK-SSE1-NEXT:    xorb %bpl, %r8b
390; CHECK-SSE1-NEXT:    xorb %bl, %r9b
391; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %r9b
392; CHECK-SSE1-NEXT:    xorb %bl, %r9b
393; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %ebx
394; CHECK-SSE1-NEXT:    xorb %r11b, %bl
395; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %bl
396; CHECK-SSE1-NEXT:    xorb %r11b, %bl
397; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %r11d
398; CHECK-SSE1-NEXT:    xorb %r10b, %r11b
399; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %r11b
400; CHECK-SSE1-NEXT:    xorb %r10b, %r11b
401; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %r10d
402; CHECK-SSE1-NEXT:    xorb %dil, %r10b
403; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %r10b
404; CHECK-SSE1-NEXT:    xorb %dil, %r10b
405; CHECK-SSE1-NEXT:    movb %r10b, 7(%rax)
406; CHECK-SSE1-NEXT:    movb %r11b, 6(%rax)
407; CHECK-SSE1-NEXT:    movb %bl, 5(%rax)
408; CHECK-SSE1-NEXT:    movb %r9b, 4(%rax)
409; CHECK-SSE1-NEXT:    movb %r8b, 3(%rax)
410; CHECK-SSE1-NEXT:    movb %cl, 2(%rax)
411; CHECK-SSE1-NEXT:    movb %dl, 1(%rax)
412; CHECK-SSE1-NEXT:    movb %sil, (%rax)
413; CHECK-SSE1-NEXT:    popq %rbx
414; CHECK-SSE1-NEXT:    popq %r12
415; CHECK-SSE1-NEXT:    popq %r14
416; CHECK-SSE1-NEXT:    popq %r15
417; CHECK-SSE1-NEXT:    popq %rbp
418; CHECK-SSE1-NEXT:    retq
419;
420; CHECK-SSE2-LABEL: out_v8i8:
421; CHECK-SSE2:       # %bb.0:
422; CHECK-SSE2-NEXT:    andps %xmm2, %xmm0
423; CHECK-SSE2-NEXT:    andnps %xmm1, %xmm2
424; CHECK-SSE2-NEXT:    orps %xmm2, %xmm0
425; CHECK-SSE2-NEXT:    retq
426;
427; CHECK-XOP-LABEL: out_v8i8:
428; CHECK-XOP:       # %bb.0:
429; CHECK-XOP-NEXT:    vpcmov %xmm2, %xmm1, %xmm0, %xmm0
430; CHECK-XOP-NEXT:    retq
431  %mx = and <8 x i8> %x, %mask ; bits of %x where %mask is set
432  %notmask = xor <8 x i8> %mask, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1> ; bitwise NOT of %mask (xor with all-ones)
433  %my = and <8 x i8> %y, %notmask ; bits of %y where %mask is clear
434  %r = or <8 x i8> %mx, %my ; merge the two selections
435  ret <8 x i8> %r
436}
437
; Masked merge on <4 x i16>: r = (x & mask) | (y & ~mask).
; Expected output per run: the baseline and SSE1 runs compute each lane as
; an xor/and/xor triple and store the four result words at offsets 0, 2, 4
; and 6 from %rax (copied from %rdi); the SSE2 run uses andps/andnps/orps;
; the XOP run uses a single vpcmov.
438define <4 x i16> @out_v4i16(<4 x i16> %x, <4 x i16> %y, <4 x i16> %mask) nounwind {
439; CHECK-BASELINE-LABEL: out_v4i16:
440; CHECK-BASELINE:       # %bb.0:
441; CHECK-BASELINE-NEXT:    movq %rdi, %rax
442; CHECK-BASELINE-NEXT:    movzwl {{[0-9]+}}(%rsp), %edi
443; CHECK-BASELINE-NEXT:    movzwl {{[0-9]+}}(%rsp), %r10d
444; CHECK-BASELINE-NEXT:    movzwl {{[0-9]+}}(%rsp), %r11d
445; CHECK-BASELINE-NEXT:    xorl %r11d, %edx
446; CHECK-BASELINE-NEXT:    andw {{[0-9]+}}(%rsp), %dx
447; CHECK-BASELINE-NEXT:    xorl %r11d, %edx
448; CHECK-BASELINE-NEXT:    xorl %r10d, %ecx
449; CHECK-BASELINE-NEXT:    andw {{[0-9]+}}(%rsp), %cx
450; CHECK-BASELINE-NEXT:    xorl %r10d, %ecx
451; CHECK-BASELINE-NEXT:    xorl %edi, %r8d
452; CHECK-BASELINE-NEXT:    andw {{[0-9]+}}(%rsp), %r8w
453; CHECK-BASELINE-NEXT:    xorl %edi, %r8d
454; CHECK-BASELINE-NEXT:    xorl %r9d, %esi
455; CHECK-BASELINE-NEXT:    andw {{[0-9]+}}(%rsp), %si
456; CHECK-BASELINE-NEXT:    xorl %r9d, %esi
457; CHECK-BASELINE-NEXT:    movw %si, (%rax)
458; CHECK-BASELINE-NEXT:    movw %r8w, 6(%rax)
459; CHECK-BASELINE-NEXT:    movw %cx, 4(%rax)
460; CHECK-BASELINE-NEXT:    movw %dx, 2(%rax)
461; CHECK-BASELINE-NEXT:    retq
462;
463; CHECK-SSE1-LABEL: out_v4i16:
464; CHECK-SSE1:       # %bb.0:
465; CHECK-SSE1-NEXT:    movq %rdi, %rax
466; CHECK-SSE1-NEXT:    movzwl {{[0-9]+}}(%rsp), %edi
467; CHECK-SSE1-NEXT:    movzwl {{[0-9]+}}(%rsp), %r10d
468; CHECK-SSE1-NEXT:    movzwl {{[0-9]+}}(%rsp), %r11d
469; CHECK-SSE1-NEXT:    xorl %r11d, %edx
470; CHECK-SSE1-NEXT:    andw {{[0-9]+}}(%rsp), %dx
471; CHECK-SSE1-NEXT:    xorl %r11d, %edx
472; CHECK-SSE1-NEXT:    xorl %r10d, %ecx
473; CHECK-SSE1-NEXT:    andw {{[0-9]+}}(%rsp), %cx
474; CHECK-SSE1-NEXT:    xorl %r10d, %ecx
475; CHECK-SSE1-NEXT:    xorl %edi, %r8d
476; CHECK-SSE1-NEXT:    andw {{[0-9]+}}(%rsp), %r8w
477; CHECK-SSE1-NEXT:    xorl %edi, %r8d
478; CHECK-SSE1-NEXT:    xorl %r9d, %esi
479; CHECK-SSE1-NEXT:    andw {{[0-9]+}}(%rsp), %si
480; CHECK-SSE1-NEXT:    xorl %r9d, %esi
481; CHECK-SSE1-NEXT:    movw %si, (%rax)
482; CHECK-SSE1-NEXT:    movw %r8w, 6(%rax)
483; CHECK-SSE1-NEXT:    movw %cx, 4(%rax)
484; CHECK-SSE1-NEXT:    movw %dx, 2(%rax)
485; CHECK-SSE1-NEXT:    retq
486;
487; CHECK-SSE2-LABEL: out_v4i16:
488; CHECK-SSE2:       # %bb.0:
489; CHECK-SSE2-NEXT:    andps %xmm2, %xmm0
490; CHECK-SSE2-NEXT:    andnps %xmm1, %xmm2
491; CHECK-SSE2-NEXT:    orps %xmm2, %xmm0
492; CHECK-SSE2-NEXT:    retq
493;
494; CHECK-XOP-LABEL: out_v4i16:
495; CHECK-XOP:       # %bb.0:
496; CHECK-XOP-NEXT:    vpcmov %xmm2, %xmm1, %xmm0, %xmm0
497; CHECK-XOP-NEXT:    retq
498  %mx = and <4 x i16> %x, %mask ; bits of %x where %mask is set
499  %notmask = xor <4 x i16> %mask, <i16 -1, i16 -1, i16 -1, i16 -1> ; bitwise NOT of %mask (xor with all-ones)
500  %my = and <4 x i16> %y, %notmask ; bits of %y where %mask is clear
501  %r = or <4 x i16> %mx, %my ; merge the two selections
502  ret <4 x i16> %r
503}
504
; Same masked-merge pattern as the all-ones <4 x i16> form, except that lane
; 2 of the inverting constant is undef instead of -1.
; Expected output per run: in the baseline and SSE1 runs the lane stored at
; offset 4 (%cx) gets only a single andw, while the other three lanes keep
; the xor/and/xor triple; the SSE2 run still uses andps/andnps/orps and the
; XOP run still uses a single vpcmov.
505define <4 x i16> @out_v4i16_undef(<4 x i16> %x, <4 x i16> %y, <4 x i16> %mask) nounwind {
506; CHECK-BASELINE-LABEL: out_v4i16_undef:
507; CHECK-BASELINE:       # %bb.0:
508; CHECK-BASELINE-NEXT:    movq %rdi, %rax
509; CHECK-BASELINE-NEXT:    movzwl {{[0-9]+}}(%rsp), %edi
510; CHECK-BASELINE-NEXT:    movzwl {{[0-9]+}}(%rsp), %r10d
511; CHECK-BASELINE-NEXT:    andw {{[0-9]+}}(%rsp), %cx
512; CHECK-BASELINE-NEXT:    xorl %r10d, %edx
513; CHECK-BASELINE-NEXT:    andw {{[0-9]+}}(%rsp), %dx
514; CHECK-BASELINE-NEXT:    xorl %r10d, %edx
515; CHECK-BASELINE-NEXT:    xorl %edi, %r8d
516; CHECK-BASELINE-NEXT:    andw {{[0-9]+}}(%rsp), %r8w
517; CHECK-BASELINE-NEXT:    xorl %edi, %r8d
518; CHECK-BASELINE-NEXT:    xorl %r9d, %esi
519; CHECK-BASELINE-NEXT:    andw {{[0-9]+}}(%rsp), %si
520; CHECK-BASELINE-NEXT:    xorl %r9d, %esi
521; CHECK-BASELINE-NEXT:    movw %cx, 4(%rax)
522; CHECK-BASELINE-NEXT:    movw %si, (%rax)
523; CHECK-BASELINE-NEXT:    movw %r8w, 6(%rax)
524; CHECK-BASELINE-NEXT:    movw %dx, 2(%rax)
525; CHECK-BASELINE-NEXT:    retq
526;
527; CHECK-SSE1-LABEL: out_v4i16_undef:
528; CHECK-SSE1:       # %bb.0:
529; CHECK-SSE1-NEXT:    movq %rdi, %rax
530; CHECK-SSE1-NEXT:    movzwl {{[0-9]+}}(%rsp), %edi
531; CHECK-SSE1-NEXT:    movzwl {{[0-9]+}}(%rsp), %r10d
532; CHECK-SSE1-NEXT:    andw {{[0-9]+}}(%rsp), %cx
533; CHECK-SSE1-NEXT:    xorl %r10d, %edx
534; CHECK-SSE1-NEXT:    andw {{[0-9]+}}(%rsp), %dx
535; CHECK-SSE1-NEXT:    xorl %r10d, %edx
536; CHECK-SSE1-NEXT:    xorl %edi, %r8d
537; CHECK-SSE1-NEXT:    andw {{[0-9]+}}(%rsp), %r8w
538; CHECK-SSE1-NEXT:    xorl %edi, %r8d
539; CHECK-SSE1-NEXT:    xorl %r9d, %esi
540; CHECK-SSE1-NEXT:    andw {{[0-9]+}}(%rsp), %si
541; CHECK-SSE1-NEXT:    xorl %r9d, %esi
542; CHECK-SSE1-NEXT:    movw %cx, 4(%rax)
543; CHECK-SSE1-NEXT:    movw %si, (%rax)
544; CHECK-SSE1-NEXT:    movw %r8w, 6(%rax)
545; CHECK-SSE1-NEXT:    movw %dx, 2(%rax)
546; CHECK-SSE1-NEXT:    retq
547;
548; CHECK-SSE2-LABEL: out_v4i16_undef:
549; CHECK-SSE2:       # %bb.0:
550; CHECK-SSE2-NEXT:    andps %xmm2, %xmm0
551; CHECK-SSE2-NEXT:    andnps %xmm1, %xmm2
552; CHECK-SSE2-NEXT:    orps %xmm2, %xmm0
553; CHECK-SSE2-NEXT:    retq
554;
555; CHECK-XOP-LABEL: out_v4i16_undef:
556; CHECK-XOP:       # %bb.0:
557; CHECK-XOP-NEXT:    vpcmov %xmm2, %xmm1, %xmm0, %xmm0
558; CHECK-XOP-NEXT:    retq
559  %mx = and <4 x i16> %x, %mask ; bits of %x where %mask is set
560  %notmask = xor <4 x i16> %mask, <i16 -1, i16 -1, i16 undef, i16 -1> ; NOT of %mask in lanes 0, 1, 3; lane 2 is xor'ed with undef
561  %my = and <4 x i16> %y, %notmask ; bits of %y selected by %notmask
562  %r = or <4 x i16> %mx, %my ; merge the two selections
563  ret <4 x i16> %r
564}
565
; Masked merge on <2 x i32>: r = (x & mask) | (y & ~mask).
; Expected output per run: the baseline and SSE1 runs compute each lane as
; an xor/and/xor triple on 32-bit registers, leaving the results in %eax
; and %edx; the SSE2 run uses andps/andnps/orps; the XOP run uses a single
; vpcmov.
566define <2 x i32> @out_v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> %mask) nounwind {
567; CHECK-BASELINE-LABEL: out_v2i32:
568; CHECK-BASELINE:       # %bb.0:
569; CHECK-BASELINE-NEXT:    movl %edi, %eax
570; CHECK-BASELINE-NEXT:    xorl %edx, %eax
571; CHECK-BASELINE-NEXT:    andl %r8d, %eax
572; CHECK-BASELINE-NEXT:    xorl %edx, %eax
573; CHECK-BASELINE-NEXT:    xorl %ecx, %esi
574; CHECK-BASELINE-NEXT:    andl %r9d, %esi
575; CHECK-BASELINE-NEXT:    xorl %ecx, %esi
576; CHECK-BASELINE-NEXT:    movl %esi, %edx
577; CHECK-BASELINE-NEXT:    retq
578;
579; CHECK-SSE1-LABEL: out_v2i32:
580; CHECK-SSE1:       # %bb.0:
581; CHECK-SSE1-NEXT:    movl %edi, %eax
582; CHECK-SSE1-NEXT:    xorl %edx, %eax
583; CHECK-SSE1-NEXT:    andl %r8d, %eax
584; CHECK-SSE1-NEXT:    xorl %edx, %eax
585; CHECK-SSE1-NEXT:    xorl %ecx, %esi
586; CHECK-SSE1-NEXT:    andl %r9d, %esi
587; CHECK-SSE1-NEXT:    xorl %ecx, %esi
588; CHECK-SSE1-NEXT:    movl %esi, %edx
589; CHECK-SSE1-NEXT:    retq
590;
591; CHECK-SSE2-LABEL: out_v2i32:
592; CHECK-SSE2:       # %bb.0:
593; CHECK-SSE2-NEXT:    andps %xmm2, %xmm0
594; CHECK-SSE2-NEXT:    andnps %xmm1, %xmm2
595; CHECK-SSE2-NEXT:    orps %xmm2, %xmm0
596; CHECK-SSE2-NEXT:    retq
597;
598; CHECK-XOP-LABEL: out_v2i32:
599; CHECK-XOP:       # %bb.0:
600; CHECK-XOP-NEXT:    vpcmov %xmm2, %xmm1, %xmm0, %xmm0
601; CHECK-XOP-NEXT:    retq
602  %mx = and <2 x i32> %x, %mask ; bits of %x where %mask is set
603  %notmask = xor <2 x i32> %mask, <i32 -1, i32 -1> ; bitwise NOT of %mask (xor with all-ones)
604  %my = and <2 x i32> %y, %notmask ; bits of %y where %mask is clear
605  %r = or <2 x i32> %mx, %my ; merge the two selections
606  ret <2 x i32> %r
607}
608
; Masked merge on a one-lane i64 vector: r = (x & mask) | (y & ~mask).
; Every run line shares the same expected output, which uses the
; xor/and/xor form ((x ^ y) & mask) ^ y on 64-bit registers.
609define <1 x i64> @out_v1i64(<1 x i64> %x, <1 x i64> %y, <1 x i64> %mask) nounwind {
610; CHECK-LABEL: out_v1i64:
611; CHECK:       # %bb.0:
612; CHECK-NEXT:    movq %rdi, %rax
613; CHECK-NEXT:    xorq %rsi, %rax
614; CHECK-NEXT:    andq %rdx, %rax
615; CHECK-NEXT:    xorq %rsi, %rax
616; CHECK-NEXT:    retq
617  %mx = and <1 x i64> %x, %mask ; bits of %x where %mask is set
618  %notmask = xor <1 x i64> %mask, <i64 -1> ; bitwise NOT of %mask (xor with all-ones)
619  %my = and <1 x i64> %y, %notmask ; bits of %y where %mask is clear
620  %r = or <1 x i64> %mx, %my ; merge the two selections
621  ret <1 x i64> %r
622}
623
624; ============================================================================ ;
625; 128-bit vector width
626; ============================================================================ ;
627
628define <16 x i8> @out_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %mask) nounwind {
629; CHECK-BASELINE-LABEL: out_v16i8:
630; CHECK-BASELINE:       # %bb.0:
631; CHECK-BASELINE-NEXT:    pushq %rbp
632; CHECK-BASELINE-NEXT:    pushq %r15
633; CHECK-BASELINE-NEXT:    pushq %r14
634; CHECK-BASELINE-NEXT:    pushq %r13
635; CHECK-BASELINE-NEXT:    pushq %r12
636; CHECK-BASELINE-NEXT:    pushq %rbx
637; CHECK-BASELINE-NEXT:    movl %edx, %r11d
638; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %r13d
639; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %r15d
640; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %ebp
641; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %r12d
642; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %r14d
643; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %ebx
644; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
645; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %edx
646; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %r10d
647; CHECK-BASELINE-NEXT:    xorb %r10b, %sil
648; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %sil
649; CHECK-BASELINE-NEXT:    xorb %r10b, %sil
650; CHECK-BASELINE-NEXT:    movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
651; CHECK-BASELINE-NEXT:    xorb %dl, %r11b
652; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %r11b
653; CHECK-BASELINE-NEXT:    xorb %dl, %r11b
654; CHECK-BASELINE-NEXT:    xorb %al, %cl
655; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %cl
656; CHECK-BASELINE-NEXT:    xorb %al, %cl
657; CHECK-BASELINE-NEXT:    movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
658; CHECK-BASELINE-NEXT:    xorb %bl, %r8b
659; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %r8b
660; CHECK-BASELINE-NEXT:    xorb %bl, %r8b
661; CHECK-BASELINE-NEXT:    movl %r8d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
662; CHECK-BASELINE-NEXT:    xorb %r14b, %r9b
663; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %r9b
664; CHECK-BASELINE-NEXT:    xorb %r14b, %r9b
665; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %r14d
666; CHECK-BASELINE-NEXT:    xorb %r12b, %r14b
667; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %r14b
668; CHECK-BASELINE-NEXT:    xorb %r12b, %r14b
669; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %r12d
670; CHECK-BASELINE-NEXT:    xorb %bpl, %r12b
671; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %r12b
672; CHECK-BASELINE-NEXT:    xorb %bpl, %r12b
673; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %esi
674; CHECK-BASELINE-NEXT:    xorb %r15b, %sil
675; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %sil
676; CHECK-BASELINE-NEXT:    xorb %r15b, %sil
677; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %edx
678; CHECK-BASELINE-NEXT:    xorb %r13b, %dl
679; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %dl
680; CHECK-BASELINE-NEXT:    xorb %r13b, %dl
681; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %ecx
682; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
683; CHECK-BASELINE-NEXT:    xorb %al, %cl
684; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %cl
685; CHECK-BASELINE-NEXT:    xorb %al, %cl
686; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %r13d
687; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
688; CHECK-BASELINE-NEXT:    xorb %al, %r13b
689; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %r13b
690; CHECK-BASELINE-NEXT:    xorb %al, %r13b
691; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %r15d
692; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
693; CHECK-BASELINE-NEXT:    xorb %al, %r15b
694; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %r15b
695; CHECK-BASELINE-NEXT:    xorb %al, %r15b
696; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %ebp
697; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
698; CHECK-BASELINE-NEXT:    xorb %al, %bpl
699; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %bpl
700; CHECK-BASELINE-NEXT:    xorb %al, %bpl
701; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %ebx
702; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
703; CHECK-BASELINE-NEXT:    xorb %al, %bl
704; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %bl
705; CHECK-BASELINE-NEXT:    xorb %al, %bl
706; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
707; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %r8d
708; CHECK-BASELINE-NEXT:    xorb %r8b, %al
709; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %al
710; CHECK-BASELINE-NEXT:    xorb %r8b, %al
711; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %r10d
712; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %r8d
713; CHECK-BASELINE-NEXT:    xorb %r8b, %r10b
714; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %r10b
715; CHECK-BASELINE-NEXT:    xorb %r8b, %r10b
716; CHECK-BASELINE-NEXT:    movb %r10b, 15(%rdi)
717; CHECK-BASELINE-NEXT:    movb %al, 14(%rdi)
718; CHECK-BASELINE-NEXT:    movb %bl, 13(%rdi)
719; CHECK-BASELINE-NEXT:    movb %bpl, 12(%rdi)
720; CHECK-BASELINE-NEXT:    movb %r15b, 11(%rdi)
721; CHECK-BASELINE-NEXT:    movb %r13b, 10(%rdi)
722; CHECK-BASELINE-NEXT:    movb %cl, 9(%rdi)
723; CHECK-BASELINE-NEXT:    movb %dl, 8(%rdi)
724; CHECK-BASELINE-NEXT:    movb %sil, 7(%rdi)
725; CHECK-BASELINE-NEXT:    movb %r12b, 6(%rdi)
726; CHECK-BASELINE-NEXT:    movb %r14b, 5(%rdi)
727; CHECK-BASELINE-NEXT:    movb %r9b, 4(%rdi)
728; CHECK-BASELINE-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
729; CHECK-BASELINE-NEXT:    movb %al, 3(%rdi)
730; CHECK-BASELINE-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
731; CHECK-BASELINE-NEXT:    movb %al, 2(%rdi)
732; CHECK-BASELINE-NEXT:    movb %r11b, 1(%rdi)
733; CHECK-BASELINE-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
734; CHECK-BASELINE-NEXT:    movb %al, (%rdi)
735; CHECK-BASELINE-NEXT:    movq %rdi, %rax
736; CHECK-BASELINE-NEXT:    popq %rbx
737; CHECK-BASELINE-NEXT:    popq %r12
738; CHECK-BASELINE-NEXT:    popq %r13
739; CHECK-BASELINE-NEXT:    popq %r14
740; CHECK-BASELINE-NEXT:    popq %r15
741; CHECK-BASELINE-NEXT:    popq %rbp
742; CHECK-BASELINE-NEXT:    retq
743;
744; CHECK-SSE1-LABEL: out_v16i8:
745; CHECK-SSE1:       # %bb.0:
746; CHECK-SSE1-NEXT:    pushq %rbp
747; CHECK-SSE1-NEXT:    pushq %r15
748; CHECK-SSE1-NEXT:    pushq %r14
749; CHECK-SSE1-NEXT:    pushq %r13
750; CHECK-SSE1-NEXT:    pushq %r12
751; CHECK-SSE1-NEXT:    pushq %rbx
752; CHECK-SSE1-NEXT:    movl %edx, %r11d
753; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %r13d
754; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %r15d
755; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %ebp
756; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %r12d
757; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %r14d
758; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %ebx
759; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
760; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %edx
761; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %r10d
762; CHECK-SSE1-NEXT:    xorb %r10b, %sil
763; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %sil
764; CHECK-SSE1-NEXT:    xorb %r10b, %sil
765; CHECK-SSE1-NEXT:    movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
766; CHECK-SSE1-NEXT:    xorb %dl, %r11b
767; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %r11b
768; CHECK-SSE1-NEXT:    xorb %dl, %r11b
769; CHECK-SSE1-NEXT:    xorb %al, %cl
770; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %cl
771; CHECK-SSE1-NEXT:    xorb %al, %cl
772; CHECK-SSE1-NEXT:    movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
773; CHECK-SSE1-NEXT:    xorb %bl, %r8b
774; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %r8b
775; CHECK-SSE1-NEXT:    xorb %bl, %r8b
776; CHECK-SSE1-NEXT:    movl %r8d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
777; CHECK-SSE1-NEXT:    xorb %r14b, %r9b
778; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %r9b
779; CHECK-SSE1-NEXT:    xorb %r14b, %r9b
780; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %r14d
781; CHECK-SSE1-NEXT:    xorb %r12b, %r14b
782; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %r14b
783; CHECK-SSE1-NEXT:    xorb %r12b, %r14b
784; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %r12d
785; CHECK-SSE1-NEXT:    xorb %bpl, %r12b
786; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %r12b
787; CHECK-SSE1-NEXT:    xorb %bpl, %r12b
788; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %esi
789; CHECK-SSE1-NEXT:    xorb %r15b, %sil
790; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %sil
791; CHECK-SSE1-NEXT:    xorb %r15b, %sil
792; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %edx
793; CHECK-SSE1-NEXT:    xorb %r13b, %dl
794; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %dl
795; CHECK-SSE1-NEXT:    xorb %r13b, %dl
796; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %ecx
797; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
798; CHECK-SSE1-NEXT:    xorb %al, %cl
799; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %cl
800; CHECK-SSE1-NEXT:    xorb %al, %cl
801; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %r13d
802; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
803; CHECK-SSE1-NEXT:    xorb %al, %r13b
804; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %r13b
805; CHECK-SSE1-NEXT:    xorb %al, %r13b
806; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %r15d
807; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
808; CHECK-SSE1-NEXT:    xorb %al, %r15b
809; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %r15b
810; CHECK-SSE1-NEXT:    xorb %al, %r15b
811; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %ebp
812; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
813; CHECK-SSE1-NEXT:    xorb %al, %bpl
814; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %bpl
815; CHECK-SSE1-NEXT:    xorb %al, %bpl
816; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %ebx
817; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
818; CHECK-SSE1-NEXT:    xorb %al, %bl
819; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %bl
820; CHECK-SSE1-NEXT:    xorb %al, %bl
821; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
822; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %r8d
823; CHECK-SSE1-NEXT:    xorb %r8b, %al
824; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %al
825; CHECK-SSE1-NEXT:    xorb %r8b, %al
826; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %r10d
827; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %r8d
828; CHECK-SSE1-NEXT:    xorb %r8b, %r10b
829; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %r10b
830; CHECK-SSE1-NEXT:    xorb %r8b, %r10b
831; CHECK-SSE1-NEXT:    movb %r10b, 15(%rdi)
832; CHECK-SSE1-NEXT:    movb %al, 14(%rdi)
833; CHECK-SSE1-NEXT:    movb %bl, 13(%rdi)
834; CHECK-SSE1-NEXT:    movb %bpl, 12(%rdi)
835; CHECK-SSE1-NEXT:    movb %r15b, 11(%rdi)
836; CHECK-SSE1-NEXT:    movb %r13b, 10(%rdi)
837; CHECK-SSE1-NEXT:    movb %cl, 9(%rdi)
838; CHECK-SSE1-NEXT:    movb %dl, 8(%rdi)
839; CHECK-SSE1-NEXT:    movb %sil, 7(%rdi)
840; CHECK-SSE1-NEXT:    movb %r12b, 6(%rdi)
841; CHECK-SSE1-NEXT:    movb %r14b, 5(%rdi)
842; CHECK-SSE1-NEXT:    movb %r9b, 4(%rdi)
843; CHECK-SSE1-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
844; CHECK-SSE1-NEXT:    movb %al, 3(%rdi)
845; CHECK-SSE1-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
846; CHECK-SSE1-NEXT:    movb %al, 2(%rdi)
847; CHECK-SSE1-NEXT:    movb %r11b, 1(%rdi)
848; CHECK-SSE1-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
849; CHECK-SSE1-NEXT:    movb %al, (%rdi)
850; CHECK-SSE1-NEXT:    movq %rdi, %rax
851; CHECK-SSE1-NEXT:    popq %rbx
852; CHECK-SSE1-NEXT:    popq %r12
853; CHECK-SSE1-NEXT:    popq %r13
854; CHECK-SSE1-NEXT:    popq %r14
855; CHECK-SSE1-NEXT:    popq %r15
856; CHECK-SSE1-NEXT:    popq %rbp
857; CHECK-SSE1-NEXT:    retq
858;
859; CHECK-SSE2-LABEL: out_v16i8:
860; CHECK-SSE2:       # %bb.0:
861; CHECK-SSE2-NEXT:    andps %xmm2, %xmm0
862; CHECK-SSE2-NEXT:    andnps %xmm1, %xmm2
863; CHECK-SSE2-NEXT:    orps %xmm2, %xmm0
864; CHECK-SSE2-NEXT:    retq
865;
866; CHECK-XOP-LABEL: out_v16i8:
867; CHECK-XOP:       # %bb.0:
868; CHECK-XOP-NEXT:    vpcmov %xmm2, %xmm1, %xmm0, %xmm0
869; CHECK-XOP-NEXT:    retq
870  %mx = and <16 x i8> %x, %mask
871  %notmask = xor <16 x i8> %mask, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
872  %my = and <16 x i8> %y, %notmask
873  %r = or <16 x i8> %mx, %my
874  ret <16 x i8> %r
875}
876
; Masked merge of two <8 x i16> vectors passed by value:
;   r = (x & mask) | (y & ~mask)
; Expected lowering, per RUN configuration:
;   * baseline (no SSE) and SSE1: each of the eight i16 lanes is expanded to a
;     scalar xor/and/xor sequence and stored through the pointer copied from
;     %rdi into %rax.
;   * SSE2: andps/andnps/orps on xmm registers.
;   * XOP: a single vpcmov.
877define <8 x i16> @out_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %mask) nounwind {
878; CHECK-BASELINE-LABEL: out_v8i16:
879; CHECK-BASELINE:       # %bb.0:
880; CHECK-BASELINE-NEXT:    pushq %rbp
881; CHECK-BASELINE-NEXT:    pushq %r15
882; CHECK-BASELINE-NEXT:    pushq %r14
883; CHECK-BASELINE-NEXT:    pushq %r12
884; CHECK-BASELINE-NEXT:    pushq %rbx
885; CHECK-BASELINE-NEXT:    movq %rdi, %rax
886; CHECK-BASELINE-NEXT:    movzwl {{[0-9]+}}(%rsp), %edi
887; CHECK-BASELINE-NEXT:    movzwl {{[0-9]+}}(%rsp), %r10d
888; CHECK-BASELINE-NEXT:    movzwl {{[0-9]+}}(%rsp), %r11d
889; CHECK-BASELINE-NEXT:    movzwl {{[0-9]+}}(%rsp), %ebx
890; CHECK-BASELINE-NEXT:    movzwl {{[0-9]+}}(%rsp), %ebp
891; CHECK-BASELINE-NEXT:    movzwl {{[0-9]+}}(%rsp), %r14d
892; CHECK-BASELINE-NEXT:    movzwl {{[0-9]+}}(%rsp), %r15d
893; CHECK-BASELINE-NEXT:    movzwl {{[0-9]+}}(%rsp), %r12d
894; CHECK-BASELINE-NEXT:    xorl %r12d, %esi
895; CHECK-BASELINE-NEXT:    andw {{[0-9]+}}(%rsp), %si
896; CHECK-BASELINE-NEXT:    xorl %r12d, %esi
897; CHECK-BASELINE-NEXT:    xorl %r15d, %edx
898; CHECK-BASELINE-NEXT:    andw {{[0-9]+}}(%rsp), %dx
899; CHECK-BASELINE-NEXT:    xorl %r15d, %edx
900; CHECK-BASELINE-NEXT:    xorl %r14d, %ecx
901; CHECK-BASELINE-NEXT:    andw {{[0-9]+}}(%rsp), %cx
902; CHECK-BASELINE-NEXT:    xorl %r14d, %ecx
903; CHECK-BASELINE-NEXT:    xorl %ebp, %r8d
904; CHECK-BASELINE-NEXT:    andw {{[0-9]+}}(%rsp), %r8w
905; CHECK-BASELINE-NEXT:    xorl %ebp, %r8d
906; CHECK-BASELINE-NEXT:    xorl %ebx, %r9d
907; CHECK-BASELINE-NEXT:    andw {{[0-9]+}}(%rsp), %r9w
908; CHECK-BASELINE-NEXT:    xorl %ebx, %r9d
909; CHECK-BASELINE-NEXT:    movl %r11d, %ebx
910; CHECK-BASELINE-NEXT:    xorw {{[0-9]+}}(%rsp), %bx
911; CHECK-BASELINE-NEXT:    andw {{[0-9]+}}(%rsp), %bx
912; CHECK-BASELINE-NEXT:    xorl %r11d, %ebx
913; CHECK-BASELINE-NEXT:    movl %r10d, %r11d
914; CHECK-BASELINE-NEXT:    xorw {{[0-9]+}}(%rsp), %r11w
915; CHECK-BASELINE-NEXT:    andw {{[0-9]+}}(%rsp), %r11w
916; CHECK-BASELINE-NEXT:    xorl %r10d, %r11d
917; CHECK-BASELINE-NEXT:    movl %edi, %r10d
918; CHECK-BASELINE-NEXT:    xorw {{[0-9]+}}(%rsp), %r10w
919; CHECK-BASELINE-NEXT:    andw {{[0-9]+}}(%rsp), %r10w
920; CHECK-BASELINE-NEXT:    xorl %edi, %r10d
921; CHECK-BASELINE-NEXT:    movw %r10w, 14(%rax)
922; CHECK-BASELINE-NEXT:    movw %r11w, 12(%rax)
923; CHECK-BASELINE-NEXT:    movw %bx, 10(%rax)
924; CHECK-BASELINE-NEXT:    movw %r9w, 8(%rax)
925; CHECK-BASELINE-NEXT:    movw %r8w, 6(%rax)
926; CHECK-BASELINE-NEXT:    movw %cx, 4(%rax)
927; CHECK-BASELINE-NEXT:    movw %dx, 2(%rax)
928; CHECK-BASELINE-NEXT:    movw %si, (%rax)
929; CHECK-BASELINE-NEXT:    popq %rbx
930; CHECK-BASELINE-NEXT:    popq %r12
931; CHECK-BASELINE-NEXT:    popq %r14
932; CHECK-BASELINE-NEXT:    popq %r15
933; CHECK-BASELINE-NEXT:    popq %rbp
934; CHECK-BASELINE-NEXT:    retq
935;
936; CHECK-SSE1-LABEL: out_v8i16:
937; CHECK-SSE1:       # %bb.0:
938; CHECK-SSE1-NEXT:    pushq %rbp
939; CHECK-SSE1-NEXT:    pushq %r15
940; CHECK-SSE1-NEXT:    pushq %r14
941; CHECK-SSE1-NEXT:    pushq %r12
942; CHECK-SSE1-NEXT:    pushq %rbx
943; CHECK-SSE1-NEXT:    movq %rdi, %rax
944; CHECK-SSE1-NEXT:    movzwl {{[0-9]+}}(%rsp), %edi
945; CHECK-SSE1-NEXT:    movzwl {{[0-9]+}}(%rsp), %r10d
946; CHECK-SSE1-NEXT:    movzwl {{[0-9]+}}(%rsp), %r11d
947; CHECK-SSE1-NEXT:    movzwl {{[0-9]+}}(%rsp), %ebx
948; CHECK-SSE1-NEXT:    movzwl {{[0-9]+}}(%rsp), %ebp
949; CHECK-SSE1-NEXT:    movzwl {{[0-9]+}}(%rsp), %r14d
950; CHECK-SSE1-NEXT:    movzwl {{[0-9]+}}(%rsp), %r15d
951; CHECK-SSE1-NEXT:    movzwl {{[0-9]+}}(%rsp), %r12d
952; CHECK-SSE1-NEXT:    xorl %r12d, %esi
953; CHECK-SSE1-NEXT:    andw {{[0-9]+}}(%rsp), %si
954; CHECK-SSE1-NEXT:    xorl %r12d, %esi
955; CHECK-SSE1-NEXT:    xorl %r15d, %edx
956; CHECK-SSE1-NEXT:    andw {{[0-9]+}}(%rsp), %dx
957; CHECK-SSE1-NEXT:    xorl %r15d, %edx
958; CHECK-SSE1-NEXT:    xorl %r14d, %ecx
959; CHECK-SSE1-NEXT:    andw {{[0-9]+}}(%rsp), %cx
960; CHECK-SSE1-NEXT:    xorl %r14d, %ecx
961; CHECK-SSE1-NEXT:    xorl %ebp, %r8d
962; CHECK-SSE1-NEXT:    andw {{[0-9]+}}(%rsp), %r8w
963; CHECK-SSE1-NEXT:    xorl %ebp, %r8d
964; CHECK-SSE1-NEXT:    xorl %ebx, %r9d
965; CHECK-SSE1-NEXT:    andw {{[0-9]+}}(%rsp), %r9w
966; CHECK-SSE1-NEXT:    xorl %ebx, %r9d
967; CHECK-SSE1-NEXT:    movl %r11d, %ebx
968; CHECK-SSE1-NEXT:    xorw {{[0-9]+}}(%rsp), %bx
969; CHECK-SSE1-NEXT:    andw {{[0-9]+}}(%rsp), %bx
970; CHECK-SSE1-NEXT:    xorl %r11d, %ebx
971; CHECK-SSE1-NEXT:    movl %r10d, %r11d
972; CHECK-SSE1-NEXT:    xorw {{[0-9]+}}(%rsp), %r11w
973; CHECK-SSE1-NEXT:    andw {{[0-9]+}}(%rsp), %r11w
974; CHECK-SSE1-NEXT:    xorl %r10d, %r11d
975; CHECK-SSE1-NEXT:    movl %edi, %r10d
976; CHECK-SSE1-NEXT:    xorw {{[0-9]+}}(%rsp), %r10w
977; CHECK-SSE1-NEXT:    andw {{[0-9]+}}(%rsp), %r10w
978; CHECK-SSE1-NEXT:    xorl %edi, %r10d
979; CHECK-SSE1-NEXT:    movw %r10w, 14(%rax)
980; CHECK-SSE1-NEXT:    movw %r11w, 12(%rax)
981; CHECK-SSE1-NEXT:    movw %bx, 10(%rax)
982; CHECK-SSE1-NEXT:    movw %r9w, 8(%rax)
983; CHECK-SSE1-NEXT:    movw %r8w, 6(%rax)
984; CHECK-SSE1-NEXT:    movw %cx, 4(%rax)
985; CHECK-SSE1-NEXT:    movw %dx, 2(%rax)
986; CHECK-SSE1-NEXT:    movw %si, (%rax)
987; CHECK-SSE1-NEXT:    popq %rbx
988; CHECK-SSE1-NEXT:    popq %r12
989; CHECK-SSE1-NEXT:    popq %r14
990; CHECK-SSE1-NEXT:    popq %r15
991; CHECK-SSE1-NEXT:    popq %rbp
992; CHECK-SSE1-NEXT:    retq
993;
994; CHECK-SSE2-LABEL: out_v8i16:
995; CHECK-SSE2:       # %bb.0:
996; CHECK-SSE2-NEXT:    andps %xmm2, %xmm0
997; CHECK-SSE2-NEXT:    andnps %xmm1, %xmm2
998; CHECK-SSE2-NEXT:    orps %xmm2, %xmm0
999; CHECK-SSE2-NEXT:    retq
1000;
1001; CHECK-XOP-LABEL: out_v8i16:
1002; CHECK-XOP:       # %bb.0:
1003; CHECK-XOP-NEXT:    vpcmov %xmm2, %xmm1, %xmm0, %xmm0
1004; CHECK-XOP-NEXT:    retq
; Keep the bits of %x that are selected by %mask.
1005  %mx = and <8 x i16> %x, %mask
; xor with an all-ones splat inverts every bit of the mask.
1006  %notmask = xor <8 x i16> %mask, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
; Keep the bits of %y where the mask is clear.
1007  %my = and <8 x i16> %y, %notmask
; Combine the two masked halves into the merged result.
1008  %r = or <8 x i16> %mx, %my
1009  ret <8 x i16> %r
1010}
1011
; Masked merge of two <4 x i32> vectors whose operands are loaded from memory
; (%px, %py, %pmask, each with 16-byte alignment):
;   r = (x & mask) | (y & ~mask)
; Expected lowering, per RUN configuration:
;   * baseline (no SSE): a scalar xor/and/xor sequence per i32 lane, with the
;     lanes stored through the pointer copied from %rdi into %rax.
;   * SSE1: andps/andnps/orps, with the result written back via
;     movaps to (%rdi).
;   * SSE2: andps/andnps/orps, with the result left in %xmm0.
;   * XOP: two vmovdqa loads followed by one vpcmov that takes (%rsi) as a
;     memory operand.
1012define <4 x i32> @out_v4i32(ptr%px, ptr%py, ptr%pmask) nounwind {
1013; CHECK-BASELINE-LABEL: out_v4i32:
1014; CHECK-BASELINE:       # %bb.0:
1015; CHECK-BASELINE-NEXT:    movq %rdi, %rax
1016; CHECK-BASELINE-NEXT:    movl 12(%rdx), %edi
1017; CHECK-BASELINE-NEXT:    movl 8(%rdx), %r8d
1018; CHECK-BASELINE-NEXT:    movl (%rdx), %r9d
1019; CHECK-BASELINE-NEXT:    movl 4(%rdx), %r10d
1020; CHECK-BASELINE-NEXT:    movl (%rsi), %edx
1021; CHECK-BASELINE-NEXT:    xorl %r9d, %edx
1022; CHECK-BASELINE-NEXT:    andl (%rcx), %edx
1023; CHECK-BASELINE-NEXT:    xorl %r9d, %edx
1024; CHECK-BASELINE-NEXT:    movl 4(%rsi), %r9d
1025; CHECK-BASELINE-NEXT:    xorl %r10d, %r9d
1026; CHECK-BASELINE-NEXT:    andl 4(%rcx), %r9d
1027; CHECK-BASELINE-NEXT:    xorl %r10d, %r9d
1028; CHECK-BASELINE-NEXT:    movl 8(%rsi), %r10d
1029; CHECK-BASELINE-NEXT:    xorl %r8d, %r10d
1030; CHECK-BASELINE-NEXT:    andl 8(%rcx), %r10d
1031; CHECK-BASELINE-NEXT:    xorl %r8d, %r10d
1032; CHECK-BASELINE-NEXT:    movl 12(%rsi), %esi
1033; CHECK-BASELINE-NEXT:    xorl %edi, %esi
1034; CHECK-BASELINE-NEXT:    andl 12(%rcx), %esi
1035; CHECK-BASELINE-NEXT:    xorl %edi, %esi
1036; CHECK-BASELINE-NEXT:    movl %esi, 12(%rax)
1037; CHECK-BASELINE-NEXT:    movl %r10d, 8(%rax)
1038; CHECK-BASELINE-NEXT:    movl %r9d, 4(%rax)
1039; CHECK-BASELINE-NEXT:    movl %edx, (%rax)
1040; CHECK-BASELINE-NEXT:    retq
1041;
1042; CHECK-SSE1-LABEL: out_v4i32:
1043; CHECK-SSE1:       # %bb.0:
1044; CHECK-SSE1-NEXT:    movq %rdi, %rax
1045; CHECK-SSE1-NEXT:    movaps (%rcx), %xmm0
1046; CHECK-SSE1-NEXT:    movaps (%rsi), %xmm1
1047; CHECK-SSE1-NEXT:    andps %xmm0, %xmm1
1048; CHECK-SSE1-NEXT:    andnps (%rdx), %xmm0
1049; CHECK-SSE1-NEXT:    orps %xmm1, %xmm0
1050; CHECK-SSE1-NEXT:    movaps %xmm0, (%rdi)
1051; CHECK-SSE1-NEXT:    retq
1052;
1053; CHECK-SSE2-LABEL: out_v4i32:
1054; CHECK-SSE2:       # %bb.0:
1055; CHECK-SSE2-NEXT:    movaps (%rdx), %xmm0
1056; CHECK-SSE2-NEXT:    movaps (%rdi), %xmm1
1057; CHECK-SSE2-NEXT:    andps %xmm0, %xmm1
1058; CHECK-SSE2-NEXT:    andnps (%rsi), %xmm0
1059; CHECK-SSE2-NEXT:    orps %xmm1, %xmm0
1060; CHECK-SSE2-NEXT:    retq
1061;
1062; CHECK-XOP-LABEL: out_v4i32:
1063; CHECK-XOP:       # %bb.0:
1064; CHECK-XOP-NEXT:    vmovdqa (%rdi), %xmm0
1065; CHECK-XOP-NEXT:    vmovdqa (%rdx), %xmm1
1066; CHECK-XOP-NEXT:    vpcmov %xmm1, (%rsi), %xmm0, %xmm0
1067; CHECK-XOP-NEXT:    retq
; Load all three operands from memory; each load is 16-byte aligned.
1068  %x = load <4 x i32>, ptr%px, align 16
1069  %y = load <4 x i32>, ptr%py, align 16
1070  %mask = load <4 x i32>, ptr%pmask, align 16
; Keep the bits of %x that are selected by %mask.
1071  %mx = and <4 x i32> %x, %mask
; xor with an all-ones splat inverts every bit of the mask.
1072  %notmask = xor <4 x i32> %mask, <i32 -1, i32 -1, i32 -1, i32 -1>
; Keep the bits of %y where the mask is clear.
1073  %my = and <4 x i32> %y, %notmask
; Combine the two masked halves into the merged result.
1074  %r = or <4 x i32> %mx, %my
1075  ret <4 x i32> %r
1076}
1077
; Masked merge of two <4 x i32> vectors loaded from memory, where lane 2 of
; the constant used to invert the mask is undef instead of -1.
; Expected lowering, per RUN configuration:
;   * baseline (no SSE): lanes 0, 1 and 3 use the scalar xor/and/xor sequence;
;     the lane at byte offset 8 is only the x value and-ed with the mask
;     (no xor pair), then stored to 8(%rax).
;   * SSE1 / SSE2: the full andps/andnps/orps sequence is still expected.
;   * XOP: a single vpcmov is still expected.
1078define <4 x i32> @out_v4i32_undef(ptr%px, ptr%py, ptr%pmask) nounwind {
1079; CHECK-BASELINE-LABEL: out_v4i32_undef:
1080; CHECK-BASELINE:       # %bb.0:
1081; CHECK-BASELINE-NEXT:    movq %rdi, %rax
1082; CHECK-BASELINE-NEXT:    movl 8(%rsi), %edi
1083; CHECK-BASELINE-NEXT:    movl 12(%rdx), %r8d
1084; CHECK-BASELINE-NEXT:    movl (%rdx), %r9d
1085; CHECK-BASELINE-NEXT:    movl 4(%rdx), %edx
1086; CHECK-BASELINE-NEXT:    andl 8(%rcx), %edi
1087; CHECK-BASELINE-NEXT:    movl (%rsi), %r10d
1088; CHECK-BASELINE-NEXT:    xorl %r9d, %r10d
1089; CHECK-BASELINE-NEXT:    andl (%rcx), %r10d
1090; CHECK-BASELINE-NEXT:    xorl %r9d, %r10d
1091; CHECK-BASELINE-NEXT:    movl 4(%rsi), %r9d
1092; CHECK-BASELINE-NEXT:    xorl %edx, %r9d
1093; CHECK-BASELINE-NEXT:    andl 4(%rcx), %r9d
1094; CHECK-BASELINE-NEXT:    xorl %edx, %r9d
1095; CHECK-BASELINE-NEXT:    movl 12(%rsi), %edx
1096; CHECK-BASELINE-NEXT:    xorl %r8d, %edx
1097; CHECK-BASELINE-NEXT:    andl 12(%rcx), %edx
1098; CHECK-BASELINE-NEXT:    xorl %r8d, %edx
1099; CHECK-BASELINE-NEXT:    movl %edi, 8(%rax)
1100; CHECK-BASELINE-NEXT:    movl %edx, 12(%rax)
1101; CHECK-BASELINE-NEXT:    movl %r9d, 4(%rax)
1102; CHECK-BASELINE-NEXT:    movl %r10d, (%rax)
1103; CHECK-BASELINE-NEXT:    retq
1104;
1105; CHECK-SSE1-LABEL: out_v4i32_undef:
1106; CHECK-SSE1:       # %bb.0:
1107; CHECK-SSE1-NEXT:    movq %rdi, %rax
1108; CHECK-SSE1-NEXT:    movaps (%rcx), %xmm0
1109; CHECK-SSE1-NEXT:    movaps (%rsi), %xmm1
1110; CHECK-SSE1-NEXT:    andps %xmm0, %xmm1
1111; CHECK-SSE1-NEXT:    andnps (%rdx), %xmm0
1112; CHECK-SSE1-NEXT:    orps %xmm1, %xmm0
1113; CHECK-SSE1-NEXT:    movaps %xmm0, (%rdi)
1114; CHECK-SSE1-NEXT:    retq
1115;
1116; CHECK-SSE2-LABEL: out_v4i32_undef:
1117; CHECK-SSE2:       # %bb.0:
1118; CHECK-SSE2-NEXT:    movaps (%rdx), %xmm0
1119; CHECK-SSE2-NEXT:    movaps (%rdi), %xmm1
1120; CHECK-SSE2-NEXT:    andps %xmm0, %xmm1
1121; CHECK-SSE2-NEXT:    andnps (%rsi), %xmm0
1122; CHECK-SSE2-NEXT:    orps %xmm1, %xmm0
1123; CHECK-SSE2-NEXT:    retq
1124;
1125; CHECK-XOP-LABEL: out_v4i32_undef:
1126; CHECK-XOP:       # %bb.0:
1127; CHECK-XOP-NEXT:    vmovdqa (%rdi), %xmm0
1128; CHECK-XOP-NEXT:    vmovdqa (%rdx), %xmm1
1129; CHECK-XOP-NEXT:    vpcmov %xmm1, (%rsi), %xmm0, %xmm0
1130; CHECK-XOP-NEXT:    retq
; Load all three operands from memory; each load is 16-byte aligned.
1131  %x = load <4 x i32>, ptr%px, align 16
1132  %y = load <4 x i32>, ptr%py, align 16
1133  %mask = load <4 x i32>, ptr%pmask, align 16
; Keep the bits of %x that are selected by %mask.
1134  %mx = and <4 x i32> %x, %mask
; Invert the mask; lane 2 of the constant is deliberately undef, the other
; lanes are all-ones.
1135  %notmask = xor <4 x i32> %mask, <i32 -1, i32 -1, i32 undef, i32 -1>
; Keep the bits of %y selected by the (partially undef) inverted mask.
1136  %my = and <4 x i32> %y, %notmask
; Combine the two masked halves into the merged result.
1137  %r = or <4 x i32> %mx, %my
1138  ret <4 x i32> %r
1139}
1140
; Masked merge of two <2 x i64> vectors passed by value:
;   r = (x & mask) | (y & ~mask)
; Expected lowering, per RUN configuration:
;   * baseline (no SSE) and SSE1: both lanes live in general-purpose
;     registers; each lane is an xorq/andq/xorq sequence, with the results
;     ending up in %rax and %rdx.
;   * SSE2: andps/andnps/orps on xmm registers.
;   * XOP: a single vpcmov.
1141define <2 x i64> @out_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %mask) nounwind {
1142; CHECK-BASELINE-LABEL: out_v2i64:
1143; CHECK-BASELINE:       # %bb.0:
1144; CHECK-BASELINE-NEXT:    movq %rdi, %rax
1145; CHECK-BASELINE-NEXT:    xorq %rdx, %rax
1146; CHECK-BASELINE-NEXT:    andq %r8, %rax
1147; CHECK-BASELINE-NEXT:    xorq %rdx, %rax
1148; CHECK-BASELINE-NEXT:    xorq %rcx, %rsi
1149; CHECK-BASELINE-NEXT:    andq %r9, %rsi
1150; CHECK-BASELINE-NEXT:    xorq %rcx, %rsi
1151; CHECK-BASELINE-NEXT:    movq %rsi, %rdx
1152; CHECK-BASELINE-NEXT:    retq
1153;
1154; CHECK-SSE1-LABEL: out_v2i64:
1155; CHECK-SSE1:       # %bb.0:
1156; CHECK-SSE1-NEXT:    movq %rdi, %rax
1157; CHECK-SSE1-NEXT:    xorq %rdx, %rax
1158; CHECK-SSE1-NEXT:    andq %r8, %rax
1159; CHECK-SSE1-NEXT:    xorq %rdx, %rax
1160; CHECK-SSE1-NEXT:    xorq %rcx, %rsi
1161; CHECK-SSE1-NEXT:    andq %r9, %rsi
1162; CHECK-SSE1-NEXT:    xorq %rcx, %rsi
1163; CHECK-SSE1-NEXT:    movq %rsi, %rdx
1164; CHECK-SSE1-NEXT:    retq
1165;
1166; CHECK-SSE2-LABEL: out_v2i64:
1167; CHECK-SSE2:       # %bb.0:
1168; CHECK-SSE2-NEXT:    andps %xmm2, %xmm0
1169; CHECK-SSE2-NEXT:    andnps %xmm1, %xmm2
1170; CHECK-SSE2-NEXT:    orps %xmm2, %xmm0
1171; CHECK-SSE2-NEXT:    retq
1172;
1173; CHECK-XOP-LABEL: out_v2i64:
1174; CHECK-XOP:       # %bb.0:
1175; CHECK-XOP-NEXT:    vpcmov %xmm2, %xmm1, %xmm0, %xmm0
1176; CHECK-XOP-NEXT:    retq
; Keep the bits of %x that are selected by %mask.
1177  %mx = and <2 x i64> %x, %mask
; xor with an all-ones splat inverts every bit of the mask.
1178  %notmask = xor <2 x i64> %mask, <i64 -1, i64 -1>
; Keep the bits of %y where the mask is clear.
1179  %my = and <2 x i64> %y, %notmask
; Combine the two masked halves into the merged result.
1180  %r = or <2 x i64> %mx, %my
1181  ret <2 x i64> %r
1182}
1183
1184; ============================================================================ ;
1185; 256-bit vector width
1186; ============================================================================ ;
1187
1188define <32 x i8> @out_v32i8(ptr%px, ptr%py, ptr%pmask) nounwind {
1189; CHECK-BASELINE-LABEL: out_v32i8:
1190; CHECK-BASELINE:       # %bb.0:
1191; CHECK-BASELINE-NEXT:    pushq %rbp
1192; CHECK-BASELINE-NEXT:    pushq %r15
1193; CHECK-BASELINE-NEXT:    pushq %r14
1194; CHECK-BASELINE-NEXT:    pushq %r13
1195; CHECK-BASELINE-NEXT:    pushq %r12
1196; CHECK-BASELINE-NEXT:    pushq %rbx
1197; CHECK-BASELINE-NEXT:    movq %rcx, %r10
1198; CHECK-BASELINE-NEXT:    movq %rdx, %r8
1199; CHECK-BASELINE-NEXT:    movq %rsi, %r9
1200; CHECK-BASELINE-NEXT:    movq %rdi, %r11
1201; CHECK-BASELINE-NEXT:    movzbl 15(%rdx), %eax
1202; CHECK-BASELINE-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
1203; CHECK-BASELINE-NEXT:    movzbl 14(%rdx), %eax
1204; CHECK-BASELINE-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
1205; CHECK-BASELINE-NEXT:    movzbl 13(%rdx), %eax
1206; CHECK-BASELINE-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
1207; CHECK-BASELINE-NEXT:    movzbl 12(%rdx), %eax
1208; CHECK-BASELINE-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
1209; CHECK-BASELINE-NEXT:    movzbl 11(%rdx), %eax
1210; CHECK-BASELINE-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
1211; CHECK-BASELINE-NEXT:    movzbl 10(%rdx), %eax
1212; CHECK-BASELINE-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
1213; CHECK-BASELINE-NEXT:    movzbl 9(%rdx), %ebp
1214; CHECK-BASELINE-NEXT:    movzbl 8(%rdx), %r14d
1215; CHECK-BASELINE-NEXT:    movzbl 7(%rdx), %r15d
1216; CHECK-BASELINE-NEXT:    movzbl 6(%rdx), %r12d
1217; CHECK-BASELINE-NEXT:    movzbl 5(%rdx), %r13d
1218; CHECK-BASELINE-NEXT:    movzbl 4(%rdx), %esi
1219; CHECK-BASELINE-NEXT:    movzbl 3(%rdx), %edx
1220; CHECK-BASELINE-NEXT:    movzbl 2(%r8), %edi
1221; CHECK-BASELINE-NEXT:    movzbl (%r8), %eax
1222; CHECK-BASELINE-NEXT:    movzbl 1(%r8), %ecx
1223; CHECK-BASELINE-NEXT:    movzbl (%r9), %ebx
1224; CHECK-BASELINE-NEXT:    xorb %al, %bl
1225; CHECK-BASELINE-NEXT:    andb (%r10), %bl
1226; CHECK-BASELINE-NEXT:    xorb %al, %bl
1227; CHECK-BASELINE-NEXT:    movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
1228; CHECK-BASELINE-NEXT:    movzbl 1(%r9), %eax
1229; CHECK-BASELINE-NEXT:    xorb %cl, %al
1230; CHECK-BASELINE-NEXT:    andb 1(%r10), %al
1231; CHECK-BASELINE-NEXT:    xorb %cl, %al
1232; CHECK-BASELINE-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
1233; CHECK-BASELINE-NEXT:    movzbl 2(%r9), %eax
1234; CHECK-BASELINE-NEXT:    xorb %dil, %al
1235; CHECK-BASELINE-NEXT:    andb 2(%r10), %al
1236; CHECK-BASELINE-NEXT:    xorb %dil, %al
1237; CHECK-BASELINE-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
1238; CHECK-BASELINE-NEXT:    movzbl 3(%r9), %eax
1239; CHECK-BASELINE-NEXT:    xorb %dl, %al
1240; CHECK-BASELINE-NEXT:    andb 3(%r10), %al
1241; CHECK-BASELINE-NEXT:    xorb %dl, %al
1242; CHECK-BASELINE-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
1243; CHECK-BASELINE-NEXT:    movzbl 4(%r9), %eax
1244; CHECK-BASELINE-NEXT:    xorb %sil, %al
1245; CHECK-BASELINE-NEXT:    andb 4(%r10), %al
1246; CHECK-BASELINE-NEXT:    xorb %sil, %al
1247; CHECK-BASELINE-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
1248; CHECK-BASELINE-NEXT:    movzbl 5(%r9), %eax
1249; CHECK-BASELINE-NEXT:    xorb %r13b, %al
1250; CHECK-BASELINE-NEXT:    andb 5(%r10), %al
1251; CHECK-BASELINE-NEXT:    xorb %r13b, %al
1252; CHECK-BASELINE-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
1253; CHECK-BASELINE-NEXT:    movzbl 6(%r9), %eax
1254; CHECK-BASELINE-NEXT:    xorb %r12b, %al
1255; CHECK-BASELINE-NEXT:    andb 6(%r10), %al
1256; CHECK-BASELINE-NEXT:    xorb %r12b, %al
1257; CHECK-BASELINE-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
1258; CHECK-BASELINE-NEXT:    movzbl 7(%r9), %eax
1259; CHECK-BASELINE-NEXT:    xorb %r15b, %al
1260; CHECK-BASELINE-NEXT:    andb 7(%r10), %al
1261; CHECK-BASELINE-NEXT:    xorb %r15b, %al
1262; CHECK-BASELINE-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
1263; CHECK-BASELINE-NEXT:    movzbl 8(%r9), %eax
1264; CHECK-BASELINE-NEXT:    xorb %r14b, %al
1265; CHECK-BASELINE-NEXT:    andb 8(%r10), %al
1266; CHECK-BASELINE-NEXT:    xorb %r14b, %al
1267; CHECK-BASELINE-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
1268; CHECK-BASELINE-NEXT:    movzbl 9(%r9), %eax
1269; CHECK-BASELINE-NEXT:    xorb %bpl, %al
1270; CHECK-BASELINE-NEXT:    andb 9(%r10), %al
1271; CHECK-BASELINE-NEXT:    xorb %bpl, %al
1272; CHECK-BASELINE-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
1273; CHECK-BASELINE-NEXT:    movzbl 10(%r9), %eax
1274; CHECK-BASELINE-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
1275; CHECK-BASELINE-NEXT:    xorb %cl, %al
1276; CHECK-BASELINE-NEXT:    andb 10(%r10), %al
1277; CHECK-BASELINE-NEXT:    xorb %cl, %al
1278; CHECK-BASELINE-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
1279; CHECK-BASELINE-NEXT:    movzbl 11(%r9), %eax
1280; CHECK-BASELINE-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
1281; CHECK-BASELINE-NEXT:    xorb %cl, %al
1282; CHECK-BASELINE-NEXT:    andb 11(%r10), %al
1283; CHECK-BASELINE-NEXT:    xorb %cl, %al
1284; CHECK-BASELINE-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
1285; CHECK-BASELINE-NEXT:    movzbl 12(%r9), %eax
1286; CHECK-BASELINE-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
1287; CHECK-BASELINE-NEXT:    xorb %cl, %al
1288; CHECK-BASELINE-NEXT:    andb 12(%r10), %al
1289; CHECK-BASELINE-NEXT:    xorb %cl, %al
1290; CHECK-BASELINE-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
1291; CHECK-BASELINE-NEXT:    movzbl 13(%r9), %eax
1292; CHECK-BASELINE-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
1293; CHECK-BASELINE-NEXT:    xorb %cl, %al
1294; CHECK-BASELINE-NEXT:    andb 13(%r10), %al
1295; CHECK-BASELINE-NEXT:    xorb %cl, %al
1296; CHECK-BASELINE-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
1297; CHECK-BASELINE-NEXT:    movzbl 14(%r9), %eax
1298; CHECK-BASELINE-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
1299; CHECK-BASELINE-NEXT:    xorb %cl, %al
1300; CHECK-BASELINE-NEXT:    andb 14(%r10), %al
1301; CHECK-BASELINE-NEXT:    xorb %cl, %al
1302; CHECK-BASELINE-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
1303; CHECK-BASELINE-NEXT:    movzbl 15(%r9), %eax
1304; CHECK-BASELINE-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
1305; CHECK-BASELINE-NEXT:    xorb %cl, %al
1306; CHECK-BASELINE-NEXT:    andb 15(%r10), %al
1307; CHECK-BASELINE-NEXT:    xorb %cl, %al
1308; CHECK-BASELINE-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
1309; CHECK-BASELINE-NEXT:    movzbl 16(%r8), %eax
1310; CHECK-BASELINE-NEXT:    movzbl 16(%r9), %ecx
1311; CHECK-BASELINE-NEXT:    xorb %al, %cl
1312; CHECK-BASELINE-NEXT:    andb 16(%r10), %cl
1313; CHECK-BASELINE-NEXT:    xorb %al, %cl
1314; CHECK-BASELINE-NEXT:    movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
1315; CHECK-BASELINE-NEXT:    movzbl 17(%r8), %eax
1316; CHECK-BASELINE-NEXT:    movzbl 17(%r9), %ecx
1317; CHECK-BASELINE-NEXT:    xorb %al, %cl
1318; CHECK-BASELINE-NEXT:    andb 17(%r10), %cl
1319; CHECK-BASELINE-NEXT:    xorb %al, %cl
1320; CHECK-BASELINE-NEXT:    movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
1321; CHECK-BASELINE-NEXT:    movzbl 18(%r8), %eax
1322; CHECK-BASELINE-NEXT:    movzbl 18(%r9), %ecx
1323; CHECK-BASELINE-NEXT:    xorb %al, %cl
1324; CHECK-BASELINE-NEXT:    andb 18(%r10), %cl
1325; CHECK-BASELINE-NEXT:    xorb %al, %cl
1326; CHECK-BASELINE-NEXT:    movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
1327; CHECK-BASELINE-NEXT:    movzbl 19(%r8), %eax
1328; CHECK-BASELINE-NEXT:    movzbl 19(%r9), %ecx
1329; CHECK-BASELINE-NEXT:    xorb %al, %cl
1330; CHECK-BASELINE-NEXT:    andb 19(%r10), %cl
1331; CHECK-BASELINE-NEXT:    xorb %al, %cl
1332; CHECK-BASELINE-NEXT:    movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
1333; CHECK-BASELINE-NEXT:    movzbl 20(%r8), %eax
1334; CHECK-BASELINE-NEXT:    movzbl 20(%r9), %ecx
1335; CHECK-BASELINE-NEXT:    xorb %al, %cl
1336; CHECK-BASELINE-NEXT:    andb 20(%r10), %cl
1337; CHECK-BASELINE-NEXT:    xorb %al, %cl
1338; CHECK-BASELINE-NEXT:    movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
1339; CHECK-BASELINE-NEXT:    movzbl 21(%r8), %eax
1340; CHECK-BASELINE-NEXT:    movzbl 21(%r9), %r13d
1341; CHECK-BASELINE-NEXT:    xorb %al, %r13b
1342; CHECK-BASELINE-NEXT:    andb 21(%r10), %r13b
1343; CHECK-BASELINE-NEXT:    xorb %al, %r13b
1344; CHECK-BASELINE-NEXT:    movzbl 22(%r8), %eax
1345; CHECK-BASELINE-NEXT:    movzbl 22(%r9), %r12d
1346; CHECK-BASELINE-NEXT:    xorb %al, %r12b
1347; CHECK-BASELINE-NEXT:    andb 22(%r10), %r12b
1348; CHECK-BASELINE-NEXT:    xorb %al, %r12b
1349; CHECK-BASELINE-NEXT:    movzbl 23(%r8), %eax
1350; CHECK-BASELINE-NEXT:    movzbl 23(%r9), %r15d
1351; CHECK-BASELINE-NEXT:    xorb %al, %r15b
1352; CHECK-BASELINE-NEXT:    andb 23(%r10), %r15b
1353; CHECK-BASELINE-NEXT:    xorb %al, %r15b
1354; CHECK-BASELINE-NEXT:    movzbl 24(%r8), %eax
1355; CHECK-BASELINE-NEXT:    movzbl 24(%r9), %r14d
1356; CHECK-BASELINE-NEXT:    xorb %al, %r14b
1357; CHECK-BASELINE-NEXT:    andb 24(%r10), %r14b
1358; CHECK-BASELINE-NEXT:    xorb %al, %r14b
1359; CHECK-BASELINE-NEXT:    movzbl 25(%r8), %eax
1360; CHECK-BASELINE-NEXT:    movzbl 25(%r9), %ebp
1361; CHECK-BASELINE-NEXT:    xorb %al, %bpl
1362; CHECK-BASELINE-NEXT:    andb 25(%r10), %bpl
1363; CHECK-BASELINE-NEXT:    xorb %al, %bpl
1364; CHECK-BASELINE-NEXT:    movzbl 26(%r8), %eax
1365; CHECK-BASELINE-NEXT:    movzbl 26(%r9), %edi
1366; CHECK-BASELINE-NEXT:    xorb %al, %dil
1367; CHECK-BASELINE-NEXT:    andb 26(%r10), %dil
1368; CHECK-BASELINE-NEXT:    xorb %al, %dil
1369; CHECK-BASELINE-NEXT:    movzbl 27(%r8), %eax
1370; CHECK-BASELINE-NEXT:    movzbl 27(%r9), %esi
1371; CHECK-BASELINE-NEXT:    xorb %al, %sil
1372; CHECK-BASELINE-NEXT:    andb 27(%r10), %sil
1373; CHECK-BASELINE-NEXT:    xorb %al, %sil
1374; CHECK-BASELINE-NEXT:    movzbl 28(%r8), %eax
1375; CHECK-BASELINE-NEXT:    movzbl 28(%r9), %edx
1376; CHECK-BASELINE-NEXT:    xorb %al, %dl
1377; CHECK-BASELINE-NEXT:    andb 28(%r10), %dl
1378; CHECK-BASELINE-NEXT:    xorb %al, %dl
1379; CHECK-BASELINE-NEXT:    movzbl 29(%r8), %eax
1380; CHECK-BASELINE-NEXT:    movzbl 29(%r9), %ecx
1381; CHECK-BASELINE-NEXT:    xorb %al, %cl
1382; CHECK-BASELINE-NEXT:    andb 29(%r10), %cl
1383; CHECK-BASELINE-NEXT:    xorb %al, %cl
1384; CHECK-BASELINE-NEXT:    movzbl 30(%r8), %ebx
1385; CHECK-BASELINE-NEXT:    movzbl 30(%r9), %eax
1386; CHECK-BASELINE-NEXT:    xorb %bl, %al
1387; CHECK-BASELINE-NEXT:    andb 30(%r10), %al
1388; CHECK-BASELINE-NEXT:    xorb %bl, %al
1389; CHECK-BASELINE-NEXT:    movzbl 31(%r8), %r8d
1390; CHECK-BASELINE-NEXT:    movzbl 31(%r9), %r9d
1391; CHECK-BASELINE-NEXT:    xorb %r8b, %r9b
1392; CHECK-BASELINE-NEXT:    andb 31(%r10), %r9b
1393; CHECK-BASELINE-NEXT:    xorb %r8b, %r9b
1394; CHECK-BASELINE-NEXT:    movb %r9b, 31(%r11)
1395; CHECK-BASELINE-NEXT:    movb %al, 30(%r11)
1396; CHECK-BASELINE-NEXT:    movb %cl, 29(%r11)
1397; CHECK-BASELINE-NEXT:    movb %dl, 28(%r11)
1398; CHECK-BASELINE-NEXT:    movb %sil, 27(%r11)
1399; CHECK-BASELINE-NEXT:    movb %dil, 26(%r11)
1400; CHECK-BASELINE-NEXT:    movb %bpl, 25(%r11)
1401; CHECK-BASELINE-NEXT:    movb %r14b, 24(%r11)
1402; CHECK-BASELINE-NEXT:    movb %r15b, 23(%r11)
1403; CHECK-BASELINE-NEXT:    movb %r12b, 22(%r11)
1404; CHECK-BASELINE-NEXT:    movb %r13b, 21(%r11)
1405; CHECK-BASELINE-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
1406; CHECK-BASELINE-NEXT:    movb %al, 20(%r11)
1407; CHECK-BASELINE-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
1408; CHECK-BASELINE-NEXT:    movb %al, 19(%r11)
1409; CHECK-BASELINE-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
1410; CHECK-BASELINE-NEXT:    movb %al, 18(%r11)
1411; CHECK-BASELINE-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
1412; CHECK-BASELINE-NEXT:    movb %al, 17(%r11)
1413; CHECK-BASELINE-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
1414; CHECK-BASELINE-NEXT:    movb %al, 16(%r11)
1415; CHECK-BASELINE-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
1416; CHECK-BASELINE-NEXT:    movb %al, 15(%r11)
1417; CHECK-BASELINE-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
1418; CHECK-BASELINE-NEXT:    movb %al, 14(%r11)
1419; CHECK-BASELINE-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
1420; CHECK-BASELINE-NEXT:    movb %al, 13(%r11)
1421; CHECK-BASELINE-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
1422; CHECK-BASELINE-NEXT:    movb %al, 12(%r11)
1423; CHECK-BASELINE-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
1424; CHECK-BASELINE-NEXT:    movb %al, 11(%r11)
1425; CHECK-BASELINE-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
1426; CHECK-BASELINE-NEXT:    movb %al, 10(%r11)
1427; CHECK-BASELINE-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
1428; CHECK-BASELINE-NEXT:    movb %al, 9(%r11)
1429; CHECK-BASELINE-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
1430; CHECK-BASELINE-NEXT:    movb %al, 8(%r11)
1431; CHECK-BASELINE-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
1432; CHECK-BASELINE-NEXT:    movb %al, 7(%r11)
1433; CHECK-BASELINE-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
1434; CHECK-BASELINE-NEXT:    movb %al, 6(%r11)
1435; CHECK-BASELINE-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
1436; CHECK-BASELINE-NEXT:    movb %al, 5(%r11)
1437; CHECK-BASELINE-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
1438; CHECK-BASELINE-NEXT:    movb %al, 4(%r11)
1439; CHECK-BASELINE-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
1440; CHECK-BASELINE-NEXT:    movb %al, 3(%r11)
1441; CHECK-BASELINE-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
1442; CHECK-BASELINE-NEXT:    movb %al, 2(%r11)
1443; CHECK-BASELINE-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
1444; CHECK-BASELINE-NEXT:    movb %al, 1(%r11)
1445; CHECK-BASELINE-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
1446; CHECK-BASELINE-NEXT:    movb %al, (%r11)
1447; CHECK-BASELINE-NEXT:    movq %r11, %rax
1448; CHECK-BASELINE-NEXT:    popq %rbx
1449; CHECK-BASELINE-NEXT:    popq %r12
1450; CHECK-BASELINE-NEXT:    popq %r13
1451; CHECK-BASELINE-NEXT:    popq %r14
1452; CHECK-BASELINE-NEXT:    popq %r15
1453; CHECK-BASELINE-NEXT:    popq %rbp
1454; CHECK-BASELINE-NEXT:    retq
1455;
1456; CHECK-SSE1-LABEL: out_v32i8:
1457; CHECK-SSE1:       # %bb.0:
1458; CHECK-SSE1-NEXT:    pushq %rbp
1459; CHECK-SSE1-NEXT:    pushq %r15
1460; CHECK-SSE1-NEXT:    pushq %r14
1461; CHECK-SSE1-NEXT:    pushq %r13
1462; CHECK-SSE1-NEXT:    pushq %r12
1463; CHECK-SSE1-NEXT:    pushq %rbx
1464; CHECK-SSE1-NEXT:    movq %rcx, %r10
1465; CHECK-SSE1-NEXT:    movq %rdx, %r8
1466; CHECK-SSE1-NEXT:    movq %rsi, %r9
1467; CHECK-SSE1-NEXT:    movq %rdi, %r11
1468; CHECK-SSE1-NEXT:    movzbl 15(%rdx), %eax
1469; CHECK-SSE1-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
1470; CHECK-SSE1-NEXT:    movzbl 14(%rdx), %eax
1471; CHECK-SSE1-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
1472; CHECK-SSE1-NEXT:    movzbl 13(%rdx), %eax
1473; CHECK-SSE1-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
1474; CHECK-SSE1-NEXT:    movzbl 12(%rdx), %eax
1475; CHECK-SSE1-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
1476; CHECK-SSE1-NEXT:    movzbl 11(%rdx), %eax
1477; CHECK-SSE1-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
1478; CHECK-SSE1-NEXT:    movzbl 10(%rdx), %eax
1479; CHECK-SSE1-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
1480; CHECK-SSE1-NEXT:    movzbl 9(%rdx), %ebp
1481; CHECK-SSE1-NEXT:    movzbl 8(%rdx), %r14d
1482; CHECK-SSE1-NEXT:    movzbl 7(%rdx), %r15d
1483; CHECK-SSE1-NEXT:    movzbl 6(%rdx), %r12d
1484; CHECK-SSE1-NEXT:    movzbl 5(%rdx), %r13d
1485; CHECK-SSE1-NEXT:    movzbl 4(%rdx), %esi
1486; CHECK-SSE1-NEXT:    movzbl 3(%rdx), %edx
1487; CHECK-SSE1-NEXT:    movzbl 2(%r8), %edi
1488; CHECK-SSE1-NEXT:    movzbl (%r8), %eax
1489; CHECK-SSE1-NEXT:    movzbl 1(%r8), %ecx
1490; CHECK-SSE1-NEXT:    movzbl (%r9), %ebx
1491; CHECK-SSE1-NEXT:    xorb %al, %bl
1492; CHECK-SSE1-NEXT:    andb (%r10), %bl
1493; CHECK-SSE1-NEXT:    xorb %al, %bl
1494; CHECK-SSE1-NEXT:    movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
1495; CHECK-SSE1-NEXT:    movzbl 1(%r9), %eax
1496; CHECK-SSE1-NEXT:    xorb %cl, %al
1497; CHECK-SSE1-NEXT:    andb 1(%r10), %al
1498; CHECK-SSE1-NEXT:    xorb %cl, %al
1499; CHECK-SSE1-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
1500; CHECK-SSE1-NEXT:    movzbl 2(%r9), %eax
1501; CHECK-SSE1-NEXT:    xorb %dil, %al
1502; CHECK-SSE1-NEXT:    andb 2(%r10), %al
1503; CHECK-SSE1-NEXT:    xorb %dil, %al
1504; CHECK-SSE1-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
1505; CHECK-SSE1-NEXT:    movzbl 3(%r9), %eax
1506; CHECK-SSE1-NEXT:    xorb %dl, %al
1507; CHECK-SSE1-NEXT:    andb 3(%r10), %al
1508; CHECK-SSE1-NEXT:    xorb %dl, %al
1509; CHECK-SSE1-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
1510; CHECK-SSE1-NEXT:    movzbl 4(%r9), %eax
1511; CHECK-SSE1-NEXT:    xorb %sil, %al
1512; CHECK-SSE1-NEXT:    andb 4(%r10), %al
1513; CHECK-SSE1-NEXT:    xorb %sil, %al
1514; CHECK-SSE1-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
1515; CHECK-SSE1-NEXT:    movzbl 5(%r9), %eax
1516; CHECK-SSE1-NEXT:    xorb %r13b, %al
1517; CHECK-SSE1-NEXT:    andb 5(%r10), %al
1518; CHECK-SSE1-NEXT:    xorb %r13b, %al
1519; CHECK-SSE1-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
1520; CHECK-SSE1-NEXT:    movzbl 6(%r9), %eax
1521; CHECK-SSE1-NEXT:    xorb %r12b, %al
1522; CHECK-SSE1-NEXT:    andb 6(%r10), %al
1523; CHECK-SSE1-NEXT:    xorb %r12b, %al
1524; CHECK-SSE1-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
1525; CHECK-SSE1-NEXT:    movzbl 7(%r9), %eax
1526; CHECK-SSE1-NEXT:    xorb %r15b, %al
1527; CHECK-SSE1-NEXT:    andb 7(%r10), %al
1528; CHECK-SSE1-NEXT:    xorb %r15b, %al
1529; CHECK-SSE1-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
1530; CHECK-SSE1-NEXT:    movzbl 8(%r9), %eax
1531; CHECK-SSE1-NEXT:    xorb %r14b, %al
1532; CHECK-SSE1-NEXT:    andb 8(%r10), %al
1533; CHECK-SSE1-NEXT:    xorb %r14b, %al
1534; CHECK-SSE1-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
1535; CHECK-SSE1-NEXT:    movzbl 9(%r9), %eax
1536; CHECK-SSE1-NEXT:    xorb %bpl, %al
1537; CHECK-SSE1-NEXT:    andb 9(%r10), %al
1538; CHECK-SSE1-NEXT:    xorb %bpl, %al
1539; CHECK-SSE1-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
1540; CHECK-SSE1-NEXT:    movzbl 10(%r9), %eax
1541; CHECK-SSE1-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
1542; CHECK-SSE1-NEXT:    xorb %cl, %al
1543; CHECK-SSE1-NEXT:    andb 10(%r10), %al
1544; CHECK-SSE1-NEXT:    xorb %cl, %al
1545; CHECK-SSE1-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
1546; CHECK-SSE1-NEXT:    movzbl 11(%r9), %eax
1547; CHECK-SSE1-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
1548; CHECK-SSE1-NEXT:    xorb %cl, %al
1549; CHECK-SSE1-NEXT:    andb 11(%r10), %al
1550; CHECK-SSE1-NEXT:    xorb %cl, %al
1551; CHECK-SSE1-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
1552; CHECK-SSE1-NEXT:    movzbl 12(%r9), %eax
1553; CHECK-SSE1-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
1554; CHECK-SSE1-NEXT:    xorb %cl, %al
1555; CHECK-SSE1-NEXT:    andb 12(%r10), %al
1556; CHECK-SSE1-NEXT:    xorb %cl, %al
1557; CHECK-SSE1-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
1558; CHECK-SSE1-NEXT:    movzbl 13(%r9), %eax
1559; CHECK-SSE1-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
1560; CHECK-SSE1-NEXT:    xorb %cl, %al
1561; CHECK-SSE1-NEXT:    andb 13(%r10), %al
1562; CHECK-SSE1-NEXT:    xorb %cl, %al
1563; CHECK-SSE1-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
1564; CHECK-SSE1-NEXT:    movzbl 14(%r9), %eax
1565; CHECK-SSE1-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
1566; CHECK-SSE1-NEXT:    xorb %cl, %al
1567; CHECK-SSE1-NEXT:    andb 14(%r10), %al
1568; CHECK-SSE1-NEXT:    xorb %cl, %al
1569; CHECK-SSE1-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
1570; CHECK-SSE1-NEXT:    movzbl 15(%r9), %eax
1571; CHECK-SSE1-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
1572; CHECK-SSE1-NEXT:    xorb %cl, %al
1573; CHECK-SSE1-NEXT:    andb 15(%r10), %al
1574; CHECK-SSE1-NEXT:    xorb %cl, %al
1575; CHECK-SSE1-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
1576; CHECK-SSE1-NEXT:    movzbl 16(%r8), %eax
1577; CHECK-SSE1-NEXT:    movzbl 16(%r9), %ecx
1578; CHECK-SSE1-NEXT:    xorb %al, %cl
1579; CHECK-SSE1-NEXT:    andb 16(%r10), %cl
1580; CHECK-SSE1-NEXT:    xorb %al, %cl
1581; CHECK-SSE1-NEXT:    movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
1582; CHECK-SSE1-NEXT:    movzbl 17(%r8), %eax
1583; CHECK-SSE1-NEXT:    movzbl 17(%r9), %ecx
1584; CHECK-SSE1-NEXT:    xorb %al, %cl
1585; CHECK-SSE1-NEXT:    andb 17(%r10), %cl
1586; CHECK-SSE1-NEXT:    xorb %al, %cl
1587; CHECK-SSE1-NEXT:    movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
1588; CHECK-SSE1-NEXT:    movzbl 18(%r8), %eax
1589; CHECK-SSE1-NEXT:    movzbl 18(%r9), %ecx
1590; CHECK-SSE1-NEXT:    xorb %al, %cl
1591; CHECK-SSE1-NEXT:    andb 18(%r10), %cl
1592; CHECK-SSE1-NEXT:    xorb %al, %cl
1593; CHECK-SSE1-NEXT:    movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
1594; CHECK-SSE1-NEXT:    movzbl 19(%r8), %eax
1595; CHECK-SSE1-NEXT:    movzbl 19(%r9), %ecx
1596; CHECK-SSE1-NEXT:    xorb %al, %cl
1597; CHECK-SSE1-NEXT:    andb 19(%r10), %cl
1598; CHECK-SSE1-NEXT:    xorb %al, %cl
1599; CHECK-SSE1-NEXT:    movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
1600; CHECK-SSE1-NEXT:    movzbl 20(%r8), %eax
1601; CHECK-SSE1-NEXT:    movzbl 20(%r9), %ecx
1602; CHECK-SSE1-NEXT:    xorb %al, %cl
1603; CHECK-SSE1-NEXT:    andb 20(%r10), %cl
1604; CHECK-SSE1-NEXT:    xorb %al, %cl
1605; CHECK-SSE1-NEXT:    movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
1606; CHECK-SSE1-NEXT:    movzbl 21(%r8), %eax
1607; CHECK-SSE1-NEXT:    movzbl 21(%r9), %r13d
1608; CHECK-SSE1-NEXT:    xorb %al, %r13b
1609; CHECK-SSE1-NEXT:    andb 21(%r10), %r13b
1610; CHECK-SSE1-NEXT:    xorb %al, %r13b
1611; CHECK-SSE1-NEXT:    movzbl 22(%r8), %eax
1612; CHECK-SSE1-NEXT:    movzbl 22(%r9), %r12d
1613; CHECK-SSE1-NEXT:    xorb %al, %r12b
1614; CHECK-SSE1-NEXT:    andb 22(%r10), %r12b
1615; CHECK-SSE1-NEXT:    xorb %al, %r12b
1616; CHECK-SSE1-NEXT:    movzbl 23(%r8), %eax
1617; CHECK-SSE1-NEXT:    movzbl 23(%r9), %r15d
1618; CHECK-SSE1-NEXT:    xorb %al, %r15b
1619; CHECK-SSE1-NEXT:    andb 23(%r10), %r15b
1620; CHECK-SSE1-NEXT:    xorb %al, %r15b
1621; CHECK-SSE1-NEXT:    movzbl 24(%r8), %eax
1622; CHECK-SSE1-NEXT:    movzbl 24(%r9), %r14d
1623; CHECK-SSE1-NEXT:    xorb %al, %r14b
1624; CHECK-SSE1-NEXT:    andb 24(%r10), %r14b
1625; CHECK-SSE1-NEXT:    xorb %al, %r14b
1626; CHECK-SSE1-NEXT:    movzbl 25(%r8), %eax
1627; CHECK-SSE1-NEXT:    movzbl 25(%r9), %ebp
1628; CHECK-SSE1-NEXT:    xorb %al, %bpl
1629; CHECK-SSE1-NEXT:    andb 25(%r10), %bpl
1630; CHECK-SSE1-NEXT:    xorb %al, %bpl
1631; CHECK-SSE1-NEXT:    movzbl 26(%r8), %eax
1632; CHECK-SSE1-NEXT:    movzbl 26(%r9), %edi
1633; CHECK-SSE1-NEXT:    xorb %al, %dil
1634; CHECK-SSE1-NEXT:    andb 26(%r10), %dil
1635; CHECK-SSE1-NEXT:    xorb %al, %dil
1636; CHECK-SSE1-NEXT:    movzbl 27(%r8), %eax
1637; CHECK-SSE1-NEXT:    movzbl 27(%r9), %esi
1638; CHECK-SSE1-NEXT:    xorb %al, %sil
1639; CHECK-SSE1-NEXT:    andb 27(%r10), %sil
1640; CHECK-SSE1-NEXT:    xorb %al, %sil
1641; CHECK-SSE1-NEXT:    movzbl 28(%r8), %eax
1642; CHECK-SSE1-NEXT:    movzbl 28(%r9), %edx
1643; CHECK-SSE1-NEXT:    xorb %al, %dl
1644; CHECK-SSE1-NEXT:    andb 28(%r10), %dl
1645; CHECK-SSE1-NEXT:    xorb %al, %dl
1646; CHECK-SSE1-NEXT:    movzbl 29(%r8), %eax
1647; CHECK-SSE1-NEXT:    movzbl 29(%r9), %ecx
1648; CHECK-SSE1-NEXT:    xorb %al, %cl
1649; CHECK-SSE1-NEXT:    andb 29(%r10), %cl
1650; CHECK-SSE1-NEXT:    xorb %al, %cl
1651; CHECK-SSE1-NEXT:    movzbl 30(%r8), %ebx
1652; CHECK-SSE1-NEXT:    movzbl 30(%r9), %eax
1653; CHECK-SSE1-NEXT:    xorb %bl, %al
1654; CHECK-SSE1-NEXT:    andb 30(%r10), %al
1655; CHECK-SSE1-NEXT:    xorb %bl, %al
1656; CHECK-SSE1-NEXT:    movzbl 31(%r8), %r8d
1657; CHECK-SSE1-NEXT:    movzbl 31(%r9), %r9d
1658; CHECK-SSE1-NEXT:    xorb %r8b, %r9b
1659; CHECK-SSE1-NEXT:    andb 31(%r10), %r9b
1660; CHECK-SSE1-NEXT:    xorb %r8b, %r9b
1661; CHECK-SSE1-NEXT:    movb %r9b, 31(%r11)
1662; CHECK-SSE1-NEXT:    movb %al, 30(%r11)
1663; CHECK-SSE1-NEXT:    movb %cl, 29(%r11)
1664; CHECK-SSE1-NEXT:    movb %dl, 28(%r11)
1665; CHECK-SSE1-NEXT:    movb %sil, 27(%r11)
1666; CHECK-SSE1-NEXT:    movb %dil, 26(%r11)
1667; CHECK-SSE1-NEXT:    movb %bpl, 25(%r11)
1668; CHECK-SSE1-NEXT:    movb %r14b, 24(%r11)
1669; CHECK-SSE1-NEXT:    movb %r15b, 23(%r11)
1670; CHECK-SSE1-NEXT:    movb %r12b, 22(%r11)
1671; CHECK-SSE1-NEXT:    movb %r13b, 21(%r11)
1672; CHECK-SSE1-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
1673; CHECK-SSE1-NEXT:    movb %al, 20(%r11)
1674; CHECK-SSE1-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
1675; CHECK-SSE1-NEXT:    movb %al, 19(%r11)
1676; CHECK-SSE1-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
1677; CHECK-SSE1-NEXT:    movb %al, 18(%r11)
1678; CHECK-SSE1-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
1679; CHECK-SSE1-NEXT:    movb %al, 17(%r11)
1680; CHECK-SSE1-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
1681; CHECK-SSE1-NEXT:    movb %al, 16(%r11)
1682; CHECK-SSE1-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
1683; CHECK-SSE1-NEXT:    movb %al, 15(%r11)
1684; CHECK-SSE1-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
1685; CHECK-SSE1-NEXT:    movb %al, 14(%r11)
1686; CHECK-SSE1-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
1687; CHECK-SSE1-NEXT:    movb %al, 13(%r11)
1688; CHECK-SSE1-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
1689; CHECK-SSE1-NEXT:    movb %al, 12(%r11)
1690; CHECK-SSE1-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
1691; CHECK-SSE1-NEXT:    movb %al, 11(%r11)
1692; CHECK-SSE1-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
1693; CHECK-SSE1-NEXT:    movb %al, 10(%r11)
1694; CHECK-SSE1-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
1695; CHECK-SSE1-NEXT:    movb %al, 9(%r11)
1696; CHECK-SSE1-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
1697; CHECK-SSE1-NEXT:    movb %al, 8(%r11)
1698; CHECK-SSE1-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
1699; CHECK-SSE1-NEXT:    movb %al, 7(%r11)
1700; CHECK-SSE1-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
1701; CHECK-SSE1-NEXT:    movb %al, 6(%r11)
1702; CHECK-SSE1-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
1703; CHECK-SSE1-NEXT:    movb %al, 5(%r11)
1704; CHECK-SSE1-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
1705; CHECK-SSE1-NEXT:    movb %al, 4(%r11)
1706; CHECK-SSE1-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
1707; CHECK-SSE1-NEXT:    movb %al, 3(%r11)
1708; CHECK-SSE1-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
1709; CHECK-SSE1-NEXT:    movb %al, 2(%r11)
1710; CHECK-SSE1-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
1711; CHECK-SSE1-NEXT:    movb %al, 1(%r11)
1712; CHECK-SSE1-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
1713; CHECK-SSE1-NEXT:    movb %al, (%r11)
1714; CHECK-SSE1-NEXT:    movq %r11, %rax
1715; CHECK-SSE1-NEXT:    popq %rbx
1716; CHECK-SSE1-NEXT:    popq %r12
1717; CHECK-SSE1-NEXT:    popq %r13
1718; CHECK-SSE1-NEXT:    popq %r14
1719; CHECK-SSE1-NEXT:    popq %r15
1720; CHECK-SSE1-NEXT:    popq %rbp
1721; CHECK-SSE1-NEXT:    retq
1722;
1723; CHECK-SSE2-LABEL: out_v32i8:
1724; CHECK-SSE2:       # %bb.0:
1725; CHECK-SSE2-NEXT:    movaps (%rdx), %xmm0
1726; CHECK-SSE2-NEXT:    movaps 16(%rdx), %xmm1
1727; CHECK-SSE2-NEXT:    movaps 16(%rdi), %xmm2
1728; CHECK-SSE2-NEXT:    andps %xmm1, %xmm2
1729; CHECK-SSE2-NEXT:    movaps (%rdi), %xmm3
1730; CHECK-SSE2-NEXT:    andps %xmm0, %xmm3
1731; CHECK-SSE2-NEXT:    andnps 16(%rsi), %xmm1
1732; CHECK-SSE2-NEXT:    orps %xmm2, %xmm1
1733; CHECK-SSE2-NEXT:    andnps (%rsi), %xmm0
1734; CHECK-SSE2-NEXT:    orps %xmm3, %xmm0
1735; CHECK-SSE2-NEXT:    retq
1736;
1737; CHECK-XOP-LABEL: out_v32i8:
1738; CHECK-XOP:       # %bb.0:
1739; CHECK-XOP-NEXT:    vmovdqa (%rdi), %ymm0
1740; CHECK-XOP-NEXT:    vmovdqa (%rdx), %ymm1
1741; CHECK-XOP-NEXT:    vpcmov %ymm1, (%rsi), %ymm0, %ymm0
1742; CHECK-XOP-NEXT:    retq
1743  %x = load <32 x i8>, ptr%px, align 32
1744  %y = load <32 x i8>, ptr%py, align 32
1745  %mask = load <32 x i8>, ptr%pmask, align 32
1746  %mx = and <32 x i8> %x, %mask
1747  %notmask = xor <32 x i8> %mask, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
1748  %my = and <32 x i8> %y, %notmask
1749  %r = or <32 x i8> %mx, %my
1750  ret <32 x i8> %r
1751}
1752
; out_v16i16: masked merge of two 256-bit vectors, r = (x & mask) | (y & ~mask).
; All three <16 x i16> operands are loaded from memory with 32-byte alignment.
; Lowering pinned by the assertions below, per RUN configuration:
;   * baseline (-sse,-sse2) and SSE1 (+sse,-sse2): scalarized; every i16 lane
;     is produced by an xor / and / xor sequence, written out with 16-bit
;     stores through %rdi, and %rdi is copied to %rax before returning.
;   * SSE2: two 128-bit halves, each handled with andps / andnps / orps,
;     ending in %xmm1 and %xmm0.
;   * XOP: two ymm loads followed by a single 256-bit vpcmov.
; The assertion lines are autogenerated by utils/update_llc_test_checks.py;
; regenerate them with that script instead of editing them by hand.
1753define <16 x i16> @out_v16i16(ptr%px, ptr%py, ptr%pmask) nounwind {
1754; CHECK-BASELINE-LABEL: out_v16i16:
1755; CHECK-BASELINE:       # %bb.0:
1756; CHECK-BASELINE-NEXT:    pushq %rbp
1757; CHECK-BASELINE-NEXT:    pushq %r15
1758; CHECK-BASELINE-NEXT:    pushq %r14
1759; CHECK-BASELINE-NEXT:    pushq %r13
1760; CHECK-BASELINE-NEXT:    pushq %r12
1761; CHECK-BASELINE-NEXT:    pushq %rbx
1762; CHECK-BASELINE-NEXT:    movzwl 18(%rdx), %r15d
1763; CHECK-BASELINE-NEXT:    movzwl 16(%rdx), %r14d
1764; CHECK-BASELINE-NEXT:    movzwl 14(%rdx), %ebp
1765; CHECK-BASELINE-NEXT:    movzwl 12(%rdx), %ebx
1766; CHECK-BASELINE-NEXT:    movzwl 10(%rdx), %r13d
1767; CHECK-BASELINE-NEXT:    movzwl 8(%rdx), %r11d
1768; CHECK-BASELINE-NEXT:    movzwl 6(%rdx), %r10d
1769; CHECK-BASELINE-NEXT:    movzwl 4(%rdx), %r9d
1770; CHECK-BASELINE-NEXT:    movzwl (%rdx), %r8d
1771; CHECK-BASELINE-NEXT:    movzwl 2(%rdx), %r12d
1772; CHECK-BASELINE-NEXT:    movzwl (%rsi), %eax
1773; CHECK-BASELINE-NEXT:    xorw %r8w, %ax
1774; CHECK-BASELINE-NEXT:    andw (%rcx), %ax
1775; CHECK-BASELINE-NEXT:    xorl %eax, %r8d
1776; CHECK-BASELINE-NEXT:    movl %r8d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
1777; CHECK-BASELINE-NEXT:    movzwl 2(%rsi), %eax
1778; CHECK-BASELINE-NEXT:    xorw %r12w, %ax
1779; CHECK-BASELINE-NEXT:    andw 2(%rcx), %ax
1780; CHECK-BASELINE-NEXT:    xorl %eax, %r12d
1781; CHECK-BASELINE-NEXT:    movzwl 4(%rsi), %eax
1782; CHECK-BASELINE-NEXT:    xorw %r9w, %ax
1783; CHECK-BASELINE-NEXT:    andw 4(%rcx), %ax
1784; CHECK-BASELINE-NEXT:    xorl %eax, %r9d
1785; CHECK-BASELINE-NEXT:    movl %r9d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
1786; CHECK-BASELINE-NEXT:    movzwl 6(%rsi), %eax
1787; CHECK-BASELINE-NEXT:    xorw %r10w, %ax
1788; CHECK-BASELINE-NEXT:    andw 6(%rcx), %ax
1789; CHECK-BASELINE-NEXT:    xorl %eax, %r10d
1790; CHECK-BASELINE-NEXT:    movl %r10d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
1791; CHECK-BASELINE-NEXT:    movzwl 8(%rsi), %eax
1792; CHECK-BASELINE-NEXT:    xorw %r11w, %ax
1793; CHECK-BASELINE-NEXT:    andw 8(%rcx), %ax
1794; CHECK-BASELINE-NEXT:    xorl %eax, %r11d
1795; CHECK-BASELINE-NEXT:    movl %r11d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
1796; CHECK-BASELINE-NEXT:    movzwl 10(%rsi), %eax
1797; CHECK-BASELINE-NEXT:    xorw %r13w, %ax
1798; CHECK-BASELINE-NEXT:    andw 10(%rcx), %ax
1799; CHECK-BASELINE-NEXT:    xorl %eax, %r13d
1800; CHECK-BASELINE-NEXT:    movl %r13d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
1801; CHECK-BASELINE-NEXT:    movzwl 12(%rsi), %eax
1802; CHECK-BASELINE-NEXT:    xorw %bx, %ax
1803; CHECK-BASELINE-NEXT:    andw 12(%rcx), %ax
1804; CHECK-BASELINE-NEXT:    xorl %eax, %ebx
1805; CHECK-BASELINE-NEXT:    movzwl 14(%rsi), %eax
1806; CHECK-BASELINE-NEXT:    xorw %bp, %ax
1807; CHECK-BASELINE-NEXT:    andw 14(%rcx), %ax
1808; CHECK-BASELINE-NEXT:    xorl %eax, %ebp
1809; CHECK-BASELINE-NEXT:    movzwl 16(%rsi), %eax
1810; CHECK-BASELINE-NEXT:    xorw %r14w, %ax
1811; CHECK-BASELINE-NEXT:    andw 16(%rcx), %ax
1812; CHECK-BASELINE-NEXT:    xorl %eax, %r14d
1813; CHECK-BASELINE-NEXT:    movzwl 18(%rsi), %eax
1814; CHECK-BASELINE-NEXT:    xorw %r15w, %ax
1815; CHECK-BASELINE-NEXT:    andw 18(%rcx), %ax
1816; CHECK-BASELINE-NEXT:    xorl %eax, %r15d
1817; CHECK-BASELINE-NEXT:    movzwl 20(%rdx), %r13d
1818; CHECK-BASELINE-NEXT:    movzwl 20(%rsi), %eax
1819; CHECK-BASELINE-NEXT:    xorw %r13w, %ax
1820; CHECK-BASELINE-NEXT:    andw 20(%rcx), %ax
1821; CHECK-BASELINE-NEXT:    xorl %eax, %r13d
1822; CHECK-BASELINE-NEXT:    movzwl 22(%rdx), %r9d
1823; CHECK-BASELINE-NEXT:    movzwl 22(%rsi), %eax
1824; CHECK-BASELINE-NEXT:    xorw %r9w, %ax
1825; CHECK-BASELINE-NEXT:    andw 22(%rcx), %ax
1826; CHECK-BASELINE-NEXT:    xorl %eax, %r9d
1827; CHECK-BASELINE-NEXT:    movzwl 24(%rdx), %r8d
1828; CHECK-BASELINE-NEXT:    movzwl 24(%rsi), %eax
1829; CHECK-BASELINE-NEXT:    xorw %r8w, %ax
1830; CHECK-BASELINE-NEXT:    andw 24(%rcx), %ax
1831; CHECK-BASELINE-NEXT:    xorl %eax, %r8d
1832; CHECK-BASELINE-NEXT:    movzwl 26(%rdx), %eax
1833; CHECK-BASELINE-NEXT:    movzwl 26(%rsi), %r10d
1834; CHECK-BASELINE-NEXT:    xorw %ax, %r10w
1835; CHECK-BASELINE-NEXT:    andw 26(%rcx), %r10w
1836; CHECK-BASELINE-NEXT:    xorl %r10d, %eax
1837; CHECK-BASELINE-NEXT:    movzwl 28(%rdx), %r10d
1838; CHECK-BASELINE-NEXT:    movzwl 28(%rsi), %r11d
1839; CHECK-BASELINE-NEXT:    xorw %r10w, %r11w
1840; CHECK-BASELINE-NEXT:    andw 28(%rcx), %r11w
1841; CHECK-BASELINE-NEXT:    xorl %r11d, %r10d
1842; CHECK-BASELINE-NEXT:    movzwl 30(%rdx), %edx
1843; CHECK-BASELINE-NEXT:    movzwl 30(%rsi), %esi
1844; CHECK-BASELINE-NEXT:    xorw %dx, %si
1845; CHECK-BASELINE-NEXT:    andw 30(%rcx), %si
1846; CHECK-BASELINE-NEXT:    xorl %esi, %edx
1847; CHECK-BASELINE-NEXT:    movw %dx, 30(%rdi)
1848; CHECK-BASELINE-NEXT:    movw %r10w, 28(%rdi)
1849; CHECK-BASELINE-NEXT:    movw %ax, 26(%rdi)
1850; CHECK-BASELINE-NEXT:    movw %r8w, 24(%rdi)
1851; CHECK-BASELINE-NEXT:    movw %r9w, 22(%rdi)
1852; CHECK-BASELINE-NEXT:    movw %r13w, 20(%rdi)
1853; CHECK-BASELINE-NEXT:    movw %r15w, 18(%rdi)
1854; CHECK-BASELINE-NEXT:    movw %r14w, 16(%rdi)
1855; CHECK-BASELINE-NEXT:    movw %bp, 14(%rdi)
1856; CHECK-BASELINE-NEXT:    movw %bx, 12(%rdi)
1857; CHECK-BASELINE-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
1858; CHECK-BASELINE-NEXT:    movw %ax, 10(%rdi)
1859; CHECK-BASELINE-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
1860; CHECK-BASELINE-NEXT:    movw %ax, 8(%rdi)
1861; CHECK-BASELINE-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
1862; CHECK-BASELINE-NEXT:    movw %ax, 6(%rdi)
1863; CHECK-BASELINE-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
1864; CHECK-BASELINE-NEXT:    movw %ax, 4(%rdi)
1865; CHECK-BASELINE-NEXT:    movw %r12w, 2(%rdi)
1866; CHECK-BASELINE-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
1867; CHECK-BASELINE-NEXT:    movw %ax, (%rdi)
1868; CHECK-BASELINE-NEXT:    movq %rdi, %rax
1869; CHECK-BASELINE-NEXT:    popq %rbx
1870; CHECK-BASELINE-NEXT:    popq %r12
1871; CHECK-BASELINE-NEXT:    popq %r13
1872; CHECK-BASELINE-NEXT:    popq %r14
1873; CHECK-BASELINE-NEXT:    popq %r15
1874; CHECK-BASELINE-NEXT:    popq %rbp
1875; CHECK-BASELINE-NEXT:    retq
1876;
1877; CHECK-SSE1-LABEL: out_v16i16:
1878; CHECK-SSE1:       # %bb.0:
1879; CHECK-SSE1-NEXT:    pushq %rbp
1880; CHECK-SSE1-NEXT:    pushq %r15
1881; CHECK-SSE1-NEXT:    pushq %r14
1882; CHECK-SSE1-NEXT:    pushq %r13
1883; CHECK-SSE1-NEXT:    pushq %r12
1884; CHECK-SSE1-NEXT:    pushq %rbx
1885; CHECK-SSE1-NEXT:    movzwl 18(%rdx), %r15d
1886; CHECK-SSE1-NEXT:    movzwl 16(%rdx), %r14d
1887; CHECK-SSE1-NEXT:    movzwl 14(%rdx), %ebp
1888; CHECK-SSE1-NEXT:    movzwl 12(%rdx), %ebx
1889; CHECK-SSE1-NEXT:    movzwl 10(%rdx), %r13d
1890; CHECK-SSE1-NEXT:    movzwl 8(%rdx), %r11d
1891; CHECK-SSE1-NEXT:    movzwl 6(%rdx), %r10d
1892; CHECK-SSE1-NEXT:    movzwl 4(%rdx), %r9d
1893; CHECK-SSE1-NEXT:    movzwl (%rdx), %r8d
1894; CHECK-SSE1-NEXT:    movzwl 2(%rdx), %r12d
1895; CHECK-SSE1-NEXT:    movzwl (%rsi), %eax
1896; CHECK-SSE1-NEXT:    xorw %r8w, %ax
1897; CHECK-SSE1-NEXT:    andw (%rcx), %ax
1898; CHECK-SSE1-NEXT:    xorl %eax, %r8d
1899; CHECK-SSE1-NEXT:    movl %r8d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
1900; CHECK-SSE1-NEXT:    movzwl 2(%rsi), %eax
1901; CHECK-SSE1-NEXT:    xorw %r12w, %ax
1902; CHECK-SSE1-NEXT:    andw 2(%rcx), %ax
1903; CHECK-SSE1-NEXT:    xorl %eax, %r12d
1904; CHECK-SSE1-NEXT:    movzwl 4(%rsi), %eax
1905; CHECK-SSE1-NEXT:    xorw %r9w, %ax
1906; CHECK-SSE1-NEXT:    andw 4(%rcx), %ax
1907; CHECK-SSE1-NEXT:    xorl %eax, %r9d
1908; CHECK-SSE1-NEXT:    movl %r9d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
1909; CHECK-SSE1-NEXT:    movzwl 6(%rsi), %eax
1910; CHECK-SSE1-NEXT:    xorw %r10w, %ax
1911; CHECK-SSE1-NEXT:    andw 6(%rcx), %ax
1912; CHECK-SSE1-NEXT:    xorl %eax, %r10d
1913; CHECK-SSE1-NEXT:    movl %r10d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
1914; CHECK-SSE1-NEXT:    movzwl 8(%rsi), %eax
1915; CHECK-SSE1-NEXT:    xorw %r11w, %ax
1916; CHECK-SSE1-NEXT:    andw 8(%rcx), %ax
1917; CHECK-SSE1-NEXT:    xorl %eax, %r11d
1918; CHECK-SSE1-NEXT:    movl %r11d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
1919; CHECK-SSE1-NEXT:    movzwl 10(%rsi), %eax
1920; CHECK-SSE1-NEXT:    xorw %r13w, %ax
1921; CHECK-SSE1-NEXT:    andw 10(%rcx), %ax
1922; CHECK-SSE1-NEXT:    xorl %eax, %r13d
1923; CHECK-SSE1-NEXT:    movl %r13d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
1924; CHECK-SSE1-NEXT:    movzwl 12(%rsi), %eax
1925; CHECK-SSE1-NEXT:    xorw %bx, %ax
1926; CHECK-SSE1-NEXT:    andw 12(%rcx), %ax
1927; CHECK-SSE1-NEXT:    xorl %eax, %ebx
1928; CHECK-SSE1-NEXT:    movzwl 14(%rsi), %eax
1929; CHECK-SSE1-NEXT:    xorw %bp, %ax
1930; CHECK-SSE1-NEXT:    andw 14(%rcx), %ax
1931; CHECK-SSE1-NEXT:    xorl %eax, %ebp
1932; CHECK-SSE1-NEXT:    movzwl 16(%rsi), %eax
1933; CHECK-SSE1-NEXT:    xorw %r14w, %ax
1934; CHECK-SSE1-NEXT:    andw 16(%rcx), %ax
1935; CHECK-SSE1-NEXT:    xorl %eax, %r14d
1936; CHECK-SSE1-NEXT:    movzwl 18(%rsi), %eax
1937; CHECK-SSE1-NEXT:    xorw %r15w, %ax
1938; CHECK-SSE1-NEXT:    andw 18(%rcx), %ax
1939; CHECK-SSE1-NEXT:    xorl %eax, %r15d
1940; CHECK-SSE1-NEXT:    movzwl 20(%rdx), %r13d
1941; CHECK-SSE1-NEXT:    movzwl 20(%rsi), %eax
1942; CHECK-SSE1-NEXT:    xorw %r13w, %ax
1943; CHECK-SSE1-NEXT:    andw 20(%rcx), %ax
1944; CHECK-SSE1-NEXT:    xorl %eax, %r13d
1945; CHECK-SSE1-NEXT:    movzwl 22(%rdx), %r9d
1946; CHECK-SSE1-NEXT:    movzwl 22(%rsi), %eax
1947; CHECK-SSE1-NEXT:    xorw %r9w, %ax
1948; CHECK-SSE1-NEXT:    andw 22(%rcx), %ax
1949; CHECK-SSE1-NEXT:    xorl %eax, %r9d
1950; CHECK-SSE1-NEXT:    movzwl 24(%rdx), %r8d
1951; CHECK-SSE1-NEXT:    movzwl 24(%rsi), %eax
1952; CHECK-SSE1-NEXT:    xorw %r8w, %ax
1953; CHECK-SSE1-NEXT:    andw 24(%rcx), %ax
1954; CHECK-SSE1-NEXT:    xorl %eax, %r8d
1955; CHECK-SSE1-NEXT:    movzwl 26(%rdx), %eax
1956; CHECK-SSE1-NEXT:    movzwl 26(%rsi), %r10d
1957; CHECK-SSE1-NEXT:    xorw %ax, %r10w
1958; CHECK-SSE1-NEXT:    andw 26(%rcx), %r10w
1959; CHECK-SSE1-NEXT:    xorl %r10d, %eax
1960; CHECK-SSE1-NEXT:    movzwl 28(%rdx), %r10d
1961; CHECK-SSE1-NEXT:    movzwl 28(%rsi), %r11d
1962; CHECK-SSE1-NEXT:    xorw %r10w, %r11w
1963; CHECK-SSE1-NEXT:    andw 28(%rcx), %r11w
1964; CHECK-SSE1-NEXT:    xorl %r11d, %r10d
1965; CHECK-SSE1-NEXT:    movzwl 30(%rdx), %edx
1966; CHECK-SSE1-NEXT:    movzwl 30(%rsi), %esi
1967; CHECK-SSE1-NEXT:    xorw %dx, %si
1968; CHECK-SSE1-NEXT:    andw 30(%rcx), %si
1969; CHECK-SSE1-NEXT:    xorl %esi, %edx
1970; CHECK-SSE1-NEXT:    movw %dx, 30(%rdi)
1971; CHECK-SSE1-NEXT:    movw %r10w, 28(%rdi)
1972; CHECK-SSE1-NEXT:    movw %ax, 26(%rdi)
1973; CHECK-SSE1-NEXT:    movw %r8w, 24(%rdi)
1974; CHECK-SSE1-NEXT:    movw %r9w, 22(%rdi)
1975; CHECK-SSE1-NEXT:    movw %r13w, 20(%rdi)
1976; CHECK-SSE1-NEXT:    movw %r15w, 18(%rdi)
1977; CHECK-SSE1-NEXT:    movw %r14w, 16(%rdi)
1978; CHECK-SSE1-NEXT:    movw %bp, 14(%rdi)
1979; CHECK-SSE1-NEXT:    movw %bx, 12(%rdi)
1980; CHECK-SSE1-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
1981; CHECK-SSE1-NEXT:    movw %ax, 10(%rdi)
1982; CHECK-SSE1-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
1983; CHECK-SSE1-NEXT:    movw %ax, 8(%rdi)
1984; CHECK-SSE1-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
1985; CHECK-SSE1-NEXT:    movw %ax, 6(%rdi)
1986; CHECK-SSE1-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
1987; CHECK-SSE1-NEXT:    movw %ax, 4(%rdi)
1988; CHECK-SSE1-NEXT:    movw %r12w, 2(%rdi)
1989; CHECK-SSE1-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
1990; CHECK-SSE1-NEXT:    movw %ax, (%rdi)
1991; CHECK-SSE1-NEXT:    movq %rdi, %rax
1992; CHECK-SSE1-NEXT:    popq %rbx
1993; CHECK-SSE1-NEXT:    popq %r12
1994; CHECK-SSE1-NEXT:    popq %r13
1995; CHECK-SSE1-NEXT:    popq %r14
1996; CHECK-SSE1-NEXT:    popq %r15
1997; CHECK-SSE1-NEXT:    popq %rbp
1998; CHECK-SSE1-NEXT:    retq
1999;
2000; CHECK-SSE2-LABEL: out_v16i16:
2001; CHECK-SSE2:       # %bb.0:
2002; CHECK-SSE2-NEXT:    movaps (%rdx), %xmm0
2003; CHECK-SSE2-NEXT:    movaps 16(%rdx), %xmm1
2004; CHECK-SSE2-NEXT:    movaps 16(%rdi), %xmm2
2005; CHECK-SSE2-NEXT:    andps %xmm1, %xmm2
2006; CHECK-SSE2-NEXT:    movaps (%rdi), %xmm3
2007; CHECK-SSE2-NEXT:    andps %xmm0, %xmm3
2008; CHECK-SSE2-NEXT:    andnps 16(%rsi), %xmm1
2009; CHECK-SSE2-NEXT:    orps %xmm2, %xmm1
2010; CHECK-SSE2-NEXT:    andnps (%rsi), %xmm0
2011; CHECK-SSE2-NEXT:    orps %xmm3, %xmm0
2012; CHECK-SSE2-NEXT:    retq
2013;
2014; CHECK-XOP-LABEL: out_v16i16:
2015; CHECK-XOP:       # %bb.0:
2016; CHECK-XOP-NEXT:    vmovdqa (%rdi), %ymm0
2017; CHECK-XOP-NEXT:    vmovdqa (%rdx), %ymm1
2018; CHECK-XOP-NEXT:    vpcmov %ymm1, (%rsi), %ymm0, %ymm0
2019; CHECK-XOP-NEXT:    retq
; Load both data vectors and the variable (non-constant) mask from memory.
2020  %x = load <16 x i16>, ptr%px, align 32
2021  %y = load <16 x i16>, ptr%py, align 32
2022  %mask = load <16 x i16>, ptr%pmask, align 32
; Keep the bits of %x where the mask is set.
2023  %mx = and <16 x i16> %x, %mask
; xor with all-ones is the bitwise NOT of the mask.
2024  %notmask = xor <16 x i16> %mask, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
; Keep the bits of %y where the mask is clear, then combine the two halves.
2025  %my = and <16 x i16> %y, %notmask
2026  %r = or <16 x i16> %mx, %my
2027  ret <16 x i16> %r
2028}
2029
; Masked merge, "out" form, on <8 x i32> (256 bits): r = (x & mask) | (y & ~mask).
; All three operands are loaded from memory through %px, %py and %pmask.
; The BASELINE and SSE1 checks expect eight scalar xor/and/xor sequences, the
; SSE2 checks expect andps/andnps/orps on two 128-bit halves, and the XOP
; checks expect a single 256-bit vpcmov.
2030define <8 x i32> @out_v8i32(ptr%px, ptr%py, ptr%pmask) nounwind {
2031; CHECK-BASELINE-LABEL: out_v8i32:
2032; CHECK-BASELINE:       # %bb.0:
2033; CHECK-BASELINE-NEXT:    pushq %rbp
2034; CHECK-BASELINE-NEXT:    pushq %r14
2035; CHECK-BASELINE-NEXT:    pushq %rbx
2036; CHECK-BASELINE-NEXT:    movq %rdi, %rax
2037; CHECK-BASELINE-NEXT:    movl 28(%rdx), %edi
2038; CHECK-BASELINE-NEXT:    movl 24(%rdx), %r8d
2039; CHECK-BASELINE-NEXT:    movl 20(%rdx), %r10d
2040; CHECK-BASELINE-NEXT:    movl 16(%rdx), %ebx
2041; CHECK-BASELINE-NEXT:    movl 12(%rdx), %r14d
2042; CHECK-BASELINE-NEXT:    movl 8(%rdx), %ebp
2043; CHECK-BASELINE-NEXT:    movl (%rdx), %r9d
2044; CHECK-BASELINE-NEXT:    movl 4(%rdx), %r11d
2045; CHECK-BASELINE-NEXT:    movl (%rsi), %edx
2046; CHECK-BASELINE-NEXT:    xorl %r9d, %edx
2047; CHECK-BASELINE-NEXT:    andl (%rcx), %edx
2048; CHECK-BASELINE-NEXT:    xorl %r9d, %edx
2049; CHECK-BASELINE-NEXT:    movl 4(%rsi), %r9d
2050; CHECK-BASELINE-NEXT:    xorl %r11d, %r9d
2051; CHECK-BASELINE-NEXT:    andl 4(%rcx), %r9d
2052; CHECK-BASELINE-NEXT:    xorl %r11d, %r9d
2053; CHECK-BASELINE-NEXT:    movl 8(%rsi), %r11d
2054; CHECK-BASELINE-NEXT:    xorl %ebp, %r11d
2055; CHECK-BASELINE-NEXT:    andl 8(%rcx), %r11d
2056; CHECK-BASELINE-NEXT:    xorl %ebp, %r11d
2057; CHECK-BASELINE-NEXT:    movl 12(%rsi), %ebp
2058; CHECK-BASELINE-NEXT:    xorl %r14d, %ebp
2059; CHECK-BASELINE-NEXT:    andl 12(%rcx), %ebp
2060; CHECK-BASELINE-NEXT:    xorl %r14d, %ebp
2061; CHECK-BASELINE-NEXT:    movl 16(%rsi), %r14d
2062; CHECK-BASELINE-NEXT:    xorl %ebx, %r14d
2063; CHECK-BASELINE-NEXT:    andl 16(%rcx), %r14d
2064; CHECK-BASELINE-NEXT:    xorl %ebx, %r14d
2065; CHECK-BASELINE-NEXT:    movl 20(%rsi), %ebx
2066; CHECK-BASELINE-NEXT:    xorl %r10d, %ebx
2067; CHECK-BASELINE-NEXT:    andl 20(%rcx), %ebx
2068; CHECK-BASELINE-NEXT:    xorl %r10d, %ebx
2069; CHECK-BASELINE-NEXT:    movl 24(%rsi), %r10d
2070; CHECK-BASELINE-NEXT:    xorl %r8d, %r10d
2071; CHECK-BASELINE-NEXT:    andl 24(%rcx), %r10d
2072; CHECK-BASELINE-NEXT:    xorl %r8d, %r10d
2073; CHECK-BASELINE-NEXT:    movl 28(%rsi), %esi
2074; CHECK-BASELINE-NEXT:    xorl %edi, %esi
2075; CHECK-BASELINE-NEXT:    andl 28(%rcx), %esi
2076; CHECK-BASELINE-NEXT:    xorl %edi, %esi
2077; CHECK-BASELINE-NEXT:    movl %esi, 28(%rax)
2078; CHECK-BASELINE-NEXT:    movl %r10d, 24(%rax)
2079; CHECK-BASELINE-NEXT:    movl %ebx, 20(%rax)
2080; CHECK-BASELINE-NEXT:    movl %r14d, 16(%rax)
2081; CHECK-BASELINE-NEXT:    movl %ebp, 12(%rax)
2082; CHECK-BASELINE-NEXT:    movl %r11d, 8(%rax)
2083; CHECK-BASELINE-NEXT:    movl %r9d, 4(%rax)
2084; CHECK-BASELINE-NEXT:    movl %edx, (%rax)
2085; CHECK-BASELINE-NEXT:    popq %rbx
2086; CHECK-BASELINE-NEXT:    popq %r14
2087; CHECK-BASELINE-NEXT:    popq %rbp
2088; CHECK-BASELINE-NEXT:    retq
2089;
2090; CHECK-SSE1-LABEL: out_v8i32:
2091; CHECK-SSE1:       # %bb.0:
2092; CHECK-SSE1-NEXT:    pushq %rbp
2093; CHECK-SSE1-NEXT:    pushq %r14
2094; CHECK-SSE1-NEXT:    pushq %rbx
2095; CHECK-SSE1-NEXT:    movq %rdi, %rax
2096; CHECK-SSE1-NEXT:    movl 28(%rdx), %edi
2097; CHECK-SSE1-NEXT:    movl 24(%rdx), %r8d
2098; CHECK-SSE1-NEXT:    movl 20(%rdx), %r10d
2099; CHECK-SSE1-NEXT:    movl 16(%rdx), %ebx
2100; CHECK-SSE1-NEXT:    movl 12(%rdx), %r14d
2101; CHECK-SSE1-NEXT:    movl 8(%rdx), %ebp
2102; CHECK-SSE1-NEXT:    movl (%rdx), %r9d
2103; CHECK-SSE1-NEXT:    movl 4(%rdx), %r11d
2104; CHECK-SSE1-NEXT:    movl (%rsi), %edx
2105; CHECK-SSE1-NEXT:    xorl %r9d, %edx
2106; CHECK-SSE1-NEXT:    andl (%rcx), %edx
2107; CHECK-SSE1-NEXT:    xorl %r9d, %edx
2108; CHECK-SSE1-NEXT:    movl 4(%rsi), %r9d
2109; CHECK-SSE1-NEXT:    xorl %r11d, %r9d
2110; CHECK-SSE1-NEXT:    andl 4(%rcx), %r9d
2111; CHECK-SSE1-NEXT:    xorl %r11d, %r9d
2112; CHECK-SSE1-NEXT:    movl 8(%rsi), %r11d
2113; CHECK-SSE1-NEXT:    xorl %ebp, %r11d
2114; CHECK-SSE1-NEXT:    andl 8(%rcx), %r11d
2115; CHECK-SSE1-NEXT:    xorl %ebp, %r11d
2116; CHECK-SSE1-NEXT:    movl 12(%rsi), %ebp
2117; CHECK-SSE1-NEXT:    xorl %r14d, %ebp
2118; CHECK-SSE1-NEXT:    andl 12(%rcx), %ebp
2119; CHECK-SSE1-NEXT:    xorl %r14d, %ebp
2120; CHECK-SSE1-NEXT:    movl 16(%rsi), %r14d
2121; CHECK-SSE1-NEXT:    xorl %ebx, %r14d
2122; CHECK-SSE1-NEXT:    andl 16(%rcx), %r14d
2123; CHECK-SSE1-NEXT:    xorl %ebx, %r14d
2124; CHECK-SSE1-NEXT:    movl 20(%rsi), %ebx
2125; CHECK-SSE1-NEXT:    xorl %r10d, %ebx
2126; CHECK-SSE1-NEXT:    andl 20(%rcx), %ebx
2127; CHECK-SSE1-NEXT:    xorl %r10d, %ebx
2128; CHECK-SSE1-NEXT:    movl 24(%rsi), %r10d
2129; CHECK-SSE1-NEXT:    xorl %r8d, %r10d
2130; CHECK-SSE1-NEXT:    andl 24(%rcx), %r10d
2131; CHECK-SSE1-NEXT:    xorl %r8d, %r10d
2132; CHECK-SSE1-NEXT:    movl 28(%rsi), %esi
2133; CHECK-SSE1-NEXT:    xorl %edi, %esi
2134; CHECK-SSE1-NEXT:    andl 28(%rcx), %esi
2135; CHECK-SSE1-NEXT:    xorl %edi, %esi
2136; CHECK-SSE1-NEXT:    movl %esi, 28(%rax)
2137; CHECK-SSE1-NEXT:    movl %r10d, 24(%rax)
2138; CHECK-SSE1-NEXT:    movl %ebx, 20(%rax)
2139; CHECK-SSE1-NEXT:    movl %r14d, 16(%rax)
2140; CHECK-SSE1-NEXT:    movl %ebp, 12(%rax)
2141; CHECK-SSE1-NEXT:    movl %r11d, 8(%rax)
2142; CHECK-SSE1-NEXT:    movl %r9d, 4(%rax)
2143; CHECK-SSE1-NEXT:    movl %edx, (%rax)
2144; CHECK-SSE1-NEXT:    popq %rbx
2145; CHECK-SSE1-NEXT:    popq %r14
2146; CHECK-SSE1-NEXT:    popq %rbp
2147; CHECK-SSE1-NEXT:    retq
2148;
2149; CHECK-SSE2-LABEL: out_v8i32:
2150; CHECK-SSE2:       # %bb.0:
2151; CHECK-SSE2-NEXT:    movaps (%rdx), %xmm0
2152; CHECK-SSE2-NEXT:    movaps 16(%rdx), %xmm1
2153; CHECK-SSE2-NEXT:    movaps 16(%rdi), %xmm2
2154; CHECK-SSE2-NEXT:    andps %xmm1, %xmm2
2155; CHECK-SSE2-NEXT:    movaps (%rdi), %xmm3
2156; CHECK-SSE2-NEXT:    andps %xmm0, %xmm3
2157; CHECK-SSE2-NEXT:    andnps 16(%rsi), %xmm1
2158; CHECK-SSE2-NEXT:    orps %xmm2, %xmm1
2159; CHECK-SSE2-NEXT:    andnps (%rsi), %xmm0
2160; CHECK-SSE2-NEXT:    orps %xmm3, %xmm0
2161; CHECK-SSE2-NEXT:    retq
2162;
2163; CHECK-XOP-LABEL: out_v8i32:
2164; CHECK-XOP:       # %bb.0:
2165; CHECK-XOP-NEXT:    vmovdqa (%rdi), %ymm0
2166; CHECK-XOP-NEXT:    vmovdqa (%rdx), %ymm1
2167; CHECK-XOP-NEXT:    vpcmov %ymm1, (%rsi), %ymm0, %ymm0
2168; CHECK-XOP-NEXT:    retq
2169  %x = load <8 x i32>, ptr%px, align 32
2170  %y = load <8 x i32>, ptr%py, align 32
2171  %mask = load <8 x i32>, ptr%pmask, align 32
2172  %mx = and <8 x i32> %x, %mask ; bits of x where mask is set
2173  %notmask = xor <8 x i32> %mask, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1> ; ~mask
2174  %my = and <8 x i32> %y, %notmask ; bits of y where mask is clear
2175  %r = or <8 x i32> %mx, %my ; merge the two disjoint parts
2176  ret <8 x i32> %r
2177}
2178
; Masked merge, "out" form, on <4 x i64> (256 bits): r = (x & mask) | (y & ~mask).
; All three operands are loaded from memory through %px, %py and %pmask.
; The BASELINE and SSE1 checks expect four scalar 64-bit xor/and/xor
; sequences, the SSE2 checks expect andps/andnps/orps on two 128-bit halves,
; and the XOP checks expect a single 256-bit vpcmov.
2179define <4 x i64> @out_v4i64(ptr%px, ptr%py, ptr%pmask) nounwind {
2180; CHECK-BASELINE-LABEL: out_v4i64:
2181; CHECK-BASELINE:       # %bb.0:
2182; CHECK-BASELINE-NEXT:    movq %rdi, %rax
2183; CHECK-BASELINE-NEXT:    movq 24(%rdx), %rdi
2184; CHECK-BASELINE-NEXT:    movq 16(%rdx), %r8
2185; CHECK-BASELINE-NEXT:    movq (%rdx), %r9
2186; CHECK-BASELINE-NEXT:    movq 8(%rdx), %r10
2187; CHECK-BASELINE-NEXT:    movq (%rsi), %rdx
2188; CHECK-BASELINE-NEXT:    xorq %r9, %rdx
2189; CHECK-BASELINE-NEXT:    andq (%rcx), %rdx
2190; CHECK-BASELINE-NEXT:    xorq %r9, %rdx
2191; CHECK-BASELINE-NEXT:    movq 8(%rsi), %r9
2192; CHECK-BASELINE-NEXT:    xorq %r10, %r9
2193; CHECK-BASELINE-NEXT:    andq 8(%rcx), %r9
2194; CHECK-BASELINE-NEXT:    xorq %r10, %r9
2195; CHECK-BASELINE-NEXT:    movq 16(%rsi), %r10
2196; CHECK-BASELINE-NEXT:    xorq %r8, %r10
2197; CHECK-BASELINE-NEXT:    andq 16(%rcx), %r10
2198; CHECK-BASELINE-NEXT:    xorq %r8, %r10
2199; CHECK-BASELINE-NEXT:    movq 24(%rsi), %rsi
2200; CHECK-BASELINE-NEXT:    xorq %rdi, %rsi
2201; CHECK-BASELINE-NEXT:    andq 24(%rcx), %rsi
2202; CHECK-BASELINE-NEXT:    xorq %rdi, %rsi
2203; CHECK-BASELINE-NEXT:    movq %rsi, 24(%rax)
2204; CHECK-BASELINE-NEXT:    movq %r10, 16(%rax)
2205; CHECK-BASELINE-NEXT:    movq %r9, 8(%rax)
2206; CHECK-BASELINE-NEXT:    movq %rdx, (%rax)
2207; CHECK-BASELINE-NEXT:    retq
2208;
2209; CHECK-SSE1-LABEL: out_v4i64:
2210; CHECK-SSE1:       # %bb.0:
2211; CHECK-SSE1-NEXT:    movq %rdi, %rax
2212; CHECK-SSE1-NEXT:    movq 24(%rdx), %rdi
2213; CHECK-SSE1-NEXT:    movq 16(%rdx), %r8
2214; CHECK-SSE1-NEXT:    movq (%rdx), %r9
2215; CHECK-SSE1-NEXT:    movq 8(%rdx), %r10
2216; CHECK-SSE1-NEXT:    movq (%rsi), %rdx
2217; CHECK-SSE1-NEXT:    xorq %r9, %rdx
2218; CHECK-SSE1-NEXT:    andq (%rcx), %rdx
2219; CHECK-SSE1-NEXT:    xorq %r9, %rdx
2220; CHECK-SSE1-NEXT:    movq 8(%rsi), %r9
2221; CHECK-SSE1-NEXT:    xorq %r10, %r9
2222; CHECK-SSE1-NEXT:    andq 8(%rcx), %r9
2223; CHECK-SSE1-NEXT:    xorq %r10, %r9
2224; CHECK-SSE1-NEXT:    movq 16(%rsi), %r10
2225; CHECK-SSE1-NEXT:    xorq %r8, %r10
2226; CHECK-SSE1-NEXT:    andq 16(%rcx), %r10
2227; CHECK-SSE1-NEXT:    xorq %r8, %r10
2228; CHECK-SSE1-NEXT:    movq 24(%rsi), %rsi
2229; CHECK-SSE1-NEXT:    xorq %rdi, %rsi
2230; CHECK-SSE1-NEXT:    andq 24(%rcx), %rsi
2231; CHECK-SSE1-NEXT:    xorq %rdi, %rsi
2232; CHECK-SSE1-NEXT:    movq %rsi, 24(%rax)
2233; CHECK-SSE1-NEXT:    movq %r10, 16(%rax)
2234; CHECK-SSE1-NEXT:    movq %r9, 8(%rax)
2235; CHECK-SSE1-NEXT:    movq %rdx, (%rax)
2236; CHECK-SSE1-NEXT:    retq
2237;
2238; CHECK-SSE2-LABEL: out_v4i64:
2239; CHECK-SSE2:       # %bb.0:
2240; CHECK-SSE2-NEXT:    movaps (%rdx), %xmm0
2241; CHECK-SSE2-NEXT:    movaps 16(%rdx), %xmm1
2242; CHECK-SSE2-NEXT:    movaps 16(%rdi), %xmm2
2243; CHECK-SSE2-NEXT:    andps %xmm1, %xmm2
2244; CHECK-SSE2-NEXT:    movaps (%rdi), %xmm3
2245; CHECK-SSE2-NEXT:    andps %xmm0, %xmm3
2246; CHECK-SSE2-NEXT:    andnps 16(%rsi), %xmm1
2247; CHECK-SSE2-NEXT:    orps %xmm2, %xmm1
2248; CHECK-SSE2-NEXT:    andnps (%rsi), %xmm0
2249; CHECK-SSE2-NEXT:    orps %xmm3, %xmm0
2250; CHECK-SSE2-NEXT:    retq
2251;
2252; CHECK-XOP-LABEL: out_v4i64:
2253; CHECK-XOP:       # %bb.0:
2254; CHECK-XOP-NEXT:    vmovdqa (%rdi), %ymm0
2255; CHECK-XOP-NEXT:    vmovdqa (%rdx), %ymm1
2256; CHECK-XOP-NEXT:    vpcmov %ymm1, (%rsi), %ymm0, %ymm0
2257; CHECK-XOP-NEXT:    retq
2258  %x = load <4 x i64>, ptr%px, align 32
2259  %y = load <4 x i64>, ptr%py, align 32
2260  %mask = load <4 x i64>, ptr%pmask, align 32
2261  %mx = and <4 x i64> %x, %mask ; bits of x where mask is set
2262  %notmask = xor <4 x i64> %mask, <i64 -1, i64 -1, i64 -1, i64 -1> ; ~mask
2263  %my = and <4 x i64> %y, %notmask ; bits of y where mask is clear
2264  %r = or <4 x i64> %mx, %my ; merge the two disjoint parts
2265  ret <4 x i64> %r
2266}
2267
2268;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2269; Should be the same as the previous (out_*) tests: the in_* tests below use the equivalent form ((x ^ y) & mask) ^ y.
2270;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2271
2272; ============================================================================ ;
2273; 8-bit vector width
2274; ============================================================================ ;
2275
; Masked merge, "in" form, on <1 x i8>: r = ((x ^ y) & mask) ^ y.
; The shared CHECK prefix is used, so every RUN line expects the same code:
; a 32-bit xor/and/xor sequence on the argument registers.
2276define <1 x i8> @in_v1i8(<1 x i8> %x, <1 x i8> %y, <1 x i8> %mask) nounwind {
2277; CHECK-LABEL: in_v1i8:
2278; CHECK:       # %bb.0:
2279; CHECK-NEXT:    movl %edi, %eax
2280; CHECK-NEXT:    xorl %esi, %eax
2281; CHECK-NEXT:    andl %edx, %eax
2282; CHECK-NEXT:    xorl %esi, %eax
2283; CHECK-NEXT:    # kill: def $al killed $al killed $eax
2284; CHECK-NEXT:    retq
2285  %n0 = xor <1 x i8> %x, %y ; bits where x and y differ
2286  %n1 = and <1 x i8> %n0, %mask ; keep only the differing bits selected by mask
2287  %r = xor <1 x i8> %n1, %y ; flip those bits in y, yielding x's bits there
2288  ret <1 x i8> %r
2289}
2290
2291; ============================================================================ ;
2292; 16-bit vector width
2293; ============================================================================ ;
2294
; Masked merge, "in" form, on <2 x i8>: r = ((x ^ y) & mask) ^ y.
; The BASELINE and SSE1 checks expect a scalar xor/and/xor sequence per
; element in general-purpose registers, the SSE2 checks expect
; andps/andnps/orps, and the XOP checks expect a single vpcmov.
2295define <2 x i8> @in_v2i8(<2 x i8> %x, <2 x i8> %y, <2 x i8> %mask) nounwind {
2296; CHECK-BASELINE-LABEL: in_v2i8:
2297; CHECK-BASELINE:       # %bb.0:
2298; CHECK-BASELINE-NEXT:    movl %edi, %eax
2299; CHECK-BASELINE-NEXT:    xorl %edx, %eax
2300; CHECK-BASELINE-NEXT:    xorl %ecx, %esi
2301; CHECK-BASELINE-NEXT:    andl %r9d, %esi
2302; CHECK-BASELINE-NEXT:    andl %r8d, %eax
2303; CHECK-BASELINE-NEXT:    xorl %edx, %eax
2304; CHECK-BASELINE-NEXT:    xorl %ecx, %esi
2305; CHECK-BASELINE-NEXT:    # kill: def $al killed $al killed $eax
2306; CHECK-BASELINE-NEXT:    movl %esi, %edx
2307; CHECK-BASELINE-NEXT:    retq
2308;
2309; CHECK-SSE1-LABEL: in_v2i8:
2310; CHECK-SSE1:       # %bb.0:
2311; CHECK-SSE1-NEXT:    movl %edi, %eax
2312; CHECK-SSE1-NEXT:    xorl %edx, %eax
2313; CHECK-SSE1-NEXT:    xorl %ecx, %esi
2314; CHECK-SSE1-NEXT:    andl %r9d, %esi
2315; CHECK-SSE1-NEXT:    andl %r8d, %eax
2316; CHECK-SSE1-NEXT:    xorl %edx, %eax
2317; CHECK-SSE1-NEXT:    xorl %ecx, %esi
2318; CHECK-SSE1-NEXT:    # kill: def $al killed $al killed $eax
2319; CHECK-SSE1-NEXT:    movl %esi, %edx
2320; CHECK-SSE1-NEXT:    retq
2321;
2322; CHECK-SSE2-LABEL: in_v2i8:
2323; CHECK-SSE2:       # %bb.0:
2324; CHECK-SSE2-NEXT:    andps %xmm2, %xmm0
2325; CHECK-SSE2-NEXT:    andnps %xmm1, %xmm2
2326; CHECK-SSE2-NEXT:    orps %xmm2, %xmm0
2327; CHECK-SSE2-NEXT:    retq
2328;
2329; CHECK-XOP-LABEL: in_v2i8:
2330; CHECK-XOP:       # %bb.0:
2331; CHECK-XOP-NEXT:    vpcmov %xmm2, %xmm1, %xmm0, %xmm0
2332; CHECK-XOP-NEXT:    retq
2333  %n0 = xor <2 x i8> %x, %y ; bits where x and y differ
2334  %n1 = and <2 x i8> %n0, %mask ; keep only the differing bits selected by mask
2335  %r = xor <2 x i8> %n1, %y ; flip those bits in y, yielding x's bits there
2336  ret <2 x i8> %r
2337}
2338
; Masked merge, "in" form, on <1 x i16>: r = ((x ^ y) & mask) ^ y.
; The shared CHECK prefix is used, so every RUN line expects the same code:
; a 32-bit xor/and/xor sequence on the argument registers.
2339define <1 x i16> @in_v1i16(<1 x i16> %x, <1 x i16> %y, <1 x i16> %mask) nounwind {
2340; CHECK-LABEL: in_v1i16:
2341; CHECK:       # %bb.0:
2342; CHECK-NEXT:    movl %edi, %eax
2343; CHECK-NEXT:    xorl %esi, %eax
2344; CHECK-NEXT:    andl %edx, %eax
2345; CHECK-NEXT:    xorl %esi, %eax
2346; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
2347; CHECK-NEXT:    retq
2348  %n0 = xor <1 x i16> %x, %y ; bits where x and y differ
2349  %n1 = and <1 x i16> %n0, %mask ; keep only the differing bits selected by mask
2350  %r = xor <1 x i16> %n1, %y ; flip those bits in y, yielding x's bits there
2351  ret <1 x i16> %r
2352}
2353
2354; ============================================================================ ;
2355; 32-bit vector width
2356; ============================================================================ ;
2357
; Masked merge, "in" form, on <4 x i8>: r = ((x ^ y) & mask) ^ y.
; The BASELINE and SSE1 checks expect a per-element xor/and/xor sequence with
; some operands read from the stack and the four result bytes stored through
; the pointer passed in %rdi (which is also returned in %rax). The SSE2 checks
; expect andps/andnps/orps, and the XOP checks expect a single vpcmov.
2358define <4 x i8> @in_v4i8(<4 x i8> %x, <4 x i8> %y, <4 x i8> %mask) nounwind {
2359; CHECK-BASELINE-LABEL: in_v4i8:
2360; CHECK-BASELINE:       # %bb.0:
2361; CHECK-BASELINE-NEXT:    movq %rdi, %rax
2362; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %edi
2363; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %r10d
2364; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %r11d
2365; CHECK-BASELINE-NEXT:    xorl %r9d, %esi
2366; CHECK-BASELINE-NEXT:    xorb %r11b, %dl
2367; CHECK-BASELINE-NEXT:    xorb %r10b, %cl
2368; CHECK-BASELINE-NEXT:    xorb %dil, %r8b
2369; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %r8b
2370; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %cl
2371; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %dl
2372; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %sil
2373; CHECK-BASELINE-NEXT:    xorb %r9b, %sil
2374; CHECK-BASELINE-NEXT:    xorb %r11b, %dl
2375; CHECK-BASELINE-NEXT:    xorb %r10b, %cl
2376; CHECK-BASELINE-NEXT:    xorb %dil, %r8b
2377; CHECK-BASELINE-NEXT:    movb %r8b, 3(%rax)
2378; CHECK-BASELINE-NEXT:    movb %cl, 2(%rax)
2379; CHECK-BASELINE-NEXT:    movb %dl, 1(%rax)
2380; CHECK-BASELINE-NEXT:    movb %sil, (%rax)
2381; CHECK-BASELINE-NEXT:    retq
2382;
2383; CHECK-SSE1-LABEL: in_v4i8:
2384; CHECK-SSE1:       # %bb.0:
2385; CHECK-SSE1-NEXT:    movq %rdi, %rax
2386; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %edi
2387; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %r10d
2388; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %r11d
2389; CHECK-SSE1-NEXT:    xorl %r9d, %esi
2390; CHECK-SSE1-NEXT:    xorb %r11b, %dl
2391; CHECK-SSE1-NEXT:    xorb %r10b, %cl
2392; CHECK-SSE1-NEXT:    xorb %dil, %r8b
2393; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %r8b
2394; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %cl
2395; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %dl
2396; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %sil
2397; CHECK-SSE1-NEXT:    xorb %r9b, %sil
2398; CHECK-SSE1-NEXT:    xorb %r11b, %dl
2399; CHECK-SSE1-NEXT:    xorb %r10b, %cl
2400; CHECK-SSE1-NEXT:    xorb %dil, %r8b
2401; CHECK-SSE1-NEXT:    movb %r8b, 3(%rax)
2402; CHECK-SSE1-NEXT:    movb %cl, 2(%rax)
2403; CHECK-SSE1-NEXT:    movb %dl, 1(%rax)
2404; CHECK-SSE1-NEXT:    movb %sil, (%rax)
2405; CHECK-SSE1-NEXT:    retq
2406;
2407; CHECK-SSE2-LABEL: in_v4i8:
2408; CHECK-SSE2:       # %bb.0:
2409; CHECK-SSE2-NEXT:    andps %xmm2, %xmm0
2410; CHECK-SSE2-NEXT:    andnps %xmm1, %xmm2
2411; CHECK-SSE2-NEXT:    orps %xmm2, %xmm0
2412; CHECK-SSE2-NEXT:    retq
2413;
2414; CHECK-XOP-LABEL: in_v4i8:
2415; CHECK-XOP:       # %bb.0:
2416; CHECK-XOP-NEXT:    vpcmov %xmm2, %xmm1, %xmm0, %xmm0
2417; CHECK-XOP-NEXT:    retq
2418  %n0 = xor <4 x i8> %x, %y ; bits where x and y differ
2419  %n1 = and <4 x i8> %n0, %mask ; keep only the differing bits selected by mask
2420  %r = xor <4 x i8> %n1, %y ; flip those bits in y, yielding x's bits there
2421  ret <4 x i8> %r
2422}
2423
; Masked merge, "in" form, on <2 x i16>: r = ((x ^ y) & mask) ^ y.
; The BASELINE and SSE1 checks expect a scalar xor/and/xor sequence per
; element in general-purpose registers, the SSE2 checks expect
; andps/andnps/orps, and the XOP checks expect a single vpcmov.
2424define <2 x i16> @in_v2i16(<2 x i16> %x, <2 x i16> %y, <2 x i16> %mask) nounwind {
2425; CHECK-BASELINE-LABEL: in_v2i16:
2426; CHECK-BASELINE:       # %bb.0:
2427; CHECK-BASELINE-NEXT:    movl %edi, %eax
2428; CHECK-BASELINE-NEXT:    xorl %edx, %eax
2429; CHECK-BASELINE-NEXT:    xorl %ecx, %esi
2430; CHECK-BASELINE-NEXT:    andl %r9d, %esi
2431; CHECK-BASELINE-NEXT:    andl %r8d, %eax
2432; CHECK-BASELINE-NEXT:    xorl %edx, %eax
2433; CHECK-BASELINE-NEXT:    xorl %ecx, %esi
2434; CHECK-BASELINE-NEXT:    # kill: def $ax killed $ax killed $eax
2435; CHECK-BASELINE-NEXT:    movl %esi, %edx
2436; CHECK-BASELINE-NEXT:    retq
2437;
2438; CHECK-SSE1-LABEL: in_v2i16:
2439; CHECK-SSE1:       # %bb.0:
2440; CHECK-SSE1-NEXT:    movl %edi, %eax
2441; CHECK-SSE1-NEXT:    xorl %edx, %eax
2442; CHECK-SSE1-NEXT:    xorl %ecx, %esi
2443; CHECK-SSE1-NEXT:    andl %r9d, %esi
2444; CHECK-SSE1-NEXT:    andl %r8d, %eax
2445; CHECK-SSE1-NEXT:    xorl %edx, %eax
2446; CHECK-SSE1-NEXT:    xorl %ecx, %esi
2447; CHECK-SSE1-NEXT:    # kill: def $ax killed $ax killed $eax
2448; CHECK-SSE1-NEXT:    movl %esi, %edx
2449; CHECK-SSE1-NEXT:    retq
2450;
2451; CHECK-SSE2-LABEL: in_v2i16:
2452; CHECK-SSE2:       # %bb.0:
2453; CHECK-SSE2-NEXT:    andps %xmm2, %xmm0
2454; CHECK-SSE2-NEXT:    andnps %xmm1, %xmm2
2455; CHECK-SSE2-NEXT:    orps %xmm2, %xmm0
2456; CHECK-SSE2-NEXT:    retq
2457;
2458; CHECK-XOP-LABEL: in_v2i16:
2459; CHECK-XOP:       # %bb.0:
2460; CHECK-XOP-NEXT:    vpcmov %xmm2, %xmm1, %xmm0, %xmm0
2461; CHECK-XOP-NEXT:    retq
2462  %n0 = xor <2 x i16> %x, %y ; bits where x and y differ
2463  %n1 = and <2 x i16> %n0, %mask ; keep only the differing bits selected by mask
2464  %r = xor <2 x i16> %n1, %y ; flip those bits in y, yielding x's bits there
2465  ret <2 x i16> %r
2466}
2467
; Masked merge, "in" form, on <1 x i32>: r = ((x ^ y) & mask) ^ y.
; The shared CHECK prefix is used, so every RUN line expects the same code:
; a 32-bit xor/and/xor sequence on the argument registers.
2468define <1 x i32> @in_v1i32(<1 x i32> %x, <1 x i32> %y, <1 x i32> %mask) nounwind {
2469; CHECK-LABEL: in_v1i32:
2470; CHECK:       # %bb.0:
2471; CHECK-NEXT:    movl %edi, %eax
2472; CHECK-NEXT:    xorl %esi, %eax
2473; CHECK-NEXT:    andl %edx, %eax
2474; CHECK-NEXT:    xorl %esi, %eax
2475; CHECK-NEXT:    retq
2476  %n0 = xor <1 x i32> %x, %y ; bits where x and y differ
2477  %n1 = and <1 x i32> %n0, %mask ; keep only the differing bits selected by mask
2478  %r = xor <1 x i32> %n1, %y ; flip those bits in y, yielding x's bits there
2479  ret <1 x i32> %r
2480}
2481
2482; ============================================================================ ;
2483; 64-bit vector width
2484; ============================================================================ ;
2485
; Masked merge, "in" form, on <8 x i8>: r = ((x ^ y) & mask) ^ y.
; The BASELINE and SSE1 checks expect a per-byte xor/and/xor sequence with
; several operands read from the stack and the eight result bytes stored
; through the pointer passed in %rdi (which is also returned in %rax). The
; SSE2 checks expect andps/andnps/orps, and the XOP checks expect a single
; vpcmov.
2486define <8 x i8> @in_v8i8(<8 x i8> %x, <8 x i8> %y, <8 x i8> %mask) nounwind {
2487; CHECK-BASELINE-LABEL: in_v8i8:
2488; CHECK-BASELINE:       # %bb.0:
2489; CHECK-BASELINE-NEXT:    pushq %rbp
2490; CHECK-BASELINE-NEXT:    pushq %r15
2491; CHECK-BASELINE-NEXT:    pushq %r14
2492; CHECK-BASELINE-NEXT:    pushq %r13
2493; CHECK-BASELINE-NEXT:    pushq %r12
2494; CHECK-BASELINE-NEXT:    pushq %rbx
2495; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %r10d
2496; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %ebx
2497; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %ebp
2498; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %r14d
2499; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %r12d
2500; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %r11d
2501; CHECK-BASELINE-NEXT:    xorb %r11b, %sil
2502; CHECK-BASELINE-NEXT:    xorb %r12b, %dl
2503; CHECK-BASELINE-NEXT:    xorb %r14b, %cl
2504; CHECK-BASELINE-NEXT:    xorb %bpl, %r8b
2505; CHECK-BASELINE-NEXT:    xorb %bl, %r9b
2506; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %r15d
2507; CHECK-BASELINE-NEXT:    xorb {{[0-9]+}}(%rsp), %r15b
2508; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %r13d
2509; CHECK-BASELINE-NEXT:    xorb {{[0-9]+}}(%rsp), %r13b
2510; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
2511; CHECK-BASELINE-NEXT:    xorb %r10b, %al
2512; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %r9b
2513; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %r8b
2514; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %cl
2515; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %dl
2516; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %sil
2517; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %al
2518; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %r13b
2519; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %r15b
2520; CHECK-BASELINE-NEXT:    xorb %r11b, %sil
2521; CHECK-BASELINE-NEXT:    xorb %r12b, %dl
2522; CHECK-BASELINE-NEXT:    xorb %r14b, %cl
2523; CHECK-BASELINE-NEXT:    xorb %bpl, %r8b
2524; CHECK-BASELINE-NEXT:    xorb %bl, %r9b
2525; CHECK-BASELINE-NEXT:    xorb {{[0-9]+}}(%rsp), %r15b
2526; CHECK-BASELINE-NEXT:    xorb {{[0-9]+}}(%rsp), %r13b
2527; CHECK-BASELINE-NEXT:    xorb %r10b, %al
2528; CHECK-BASELINE-NEXT:    movb %al, 7(%rdi)
2529; CHECK-BASELINE-NEXT:    movb %r13b, 6(%rdi)
2530; CHECK-BASELINE-NEXT:    movb %r15b, 5(%rdi)
2531; CHECK-BASELINE-NEXT:    movb %r9b, 4(%rdi)
2532; CHECK-BASELINE-NEXT:    movb %r8b, 3(%rdi)
2533; CHECK-BASELINE-NEXT:    movb %cl, 2(%rdi)
2534; CHECK-BASELINE-NEXT:    movb %dl, 1(%rdi)
2535; CHECK-BASELINE-NEXT:    movb %sil, (%rdi)
2536; CHECK-BASELINE-NEXT:    movq %rdi, %rax
2537; CHECK-BASELINE-NEXT:    popq %rbx
2538; CHECK-BASELINE-NEXT:    popq %r12
2539; CHECK-BASELINE-NEXT:    popq %r13
2540; CHECK-BASELINE-NEXT:    popq %r14
2541; CHECK-BASELINE-NEXT:    popq %r15
2542; CHECK-BASELINE-NEXT:    popq %rbp
2543; CHECK-BASELINE-NEXT:    retq
2544;
2545; CHECK-SSE1-LABEL: in_v8i8:
2546; CHECK-SSE1:       # %bb.0:
2547; CHECK-SSE1-NEXT:    pushq %rbp
2548; CHECK-SSE1-NEXT:    pushq %r15
2549; CHECK-SSE1-NEXT:    pushq %r14
2550; CHECK-SSE1-NEXT:    pushq %r13
2551; CHECK-SSE1-NEXT:    pushq %r12
2552; CHECK-SSE1-NEXT:    pushq %rbx
2553; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %r10d
2554; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %ebx
2555; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %ebp
2556; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %r14d
2557; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %r12d
2558; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %r11d
2559; CHECK-SSE1-NEXT:    xorb %r11b, %sil
2560; CHECK-SSE1-NEXT:    xorb %r12b, %dl
2561; CHECK-SSE1-NEXT:    xorb %r14b, %cl
2562; CHECK-SSE1-NEXT:    xorb %bpl, %r8b
2563; CHECK-SSE1-NEXT:    xorb %bl, %r9b
2564; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %r15d
2565; CHECK-SSE1-NEXT:    xorb {{[0-9]+}}(%rsp), %r15b
2566; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %r13d
2567; CHECK-SSE1-NEXT:    xorb {{[0-9]+}}(%rsp), %r13b
2568; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
2569; CHECK-SSE1-NEXT:    xorb %r10b, %al
2570; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %r9b
2571; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %r8b
2572; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %cl
2573; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %dl
2574; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %sil
2575; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %al
2576; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %r13b
2577; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %r15b
2578; CHECK-SSE1-NEXT:    xorb %r11b, %sil
2579; CHECK-SSE1-NEXT:    xorb %r12b, %dl
2580; CHECK-SSE1-NEXT:    xorb %r14b, %cl
2581; CHECK-SSE1-NEXT:    xorb %bpl, %r8b
2582; CHECK-SSE1-NEXT:    xorb %bl, %r9b
2583; CHECK-SSE1-NEXT:    xorb {{[0-9]+}}(%rsp), %r15b
2584; CHECK-SSE1-NEXT:    xorb {{[0-9]+}}(%rsp), %r13b
2585; CHECK-SSE1-NEXT:    xorb %r10b, %al
2586; CHECK-SSE1-NEXT:    movb %al, 7(%rdi)
2587; CHECK-SSE1-NEXT:    movb %r13b, 6(%rdi)
2588; CHECK-SSE1-NEXT:    movb %r15b, 5(%rdi)
2589; CHECK-SSE1-NEXT:    movb %r9b, 4(%rdi)
2590; CHECK-SSE1-NEXT:    movb %r8b, 3(%rdi)
2591; CHECK-SSE1-NEXT:    movb %cl, 2(%rdi)
2592; CHECK-SSE1-NEXT:    movb %dl, 1(%rdi)
2593; CHECK-SSE1-NEXT:    movb %sil, (%rdi)
2594; CHECK-SSE1-NEXT:    movq %rdi, %rax
2595; CHECK-SSE1-NEXT:    popq %rbx
2596; CHECK-SSE1-NEXT:    popq %r12
2597; CHECK-SSE1-NEXT:    popq %r13
2598; CHECK-SSE1-NEXT:    popq %r14
2599; CHECK-SSE1-NEXT:    popq %r15
2600; CHECK-SSE1-NEXT:    popq %rbp
2601; CHECK-SSE1-NEXT:    retq
2602;
2603; CHECK-SSE2-LABEL: in_v8i8:
2604; CHECK-SSE2:       # %bb.0:
2605; CHECK-SSE2-NEXT:    andps %xmm2, %xmm0
2606; CHECK-SSE2-NEXT:    andnps %xmm1, %xmm2
2607; CHECK-SSE2-NEXT:    orps %xmm2, %xmm0
2608; CHECK-SSE2-NEXT:    retq
2609;
2610; CHECK-XOP-LABEL: in_v8i8:
2611; CHECK-XOP:       # %bb.0:
2612; CHECK-XOP-NEXT:    vpcmov %xmm2, %xmm1, %xmm0, %xmm0
2613; CHECK-XOP-NEXT:    retq
2614  %n0 = xor <8 x i8> %x, %y ; bits where x and y differ
2615  %n1 = and <8 x i8> %n0, %mask ; keep only the differing bits selected by mask
2616  %r = xor <8 x i8> %n1, %y ; flip those bits in y, yielding x's bits there
2617  ret <8 x i8> %r
2618}
2619
; Masked merge, "in" form, on <4 x i16>: r = ((x ^ y) & mask) ^ y.
; The BASELINE and SSE1 checks expect a per-element xor/and/xor sequence
; (32-bit xors, 16-bit ands against stack operands) with the four results
; stored through the pointer passed in %rdi (which is also returned in %rax).
; The SSE2 checks expect andps/andnps/orps, and the XOP checks expect a
; single vpcmov.
2620define <4 x i16> @in_v4i16(<4 x i16> %x, <4 x i16> %y, <4 x i16> %mask) nounwind {
2621; CHECK-BASELINE-LABEL: in_v4i16:
2622; CHECK-BASELINE:       # %bb.0:
2623; CHECK-BASELINE-NEXT:    movq %rdi, %rax
2624; CHECK-BASELINE-NEXT:    movl {{[0-9]+}}(%rsp), %edi
2625; CHECK-BASELINE-NEXT:    movl {{[0-9]+}}(%rsp), %r10d
2626; CHECK-BASELINE-NEXT:    movl {{[0-9]+}}(%rsp), %r11d
2627; CHECK-BASELINE-NEXT:    xorl %r9d, %esi
2628; CHECK-BASELINE-NEXT:    xorl %r11d, %edx
2629; CHECK-BASELINE-NEXT:    xorl %r10d, %ecx
2630; CHECK-BASELINE-NEXT:    xorl %edi, %r8d
2631; CHECK-BASELINE-NEXT:    andw {{[0-9]+}}(%rsp), %r8w
2632; CHECK-BASELINE-NEXT:    andw {{[0-9]+}}(%rsp), %cx
2633; CHECK-BASELINE-NEXT:    andw {{[0-9]+}}(%rsp), %dx
2634; CHECK-BASELINE-NEXT:    andw {{[0-9]+}}(%rsp), %si
2635; CHECK-BASELINE-NEXT:    xorl %r9d, %esi
2636; CHECK-BASELINE-NEXT:    xorl %r11d, %edx
2637; CHECK-BASELINE-NEXT:    xorl %r10d, %ecx
2638; CHECK-BASELINE-NEXT:    xorl %edi, %r8d
2639; CHECK-BASELINE-NEXT:    movw %r8w, 6(%rax)
2640; CHECK-BASELINE-NEXT:    movw %cx, 4(%rax)
2641; CHECK-BASELINE-NEXT:    movw %dx, 2(%rax)
2642; CHECK-BASELINE-NEXT:    movw %si, (%rax)
2643; CHECK-BASELINE-NEXT:    retq
2644;
2645; CHECK-SSE1-LABEL: in_v4i16:
2646; CHECK-SSE1:       # %bb.0:
2647; CHECK-SSE1-NEXT:    movq %rdi, %rax
2648; CHECK-SSE1-NEXT:    movl {{[0-9]+}}(%rsp), %edi
2649; CHECK-SSE1-NEXT:    movl {{[0-9]+}}(%rsp), %r10d
2650; CHECK-SSE1-NEXT:    movl {{[0-9]+}}(%rsp), %r11d
2651; CHECK-SSE1-NEXT:    xorl %r9d, %esi
2652; CHECK-SSE1-NEXT:    xorl %r11d, %edx
2653; CHECK-SSE1-NEXT:    xorl %r10d, %ecx
2654; CHECK-SSE1-NEXT:    xorl %edi, %r8d
2655; CHECK-SSE1-NEXT:    andw {{[0-9]+}}(%rsp), %r8w
2656; CHECK-SSE1-NEXT:    andw {{[0-9]+}}(%rsp), %cx
2657; CHECK-SSE1-NEXT:    andw {{[0-9]+}}(%rsp), %dx
2658; CHECK-SSE1-NEXT:    andw {{[0-9]+}}(%rsp), %si
2659; CHECK-SSE1-NEXT:    xorl %r9d, %esi
2660; CHECK-SSE1-NEXT:    xorl %r11d, %edx
2661; CHECK-SSE1-NEXT:    xorl %r10d, %ecx
2662; CHECK-SSE1-NEXT:    xorl %edi, %r8d
2663; CHECK-SSE1-NEXT:    movw %r8w, 6(%rax)
2664; CHECK-SSE1-NEXT:    movw %cx, 4(%rax)
2665; CHECK-SSE1-NEXT:    movw %dx, 2(%rax)
2666; CHECK-SSE1-NEXT:    movw %si, (%rax)
2667; CHECK-SSE1-NEXT:    retq
2668;
2669; CHECK-SSE2-LABEL: in_v4i16:
2670; CHECK-SSE2:       # %bb.0:
2671; CHECK-SSE2-NEXT:    andps %xmm2, %xmm0
2672; CHECK-SSE2-NEXT:    andnps %xmm1, %xmm2
2673; CHECK-SSE2-NEXT:    orps %xmm2, %xmm0
2674; CHECK-SSE2-NEXT:    retq
2675;
2676; CHECK-XOP-LABEL: in_v4i16:
2677; CHECK-XOP:       # %bb.0:
2678; CHECK-XOP-NEXT:    vpcmov %xmm2, %xmm1, %xmm0, %xmm0
2679; CHECK-XOP-NEXT:    retq
2680  %n0 = xor <4 x i16> %x, %y ; bits where x and y differ
2681  %n1 = and <4 x i16> %n0, %mask ; keep only the differing bits selected by mask
2682  %r = xor <4 x i16> %n1, %y ; flip those bits in y, yielding x's bits there
2683  ret <4 x i16> %r
2684}
2685
; Masked merge, "in" form, on <2 x i32>: r = ((x ^ y) & mask) ^ y.
; The BASELINE and SSE1 checks expect a scalar xor/and/xor sequence per
; element in general-purpose registers, the SSE2 checks expect
; andps/andnps/orps, and the XOP checks expect a single vpcmov.
2686define <2 x i32> @in_v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> %mask) nounwind {
2687; CHECK-BASELINE-LABEL: in_v2i32:
2688; CHECK-BASELINE:       # %bb.0:
2689; CHECK-BASELINE-NEXT:    movl %edi, %eax
2690; CHECK-BASELINE-NEXT:    xorl %edx, %eax
2691; CHECK-BASELINE-NEXT:    xorl %ecx, %esi
2692; CHECK-BASELINE-NEXT:    andl %r9d, %esi
2693; CHECK-BASELINE-NEXT:    andl %r8d, %eax
2694; CHECK-BASELINE-NEXT:    xorl %edx, %eax
2695; CHECK-BASELINE-NEXT:    xorl %ecx, %esi
2696; CHECK-BASELINE-NEXT:    movl %esi, %edx
2697; CHECK-BASELINE-NEXT:    retq
2698;
2699; CHECK-SSE1-LABEL: in_v2i32:
2700; CHECK-SSE1:       # %bb.0:
2701; CHECK-SSE1-NEXT:    movl %edi, %eax
2702; CHECK-SSE1-NEXT:    xorl %edx, %eax
2703; CHECK-SSE1-NEXT:    xorl %ecx, %esi
2704; CHECK-SSE1-NEXT:    andl %r9d, %esi
2705; CHECK-SSE1-NEXT:    andl %r8d, %eax
2706; CHECK-SSE1-NEXT:    xorl %edx, %eax
2707; CHECK-SSE1-NEXT:    xorl %ecx, %esi
2708; CHECK-SSE1-NEXT:    movl %esi, %edx
2709; CHECK-SSE1-NEXT:    retq
2710;
2711; CHECK-SSE2-LABEL: in_v2i32:
2712; CHECK-SSE2:       # %bb.0:
2713; CHECK-SSE2-NEXT:    andps %xmm2, %xmm0
2714; CHECK-SSE2-NEXT:    andnps %xmm1, %xmm2
2715; CHECK-SSE2-NEXT:    orps %xmm2, %xmm0
2716; CHECK-SSE2-NEXT:    retq
2717;
2718; CHECK-XOP-LABEL: in_v2i32:
2719; CHECK-XOP:       # %bb.0:
2720; CHECK-XOP-NEXT:    vpcmov %xmm2, %xmm1, %xmm0, %xmm0
2721; CHECK-XOP-NEXT:    retq
2722  %n0 = xor <2 x i32> %x, %y ; bits where x and y differ
2723  %n1 = and <2 x i32> %n0, %mask ; keep only the differing bits selected by mask
2724  %r = xor <2 x i32> %n1, %y ; flip those bits in y, yielding x's bits there
2725  ret <2 x i32> %r
2726}
2727
; Masked merge, "in" form, on <1 x i64>: r = ((x ^ y) & mask) ^ y.
; The shared CHECK prefix is used, so every RUN line expects the same code:
; a 64-bit xor/and/xor sequence on the argument registers.
2728define <1 x i64> @in_v1i64(<1 x i64> %x, <1 x i64> %y, <1 x i64> %mask) nounwind {
2729; CHECK-LABEL: in_v1i64:
2730; CHECK:       # %bb.0:
2731; CHECK-NEXT:    movq %rdi, %rax
2732; CHECK-NEXT:    xorq %rsi, %rax
2733; CHECK-NEXT:    andq %rdx, %rax
2734; CHECK-NEXT:    xorq %rsi, %rax
2735; CHECK-NEXT:    retq
2736  %n0 = xor <1 x i64> %x, %y ; bits where x and y differ
2737  %n1 = and <1 x i64> %n0, %mask ; keep only the differing bits selected by mask
2738  %r = xor <1 x i64> %n1, %y ; flip those bits in y, yielding x's bits there
2739  ret <1 x i64> %r
2740}
2741
2742; ============================================================================ ;
2743; 128-bit vector width
2744; ============================================================================ ;
2745
2746define <16 x i8> @in_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %mask) nounwind {
2747; CHECK-BASELINE-LABEL: in_v16i8:
2748; CHECK-BASELINE:       # %bb.0:
2749; CHECK-BASELINE-NEXT:    pushq %rbp
2750; CHECK-BASELINE-NEXT:    pushq %r15
2751; CHECK-BASELINE-NEXT:    pushq %r14
2752; CHECK-BASELINE-NEXT:    pushq %r13
2753; CHECK-BASELINE-NEXT:    pushq %r12
2754; CHECK-BASELINE-NEXT:    pushq %rbx
2755; CHECK-BASELINE-NEXT:    movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
2756; CHECK-BASELINE-NEXT:    movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
2757; CHECK-BASELINE-NEXT:    movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
2758; CHECK-BASELINE-NEXT:    movq %rdi, %rdx
2759; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %esi
2760; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %ecx
2761; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
2762; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %ebx
2763; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %ebp
2764; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %r14d
2765; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %r15d
2766; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %r12d
2767; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %r13d
2768; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %r11d
2769; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %r10d
2770; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %edi
2771; CHECK-BASELINE-NEXT:    xorb %dil, %r9b
2772; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %r9b
2773; CHECK-BASELINE-NEXT:    xorb %dil, %r9b
2774; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %edi
2775; CHECK-BASELINE-NEXT:    xorb %r10b, %dil
2776; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %dil
2777; CHECK-BASELINE-NEXT:    xorb %r10b, %dil
2778; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %r10d
2779; CHECK-BASELINE-NEXT:    xorb %r11b, %r10b
2780; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %r10b
2781; CHECK-BASELINE-NEXT:    xorb %r11b, %r10b
2782; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %r11d
2783; CHECK-BASELINE-NEXT:    xorb %r13b, %r11b
2784; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %r11b
2785; CHECK-BASELINE-NEXT:    xorb %r13b, %r11b
2786; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %r13d
2787; CHECK-BASELINE-NEXT:    xorb %r12b, %r13b
2788; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %r13b
2789; CHECK-BASELINE-NEXT:    xorb %r12b, %r13b
2790; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %r12d
2791; CHECK-BASELINE-NEXT:    xorb %r15b, %r12b
2792; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %r12b
2793; CHECK-BASELINE-NEXT:    xorb %r15b, %r12b
2794; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %r15d
2795; CHECK-BASELINE-NEXT:    xorb %r14b, %r15b
2796; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %r15b
2797; CHECK-BASELINE-NEXT:    xorb %r14b, %r15b
2798; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %r14d
2799; CHECK-BASELINE-NEXT:    xorb %bpl, %r14b
2800; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %r14b
2801; CHECK-BASELINE-NEXT:    xorb %bpl, %r14b
2802; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %ebp
2803; CHECK-BASELINE-NEXT:    xorb %bl, %bpl
2804; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %bpl
2805; CHECK-BASELINE-NEXT:    xorb %bl, %bpl
2806; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %ebx
2807; CHECK-BASELINE-NEXT:    xorb %al, %bl
2808; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %bl
2809; CHECK-BASELINE-NEXT:    xorb %al, %bl
2810; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
2811; CHECK-BASELINE-NEXT:    xorb %cl, %al
2812; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %al
2813; CHECK-BASELINE-NEXT:    xorb %cl, %al
2814; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %ecx
2815; CHECK-BASELINE-NEXT:    xorb %sil, %cl
2816; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %cl
2817; CHECK-BASELINE-NEXT:    xorb %sil, %cl
2818; CHECK-BASELINE-NEXT:    movb %cl, 15(%rdx)
2819; CHECK-BASELINE-NEXT:    movb %al, 14(%rdx)
2820; CHECK-BASELINE-NEXT:    movb %bl, 13(%rdx)
2821; CHECK-BASELINE-NEXT:    movb %bpl, 12(%rdx)
2822; CHECK-BASELINE-NEXT:    movb %r14b, 11(%rdx)
2823; CHECK-BASELINE-NEXT:    movb %r15b, 10(%rdx)
2824; CHECK-BASELINE-NEXT:    movb %r12b, 9(%rdx)
2825; CHECK-BASELINE-NEXT:    movb %r13b, 8(%rdx)
2826; CHECK-BASELINE-NEXT:    movb %r11b, 7(%rdx)
2827; CHECK-BASELINE-NEXT:    movb %r10b, 6(%rdx)
2828; CHECK-BASELINE-NEXT:    movb %dil, 5(%rdx)
2829; CHECK-BASELINE-NEXT:    movb %r9b, 4(%rdx)
2830; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
2831; CHECK-BASELINE-NEXT:    xorb %al, %r8b
2832; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %r8b
2833; CHECK-BASELINE-NEXT:    xorb %al, %r8b
2834; CHECK-BASELINE-NEXT:    movb %r8b, 3(%rdx)
2835; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
2836; CHECK-BASELINE-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Reload
2837; CHECK-BASELINE-NEXT:    xorb %al, %cl
2838; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %cl
2839; CHECK-BASELINE-NEXT:    xorb %al, %cl
2840; CHECK-BASELINE-NEXT:    movb %cl, 2(%rdx)
2841; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
2842; CHECK-BASELINE-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Reload
2843; CHECK-BASELINE-NEXT:    xorb %al, %cl
2844; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %cl
2845; CHECK-BASELINE-NEXT:    xorb %al, %cl
2846; CHECK-BASELINE-NEXT:    movb %cl, 1(%rdx)
2847; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
2848; CHECK-BASELINE-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Reload
2849; CHECK-BASELINE-NEXT:    xorb %al, %cl
2850; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %cl
2851; CHECK-BASELINE-NEXT:    xorb %al, %cl
2852; CHECK-BASELINE-NEXT:    movb %cl, (%rdx)
2853; CHECK-BASELINE-NEXT:    movq %rdx, %rax
2854; CHECK-BASELINE-NEXT:    popq %rbx
2855; CHECK-BASELINE-NEXT:    popq %r12
2856; CHECK-BASELINE-NEXT:    popq %r13
2857; CHECK-BASELINE-NEXT:    popq %r14
2858; CHECK-BASELINE-NEXT:    popq %r15
2859; CHECK-BASELINE-NEXT:    popq %rbp
2860; CHECK-BASELINE-NEXT:    retq
2861;
2862; CHECK-SSE1-LABEL: in_v16i8:
2863; CHECK-SSE1:       # %bb.0:
2864; CHECK-SSE1-NEXT:    pushq %rbp
2865; CHECK-SSE1-NEXT:    pushq %r15
2866; CHECK-SSE1-NEXT:    pushq %r14
2867; CHECK-SSE1-NEXT:    pushq %r13
2868; CHECK-SSE1-NEXT:    pushq %r12
2869; CHECK-SSE1-NEXT:    pushq %rbx
2870; CHECK-SSE1-NEXT:    movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
2871; CHECK-SSE1-NEXT:    movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
2872; CHECK-SSE1-NEXT:    movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
2873; CHECK-SSE1-NEXT:    movq %rdi, %rdx
2874; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %esi
2875; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %ecx
2876; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
2877; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %ebx
2878; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %ebp
2879; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %r14d
2880; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %r15d
2881; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %r12d
2882; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %r13d
2883; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %r11d
2884; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %r10d
2885; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %edi
2886; CHECK-SSE1-NEXT:    xorb %dil, %r9b
2887; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %r9b
2888; CHECK-SSE1-NEXT:    xorb %dil, %r9b
2889; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %edi
2890; CHECK-SSE1-NEXT:    xorb %r10b, %dil
2891; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %dil
2892; CHECK-SSE1-NEXT:    xorb %r10b, %dil
2893; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %r10d
2894; CHECK-SSE1-NEXT:    xorb %r11b, %r10b
2895; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %r10b
2896; CHECK-SSE1-NEXT:    xorb %r11b, %r10b
2897; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %r11d
2898; CHECK-SSE1-NEXT:    xorb %r13b, %r11b
2899; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %r11b
2900; CHECK-SSE1-NEXT:    xorb %r13b, %r11b
2901; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %r13d
2902; CHECK-SSE1-NEXT:    xorb %r12b, %r13b
2903; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %r13b
2904; CHECK-SSE1-NEXT:    xorb %r12b, %r13b
2905; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %r12d
2906; CHECK-SSE1-NEXT:    xorb %r15b, %r12b
2907; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %r12b
2908; CHECK-SSE1-NEXT:    xorb %r15b, %r12b
2909; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %r15d
2910; CHECK-SSE1-NEXT:    xorb %r14b, %r15b
2911; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %r15b
2912; CHECK-SSE1-NEXT:    xorb %r14b, %r15b
2913; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %r14d
2914; CHECK-SSE1-NEXT:    xorb %bpl, %r14b
2915; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %r14b
2916; CHECK-SSE1-NEXT:    xorb %bpl, %r14b
2917; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %ebp
2918; CHECK-SSE1-NEXT:    xorb %bl, %bpl
2919; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %bpl
2920; CHECK-SSE1-NEXT:    xorb %bl, %bpl
2921; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %ebx
2922; CHECK-SSE1-NEXT:    xorb %al, %bl
2923; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %bl
2924; CHECK-SSE1-NEXT:    xorb %al, %bl
2925; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
2926; CHECK-SSE1-NEXT:    xorb %cl, %al
2927; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %al
2928; CHECK-SSE1-NEXT:    xorb %cl, %al
2929; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %ecx
2930; CHECK-SSE1-NEXT:    xorb %sil, %cl
2931; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %cl
2932; CHECK-SSE1-NEXT:    xorb %sil, %cl
2933; CHECK-SSE1-NEXT:    movb %cl, 15(%rdx)
2934; CHECK-SSE1-NEXT:    movb %al, 14(%rdx)
2935; CHECK-SSE1-NEXT:    movb %bl, 13(%rdx)
2936; CHECK-SSE1-NEXT:    movb %bpl, 12(%rdx)
2937; CHECK-SSE1-NEXT:    movb %r14b, 11(%rdx)
2938; CHECK-SSE1-NEXT:    movb %r15b, 10(%rdx)
2939; CHECK-SSE1-NEXT:    movb %r12b, 9(%rdx)
2940; CHECK-SSE1-NEXT:    movb %r13b, 8(%rdx)
2941; CHECK-SSE1-NEXT:    movb %r11b, 7(%rdx)
2942; CHECK-SSE1-NEXT:    movb %r10b, 6(%rdx)
2943; CHECK-SSE1-NEXT:    movb %dil, 5(%rdx)
2944; CHECK-SSE1-NEXT:    movb %r9b, 4(%rdx)
2945; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
2946; CHECK-SSE1-NEXT:    xorb %al, %r8b
2947; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %r8b
2948; CHECK-SSE1-NEXT:    xorb %al, %r8b
2949; CHECK-SSE1-NEXT:    movb %r8b, 3(%rdx)
2950; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
2951; CHECK-SSE1-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Reload
2952; CHECK-SSE1-NEXT:    xorb %al, %cl
2953; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %cl
2954; CHECK-SSE1-NEXT:    xorb %al, %cl
2955; CHECK-SSE1-NEXT:    movb %cl, 2(%rdx)
2956; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
2957; CHECK-SSE1-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Reload
2958; CHECK-SSE1-NEXT:    xorb %al, %cl
2959; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %cl
2960; CHECK-SSE1-NEXT:    xorb %al, %cl
2961; CHECK-SSE1-NEXT:    movb %cl, 1(%rdx)
2962; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
2963; CHECK-SSE1-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Reload
2964; CHECK-SSE1-NEXT:    xorb %al, %cl
2965; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %cl
2966; CHECK-SSE1-NEXT:    xorb %al, %cl
2967; CHECK-SSE1-NEXT:    movb %cl, (%rdx)
2968; CHECK-SSE1-NEXT:    movq %rdx, %rax
2969; CHECK-SSE1-NEXT:    popq %rbx
2970; CHECK-SSE1-NEXT:    popq %r12
2971; CHECK-SSE1-NEXT:    popq %r13
2972; CHECK-SSE1-NEXT:    popq %r14
2973; CHECK-SSE1-NEXT:    popq %r15
2974; CHECK-SSE1-NEXT:    popq %rbp
2975; CHECK-SSE1-NEXT:    retq
2976;
2977; CHECK-SSE2-LABEL: in_v16i8:
2978; CHECK-SSE2:       # %bb.0:
2979; CHECK-SSE2-NEXT:    andps %xmm2, %xmm0
2980; CHECK-SSE2-NEXT:    andnps %xmm1, %xmm2
2981; CHECK-SSE2-NEXT:    orps %xmm2, %xmm0
2982; CHECK-SSE2-NEXT:    retq
2983;
2984; CHECK-XOP-LABEL: in_v16i8:
2985; CHECK-XOP:       # %bb.0:
2986; CHECK-XOP-NEXT:    vpcmov %xmm2, %xmm1, %xmm0, %xmm0
2987; CHECK-XOP-NEXT:    retq
2988  %n0 = xor <16 x i8> %x, %y
2989  %n1 = and <16 x i8> %n0, %mask
2990  %r = xor <16 x i8> %n1, %y
2991  ret <16 x i8> %r
2992}
2993
; in_v8i16: the "in" (xor/and/xor) spelling of a variable-mask merge,
;   r = ((x ^ y) & mask) ^ y
; on <8 x i16> operands passed by value. Bit for bit this selects x where
; mask is set and y where it is clear.
; Expected codegen per run line, as recorded in the assertions below:
;   * no SSE and SSE1-only: the vector is scalarized; every i16 lane gets its
;     own xor / andw / xor sequence and is stored with movw through the
;     pointer that arrives in rdi and is copied to rax.
;   * SSE2: three whole-register ops, andps / andnps / orps.
;   * XOP: a single vpcmov.
2994define <8 x i16> @in_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %mask) nounwind {
2995; CHECK-BASELINE-LABEL: in_v8i16:
2996; CHECK-BASELINE:       # %bb.0:
2997; CHECK-BASELINE-NEXT:    pushq %rbx
2998; CHECK-BASELINE-NEXT:    movq %rdi, %rax
2999; CHECK-BASELINE-NEXT:    movl {{[0-9]+}}(%rsp), %edi
3000; CHECK-BASELINE-NEXT:    movl {{[0-9]+}}(%rsp), %r10d
3001; CHECK-BASELINE-NEXT:    movl {{[0-9]+}}(%rsp), %r11d
3002; CHECK-BASELINE-NEXT:    movl {{[0-9]+}}(%rsp), %ebx
3003; CHECK-BASELINE-NEXT:    xorl %ebx, %esi
3004; CHECK-BASELINE-NEXT:    andw {{[0-9]+}}(%rsp), %si
3005; CHECK-BASELINE-NEXT:    xorl %ebx, %esi
3006; CHECK-BASELINE-NEXT:    movl {{[0-9]+}}(%rsp), %ebx
3007; CHECK-BASELINE-NEXT:    xorl %ebx, %edx
3008; CHECK-BASELINE-NEXT:    andw {{[0-9]+}}(%rsp), %dx
3009; CHECK-BASELINE-NEXT:    xorl %ebx, %edx
3010; CHECK-BASELINE-NEXT:    movl {{[0-9]+}}(%rsp), %ebx
3011; CHECK-BASELINE-NEXT:    xorl %ebx, %ecx
3012; CHECK-BASELINE-NEXT:    andw {{[0-9]+}}(%rsp), %cx
3013; CHECK-BASELINE-NEXT:    xorl %ebx, %ecx
3014; CHECK-BASELINE-NEXT:    movl {{[0-9]+}}(%rsp), %ebx
3015; CHECK-BASELINE-NEXT:    xorl %ebx, %r8d
3016; CHECK-BASELINE-NEXT:    andw {{[0-9]+}}(%rsp), %r8w
3017; CHECK-BASELINE-NEXT:    xorl %ebx, %r8d
3018; CHECK-BASELINE-NEXT:    movl {{[0-9]+}}(%rsp), %ebx
3019; CHECK-BASELINE-NEXT:    xorl %ebx, %r9d
3020; CHECK-BASELINE-NEXT:    andw {{[0-9]+}}(%rsp), %r9w
3021; CHECK-BASELINE-NEXT:    xorl %ebx, %r9d
3022; CHECK-BASELINE-NEXT:    movzwl {{[0-9]+}}(%rsp), %ebx
3023; CHECK-BASELINE-NEXT:    xorw %r11w, %bx
3024; CHECK-BASELINE-NEXT:    andw {{[0-9]+}}(%rsp), %bx
3025; CHECK-BASELINE-NEXT:    xorl %r11d, %ebx
3026; CHECK-BASELINE-NEXT:    movzwl {{[0-9]+}}(%rsp), %r11d
3027; CHECK-BASELINE-NEXT:    xorw %r10w, %r11w
3028; CHECK-BASELINE-NEXT:    andw {{[0-9]+}}(%rsp), %r11w
3029; CHECK-BASELINE-NEXT:    xorl %r10d, %r11d
3030; CHECK-BASELINE-NEXT:    movzwl {{[0-9]+}}(%rsp), %r10d
3031; CHECK-BASELINE-NEXT:    xorw %di, %r10w
3032; CHECK-BASELINE-NEXT:    andw {{[0-9]+}}(%rsp), %r10w
3033; CHECK-BASELINE-NEXT:    xorl %edi, %r10d
3034; CHECK-BASELINE-NEXT:    movw %r10w, 14(%rax)
3035; CHECK-BASELINE-NEXT:    movw %r11w, 12(%rax)
3036; CHECK-BASELINE-NEXT:    movw %bx, 10(%rax)
3037; CHECK-BASELINE-NEXT:    movw %r9w, 8(%rax)
3038; CHECK-BASELINE-NEXT:    movw %r8w, 6(%rax)
3039; CHECK-BASELINE-NEXT:    movw %cx, 4(%rax)
3040; CHECK-BASELINE-NEXT:    movw %dx, 2(%rax)
3041; CHECK-BASELINE-NEXT:    movw %si, (%rax)
3042; CHECK-BASELINE-NEXT:    popq %rbx
3043; CHECK-BASELINE-NEXT:    retq
3044;
3045; CHECK-SSE1-LABEL: in_v8i16:
3046; CHECK-SSE1:       # %bb.0:
3047; CHECK-SSE1-NEXT:    pushq %rbx
3048; CHECK-SSE1-NEXT:    movq %rdi, %rax
3049; CHECK-SSE1-NEXT:    movl {{[0-9]+}}(%rsp), %edi
3050; CHECK-SSE1-NEXT:    movl {{[0-9]+}}(%rsp), %r10d
3051; CHECK-SSE1-NEXT:    movl {{[0-9]+}}(%rsp), %r11d
3052; CHECK-SSE1-NEXT:    movl {{[0-9]+}}(%rsp), %ebx
3053; CHECK-SSE1-NEXT:    xorl %ebx, %esi
3054; CHECK-SSE1-NEXT:    andw {{[0-9]+}}(%rsp), %si
3055; CHECK-SSE1-NEXT:    xorl %ebx, %esi
3056; CHECK-SSE1-NEXT:    movl {{[0-9]+}}(%rsp), %ebx
3057; CHECK-SSE1-NEXT:    xorl %ebx, %edx
3058; CHECK-SSE1-NEXT:    andw {{[0-9]+}}(%rsp), %dx
3059; CHECK-SSE1-NEXT:    xorl %ebx, %edx
3060; CHECK-SSE1-NEXT:    movl {{[0-9]+}}(%rsp), %ebx
3061; CHECK-SSE1-NEXT:    xorl %ebx, %ecx
3062; CHECK-SSE1-NEXT:    andw {{[0-9]+}}(%rsp), %cx
3063; CHECK-SSE1-NEXT:    xorl %ebx, %ecx
3064; CHECK-SSE1-NEXT:    movl {{[0-9]+}}(%rsp), %ebx
3065; CHECK-SSE1-NEXT:    xorl %ebx, %r8d
3066; CHECK-SSE1-NEXT:    andw {{[0-9]+}}(%rsp), %r8w
3067; CHECK-SSE1-NEXT:    xorl %ebx, %r8d
3068; CHECK-SSE1-NEXT:    movl {{[0-9]+}}(%rsp), %ebx
3069; CHECK-SSE1-NEXT:    xorl %ebx, %r9d
3070; CHECK-SSE1-NEXT:    andw {{[0-9]+}}(%rsp), %r9w
3071; CHECK-SSE1-NEXT:    xorl %ebx, %r9d
3072; CHECK-SSE1-NEXT:    movzwl {{[0-9]+}}(%rsp), %ebx
3073; CHECK-SSE1-NEXT:    xorw %r11w, %bx
3074; CHECK-SSE1-NEXT:    andw {{[0-9]+}}(%rsp), %bx
3075; CHECK-SSE1-NEXT:    xorl %r11d, %ebx
3076; CHECK-SSE1-NEXT:    movzwl {{[0-9]+}}(%rsp), %r11d
3077; CHECK-SSE1-NEXT:    xorw %r10w, %r11w
3078; CHECK-SSE1-NEXT:    andw {{[0-9]+}}(%rsp), %r11w
3079; CHECK-SSE1-NEXT:    xorl %r10d, %r11d
3080; CHECK-SSE1-NEXT:    movzwl {{[0-9]+}}(%rsp), %r10d
3081; CHECK-SSE1-NEXT:    xorw %di, %r10w
3082; CHECK-SSE1-NEXT:    andw {{[0-9]+}}(%rsp), %r10w
3083; CHECK-SSE1-NEXT:    xorl %edi, %r10d
3084; CHECK-SSE1-NEXT:    movw %r10w, 14(%rax)
3085; CHECK-SSE1-NEXT:    movw %r11w, 12(%rax)
3086; CHECK-SSE1-NEXT:    movw %bx, 10(%rax)
3087; CHECK-SSE1-NEXT:    movw %r9w, 8(%rax)
3088; CHECK-SSE1-NEXT:    movw %r8w, 6(%rax)
3089; CHECK-SSE1-NEXT:    movw %cx, 4(%rax)
3090; CHECK-SSE1-NEXT:    movw %dx, 2(%rax)
3091; CHECK-SSE1-NEXT:    movw %si, (%rax)
3092; CHECK-SSE1-NEXT:    popq %rbx
3093; CHECK-SSE1-NEXT:    retq
3094;
3095; CHECK-SSE2-LABEL: in_v8i16:
3096; CHECK-SSE2:       # %bb.0:
3097; CHECK-SSE2-NEXT:    andps %xmm2, %xmm0
3098; CHECK-SSE2-NEXT:    andnps %xmm1, %xmm2
3099; CHECK-SSE2-NEXT:    orps %xmm2, %xmm0
3100; CHECK-SSE2-NEXT:    retq
3101;
3102; CHECK-XOP-LABEL: in_v8i16:
3103; CHECK-XOP:       # %bb.0:
3104; CHECK-XOP-NEXT:    vpcmov %xmm2, %xmm1, %xmm0, %xmm0
3105; CHECK-XOP-NEXT:    retq
  ; Pattern under test: xor x with y, keep only the masked bits, xor y back in.
3106  %n0 = xor <8 x i16> %x, %y
3107  %n1 = and <8 x i16> %n0, %mask
3108  %r = xor <8 x i16> %n1, %y
3109  ret <8 x i16> %r
3110}
3111
; in_v4i32: the r = ((x ^ y) & mask) ^ y merge pattern on <4 x i32>. Here the
; three operands are loaded (align 16) from the pointer arguments rather than
; being passed by value.
; Expected codegen per run line, as recorded in the assertions below:
;   * no SSE: four scalar 32-bit xor / and / xor chains; y is read through
;     rdx, x through rsi, mask through rcx, and the lanes are stored through
;     the pointer copied from rdi into rax.
;   * SSE1-only: movaps / andnps / andps / orps on the memory operands, with
;     the result written back by movaps to (%rdi).
;   * SSE2: andnps / andps / orps with folded loads; the result stays in xmm0.
;   * XOP: x and mask are loaded with vmovdqa and y is folded into vpcmov.
3112define <4 x i32> @in_v4i32(ptr%px, ptr%py, ptr%pmask) nounwind {
3113; CHECK-BASELINE-LABEL: in_v4i32:
3114; CHECK-BASELINE:       # %bb.0:
3115; CHECK-BASELINE-NEXT:    pushq %rbx
3116; CHECK-BASELINE-NEXT:    movq %rdi, %rax
3117; CHECK-BASELINE-NEXT:    movl 12(%rdx), %edi
3118; CHECK-BASELINE-NEXT:    movl 8(%rdx), %r8d
3119; CHECK-BASELINE-NEXT:    movl (%rdx), %r9d
3120; CHECK-BASELINE-NEXT:    movl 4(%rdx), %r10d
3121; CHECK-BASELINE-NEXT:    movl (%rsi), %edx
3122; CHECK-BASELINE-NEXT:    xorl %r9d, %edx
3123; CHECK-BASELINE-NEXT:    movl 4(%rsi), %r11d
3124; CHECK-BASELINE-NEXT:    xorl %r10d, %r11d
3125; CHECK-BASELINE-NEXT:    movl 8(%rsi), %ebx
3126; CHECK-BASELINE-NEXT:    xorl %r8d, %ebx
3127; CHECK-BASELINE-NEXT:    movl 12(%rsi), %esi
3128; CHECK-BASELINE-NEXT:    xorl %edi, %esi
3129; CHECK-BASELINE-NEXT:    andl 12(%rcx), %esi
3130; CHECK-BASELINE-NEXT:    andl 8(%rcx), %ebx
3131; CHECK-BASELINE-NEXT:    andl 4(%rcx), %r11d
3132; CHECK-BASELINE-NEXT:    andl (%rcx), %edx
3133; CHECK-BASELINE-NEXT:    xorl %r9d, %edx
3134; CHECK-BASELINE-NEXT:    xorl %r10d, %r11d
3135; CHECK-BASELINE-NEXT:    xorl %r8d, %ebx
3136; CHECK-BASELINE-NEXT:    xorl %edi, %esi
3137; CHECK-BASELINE-NEXT:    movl %esi, 12(%rax)
3138; CHECK-BASELINE-NEXT:    movl %ebx, 8(%rax)
3139; CHECK-BASELINE-NEXT:    movl %r11d, 4(%rax)
3140; CHECK-BASELINE-NEXT:    movl %edx, (%rax)
3141; CHECK-BASELINE-NEXT:    popq %rbx
3142; CHECK-BASELINE-NEXT:    retq
3143;
3144; CHECK-SSE1-LABEL: in_v4i32:
3145; CHECK-SSE1:       # %bb.0:
3146; CHECK-SSE1-NEXT:    movq %rdi, %rax
3147; CHECK-SSE1-NEXT:    movaps (%rcx), %xmm0
3148; CHECK-SSE1-NEXT:    movaps %xmm0, %xmm1
3149; CHECK-SSE1-NEXT:    andnps (%rdx), %xmm1
3150; CHECK-SSE1-NEXT:    andps (%rsi), %xmm0
3151; CHECK-SSE1-NEXT:    orps %xmm1, %xmm0
3152; CHECK-SSE1-NEXT:    movaps %xmm0, (%rdi)
3153; CHECK-SSE1-NEXT:    retq
3154;
3155; CHECK-SSE2-LABEL: in_v4i32:
3156; CHECK-SSE2:       # %bb.0:
3157; CHECK-SSE2-NEXT:    movaps (%rdx), %xmm0
3158; CHECK-SSE2-NEXT:    movaps %xmm0, %xmm1
3159; CHECK-SSE2-NEXT:    andnps (%rsi), %xmm1
3160; CHECK-SSE2-NEXT:    andps (%rdi), %xmm0
3161; CHECK-SSE2-NEXT:    orps %xmm1, %xmm0
3162; CHECK-SSE2-NEXT:    retq
3163;
3164; CHECK-XOP-LABEL: in_v4i32:
3165; CHECK-XOP:       # %bb.0:
3166; CHECK-XOP-NEXT:    vmovdqa (%rdi), %xmm0
3167; CHECK-XOP-NEXT:    vmovdqa (%rdx), %xmm1
3168; CHECK-XOP-NEXT:    vpcmov %xmm1, (%rsi), %xmm0, %xmm0
3169; CHECK-XOP-NEXT:    retq
  ; Fetch the three 16-byte-aligned operands from memory.
3170  %x = load <4 x i32>, ptr%px, align 16
3171  %y = load <4 x i32>, ptr%py, align 16
3172  %mask = load <4 x i32>, ptr%pmask, align 16
  ; Pattern under test: xor x with y, keep only the masked bits, xor y back in.
3173  %n0 = xor <4 x i32> %x, %y
3174  %n1 = and <4 x i32> %n0, %mask
3175  %r = xor <4 x i32> %n1, %y
3176  ret <4 x i32> %r
3177}
3178
; in_v2i64: the r = ((x ^ y) & mask) ^ y merge pattern on <2 x i64> operands
; passed by value.
; Expected codegen per run line, as recorded in the assertions below:
;   * no SSE and SSE1-only (identical sequences): each i64 lane is merged in
;     general-purpose registers with xorq / andq / xorq, and the two lanes
;     are left in rax and rdx.
;   * SSE2: andps / andnps / orps.
;   * XOP: a single vpcmov.
3179define <2 x i64> @in_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %mask) nounwind {
3180; CHECK-BASELINE-LABEL: in_v2i64:
3181; CHECK-BASELINE:       # %bb.0:
3182; CHECK-BASELINE-NEXT:    movq %rdi, %rax
3183; CHECK-BASELINE-NEXT:    xorq %rdx, %rax
3184; CHECK-BASELINE-NEXT:    xorq %rcx, %rsi
3185; CHECK-BASELINE-NEXT:    andq %r9, %rsi
3186; CHECK-BASELINE-NEXT:    andq %r8, %rax
3187; CHECK-BASELINE-NEXT:    xorq %rdx, %rax
3188; CHECK-BASELINE-NEXT:    xorq %rcx, %rsi
3189; CHECK-BASELINE-NEXT:    movq %rsi, %rdx
3190; CHECK-BASELINE-NEXT:    retq
3191;
3192; CHECK-SSE1-LABEL: in_v2i64:
3193; CHECK-SSE1:       # %bb.0:
3194; CHECK-SSE1-NEXT:    movq %rdi, %rax
3195; CHECK-SSE1-NEXT:    xorq %rdx, %rax
3196; CHECK-SSE1-NEXT:    xorq %rcx, %rsi
3197; CHECK-SSE1-NEXT:    andq %r9, %rsi
3198; CHECK-SSE1-NEXT:    andq %r8, %rax
3199; CHECK-SSE1-NEXT:    xorq %rdx, %rax
3200; CHECK-SSE1-NEXT:    xorq %rcx, %rsi
3201; CHECK-SSE1-NEXT:    movq %rsi, %rdx
3202; CHECK-SSE1-NEXT:    retq
3203;
3204; CHECK-SSE2-LABEL: in_v2i64:
3205; CHECK-SSE2:       # %bb.0:
3206; CHECK-SSE2-NEXT:    andps %xmm2, %xmm0
3207; CHECK-SSE2-NEXT:    andnps %xmm1, %xmm2
3208; CHECK-SSE2-NEXT:    orps %xmm2, %xmm0
3209; CHECK-SSE2-NEXT:    retq
3210;
3211; CHECK-XOP-LABEL: in_v2i64:
3212; CHECK-XOP:       # %bb.0:
3213; CHECK-XOP-NEXT:    vpcmov %xmm2, %xmm1, %xmm0, %xmm0
3214; CHECK-XOP-NEXT:    retq
  ; Pattern under test: xor x with y, keep only the masked bits, xor y back in.
3215  %n0 = xor <2 x i64> %x, %y
3216  %n1 = and <2 x i64> %n0, %mask
3217  %r = xor <2 x i64> %n1, %y
3218  ret <2 x i64> %r
3219}
3220
3221; ============================================================================ ;
3222; 256-bit vector width
3223; ============================================================================ ;
3224
3225define <32 x i8> @in_v32i8(ptr%px, ptr%py, ptr%pmask) nounwind {
3226; CHECK-BASELINE-LABEL: in_v32i8:
3227; CHECK-BASELINE:       # %bb.0:
3228; CHECK-BASELINE-NEXT:    pushq %rbp
3229; CHECK-BASELINE-NEXT:    pushq %r15
3230; CHECK-BASELINE-NEXT:    pushq %r14
3231; CHECK-BASELINE-NEXT:    pushq %r13
3232; CHECK-BASELINE-NEXT:    pushq %r12
3233; CHECK-BASELINE-NEXT:    pushq %rbx
3234; CHECK-BASELINE-NEXT:    movq %rcx, %r12
3235; CHECK-BASELINE-NEXT:    movq %rdx, %r15
3236; CHECK-BASELINE-NEXT:    movq %rsi, %r14
3237; CHECK-BASELINE-NEXT:    movq %rdi, %r13
3238; CHECK-BASELINE-NEXT:    movzbl 15(%rdx), %eax
3239; CHECK-BASELINE-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
3240; CHECK-BASELINE-NEXT:    movzbl 14(%rdx), %eax
3241; CHECK-BASELINE-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
3242; CHECK-BASELINE-NEXT:    movzbl 13(%rdx), %eax
3243; CHECK-BASELINE-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
3244; CHECK-BASELINE-NEXT:    movzbl 12(%rdx), %eax
3245; CHECK-BASELINE-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
3246; CHECK-BASELINE-NEXT:    movzbl 11(%rdx), %eax
3247; CHECK-BASELINE-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
3248; CHECK-BASELINE-NEXT:    movzbl 10(%rdx), %eax
3249; CHECK-BASELINE-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
3250; CHECK-BASELINE-NEXT:    movzbl 9(%rdx), %r8d
3251; CHECK-BASELINE-NEXT:    movzbl 8(%rdx), %r9d
3252; CHECK-BASELINE-NEXT:    movzbl 7(%rdx), %r10d
3253; CHECK-BASELINE-NEXT:    movzbl 6(%rdx), %ebp
3254; CHECK-BASELINE-NEXT:    movzbl 5(%rdx), %edi
3255; CHECK-BASELINE-NEXT:    movzbl 4(%rdx), %esi
3256; CHECK-BASELINE-NEXT:    movzbl 3(%rdx), %eax
3257; CHECK-BASELINE-NEXT:    movzbl 2(%rdx), %ecx
3258; CHECK-BASELINE-NEXT:    movzbl (%rdx), %r11d
3259; CHECK-BASELINE-NEXT:    movzbl 1(%rdx), %edx
3260; CHECK-BASELINE-NEXT:    movzbl (%r14), %ebx
3261; CHECK-BASELINE-NEXT:    xorb %r11b, %bl
3262; CHECK-BASELINE-NEXT:    andb (%r12), %bl
3263; CHECK-BASELINE-NEXT:    xorb %r11b, %bl
3264; CHECK-BASELINE-NEXT:    movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
3265; CHECK-BASELINE-NEXT:    movzbl 1(%r14), %r11d
3266; CHECK-BASELINE-NEXT:    xorb %dl, %r11b
3267; CHECK-BASELINE-NEXT:    andb 1(%r12), %r11b
3268; CHECK-BASELINE-NEXT:    xorb %dl, %r11b
3269; CHECK-BASELINE-NEXT:    movb %r11b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
3270; CHECK-BASELINE-NEXT:    movzbl 2(%r14), %edx
3271; CHECK-BASELINE-NEXT:    xorb %cl, %dl
3272; CHECK-BASELINE-NEXT:    andb 2(%r12), %dl
3273; CHECK-BASELINE-NEXT:    xorb %cl, %dl
3274; CHECK-BASELINE-NEXT:    movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
3275; CHECK-BASELINE-NEXT:    movzbl 3(%r14), %ecx
3276; CHECK-BASELINE-NEXT:    xorb %al, %cl
3277; CHECK-BASELINE-NEXT:    andb 3(%r12), %cl
3278; CHECK-BASELINE-NEXT:    xorb %al, %cl
3279; CHECK-BASELINE-NEXT:    movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
3280; CHECK-BASELINE-NEXT:    movzbl 4(%r14), %eax
3281; CHECK-BASELINE-NEXT:    xorb %sil, %al
3282; CHECK-BASELINE-NEXT:    andb 4(%r12), %al
3283; CHECK-BASELINE-NEXT:    xorb %sil, %al
3284; CHECK-BASELINE-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
3285; CHECK-BASELINE-NEXT:    movzbl 5(%r14), %eax
3286; CHECK-BASELINE-NEXT:    xorb %dil, %al
3287; CHECK-BASELINE-NEXT:    andb 5(%r12), %al
3288; CHECK-BASELINE-NEXT:    xorb %dil, %al
3289; CHECK-BASELINE-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
3290; CHECK-BASELINE-NEXT:    movzbl 6(%r14), %eax
3291; CHECK-BASELINE-NEXT:    xorb %bpl, %al
3292; CHECK-BASELINE-NEXT:    andb 6(%r12), %al
3293; CHECK-BASELINE-NEXT:    xorb %bpl, %al
3294; CHECK-BASELINE-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
3295; CHECK-BASELINE-NEXT:    movzbl 7(%r14), %eax
3296; CHECK-BASELINE-NEXT:    xorb %r10b, %al
3297; CHECK-BASELINE-NEXT:    andb 7(%r12), %al
3298; CHECK-BASELINE-NEXT:    xorb %r10b, %al
3299; CHECK-BASELINE-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
3300; CHECK-BASELINE-NEXT:    movzbl 8(%r14), %eax
3301; CHECK-BASELINE-NEXT:    xorb %r9b, %al
3302; CHECK-BASELINE-NEXT:    andb 8(%r12), %al
3303; CHECK-BASELINE-NEXT:    xorb %r9b, %al
3304; CHECK-BASELINE-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
3305; CHECK-BASELINE-NEXT:    movzbl 9(%r14), %eax
3306; CHECK-BASELINE-NEXT:    xorb %r8b, %al
3307; CHECK-BASELINE-NEXT:    andb 9(%r12), %al
3308; CHECK-BASELINE-NEXT:    xorb %r8b, %al
3309; CHECK-BASELINE-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
3310; CHECK-BASELINE-NEXT:    movzbl 10(%r14), %ecx
3311; CHECK-BASELINE-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
3312; CHECK-BASELINE-NEXT:    xorb %al, %cl
3313; CHECK-BASELINE-NEXT:    andb 10(%r12), %cl
3314; CHECK-BASELINE-NEXT:    xorb %al, %cl
3315; CHECK-BASELINE-NEXT:    movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
3316; CHECK-BASELINE-NEXT:    movzbl 11(%r14), %ecx
3317; CHECK-BASELINE-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
3318; CHECK-BASELINE-NEXT:    xorb %al, %cl
3319; CHECK-BASELINE-NEXT:    andb 11(%r12), %cl
3320; CHECK-BASELINE-NEXT:    xorb %al, %cl
3321; CHECK-BASELINE-NEXT:    movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
3322; CHECK-BASELINE-NEXT:    movzbl 12(%r14), %ecx
3323; CHECK-BASELINE-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
3324; CHECK-BASELINE-NEXT:    xorb %al, %cl
3325; CHECK-BASELINE-NEXT:    andb 12(%r12), %cl
3326; CHECK-BASELINE-NEXT:    xorb %al, %cl
3327; CHECK-BASELINE-NEXT:    movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
3328; CHECK-BASELINE-NEXT:    movzbl 13(%r14), %ecx
3329; CHECK-BASELINE-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
3330; CHECK-BASELINE-NEXT:    xorb %al, %cl
3331; CHECK-BASELINE-NEXT:    andb 13(%r12), %cl
3332; CHECK-BASELINE-NEXT:    xorb %al, %cl
3333; CHECK-BASELINE-NEXT:    movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
3334; CHECK-BASELINE-NEXT:    movzbl 14(%r14), %ecx
3335; CHECK-BASELINE-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
3336; CHECK-BASELINE-NEXT:    xorb %al, %cl
3337; CHECK-BASELINE-NEXT:    andb 14(%r12), %cl
3338; CHECK-BASELINE-NEXT:    xorb %al, %cl
3339; CHECK-BASELINE-NEXT:    movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
3340; CHECK-BASELINE-NEXT:    movzbl 15(%r14), %ecx
3341; CHECK-BASELINE-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
3342; CHECK-BASELINE-NEXT:    xorb %al, %cl
3343; CHECK-BASELINE-NEXT:    andb 15(%r12), %cl
3344; CHECK-BASELINE-NEXT:    xorb %al, %cl
3345; CHECK-BASELINE-NEXT:    movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
3346; CHECK-BASELINE-NEXT:    movzbl 16(%r15), %eax
3347; CHECK-BASELINE-NEXT:    movzbl 16(%r14), %ecx
3348; CHECK-BASELINE-NEXT:    xorb %al, %cl
3349; CHECK-BASELINE-NEXT:    andb 16(%r12), %cl
3350; CHECK-BASELINE-NEXT:    xorb %al, %cl
3351; CHECK-BASELINE-NEXT:    movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
3352; CHECK-BASELINE-NEXT:    movzbl 17(%r15), %eax
3353; CHECK-BASELINE-NEXT:    movzbl 17(%r14), %ecx
3354; CHECK-BASELINE-NEXT:    xorb %al, %cl
3355; CHECK-BASELINE-NEXT:    andb 17(%r12), %cl
3356; CHECK-BASELINE-NEXT:    xorb %al, %cl
3357; CHECK-BASELINE-NEXT:    movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
3358; CHECK-BASELINE-NEXT:    movzbl 18(%r15), %eax
3359; CHECK-BASELINE-NEXT:    movzbl 18(%r14), %ecx
3360; CHECK-BASELINE-NEXT:    xorb %al, %cl
3361; CHECK-BASELINE-NEXT:    andb 18(%r12), %cl
3362; CHECK-BASELINE-NEXT:    xorb %al, %cl
3363; CHECK-BASELINE-NEXT:    movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
3364; CHECK-BASELINE-NEXT:    movzbl 19(%r15), %eax
3365; CHECK-BASELINE-NEXT:    movzbl 19(%r14), %ecx
3366; CHECK-BASELINE-NEXT:    xorb %al, %cl
3367; CHECK-BASELINE-NEXT:    andb 19(%r12), %cl
3368; CHECK-BASELINE-NEXT:    xorb %al, %cl
3369; CHECK-BASELINE-NEXT:    movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
3370; CHECK-BASELINE-NEXT:    movzbl 20(%r15), %eax
3371; CHECK-BASELINE-NEXT:    movzbl 20(%r14), %ecx
3372; CHECK-BASELINE-NEXT:    xorb %al, %cl
3373; CHECK-BASELINE-NEXT:    andb 20(%r12), %cl
3374; CHECK-BASELINE-NEXT:    xorb %al, %cl
3375; CHECK-BASELINE-NEXT:    movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
3376; CHECK-BASELINE-NEXT:    movzbl 21(%r15), %eax
3377; CHECK-BASELINE-NEXT:    movzbl 21(%r14), %ebp
3378; CHECK-BASELINE-NEXT:    xorb %al, %bpl
3379; CHECK-BASELINE-NEXT:    andb 21(%r12), %bpl
3380; CHECK-BASELINE-NEXT:    xorb %al, %bpl
3381; CHECK-BASELINE-NEXT:    movzbl 22(%r15), %eax
3382; CHECK-BASELINE-NEXT:    movzbl 22(%r14), %ebx
3383; CHECK-BASELINE-NEXT:    xorb %al, %bl
3384; CHECK-BASELINE-NEXT:    andb 22(%r12), %bl
3385; CHECK-BASELINE-NEXT:    xorb %al, %bl
3386; CHECK-BASELINE-NEXT:    movzbl 23(%r15), %eax
3387; CHECK-BASELINE-NEXT:    movzbl 23(%r14), %r11d
3388; CHECK-BASELINE-NEXT:    xorb %al, %r11b
3389; CHECK-BASELINE-NEXT:    andb 23(%r12), %r11b
3390; CHECK-BASELINE-NEXT:    xorb %al, %r11b
3391; CHECK-BASELINE-NEXT:    movzbl 24(%r15), %eax
3392; CHECK-BASELINE-NEXT:    movzbl 24(%r14), %r9d
3393; CHECK-BASELINE-NEXT:    xorb %al, %r9b
3394; CHECK-BASELINE-NEXT:    andb 24(%r12), %r9b
3395; CHECK-BASELINE-NEXT:    xorb %al, %r9b
3396; CHECK-BASELINE-NEXT:    movzbl 25(%r15), %eax
3397; CHECK-BASELINE-NEXT:    movzbl 25(%r14), %r8d
3398; CHECK-BASELINE-NEXT:    xorb %al, %r8b
3399; CHECK-BASELINE-NEXT:    andb 25(%r12), %r8b
3400; CHECK-BASELINE-NEXT:    xorb %al, %r8b
3401; CHECK-BASELINE-NEXT:    movzbl 26(%r15), %eax
3402; CHECK-BASELINE-NEXT:    movzbl 26(%r14), %edi
3403; CHECK-BASELINE-NEXT:    xorb %al, %dil
3404; CHECK-BASELINE-NEXT:    andb 26(%r12), %dil
3405; CHECK-BASELINE-NEXT:    xorb %al, %dil
3406; CHECK-BASELINE-NEXT:    movzbl 27(%r15), %eax
3407; CHECK-BASELINE-NEXT:    movzbl 27(%r14), %esi
3408; CHECK-BASELINE-NEXT:    xorb %al, %sil
3409; CHECK-BASELINE-NEXT:    andb 27(%r12), %sil
3410; CHECK-BASELINE-NEXT:    xorb %al, %sil
3411; CHECK-BASELINE-NEXT:    movzbl 28(%r15), %eax
3412; CHECK-BASELINE-NEXT:    movzbl 28(%r14), %edx
3413; CHECK-BASELINE-NEXT:    xorb %al, %dl
3414; CHECK-BASELINE-NEXT:    andb 28(%r12), %dl
3415; CHECK-BASELINE-NEXT:    xorb %al, %dl
3416; CHECK-BASELINE-NEXT:    movzbl 29(%r15), %eax
3417; CHECK-BASELINE-NEXT:    movzbl 29(%r14), %ecx
3418; CHECK-BASELINE-NEXT:    xorb %al, %cl
3419; CHECK-BASELINE-NEXT:    andb 29(%r12), %cl
3420; CHECK-BASELINE-NEXT:    xorb %al, %cl
3421; CHECK-BASELINE-NEXT:    movzbl 30(%r15), %r10d
3422; CHECK-BASELINE-NEXT:    movzbl 30(%r14), %eax
3423; CHECK-BASELINE-NEXT:    xorb %r10b, %al
3424; CHECK-BASELINE-NEXT:    andb 30(%r12), %al
3425; CHECK-BASELINE-NEXT:    xorb %r10b, %al
3426; CHECK-BASELINE-NEXT:    movzbl 31(%r15), %r10d
3427; CHECK-BASELINE-NEXT:    movzbl 31(%r14), %r14d
3428; CHECK-BASELINE-NEXT:    xorb %r10b, %r14b
3429; CHECK-BASELINE-NEXT:    andb 31(%r12), %r14b
3430; CHECK-BASELINE-NEXT:    xorb %r10b, %r14b
3431; CHECK-BASELINE-NEXT:    movb %r14b, 31(%r13)
3432; CHECK-BASELINE-NEXT:    movb %al, 30(%r13)
3433; CHECK-BASELINE-NEXT:    movb %cl, 29(%r13)
3434; CHECK-BASELINE-NEXT:    movb %dl, 28(%r13)
3435; CHECK-BASELINE-NEXT:    movb %sil, 27(%r13)
3436; CHECK-BASELINE-NEXT:    movb %dil, 26(%r13)
3437; CHECK-BASELINE-NEXT:    movb %r8b, 25(%r13)
3438; CHECK-BASELINE-NEXT:    movb %r9b, 24(%r13)
3439; CHECK-BASELINE-NEXT:    movb %r11b, 23(%r13)
3440; CHECK-BASELINE-NEXT:    movb %bl, 22(%r13)
3441; CHECK-BASELINE-NEXT:    movb %bpl, 21(%r13)
3442; CHECK-BASELINE-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
3443; CHECK-BASELINE-NEXT:    movb %al, 20(%r13)
3444; CHECK-BASELINE-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
3445; CHECK-BASELINE-NEXT:    movb %al, 19(%r13)
3446; CHECK-BASELINE-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
3447; CHECK-BASELINE-NEXT:    movb %al, 18(%r13)
3448; CHECK-BASELINE-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
3449; CHECK-BASELINE-NEXT:    movb %al, 17(%r13)
3450; CHECK-BASELINE-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
3451; CHECK-BASELINE-NEXT:    movb %al, 16(%r13)
3452; CHECK-BASELINE-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
3453; CHECK-BASELINE-NEXT:    movb %al, 15(%r13)
3454; CHECK-BASELINE-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
3455; CHECK-BASELINE-NEXT:    movb %al, 14(%r13)
3456; CHECK-BASELINE-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
3457; CHECK-BASELINE-NEXT:    movb %al, 13(%r13)
3458; CHECK-BASELINE-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
3459; CHECK-BASELINE-NEXT:    movb %al, 12(%r13)
3460; CHECK-BASELINE-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
3461; CHECK-BASELINE-NEXT:    movb %al, 11(%r13)
3462; CHECK-BASELINE-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
3463; CHECK-BASELINE-NEXT:    movb %al, 10(%r13)
3464; CHECK-BASELINE-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
3465; CHECK-BASELINE-NEXT:    movb %al, 9(%r13)
3466; CHECK-BASELINE-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
3467; CHECK-BASELINE-NEXT:    movb %al, 8(%r13)
3468; CHECK-BASELINE-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
3469; CHECK-BASELINE-NEXT:    movb %al, 7(%r13)
3470; CHECK-BASELINE-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
3471; CHECK-BASELINE-NEXT:    movb %al, 6(%r13)
3472; CHECK-BASELINE-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
3473; CHECK-BASELINE-NEXT:    movb %al, 5(%r13)
3474; CHECK-BASELINE-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
3475; CHECK-BASELINE-NEXT:    movb %al, 4(%r13)
3476; CHECK-BASELINE-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
3477; CHECK-BASELINE-NEXT:    movb %al, 3(%r13)
3478; CHECK-BASELINE-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
3479; CHECK-BASELINE-NEXT:    movb %al, 2(%r13)
3480; CHECK-BASELINE-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
3481; CHECK-BASELINE-NEXT:    movb %al, 1(%r13)
3482; CHECK-BASELINE-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
3483; CHECK-BASELINE-NEXT:    movb %al, (%r13)
3484; CHECK-BASELINE-NEXT:    movq %r13, %rax
3485; CHECK-BASELINE-NEXT:    popq %rbx
3486; CHECK-BASELINE-NEXT:    popq %r12
3487; CHECK-BASELINE-NEXT:    popq %r13
3488; CHECK-BASELINE-NEXT:    popq %r14
3489; CHECK-BASELINE-NEXT:    popq %r15
3490; CHECK-BASELINE-NEXT:    popq %rbp
3491; CHECK-BASELINE-NEXT:    retq
3492;
3493; CHECK-SSE1-LABEL: in_v32i8:
3494; CHECK-SSE1:       # %bb.0:
3495; CHECK-SSE1-NEXT:    pushq %rbp
3496; CHECK-SSE1-NEXT:    pushq %r15
3497; CHECK-SSE1-NEXT:    pushq %r14
3498; CHECK-SSE1-NEXT:    pushq %r13
3499; CHECK-SSE1-NEXT:    pushq %r12
3500; CHECK-SSE1-NEXT:    pushq %rbx
3501; CHECK-SSE1-NEXT:    movq %rcx, %r12
3502; CHECK-SSE1-NEXT:    movq %rdx, %r15
3503; CHECK-SSE1-NEXT:    movq %rsi, %r14
3504; CHECK-SSE1-NEXT:    movq %rdi, %r13
3505; CHECK-SSE1-NEXT:    movzbl 15(%rdx), %eax
3506; CHECK-SSE1-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
3507; CHECK-SSE1-NEXT:    movzbl 14(%rdx), %eax
3508; CHECK-SSE1-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
3509; CHECK-SSE1-NEXT:    movzbl 13(%rdx), %eax
3510; CHECK-SSE1-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
3511; CHECK-SSE1-NEXT:    movzbl 12(%rdx), %eax
3512; CHECK-SSE1-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
3513; CHECK-SSE1-NEXT:    movzbl 11(%rdx), %eax
3514; CHECK-SSE1-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
3515; CHECK-SSE1-NEXT:    movzbl 10(%rdx), %eax
3516; CHECK-SSE1-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
3517; CHECK-SSE1-NEXT:    movzbl 9(%rdx), %r8d
3518; CHECK-SSE1-NEXT:    movzbl 8(%rdx), %r9d
3519; CHECK-SSE1-NEXT:    movzbl 7(%rdx), %r10d
3520; CHECK-SSE1-NEXT:    movzbl 6(%rdx), %ebp
3521; CHECK-SSE1-NEXT:    movzbl 5(%rdx), %edi
3522; CHECK-SSE1-NEXT:    movzbl 4(%rdx), %esi
3523; CHECK-SSE1-NEXT:    movzbl 3(%rdx), %eax
3524; CHECK-SSE1-NEXT:    movzbl 2(%rdx), %ecx
3525; CHECK-SSE1-NEXT:    movzbl (%rdx), %r11d
3526; CHECK-SSE1-NEXT:    movzbl 1(%rdx), %edx
3527; CHECK-SSE1-NEXT:    movzbl (%r14), %ebx
3528; CHECK-SSE1-NEXT:    xorb %r11b, %bl
3529; CHECK-SSE1-NEXT:    andb (%r12), %bl
3530; CHECK-SSE1-NEXT:    xorb %r11b, %bl
3531; CHECK-SSE1-NEXT:    movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
3532; CHECK-SSE1-NEXT:    movzbl 1(%r14), %r11d
3533; CHECK-SSE1-NEXT:    xorb %dl, %r11b
3534; CHECK-SSE1-NEXT:    andb 1(%r12), %r11b
3535; CHECK-SSE1-NEXT:    xorb %dl, %r11b
3536; CHECK-SSE1-NEXT:    movb %r11b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
3537; CHECK-SSE1-NEXT:    movzbl 2(%r14), %edx
3538; CHECK-SSE1-NEXT:    xorb %cl, %dl
3539; CHECK-SSE1-NEXT:    andb 2(%r12), %dl
3540; CHECK-SSE1-NEXT:    xorb %cl, %dl
3541; CHECK-SSE1-NEXT:    movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
3542; CHECK-SSE1-NEXT:    movzbl 3(%r14), %ecx
3543; CHECK-SSE1-NEXT:    xorb %al, %cl
3544; CHECK-SSE1-NEXT:    andb 3(%r12), %cl
3545; CHECK-SSE1-NEXT:    xorb %al, %cl
3546; CHECK-SSE1-NEXT:    movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
3547; CHECK-SSE1-NEXT:    movzbl 4(%r14), %eax
3548; CHECK-SSE1-NEXT:    xorb %sil, %al
3549; CHECK-SSE1-NEXT:    andb 4(%r12), %al
3550; CHECK-SSE1-NEXT:    xorb %sil, %al
3551; CHECK-SSE1-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
3552; CHECK-SSE1-NEXT:    movzbl 5(%r14), %eax
3553; CHECK-SSE1-NEXT:    xorb %dil, %al
3554; CHECK-SSE1-NEXT:    andb 5(%r12), %al
3555; CHECK-SSE1-NEXT:    xorb %dil, %al
3556; CHECK-SSE1-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
3557; CHECK-SSE1-NEXT:    movzbl 6(%r14), %eax
3558; CHECK-SSE1-NEXT:    xorb %bpl, %al
3559; CHECK-SSE1-NEXT:    andb 6(%r12), %al
3560; CHECK-SSE1-NEXT:    xorb %bpl, %al
3561; CHECK-SSE1-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
3562; CHECK-SSE1-NEXT:    movzbl 7(%r14), %eax
3563; CHECK-SSE1-NEXT:    xorb %r10b, %al
3564; CHECK-SSE1-NEXT:    andb 7(%r12), %al
3565; CHECK-SSE1-NEXT:    xorb %r10b, %al
3566; CHECK-SSE1-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
3567; CHECK-SSE1-NEXT:    movzbl 8(%r14), %eax
3568; CHECK-SSE1-NEXT:    xorb %r9b, %al
3569; CHECK-SSE1-NEXT:    andb 8(%r12), %al
3570; CHECK-SSE1-NEXT:    xorb %r9b, %al
3571; CHECK-SSE1-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
3572; CHECK-SSE1-NEXT:    movzbl 9(%r14), %eax
3573; CHECK-SSE1-NEXT:    xorb %r8b, %al
3574; CHECK-SSE1-NEXT:    andb 9(%r12), %al
3575; CHECK-SSE1-NEXT:    xorb %r8b, %al
3576; CHECK-SSE1-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
3577; CHECK-SSE1-NEXT:    movzbl 10(%r14), %ecx
3578; CHECK-SSE1-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
3579; CHECK-SSE1-NEXT:    xorb %al, %cl
3580; CHECK-SSE1-NEXT:    andb 10(%r12), %cl
3581; CHECK-SSE1-NEXT:    xorb %al, %cl
3582; CHECK-SSE1-NEXT:    movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
3583; CHECK-SSE1-NEXT:    movzbl 11(%r14), %ecx
3584; CHECK-SSE1-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
3585; CHECK-SSE1-NEXT:    xorb %al, %cl
3586; CHECK-SSE1-NEXT:    andb 11(%r12), %cl
3587; CHECK-SSE1-NEXT:    xorb %al, %cl
3588; CHECK-SSE1-NEXT:    movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
3589; CHECK-SSE1-NEXT:    movzbl 12(%r14), %ecx
3590; CHECK-SSE1-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
3591; CHECK-SSE1-NEXT:    xorb %al, %cl
3592; CHECK-SSE1-NEXT:    andb 12(%r12), %cl
3593; CHECK-SSE1-NEXT:    xorb %al, %cl
3594; CHECK-SSE1-NEXT:    movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
3595; CHECK-SSE1-NEXT:    movzbl 13(%r14), %ecx
3596; CHECK-SSE1-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
3597; CHECK-SSE1-NEXT:    xorb %al, %cl
3598; CHECK-SSE1-NEXT:    andb 13(%r12), %cl
3599; CHECK-SSE1-NEXT:    xorb %al, %cl
3600; CHECK-SSE1-NEXT:    movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
3601; CHECK-SSE1-NEXT:    movzbl 14(%r14), %ecx
3602; CHECK-SSE1-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
3603; CHECK-SSE1-NEXT:    xorb %al, %cl
3604; CHECK-SSE1-NEXT:    andb 14(%r12), %cl
3605; CHECK-SSE1-NEXT:    xorb %al, %cl
3606; CHECK-SSE1-NEXT:    movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
3607; CHECK-SSE1-NEXT:    movzbl 15(%r14), %ecx
3608; CHECK-SSE1-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
3609; CHECK-SSE1-NEXT:    xorb %al, %cl
3610; CHECK-SSE1-NEXT:    andb 15(%r12), %cl
3611; CHECK-SSE1-NEXT:    xorb %al, %cl
3612; CHECK-SSE1-NEXT:    movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
3613; CHECK-SSE1-NEXT:    movzbl 16(%r15), %eax
3614; CHECK-SSE1-NEXT:    movzbl 16(%r14), %ecx
3615; CHECK-SSE1-NEXT:    xorb %al, %cl
3616; CHECK-SSE1-NEXT:    andb 16(%r12), %cl
3617; CHECK-SSE1-NEXT:    xorb %al, %cl
3618; CHECK-SSE1-NEXT:    movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
3619; CHECK-SSE1-NEXT:    movzbl 17(%r15), %eax
3620; CHECK-SSE1-NEXT:    movzbl 17(%r14), %ecx
3621; CHECK-SSE1-NEXT:    xorb %al, %cl
3622; CHECK-SSE1-NEXT:    andb 17(%r12), %cl
3623; CHECK-SSE1-NEXT:    xorb %al, %cl
3624; CHECK-SSE1-NEXT:    movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
3625; CHECK-SSE1-NEXT:    movzbl 18(%r15), %eax
3626; CHECK-SSE1-NEXT:    movzbl 18(%r14), %ecx
3627; CHECK-SSE1-NEXT:    xorb %al, %cl
3628; CHECK-SSE1-NEXT:    andb 18(%r12), %cl
3629; CHECK-SSE1-NEXT:    xorb %al, %cl
3630; CHECK-SSE1-NEXT:    movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
3631; CHECK-SSE1-NEXT:    movzbl 19(%r15), %eax
3632; CHECK-SSE1-NEXT:    movzbl 19(%r14), %ecx
3633; CHECK-SSE1-NEXT:    xorb %al, %cl
3634; CHECK-SSE1-NEXT:    andb 19(%r12), %cl
3635; CHECK-SSE1-NEXT:    xorb %al, %cl
3636; CHECK-SSE1-NEXT:    movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
3637; CHECK-SSE1-NEXT:    movzbl 20(%r15), %eax
3638; CHECK-SSE1-NEXT:    movzbl 20(%r14), %ecx
3639; CHECK-SSE1-NEXT:    xorb %al, %cl
3640; CHECK-SSE1-NEXT:    andb 20(%r12), %cl
3641; CHECK-SSE1-NEXT:    xorb %al, %cl
3642; CHECK-SSE1-NEXT:    movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
3643; CHECK-SSE1-NEXT:    movzbl 21(%r15), %eax
3644; CHECK-SSE1-NEXT:    movzbl 21(%r14), %ebp
3645; CHECK-SSE1-NEXT:    xorb %al, %bpl
3646; CHECK-SSE1-NEXT:    andb 21(%r12), %bpl
3647; CHECK-SSE1-NEXT:    xorb %al, %bpl
3648; CHECK-SSE1-NEXT:    movzbl 22(%r15), %eax
3649; CHECK-SSE1-NEXT:    movzbl 22(%r14), %ebx
3650; CHECK-SSE1-NEXT:    xorb %al, %bl
3651; CHECK-SSE1-NEXT:    andb 22(%r12), %bl
3652; CHECK-SSE1-NEXT:    xorb %al, %bl
3653; CHECK-SSE1-NEXT:    movzbl 23(%r15), %eax
3654; CHECK-SSE1-NEXT:    movzbl 23(%r14), %r11d
3655; CHECK-SSE1-NEXT:    xorb %al, %r11b
3656; CHECK-SSE1-NEXT:    andb 23(%r12), %r11b
3657; CHECK-SSE1-NEXT:    xorb %al, %r11b
3658; CHECK-SSE1-NEXT:    movzbl 24(%r15), %eax
3659; CHECK-SSE1-NEXT:    movzbl 24(%r14), %r9d
3660; CHECK-SSE1-NEXT:    xorb %al, %r9b
3661; CHECK-SSE1-NEXT:    andb 24(%r12), %r9b
3662; CHECK-SSE1-NEXT:    xorb %al, %r9b
3663; CHECK-SSE1-NEXT:    movzbl 25(%r15), %eax
3664; CHECK-SSE1-NEXT:    movzbl 25(%r14), %r8d
3665; CHECK-SSE1-NEXT:    xorb %al, %r8b
3666; CHECK-SSE1-NEXT:    andb 25(%r12), %r8b
3667; CHECK-SSE1-NEXT:    xorb %al, %r8b
3668; CHECK-SSE1-NEXT:    movzbl 26(%r15), %eax
3669; CHECK-SSE1-NEXT:    movzbl 26(%r14), %edi
3670; CHECK-SSE1-NEXT:    xorb %al, %dil
3671; CHECK-SSE1-NEXT:    andb 26(%r12), %dil
3672; CHECK-SSE1-NEXT:    xorb %al, %dil
3673; CHECK-SSE1-NEXT:    movzbl 27(%r15), %eax
3674; CHECK-SSE1-NEXT:    movzbl 27(%r14), %esi
3675; CHECK-SSE1-NEXT:    xorb %al, %sil
3676; CHECK-SSE1-NEXT:    andb 27(%r12), %sil
3677; CHECK-SSE1-NEXT:    xorb %al, %sil
3678; CHECK-SSE1-NEXT:    movzbl 28(%r15), %eax
3679; CHECK-SSE1-NEXT:    movzbl 28(%r14), %edx
3680; CHECK-SSE1-NEXT:    xorb %al, %dl
3681; CHECK-SSE1-NEXT:    andb 28(%r12), %dl
3682; CHECK-SSE1-NEXT:    xorb %al, %dl
3683; CHECK-SSE1-NEXT:    movzbl 29(%r15), %eax
3684; CHECK-SSE1-NEXT:    movzbl 29(%r14), %ecx
3685; CHECK-SSE1-NEXT:    xorb %al, %cl
3686; CHECK-SSE1-NEXT:    andb 29(%r12), %cl
3687; CHECK-SSE1-NEXT:    xorb %al, %cl
3688; CHECK-SSE1-NEXT:    movzbl 30(%r15), %r10d
3689; CHECK-SSE1-NEXT:    movzbl 30(%r14), %eax
3690; CHECK-SSE1-NEXT:    xorb %r10b, %al
3691; CHECK-SSE1-NEXT:    andb 30(%r12), %al
3692; CHECK-SSE1-NEXT:    xorb %r10b, %al
3693; CHECK-SSE1-NEXT:    movzbl 31(%r15), %r10d
3694; CHECK-SSE1-NEXT:    movzbl 31(%r14), %r14d
3695; CHECK-SSE1-NEXT:    xorb %r10b, %r14b
3696; CHECK-SSE1-NEXT:    andb 31(%r12), %r14b
3697; CHECK-SSE1-NEXT:    xorb %r10b, %r14b
3698; CHECK-SSE1-NEXT:    movb %r14b, 31(%r13)
3699; CHECK-SSE1-NEXT:    movb %al, 30(%r13)
3700; CHECK-SSE1-NEXT:    movb %cl, 29(%r13)
3701; CHECK-SSE1-NEXT:    movb %dl, 28(%r13)
3702; CHECK-SSE1-NEXT:    movb %sil, 27(%r13)
3703; CHECK-SSE1-NEXT:    movb %dil, 26(%r13)
3704; CHECK-SSE1-NEXT:    movb %r8b, 25(%r13)
3705; CHECK-SSE1-NEXT:    movb %r9b, 24(%r13)
3706; CHECK-SSE1-NEXT:    movb %r11b, 23(%r13)
3707; CHECK-SSE1-NEXT:    movb %bl, 22(%r13)
3708; CHECK-SSE1-NEXT:    movb %bpl, 21(%r13)
3709; CHECK-SSE1-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
3710; CHECK-SSE1-NEXT:    movb %al, 20(%r13)
3711; CHECK-SSE1-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
3712; CHECK-SSE1-NEXT:    movb %al, 19(%r13)
3713; CHECK-SSE1-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
3714; CHECK-SSE1-NEXT:    movb %al, 18(%r13)
3715; CHECK-SSE1-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
3716; CHECK-SSE1-NEXT:    movb %al, 17(%r13)
3717; CHECK-SSE1-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
3718; CHECK-SSE1-NEXT:    movb %al, 16(%r13)
3719; CHECK-SSE1-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
3720; CHECK-SSE1-NEXT:    movb %al, 15(%r13)
3721; CHECK-SSE1-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
3722; CHECK-SSE1-NEXT:    movb %al, 14(%r13)
3723; CHECK-SSE1-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
3724; CHECK-SSE1-NEXT:    movb %al, 13(%r13)
3725; CHECK-SSE1-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
3726; CHECK-SSE1-NEXT:    movb %al, 12(%r13)
3727; CHECK-SSE1-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
3728; CHECK-SSE1-NEXT:    movb %al, 11(%r13)
3729; CHECK-SSE1-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
3730; CHECK-SSE1-NEXT:    movb %al, 10(%r13)
3731; CHECK-SSE1-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
3732; CHECK-SSE1-NEXT:    movb %al, 9(%r13)
3733; CHECK-SSE1-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
3734; CHECK-SSE1-NEXT:    movb %al, 8(%r13)
3735; CHECK-SSE1-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
3736; CHECK-SSE1-NEXT:    movb %al, 7(%r13)
3737; CHECK-SSE1-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
3738; CHECK-SSE1-NEXT:    movb %al, 6(%r13)
3739; CHECK-SSE1-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
3740; CHECK-SSE1-NEXT:    movb %al, 5(%r13)
3741; CHECK-SSE1-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
3742; CHECK-SSE1-NEXT:    movb %al, 4(%r13)
3743; CHECK-SSE1-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
3744; CHECK-SSE1-NEXT:    movb %al, 3(%r13)
3745; CHECK-SSE1-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
3746; CHECK-SSE1-NEXT:    movb %al, 2(%r13)
3747; CHECK-SSE1-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
3748; CHECK-SSE1-NEXT:    movb %al, 1(%r13)
3749; CHECK-SSE1-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
3750; CHECK-SSE1-NEXT:    movb %al, (%r13)
3751; CHECK-SSE1-NEXT:    movq %r13, %rax
3752; CHECK-SSE1-NEXT:    popq %rbx
3753; CHECK-SSE1-NEXT:    popq %r12
3754; CHECK-SSE1-NEXT:    popq %r13
3755; CHECK-SSE1-NEXT:    popq %r14
3756; CHECK-SSE1-NEXT:    popq %r15
3757; CHECK-SSE1-NEXT:    popq %rbp
3758; CHECK-SSE1-NEXT:    retq
3759;
3760; CHECK-SSE2-LABEL: in_v32i8:
3761; CHECK-SSE2:       # %bb.0:
3762; CHECK-SSE2-NEXT:    movaps (%rdx), %xmm0
3763; CHECK-SSE2-NEXT:    movaps 16(%rdx), %xmm1
3764; CHECK-SSE2-NEXT:    movaps %xmm0, %xmm2
3765; CHECK-SSE2-NEXT:    andnps (%rsi), %xmm2
3766; CHECK-SSE2-NEXT:    andps (%rdi), %xmm0
3767; CHECK-SSE2-NEXT:    orps %xmm2, %xmm0
3768; CHECK-SSE2-NEXT:    movaps %xmm1, %xmm2
3769; CHECK-SSE2-NEXT:    andnps 16(%rsi), %xmm2
3770; CHECK-SSE2-NEXT:    andps 16(%rdi), %xmm1
3771; CHECK-SSE2-NEXT:    orps %xmm2, %xmm1
3772; CHECK-SSE2-NEXT:    retq
3773;
3774; CHECK-XOP-LABEL: in_v32i8:
3775; CHECK-XOP:       # %bb.0:
3776; CHECK-XOP-NEXT:    vmovdqa (%rdi), %ymm0
3777; CHECK-XOP-NEXT:    vmovdqa (%rdx), %ymm1
3778; CHECK-XOP-NEXT:    vpcmov %ymm1, (%rsi), %ymm0, %ymm0
3779; CHECK-XOP-NEXT:    retq
3780  %x = load <32 x i8>, ptr%px, align 32
3781  %y = load <32 x i8>, ptr%py, align 32
3782  %mask = load <32 x i8>, ptr%pmask, align 32
3783  %n0 = xor <32 x i8> %x, %y
3784  %n1 = and <32 x i8> %n0, %mask
3785  %r = xor <32 x i8> %n1, %y
3786  ret <32 x i8> %r
3787}
3788
3789define <16 x i16> @in_v16i16(ptr%px, ptr%py, ptr%pmask) nounwind {
3790; CHECK-BASELINE-LABEL: in_v16i16:
3791; CHECK-BASELINE:       # %bb.0:
3792; CHECK-BASELINE-NEXT:    pushq %rbp
3793; CHECK-BASELINE-NEXT:    pushq %r15
3794; CHECK-BASELINE-NEXT:    pushq %r14
3795; CHECK-BASELINE-NEXT:    pushq %r13
3796; CHECK-BASELINE-NEXT:    pushq %r12
3797; CHECK-BASELINE-NEXT:    pushq %rbx
3798; CHECK-BASELINE-NEXT:    movq %rcx, %r9
3799; CHECK-BASELINE-NEXT:    movq %rdi, %r10
3800; CHECK-BASELINE-NEXT:    movzwl 30(%rdx), %edi
3801; CHECK-BASELINE-NEXT:    movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
3802; CHECK-BASELINE-NEXT:    movl 28(%rdx), %edi
3803; CHECK-BASELINE-NEXT:    movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
3804; CHECK-BASELINE-NEXT:    movzwl 26(%rdx), %edi
3805; CHECK-BASELINE-NEXT:    movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
3806; CHECK-BASELINE-NEXT:    movl 24(%rdx), %eax
3807; CHECK-BASELINE-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
3808; CHECK-BASELINE-NEXT:    movzwl 22(%rdx), %eax
3809; CHECK-BASELINE-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
3810; CHECK-BASELINE-NEXT:    movl 20(%rdx), %r8d
3811; CHECK-BASELINE-NEXT:    movl %r8d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
3812; CHECK-BASELINE-NEXT:    movzwl 18(%rdx), %r11d
3813; CHECK-BASELINE-NEXT:    movl %r11d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
3814; CHECK-BASELINE-NEXT:    movl 16(%rdx), %ebx
3815; CHECK-BASELINE-NEXT:    movl %ebx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
3816; CHECK-BASELINE-NEXT:    movzwl 14(%rdx), %ebp
3817; CHECK-BASELINE-NEXT:    movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
3818; CHECK-BASELINE-NEXT:    movl 12(%rdx), %r14d
3819; CHECK-BASELINE-NEXT:    movl %r14d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
3820; CHECK-BASELINE-NEXT:    movzwl 10(%rdx), %r15d
3821; CHECK-BASELINE-NEXT:    movl %r15d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
3822; CHECK-BASELINE-NEXT:    movl 8(%rdx), %r12d
3823; CHECK-BASELINE-NEXT:    movl %r12d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
3824; CHECK-BASELINE-NEXT:    movzwl 6(%rdx), %r13d
3825; CHECK-BASELINE-NEXT:    movl %r13d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
3826; CHECK-BASELINE-NEXT:    movl (%rdx), %ecx
3827; CHECK-BASELINE-NEXT:    movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
3828; CHECK-BASELINE-NEXT:    movl 4(%rdx), %edi
3829; CHECK-BASELINE-NEXT:    movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
3830; CHECK-BASELINE-NEXT:    movzwl 2(%rdx), %eax
3831; CHECK-BASELINE-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
3832; CHECK-BASELINE-NEXT:    movzwl (%rsi), %edx
3833; CHECK-BASELINE-NEXT:    xorw %cx, %dx
3834; CHECK-BASELINE-NEXT:    movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
3835; CHECK-BASELINE-NEXT:    movzwl 2(%rsi), %ecx
3836; CHECK-BASELINE-NEXT:    xorw %ax, %cx
3837; CHECK-BASELINE-NEXT:    movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
3838; CHECK-BASELINE-NEXT:    movzwl 4(%rsi), %eax
3839; CHECK-BASELINE-NEXT:    xorw %di, %ax
3840; CHECK-BASELINE-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
3841; CHECK-BASELINE-NEXT:    movzwl 6(%rsi), %ecx
3842; CHECK-BASELINE-NEXT:    xorw %r13w, %cx
3843; CHECK-BASELINE-NEXT:    movzwl 8(%rsi), %eax
3844; CHECK-BASELINE-NEXT:    xorw %r12w, %ax
3845; CHECK-BASELINE-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
3846; CHECK-BASELINE-NEXT:    movzwl 10(%rsi), %eax
3847; CHECK-BASELINE-NEXT:    xorw %r15w, %ax
3848; CHECK-BASELINE-NEXT:    movzwl 12(%rsi), %edx
3849; CHECK-BASELINE-NEXT:    xorw %r14w, %dx
3850; CHECK-BASELINE-NEXT:    movzwl 14(%rsi), %r13d
3851; CHECK-BASELINE-NEXT:    xorw %bp, %r13w
3852; CHECK-BASELINE-NEXT:    movzwl 16(%rsi), %r12d
3853; CHECK-BASELINE-NEXT:    xorw %bx, %r12w
3854; CHECK-BASELINE-NEXT:    movzwl 18(%rsi), %r15d
3855; CHECK-BASELINE-NEXT:    xorw %r11w, %r15w
3856; CHECK-BASELINE-NEXT:    movzwl 20(%rsi), %r14d
3857; CHECK-BASELINE-NEXT:    xorw %r8w, %r14w
3858; CHECK-BASELINE-NEXT:    movzwl 22(%rsi), %ebp
3859; CHECK-BASELINE-NEXT:    xorw {{[-0-9]+}}(%r{{[sb]}}p), %bp # 2-byte Folded Reload
3860; CHECK-BASELINE-NEXT:    movzwl 24(%rsi), %ebx
3861; CHECK-BASELINE-NEXT:    xorw {{[-0-9]+}}(%r{{[sb]}}p), %bx # 2-byte Folded Reload
3862; CHECK-BASELINE-NEXT:    movzwl 26(%rsi), %r11d
3863; CHECK-BASELINE-NEXT:    xorw {{[-0-9]+}}(%r{{[sb]}}p), %r11w # 2-byte Folded Reload
3864; CHECK-BASELINE-NEXT:    movzwl 28(%rsi), %edi
3865; CHECK-BASELINE-NEXT:    xorw {{[-0-9]+}}(%r{{[sb]}}p), %di # 2-byte Folded Reload
3866; CHECK-BASELINE-NEXT:    movzwl 30(%rsi), %esi
3867; CHECK-BASELINE-NEXT:    xorw {{[-0-9]+}}(%r{{[sb]}}p), %si # 2-byte Folded Reload
3868; CHECK-BASELINE-NEXT:    andw 30(%r9), %si
3869; CHECK-BASELINE-NEXT:    andw 28(%r9), %di
3870; CHECK-BASELINE-NEXT:    andw 26(%r9), %r11w
3871; CHECK-BASELINE-NEXT:    andw 24(%r9), %bx
3872; CHECK-BASELINE-NEXT:    andw 22(%r9), %bp
3873; CHECK-BASELINE-NEXT:    andw 20(%r9), %r14w
3874; CHECK-BASELINE-NEXT:    andw 18(%r9), %r15w
3875; CHECK-BASELINE-NEXT:    andw 16(%r9), %r12w
3876; CHECK-BASELINE-NEXT:    andw 14(%r9), %r13w
3877; CHECK-BASELINE-NEXT:    andw 12(%r9), %dx
3878; CHECK-BASELINE-NEXT:    movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
3879; CHECK-BASELINE-NEXT:    andw 10(%r9), %ax
3880; CHECK-BASELINE-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
3881; CHECK-BASELINE-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 4-byte Reload
3882; CHECK-BASELINE-NEXT:    andw 8(%r9), %dx
3883; CHECK-BASELINE-NEXT:    andw 6(%r9), %cx
3884; CHECK-BASELINE-NEXT:    movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
3885; CHECK-BASELINE-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %r8d # 4-byte Reload
3886; CHECK-BASELINE-NEXT:    andw 4(%r9), %r8w
3887; CHECK-BASELINE-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
3888; CHECK-BASELINE-NEXT:    andw 2(%r9), %ax
3889; CHECK-BASELINE-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Reload
3890; CHECK-BASELINE-NEXT:    andw (%r9), %cx
3891; CHECK-BASELINE-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Folded Reload
3892; CHECK-BASELINE-NEXT:    movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
3893; CHECK-BASELINE-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Folded Reload
3894; CHECK-BASELINE-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
3895; CHECK-BASELINE-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %r8d # 4-byte Folded Reload
3896; CHECK-BASELINE-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %r9d # 4-byte Reload
3897; CHECK-BASELINE-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %r9d # 4-byte Folded Reload
3898; CHECK-BASELINE-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 4-byte Folded Reload
3899; CHECK-BASELINE-NEXT:    movl %edx, %ecx
3900; CHECK-BASELINE-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 4-byte Reload
3901; CHECK-BASELINE-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 4-byte Folded Reload
3902; CHECK-BASELINE-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
3903; CHECK-BASELINE-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Folded Reload
3904; CHECK-BASELINE-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %r13d # 4-byte Folded Reload
3905; CHECK-BASELINE-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %r12d # 4-byte Folded Reload
3906; CHECK-BASELINE-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %r15d # 4-byte Folded Reload
3907; CHECK-BASELINE-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %r14d # 4-byte Folded Reload
3908; CHECK-BASELINE-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %ebp # 4-byte Folded Reload
3909; CHECK-BASELINE-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %ebx # 4-byte Folded Reload
3910; CHECK-BASELINE-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %r11d # 4-byte Folded Reload
3911; CHECK-BASELINE-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Folded Reload
3912; CHECK-BASELINE-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 4-byte Folded Reload
3913; CHECK-BASELINE-NEXT:    movw %si, 30(%r10)
3914; CHECK-BASELINE-NEXT:    movw %di, 28(%r10)
3915; CHECK-BASELINE-NEXT:    movw %r11w, 26(%r10)
3916; CHECK-BASELINE-NEXT:    movw %bx, 24(%r10)
3917; CHECK-BASELINE-NEXT:    movw %bp, 22(%r10)
3918; CHECK-BASELINE-NEXT:    movw %r14w, 20(%r10)
3919; CHECK-BASELINE-NEXT:    movw %r15w, 18(%r10)
3920; CHECK-BASELINE-NEXT:    movw %r12w, 16(%r10)
3921; CHECK-BASELINE-NEXT:    movw %r13w, 14(%r10)
3922; CHECK-BASELINE-NEXT:    movw %ax, 12(%r10)
3923; CHECK-BASELINE-NEXT:    movw %dx, 10(%r10)
3924; CHECK-BASELINE-NEXT:    movw %cx, 8(%r10)
3925; CHECK-BASELINE-NEXT:    movw %r9w, 6(%r10)
3926; CHECK-BASELINE-NEXT:    movw %r8w, 4(%r10)
3927; CHECK-BASELINE-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
3928; CHECK-BASELINE-NEXT:    movw %ax, 2(%r10)
3929; CHECK-BASELINE-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
3930; CHECK-BASELINE-NEXT:    movw %ax, (%r10)
3931; CHECK-BASELINE-NEXT:    movq %r10, %rax
3932; CHECK-BASELINE-NEXT:    popq %rbx
3933; CHECK-BASELINE-NEXT:    popq %r12
3934; CHECK-BASELINE-NEXT:    popq %r13
3935; CHECK-BASELINE-NEXT:    popq %r14
3936; CHECK-BASELINE-NEXT:    popq %r15
3937; CHECK-BASELINE-NEXT:    popq %rbp
3938; CHECK-BASELINE-NEXT:    retq
3939;
3940; CHECK-SSE1-LABEL: in_v16i16:
3941; CHECK-SSE1:       # %bb.0:
3942; CHECK-SSE1-NEXT:    pushq %rbp
3943; CHECK-SSE1-NEXT:    pushq %r15
3944; CHECK-SSE1-NEXT:    pushq %r14
3945; CHECK-SSE1-NEXT:    pushq %r13
3946; CHECK-SSE1-NEXT:    pushq %r12
3947; CHECK-SSE1-NEXT:    pushq %rbx
3948; CHECK-SSE1-NEXT:    movq %rcx, %r9
3949; CHECK-SSE1-NEXT:    movq %rdi, %r10
3950; CHECK-SSE1-NEXT:    movzwl 30(%rdx), %edi
3951; CHECK-SSE1-NEXT:    movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
3952; CHECK-SSE1-NEXT:    movl 28(%rdx), %edi
3953; CHECK-SSE1-NEXT:    movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
3954; CHECK-SSE1-NEXT:    movzwl 26(%rdx), %edi
3955; CHECK-SSE1-NEXT:    movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
3956; CHECK-SSE1-NEXT:    movl 24(%rdx), %eax
3957; CHECK-SSE1-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
3958; CHECK-SSE1-NEXT:    movzwl 22(%rdx), %eax
3959; CHECK-SSE1-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
3960; CHECK-SSE1-NEXT:    movl 20(%rdx), %r8d
3961; CHECK-SSE1-NEXT:    movl %r8d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
3962; CHECK-SSE1-NEXT:    movzwl 18(%rdx), %r11d
3963; CHECK-SSE1-NEXT:    movl %r11d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
3964; CHECK-SSE1-NEXT:    movl 16(%rdx), %ebx
3965; CHECK-SSE1-NEXT:    movl %ebx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
3966; CHECK-SSE1-NEXT:    movzwl 14(%rdx), %ebp
3967; CHECK-SSE1-NEXT:    movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
3968; CHECK-SSE1-NEXT:    movl 12(%rdx), %r14d
3969; CHECK-SSE1-NEXT:    movl %r14d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
3970; CHECK-SSE1-NEXT:    movzwl 10(%rdx), %r15d
3971; CHECK-SSE1-NEXT:    movl %r15d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
3972; CHECK-SSE1-NEXT:    movl 8(%rdx), %r12d
3973; CHECK-SSE1-NEXT:    movl %r12d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
3974; CHECK-SSE1-NEXT:    movzwl 6(%rdx), %r13d
3975; CHECK-SSE1-NEXT:    movl %r13d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
3976; CHECK-SSE1-NEXT:    movl (%rdx), %ecx
3977; CHECK-SSE1-NEXT:    movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
3978; CHECK-SSE1-NEXT:    movl 4(%rdx), %edi
3979; CHECK-SSE1-NEXT:    movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
3980; CHECK-SSE1-NEXT:    movzwl 2(%rdx), %eax
3981; CHECK-SSE1-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
3982; CHECK-SSE1-NEXT:    movzwl (%rsi), %edx
3983; CHECK-SSE1-NEXT:    xorw %cx, %dx
3984; CHECK-SSE1-NEXT:    movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
3985; CHECK-SSE1-NEXT:    movzwl 2(%rsi), %ecx
3986; CHECK-SSE1-NEXT:    xorw %ax, %cx
3987; CHECK-SSE1-NEXT:    movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
3988; CHECK-SSE1-NEXT:    movzwl 4(%rsi), %eax
3989; CHECK-SSE1-NEXT:    xorw %di, %ax
3990; CHECK-SSE1-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
3991; CHECK-SSE1-NEXT:    movzwl 6(%rsi), %ecx
3992; CHECK-SSE1-NEXT:    xorw %r13w, %cx
3993; CHECK-SSE1-NEXT:    movzwl 8(%rsi), %eax
3994; CHECK-SSE1-NEXT:    xorw %r12w, %ax
3995; CHECK-SSE1-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
3996; CHECK-SSE1-NEXT:    movzwl 10(%rsi), %eax
3997; CHECK-SSE1-NEXT:    xorw %r15w, %ax
3998; CHECK-SSE1-NEXT:    movzwl 12(%rsi), %edx
3999; CHECK-SSE1-NEXT:    xorw %r14w, %dx
4000; CHECK-SSE1-NEXT:    movzwl 14(%rsi), %r13d
4001; CHECK-SSE1-NEXT:    xorw %bp, %r13w
4002; CHECK-SSE1-NEXT:    movzwl 16(%rsi), %r12d
4003; CHECK-SSE1-NEXT:    xorw %bx, %r12w
4004; CHECK-SSE1-NEXT:    movzwl 18(%rsi), %r15d
4005; CHECK-SSE1-NEXT:    xorw %r11w, %r15w
4006; CHECK-SSE1-NEXT:    movzwl 20(%rsi), %r14d
4007; CHECK-SSE1-NEXT:    xorw %r8w, %r14w
4008; CHECK-SSE1-NEXT:    movzwl 22(%rsi), %ebp
4009; CHECK-SSE1-NEXT:    xorw {{[-0-9]+}}(%r{{[sb]}}p), %bp # 2-byte Folded Reload
4010; CHECK-SSE1-NEXT:    movzwl 24(%rsi), %ebx
4011; CHECK-SSE1-NEXT:    xorw {{[-0-9]+}}(%r{{[sb]}}p), %bx # 2-byte Folded Reload
4012; CHECK-SSE1-NEXT:    movzwl 26(%rsi), %r11d
4013; CHECK-SSE1-NEXT:    xorw {{[-0-9]+}}(%r{{[sb]}}p), %r11w # 2-byte Folded Reload
4014; CHECK-SSE1-NEXT:    movzwl 28(%rsi), %edi
4015; CHECK-SSE1-NEXT:    xorw {{[-0-9]+}}(%r{{[sb]}}p), %di # 2-byte Folded Reload
4016; CHECK-SSE1-NEXT:    movzwl 30(%rsi), %esi
4017; CHECK-SSE1-NEXT:    xorw {{[-0-9]+}}(%r{{[sb]}}p), %si # 2-byte Folded Reload
4018; CHECK-SSE1-NEXT:    andw 30(%r9), %si
4019; CHECK-SSE1-NEXT:    andw 28(%r9), %di
4020; CHECK-SSE1-NEXT:    andw 26(%r9), %r11w
4021; CHECK-SSE1-NEXT:    andw 24(%r9), %bx
4022; CHECK-SSE1-NEXT:    andw 22(%r9), %bp
4023; CHECK-SSE1-NEXT:    andw 20(%r9), %r14w
4024; CHECK-SSE1-NEXT:    andw 18(%r9), %r15w
4025; CHECK-SSE1-NEXT:    andw 16(%r9), %r12w
4026; CHECK-SSE1-NEXT:    andw 14(%r9), %r13w
4027; CHECK-SSE1-NEXT:    andw 12(%r9), %dx
4028; CHECK-SSE1-NEXT:    movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
4029; CHECK-SSE1-NEXT:    andw 10(%r9), %ax
4030; CHECK-SSE1-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
4031; CHECK-SSE1-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 4-byte Reload
4032; CHECK-SSE1-NEXT:    andw 8(%r9), %dx
4033; CHECK-SSE1-NEXT:    andw 6(%r9), %cx
4034; CHECK-SSE1-NEXT:    movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
4035; CHECK-SSE1-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %r8d # 4-byte Reload
4036; CHECK-SSE1-NEXT:    andw 4(%r9), %r8w
4037; CHECK-SSE1-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
4038; CHECK-SSE1-NEXT:    andw 2(%r9), %ax
4039; CHECK-SSE1-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Reload
4040; CHECK-SSE1-NEXT:    andw (%r9), %cx
4041; CHECK-SSE1-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Folded Reload
4042; CHECK-SSE1-NEXT:    movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
4043; CHECK-SSE1-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Folded Reload
4044; CHECK-SSE1-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
4045; CHECK-SSE1-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %r8d # 4-byte Folded Reload
4046; CHECK-SSE1-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %r9d # 4-byte Reload
4047; CHECK-SSE1-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %r9d # 4-byte Folded Reload
4048; CHECK-SSE1-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 4-byte Folded Reload
4049; CHECK-SSE1-NEXT:    movl %edx, %ecx
4050; CHECK-SSE1-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 4-byte Reload
4051; CHECK-SSE1-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 4-byte Folded Reload
4052; CHECK-SSE1-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
4053; CHECK-SSE1-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Folded Reload
4054; CHECK-SSE1-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %r13d # 4-byte Folded Reload
4055; CHECK-SSE1-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %r12d # 4-byte Folded Reload
4056; CHECK-SSE1-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %r15d # 4-byte Folded Reload
4057; CHECK-SSE1-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %r14d # 4-byte Folded Reload
4058; CHECK-SSE1-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %ebp # 4-byte Folded Reload
4059; CHECK-SSE1-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %ebx # 4-byte Folded Reload
4060; CHECK-SSE1-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %r11d # 4-byte Folded Reload
4061; CHECK-SSE1-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Folded Reload
4062; CHECK-SSE1-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 4-byte Folded Reload
4063; CHECK-SSE1-NEXT:    movw %si, 30(%r10)
4064; CHECK-SSE1-NEXT:    movw %di, 28(%r10)
4065; CHECK-SSE1-NEXT:    movw %r11w, 26(%r10)
4066; CHECK-SSE1-NEXT:    movw %bx, 24(%r10)
4067; CHECK-SSE1-NEXT:    movw %bp, 22(%r10)
4068; CHECK-SSE1-NEXT:    movw %r14w, 20(%r10)
4069; CHECK-SSE1-NEXT:    movw %r15w, 18(%r10)
4070; CHECK-SSE1-NEXT:    movw %r12w, 16(%r10)
4071; CHECK-SSE1-NEXT:    movw %r13w, 14(%r10)
4072; CHECK-SSE1-NEXT:    movw %ax, 12(%r10)
4073; CHECK-SSE1-NEXT:    movw %dx, 10(%r10)
4074; CHECK-SSE1-NEXT:    movw %cx, 8(%r10)
4075; CHECK-SSE1-NEXT:    movw %r9w, 6(%r10)
4076; CHECK-SSE1-NEXT:    movw %r8w, 4(%r10)
4077; CHECK-SSE1-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
4078; CHECK-SSE1-NEXT:    movw %ax, 2(%r10)
4079; CHECK-SSE1-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
4080; CHECK-SSE1-NEXT:    movw %ax, (%r10)
4081; CHECK-SSE1-NEXT:    movq %r10, %rax
4082; CHECK-SSE1-NEXT:    popq %rbx
4083; CHECK-SSE1-NEXT:    popq %r12
4084; CHECK-SSE1-NEXT:    popq %r13
4085; CHECK-SSE1-NEXT:    popq %r14
4086; CHECK-SSE1-NEXT:    popq %r15
4087; CHECK-SSE1-NEXT:    popq %rbp
4088; CHECK-SSE1-NEXT:    retq
4089;
4090; CHECK-SSE2-LABEL: in_v16i16:
4091; CHECK-SSE2:       # %bb.0:
4092; CHECK-SSE2-NEXT:    movaps (%rdx), %xmm0
4093; CHECK-SSE2-NEXT:    movaps 16(%rdx), %xmm1
4094; CHECK-SSE2-NEXT:    movaps %xmm0, %xmm2
4095; CHECK-SSE2-NEXT:    andnps (%rsi), %xmm2
4096; CHECK-SSE2-NEXT:    andps (%rdi), %xmm0
4097; CHECK-SSE2-NEXT:    orps %xmm2, %xmm0
4098; CHECK-SSE2-NEXT:    movaps %xmm1, %xmm2
4099; CHECK-SSE2-NEXT:    andnps 16(%rsi), %xmm2
4100; CHECK-SSE2-NEXT:    andps 16(%rdi), %xmm1
4101; CHECK-SSE2-NEXT:    orps %xmm2, %xmm1
4102; CHECK-SSE2-NEXT:    retq
4103;
4104; CHECK-XOP-LABEL: in_v16i16:
4105; CHECK-XOP:       # %bb.0:
4106; CHECK-XOP-NEXT:    vmovdqa (%rdi), %ymm0
4107; CHECK-XOP-NEXT:    vmovdqa (%rdx), %ymm1
4108; CHECK-XOP-NEXT:    vpcmov %ymm1, (%rsi), %ymm0, %ymm0
4109; CHECK-XOP-NEXT:    retq
4110  %x = load <16 x i16>, ptr%px, align 32
4111  %y = load <16 x i16>, ptr%py, align 32
4112  %mask = load <16 x i16>, ptr%pmask, align 32
4113  %n0 = xor <16 x i16> %x, %y
4114  %n1 = and <16 x i16> %n0, %mask
4115  %r = xor <16 x i16> %n1, %y
4116  ret <16 x i16> %r
4117}
4118
; in_v8i32: the "in" (xor) form of the masked merge on a 256-bit vector of
; eight i32 lanes. The three operands are loaded from memory and the result is
; ((x ^ y) & mask) ^ y, i.e. bits of x where mask is 1 and bits of y where
; mask is 0.
; The assertions below are autogenerated (see the note on the first line of
; this file); regenerate them with utils/update_llc_test_checks.py rather than
; editing them by hand. They expect per-lane 32-bit scalar xor/and/xor when
; SSE2 is unavailable, an andnps/andps/orps sequence on two 128-bit halves
; with SSE2, and a single vpcmov with XOP.
4119define <8 x i32> @in_v8i32(ptr%px, ptr%py, ptr%pmask) nounwind {
4120; CHECK-BASELINE-LABEL: in_v8i32:
4121; CHECK-BASELINE:       # %bb.0:
4122; CHECK-BASELINE-NEXT:    pushq %rbp
4123; CHECK-BASELINE-NEXT:    pushq %r15
4124; CHECK-BASELINE-NEXT:    pushq %r14
4125; CHECK-BASELINE-NEXT:    pushq %r13
4126; CHECK-BASELINE-NEXT:    pushq %r12
4127; CHECK-BASELINE-NEXT:    pushq %rbx
4128; CHECK-BASELINE-NEXT:    movl 28(%rdx), %ebp
4129; CHECK-BASELINE-NEXT:    movl 24(%rdx), %ebx
4130; CHECK-BASELINE-NEXT:    movl 20(%rdx), %r10d
4131; CHECK-BASELINE-NEXT:    movl 16(%rdx), %eax
4132; CHECK-BASELINE-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
4133; CHECK-BASELINE-NEXT:    movl 12(%rdx), %r12d
4134; CHECK-BASELINE-NEXT:    movl %r12d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
4135; CHECK-BASELINE-NEXT:    movl 8(%rdx), %r14d
4136; CHECK-BASELINE-NEXT:    movl %r14d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
4137; CHECK-BASELINE-NEXT:    movl (%rdx), %r15d
4138; CHECK-BASELINE-NEXT:    movl 4(%rdx), %r13d
4139; CHECK-BASELINE-NEXT:    movl (%rsi), %r8d
4140; CHECK-BASELINE-NEXT:    xorl %r15d, %r8d
4141; CHECK-BASELINE-NEXT:    movl 4(%rsi), %r9d
4142; CHECK-BASELINE-NEXT:    xorl %r13d, %r9d
4143; CHECK-BASELINE-NEXT:    movl 8(%rsi), %r11d
4144; CHECK-BASELINE-NEXT:    xorl %r14d, %r11d
4145; CHECK-BASELINE-NEXT:    movl 12(%rsi), %r14d
4146; CHECK-BASELINE-NEXT:    xorl %r12d, %r14d
4147; CHECK-BASELINE-NEXT:    movl 16(%rsi), %r12d
4148; CHECK-BASELINE-NEXT:    xorl %eax, %r12d
4149; CHECK-BASELINE-NEXT:    movl 20(%rsi), %edx
4150; CHECK-BASELINE-NEXT:    xorl %r10d, %edx
4151; CHECK-BASELINE-NEXT:    movl 24(%rsi), %eax
4152; CHECK-BASELINE-NEXT:    xorl %ebx, %eax
4153; CHECK-BASELINE-NEXT:    movl 28(%rsi), %esi
4154; CHECK-BASELINE-NEXT:    xorl %ebp, %esi
4155; CHECK-BASELINE-NEXT:    andl 28(%rcx), %esi
4156; CHECK-BASELINE-NEXT:    andl 24(%rcx), %eax
4157; CHECK-BASELINE-NEXT:    andl 20(%rcx), %edx
4158; CHECK-BASELINE-NEXT:    andl 16(%rcx), %r12d
4159; CHECK-BASELINE-NEXT:    andl 12(%rcx), %r14d
4160; CHECK-BASELINE-NEXT:    andl 8(%rcx), %r11d
4161; CHECK-BASELINE-NEXT:    andl 4(%rcx), %r9d
4162; CHECK-BASELINE-NEXT:    andl (%rcx), %r8d
4163; CHECK-BASELINE-NEXT:    xorl %r15d, %r8d
4164; CHECK-BASELINE-NEXT:    xorl %r13d, %r9d
4165; CHECK-BASELINE-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %r11d # 4-byte Folded Reload
4166; CHECK-BASELINE-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %r14d # 4-byte Folded Reload
4167; CHECK-BASELINE-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %r12d # 4-byte Folded Reload
4168; CHECK-BASELINE-NEXT:    xorl %r10d, %edx
4169; CHECK-BASELINE-NEXT:    xorl %ebx, %eax
4170; CHECK-BASELINE-NEXT:    xorl %ebp, %esi
4171; CHECK-BASELINE-NEXT:    movl %esi, 28(%rdi)
4172; CHECK-BASELINE-NEXT:    movl %eax, 24(%rdi)
4173; CHECK-BASELINE-NEXT:    movl %edx, 20(%rdi)
4174; CHECK-BASELINE-NEXT:    movl %r12d, 16(%rdi)
4175; CHECK-BASELINE-NEXT:    movl %r14d, 12(%rdi)
4176; CHECK-BASELINE-NEXT:    movl %r11d, 8(%rdi)
4177; CHECK-BASELINE-NEXT:    movl %r9d, 4(%rdi)
4178; CHECK-BASELINE-NEXT:    movl %r8d, (%rdi)
4179; CHECK-BASELINE-NEXT:    movq %rdi, %rax
4180; CHECK-BASELINE-NEXT:    popq %rbx
4181; CHECK-BASELINE-NEXT:    popq %r12
4182; CHECK-BASELINE-NEXT:    popq %r13
4183; CHECK-BASELINE-NEXT:    popq %r14
4184; CHECK-BASELINE-NEXT:    popq %r15
4185; CHECK-BASELINE-NEXT:    popq %rbp
4186; CHECK-BASELINE-NEXT:    retq
4187;
4188; CHECK-SSE1-LABEL: in_v8i32:
4189; CHECK-SSE1:       # %bb.0:
4190; CHECK-SSE1-NEXT:    pushq %rbp
4191; CHECK-SSE1-NEXT:    pushq %r15
4192; CHECK-SSE1-NEXT:    pushq %r14
4193; CHECK-SSE1-NEXT:    pushq %r13
4194; CHECK-SSE1-NEXT:    pushq %r12
4195; CHECK-SSE1-NEXT:    pushq %rbx
4196; CHECK-SSE1-NEXT:    movl 28(%rdx), %ebp
4197; CHECK-SSE1-NEXT:    movl 24(%rdx), %ebx
4198; CHECK-SSE1-NEXT:    movl 20(%rdx), %r10d
4199; CHECK-SSE1-NEXT:    movl 16(%rdx), %eax
4200; CHECK-SSE1-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
4201; CHECK-SSE1-NEXT:    movl 12(%rdx), %r12d
4202; CHECK-SSE1-NEXT:    movl %r12d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
4203; CHECK-SSE1-NEXT:    movl 8(%rdx), %r14d
4204; CHECK-SSE1-NEXT:    movl %r14d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
4205; CHECK-SSE1-NEXT:    movl (%rdx), %r15d
4206; CHECK-SSE1-NEXT:    movl 4(%rdx), %r13d
4207; CHECK-SSE1-NEXT:    movl (%rsi), %r8d
4208; CHECK-SSE1-NEXT:    xorl %r15d, %r8d
4209; CHECK-SSE1-NEXT:    movl 4(%rsi), %r9d
4210; CHECK-SSE1-NEXT:    xorl %r13d, %r9d
4211; CHECK-SSE1-NEXT:    movl 8(%rsi), %r11d
4212; CHECK-SSE1-NEXT:    xorl %r14d, %r11d
4213; CHECK-SSE1-NEXT:    movl 12(%rsi), %r14d
4214; CHECK-SSE1-NEXT:    xorl %r12d, %r14d
4215; CHECK-SSE1-NEXT:    movl 16(%rsi), %r12d
4216; CHECK-SSE1-NEXT:    xorl %eax, %r12d
4217; CHECK-SSE1-NEXT:    movl 20(%rsi), %edx
4218; CHECK-SSE1-NEXT:    xorl %r10d, %edx
4219; CHECK-SSE1-NEXT:    movl 24(%rsi), %eax
4220; CHECK-SSE1-NEXT:    xorl %ebx, %eax
4221; CHECK-SSE1-NEXT:    movl 28(%rsi), %esi
4222; CHECK-SSE1-NEXT:    xorl %ebp, %esi
4223; CHECK-SSE1-NEXT:    andl 28(%rcx), %esi
4224; CHECK-SSE1-NEXT:    andl 24(%rcx), %eax
4225; CHECK-SSE1-NEXT:    andl 20(%rcx), %edx
4226; CHECK-SSE1-NEXT:    andl 16(%rcx), %r12d
4227; CHECK-SSE1-NEXT:    andl 12(%rcx), %r14d
4228; CHECK-SSE1-NEXT:    andl 8(%rcx), %r11d
4229; CHECK-SSE1-NEXT:    andl 4(%rcx), %r9d
4230; CHECK-SSE1-NEXT:    andl (%rcx), %r8d
4231; CHECK-SSE1-NEXT:    xorl %r15d, %r8d
4232; CHECK-SSE1-NEXT:    xorl %r13d, %r9d
4233; CHECK-SSE1-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %r11d # 4-byte Folded Reload
4234; CHECK-SSE1-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %r14d # 4-byte Folded Reload
4235; CHECK-SSE1-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %r12d # 4-byte Folded Reload
4236; CHECK-SSE1-NEXT:    xorl %r10d, %edx
4237; CHECK-SSE1-NEXT:    xorl %ebx, %eax
4238; CHECK-SSE1-NEXT:    xorl %ebp, %esi
4239; CHECK-SSE1-NEXT:    movl %esi, 28(%rdi)
4240; CHECK-SSE1-NEXT:    movl %eax, 24(%rdi)
4241; CHECK-SSE1-NEXT:    movl %edx, 20(%rdi)
4242; CHECK-SSE1-NEXT:    movl %r12d, 16(%rdi)
4243; CHECK-SSE1-NEXT:    movl %r14d, 12(%rdi)
4244; CHECK-SSE1-NEXT:    movl %r11d, 8(%rdi)
4245; CHECK-SSE1-NEXT:    movl %r9d, 4(%rdi)
4246; CHECK-SSE1-NEXT:    movl %r8d, (%rdi)
4247; CHECK-SSE1-NEXT:    movq %rdi, %rax
4248; CHECK-SSE1-NEXT:    popq %rbx
4249; CHECK-SSE1-NEXT:    popq %r12
4250; CHECK-SSE1-NEXT:    popq %r13
4251; CHECK-SSE1-NEXT:    popq %r14
4252; CHECK-SSE1-NEXT:    popq %r15
4253; CHECK-SSE1-NEXT:    popq %rbp
4254; CHECK-SSE1-NEXT:    retq
4255;
4256; CHECK-SSE2-LABEL: in_v8i32:
4257; CHECK-SSE2:       # %bb.0:
4258; CHECK-SSE2-NEXT:    movaps (%rdx), %xmm0
4259; CHECK-SSE2-NEXT:    movaps 16(%rdx), %xmm1
4260; CHECK-SSE2-NEXT:    movaps %xmm0, %xmm2
4261; CHECK-SSE2-NEXT:    andnps (%rsi), %xmm2
4262; CHECK-SSE2-NEXT:    andps (%rdi), %xmm0
4263; CHECK-SSE2-NEXT:    orps %xmm2, %xmm0
4264; CHECK-SSE2-NEXT:    movaps %xmm1, %xmm2
4265; CHECK-SSE2-NEXT:    andnps 16(%rsi), %xmm2
4266; CHECK-SSE2-NEXT:    andps 16(%rdi), %xmm1
4267; CHECK-SSE2-NEXT:    orps %xmm2, %xmm1
4268; CHECK-SSE2-NEXT:    retq
4269;
4270; CHECK-XOP-LABEL: in_v8i32:
4271; CHECK-XOP:       # %bb.0:
4272; CHECK-XOP-NEXT:    vmovdqa (%rdi), %ymm0
4273; CHECK-XOP-NEXT:    vmovdqa (%rdx), %ymm1
4274; CHECK-XOP-NEXT:    vpcmov %ymm1, (%rsi), %ymm0, %ymm0
4275; CHECK-XOP-NEXT:    retq
; Load the three operands; each load is annotated with align 32.
4276  %x = load <8 x i32>, ptr%px, align 32
4277  %y = load <8 x i32>, ptr%py, align 32
4278  %mask = load <8 x i32>, ptr%pmask, align 32
; n0 has a 1 bit wherever x and y differ.
4279  %n0 = xor <8 x i32> %x, %y
; Keep only the differing bits that the mask selects.
4280  %n1 = and <8 x i32> %n0, %mask
; Xor with y again: masked bits become x, unmasked bits stay y.
4281  %r = xor <8 x i32> %n1, %y
4282  ret <8 x i32> %r
4283}
4284
; in_v4i64: the "in" (xor) form of the masked merge on a 256-bit vector of
; four i64 lanes. The three operands are loaded from memory and the result is
; ((x ^ y) & mask) ^ y, i.e. bits of x where mask is 1 and bits of y where
; mask is 0.
; The assertions below are autogenerated (see the note on the first line of
; this file); regenerate them with utils/update_llc_test_checks.py rather than
; editing them by hand. They expect per-lane 64-bit scalar xorq/andq/xorq when
; SSE2 is unavailable, an andnps/andps/orps sequence on two 128-bit halves
; with SSE2, and a single vpcmov with XOP.
4285define <4 x i64> @in_v4i64(ptr%px, ptr%py, ptr%pmask) nounwind {
4286; CHECK-BASELINE-LABEL: in_v4i64:
4287; CHECK-BASELINE:       # %bb.0:
4288; CHECK-BASELINE-NEXT:    pushq %rbx
4289; CHECK-BASELINE-NEXT:    movq %rdi, %rax
4290; CHECK-BASELINE-NEXT:    movq 24(%rdx), %rdi
4291; CHECK-BASELINE-NEXT:    movq 16(%rdx), %r8
4292; CHECK-BASELINE-NEXT:    movq (%rdx), %r9
4293; CHECK-BASELINE-NEXT:    movq 8(%rdx), %r10
4294; CHECK-BASELINE-NEXT:    movq (%rsi), %rdx
4295; CHECK-BASELINE-NEXT:    xorq %r9, %rdx
4296; CHECK-BASELINE-NEXT:    movq 8(%rsi), %r11
4297; CHECK-BASELINE-NEXT:    xorq %r10, %r11
4298; CHECK-BASELINE-NEXT:    movq 16(%rsi), %rbx
4299; CHECK-BASELINE-NEXT:    xorq %r8, %rbx
4300; CHECK-BASELINE-NEXT:    movq 24(%rsi), %rsi
4301; CHECK-BASELINE-NEXT:    xorq %rdi, %rsi
4302; CHECK-BASELINE-NEXT:    andq 24(%rcx), %rsi
4303; CHECK-BASELINE-NEXT:    andq 16(%rcx), %rbx
4304; CHECK-BASELINE-NEXT:    andq 8(%rcx), %r11
4305; CHECK-BASELINE-NEXT:    andq (%rcx), %rdx
4306; CHECK-BASELINE-NEXT:    xorq %r9, %rdx
4307; CHECK-BASELINE-NEXT:    xorq %r10, %r11
4308; CHECK-BASELINE-NEXT:    xorq %r8, %rbx
4309; CHECK-BASELINE-NEXT:    xorq %rdi, %rsi
4310; CHECK-BASELINE-NEXT:    movq %rsi, 24(%rax)
4311; CHECK-BASELINE-NEXT:    movq %rbx, 16(%rax)
4312; CHECK-BASELINE-NEXT:    movq %r11, 8(%rax)
4313; CHECK-BASELINE-NEXT:    movq %rdx, (%rax)
4314; CHECK-BASELINE-NEXT:    popq %rbx
4315; CHECK-BASELINE-NEXT:    retq
4316;
4317; CHECK-SSE1-LABEL: in_v4i64:
4318; CHECK-SSE1:       # %bb.0:
4319; CHECK-SSE1-NEXT:    pushq %rbx
4320; CHECK-SSE1-NEXT:    movq %rdi, %rax
4321; CHECK-SSE1-NEXT:    movq 24(%rdx), %rdi
4322; CHECK-SSE1-NEXT:    movq 16(%rdx), %r8
4323; CHECK-SSE1-NEXT:    movq (%rdx), %r9
4324; CHECK-SSE1-NEXT:    movq 8(%rdx), %r10
4325; CHECK-SSE1-NEXT:    movq (%rsi), %rdx
4326; CHECK-SSE1-NEXT:    xorq %r9, %rdx
4327; CHECK-SSE1-NEXT:    movq 8(%rsi), %r11
4328; CHECK-SSE1-NEXT:    xorq %r10, %r11
4329; CHECK-SSE1-NEXT:    movq 16(%rsi), %rbx
4330; CHECK-SSE1-NEXT:    xorq %r8, %rbx
4331; CHECK-SSE1-NEXT:    movq 24(%rsi), %rsi
4332; CHECK-SSE1-NEXT:    xorq %rdi, %rsi
4333; CHECK-SSE1-NEXT:    andq 24(%rcx), %rsi
4334; CHECK-SSE1-NEXT:    andq 16(%rcx), %rbx
4335; CHECK-SSE1-NEXT:    andq 8(%rcx), %r11
4336; CHECK-SSE1-NEXT:    andq (%rcx), %rdx
4337; CHECK-SSE1-NEXT:    xorq %r9, %rdx
4338; CHECK-SSE1-NEXT:    xorq %r10, %r11
4339; CHECK-SSE1-NEXT:    xorq %r8, %rbx
4340; CHECK-SSE1-NEXT:    xorq %rdi, %rsi
4341; CHECK-SSE1-NEXT:    movq %rsi, 24(%rax)
4342; CHECK-SSE1-NEXT:    movq %rbx, 16(%rax)
4343; CHECK-SSE1-NEXT:    movq %r11, 8(%rax)
4344; CHECK-SSE1-NEXT:    movq %rdx, (%rax)
4345; CHECK-SSE1-NEXT:    popq %rbx
4346; CHECK-SSE1-NEXT:    retq
4347;
4348; CHECK-SSE2-LABEL: in_v4i64:
4349; CHECK-SSE2:       # %bb.0:
4350; CHECK-SSE2-NEXT:    movaps (%rdx), %xmm0
4351; CHECK-SSE2-NEXT:    movaps 16(%rdx), %xmm1
4352; CHECK-SSE2-NEXT:    movaps %xmm0, %xmm2
4353; CHECK-SSE2-NEXT:    andnps (%rsi), %xmm2
4354; CHECK-SSE2-NEXT:    andps (%rdi), %xmm0
4355; CHECK-SSE2-NEXT:    orps %xmm2, %xmm0
4356; CHECK-SSE2-NEXT:    movaps %xmm1, %xmm2
4357; CHECK-SSE2-NEXT:    andnps 16(%rsi), %xmm2
4358; CHECK-SSE2-NEXT:    andps 16(%rdi), %xmm1
4359; CHECK-SSE2-NEXT:    orps %xmm2, %xmm1
4360; CHECK-SSE2-NEXT:    retq
4361;
4362; CHECK-XOP-LABEL: in_v4i64:
4363; CHECK-XOP:       # %bb.0:
4364; CHECK-XOP-NEXT:    vmovdqa (%rdi), %ymm0
4365; CHECK-XOP-NEXT:    vmovdqa (%rdx), %ymm1
4366; CHECK-XOP-NEXT:    vpcmov %ymm1, (%rsi), %ymm0, %ymm0
4367; CHECK-XOP-NEXT:    retq
; Load the three operands; each load is annotated with align 32.
4368  %x = load <4 x i64>, ptr%px, align 32
4369  %y = load <4 x i64>, ptr%py, align 32
4370  %mask = load <4 x i64>, ptr%pmask, align 32
; n0 has a 1 bit wherever x and y differ.
4371  %n0 = xor <4 x i64> %x, %y
; Keep only the differing bits that the mask selects.
4372  %n1 = and <4 x i64> %n0, %mask
; Xor with y again: masked bits become x, unmasked bits stay y.
4373  %r = xor <4 x i64> %n1, %y
4374  ret <4 x i64> %r
4375}
4376