; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=-sse2 | FileCheck %s --check-prefixes=ALL,SCALAR
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+sse2 | FileCheck %s --check-prefixes=ALL,SSE,SSE2,SSE2-ONLY
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+sse3 | FileCheck %s --check-prefixes=ALL,SSE,SSE2,SSE3
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+ssse3 | FileCheck %s --check-prefixes=ALL,SSE,SSE2,SSSE3,SSSE3-ONLY
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+sse4.1 | FileCheck %s --check-prefixes=ALL,SSE,SSE2,SSSE3,SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+sse4.2 | FileCheck %s --check-prefixes=ALL,SSE,SSE2,SSSE3,SSE42
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx | FileCheck %s --check-prefixes=ALL,SSE,AVX,AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx2 | FileCheck %s --check-prefixes=ALL,SSE,AVX,AVX2,AVX2-ONLY
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512vl | FileCheck %s --check-prefixes=ALL,SSE,AVX,AVX2,AVX512,AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512vl,+avx512bw | FileCheck %s --check-prefixes=ALL,SSE,AVX,AVX2,AVX512,AVX512BW

define void @vec32_v2i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
; SCALAR-LABEL: vec32_v2i8:
; SCALAR:       # %bb.0:
; SCALAR-NEXT:    movzbl (%rdi), %eax
; SCALAR-NEXT:    movzbl 1(%rdi), %ecx
; SCALAR-NEXT:    notb %al
; SCALAR-NEXT:    notb %cl
; SCALAR-NEXT:    movb %cl, 1(%rsi)
; SCALAR-NEXT:    movb %al, (%rsi)
; SCALAR-NEXT:    movb %cl, 1(%rdx)
; SCALAR-NEXT:    movb %al, (%rdx)
; SCALAR-NEXT:    movb %cl, 3(%rdx)
; SCALAR-NEXT:    movb %al, 2(%rdx)
; SCALAR-NEXT:    retq
;
; SSE-LABEL: vec32_v2i8:
; SSE:       # %bb.0:
; SSE-NEXT:    movl (%rdi), %eax
; SSE-NEXT:    notl %eax
; SSE-NEXT:    movw %ax, (%rsi)
; SSE-NEXT:    movw %ax, (%rdx)
; SSE-NEXT:    movw %ax, 2(%rdx)
; SSE-NEXT:    retq
  %in.subvec.not = load <2 x i8>, ptr %in.subvec.ptr, align 64
  %in.subvec = xor <2 x i8> %in.subvec.not, <i8 -1, i8 -1>
  store <2 x i8> %in.subvec, ptr %out.subvec.ptr, align 64
  %out.subvec0.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 0
  store <2 x i8> %in.subvec, ptr %out.subvec0.ptr, align 64
  %out.subvec1.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 1
  store <2 x i8> %in.subvec, ptr %out.subvec1.ptr, align 2
  ret void
}

define void @vec64_v2i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
; SCALAR-LABEL: vec64_v2i8:
; SCALAR:       # %bb.0:
; SCALAR-NEXT:    movzbl (%rdi), %eax
; SCALAR-NEXT:    movzbl 1(%rdi), %ecx
; SCALAR-NEXT:    notb %al
; SCALAR-NEXT:    notb %cl
; SCALAR-NEXT:    movb %cl, 1(%rsi)
; SCALAR-NEXT:    movb %al, (%rsi)
; SCALAR-NEXT:    movb %cl, 1(%rdx)
; SCALAR-NEXT:    movb %al, (%rdx)
; SCALAR-NEXT:    movb %cl, 3(%rdx)
; SCALAR-NEXT:    movb %al, 2(%rdx)
; SCALAR-NEXT:    movb %cl, 5(%rdx)
; SCALAR-NEXT:    movb %al, 4(%rdx)
; SCALAR-NEXT:    movb %cl, 7(%rdx)
; SCALAR-NEXT:    movb %al, 6(%rdx)
; SCALAR-NEXT:    retq
;
; SSE-LABEL: vec64_v2i8:
; SSE:       # %bb.0:
; SSE-NEXT:    movl (%rdi), %eax
; SSE-NEXT:    notl %eax
; SSE-NEXT:    movw %ax, (%rsi)
; SSE-NEXT:    movw %ax, (%rdx)
; SSE-NEXT:    movw %ax, 2(%rdx)
; SSE-NEXT:    movw %ax, 4(%rdx)
; SSE-NEXT:    movw %ax, 6(%rdx)
; SSE-NEXT:    retq
  %in.subvec.not = load <2 x i8>, ptr %in.subvec.ptr, align 64
  %in.subvec = xor <2 x i8> %in.subvec.not, <i8 -1, i8 -1>
  store <2 x i8> %in.subvec, ptr %out.subvec.ptr, align 64
  %out.subvec0.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 0
  store <2 x i8> %in.subvec, ptr %out.subvec0.ptr, align 64
  %out.subvec1.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 1
  store <2 x i8> %in.subvec, ptr %out.subvec1.ptr, align 2
  %out.subvec2.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 2
  store <2 x i8> %in.subvec, ptr %out.subvec2.ptr, align 4
  %out.subvec3.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 3
  store <2 x i8> %in.subvec, ptr %out.subvec3.ptr, align 2
  ret void
}

define void @vec64_v2i16(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
; SCALAR-LABEL: vec64_v2i16:
; SCALAR:       # %bb.0:
; SCALAR-NEXT:    movzwl 2(%rdi), %eax
; SCALAR-NEXT:    movl (%rdi), %ecx
; SCALAR-NEXT:    notl %ecx
; SCALAR-NEXT:    notl %eax
; SCALAR-NEXT:    movw %ax, 2(%rsi)
; SCALAR-NEXT:    movw %cx, (%rsi)
; SCALAR-NEXT:    movw %ax, 2(%rdx)
; SCALAR-NEXT:    movw %cx, (%rdx)
; SCALAR-NEXT:    movw %ax, 6(%rdx)
; SCALAR-NEXT:    movw %cx, 4(%rdx)
; SCALAR-NEXT:    retq
;
; SSE-LABEL: vec64_v2i16:
; SSE:       # %bb.0:
; SSE-NEXT:    movl (%rdi), %eax
; SSE-NEXT:    notl %eax
; SSE-NEXT:    movl %eax, (%rsi)
; SSE-NEXT:    movl %eax, (%rdx)
; SSE-NEXT:    movl %eax, 4(%rdx)
; SSE-NEXT:    retq
  %in.subvec.not = load <2 x i16>, ptr %in.subvec.ptr, align 64
  %in.subvec = xor <2 x i16> %in.subvec.not, <i16 -1, i16 -1>
  store <2 x i16> %in.subvec, ptr %out.subvec.ptr, align 64
  %out.subvec0.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 0
  store <2 x i16> %in.subvec, ptr %out.subvec0.ptr, align 64
  %out.subvec1.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 1
  store <2 x i16> %in.subvec, ptr %out.subvec1.ptr, align 4
  ret void
}

define void @vec64_v4i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
; SCALAR-LABEL: vec64_v4i8:
; SCALAR:       # %bb.0:
; SCALAR-NEXT:    movzbl 3(%rdi), %eax
; SCALAR-NEXT:    movzbl 2(%rdi), %ecx
; SCALAR-NEXT:    movzbl (%rdi), %r8d
; SCALAR-NEXT:    movzbl 1(%rdi), %edi
; SCALAR-NEXT:    notb %r8b
; SCALAR-NEXT:    notb %dil
; SCALAR-NEXT:    notb %cl
; SCALAR-NEXT:    notb %al
; SCALAR-NEXT:    movb %al, 3(%rsi)
; SCALAR-NEXT:    movb %cl, 2(%rsi)
; SCALAR-NEXT:    movb %dil, 1(%rsi)
; SCALAR-NEXT:    movb %r8b, (%rsi)
; SCALAR-NEXT:    movb %al, 3(%rdx)
; SCALAR-NEXT:    movb %cl, 2(%rdx)
; SCALAR-NEXT:    movb %dil, 1(%rdx)
; SCALAR-NEXT:    movb %r8b, (%rdx)
; SCALAR-NEXT:    movb %al, 7(%rdx)
; SCALAR-NEXT:    movb %cl, 6(%rdx)
; SCALAR-NEXT:    movb %dil, 5(%rdx)
; SCALAR-NEXT:    movb %r8b, 4(%rdx)
; SCALAR-NEXT:    retq
;
; SSE-LABEL: vec64_v4i8:
; SSE:       # %bb.0:
; SSE-NEXT:    movl (%rdi), %eax
; SSE-NEXT:    notl %eax
; SSE-NEXT:    movl %eax, (%rsi)
; SSE-NEXT:    movl %eax, (%rdx)
; SSE-NEXT:    movl %eax, 4(%rdx)
; SSE-NEXT:    retq
  %in.subvec.not = load <4 x i8>, ptr %in.subvec.ptr, align 64
  %in.subvec = xor <4 x i8> %in.subvec.not, <i8 -1, i8 -1, i8 -1, i8 -1>
  store <4 x i8> %in.subvec, ptr %out.subvec.ptr, align 64
  %out.subvec0.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 0
  store <4 x i8> %in.subvec, ptr %out.subvec0.ptr, align 64
  %out.subvec1.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 1
  store <4 x i8> %in.subvec, ptr %out.subvec1.ptr, align 4
  ret void
}

define void @vec128_v2i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
; SCALAR-LABEL: vec128_v2i8:
; SCALAR:       # %bb.0:
; SCALAR-NEXT:    movzbl (%rdi), %eax
; SCALAR-NEXT:    movzbl 1(%rdi), %ecx
; SCALAR-NEXT:    notb %al
; SCALAR-NEXT:    notb %cl
; SCALAR-NEXT:    movb %cl, 1(%rsi)
; SCALAR-NEXT:    movb %al, (%rsi)
; SCALAR-NEXT:    movb %cl, 1(%rdx)
; SCALAR-NEXT:    movb %al, (%rdx)
; SCALAR-NEXT:    movb %cl, 3(%rdx)
; SCALAR-NEXT:    movb %al, 2(%rdx)
; SCALAR-NEXT:    movb %cl, 5(%rdx)
; SCALAR-NEXT:    movb %al, 4(%rdx)
; SCALAR-NEXT:    movb %cl, 7(%rdx)
; SCALAR-NEXT:    movb %al, 6(%rdx)
; SCALAR-NEXT:    movb %cl, 9(%rdx)
; SCALAR-NEXT:    movb %al, 8(%rdx)
; SCALAR-NEXT:    movb %cl, 11(%rdx)
; SCALAR-NEXT:    movb %al, 10(%rdx)
; SCALAR-NEXT:    movb %cl, 13(%rdx)
; SCALAR-NEXT:    movb %al, 12(%rdx)
; SCALAR-NEXT:    movb %cl, 15(%rdx)
; SCALAR-NEXT:    movb %al, 14(%rdx)
; SCALAR-NEXT:    retq
;
; SSE2-ONLY-LABEL: vec128_v2i8:
; SSE2-ONLY:       # %bb.0:
; SSE2-ONLY-NEXT:    pcmpeqd %xmm0, %xmm0
; SSE2-ONLY-NEXT:    pxor (%rdi), %xmm0
; SSE2-ONLY-NEXT:    movd %xmm0, %eax
; SSE2-ONLY-NEXT:    movw %ax, (%rsi)
; SSE2-ONLY-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; SSE2-ONLY-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; SSE2-ONLY-NEXT:    movdqa %xmm0, (%rdx)
; SSE2-ONLY-NEXT:    retq
;
; SSE3-LABEL: vec128_v2i8:
; SSE3:       # %bb.0:
; SSE3-NEXT:    pcmpeqd %xmm0, %xmm0
; SSE3-NEXT:    pxor (%rdi), %xmm0
; SSE3-NEXT:    movd %xmm0, %eax
; SSE3-NEXT:    movw %ax, (%rsi)
; SSE3-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; SSE3-NEXT:    movdqa %xmm0, (%rdx)
; SSE3-NEXT:    retq
;
; SSSE3-ONLY-LABEL: vec128_v2i8:
; SSSE3-ONLY:       # %bb.0:
; SSSE3-ONLY-NEXT:    pcmpeqd %xmm0, %xmm0
; SSSE3-ONLY-NEXT:    pxor (%rdi), %xmm0
; SSSE3-ONLY-NEXT:    movd %xmm0, %eax
; SSSE3-ONLY-NEXT:    movw %ax, (%rsi)
; SSSE3-ONLY-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; SSSE3-ONLY-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; SSSE3-ONLY-NEXT:    movdqa %xmm0, (%rdx)
; SSSE3-ONLY-NEXT:    retq
;
; SSE41-LABEL: vec128_v2i8:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pcmpeqd %xmm0, %xmm0
; SSE41-NEXT:    pxor (%rdi), %xmm0
; SSE41-NEXT:    pextrw $0, %xmm0, (%rsi)
; SSE41-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; SSE41-NEXT:    movdqa %xmm0, (%rdx)
; SSE41-NEXT:    retq
;
; SSE42-LABEL: vec128_v2i8:
; SSE42:       # %bb.0:
; SSE42-NEXT:    pcmpeqd %xmm0, %xmm0
; SSE42-NEXT:    pxor (%rdi), %xmm0
; SSE42-NEXT:    pextrw $0, %xmm0, (%rsi)
; SSE42-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; SSE42-NEXT:    movdqa %xmm0, (%rdx)
; SSE42-NEXT:    retq
;
; AVX1-LABEL: vec128_v2i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX1-NEXT:    vpxor (%rdi), %xmm0, %xmm0
; AVX1-NEXT:    vpextrw $0, %xmm0, (%rsi)
; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; AVX1-NEXT:    vmovdqa %xmm0, (%rdx)
; AVX1-NEXT:    retq
;
; AVX2-LABEL: vec128_v2i8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX2-NEXT:    vpxor (%rdi), %xmm0, %xmm0
; AVX2-NEXT:    vpextrw $0, %xmm0, (%rsi)
; AVX2-NEXT:    vpbroadcastw %xmm0, %xmm0
; AVX2-NEXT:    vmovdqa %xmm0, (%rdx)
; AVX2-NEXT:    retq
  %in.subvec.not = load <2 x i8>, ptr %in.subvec.ptr, align 64
  %in.subvec = xor <2 x i8> %in.subvec.not, <i8 -1, i8 -1>
  store <2 x i8> %in.subvec, ptr %out.subvec.ptr, align 64
  %out.subvec0.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 0
  store <2 x i8> %in.subvec, ptr %out.subvec0.ptr, align 64
  %out.subvec1.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 1
  store <2 x i8> %in.subvec, ptr %out.subvec1.ptr, align 2
  %out.subvec2.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 2
  store <2 x i8> %in.subvec, ptr %out.subvec2.ptr, align 4
  %out.subvec3.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 3
  store <2 x i8> %in.subvec, ptr %out.subvec3.ptr, align 2
  %out.subvec4.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 4
  store <2 x i8> %in.subvec, ptr %out.subvec4.ptr, align 8
  %out.subvec5.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 5
  store <2 x i8> %in.subvec, ptr %out.subvec5.ptr, align 2
  %out.subvec6.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 6
  store <2 x i8> %in.subvec, ptr %out.subvec6.ptr, align 4
  %out.subvec7.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 7
  store <2 x i8> %in.subvec, ptr %out.subvec7.ptr, align 2
  ret void
}

define void @vec128_v2i16(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
; SCALAR-LABEL: vec128_v2i16:
; SCALAR:       # %bb.0:
; SCALAR-NEXT:    movzwl 2(%rdi), %eax
; SCALAR-NEXT:    movl (%rdi), %ecx
; SCALAR-NEXT:    notl %ecx
; SCALAR-NEXT:    notl %eax
; SCALAR-NEXT:    movw %ax, 2(%rsi)
; SCALAR-NEXT:    movw %cx, (%rsi)
; SCALAR-NEXT:    movw %ax, 2(%rdx)
; SCALAR-NEXT:    movw %cx, (%rdx)
; SCALAR-NEXT:    movw %ax, 6(%rdx)
; SCALAR-NEXT:    movw %cx, 4(%rdx)
; SCALAR-NEXT:    movw %ax, 10(%rdx)
; SCALAR-NEXT:    movw %cx, 8(%rdx)
; SCALAR-NEXT:    movw %ax, 14(%rdx)
; SCALAR-NEXT:    movw %cx, 12(%rdx)
; SCALAR-NEXT:    retq
;
; SSE2-LABEL: vec128_v2i16:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pcmpeqd %xmm0, %xmm0
; SSE2-NEXT:    pxor (%rdi), %xmm0
; SSE2-NEXT:    movd %xmm0, (%rsi)
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; SSE2-NEXT:    movdqa %xmm0, (%rdx)
; SSE2-NEXT:    retq
;
; AVX1-LABEL: vec128_v2i16:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX1-NEXT:    vpxor (%rdi), %xmm0, %xmm0
; AVX1-NEXT:    vmovd %xmm0, (%rsi)
; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; AVX1-NEXT:    vmovdqa %xmm0, (%rdx)
; AVX1-NEXT:    retq
;
; AVX2-LABEL: vec128_v2i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX2-NEXT:    vpxor (%rdi), %xmm0, %xmm0
; AVX2-NEXT:    vmovd %xmm0, (%rsi)
; AVX2-NEXT:    vpbroadcastd %xmm0, %xmm0
; AVX2-NEXT:    vmovdqa %xmm0, (%rdx)
; AVX2-NEXT:    retq
  %in.subvec.not = load <2 x i16>, ptr %in.subvec.ptr, align 64
  %in.subvec = xor <2 x i16> %in.subvec.not, <i16 -1, i16 -1>
  store <2 x i16> %in.subvec, ptr %out.subvec.ptr, align 64
  %out.subvec0.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 0
  store <2 x i16> %in.subvec, ptr %out.subvec0.ptr, align 64
  %out.subvec1.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 1
  store <2 x i16> %in.subvec, ptr %out.subvec1.ptr, align 4
  %out.subvec2.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 2
  store <2 x i16> %in.subvec, ptr %out.subvec2.ptr, align 8
  %out.subvec3.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 3
  store <2 x i16> %in.subvec, ptr %out.subvec3.ptr, align 4
  ret void
}

define void @vec128_v2i32(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
; SCALAR-LABEL: vec128_v2i32:
; SCALAR:       # %bb.0:
; SCALAR-NEXT:    movl (%rdi), %eax
; SCALAR-NEXT:    movl 4(%rdi), %ecx
; SCALAR-NEXT:    notl %eax
; SCALAR-NEXT:    notl %ecx
; SCALAR-NEXT:    movl %ecx, 4(%rsi)
; SCALAR-NEXT:    movl %eax, (%rsi)
; SCALAR-NEXT:    movl %ecx, 4(%rdx)
; SCALAR-NEXT:    movl %eax, (%rdx)
; SCALAR-NEXT:    movl %ecx, 12(%rdx)
; SCALAR-NEXT:    movl %eax, 8(%rdx)
; SCALAR-NEXT:    retq
;
; SSE2-LABEL: vec128_v2i32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
; SSE2-NEXT:    pcmpeqd %xmm1, %xmm1
; SSE2-NEXT:    pxor %xmm0, %xmm1
; SSE2-NEXT:    movq %xmm1, (%rsi)
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,1,0,1]
; SSE2-NEXT:    movdqa %xmm0, (%rdx)
; SSE2-NEXT:    retq
;
; AVX1-LABEL: vec128_v2i32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX1-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vmovq %xmm0, (%rsi)
; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX1-NEXT:    vmovdqa %xmm0, (%rdx)
; AVX1-NEXT:    retq
;
; AVX2-ONLY-LABEL: vec128_v2i32:
; AVX2-ONLY:       # %bb.0:
; AVX2-ONLY-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX2-ONLY-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX2-ONLY-NEXT:    vpxor %xmm1, %xmm0, %xmm0
; AVX2-ONLY-NEXT:    vmovq %xmm0, (%rsi)
; AVX2-ONLY-NEXT:    vpbroadcastq %xmm0, %xmm0
; AVX2-ONLY-NEXT:    vmovdqa %xmm0, (%rdx)
; AVX2-ONLY-NEXT:    retq
;
; AVX512-LABEL: vec128_v2i32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX512-NEXT:    vpternlogq {{.*#+}} xmm0 = ~xmm0
; AVX512-NEXT:    vmovq %xmm0, (%rsi)
; AVX512-NEXT:    vpbroadcastq %xmm0, %xmm0
; AVX512-NEXT:    vmovdqa %xmm0, (%rdx)
; AVX512-NEXT:    retq
  %in.subvec.not = load <2 x i32>, ptr %in.subvec.ptr, align 64
  %in.subvec = xor <2 x i32> %in.subvec.not, <i32 -1, i32 -1>
  store <2 x i32> %in.subvec, ptr %out.subvec.ptr, align 64
  %out.subvec0.ptr = getelementptr <2 x i32>, ptr %out.vec.ptr, i64 0
  store <2 x i32> %in.subvec, ptr %out.subvec0.ptr, align 64
  %out.subvec1.ptr = getelementptr <2 x i32>, ptr %out.vec.ptr, i64 1
  store <2 x i32> %in.subvec, ptr %out.subvec1.ptr, align 8
  ret void
}

define void @vec128_v2f32(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
; SCALAR-LABEL: vec128_v2f32:
; SCALAR:       # %bb.0:
; SCALAR-NEXT:    movl (%rdi), %eax
; SCALAR-NEXT:    movl 4(%rdi), %ecx
; SCALAR-NEXT:    notl %eax
; SCALAR-NEXT:    notl %ecx
; SCALAR-NEXT:    movl %ecx, 4(%rsi)
; SCALAR-NEXT:    movl %eax, (%rsi)
; SCALAR-NEXT:    movl %ecx, 4(%rdx)
; SCALAR-NEXT:    movl %eax, (%rdx)
; SCALAR-NEXT:    movl %ecx, 12(%rdx)
; SCALAR-NEXT:    movl %eax, 8(%rdx)
; SCALAR-NEXT:    retq
;
; SSE2-LABEL: vec128_v2f32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
; SSE2-NEXT:    pcmpeqd %xmm1, %xmm1
; SSE2-NEXT:    pxor %xmm0, %xmm1
; SSE2-NEXT:    movq %xmm1, (%rsi)
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,1,0,1]
; SSE2-NEXT:    movdqa %xmm0, (%rdx)
; SSE2-NEXT:    retq
;
; AVX1-LABEL: vec128_v2f32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX1-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vmovq %xmm0, (%rsi)
; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX1-NEXT:    vmovdqa %xmm0, (%rdx)
; AVX1-NEXT:    retq
;
; AVX2-ONLY-LABEL: vec128_v2f32:
; AVX2-ONLY:       # %bb.0:
; AVX2-ONLY-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX2-ONLY-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX2-ONLY-NEXT:    vpxor %xmm1, %xmm0, %xmm0
; AVX2-ONLY-NEXT:    vmovq %xmm0, (%rsi)
; AVX2-ONLY-NEXT:    vpbroadcastq %xmm0, %xmm0
; AVX2-ONLY-NEXT:    vmovdqa %xmm0, (%rdx)
; AVX2-ONLY-NEXT:    retq
;
; AVX512-LABEL: vec128_v2f32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX512-NEXT:    vpternlogq {{.*#+}} xmm0 = ~xmm0
; AVX512-NEXT:    vmovq %xmm0, (%rsi)
; AVX512-NEXT:    vpbroadcastq %xmm0, %xmm0
; AVX512-NEXT:    vmovdqa %xmm0, (%rdx)
; AVX512-NEXT:    retq
  %in.subvec.not = load <2 x i32>, ptr %in.subvec.ptr, align 64
  %in.subvec.int = xor <2 x i32> %in.subvec.not, <i32 -1, i32 -1>
  %in.subvec = bitcast <2 x i32> %in.subvec.int to <2 x float>
  store <2 x float> %in.subvec, ptr %out.subvec.ptr, align 64
  %out.subvec0.ptr = getelementptr <2 x float>, ptr %out.vec.ptr, i64 0
  store <2 x float> %in.subvec, ptr %out.subvec0.ptr, align 64
  %out.subvec1.ptr = getelementptr <2 x float>, ptr %out.vec.ptr, i64 1
  store <2 x float> %in.subvec, ptr %out.subvec1.ptr, align 8
  ret void
}

define void @vec128_v4i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
; SCALAR-LABEL: vec128_v4i8:
; SCALAR:       # %bb.0:
; SCALAR-NEXT:    movzbl 3(%rdi), %eax
; SCALAR-NEXT:    movzbl 2(%rdi), %ecx
; SCALAR-NEXT:    movzbl (%rdi), %r8d
; SCALAR-NEXT:    movzbl 1(%rdi), %edi
; SCALAR-NEXT:    notb %r8b
; SCALAR-NEXT:    notb %dil
; SCALAR-NEXT:    notb %cl
; SCALAR-NEXT:    notb %al
; SCALAR-NEXT:    movb %al, 3(%rsi)
; SCALAR-NEXT:    movb %cl, 2(%rsi)
; SCALAR-NEXT:    movb %dil, 1(%rsi)
; SCALAR-NEXT:    movb %r8b, (%rsi)
; SCALAR-NEXT:    movb %al, 3(%rdx)
; SCALAR-NEXT:    movb %cl, 2(%rdx)
; SCALAR-NEXT:    movb %dil, 1(%rdx)
; SCALAR-NEXT:    movb %r8b, (%rdx)
; SCALAR-NEXT:    movb %al, 7(%rdx)
; SCALAR-NEXT:    movb %cl, 6(%rdx)
; SCALAR-NEXT:    movb %dil, 5(%rdx)
; SCALAR-NEXT:    movb %r8b, 4(%rdx)
; SCALAR-NEXT:    movb %al, 11(%rdx)
; SCALAR-NEXT:    movb %cl, 10(%rdx)
; SCALAR-NEXT:    movb %dil, 9(%rdx)
; SCALAR-NEXT:    movb %r8b, 8(%rdx)
; SCALAR-NEXT:    movb %al, 15(%rdx)
; SCALAR-NEXT:    movb %cl, 14(%rdx)
; SCALAR-NEXT:    movb %dil, 13(%rdx)
; SCALAR-NEXT:    movb %r8b, 12(%rdx)
; SCALAR-NEXT:    retq
;
; SSE2-LABEL: vec128_v4i8:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pcmpeqd %xmm0, %xmm0
; SSE2-NEXT:    pxor (%rdi), %xmm0
; SSE2-NEXT:    movd %xmm0, (%rsi)
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; SSE2-NEXT:    movdqa %xmm0, (%rdx)
; SSE2-NEXT:    retq
;
; AVX1-LABEL: vec128_v4i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX1-NEXT:    vpxor (%rdi), %xmm0, %xmm0
; AVX1-NEXT:    vmovd %xmm0, (%rsi)
; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; AVX1-NEXT:    vmovdqa %xmm0, (%rdx)
; AVX1-NEXT:    retq
;
; AVX2-LABEL: vec128_v4i8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX2-NEXT:    vpxor (%rdi), %xmm0, %xmm0
; AVX2-NEXT:    vmovd %xmm0, (%rsi)
; AVX2-NEXT:    vpbroadcastd %xmm0, %xmm0
; AVX2-NEXT:    vmovdqa %xmm0, (%rdx)
; AVX2-NEXT:    retq
  %in.subvec.not = load <4 x i8>, ptr %in.subvec.ptr, align 64
  %in.subvec = xor <4 x i8> %in.subvec.not, <i8 -1, i8 -1, i8 -1, i8 -1>
  store <4 x i8> %in.subvec, ptr %out.subvec.ptr, align 64
  %out.subvec0.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 0
  store <4 x i8> %in.subvec, ptr %out.subvec0.ptr, align 64
  %out.subvec1.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 1
  store <4 x i8> %in.subvec, ptr %out.subvec1.ptr, align 4
  %out.subvec2.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 2
  store <4 x i8> %in.subvec, ptr %out.subvec2.ptr, align 8
  %out.subvec3.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 3
  store <4 x i8> %in.subvec, ptr %out.subvec3.ptr, align 4
  ret void
}

define void @vec128_v4i16(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
; SCALAR-LABEL: vec128_v4i16:
; SCALAR:       # %bb.0:
; SCALAR-NEXT:    movzwl 6(%rdi), %eax
; SCALAR-NEXT:    movzwl 2(%rdi), %ecx
; SCALAR-NEXT:    movl (%rdi), %r8d
; SCALAR-NEXT:    movl 4(%rdi), %edi
; SCALAR-NEXT:    notl %r8d
; SCALAR-NEXT:    notl %ecx
; SCALAR-NEXT:    notl %edi
; SCALAR-NEXT:    notl %eax
; SCALAR-NEXT:    movw %ax, 6(%rsi)
; SCALAR-NEXT:    movw %di, 4(%rsi)
; SCALAR-NEXT:    movw %cx, 2(%rsi)
; SCALAR-NEXT:    movw %r8w, (%rsi)
; SCALAR-NEXT:    movw %ax, 6(%rdx)
; SCALAR-NEXT:    movw %di, 4(%rdx)
; SCALAR-NEXT:    movw %cx, 2(%rdx)
; SCALAR-NEXT:    movw %r8w, (%rdx)
; SCALAR-NEXT:    movw %ax, 14(%rdx)
; SCALAR-NEXT:    movw %di, 12(%rdx)
; SCALAR-NEXT:    movw %cx, 10(%rdx)
; SCALAR-NEXT:    movw %r8w, 8(%rdx)
; SCALAR-NEXT:    retq
;
; SSE2-LABEL: vec128_v4i16:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
; SSE2-NEXT:    pcmpeqd %xmm1, %xmm1
; SSE2-NEXT:    pxor %xmm0, %xmm1
; SSE2-NEXT:    movq %xmm1, (%rsi)
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,1,0,1]
; SSE2-NEXT:    movdqa %xmm0, (%rdx)
; SSE2-NEXT:    retq
;
; AVX1-LABEL: vec128_v4i16:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX1-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vmovq %xmm0, (%rsi)
; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX1-NEXT:    vmovdqa %xmm0, (%rdx)
; AVX1-NEXT:    retq
;
; AVX2-ONLY-LABEL: vec128_v4i16:
; AVX2-ONLY:       # %bb.0:
; AVX2-ONLY-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX2-ONLY-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX2-ONLY-NEXT:    vpxor %xmm1, %xmm0, %xmm0
; AVX2-ONLY-NEXT:    vmovq %xmm0, (%rsi)
; AVX2-ONLY-NEXT:    vpbroadcastq %xmm0, %xmm0
; AVX2-ONLY-NEXT:    vmovdqa %xmm0, (%rdx)
; AVX2-ONLY-NEXT:    retq
;
; AVX512-LABEL: vec128_v4i16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX512-NEXT:    vpternlogq {{.*#+}} xmm0 = ~xmm0
; AVX512-NEXT:    vmovq %xmm0, (%rsi)
; AVX512-NEXT:    vpbroadcastq %xmm0, %xmm0
; AVX512-NEXT:    vmovdqa %xmm0, (%rdx)
; AVX512-NEXT:    retq
  %in.subvec.not = load <4 x i16>, ptr %in.subvec.ptr, align 64
  %in.subvec = xor <4 x i16> %in.subvec.not, <i16 -1, i16 -1, i16 -1, i16 -1>
  store <4 x i16> %in.subvec, ptr %out.subvec.ptr, align 64
  %out.subvec0.ptr = getelementptr <4 x i16>, ptr %out.vec.ptr, i64 0
  store <4 x i16> %in.subvec, ptr %out.subvec0.ptr, align 64
  %out.subvec1.ptr = getelementptr <4 x i16>, ptr %out.vec.ptr, i64 1
  store <4 x i16> %in.subvec, ptr %out.subvec1.ptr, align 8
  ret void
}

define void @vec128_v8i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
; SCALAR-LABEL: vec128_v8i8:
; SCALAR:       # %bb.0:
; SCALAR-NEXT:    pushq %rbx
; SCALAR-NEXT:    movzbl 7(%rdi), %ebx
; SCALAR-NEXT:    movzbl 6(%rdi), %r11d
; SCALAR-NEXT:    movzbl 5(%rdi), %r10d
; SCALAR-NEXT:    movzbl 4(%rdi), %r9d
; SCALAR-NEXT:    movzbl 3(%rdi), %r8d
; SCALAR-NEXT:    movzbl 2(%rdi), %ecx
; SCALAR-NEXT:    movzbl (%rdi), %eax
; SCALAR-NEXT:    movzbl 1(%rdi), %edi
; SCALAR-NEXT:    notb %al
; SCALAR-NEXT:    notb %dil
; SCALAR-NEXT:    notb %cl
; SCALAR-NEXT:    notb %r8b
; SCALAR-NEXT:    notb %r9b
; SCALAR-NEXT:    notb %r10b
; SCALAR-NEXT:    notb %r11b
; SCALAR-NEXT:    notb %bl
; SCALAR-NEXT:    movb %bl, 7(%rsi)
; SCALAR-NEXT:    movb %r11b, 6(%rsi)
; SCALAR-NEXT:    movb %r10b, 5(%rsi)
; SCALAR-NEXT:    movb %r9b, 4(%rsi)
; SCALAR-NEXT:    movb %r8b, 3(%rsi)
; SCALAR-NEXT:    movb %cl, 2(%rsi)
; SCALAR-NEXT:    movb %dil, 1(%rsi)
; SCALAR-NEXT:    movb %al, (%rsi)
; SCALAR-NEXT:    movb %bl, 7(%rdx)
; SCALAR-NEXT:    movb %r11b, 6(%rdx)
; SCALAR-NEXT:    movb %r10b, 5(%rdx)
; SCALAR-NEXT:    movb %r9b, 4(%rdx)
; SCALAR-NEXT:    movb %r8b, 3(%rdx)
; SCALAR-NEXT:    movb %cl, 2(%rdx)
; SCALAR-NEXT:    movb %dil, 1(%rdx)
; SCALAR-NEXT:    movb %al, (%rdx)
; SCALAR-NEXT:    movb %bl, 15(%rdx)
; SCALAR-NEXT:    movb %r11b, 14(%rdx)
; SCALAR-NEXT:    movb %r10b, 13(%rdx)
; SCALAR-NEXT:    movb %r9b, 12(%rdx)
; SCALAR-NEXT:    movb %r8b, 11(%rdx)
; SCALAR-NEXT:    movb %cl, 10(%rdx)
; SCALAR-NEXT:    movb %dil, 9(%rdx)
; SCALAR-NEXT:    movb %al, 8(%rdx)
; SCALAR-NEXT:    popq %rbx
; SCALAR-NEXT:    retq
;
; SSE2-LABEL: vec128_v8i8:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
; SSE2-NEXT:    pcmpeqd %xmm1, %xmm1
; SSE2-NEXT:    pxor %xmm0, %xmm1
; SSE2-NEXT:    movq %xmm1, (%rsi)
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,1,0,1]
; SSE2-NEXT:    movdqa %xmm0, (%rdx)
; SSE2-NEXT:    retq
;
; AVX1-LABEL: vec128_v8i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX1-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vmovq %xmm0, (%rsi)
; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX1-NEXT:    vmovdqa %xmm0, (%rdx)
; AVX1-NEXT:    retq
;
; AVX2-ONLY-LABEL: vec128_v8i8:
; AVX2-ONLY:       # %bb.0:
; AVX2-ONLY-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX2-ONLY-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX2-ONLY-NEXT:    vpxor %xmm1, %xmm0, %xmm0
; AVX2-ONLY-NEXT:    vmovq %xmm0, (%rsi)
; AVX2-ONLY-NEXT:    vpbroadcastq %xmm0, %xmm0
; AVX2-ONLY-NEXT:    vmovdqa %xmm0, (%rdx)
; AVX2-ONLY-NEXT:    retq
;
; AVX512-LABEL: vec128_v8i8:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX512-NEXT:    vpternlogq {{.*#+}} xmm0 = ~xmm0
; AVX512-NEXT:    vmovq %xmm0, (%rsi)
; AVX512-NEXT:    vpbroadcastq %xmm0, %xmm0
; AVX512-NEXT:    vmovdqa %xmm0, (%rdx)
; AVX512-NEXT:    retq
  %in.subvec.not = load <8 x i8>, ptr %in.subvec.ptr, align 64
  %in.subvec = xor <8 x i8> %in.subvec.not, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
  store <8 x i8> %in.subvec, ptr %out.subvec.ptr, align 64
  %out.subvec0.ptr = getelementptr <8 x i8>, ptr %out.vec.ptr, i64 0
  store <8 x i8> %in.subvec, ptr %out.subvec0.ptr, align 64
  %out.subvec1.ptr = getelementptr <8 x i8>, ptr %out.vec.ptr, i64 1
  store <8 x i8> %in.subvec, ptr %out.subvec1.ptr, align 8
  ret void
}

define void @vec256_v2i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
; SCALAR-LABEL: vec256_v2i8:
; SCALAR:       # %bb.0:
; SCALAR-NEXT:    movzbl (%rdi), %eax
; SCALAR-NEXT:    movzbl 1(%rdi), %ecx
; SCALAR-NEXT:    notb %al
; SCALAR-NEXT:    notb %cl
; SCALAR-NEXT:    movb %cl, 1(%rsi)
; SCALAR-NEXT:    movb %al, (%rsi)
; SCALAR-NEXT:    movb %cl, 1(%rdx)
; SCALAR-NEXT:    movb %al, (%rdx)
; SCALAR-NEXT:    movb %cl, 3(%rdx)
; SCALAR-NEXT:    movb %al, 2(%rdx)
; SCALAR-NEXT:    movb %cl, 5(%rdx)
; SCALAR-NEXT:    movb %al, 4(%rdx)
; SCALAR-NEXT:    movb %cl, 7(%rdx)
; SCALAR-NEXT:    movb %al, 6(%rdx)
; SCALAR-NEXT:    movb %cl, 9(%rdx)
; SCALAR-NEXT:    movb %al, 8(%rdx)
; SCALAR-NEXT:    movb %cl, 11(%rdx)
; SCALAR-NEXT:    movb %al, 10(%rdx)
; SCALAR-NEXT:    movb %cl, 13(%rdx)
; SCALAR-NEXT:    movb %al, 12(%rdx)
; SCALAR-NEXT:    movb %cl, 15(%rdx)
; SCALAR-NEXT:    movb %al, 14(%rdx)
; SCALAR-NEXT:    movb %cl, 17(%rdx)
; SCALAR-NEXT:    movb %al, 16(%rdx)
; SCALAR-NEXT:    movb %cl, 19(%rdx)
; SCALAR-NEXT:    movb %al, 18(%rdx)
; SCALAR-NEXT:    movb %cl, 21(%rdx)
; SCALAR-NEXT:    movb %al, 20(%rdx)
; SCALAR-NEXT:    movb %cl, 23(%rdx)
; SCALAR-NEXT:    movb %al, 22(%rdx)
; SCALAR-NEXT:    movb %cl, 25(%rdx)
; SCALAR-NEXT:    movb %al, 24(%rdx)
; SCALAR-NEXT:    movb %cl, 27(%rdx)
; SCALAR-NEXT:    movb %al, 26(%rdx)
; SCALAR-NEXT:    movb %cl, 29(%rdx)
; SCALAR-NEXT:    movb %al, 28(%rdx)
; SCALAR-NEXT:    movb %cl, 31(%rdx)
; SCALAR-NEXT:    movb %al, 30(%rdx)
; SCALAR-NEXT:    retq
;
; SSE2-ONLY-LABEL: vec256_v2i8:
; SSE2-ONLY:       # %bb.0:
; SSE2-ONLY-NEXT:    pcmpeqd %xmm0, %xmm0
; SSE2-ONLY-NEXT:    pxor (%rdi), %xmm0
; SSE2-ONLY-NEXT:    movd %xmm0, %eax
; SSE2-ONLY-NEXT:    movw %ax, (%rsi)
; SSE2-ONLY-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; SSE2-ONLY-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; SSE2-ONLY-NEXT:    movdqa %xmm0, (%rdx)
; SSE2-ONLY-NEXT:    movdqa %xmm0, 16(%rdx)
; SSE2-ONLY-NEXT:    retq
;
; SSE3-LABEL: vec256_v2i8:
; SSE3:       # %bb.0:
; SSE3-NEXT:    pcmpeqd %xmm0, %xmm0
; SSE3-NEXT:    pxor (%rdi), %xmm0
; SSE3-NEXT:    movd %xmm0, %eax
; SSE3-NEXT:    movw %ax, (%rsi)
; SSE3-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; SSE3-NEXT:    movdqa %xmm0, (%rdx)
; SSE3-NEXT:    movdqa %xmm0, 16(%rdx)
; SSE3-NEXT:    retq
;
; SSSE3-ONLY-LABEL: vec256_v2i8:
; SSSE3-ONLY:       # %bb.0:
; SSSE3-ONLY-NEXT:    pcmpeqd %xmm0, %xmm0
; SSSE3-ONLY-NEXT:    pxor (%rdi), %xmm0
; SSSE3-ONLY-NEXT:    movd %xmm0, %eax
; SSSE3-ONLY-NEXT:    movw %ax, (%rsi)
; SSSE3-ONLY-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; SSSE3-ONLY-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; SSSE3-ONLY-NEXT:    movdqa %xmm0, (%rdx)
; SSSE3-ONLY-NEXT:    movdqa %xmm0, 16(%rdx)
; SSSE3-ONLY-NEXT:    retq
;
; SSE41-LABEL: vec256_v2i8:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pcmpeqd %xmm0, %xmm0
; SSE41-NEXT:    pxor (%rdi), %xmm0
; SSE41-NEXT:    pextrw $0, %xmm0, (%rsi)
; SSE41-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; SSE41-NEXT:    movdqa %xmm0, (%rdx)
; SSE41-NEXT:    movdqa %xmm0, 16(%rdx)
; SSE41-NEXT:    retq
;
; SSE42-LABEL: vec256_v2i8:
; SSE42:       # %bb.0:
; SSE42-NEXT:    pcmpeqd %xmm0, %xmm0
; SSE42-NEXT:    pxor (%rdi), %xmm0
; SSE42-NEXT:    pextrw $0, %xmm0, (%rsi)
; SSE42-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; SSE42-NEXT:    movdqa %xmm0, (%rdx)
; SSE42-NEXT:    movdqa %xmm0, 16(%rdx)
; SSE42-NEXT:    retq
;
; AVX1-LABEL: vec256_v2i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX1-NEXT:    vpxor (%rdi), %xmm0, %xmm0
; AVX1-NEXT:    vpextrw $0, %xmm0, (%rsi)
; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT:    vmovaps %ymm0, (%rdx)
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: vec256_v2i8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX2-NEXT:    vpxor (%rdi), %xmm0, %xmm0
; AVX2-NEXT:    vpextrw $0, %xmm0, (%rsi)
; AVX2-NEXT:    vpbroadcastw %xmm0, %ymm0
; AVX2-NEXT:    vmovdqa %ymm0, (%rdx)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
  %in.subvec.not = load <2 x i8>, ptr %in.subvec.ptr, align 64
  %in.subvec = xor <2 x i8> %in.subvec.not, <i8 -1, i8 -1>
  store <2 x i8> %in.subvec, ptr %out.subvec.ptr, align 64
  %out.subvec0.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 0
  store <2 x i8> %in.subvec, ptr %out.subvec0.ptr, align 64
  %out.subvec1.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 1
  store <2 x i8> %in.subvec, ptr %out.subvec1.ptr, align 2
  %out.subvec2.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 2
  store <2 x i8> %in.subvec, ptr %out.subvec2.ptr, align 4
  %out.subvec3.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 3
  store <2 x i8> %in.subvec, ptr %out.subvec3.ptr, align 2
  %out.subvec4.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 4
  store <2 x i8> %in.subvec, ptr %out.subvec4.ptr, align 8
  %out.subvec5.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 5
  store <2 x i8> %in.subvec, ptr %out.subvec5.ptr, align 2
  %out.subvec6.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 6
  store <2 x i8> %in.subvec, ptr %out.subvec6.ptr, align 4
  %out.subvec7.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 7
  store <2 x i8> %in.subvec, ptr %out.subvec7.ptr, align 2
  %out.subvec8.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 8
  store <2 x i8> %in.subvec, ptr %out.subvec8.ptr, align 16
  %out.subvec9.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 9
  store <2 x i8> %in.subvec, ptr %out.subvec9.ptr, align 2
  %out.subvec10.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 10
  store <2 x i8> %in.subvec, ptr %out.subvec10.ptr, align 4
  %out.subvec11.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 11
  store <2 x i8> %in.subvec, ptr %out.subvec11.ptr, align 2
  %out.subvec12.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 12
  store <2 x i8> %in.subvec, ptr %out.subvec12.ptr, align 8
  %out.subvec13.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 13
  store <2 x i8> %in.subvec, ptr %out.subvec13.ptr, align 2
  %out.subvec14.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 14
  store <2 x i8> %in.subvec, ptr %out.subvec14.ptr, align 4
  %out.subvec15.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 15
  store <2 x i8> %in.subvec, ptr %out.subvec15.ptr, align 2
  ret void
}

define void @vec256_v2i16(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
; SCALAR-LABEL: vec256_v2i16:
; SCALAR:       # %bb.0:
; SCALAR-NEXT:    movzwl 2(%rdi), %eax
; SCALAR-NEXT:    movl (%rdi), %ecx
; SCALAR-NEXT:    notl %ecx
; SCALAR-NEXT:    notl %eax
; SCALAR-NEXT:    movw %ax, 2(%rsi)
; SCALAR-NEXT:    movw %cx, (%rsi)
; SCALAR-NEXT:    movw %ax, 2(%rdx)
; SCALAR-NEXT:    movw %cx, (%rdx)
; SCALAR-NEXT:    movw %ax, 6(%rdx)
; SCALAR-NEXT:    movw %cx, 4(%rdx)
; SCALAR-NEXT:    movw %ax, 10(%rdx)
; SCALAR-NEXT:    movw %cx, 8(%rdx)
; SCALAR-NEXT:    movw %ax, 14(%rdx)
; SCALAR-NEXT:    movw %cx, 12(%rdx)
; SCALAR-NEXT:    movw %ax, 18(%rdx)
; SCALAR-NEXT:    movw %cx, 16(%rdx)
; SCALAR-NEXT:    movw %ax, 22(%rdx)
; SCALAR-NEXT:    movw %cx, 20(%rdx)
; SCALAR-NEXT:    movw %ax, 26(%rdx)
; SCALAR-NEXT:    movw %cx, 24(%rdx)
; SCALAR-NEXT:    movw %ax, 30(%rdx)
; SCALAR-NEXT:    movw %cx, 28(%rdx)
; SCALAR-NEXT:    retq
;
; SSE2-LABEL: vec256_v2i16:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pcmpeqd %xmm0, %xmm0
; SSE2-NEXT:    pxor (%rdi), %xmm0
; SSE2-NEXT:    movd %xmm0, (%rsi)
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; SSE2-NEXT:    movdqa %xmm0, (%rdx)
; SSE2-NEXT:    movdqa %xmm0, 16(%rdx)
; SSE2-NEXT:    retq
;
; AVX1-LABEL: vec256_v2i16:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX1-NEXT:    vpxor (%rdi), %xmm0, %xmm0
; AVX1-NEXT:    vmovd %xmm0, (%rsi)
; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; AVX1-NEXT:    vmovdqa %xmm0, 16(%rdx)
; AVX1-NEXT:    vmovdqa %xmm0, (%rdx)
; AVX1-NEXT:    retq
;
; AVX2-LABEL: vec256_v2i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX2-NEXT:    vpxor (%rdi), %xmm0, %xmm0
; AVX2-NEXT:    vmovd %xmm0, (%rsi)
; AVX2-NEXT:    vpbroadcastd %xmm0, %ymm0
; AVX2-NEXT:    vmovdqa %ymm0, (%rdx)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
  %in.subvec.not = load <2 x i16>, ptr %in.subvec.ptr, align 64
  %in.subvec = xor <2 x i16> %in.subvec.not, <i16 -1, i16 -1>
  store <2 x i16> %in.subvec, ptr %out.subvec.ptr, align 64
  %out.subvec0.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 0
  store <2 x i16> %in.subvec, ptr %out.subvec0.ptr, align 64
  %out.subvec1.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 1
  store <2 x i16> %in.subvec, ptr %out.subvec1.ptr, align 4
  %out.subvec2.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 2
  store <2 x i16> %in.subvec, ptr %out.subvec2.ptr, align 8
  %out.subvec3.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 3
  store <2 x i16> %in.subvec, ptr %out.subvec3.ptr, align 4
  %out.subvec4.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 4
  store <2 x i16> %in.subvec, ptr %out.subvec4.ptr, align 16
  %out.subvec5.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 5
  store <2 x i16> %in.subvec, ptr %out.subvec5.ptr, align 4
  %out.subvec6.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 6
  store <2 x i16> %in.subvec, ptr %out.subvec6.ptr, align 8
  %out.subvec7.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 7
  store <2 x i16> %in.subvec, ptr %out.subvec7.ptr, align 4
  ret void
}

define void @vec256_v2i32(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
; SCALAR-LABEL: vec256_v2i32:
; SCALAR:       # %bb.0:
; SCALAR-NEXT:    movl (%rdi), %eax
; SCALAR-NEXT:    movl 4(%rdi), %ecx
; SCALAR-NEXT:    notl %eax
; SCALAR-NEXT:    notl %ecx
; SCALAR-NEXT:    movl %ecx, 4(%rsi)
; SCALAR-NEXT:    movl %eax, (%rsi)
; SCALAR-NEXT:    movl %ecx, 4(%rdx)
; SCALAR-NEXT:    movl %eax, (%rdx)
; SCALAR-NEXT:    movl %ecx, 12(%rdx)
; SCALAR-NEXT:    movl %eax, 8(%rdx)
; SCALAR-NEXT:    movl %ecx, 20(%rdx)
; SCALAR-NEXT:    movl %eax, 16(%rdx)
; SCALAR-NEXT:    movl %ecx, 28(%rdx)
; SCALAR-NEXT:    movl %eax, 24(%rdx)
; SCALAR-NEXT:    retq
;
; SSE2-LABEL: vec256_v2i32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
; SSE2-NEXT:    pcmpeqd %xmm1, %xmm1
; SSE2-NEXT:    pxor %xmm0, %xmm1
; SSE2-NEXT:    movq %xmm1, (%rsi)
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,1,0,1]
; SSE2-NEXT:    movdqa %xmm0, (%rdx)
; SSE2-NEXT:    movdqa %xmm0, 16(%rdx)
; SSE2-NEXT:    retq
;
; AVX1-LABEL: vec256_v2i32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX1-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vmovq %xmm0, (%rsi)
; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT:    vmovaps %ymm0, (%rdx)
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-ONLY-LABEL: vec256_v2i32:
; AVX2-ONLY:       # %bb.0:
; AVX2-ONLY-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX2-ONLY-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX2-ONLY-NEXT:    vpxor %xmm1, %xmm0, %xmm0
; AVX2-ONLY-NEXT:    vmovq %xmm0, (%rsi)
; AVX2-ONLY-NEXT:    vpbroadcastq %xmm0, %ymm0
; AVX2-ONLY-NEXT:    vmovdqa %ymm0, (%rdx)
; AVX2-ONLY-NEXT:    vzeroupper
; AVX2-ONLY-NEXT:    retq
;
; AVX512-LABEL: vec256_v2i32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX512-NEXT:    vpternlogq {{.*#+}} xmm0 = ~xmm0
; AVX512-NEXT:    vmovq %xmm0, (%rsi)
; AVX512-NEXT:    vpbroadcastq %xmm0, %ymm0
; AVX512-NEXT:    vmovdqa %ymm0, (%rdx)
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %in.subvec.not = load <2 x i32>, ptr %in.subvec.ptr, align 64
  %in.subvec = xor <2 x i32> %in.subvec.not, <i32 -1, i32 -1>
  store <2 x i32> %in.subvec, ptr %out.subvec.ptr, align 64
  %out.subvec0.ptr = getelementptr <2 x i32>, ptr %out.vec.ptr, i64 0
  store <2 x i32> %in.subvec, ptr %out.subvec0.ptr, align 64
  %out.subvec1.ptr = getelementptr <2 x i32>, ptr %out.vec.ptr, i64 1
  store <2 x i32> %in.subvec, ptr %out.subvec1.ptr, align 8
  %out.subvec2.ptr = getelementptr <2 x i32>, ptr %out.vec.ptr, i64 2
  store <2 x i32> %in.subvec, ptr %out.subvec2.ptr, align 16
  %out.subvec3.ptr = getelementptr <2 x i32>, ptr %out.vec.ptr, i64 3
  store <2 x i32> %in.subvec, ptr %out.subvec3.ptr, align 8
  ret void
}

define void @vec256_v2f32(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
; SCALAR-LABEL: vec256_v2f32:
; SCALAR:       # %bb.0:
; SCALAR-NEXT:    movl (%rdi), %eax
; SCALAR-NEXT:    movl 4(%rdi), %ecx
; SCALAR-NEXT:    notl %eax
; SCALAR-NEXT:    notl %ecx
; SCALAR-NEXT:    movl %ecx, 4(%rsi)
; SCALAR-NEXT:    movl %eax, (%rsi)
; SCALAR-NEXT:    movl %ecx, 4(%rdx)
; SCALAR-NEXT:    movl %eax, (%rdx)
; SCALAR-NEXT:    movl %ecx, 12(%rdx)
; SCALAR-NEXT:    movl %eax, 8(%rdx)
; SCALAR-NEXT:    movl %ecx, 20(%rdx)
; SCALAR-NEXT:    movl %eax, 16(%rdx)
; SCALAR-NEXT:    movl %ecx, 28(%rdx)
; SCALAR-NEXT:    movl %eax, 24(%rdx)
; SCALAR-NEXT:    retq
;
; SSE2-LABEL: vec256_v2f32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
; SSE2-NEXT:    pcmpeqd %xmm1, %xmm1
; SSE2-NEXT:    pxor %xmm0, %xmm1
; SSE2-NEXT:    movq %xmm1, (%rsi)
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,1,0,1]
; SSE2-NEXT:    movdqa %xmm0, (%rdx)
; SSE2-NEXT:    movdqa %xmm0, 16(%rdx)
; SSE2-NEXT:    retq
;
; AVX1-LABEL: vec256_v2f32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX1-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vmovq %xmm0, (%rsi)
; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT:    vmovaps %ymm0, (%rdx)
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-ONLY-LABEL: vec256_v2f32:
; AVX2-ONLY:       # %bb.0:
; AVX2-ONLY-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX2-ONLY-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX2-ONLY-NEXT:    vpxor %xmm1, %xmm0, %xmm0
; AVX2-ONLY-NEXT:    vmovq %xmm0, (%rsi)
; AVX2-ONLY-NEXT:    vpbroadcastq %xmm0, %ymm0
; AVX2-ONLY-NEXT:    vmovdqa %ymm0, (%rdx)
; AVX2-ONLY-NEXT:    vzeroupper
; AVX2-ONLY-NEXT:    retq
;
; AVX512-LABEL: vec256_v2f32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX512-NEXT:    vpternlogq {{.*#+}} xmm0 = ~xmm0
; AVX512-NEXT:    vmovq %xmm0, (%rsi)
; AVX512-NEXT:    vpbroadcastq %xmm0, %ymm0
; AVX512-NEXT:    vmovdqa %ymm0, (%rdx)
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %in.subvec.not = load <2 x i32>, ptr %in.subvec.ptr, align 64
  %in.subvec.int = xor <2 x i32> %in.subvec.not, <i32 -1, i32 -1>
  %in.subvec = bitcast <2 x i32> %in.subvec.int to <2 x float>
  store <2 x float> %in.subvec, ptr %out.subvec.ptr, align 64
  %out.subvec0.ptr = getelementptr <2 x float>, ptr %out.vec.ptr, i64 0
  store <2 x float> %in.subvec, ptr %out.subvec0.ptr, align 64
  %out.subvec1.ptr = getelementptr <2 x float>, ptr %out.vec.ptr, i64 1
  store <2 x float> %in.subvec, ptr %out.subvec1.ptr, align 8
  %out.subvec2.ptr = getelementptr <2 x float>, ptr %out.vec.ptr, i64 2
  store <2 x float> %in.subvec, ptr %out.subvec2.ptr, align 16
  %out.subvec3.ptr = getelementptr <2 x float>, ptr %out.vec.ptr, i64 3
  store <2 x float> %in.subvec, ptr %out.subvec3.ptr, align 8
  ret void
}

define void @vec256_v2i64(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
; SCALAR-LABEL: vec256_v2i64:
; SCALAR:       # %bb.0:
; SCALAR-NEXT:    movq (%rdi), %rax
; SCALAR-NEXT:    movq 8(%rdi), %rcx
; SCALAR-NEXT:    notq %rax
; SCALAR-NEXT:    notq %rcx
; SCALAR-NEXT:    movq %rcx, 8(%rsi)
; SCALAR-NEXT:    movq %rax, (%rsi)
; SCALAR-NEXT:    movq %rcx, 8(%rdx)
; SCALAR-NEXT:    movq %rax, (%rdx)
; SCALAR-NEXT:    movq %rcx, 24(%rdx)
; SCALAR-NEXT:    movq %rax, 16(%rdx)
; SCALAR-NEXT:    retq
;
; SSE2-LABEL: vec256_v2i64:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pcmpeqd %xmm0, %xmm0
; SSE2-NEXT:    pxor (%rdi), %xmm0
; SSE2-NEXT:    movdqa %xmm0, (%rsi)
; SSE2-NEXT:    movdqa %xmm0, (%rdx)
; SSE2-NEXT:    movdqa %xmm0, 16(%rdx)
; SSE2-NEXT:    retq
;
; AVX-LABEL: vec256_v2i64:
; AVX:       # %bb.0:
; AVX-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vpxor (%rdi), %xmm0, %xmm0
; AVX-NEXT:    vmovdqa %xmm0, (%rsi)
; AVX-NEXT:    vmovdqa %xmm0, (%rdx)
; AVX-NEXT:    vmovdqa %xmm0, 16(%rdx)
; AVX-NEXT:    retq
  %in.subvec.not = load <2 x i64>, ptr %in.subvec.ptr, align 64
  %in.subvec = xor <2 x i64> %in.subvec.not, <i64 -1, i64 -1>
  store <2 x i64> %in.subvec, ptr %out.subvec.ptr, align 64
  %out.subvec0.ptr = getelementptr <2 x i64>, ptr %out.vec.ptr, i64 0
  store <2 x i64> %in.subvec, ptr %out.subvec0.ptr, align 64
  %out.subvec1.ptr = getelementptr <2 x i64>, ptr %out.vec.ptr, i64 1
  store <2 x i64> %in.subvec, ptr %out.subvec1.ptr, align 16
  ret void
}

define void @vec256_v2f64(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
; SCALAR-LABEL: vec256_v2f64:
; SCALAR:       # %bb.0:
; SCALAR-NEXT:    movq (%rdi), %rax
; SCALAR-NEXT:    movq 8(%rdi), %rcx
; SCALAR-NEXT:    notq %rax
; SCALAR-NEXT:    notq %rcx
; SCALAR-NEXT:    movq %rcx, 8(%rsi)
; SCALAR-NEXT:    movq %rax, (%rsi)
; SCALAR-NEXT:    movq %rcx, 8(%rdx)
; SCALAR-NEXT:    movq %rax, (%rdx)
; SCALAR-NEXT:    movq %rcx, 24(%rdx)
; SCALAR-NEXT:    movq %rax, 16(%rdx)
; SCALAR-NEXT:    retq
;
; SSE2-LABEL: vec256_v2f64:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pcmpeqd %xmm0, %xmm0
; SSE2-NEXT:    pxor (%rdi), %xmm0
; SSE2-NEXT:    movdqa %xmm0, (%rsi)
; SSE2-NEXT:    movdqa %xmm0, (%rdx)
; SSE2-NEXT:    movdqa %xmm0, 16(%rdx)
; SSE2-NEXT:    retq
;
; AVX-LABEL: vec256_v2f64:
; AVX:       # %bb.0:
; AVX-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vpxor (%rdi), %xmm0, %xmm0
; AVX-NEXT:    vmovdqa %xmm0, (%rsi)
; AVX-NEXT:    vmovdqa %xmm0, (%rdx)
; AVX-NEXT:    vmovdqa %xmm0, 16(%rdx)
; AVX-NEXT:    retq
  %in.subvec.not = load <2 x i64>, ptr %in.subvec.ptr, align 64
  %in.subvec.int = xor <2 x i64> %in.subvec.not, <i64 -1, i64 -1>
  %in.subvec = bitcast <2 x i64> %in.subvec.int to <2 x double>
  store <2 x double> %in.subvec, ptr %out.subvec.ptr, align 64
  %out.subvec0.ptr = getelementptr <2 x double>, ptr %out.vec.ptr, i64 0
  store <2 x double> %in.subvec, ptr %out.subvec0.ptr, align 64
  %out.subvec1.ptr = getelementptr <2 x double>, ptr %out.vec.ptr, i64 1
  store <2 x double> %in.subvec, ptr %out.subvec1.ptr, align 16
  ret void
}

define void @vec256_v4i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
; SCALAR-LABEL: vec256_v4i8:
; SCALAR:       # %bb.0:
; SCALAR-NEXT:    movzbl 3(%rdi), %r8d
; SCALAR-NEXT:    movzbl 2(%rdi), %ecx
; SCALAR-NEXT:    movzbl (%rdi), %eax
; SCALAR-NEXT:    movzbl 1(%rdi), %edi
; SCALAR-NEXT:    notb %al
; SCALAR-NEXT:    notb %dil
; SCALAR-NEXT:    notb %cl
; SCALAR-NEXT:    notb %r8b
; SCALAR-NEXT:    movb %r8b, 3(%rsi)
; SCALAR-NEXT:    movb %cl, 2(%rsi)
; SCALAR-NEXT:    movb %dil, 1(%rsi)
; SCALAR-NEXT:    movb %al, (%rsi)
; SCALAR-NEXT:    movb %r8b, 3(%rdx)
; SCALAR-NEXT:    movb %cl, 2(%rdx)
; SCALAR-NEXT:    movb %dil, 1(%rdx)
; SCALAR-NEXT:    movb %al, (%rdx)
; SCALAR-NEXT:    movb %r8b, 7(%rdx)
; SCALAR-NEXT:    movb %cl, 6(%rdx)
; SCALAR-NEXT:    movb %dil, 5(%rdx)
; SCALAR-NEXT:    movb %al, 4(%rdx)
; SCALAR-NEXT:    movb %r8b, 11(%rdx)
; SCALAR-NEXT:    movb %cl, 10(%rdx)
; SCALAR-NEXT:    movb %dil, 9(%rdx)
; SCALAR-NEXT:    movb %al, 8(%rdx)
; SCALAR-NEXT:    movb %r8b, 15(%rdx)
; SCALAR-NEXT:    movb %cl, 14(%rdx)
; SCALAR-NEXT:    movb %dil, 13(%rdx)
; SCALAR-NEXT:    movb %al, 12(%rdx)
; SCALAR-NEXT:    movb %r8b, 19(%rdx)
; SCALAR-NEXT:    movb %cl, 18(%rdx)
; SCALAR-NEXT:    movb %dil, 17(%rdx)
; SCALAR-NEXT:    movb %al, 16(%rdx)
; SCALAR-NEXT:    movb %r8b, 23(%rdx)
; SCALAR-NEXT:    movb %cl, 22(%rdx)
; SCALAR-NEXT:    movb %dil, 21(%rdx)
; SCALAR-NEXT:    movb %al, 20(%rdx)
; SCALAR-NEXT:    movb %r8b, 27(%rdx)
; SCALAR-NEXT:    movb %cl, 26(%rdx)
; SCALAR-NEXT:    movb %dil, 25(%rdx)
; SCALAR-NEXT:    movb %al, 24(%rdx)
; SCALAR-NEXT:    movb %r8b, 31(%rdx)
; SCALAR-NEXT:    movb %cl, 30(%rdx)
; SCALAR-NEXT:    movb %dil, 29(%rdx)
; SCALAR-NEXT:    movb %al, 28(%rdx)
; SCALAR-NEXT:    retq
;
; SSE2-LABEL: vec256_v4i8:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pcmpeqd %xmm0, %xmm0
; SSE2-NEXT:    pxor (%rdi), %xmm0
; SSE2-NEXT:    movd %xmm0, (%rsi)
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; SSE2-NEXT:    movdqa %xmm0, (%rdx)
; SSE2-NEXT:    movdqa %xmm0, 16(%rdx)
; SSE2-NEXT:    retq
;
; AVX1-LABEL: vec256_v4i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX1-NEXT:    vpxor (%rdi), %xmm0, %xmm0
; AVX1-NEXT:    vmovd %xmm0, (%rsi)
; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; AVX1-NEXT:    vmovdqa %xmm0, 16(%rdx)
; AVX1-NEXT:    vmovdqa %xmm0, (%rdx)
; AVX1-NEXT:    retq
;
; AVX2-LABEL: vec256_v4i8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX2-NEXT:    vpxor (%rdi), %xmm0, %xmm0
; AVX2-NEXT:    vmovd %xmm0, (%rsi)
; AVX2-NEXT:    vpbroadcastd %xmm0, %ymm0
; AVX2-NEXT:    vmovdqa %ymm0, (%rdx)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
  %in.subvec.not = load <4 x i8>, ptr %in.subvec.ptr, align 64
  %in.subvec = xor <4 x i8> %in.subvec.not, <i8 -1, i8 -1, i8 -1, i8 -1>
  store <4 x i8> %in.subvec, ptr %out.subvec.ptr, align 64
  %out.subvec0.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 0
  store <4 x i8> %in.subvec, ptr %out.subvec0.ptr, align 64
  %out.subvec1.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 1
  store <4 x i8> %in.subvec, ptr %out.subvec1.ptr, align 4
  %out.subvec2.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 2
  store <4 x i8> %in.subvec, ptr %out.subvec2.ptr, align 8
  %out.subvec3.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 3
  store <4 x i8> %in.subvec, ptr %out.subvec3.ptr, align 4
  %out.subvec4.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 4
  store <4 x i8> %in.subvec, ptr %out.subvec4.ptr, align 16
  %out.subvec5.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 5
  store <4 x i8> %in.subvec, ptr %out.subvec5.ptr, align 4
  %out.subvec6.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 6
  store <4 x i8> %in.subvec, ptr %out.subvec6.ptr, align 8
  %out.subvec7.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 7
  store <4 x i8> %in.subvec, ptr %out.subvec7.ptr, align 4
  ret void
}

1288define void @vec256_v4i16(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
1289; SCALAR-LABEL: vec256_v4i16:
1290; SCALAR:       # %bb.0:
1291; SCALAR-NEXT:    movzwl 6(%rdi), %r8d
1292; SCALAR-NEXT:    movzwl 2(%rdi), %ecx
1293; SCALAR-NEXT:    movl (%rdi), %eax
1294; SCALAR-NEXT:    movl 4(%rdi), %edi
1295; SCALAR-NEXT:    notl %eax
1296; SCALAR-NEXT:    notl %ecx
1297; SCALAR-NEXT:    notl %edi
1298; SCALAR-NEXT:    notl %r8d
1299; SCALAR-NEXT:    movw %r8w, 6(%rsi)
1300; SCALAR-NEXT:    movw %di, 4(%rsi)
1301; SCALAR-NEXT:    movw %cx, 2(%rsi)
1302; SCALAR-NEXT:    movw %ax, (%rsi)
1303; SCALAR-NEXT:    movw %r8w, 6(%rdx)
1304; SCALAR-NEXT:    movw %di, 4(%rdx)
1305; SCALAR-NEXT:    movw %cx, 2(%rdx)
1306; SCALAR-NEXT:    movw %ax, (%rdx)
1307; SCALAR-NEXT:    movw %r8w, 14(%rdx)
1308; SCALAR-NEXT:    movw %di, 12(%rdx)
1309; SCALAR-NEXT:    movw %cx, 10(%rdx)
1310; SCALAR-NEXT:    movw %ax, 8(%rdx)
1311; SCALAR-NEXT:    movw %r8w, 22(%rdx)
1312; SCALAR-NEXT:    movw %di, 20(%rdx)
1313; SCALAR-NEXT:    movw %cx, 18(%rdx)
1314; SCALAR-NEXT:    movw %ax, 16(%rdx)
1315; SCALAR-NEXT:    movw %r8w, 30(%rdx)
1316; SCALAR-NEXT:    movw %di, 28(%rdx)
1317; SCALAR-NEXT:    movw %cx, 26(%rdx)
1318; SCALAR-NEXT:    movw %ax, 24(%rdx)
1319; SCALAR-NEXT:    retq
1320;
1321; SSE2-LABEL: vec256_v4i16:
1322; SSE2:       # %bb.0:
1323; SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
1324; SSE2-NEXT:    pcmpeqd %xmm1, %xmm1
1325; SSE2-NEXT:    pxor %xmm0, %xmm1
1326; SSE2-NEXT:    movq %xmm1, (%rsi)
1327; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,1,0,1]
1328; SSE2-NEXT:    movdqa %xmm0, (%rdx)
1329; SSE2-NEXT:    movdqa %xmm0, 16(%rdx)
1330; SSE2-NEXT:    retq
1331;
1332; AVX1-LABEL: vec256_v4i16:
1333; AVX1:       # %bb.0:
1334; AVX1-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
1335; AVX1-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
1336; AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
1337; AVX1-NEXT:    vmovq %xmm0, (%rsi)
1338; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
1339; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
1340; AVX1-NEXT:    vmovaps %ymm0, (%rdx)
1341; AVX1-NEXT:    vzeroupper
1342; AVX1-NEXT:    retq
1343;
1344; AVX2-ONLY-LABEL: vec256_v4i16:
1345; AVX2-ONLY:       # %bb.0:
1346; AVX2-ONLY-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
1347; AVX2-ONLY-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
1348; AVX2-ONLY-NEXT:    vpxor %xmm1, %xmm0, %xmm0
1349; AVX2-ONLY-NEXT:    vmovq %xmm0, (%rsi)
1350; AVX2-ONLY-NEXT:    vpbroadcastq %xmm0, %ymm0
1351; AVX2-ONLY-NEXT:    vmovdqa %ymm0, (%rdx)
1352; AVX2-ONLY-NEXT:    vzeroupper
1353; AVX2-ONLY-NEXT:    retq
1354;
1355; AVX512-LABEL: vec256_v4i16:
1356; AVX512:       # %bb.0:
1357; AVX512-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
1358; AVX512-NEXT:    vpternlogq {{.*#+}} xmm0 = ~xmm0
1359; AVX512-NEXT:    vmovq %xmm0, (%rsi)
1360; AVX512-NEXT:    vpbroadcastq %xmm0, %ymm0
1361; AVX512-NEXT:    vmovdqa %ymm0, (%rdx)
1362; AVX512-NEXT:    vzeroupper
1363; AVX512-NEXT:    retq
1364  %in.subvec.not = load <4 x i16>, ptr %in.subvec.ptr, align 64
1365  %in.subvec = xor <4 x i16> %in.subvec.not, <i16 -1, i16 -1, i16 -1, i16 -1>
1366  store <4 x i16> %in.subvec, ptr %out.subvec.ptr, align 64
1367  %out.subvec0.ptr = getelementptr <4 x i16>, ptr %out.vec.ptr, i64 0
1368  store <4 x i16> %in.subvec, ptr %out.subvec0.ptr, align 64
1369  %out.subvec1.ptr = getelementptr <4 x i16>, ptr %out.vec.ptr, i64 1
1370  store <4 x i16> %in.subvec, ptr %out.subvec1.ptr, align 8
1371  %out.subvec2.ptr = getelementptr <4 x i16>, ptr %out.vec.ptr, i64 2
1372  store <4 x i16> %in.subvec, ptr %out.subvec2.ptr, align 16
1373  %out.subvec3.ptr = getelementptr <4 x i16>, ptr %out.vec.ptr, i64 3
1374  store <4 x i16> %in.subvec, ptr %out.subvec3.ptr, align 8
1375  ret void
1376}
1377
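; Bitwise-NOT a <4 x i32> subvector, store it once, then splat it twice to fill the 256-bit destination.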
1378define void @vec256_v4i32(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
1379; SCALAR-LABEL: vec256_v4i32:
1380; SCALAR:       # %bb.0:
1381; SCALAR-NEXT:    movaps (%rdi), %xmm0
1382; SCALAR-NEXT:    xorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
1383; SCALAR-NEXT:    movaps %xmm0, (%rsi)
1384; SCALAR-NEXT:    movaps %xmm0, (%rdx)
1385; SCALAR-NEXT:    movaps %xmm0, 16(%rdx)
1386; SCALAR-NEXT:    retq
1387;
1388; SSE2-LABEL: vec256_v4i32:
1389; SSE2:       # %bb.0:
1390; SSE2-NEXT:    pcmpeqd %xmm0, %xmm0
1391; SSE2-NEXT:    pxor (%rdi), %xmm0
1392; SSE2-NEXT:    movdqa %xmm0, (%rsi)
1393; SSE2-NEXT:    movdqa %xmm0, (%rdx)
1394; SSE2-NEXT:    movdqa %xmm0, 16(%rdx)
1395; SSE2-NEXT:    retq
1396;
1397; AVX-LABEL: vec256_v4i32:
1398; AVX:       # %bb.0:
1399; AVX-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
1400; AVX-NEXT:    vpxor (%rdi), %xmm0, %xmm0
1401; AVX-NEXT:    vmovdqa %xmm0, (%rsi)
1402; AVX-NEXT:    vmovdqa %xmm0, (%rdx)
1403; AVX-NEXT:    vmovdqa %xmm0, 16(%rdx)
1404; AVX-NEXT:    retq
1405  %in.subvec.not = load <4 x i32>, ptr %in.subvec.ptr, align 64
1406  %in.subvec = xor <4 x i32> %in.subvec.not, <i32 -1, i32 -1, i32 -1, i32 -1>
1407  store <4 x i32> %in.subvec, ptr %out.subvec.ptr, align 64
1408  %out.subvec0.ptr = getelementptr <4 x i32>, ptr %out.vec.ptr, i64 0
1409  store <4 x i32> %in.subvec, ptr %out.subvec0.ptr, align 64
1410  %out.subvec1.ptr = getelementptr <4 x i32>, ptr %out.vec.ptr, i64 1
1411  store <4 x i32> %in.subvec, ptr %out.subvec1.ptr, align 16
1412  ret void
1413}
1414
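; Same as vec256_v4i32, but the inverted <4 x i32> value is bitcast to <4 x float> before the stores.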
1415define void @vec256_v4f32(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
1416; SCALAR-LABEL: vec256_v4f32:
1417; SCALAR:       # %bb.0:
1418; SCALAR-NEXT:    movaps (%rdi), %xmm0
1419; SCALAR-NEXT:    xorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
1420; SCALAR-NEXT:    movaps %xmm0, (%rsi)
1421; SCALAR-NEXT:    movaps %xmm0, (%rdx)
1422; SCALAR-NEXT:    movaps %xmm0, 16(%rdx)
1423; SCALAR-NEXT:    retq
1424;
1425; SSE2-LABEL: vec256_v4f32:
1426; SSE2:       # %bb.0:
1427; SSE2-NEXT:    pcmpeqd %xmm0, %xmm0
1428; SSE2-NEXT:    pxor (%rdi), %xmm0
1429; SSE2-NEXT:    movdqa %xmm0, (%rsi)
1430; SSE2-NEXT:    movdqa %xmm0, (%rdx)
1431; SSE2-NEXT:    movdqa %xmm0, 16(%rdx)
1432; SSE2-NEXT:    retq
1433;
1434; AVX-LABEL: vec256_v4f32:
1435; AVX:       # %bb.0:
1436; AVX-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
1437; AVX-NEXT:    vpxor (%rdi), %xmm0, %xmm0
1438; AVX-NEXT:    vmovdqa %xmm0, (%rsi)
1439; AVX-NEXT:    vmovdqa %xmm0, (%rdx)
1440; AVX-NEXT:    vmovdqa %xmm0, 16(%rdx)
1441; AVX-NEXT:    retq
1442  %in.subvec.not = load <4 x i32>, ptr %in.subvec.ptr, align 64
1443  %in.subvec.int = xor <4 x i32> %in.subvec.not, <i32 -1, i32 -1, i32 -1, i32 -1>
1444  %in.subvec = bitcast <4 x i32> %in.subvec.int to <4 x float>
1445  store <4 x float> %in.subvec, ptr %out.subvec.ptr, align 64
1446  %out.subvec0.ptr = getelementptr <4 x float>, ptr %out.vec.ptr, i64 0
1447  store <4 x float> %in.subvec, ptr %out.subvec0.ptr, align 64
1448  %out.subvec1.ptr = getelementptr <4 x float>, ptr %out.vec.ptr, i64 1
1449  store <4 x float> %in.subvec, ptr %out.subvec1.ptr, align 16
1450  ret void
1451}
1452
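; Bitwise-NOT an <8 x i8> subvector, store it once, then splat it 4 times to fill the 256-bit destination.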
1453define void @vec256_v8i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
1454; SCALAR-LABEL: vec256_v8i8:
1455; SCALAR:       # %bb.0:
1456; SCALAR-NEXT:    pushq %rbx
1457; SCALAR-NEXT:    movzbl 7(%rdi), %ebx
1458; SCALAR-NEXT:    movzbl 6(%rdi), %r11d
1459; SCALAR-NEXT:    movzbl 5(%rdi), %r10d
1460; SCALAR-NEXT:    movzbl 4(%rdi), %r9d
1461; SCALAR-NEXT:    movzbl 3(%rdi), %r8d
1462; SCALAR-NEXT:    movzbl 2(%rdi), %ecx
1463; SCALAR-NEXT:    movzbl (%rdi), %eax
1464; SCALAR-NEXT:    movzbl 1(%rdi), %edi
1465; SCALAR-NEXT:    notb %al
1466; SCALAR-NEXT:    notb %dil
1467; SCALAR-NEXT:    notb %cl
1468; SCALAR-NEXT:    notb %r8b
1469; SCALAR-NEXT:    notb %r9b
1470; SCALAR-NEXT:    notb %r10b
1471; SCALAR-NEXT:    notb %r11b
1472; SCALAR-NEXT:    notb %bl
1473; SCALAR-NEXT:    movb %bl, 7(%rsi)
1474; SCALAR-NEXT:    movb %r11b, 6(%rsi)
1475; SCALAR-NEXT:    movb %r10b, 5(%rsi)
1476; SCALAR-NEXT:    movb %r9b, 4(%rsi)
1477; SCALAR-NEXT:    movb %r8b, 3(%rsi)
1478; SCALAR-NEXT:    movb %cl, 2(%rsi)
1479; SCALAR-NEXT:    movb %dil, 1(%rsi)
1480; SCALAR-NEXT:    movb %al, (%rsi)
1481; SCALAR-NEXT:    movb %bl, 7(%rdx)
1482; SCALAR-NEXT:    movb %r11b, 6(%rdx)
1483; SCALAR-NEXT:    movb %r10b, 5(%rdx)
1484; SCALAR-NEXT:    movb %r9b, 4(%rdx)
1485; SCALAR-NEXT:    movb %r8b, 3(%rdx)
1486; SCALAR-NEXT:    movb %cl, 2(%rdx)
1487; SCALAR-NEXT:    movb %dil, 1(%rdx)
1488; SCALAR-NEXT:    movb %al, (%rdx)
1489; SCALAR-NEXT:    movb %bl, 15(%rdx)
1490; SCALAR-NEXT:    movb %r11b, 14(%rdx)
1491; SCALAR-NEXT:    movb %r10b, 13(%rdx)
1492; SCALAR-NEXT:    movb %r9b, 12(%rdx)
1493; SCALAR-NEXT:    movb %r8b, 11(%rdx)
1494; SCALAR-NEXT:    movb %cl, 10(%rdx)
1495; SCALAR-NEXT:    movb %dil, 9(%rdx)
1496; SCALAR-NEXT:    movb %al, 8(%rdx)
1497; SCALAR-NEXT:    movb %bl, 23(%rdx)
1498; SCALAR-NEXT:    movb %r11b, 22(%rdx)
1499; SCALAR-NEXT:    movb %r10b, 21(%rdx)
1500; SCALAR-NEXT:    movb %r9b, 20(%rdx)
1501; SCALAR-NEXT:    movb %r8b, 19(%rdx)
1502; SCALAR-NEXT:    movb %cl, 18(%rdx)
1503; SCALAR-NEXT:    movb %dil, 17(%rdx)
1504; SCALAR-NEXT:    movb %al, 16(%rdx)
1505; SCALAR-NEXT:    movb %bl, 31(%rdx)
1506; SCALAR-NEXT:    movb %r11b, 30(%rdx)
1507; SCALAR-NEXT:    movb %r10b, 29(%rdx)
1508; SCALAR-NEXT:    movb %r9b, 28(%rdx)
1509; SCALAR-NEXT:    movb %r8b, 27(%rdx)
1510; SCALAR-NEXT:    movb %cl, 26(%rdx)
1511; SCALAR-NEXT:    movb %dil, 25(%rdx)
1512; SCALAR-NEXT:    movb %al, 24(%rdx)
1513; SCALAR-NEXT:    popq %rbx
1514; SCALAR-NEXT:    retq
1515;
1516; SSE2-LABEL: vec256_v8i8:
1517; SSE2:       # %bb.0:
1518; SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
1519; SSE2-NEXT:    pcmpeqd %xmm1, %xmm1
1520; SSE2-NEXT:    pxor %xmm0, %xmm1
1521; SSE2-NEXT:    movq %xmm1, (%rsi)
1522; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,1,0,1]
1523; SSE2-NEXT:    movdqa %xmm0, (%rdx)
1524; SSE2-NEXT:    movdqa %xmm0, 16(%rdx)
1525; SSE2-NEXT:    retq
1526;
1527; AVX1-LABEL: vec256_v8i8:
1528; AVX1:       # %bb.0:
1529; AVX1-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
1530; AVX1-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
1531; AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
1532; AVX1-NEXT:    vmovq %xmm0, (%rsi)
1533; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
1534; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
1535; AVX1-NEXT:    vmovaps %ymm0, (%rdx)
1536; AVX1-NEXT:    vzeroupper
1537; AVX1-NEXT:    retq
1538;
1539; AVX2-ONLY-LABEL: vec256_v8i8:
1540; AVX2-ONLY:       # %bb.0:
1541; AVX2-ONLY-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
1542; AVX2-ONLY-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
1543; AVX2-ONLY-NEXT:    vpxor %xmm1, %xmm0, %xmm0
1544; AVX2-ONLY-NEXT:    vmovq %xmm0, (%rsi)
1545; AVX2-ONLY-NEXT:    vpbroadcastq %xmm0, %ymm0
1546; AVX2-ONLY-NEXT:    vmovdqa %ymm0, (%rdx)
1547; AVX2-ONLY-NEXT:    vzeroupper
1548; AVX2-ONLY-NEXT:    retq
1549;
1550; AVX512-LABEL: vec256_v8i8:
1551; AVX512:       # %bb.0:
1552; AVX512-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
1553; AVX512-NEXT:    vpternlogq {{.*#+}} xmm0 = ~xmm0
1554; AVX512-NEXT:    vmovq %xmm0, (%rsi)
1555; AVX512-NEXT:    vpbroadcastq %xmm0, %ymm0
1556; AVX512-NEXT:    vmovdqa %ymm0, (%rdx)
1557; AVX512-NEXT:    vzeroupper
1558; AVX512-NEXT:    retq
1559  %in.subvec.not = load <8 x i8>, ptr %in.subvec.ptr, align 64
1560  %in.subvec = xor <8 x i8> %in.subvec.not, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
1561  store <8 x i8> %in.subvec, ptr %out.subvec.ptr, align 64
1562  %out.subvec0.ptr = getelementptr <8 x i8>, ptr %out.vec.ptr, i64 0
1563  store <8 x i8> %in.subvec, ptr %out.subvec0.ptr, align 64
1564  %out.subvec1.ptr = getelementptr <8 x i8>, ptr %out.vec.ptr, i64 1
1565  store <8 x i8> %in.subvec, ptr %out.subvec1.ptr, align 8
1566  %out.subvec2.ptr = getelementptr <8 x i8>, ptr %out.vec.ptr, i64 2
1567  store <8 x i8> %in.subvec, ptr %out.subvec2.ptr, align 16
1568  %out.subvec3.ptr = getelementptr <8 x i8>, ptr %out.vec.ptr, i64 3
1569  store <8 x i8> %in.subvec, ptr %out.subvec3.ptr, align 8
1570  ret void
1571}
1572
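; Bitwise-NOT an <8 x i16> subvector, store it once, then splat it twice to fill the 256-bit destination.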
1573define void @vec256_v8i16(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
1574; SCALAR-LABEL: vec256_v8i16:
1575; SCALAR:       # %bb.0:
1576; SCALAR-NEXT:    pushq %rbx
1577; SCALAR-NEXT:    movzwl 14(%rdi), %ebx
1578; SCALAR-NEXT:    movl 12(%rdi), %r11d
1579; SCALAR-NEXT:    movzwl 10(%rdi), %r10d
1580; SCALAR-NEXT:    movl 8(%rdi), %r9d
1581; SCALAR-NEXT:    movzwl 6(%rdi), %r8d
1582; SCALAR-NEXT:    movzwl 2(%rdi), %ecx
1583; SCALAR-NEXT:    movl (%rdi), %eax
1584; SCALAR-NEXT:    movl 4(%rdi), %edi
1585; SCALAR-NEXT:    notl %eax
1586; SCALAR-NEXT:    notl %ecx
1587; SCALAR-NEXT:    notl %edi
1588; SCALAR-NEXT:    notl %r8d
1589; SCALAR-NEXT:    notl %r9d
1590; SCALAR-NEXT:    notl %r10d
1591; SCALAR-NEXT:    notl %r11d
1592; SCALAR-NEXT:    notl %ebx
1593; SCALAR-NEXT:    movw %bx, 14(%rsi)
1594; SCALAR-NEXT:    movw %r11w, 12(%rsi)
1595; SCALAR-NEXT:    movw %r10w, 10(%rsi)
1596; SCALAR-NEXT:    movw %r9w, 8(%rsi)
1597; SCALAR-NEXT:    movw %r8w, 6(%rsi)
1598; SCALAR-NEXT:    movw %di, 4(%rsi)
1599; SCALAR-NEXT:    movw %cx, 2(%rsi)
1600; SCALAR-NEXT:    movw %ax, (%rsi)
1601; SCALAR-NEXT:    movw %bx, 14(%rdx)
1602; SCALAR-NEXT:    movw %r11w, 12(%rdx)
1603; SCALAR-NEXT:    movw %r10w, 10(%rdx)
1604; SCALAR-NEXT:    movw %r9w, 8(%rdx)
1605; SCALAR-NEXT:    movw %r8w, 6(%rdx)
1606; SCALAR-NEXT:    movw %di, 4(%rdx)
1607; SCALAR-NEXT:    movw %cx, 2(%rdx)
1608; SCALAR-NEXT:    movw %ax, (%rdx)
1609; SCALAR-NEXT:    movw %bx, 30(%rdx)
1610; SCALAR-NEXT:    movw %r11w, 28(%rdx)
1611; SCALAR-NEXT:    movw %r10w, 26(%rdx)
1612; SCALAR-NEXT:    movw %r9w, 24(%rdx)
1613; SCALAR-NEXT:    movw %r8w, 22(%rdx)
1614; SCALAR-NEXT:    movw %di, 20(%rdx)
1615; SCALAR-NEXT:    movw %cx, 18(%rdx)
1616; SCALAR-NEXT:    movw %ax, 16(%rdx)
1617; SCALAR-NEXT:    popq %rbx
1618; SCALAR-NEXT:    retq
1619;
1620; SSE2-LABEL: vec256_v8i16:
1621; SSE2:       # %bb.0:
1622; SSE2-NEXT:    pcmpeqd %xmm0, %xmm0
1623; SSE2-NEXT:    pxor (%rdi), %xmm0
1624; SSE2-NEXT:    movdqa %xmm0, (%rsi)
1625; SSE2-NEXT:    movdqa %xmm0, (%rdx)
1626; SSE2-NEXT:    movdqa %xmm0, 16(%rdx)
1627; SSE2-NEXT:    retq
1628;
1629; AVX-LABEL: vec256_v8i16:
1630; AVX:       # %bb.0:
1631; AVX-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
1632; AVX-NEXT:    vpxor (%rdi), %xmm0, %xmm0
1633; AVX-NEXT:    vmovdqa %xmm0, (%rsi)
1634; AVX-NEXT:    vmovdqa %xmm0, (%rdx)
1635; AVX-NEXT:    vmovdqa %xmm0, 16(%rdx)
1636; AVX-NEXT:    retq
1637  %in.subvec.not = load <8 x i16>, ptr %in.subvec.ptr, align 64
1638  %in.subvec = xor <8 x i16> %in.subvec.not, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
1639  store <8 x i16> %in.subvec, ptr %out.subvec.ptr, align 64
1640  %out.subvec0.ptr = getelementptr <8 x i16>, ptr %out.vec.ptr, i64 0
1641  store <8 x i16> %in.subvec, ptr %out.subvec0.ptr, align 64
1642  %out.subvec1.ptr = getelementptr <8 x i16>, ptr %out.vec.ptr, i64 1
1643  store <8 x i16> %in.subvec, ptr %out.subvec1.ptr, align 16
1644  ret void
1645}
1646
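; Bitwise-NOT a <16 x i8> subvector, store it once, then splat it twice to fill the 256-bit destination.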
1647define void @vec256_v16i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
1648; SCALAR-LABEL: vec256_v16i8:
1649; SCALAR:       # %bb.0:
1650; SCALAR-NEXT:    pushq %rbp
1651; SCALAR-NEXT:    pushq %r15
1652; SCALAR-NEXT:    pushq %r14
1653; SCALAR-NEXT:    pushq %r13
1654; SCALAR-NEXT:    pushq %r12
1655; SCALAR-NEXT:    pushq %rbx
1656; SCALAR-NEXT:    movzbl 15(%rdi), %eax
1657; SCALAR-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
1658; SCALAR-NEXT:    movzbl 14(%rdi), %eax
1659; SCALAR-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
1660; SCALAR-NEXT:    movzbl 13(%rdi), %eax
1661; SCALAR-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
1662; SCALAR-NEXT:    movzbl 12(%rdi), %r15d
1663; SCALAR-NEXT:    movzbl 11(%rdi), %eax
1664; SCALAR-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
1665; SCALAR-NEXT:    movzbl 10(%rdi), %ebp
1666; SCALAR-NEXT:    movzbl 9(%rdi), %r14d
1667; SCALAR-NEXT:    movzbl 8(%rdi), %eax
1668; SCALAR-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
1669; SCALAR-NEXT:    movzbl 7(%rdi), %r12d
1670; SCALAR-NEXT:    movzbl 6(%rdi), %r10d
1671; SCALAR-NEXT:    movzbl 5(%rdi), %r9d
1672; SCALAR-NEXT:    movzbl 4(%rdi), %ebx
1673; SCALAR-NEXT:    movzbl 3(%rdi), %r8d
1674; SCALAR-NEXT:    movzbl 2(%rdi), %ecx
1675; SCALAR-NEXT:    movzbl (%rdi), %eax
1676; SCALAR-NEXT:    movzbl 1(%rdi), %r13d
1677; SCALAR-NEXT:    notb %al
1678; SCALAR-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
1679; SCALAR-NEXT:    notb %r13b
1680; SCALAR-NEXT:    notb %cl
1681; SCALAR-NEXT:    movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
1682; SCALAR-NEXT:    notb %r8b
1683; SCALAR-NEXT:    movb %r8b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
1684; SCALAR-NEXT:    notb %bl
1685; SCALAR-NEXT:    notb %r9b
1686; SCALAR-NEXT:    movb %r9b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
1687; SCALAR-NEXT:    notb %r10b
1688; SCALAR-NEXT:    movb %r10b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
1689; SCALAR-NEXT:    notb %r12b
1690; SCALAR-NEXT:    movb %r12b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
1691; SCALAR-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r11d # 1-byte Folded Reload
1692; SCALAR-NEXT:    notb %r11b
1693; SCALAR-NEXT:    movl %r14d, %r10d
1694; SCALAR-NEXT:    notb %r10b
1695; SCALAR-NEXT:    notb %bpl
1696; SCALAR-NEXT:    movl %ebp, %r14d
1697; SCALAR-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r8d # 1-byte Folded Reload
1698; SCALAR-NEXT:    notb %r8b
1699; SCALAR-NEXT:    movb %r8b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
1700; SCALAR-NEXT:    movl %r15d, %edi
1701; SCALAR-NEXT:    notb %dil
1702; SCALAR-NEXT:    movb %dil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
1703; SCALAR-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r9d # 1-byte Folded Reload
1704; SCALAR-NEXT:    notb %r9b
1705; SCALAR-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebp # 1-byte Folded Reload
1706; SCALAR-NEXT:    notb %bpl
1707; SCALAR-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r15d # 1-byte Folded Reload
1708; SCALAR-NEXT:    notb %r15b
1709; SCALAR-NEXT:    movb %r15b, 15(%rsi)
1710; SCALAR-NEXT:    movb %bpl, 14(%rsi)
1711; SCALAR-NEXT:    movb %bpl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
1712; SCALAR-NEXT:    movl %r9d, %eax
1713; SCALAR-NEXT:    movb %r9b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
1714; SCALAR-NEXT:    movb %r9b, 13(%rsi)
1715; SCALAR-NEXT:    movb %dil, 12(%rsi)
1716; SCALAR-NEXT:    movb %r8b, 11(%rsi)
1717; SCALAR-NEXT:    movb %r14b, 10(%rsi)
1718; SCALAR-NEXT:    movb %r10b, 9(%rsi)
1719; SCALAR-NEXT:    movl %r10d, %r8d
1720; SCALAR-NEXT:    movb %r11b, 8(%rsi)
1721; SCALAR-NEXT:    movl %r11d, %r9d
1722; SCALAR-NEXT:    movb %r11b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
1723; SCALAR-NEXT:    movb %r12b, 7(%rsi)
1724; SCALAR-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
1725; SCALAR-NEXT:    movb %cl, 6(%rsi)
1726; SCALAR-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 1-byte Folded Reload
1727; SCALAR-NEXT:    movb %dil, 5(%rsi)
1728; SCALAR-NEXT:    movb %bl, 4(%rsi)
1729; SCALAR-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
1730; SCALAR-NEXT:    movb %cl, 3(%rsi)
1731; SCALAR-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
1732; SCALAR-NEXT:    movb %cl, 2(%rsi)
1733; SCALAR-NEXT:    movb %r13b, 1(%rsi)
1734; SCALAR-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r10d # 1-byte Folded Reload
1735; SCALAR-NEXT:    movb %r10b, (%rsi)
1736; SCALAR-NEXT:    movb %r15b, 15(%rdx)
1737; SCALAR-NEXT:    movl %r15d, %r11d
1738; SCALAR-NEXT:    movb %bpl, 14(%rdx)
1739; SCALAR-NEXT:    movb %al, 13(%rdx)
1740; SCALAR-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r12d # 1-byte Folded Reload
1741; SCALAR-NEXT:    movb %r12b, 12(%rdx)
1742; SCALAR-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r15d # 1-byte Folded Reload
1743; SCALAR-NEXT:    movb %r15b, 11(%rdx)
1744; SCALAR-NEXT:    movb %r14b, 10(%rdx)
1745; SCALAR-NEXT:    movb %r8b, 9(%rdx)
1746; SCALAR-NEXT:    movb %r9b, 8(%rdx)
1747; SCALAR-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r9d # 1-byte Folded Reload
1748; SCALAR-NEXT:    movb %r9b, 7(%rdx)
1749; SCALAR-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
1750; SCALAR-NEXT:    movb %al, 6(%rdx)
1751; SCALAR-NEXT:    movb %dil, 5(%rdx)
1752; SCALAR-NEXT:    movb %bl, 4(%rdx)
1753; SCALAR-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload
1754; SCALAR-NEXT:    movb %sil, 3(%rdx)
1755; SCALAR-NEXT:    movb %cl, 2(%rdx)
1756; SCALAR-NEXT:    movb %r13b, 1(%rdx)
1757; SCALAR-NEXT:    movl %r10d, %edi
1758; SCALAR-NEXT:    movb %r10b, (%rdx)
1759; SCALAR-NEXT:    movb %r11b, 31(%rdx)
1760; SCALAR-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r10d # 1-byte Folded Reload
1761; SCALAR-NEXT:    movb %r10b, 30(%rdx)
1762; SCALAR-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r10d # 1-byte Folded Reload
1763; SCALAR-NEXT:    movb %r10b, 29(%rdx)
1764; SCALAR-NEXT:    movb %r12b, 28(%rdx)
1765; SCALAR-NEXT:    movb %r15b, 27(%rdx)
1766; SCALAR-NEXT:    movb %r14b, 26(%rdx)
1767; SCALAR-NEXT:    movb %r8b, 25(%rdx)
1768; SCALAR-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r10d # 1-byte Folded Reload
1769; SCALAR-NEXT:    movb %r10b, 24(%rdx)
1770; SCALAR-NEXT:    movb %r9b, 23(%rdx)
1771; SCALAR-NEXT:    movb %al, 22(%rdx)
1772; SCALAR-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
1773; SCALAR-NEXT:    movb %al, 21(%rdx)
1774; SCALAR-NEXT:    movb %bl, 20(%rdx)
1775; SCALAR-NEXT:    movb %sil, 19(%rdx)
1776; SCALAR-NEXT:    movb %cl, 18(%rdx)
1777; SCALAR-NEXT:    movb %r13b, 17(%rdx)
1778; SCALAR-NEXT:    movb %dil, 16(%rdx)
1779; SCALAR-NEXT:    popq %rbx
1780; SCALAR-NEXT:    popq %r12
1781; SCALAR-NEXT:    popq %r13
1782; SCALAR-NEXT:    popq %r14
1783; SCALAR-NEXT:    popq %r15
1784; SCALAR-NEXT:    popq %rbp
1785; SCALAR-NEXT:    retq
1786;
1787; SSE2-LABEL: vec256_v16i8:
1788; SSE2:       # %bb.0:
1789; SSE2-NEXT:    pcmpeqd %xmm0, %xmm0
1790; SSE2-NEXT:    pxor (%rdi), %xmm0
1791; SSE2-NEXT:    movdqa %xmm0, (%rsi)
1792; SSE2-NEXT:    movdqa %xmm0, (%rdx)
1793; SSE2-NEXT:    movdqa %xmm0, 16(%rdx)
1794; SSE2-NEXT:    retq
1795;
1796; AVX-LABEL: vec256_v16i8:
1797; AVX:       # %bb.0:
1798; AVX-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
1799; AVX-NEXT:    vpxor (%rdi), %xmm0, %xmm0
1800; AVX-NEXT:    vmovdqa %xmm0, (%rsi)
1801; AVX-NEXT:    vmovdqa %xmm0, (%rdx)
1802; AVX-NEXT:    vmovdqa %xmm0, 16(%rdx)
1803; AVX-NEXT:    retq
1804  %in.subvec.not = load <16 x i8>, ptr %in.subvec.ptr, align 64
1805  %in.subvec = xor <16 x i8> %in.subvec.not, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
1806  store <16 x i8> %in.subvec, ptr %out.subvec.ptr, align 64
1807  %out.subvec0.ptr = getelementptr <16 x i8>, ptr %out.vec.ptr, i64 0
1808  store <16 x i8> %in.subvec, ptr %out.subvec0.ptr, align 64
1809  %out.subvec1.ptr = getelementptr <16 x i8>, ptr %out.vec.ptr, i64 1
1810  store <16 x i8> %in.subvec, ptr %out.subvec1.ptr, align 16
1811  ret void
1812}
1813
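; Bitwise-NOT a <2 x i8> subvector, store it once, then splat it 24 times to fill the 384-bit destination.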
1814define void @vec384_v2i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
1815; SCALAR-LABEL: vec384_v2i8:
1816; SCALAR:       # %bb.0:
1817; SCALAR-NEXT:    movzbl (%rdi), %eax
1818; SCALAR-NEXT:    movzbl 1(%rdi), %ecx
1819; SCALAR-NEXT:    notb %al
1820; SCALAR-NEXT:    notb %cl
1821; SCALAR-NEXT:    movb %cl, 1(%rsi)
1822; SCALAR-NEXT:    movb %al, (%rsi)
1823; SCALAR-NEXT:    movb %cl, 1(%rdx)
1824; SCALAR-NEXT:    movb %al, (%rdx)
1825; SCALAR-NEXT:    movb %cl, 3(%rdx)
1826; SCALAR-NEXT:    movb %al, 2(%rdx)
1827; SCALAR-NEXT:    movb %cl, 5(%rdx)
1828; SCALAR-NEXT:    movb %al, 4(%rdx)
1829; SCALAR-NEXT:    movb %cl, 7(%rdx)
1830; SCALAR-NEXT:    movb %al, 6(%rdx)
1831; SCALAR-NEXT:    movb %cl, 9(%rdx)
1832; SCALAR-NEXT:    movb %al, 8(%rdx)
1833; SCALAR-NEXT:    movb %cl, 11(%rdx)
1834; SCALAR-NEXT:    movb %al, 10(%rdx)
1835; SCALAR-NEXT:    movb %cl, 13(%rdx)
1836; SCALAR-NEXT:    movb %al, 12(%rdx)
1837; SCALAR-NEXT:    movb %cl, 15(%rdx)
1838; SCALAR-NEXT:    movb %al, 14(%rdx)
1839; SCALAR-NEXT:    movb %cl, 17(%rdx)
1840; SCALAR-NEXT:    movb %al, 16(%rdx)
1841; SCALAR-NEXT:    movb %cl, 19(%rdx)
1842; SCALAR-NEXT:    movb %al, 18(%rdx)
1843; SCALAR-NEXT:    movb %cl, 21(%rdx)
1844; SCALAR-NEXT:    movb %al, 20(%rdx)
1845; SCALAR-NEXT:    movb %cl, 23(%rdx)
1846; SCALAR-NEXT:    movb %al, 22(%rdx)
1847; SCALAR-NEXT:    movb %cl, 25(%rdx)
1848; SCALAR-NEXT:    movb %al, 24(%rdx)
1849; SCALAR-NEXT:    movb %cl, 27(%rdx)
1850; SCALAR-NEXT:    movb %al, 26(%rdx)
1851; SCALAR-NEXT:    movb %cl, 29(%rdx)
1852; SCALAR-NEXT:    movb %al, 28(%rdx)
1853; SCALAR-NEXT:    movb %cl, 31(%rdx)
1854; SCALAR-NEXT:    movb %al, 30(%rdx)
1855; SCALAR-NEXT:    movb %cl, 33(%rdx)
1856; SCALAR-NEXT:    movb %al, 32(%rdx)
1857; SCALAR-NEXT:    movb %cl, 35(%rdx)
1858; SCALAR-NEXT:    movb %al, 34(%rdx)
1859; SCALAR-NEXT:    movb %cl, 37(%rdx)
1860; SCALAR-NEXT:    movb %al, 36(%rdx)
1861; SCALAR-NEXT:    movb %cl, 39(%rdx)
1862; SCALAR-NEXT:    movb %al, 38(%rdx)
1863; SCALAR-NEXT:    movb %cl, 41(%rdx)
1864; SCALAR-NEXT:    movb %al, 40(%rdx)
1865; SCALAR-NEXT:    movb %cl, 43(%rdx)
1866; SCALAR-NEXT:    movb %al, 42(%rdx)
1867; SCALAR-NEXT:    movb %cl, 45(%rdx)
1868; SCALAR-NEXT:    movb %al, 44(%rdx)
1869; SCALAR-NEXT:    movb %cl, 47(%rdx)
1870; SCALAR-NEXT:    movb %al, 46(%rdx)
1871; SCALAR-NEXT:    retq
1872;
1873; SSE2-ONLY-LABEL: vec384_v2i8:
1874; SSE2-ONLY:       # %bb.0:
1875; SSE2-ONLY-NEXT:    pcmpeqd %xmm0, %xmm0
1876; SSE2-ONLY-NEXT:    pxor (%rdi), %xmm0
1877; SSE2-ONLY-NEXT:    movd %xmm0, %eax
1878; SSE2-ONLY-NEXT:    movw %ax, (%rsi)
1879; SSE2-ONLY-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
1880; SSE2-ONLY-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
1881; SSE2-ONLY-NEXT:    movdqa %xmm0, (%rdx)
1882; SSE2-ONLY-NEXT:    movdqa %xmm0, 16(%rdx)
1883; SSE2-ONLY-NEXT:    movdqa %xmm0, 32(%rdx)
1884; SSE2-ONLY-NEXT:    retq
1885;
1886; SSE3-LABEL: vec384_v2i8:
1887; SSE3:       # %bb.0:
1888; SSE3-NEXT:    pcmpeqd %xmm0, %xmm0
1889; SSE3-NEXT:    pxor (%rdi), %xmm0
1890; SSE3-NEXT:    movd %xmm0, %eax
1891; SSE3-NEXT:    movw %ax, (%rsi)
1892; SSE3-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
1893; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
1894; SSE3-NEXT:    movdqa %xmm0, (%rdx)
1895; SSE3-NEXT:    movdqa %xmm0, 16(%rdx)
1896; SSE3-NEXT:    movdqa %xmm0, 32(%rdx)
1897; SSE3-NEXT:    retq
1898;
1899; SSSE3-ONLY-LABEL: vec384_v2i8:
1900; SSSE3-ONLY:       # %bb.0:
1901; SSSE3-ONLY-NEXT:    pcmpeqd %xmm0, %xmm0
1902; SSSE3-ONLY-NEXT:    pxor (%rdi), %xmm0
1903; SSSE3-ONLY-NEXT:    movd %xmm0, %eax
1904; SSSE3-ONLY-NEXT:    movw %ax, (%rsi)
1905; SSSE3-ONLY-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
1906; SSSE3-ONLY-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
1907; SSSE3-ONLY-NEXT:    movdqa %xmm0, (%rdx)
1908; SSSE3-ONLY-NEXT:    movdqa %xmm0, 16(%rdx)
1909; SSSE3-ONLY-NEXT:    movdqa %xmm0, 32(%rdx)
1910; SSSE3-ONLY-NEXT:    retq
1911;
1912; SSE41-LABEL: vec384_v2i8:
1913; SSE41:       # %bb.0:
1914; SSE41-NEXT:    pcmpeqd %xmm0, %xmm0
1915; SSE41-NEXT:    pxor (%rdi), %xmm0
1916; SSE41-NEXT:    pextrw $0, %xmm0, (%rsi)
1917; SSE41-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
1918; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
1919; SSE41-NEXT:    movdqa %xmm0, (%rdx)
1920; SSE41-NEXT:    movdqa %xmm0, 16(%rdx)
1921; SSE41-NEXT:    movdqa %xmm0, 32(%rdx)
1922; SSE41-NEXT:    retq
1923;
1924; SSE42-LABEL: vec384_v2i8:
1925; SSE42:       # %bb.0:
1926; SSE42-NEXT:    pcmpeqd %xmm0, %xmm0
1927; SSE42-NEXT:    pxor (%rdi), %xmm0
1928; SSE42-NEXT:    pextrw $0, %xmm0, (%rsi)
1929; SSE42-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
1930; SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
1931; SSE42-NEXT:    movdqa %xmm0, (%rdx)
1932; SSE42-NEXT:    movdqa %xmm0, 16(%rdx)
1933; SSE42-NEXT:    movdqa %xmm0, 32(%rdx)
1934; SSE42-NEXT:    retq
1935;
1936; AVX1-LABEL: vec384_v2i8:
1937; AVX1:       # %bb.0:
1938; AVX1-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
1939; AVX1-NEXT:    vpxor (%rdi), %xmm0, %xmm0
1940; AVX1-NEXT:    vpextrw $0, %xmm0, (%rsi)
1941; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
1942; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
1943; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm1
1944; AVX1-NEXT:    vmovaps %ymm1, (%rdx)
1945; AVX1-NEXT:    vmovdqa %xmm0, 32(%rdx)
1946; AVX1-NEXT:    vzeroupper
1947; AVX1-NEXT:    retq
1948;
1949; AVX2-LABEL: vec384_v2i8:
1950; AVX2:       # %bb.0:
1951; AVX2-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
1952; AVX2-NEXT:    vpxor (%rdi), %xmm0, %xmm0
1953; AVX2-NEXT:    vpextrw $0, %xmm0, (%rsi)
1954; AVX2-NEXT:    vpbroadcastw %xmm0, %ymm0
1955; AVX2-NEXT:    vmovdqa %ymm0, (%rdx)
1956; AVX2-NEXT:    vmovdqa %xmm0, 32(%rdx)
1957; AVX2-NEXT:    vzeroupper
1958; AVX2-NEXT:    retq
1959  %in.subvec.not = load <2 x i8>, ptr %in.subvec.ptr, align 64
1960  %in.subvec = xor <2 x i8> %in.subvec.not, <i8 -1, i8 -1>
1961  store <2 x i8> %in.subvec, ptr %out.subvec.ptr, align 64
1962  %out.subvec0.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 0
1963  store <2 x i8> %in.subvec, ptr %out.subvec0.ptr, align 64
1964  %out.subvec1.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 1
1965  store <2 x i8> %in.subvec, ptr %out.subvec1.ptr, align 2
1966  %out.subvec2.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 2
1967  store <2 x i8> %in.subvec, ptr %out.subvec2.ptr, align 4
1968  %out.subvec3.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 3
1969  store <2 x i8> %in.subvec, ptr %out.subvec3.ptr, align 2
1970  %out.subvec4.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 4
1971  store <2 x i8> %in.subvec, ptr %out.subvec4.ptr, align 8
1972  %out.subvec5.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 5
1973  store <2 x i8> %in.subvec, ptr %out.subvec5.ptr, align 2
1974  %out.subvec6.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 6
1975  store <2 x i8> %in.subvec, ptr %out.subvec6.ptr, align 4
1976  %out.subvec7.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 7
1977  store <2 x i8> %in.subvec, ptr %out.subvec7.ptr, align 2
1978  %out.subvec8.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 8
1979  store <2 x i8> %in.subvec, ptr %out.subvec8.ptr, align 16
1980  %out.subvec9.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 9
1981  store <2 x i8> %in.subvec, ptr %out.subvec9.ptr, align 2
1982  %out.subvec10.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 10
1983  store <2 x i8> %in.subvec, ptr %out.subvec10.ptr, align 4
1984  %out.subvec11.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 11
1985  store <2 x i8> %in.subvec, ptr %out.subvec11.ptr, align 2
1986  %out.subvec12.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 12
1987  store <2 x i8> %in.subvec, ptr %out.subvec12.ptr, align 8
1988  %out.subvec13.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 13
1989  store <2 x i8> %in.subvec, ptr %out.subvec13.ptr, align 2
1990  %out.subvec14.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 14
1991  store <2 x i8> %in.subvec, ptr %out.subvec14.ptr, align 4
1992  %out.subvec15.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 15
1993  store <2 x i8> %in.subvec, ptr %out.subvec15.ptr, align 2
1994  %out.subvec16.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 16
1995  store <2 x i8> %in.subvec, ptr %out.subvec16.ptr, align 32
1996  %out.subvec17.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 17
1997  store <2 x i8> %in.subvec, ptr %out.subvec17.ptr, align 2
1998  %out.subvec18.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 18
1999  store <2 x i8> %in.subvec, ptr %out.subvec18.ptr, align 4
2000  %out.subvec19.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 19
2001  store <2 x i8> %in.subvec, ptr %out.subvec19.ptr, align 2
2002  %out.subvec20.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 20
2003  store <2 x i8> %in.subvec, ptr %out.subvec20.ptr, align 8
2004  %out.subvec21.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 21
2005  store <2 x i8> %in.subvec, ptr %out.subvec21.ptr, align 2
2006  %out.subvec22.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 22
2007  store <2 x i8> %in.subvec, ptr %out.subvec22.ptr, align 4
2008  %out.subvec23.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 23
2009  store <2 x i8> %in.subvec, ptr %out.subvec23.ptr, align 2
2010  ret void
2011}
2012
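; Bitwise-NOT a <2 x i16> subvector, store it once, then splat it 12 times to fill the 384-bit destination.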
2013define void @vec384_v2i16(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
2014; SCALAR-LABEL: vec384_v2i16:
2015; SCALAR:       # %bb.0:
2016; SCALAR-NEXT:    movzwl 2(%rdi), %ecx
2017; SCALAR-NEXT:    movl (%rdi), %eax
2018; SCALAR-NEXT:    notl %eax
2019; SCALAR-NEXT:    notl %ecx
2020; SCALAR-NEXT:    movw %cx, 2(%rsi)
2021; SCALAR-NEXT:    movw %ax, (%rsi)
2022; SCALAR-NEXT:    movw %cx, 2(%rdx)
2023; SCALAR-NEXT:    movw %ax, (%rdx)
2024; SCALAR-NEXT:    movw %cx, 6(%rdx)
2025; SCALAR-NEXT:    movw %ax, 4(%rdx)
2026; SCALAR-NEXT:    movw %cx, 10(%rdx)
2027; SCALAR-NEXT:    movw %ax, 8(%rdx)
2028; SCALAR-NEXT:    movw %cx, 14(%rdx)
2029; SCALAR-NEXT:    movw %ax, 12(%rdx)
2030; SCALAR-NEXT:    movw %cx, 18(%rdx)
2031; SCALAR-NEXT:    movw %ax, 16(%rdx)
2032; SCALAR-NEXT:    movw %cx, 22(%rdx)
2033; SCALAR-NEXT:    movw %ax, 20(%rdx)
2034; SCALAR-NEXT:    movw %cx, 26(%rdx)
2035; SCALAR-NEXT:    movw %ax, 24(%rdx)
2036; SCALAR-NEXT:    movw %cx, 30(%rdx)
2037; SCALAR-NEXT:    movw %ax, 28(%rdx)
2038; SCALAR-NEXT:    movw %cx, 34(%rdx)
2039; SCALAR-NEXT:    movw %ax, 32(%rdx)
2040; SCALAR-NEXT:    movw %cx, 38(%rdx)
2041; SCALAR-NEXT:    movw %ax, 36(%rdx)
2042; SCALAR-NEXT:    movw %cx, 42(%rdx)
2043; SCALAR-NEXT:    movw %ax, 40(%rdx)
2044; SCALAR-NEXT:    movw %cx, 46(%rdx)
2045; SCALAR-NEXT:    movw %ax, 44(%rdx)
2046; SCALAR-NEXT:    retq
2047;
2048; SSE2-LABEL: vec384_v2i16:
2049; SSE2:       # %bb.0:
2050; SSE2-NEXT:    pcmpeqd %xmm0, %xmm0
2051; SSE2-NEXT:    pxor (%rdi), %xmm0
2052; SSE2-NEXT:    movd %xmm0, (%rsi)
2053; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
2054; SSE2-NEXT:    movdqa %xmm0, (%rdx)
2055; SSE2-NEXT:    movdqa %xmm0, 16(%rdx)
2056; SSE2-NEXT:    movdqa %xmm0, 32(%rdx)
2057; SSE2-NEXT:    retq
2058;
2059; AVX1-LABEL: vec384_v2i16:
2060; AVX1:       # %bb.0:
2061; AVX1-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
2062; AVX1-NEXT:    vpxor (%rdi), %xmm0, %xmm0
2063; AVX1-NEXT:    vmovd %xmm0, (%rsi)
2064; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
2065; AVX1-NEXT:    vmovdqa %xmm0, 16(%rdx)
2066; AVX1-NEXT:    vmovdqa %xmm0, (%rdx)
2067; AVX1-NEXT:    vmovdqa %xmm0, 32(%rdx)
2068; AVX1-NEXT:    retq
2069;
2070; AVX2-LABEL: vec384_v2i16:
2071; AVX2:       # %bb.0:
2072; AVX2-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
2073; AVX2-NEXT:    vpxor (%rdi), %xmm0, %xmm0
2074; AVX2-NEXT:    vmovd %xmm0, (%rsi)
2075; AVX2-NEXT:    vpbroadcastd %xmm0, %ymm0
2076; AVX2-NEXT:    vmovdqa %ymm0, (%rdx)
2077; AVX2-NEXT:    vmovdqa %xmm0, 32(%rdx)
2078; AVX2-NEXT:    vzeroupper
2079; AVX2-NEXT:    retq
2080  %in.subvec.not = load <2 x i16>, ptr %in.subvec.ptr, align 64
2081  %in.subvec = xor <2 x i16> %in.subvec.not, <i16 -1, i16 -1>
2082  store <2 x i16> %in.subvec, ptr %out.subvec.ptr, align 64
2083  %out.subvec0.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 0
2084  store <2 x i16> %in.subvec, ptr %out.subvec0.ptr, align 64
2085  %out.subvec1.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 1
2086  store <2 x i16> %in.subvec, ptr %out.subvec1.ptr, align 4
2087  %out.subvec2.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 2
2088  store <2 x i16> %in.subvec, ptr %out.subvec2.ptr, align 8
2089  %out.subvec3.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 3
2090  store <2 x i16> %in.subvec, ptr %out.subvec3.ptr, align 4
2091  %out.subvec4.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 4
2092  store <2 x i16> %in.subvec, ptr %out.subvec4.ptr, align 16
2093  %out.subvec5.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 5
2094  store <2 x i16> %in.subvec, ptr %out.subvec5.ptr, align 4
2095  %out.subvec6.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 6
2096  store <2 x i16> %in.subvec, ptr %out.subvec6.ptr, align 8
2097  %out.subvec7.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 7
2098  store <2 x i16> %in.subvec, ptr %out.subvec7.ptr, align 4
2099  %out.subvec8.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 8
2100  store <2 x i16> %in.subvec, ptr %out.subvec8.ptr, align 32
2101  %out.subvec9.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 9
2102  store <2 x i16> %in.subvec, ptr %out.subvec9.ptr, align 4
2103  %out.subvec10.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 10
2104  store <2 x i16> %in.subvec, ptr %out.subvec10.ptr, align 8
2105  %out.subvec11.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 11
2106  store <2 x i16> %in.subvec, ptr %out.subvec11.ptr, align 4
2107  ret void
2108}
2109
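; Bitwise-NOT a <2 x i32> subvector, store it once, then splat it 6 times to fill the 384-bit destination.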
2110define void @vec384_v2i32(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
2111; SCALAR-LABEL: vec384_v2i32:
2112; SCALAR:       # %bb.0:
2113; SCALAR-NEXT:    movl (%rdi), %eax
2114; SCALAR-NEXT:    movl 4(%rdi), %ecx
2115; SCALAR-NEXT:    notl %eax
2116; SCALAR-NEXT:    notl %ecx
2117; SCALAR-NEXT:    movl %ecx, 4(%rsi)
2118; SCALAR-NEXT:    movl %eax, (%rsi)
2119; SCALAR-NEXT:    movl %ecx, 4(%rdx)
2120; SCALAR-NEXT:    movl %eax, (%rdx)
2121; SCALAR-NEXT:    movl %ecx, 12(%rdx)
2122; SCALAR-NEXT:    movl %eax, 8(%rdx)
2123; SCALAR-NEXT:    movl %ecx, 20(%rdx)
2124; SCALAR-NEXT:    movl %eax, 16(%rdx)
2125; SCALAR-NEXT:    movl %ecx, 28(%rdx)
2126; SCALAR-NEXT:    movl %eax, 24(%rdx)
2127; SCALAR-NEXT:    movl %ecx, 36(%rdx)
2128; SCALAR-NEXT:    movl %eax, 32(%rdx)
2129; SCALAR-NEXT:    movl %ecx, 44(%rdx)
2130; SCALAR-NEXT:    movl %eax, 40(%rdx)
2131; SCALAR-NEXT:    retq
2132;
2133; SSE2-LABEL: vec384_v2i32:
2134; SSE2:       # %bb.0:
2135; SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
2136; SSE2-NEXT:    pcmpeqd %xmm1, %xmm1
2137; SSE2-NEXT:    pxor %xmm0, %xmm1
2138; SSE2-NEXT:    movq %xmm1, (%rsi)
2139; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,1,0,1]
2140; SSE2-NEXT:    movdqa %xmm0, (%rdx)
2141; SSE2-NEXT:    movdqa %xmm0, 16(%rdx)
2142; SSE2-NEXT:    movdqa %xmm0, 32(%rdx)
2143; SSE2-NEXT:    retq
2144;
2145; AVX1-LABEL: vec384_v2i32:
2146; AVX1:       # %bb.0:
2147; AVX1-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
2148; AVX1-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
2149; AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
2150; AVX1-NEXT:    vmovq %xmm0, (%rsi)
2151; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
2152; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm1
2153; AVX1-NEXT:    vmovaps %ymm1, (%rdx)
2154; AVX1-NEXT:    vmovdqa %xmm0, 32(%rdx)
2155; AVX1-NEXT:    vzeroupper
2156; AVX1-NEXT:    retq
2157;
2158; AVX2-ONLY-LABEL: vec384_v2i32:
2159; AVX2-ONLY:       # %bb.0:
2160; AVX2-ONLY-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
2161; AVX2-ONLY-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
2162; AVX2-ONLY-NEXT:    vpxor %xmm1, %xmm0, %xmm0
2163; AVX2-ONLY-NEXT:    vmovq %xmm0, (%rsi)
2164; AVX2-ONLY-NEXT:    vpbroadcastq %xmm0, %ymm0
2165; AVX2-ONLY-NEXT:    vmovdqa %ymm0, (%rdx)
2166; AVX2-ONLY-NEXT:    vmovdqa %xmm0, 32(%rdx)
2167; AVX2-ONLY-NEXT:    vzeroupper
2168; AVX2-ONLY-NEXT:    retq
2169;
2170; AVX512-LABEL: vec384_v2i32:
2171; AVX512:       # %bb.0:
2172; AVX512-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
2173; AVX512-NEXT:    vpternlogq {{.*#+}} xmm0 = ~xmm0
2174; AVX512-NEXT:    vmovq %xmm0, (%rsi)
2175; AVX512-NEXT:    vpbroadcastq %xmm0, %ymm0
2176; AVX512-NEXT:    vmovdqa %ymm0, (%rdx)
2177; AVX512-NEXT:    vmovdqa %xmm0, 32(%rdx)
2178; AVX512-NEXT:    vzeroupper
2179; AVX512-NEXT:    retq
2180  %in.subvec.not = load <2 x i32>, ptr %in.subvec.ptr, align 64
2181  %in.subvec = xor <2 x i32> %in.subvec.not, <i32 -1, i32 -1>
2182  store <2 x i32> %in.subvec, ptr %out.subvec.ptr, align 64
2183  %out.subvec0.ptr = getelementptr <2 x i32>, ptr %out.vec.ptr, i64 0
2184  store <2 x i32> %in.subvec, ptr %out.subvec0.ptr, align 64
2185  %out.subvec1.ptr = getelementptr <2 x i32>, ptr %out.vec.ptr, i64 1
2186  store <2 x i32> %in.subvec, ptr %out.subvec1.ptr, align 8
2187  %out.subvec2.ptr = getelementptr <2 x i32>, ptr %out.vec.ptr, i64 2
2188  store <2 x i32> %in.subvec, ptr %out.subvec2.ptr, align 16
2189  %out.subvec3.ptr = getelementptr <2 x i32>, ptr %out.vec.ptr, i64 3
2190  store <2 x i32> %in.subvec, ptr %out.subvec3.ptr, align 8
2191  %out.subvec4.ptr = getelementptr <2 x i32>, ptr %out.vec.ptr, i64 4
2192  store <2 x i32> %in.subvec, ptr %out.subvec4.ptr, align 32
2193  %out.subvec5.ptr = getelementptr <2 x i32>, ptr %out.vec.ptr, i64 5
2194  store <2 x i32> %in.subvec, ptr %out.subvec5.ptr, align 8
2195  ret void
2196}
2197
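; Same as vec384_v2i32, but the inverted <2 x i32> value is bitcast to <2 x float> before the stores.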
2198define void @vec384_v2f32(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
2199; SCALAR-LABEL: vec384_v2f32:
2200; SCALAR:       # %bb.0:
2201; SCALAR-NEXT:    movl (%rdi), %eax
2202; SCALAR-NEXT:    movl 4(%rdi), %ecx
2203; SCALAR-NEXT:    notl %eax
2204; SCALAR-NEXT:    notl %ecx
2205; SCALAR-NEXT:    movl %ecx, 4(%rsi)
2206; SCALAR-NEXT:    movl %eax, (%rsi)
2207; SCALAR-NEXT:    movl %ecx, 4(%rdx)
2208; SCALAR-NEXT:    movl %eax, (%rdx)
2209; SCALAR-NEXT:    movl %ecx, 12(%rdx)
2210; SCALAR-NEXT:    movl %eax, 8(%rdx)
2211; SCALAR-NEXT:    movl %ecx, 20(%rdx)
2212; SCALAR-NEXT:    movl %eax, 16(%rdx)
2213; SCALAR-NEXT:    movl %ecx, 28(%rdx)
2214; SCALAR-NEXT:    movl %eax, 24(%rdx)
2215; SCALAR-NEXT:    movl %ecx, 36(%rdx)
2216; SCALAR-NEXT:    movl %eax, 32(%rdx)
2217; SCALAR-NEXT:    movl %ecx, 44(%rdx)
2218; SCALAR-NEXT:    movl %eax, 40(%rdx)
2219; SCALAR-NEXT:    retq
2220;
2221; SSE2-LABEL: vec384_v2f32:
2222; SSE2:       # %bb.0:
2223; SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
2224; SSE2-NEXT:    pcmpeqd %xmm1, %xmm1
2225; SSE2-NEXT:    pxor %xmm0, %xmm1
2226; SSE2-NEXT:    movq %xmm1, (%rsi)
2227; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,1,0,1]
2228; SSE2-NEXT:    movdqa %xmm0, (%rdx)
2229; SSE2-NEXT:    movdqa %xmm0, 16(%rdx)
2230; SSE2-NEXT:    movdqa %xmm0, 32(%rdx)
2231; SSE2-NEXT:    retq
2232;
2233; AVX1-LABEL: vec384_v2f32:
2234; AVX1:       # %bb.0:
2235; AVX1-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
2236; AVX1-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
2237; AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
2238; AVX1-NEXT:    vmovq %xmm0, (%rsi)
2239; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
2240; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm1
2241; AVX1-NEXT:    vmovaps %ymm1, (%rdx)
2242; AVX1-NEXT:    vmovdqa %xmm0, 32(%rdx)
2243; AVX1-NEXT:    vzeroupper
2244; AVX1-NEXT:    retq
2245;
2246; AVX2-ONLY-LABEL: vec384_v2f32:
2247; AVX2-ONLY:       # %bb.0:
2248; AVX2-ONLY-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
2249; AVX2-ONLY-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
2250; AVX2-ONLY-NEXT:    vpxor %xmm1, %xmm0, %xmm0
2251; AVX2-ONLY-NEXT:    vmovq %xmm0, (%rsi)
2252; AVX2-ONLY-NEXT:    vpbroadcastq %xmm0, %ymm0
2253; AVX2-ONLY-NEXT:    vmovdqa %ymm0, (%rdx)
2254; AVX2-ONLY-NEXT:    vmovdqa %xmm0, 32(%rdx)
2255; AVX2-ONLY-NEXT:    vzeroupper
2256; AVX2-ONLY-NEXT:    retq
2257;
2258; AVX512-LABEL: vec384_v2f32:
2259; AVX512:       # %bb.0:
2260; AVX512-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
2261; AVX512-NEXT:    vpternlogq {{.*#+}} xmm0 = ~xmm0
2262; AVX512-NEXT:    vmovq %xmm0, (%rsi)
2263; AVX512-NEXT:    vpbroadcastq %xmm0, %ymm0
2264; AVX512-NEXT:    vmovdqa %ymm0, (%rdx)
2265; AVX512-NEXT:    vmovdqa %xmm0, 32(%rdx)
2266; AVX512-NEXT:    vzeroupper
2267; AVX512-NEXT:    retq
2268  %in.subvec.not = load <2 x i32>, ptr %in.subvec.ptr, align 64
2269  %in.subvec.int = xor <2 x i32> %in.subvec.not, <i32 -1, i32 -1>
2270  %in.subvec = bitcast <2 x i32> %in.subvec.int to <2 x float>
2271  store <2 x float> %in.subvec, ptr %out.subvec.ptr, align 64
2272  %out.subvec0.ptr = getelementptr <2 x float>, ptr %out.vec.ptr, i64 0
2273  store <2 x float> %in.subvec, ptr %out.subvec0.ptr, align 64
2274  %out.subvec1.ptr = getelementptr <2 x float>, ptr %out.vec.ptr, i64 1
2275  store <2 x float> %in.subvec, ptr %out.subvec1.ptr, align 8
2276  %out.subvec2.ptr = getelementptr <2 x float>, ptr %out.vec.ptr, i64 2
2277  store <2 x float> %in.subvec, ptr %out.subvec2.ptr, align 16
2278  %out.subvec3.ptr = getelementptr <2 x float>, ptr %out.vec.ptr, i64 3
2279  store <2 x float> %in.subvec, ptr %out.subvec3.ptr, align 8
2280  %out.subvec4.ptr = getelementptr <2 x float>, ptr %out.vec.ptr, i64 4
2281  store <2 x float> %in.subvec, ptr %out.subvec4.ptr, align 32
2282  %out.subvec5.ptr = getelementptr <2 x float>, ptr %out.vec.ptr, i64 5
2283  store <2 x float> %in.subvec, ptr %out.subvec5.ptr, align 8
2284  ret void
2285}
2286
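; Bitwise-NOT a <2 x i64> subvector, store it once, then splat it 3 times to fill the 384-bit destination.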
2287define void @vec384_v2i64(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
2288; SCALAR-LABEL: vec384_v2i64:
2289; SCALAR:       # %bb.0:
2290; SCALAR-NEXT:    movq (%rdi), %rax
2291; SCALAR-NEXT:    movq 8(%rdi), %rcx
2292; SCALAR-NEXT:    notq %rax
2293; SCALAR-NEXT:    notq %rcx
2294; SCALAR-NEXT:    movq %rcx, 8(%rsi)
2295; SCALAR-NEXT:    movq %rax, (%rsi)
2296; SCALAR-NEXT:    movq %rcx, 8(%rdx)
2297; SCALAR-NEXT:    movq %rax, (%rdx)
2298; SCALAR-NEXT:    movq %rcx, 24(%rdx)
2299; SCALAR-NEXT:    movq %rax, 16(%rdx)
2300; SCALAR-NEXT:    movq %rcx, 40(%rdx)
2301; SCALAR-NEXT:    movq %rax, 32(%rdx)
2302; SCALAR-NEXT:    retq
2303;
2304; SSE2-LABEL: vec384_v2i64:
2305; SSE2:       # %bb.0:
2306; SSE2-NEXT:    pcmpeqd %xmm0, %xmm0
2307; SSE2-NEXT:    pxor (%rdi), %xmm0
2308; SSE2-NEXT:    movdqa %xmm0, (%rsi)
2309; SSE2-NEXT:    movdqa %xmm0, (%rdx)
2310; SSE2-NEXT:    movdqa %xmm0, 16(%rdx)
2311; SSE2-NEXT:    movdqa %xmm0, 32(%rdx)
2312; SSE2-NEXT:    retq
2313;
2314; AVX-LABEL: vec384_v2i64:
2315; AVX:       # %bb.0:
2316; AVX-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
2317; AVX-NEXT:    vpxor (%rdi), %xmm0, %xmm0
2318; AVX-NEXT:    vmovdqa %xmm0, (%rsi)
2319; AVX-NEXT:    vmovdqa %xmm0, (%rdx)
2320; AVX-NEXT:    vmovdqa %xmm0, 16(%rdx)
2321; AVX-NEXT:    vmovdqa %xmm0, 32(%rdx)
2322; AVX-NEXT:    retq
2323  %in.subvec.not = load <2 x i64>, ptr %in.subvec.ptr, align 64
2324  %in.subvec = xor <2 x i64> %in.subvec.not, <i64 -1, i64 -1>
2325  store <2 x i64> %in.subvec, ptr %out.subvec.ptr, align 64
2326  %out.subvec0.ptr = getelementptr <2 x i64>, ptr %out.vec.ptr, i64 0
2327  store <2 x i64> %in.subvec, ptr %out.subvec0.ptr, align 64
2328  %out.subvec1.ptr = getelementptr <2 x i64>, ptr %out.vec.ptr, i64 1
2329  store <2 x i64> %in.subvec, ptr %out.subvec1.ptr, align 16
2330  %out.subvec2.ptr = getelementptr <2 x i64>, ptr %out.vec.ptr, i64 2
2331  store <2 x i64> %in.subvec, ptr %out.subvec2.ptr, align 32
2332  ret void
2333}
2334
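; Same as vec384_v2i64, but the inverted <2 x i64> value is bitcast to <2 x double> before the stores.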
2335define void @vec384_v2f64(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
2336; SCALAR-LABEL: vec384_v2f64:
2337; SCALAR:       # %bb.0:
2338; SCALAR-NEXT:    movq (%rdi), %rax
2339; SCALAR-NEXT:    movq 8(%rdi), %rcx
2340; SCALAR-NEXT:    notq %rax
2341; SCALAR-NEXT:    notq %rcx
2342; SCALAR-NEXT:    movq %rcx, 8(%rsi)
2343; SCALAR-NEXT:    movq %rax, (%rsi)
2344; SCALAR-NEXT:    movq %rcx, 8(%rdx)
2345; SCALAR-NEXT:    movq %rax, (%rdx)
2346; SCALAR-NEXT:    movq %rcx, 24(%rdx)
2347; SCALAR-NEXT:    movq %rax, 16(%rdx)
2348; SCALAR-NEXT:    movq %rcx, 40(%rdx)
2349; SCALAR-NEXT:    movq %rax, 32(%rdx)
2350; SCALAR-NEXT:    retq
2351;
2352; SSE2-LABEL: vec384_v2f64:
2353; SSE2:       # %bb.0:
2354; SSE2-NEXT:    pcmpeqd %xmm0, %xmm0
2355; SSE2-NEXT:    pxor (%rdi), %xmm0
2356; SSE2-NEXT:    movdqa %xmm0, (%rsi)
2357; SSE2-NEXT:    movdqa %xmm0, (%rdx)
2358; SSE2-NEXT:    movdqa %xmm0, 16(%rdx)
2359; SSE2-NEXT:    movdqa %xmm0, 32(%rdx)
2360; SSE2-NEXT:    retq
2361;
2362; AVX-LABEL: vec384_v2f64:
2363; AVX:       # %bb.0:
2364; AVX-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
2365; AVX-NEXT:    vpxor (%rdi), %xmm0, %xmm0
2366; AVX-NEXT:    vmovdqa %xmm0, (%rsi)
2367; AVX-NEXT:    vmovdqa %xmm0, (%rdx)
2368; AVX-NEXT:    vmovdqa %xmm0, 16(%rdx)
2369; AVX-NEXT:    vmovdqa %xmm0, 32(%rdx)
2370; AVX-NEXT:    retq
2371  %in.subvec.not = load <2 x i64>, ptr %in.subvec.ptr, align 64
2372  %in.subvec.int = xor <2 x i64> %in.subvec.not, <i64 -1, i64 -1>
2373  %in.subvec = bitcast <2 x i64> %in.subvec.int to <2 x double>
2374  store <2 x double> %in.subvec, ptr %out.subvec.ptr, align 64
2375  %out.subvec0.ptr = getelementptr <2 x double>, ptr %out.vec.ptr, i64 0
2376  store <2 x double> %in.subvec, ptr %out.subvec0.ptr, align 64
2377  %out.subvec1.ptr = getelementptr <2 x double>, ptr %out.vec.ptr, i64 1
2378  store <2 x double> %in.subvec, ptr %out.subvec1.ptr, align 16
2379  %out.subvec2.ptr = getelementptr <2 x double>, ptr %out.vec.ptr, i64 2
2380  store <2 x double> %in.subvec, ptr %out.subvec2.ptr, align 32
2381  ret void
2382}
2383
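; Bitwise-NOT a <3 x i8> subvector, store it once, then store repeated copies at a 4-byte stride into the destination vector.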
2384define void @vec384_v3i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
2385; SCALAR-LABEL: vec384_v3i8:
2386; SCALAR:       # %bb.0:
2387; SCALAR-NEXT:    movl (%rdi), %eax
2388; SCALAR-NEXT:    movl %eax, %ecx
2389; SCALAR-NEXT:    shrl $16, %ecx
2390; SCALAR-NEXT:    notb %cl
2391; SCALAR-NEXT:    notl %eax
2392; SCALAR-NEXT:    movw %ax, (%rsi)
2393; SCALAR-NEXT:    movb %cl, 2(%rsi)
2394; SCALAR-NEXT:    movb %cl, 2(%rdx)
2395; SCALAR-NEXT:    movw %ax, (%rdx)
2396; SCALAR-NEXT:    movb %cl, 6(%rdx)
2397; SCALAR-NEXT:    movw %ax, 4(%rdx)
2398; SCALAR-NEXT:    movb %cl, 10(%rdx)
2399; SCALAR-NEXT:    movw %ax, 8(%rdx)
2400; SCALAR-NEXT:    movb %cl, 14(%rdx)
2401; SCALAR-NEXT:    movw %ax, 12(%rdx)
2402; SCALAR-NEXT:    movb %cl, 18(%rdx)
2403; SCALAR-NEXT:    movw %ax, 16(%rdx)
2404; SCALAR-NEXT:    movb %cl, 22(%rdx)
2405; SCALAR-NEXT:    movw %ax, 20(%rdx)
2406; SCALAR-NEXT:    movb %cl, 26(%rdx)
2407; SCALAR-NEXT:    movw %ax, 24(%rdx)
2408; SCALAR-NEXT:    movb %cl, 30(%rdx)
2409; SCALAR-NEXT:    movw %ax, 28(%rdx)
2410; SCALAR-NEXT:    movb %cl, 34(%rdx)
2411; SCALAR-NEXT:    movw %ax, 32(%rdx)
2412; SCALAR-NEXT:    movb %cl, 38(%rdx)
2413; SCALAR-NEXT:    movw %ax, 36(%rdx)
2414; SCALAR-NEXT:    movb %cl, 42(%rdx)
2415; SCALAR-NEXT:    movw %ax, 40(%rdx)
2416; SCALAR-NEXT:    movb %cl, 46(%rdx)
2417; SCALAR-NEXT:    movw %ax, 44(%rdx)
2418; SCALAR-NEXT:    movb %cl, 50(%rdx)
2419; SCALAR-NEXT:    movw %ax, 48(%rdx)
2420; SCALAR-NEXT:    movb %cl, 54(%rdx)
2421; SCALAR-NEXT:    movw %ax, 52(%rdx)
2422; SCALAR-NEXT:    movb %cl, 58(%rdx)
2423; SCALAR-NEXT:    movw %ax, 56(%rdx)
2424; SCALAR-NEXT:    movb %cl, 62(%rdx)
2425; SCALAR-NEXT:    movw %ax, 60(%rdx)
2426; SCALAR-NEXT:    retq
2427;
2428; SSE2-ONLY-LABEL: vec384_v3i8:
2429; SSE2-ONLY:       # %bb.0:
2430; SSE2-ONLY-NEXT:    movl (%rdi), %eax
2431; SSE2-ONLY-NEXT:    notl %eax
2432; SSE2-ONLY-NEXT:    movw %ax, (%rsi)
2433; SSE2-ONLY-NEXT:    movl %eax, %ecx
2434; SSE2-ONLY-NEXT:    shrl $16, %ecx
2435; SSE2-ONLY-NEXT:    movb %cl, 2(%rsi)
2436; SSE2-ONLY-NEXT:    movb %cl, 2(%rdx)
2437; SSE2-ONLY-NEXT:    movw %ax, (%rdx)
2438; SSE2-ONLY-NEXT:    movb %cl, 6(%rdx)
2439; SSE2-ONLY-NEXT:    movw %ax, 4(%rdx)
2440; SSE2-ONLY-NEXT:    movb %cl, 10(%rdx)
2441; SSE2-ONLY-NEXT:    movw %ax, 8(%rdx)
2442; SSE2-ONLY-NEXT:    movb %cl, 14(%rdx)
2443; SSE2-ONLY-NEXT:    movw %ax, 12(%rdx)
2444; SSE2-ONLY-NEXT:    movb %cl, 18(%rdx)
2445; SSE2-ONLY-NEXT:    movw %ax, 16(%rdx)
2446; SSE2-ONLY-NEXT:    movb %cl, 22(%rdx)
2447; SSE2-ONLY-NEXT:    movw %ax, 20(%rdx)
2448; SSE2-ONLY-NEXT:    movb %cl, 26(%rdx)
2449; SSE2-ONLY-NEXT:    movw %ax, 24(%rdx)
2450; SSE2-ONLY-NEXT:    movb %cl, 30(%rdx)
2451; SSE2-ONLY-NEXT:    movw %ax, 28(%rdx)
2452; SSE2-ONLY-NEXT:    movb %cl, 34(%rdx)
2453; SSE2-ONLY-NEXT:    movw %ax, 32(%rdx)
2454; SSE2-ONLY-NEXT:    movb %cl, 38(%rdx)
2455; SSE2-ONLY-NEXT:    movw %ax, 36(%rdx)
2456; SSE2-ONLY-NEXT:    movb %cl, 42(%rdx)
2457; SSE2-ONLY-NEXT:    movw %ax, 40(%rdx)
2458; SSE2-ONLY-NEXT:    movb %cl, 46(%rdx)
2459; SSE2-ONLY-NEXT:    movw %ax, 44(%rdx)
2460; SSE2-ONLY-NEXT:    movb %cl, 50(%rdx)
2461; SSE2-ONLY-NEXT:    movw %ax, 48(%rdx)
2462; SSE2-ONLY-NEXT:    movb %cl, 54(%rdx)
2463; SSE2-ONLY-NEXT:    movw %ax, 52(%rdx)
2464; SSE2-ONLY-NEXT:    movb %cl, 58(%rdx)
2465; SSE2-ONLY-NEXT:    movw %ax, 56(%rdx)
2466; SSE2-ONLY-NEXT:    movb %cl, 62(%rdx)
2467; SSE2-ONLY-NEXT:    movw %ax, 60(%rdx)
2468; SSE2-ONLY-NEXT:    retq
2469;
2470; SSE3-LABEL: vec384_v3i8:
2471; SSE3:       # %bb.0:
2472; SSE3-NEXT:    movl (%rdi), %eax
2473; SSE3-NEXT:    notl %eax
2474; SSE3-NEXT:    movw %ax, (%rsi)
2475; SSE3-NEXT:    movl %eax, %ecx
2476; SSE3-NEXT:    shrl $16, %ecx
2477; SSE3-NEXT:    movb %cl, 2(%rsi)
2478; SSE3-NEXT:    movb %cl, 2(%rdx)
2479; SSE3-NEXT:    movw %ax, (%rdx)
2480; SSE3-NEXT:    movb %cl, 6(%rdx)
2481; SSE3-NEXT:    movw %ax, 4(%rdx)
2482; SSE3-NEXT:    movb %cl, 10(%rdx)
2483; SSE3-NEXT:    movw %ax, 8(%rdx)
2484; SSE3-NEXT:    movb %cl, 14(%rdx)
2485; SSE3-NEXT:    movw %ax, 12(%rdx)
2486; SSE3-NEXT:    movb %cl, 18(%rdx)
2487; SSE3-NEXT:    movw %ax, 16(%rdx)
2488; SSE3-NEXT:    movb %cl, 22(%rdx)
2489; SSE3-NEXT:    movw %ax, 20(%rdx)
2490; SSE3-NEXT:    movb %cl, 26(%rdx)
2491; SSE3-NEXT:    movw %ax, 24(%rdx)
2492; SSE3-NEXT:    movb %cl, 30(%rdx)
2493; SSE3-NEXT:    movw %ax, 28(%rdx)
2494; SSE3-NEXT:    movb %cl, 34(%rdx)
2495; SSE3-NEXT:    movw %ax, 32(%rdx)
2496; SSE3-NEXT:    movb %cl, 38(%rdx)
2497; SSE3-NEXT:    movw %ax, 36(%rdx)
2498; SSE3-NEXT:    movb %cl, 42(%rdx)
2499; SSE3-NEXT:    movw %ax, 40(%rdx)
2500; SSE3-NEXT:    movb %cl, 46(%rdx)
2501; SSE3-NEXT:    movw %ax, 44(%rdx)
2502; SSE3-NEXT:    movb %cl, 50(%rdx)
2503; SSE3-NEXT:    movw %ax, 48(%rdx)
2504; SSE3-NEXT:    movb %cl, 54(%rdx)
2505; SSE3-NEXT:    movw %ax, 52(%rdx)
2506; SSE3-NEXT:    movb %cl, 58(%rdx)
2507; SSE3-NEXT:    movw %ax, 56(%rdx)
2508; SSE3-NEXT:    movb %cl, 62(%rdx)
2509; SSE3-NEXT:    movw %ax, 60(%rdx)
2510; SSE3-NEXT:    retq
2511;
2512; SSSE3-ONLY-LABEL: vec384_v3i8:
2513; SSSE3-ONLY:       # %bb.0:
2514; SSSE3-ONLY-NEXT:    movl (%rdi), %eax
2515; SSSE3-ONLY-NEXT:    notl %eax
2516; SSSE3-ONLY-NEXT:    movw %ax, (%rsi)
2517; SSSE3-ONLY-NEXT:    movl %eax, %ecx
2518; SSSE3-ONLY-NEXT:    shrl $16, %ecx
2519; SSSE3-ONLY-NEXT:    movb %cl, 2(%rsi)
2520; SSSE3-ONLY-NEXT:    movb %cl, 2(%rdx)
2521; SSSE3-ONLY-NEXT:    movw %ax, (%rdx)
2522; SSSE3-ONLY-NEXT:    movb %cl, 6(%rdx)
2523; SSSE3-ONLY-NEXT:    movw %ax, 4(%rdx)
2524; SSSE3-ONLY-NEXT:    movb %cl, 10(%rdx)
2525; SSSE3-ONLY-NEXT:    movw %ax, 8(%rdx)
2526; SSSE3-ONLY-NEXT:    movb %cl, 14(%rdx)
2527; SSSE3-ONLY-NEXT:    movw %ax, 12(%rdx)
2528; SSSE3-ONLY-NEXT:    movb %cl, 18(%rdx)
2529; SSSE3-ONLY-NEXT:    movw %ax, 16(%rdx)
2530; SSSE3-ONLY-NEXT:    movb %cl, 22(%rdx)
2531; SSSE3-ONLY-NEXT:    movw %ax, 20(%rdx)
2532; SSSE3-ONLY-NEXT:    movb %cl, 26(%rdx)
2533; SSSE3-ONLY-NEXT:    movw %ax, 24(%rdx)
2534; SSSE3-ONLY-NEXT:    movb %cl, 30(%rdx)
2535; SSSE3-ONLY-NEXT:    movw %ax, 28(%rdx)
2536; SSSE3-ONLY-NEXT:    movb %cl, 34(%rdx)
2537; SSSE3-ONLY-NEXT:    movw %ax, 32(%rdx)
2538; SSSE3-ONLY-NEXT:    movb %cl, 38(%rdx)
2539; SSSE3-ONLY-NEXT:    movw %ax, 36(%rdx)
2540; SSSE3-ONLY-NEXT:    movb %cl, 42(%rdx)
2541; SSSE3-ONLY-NEXT:    movw %ax, 40(%rdx)
2542; SSSE3-ONLY-NEXT:    movb %cl, 46(%rdx)
2543; SSSE3-ONLY-NEXT:    movw %ax, 44(%rdx)
2544; SSSE3-ONLY-NEXT:    movb %cl, 50(%rdx)
2545; SSSE3-ONLY-NEXT:    movw %ax, 48(%rdx)
2546; SSSE3-ONLY-NEXT:    movb %cl, 54(%rdx)
2547; SSSE3-ONLY-NEXT:    movw %ax, 52(%rdx)
2548; SSSE3-ONLY-NEXT:    movb %cl, 58(%rdx)
2549; SSSE3-ONLY-NEXT:    movw %ax, 56(%rdx)
2550; SSSE3-ONLY-NEXT:    movb %cl, 62(%rdx)
2551; SSSE3-ONLY-NEXT:    movw %ax, 60(%rdx)
2552; SSSE3-ONLY-NEXT:    retq
2553;
2554; SSE41-LABEL: vec384_v3i8:
2555; SSE41:       # %bb.0:
2556; SSE41-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
2557; SSE41-NEXT:    pcmpeqd %xmm0, %xmm0
2558; SSE41-NEXT:    pxor %xmm1, %xmm0
2559; SSE41-NEXT:    pextrb $2, %xmm0, 2(%rsi)
2560; SSE41-NEXT:    movd %xmm0, %eax
2561; SSE41-NEXT:    movw %ax, (%rsi)
2562; SSE41-NEXT:    pextrb $2, %xmm0, 2(%rdx)
2563; SSE41-NEXT:    movw %ax, (%rdx)
2564; SSE41-NEXT:    pextrb $2, %xmm0, 6(%rdx)
2565; SSE41-NEXT:    movw %ax, 4(%rdx)
2566; SSE41-NEXT:    pextrb $2, %xmm0, 10(%rdx)
2567; SSE41-NEXT:    movw %ax, 8(%rdx)
2568; SSE41-NEXT:    pextrb $2, %xmm0, 14(%rdx)
2569; SSE41-NEXT:    movw %ax, 12(%rdx)
2570; SSE41-NEXT:    pextrb $2, %xmm0, 18(%rdx)
2571; SSE41-NEXT:    movw %ax, 16(%rdx)
2572; SSE41-NEXT:    pextrb $2, %xmm0, 22(%rdx)
2573; SSE41-NEXT:    movw %ax, 20(%rdx)
2574; SSE41-NEXT:    pextrb $2, %xmm0, 26(%rdx)
2575; SSE41-NEXT:    movw %ax, 24(%rdx)
2576; SSE41-NEXT:    pextrb $2, %xmm0, 30(%rdx)
2577; SSE41-NEXT:    movw %ax, 28(%rdx)
2578; SSE41-NEXT:    pextrb $2, %xmm0, 34(%rdx)
2579; SSE41-NEXT:    movw %ax, 32(%rdx)
2580; SSE41-NEXT:    pextrb $2, %xmm0, 38(%rdx)
2581; SSE41-NEXT:    movw %ax, 36(%rdx)
2582; SSE41-NEXT:    pextrb $2, %xmm0, 42(%rdx)
2583; SSE41-NEXT:    movw %ax, 40(%rdx)
2584; SSE41-NEXT:    pextrb $2, %xmm0, 46(%rdx)
2585; SSE41-NEXT:    movw %ax, 44(%rdx)
2586; SSE41-NEXT:    pextrb $2, %xmm0, 50(%rdx)
2587; SSE41-NEXT:    movw %ax, 48(%rdx)
2588; SSE41-NEXT:    pextrb $2, %xmm0, 54(%rdx)
2589; SSE41-NEXT:    movw %ax, 52(%rdx)
2590; SSE41-NEXT:    pextrb $2, %xmm0, 58(%rdx)
2591; SSE41-NEXT:    movw %ax, 56(%rdx)
2592; SSE41-NEXT:    pextrb $2, %xmm0, 62(%rdx)
2593; SSE41-NEXT:    movw %ax, 60(%rdx)
2594; SSE41-NEXT:    retq
2595;
2596; SSE42-LABEL: vec384_v3i8:
2597; SSE42:       # %bb.0:
2598; SSE42-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
2599; SSE42-NEXT:    pcmpeqd %xmm0, %xmm0
2600; SSE42-NEXT:    pxor %xmm1, %xmm0
2601; SSE42-NEXT:    pextrb $2, %xmm0, 2(%rsi)
2602; SSE42-NEXT:    movd %xmm0, %eax
2603; SSE42-NEXT:    movw %ax, (%rsi)
2604; SSE42-NEXT:    pextrb $2, %xmm0, 2(%rdx)
2605; SSE42-NEXT:    movw %ax, (%rdx)
2606; SSE42-NEXT:    pextrb $2, %xmm0, 6(%rdx)
2607; SSE42-NEXT:    movw %ax, 4(%rdx)
2608; SSE42-NEXT:    pextrb $2, %xmm0, 10(%rdx)
2609; SSE42-NEXT:    movw %ax, 8(%rdx)
2610; SSE42-NEXT:    pextrb $2, %xmm0, 14(%rdx)
2611; SSE42-NEXT:    movw %ax, 12(%rdx)
2612; SSE42-NEXT:    pextrb $2, %xmm0, 18(%rdx)
2613; SSE42-NEXT:    movw %ax, 16(%rdx)
2614; SSE42-NEXT:    pextrb $2, %xmm0, 22(%rdx)
2615; SSE42-NEXT:    movw %ax, 20(%rdx)
2616; SSE42-NEXT:    pextrb $2, %xmm0, 26(%rdx)
2617; SSE42-NEXT:    movw %ax, 24(%rdx)
2618; SSE42-NEXT:    pextrb $2, %xmm0, 30(%rdx)
2619; SSE42-NEXT:    movw %ax, 28(%rdx)
2620; SSE42-NEXT:    pextrb $2, %xmm0, 34(%rdx)
2621; SSE42-NEXT:    movw %ax, 32(%rdx)
2622; SSE42-NEXT:    pextrb $2, %xmm0, 38(%rdx)
2623; SSE42-NEXT:    movw %ax, 36(%rdx)
2624; SSE42-NEXT:    pextrb $2, %xmm0, 42(%rdx)
2625; SSE42-NEXT:    movw %ax, 40(%rdx)
2626; SSE42-NEXT:    pextrb $2, %xmm0, 46(%rdx)
2627; SSE42-NEXT:    movw %ax, 44(%rdx)
2628; SSE42-NEXT:    pextrb $2, %xmm0, 50(%rdx)
2629; SSE42-NEXT:    movw %ax, 48(%rdx)
2630; SSE42-NEXT:    pextrb $2, %xmm0, 54(%rdx)
2631; SSE42-NEXT:    movw %ax, 52(%rdx)
2632; SSE42-NEXT:    pextrb $2, %xmm0, 58(%rdx)
2633; SSE42-NEXT:    movw %ax, 56(%rdx)
2634; SSE42-NEXT:    pextrb $2, %xmm0, 62(%rdx)
2635; SSE42-NEXT:    movw %ax, 60(%rdx)
2636; SSE42-NEXT:    retq
2637;
2638; AVX1-LABEL: vec384_v3i8:
2639; AVX1:       # %bb.0:
2640; AVX1-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
2641; AVX1-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
2642; AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
2643; AVX1-NEXT:    vpextrb $2, %xmm0, 2(%rsi)
2644; AVX1-NEXT:    vmovd %xmm0, %eax
2645; AVX1-NEXT:    movw %ax, (%rsi)
2646; AVX1-NEXT:    vpextrb $2, %xmm0, 2(%rdx)
2647; AVX1-NEXT:    movw %ax, (%rdx)
2648; AVX1-NEXT:    vpextrb $2, %xmm0, 6(%rdx)
2649; AVX1-NEXT:    movw %ax, 4(%rdx)
2650; AVX1-NEXT:    vpextrb $2, %xmm0, 10(%rdx)
2651; AVX1-NEXT:    movw %ax, 8(%rdx)
2652; AVX1-NEXT:    vpextrb $2, %xmm0, 14(%rdx)
2653; AVX1-NEXT:    movw %ax, 12(%rdx)
2654; AVX1-NEXT:    vpextrb $2, %xmm0, 18(%rdx)
2655; AVX1-NEXT:    movw %ax, 16(%rdx)
2656; AVX1-NEXT:    vpextrb $2, %xmm0, 22(%rdx)
2657; AVX1-NEXT:    movw %ax, 20(%rdx)
2658; AVX1-NEXT:    vpextrb $2, %xmm0, 26(%rdx)
2659; AVX1-NEXT:    movw %ax, 24(%rdx)
2660; AVX1-NEXT:    vpextrb $2, %xmm0, 30(%rdx)
2661; AVX1-NEXT:    movw %ax, 28(%rdx)
2662; AVX1-NEXT:    vpextrb $2, %xmm0, 34(%rdx)
2663; AVX1-NEXT:    movw %ax, 32(%rdx)
2664; AVX1-NEXT:    vpextrb $2, %xmm0, 38(%rdx)
2665; AVX1-NEXT:    movw %ax, 36(%rdx)
2666; AVX1-NEXT:    vpextrb $2, %xmm0, 42(%rdx)
2667; AVX1-NEXT:    movw %ax, 40(%rdx)
2668; AVX1-NEXT:    vpextrb $2, %xmm0, 46(%rdx)
2669; AVX1-NEXT:    movw %ax, 44(%rdx)
2670; AVX1-NEXT:    vpextrb $2, %xmm0, 50(%rdx)
2671; AVX1-NEXT:    movw %ax, 48(%rdx)
2672; AVX1-NEXT:    vpextrb $2, %xmm0, 54(%rdx)
2673; AVX1-NEXT:    movw %ax, 52(%rdx)
2674; AVX1-NEXT:    vpextrb $2, %xmm0, 58(%rdx)
2675; AVX1-NEXT:    movw %ax, 56(%rdx)
2676; AVX1-NEXT:    vpextrb $2, %xmm0, 62(%rdx)
2677; AVX1-NEXT:    movw %ax, 60(%rdx)
2678; AVX1-NEXT:    retq
2679;
2680; AVX2-ONLY-LABEL: vec384_v3i8:
2681; AVX2-ONLY:       # %bb.0:
2682; AVX2-ONLY-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
2683; AVX2-ONLY-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
2684; AVX2-ONLY-NEXT:    vpxor %xmm1, %xmm0, %xmm0
2685; AVX2-ONLY-NEXT:    vpextrb $2, %xmm0, 2(%rsi)
2686; AVX2-ONLY-NEXT:    vmovd %xmm0, %eax
2687; AVX2-ONLY-NEXT:    movw %ax, (%rsi)
2688; AVX2-ONLY-NEXT:    vpextrb $2, %xmm0, 2(%rdx)
2689; AVX2-ONLY-NEXT:    movw %ax, (%rdx)
2690; AVX2-ONLY-NEXT:    vpextrb $2, %xmm0, 6(%rdx)
2691; AVX2-ONLY-NEXT:    movw %ax, 4(%rdx)
2692; AVX2-ONLY-NEXT:    vpextrb $2, %xmm0, 10(%rdx)
2693; AVX2-ONLY-NEXT:    movw %ax, 8(%rdx)
2694; AVX2-ONLY-NEXT:    vpextrb $2, %xmm0, 14(%rdx)
2695; AVX2-ONLY-NEXT:    movw %ax, 12(%rdx)
2696; AVX2-ONLY-NEXT:    vpextrb $2, %xmm0, 18(%rdx)
2697; AVX2-ONLY-NEXT:    movw %ax, 16(%rdx)
2698; AVX2-ONLY-NEXT:    vpextrb $2, %xmm0, 22(%rdx)
2699; AVX2-ONLY-NEXT:    movw %ax, 20(%rdx)
2700; AVX2-ONLY-NEXT:    vpextrb $2, %xmm0, 26(%rdx)
2701; AVX2-ONLY-NEXT:    movw %ax, 24(%rdx)
2702; AVX2-ONLY-NEXT:    vpextrb $2, %xmm0, 30(%rdx)
2703; AVX2-ONLY-NEXT:    movw %ax, 28(%rdx)
2704; AVX2-ONLY-NEXT:    vpextrb $2, %xmm0, 34(%rdx)
2705; AVX2-ONLY-NEXT:    movw %ax, 32(%rdx)
2706; AVX2-ONLY-NEXT:    vpextrb $2, %xmm0, 38(%rdx)
2707; AVX2-ONLY-NEXT:    movw %ax, 36(%rdx)
2708; AVX2-ONLY-NEXT:    vpextrb $2, %xmm0, 42(%rdx)
2709; AVX2-ONLY-NEXT:    movw %ax, 40(%rdx)
2710; AVX2-ONLY-NEXT:    vpextrb $2, %xmm0, 46(%rdx)
2711; AVX2-ONLY-NEXT:    movw %ax, 44(%rdx)
2712; AVX2-ONLY-NEXT:    vpextrb $2, %xmm0, 50(%rdx)
2713; AVX2-ONLY-NEXT:    movw %ax, 48(%rdx)
2714; AVX2-ONLY-NEXT:    vpextrb $2, %xmm0, 54(%rdx)
2715; AVX2-ONLY-NEXT:    movw %ax, 52(%rdx)
2716; AVX2-ONLY-NEXT:    vpextrb $2, %xmm0, 58(%rdx)
2717; AVX2-ONLY-NEXT:    movw %ax, 56(%rdx)
2718; AVX2-ONLY-NEXT:    vpextrb $2, %xmm0, 62(%rdx)
2719; AVX2-ONLY-NEXT:    movw %ax, 60(%rdx)
2720; AVX2-ONLY-NEXT:    retq
2721;
2722; AVX512-LABEL: vec384_v3i8:
2723; AVX512:       # %bb.0:
2724; AVX512-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
2725; AVX512-NEXT:    vpternlogq {{.*#+}} xmm0 = ~xmm0
2726; AVX512-NEXT:    vpextrb $2, %xmm0, 2(%rsi)
2727; AVX512-NEXT:    vmovd %xmm0, %eax
2728; AVX512-NEXT:    movw %ax, (%rsi)
2729; AVX512-NEXT:    vpextrb $2, %xmm0, 2(%rdx)
2730; AVX512-NEXT:    movw %ax, (%rdx)
2731; AVX512-NEXT:    vpextrb $2, %xmm0, 6(%rdx)
2732; AVX512-NEXT:    movw %ax, 4(%rdx)
2733; AVX512-NEXT:    vpextrb $2, %xmm0, 10(%rdx)
2734; AVX512-NEXT:    movw %ax, 8(%rdx)
2735; AVX512-NEXT:    vpextrb $2, %xmm0, 14(%rdx)
2736; AVX512-NEXT:    movw %ax, 12(%rdx)
2737; AVX512-NEXT:    vpextrb $2, %xmm0, 18(%rdx)
2738; AVX512-NEXT:    movw %ax, 16(%rdx)
2739; AVX512-NEXT:    vpextrb $2, %xmm0, 22(%rdx)
2740; AVX512-NEXT:    movw %ax, 20(%rdx)
2741; AVX512-NEXT:    vpextrb $2, %xmm0, 26(%rdx)
2742; AVX512-NEXT:    movw %ax, 24(%rdx)
2743; AVX512-NEXT:    vpextrb $2, %xmm0, 30(%rdx)
2744; AVX512-NEXT:    movw %ax, 28(%rdx)
2745; AVX512-NEXT:    vpextrb $2, %xmm0, 34(%rdx)
2746; AVX512-NEXT:    movw %ax, 32(%rdx)
2747; AVX512-NEXT:    vpextrb $2, %xmm0, 38(%rdx)
2748; AVX512-NEXT:    movw %ax, 36(%rdx)
2749; AVX512-NEXT:    vpextrb $2, %xmm0, 42(%rdx)
2750; AVX512-NEXT:    movw %ax, 40(%rdx)
2751; AVX512-NEXT:    vpextrb $2, %xmm0, 46(%rdx)
2752; AVX512-NEXT:    movw %ax, 44(%rdx)
2753; AVX512-NEXT:    vpextrb $2, %xmm0, 50(%rdx)
2754; AVX512-NEXT:    movw %ax, 48(%rdx)
2755; AVX512-NEXT:    vpextrb $2, %xmm0, 54(%rdx)
2756; AVX512-NEXT:    movw %ax, 52(%rdx)
2757; AVX512-NEXT:    vpextrb $2, %xmm0, 58(%rdx)
2758; AVX512-NEXT:    movw %ax, 56(%rdx)
2759; AVX512-NEXT:    vpextrb $2, %xmm0, 62(%rdx)
2760; AVX512-NEXT:    movw %ax, 60(%rdx)
2761; AVX512-NEXT:    retq
2762  %in.subvec.not = load <3 x i8>, ptr %in.subvec.ptr, align 64
2763  %in.subvec = xor <3 x i8> %in.subvec.not, <i8 -1, i8 -1, i8 -1>
2764  store <3 x i8> %in.subvec, ptr %out.subvec.ptr, align 64
2765  %out.subvec0.ptr = getelementptr <3 x i8>, ptr %out.vec.ptr, i64 0
2766  store <3 x i8> %in.subvec, ptr %out.subvec0.ptr, align 64
2767  %out.subvec1.ptr = getelementptr <3 x i8>, ptr %out.vec.ptr, i64 1
2768  store <3 x i8> %in.subvec, ptr %out.subvec1.ptr, align 1
2769  %out.subvec2.ptr = getelementptr <3 x i8>, ptr %out.vec.ptr, i64 2
2770  store <3 x i8> %in.subvec, ptr %out.subvec2.ptr, align 2
2771  %out.subvec3.ptr = getelementptr <3 x i8>, ptr %out.vec.ptr, i64 3
2772  store <3 x i8> %in.subvec, ptr %out.subvec3.ptr, align 1
2773  %out.subvec4.ptr = getelementptr <3 x i8>, ptr %out.vec.ptr, i64 4
2774  store <3 x i8> %in.subvec, ptr %out.subvec4.ptr, align 4
2775  %out.subvec5.ptr = getelementptr <3 x i8>, ptr %out.vec.ptr, i64 5
2776  store <3 x i8> %in.subvec, ptr %out.subvec5.ptr, align 1
2777  %out.subvec6.ptr = getelementptr <3 x i8>, ptr %out.vec.ptr, i64 6
2778  store <3 x i8> %in.subvec, ptr %out.subvec6.ptr, align 2
2779  %out.subvec7.ptr = getelementptr <3 x i8>, ptr %out.vec.ptr, i64 7
2780  store <3 x i8> %in.subvec, ptr %out.subvec7.ptr, align 1
2781  %out.subvec8.ptr = getelementptr <3 x i8>, ptr %out.vec.ptr, i64 8
2782  store <3 x i8> %in.subvec, ptr %out.subvec8.ptr, align 8
2783  %out.subvec9.ptr = getelementptr <3 x i8>, ptr %out.vec.ptr, i64 9
2784  store <3 x i8> %in.subvec, ptr %out.subvec9.ptr, align 1
2785  %out.subvec10.ptr = getelementptr <3 x i8>, ptr %out.vec.ptr, i64 10
2786  store <3 x i8> %in.subvec, ptr %out.subvec10.ptr, align 2
2787  %out.subvec11.ptr = getelementptr <3 x i8>, ptr %out.vec.ptr, i64 11
2788  store <3 x i8> %in.subvec, ptr %out.subvec11.ptr, align 1
2789  %out.subvec12.ptr = getelementptr <3 x i8>, ptr %out.vec.ptr, i64 12
2790  store <3 x i8> %in.subvec, ptr %out.subvec12.ptr, align 4
2791  %out.subvec13.ptr = getelementptr <3 x i8>, ptr %out.vec.ptr, i64 13
2792  store <3 x i8> %in.subvec, ptr %out.subvec13.ptr, align 1
2793  %out.subvec14.ptr = getelementptr <3 x i8>, ptr %out.vec.ptr, i64 14
2794  store <3 x i8> %in.subvec, ptr %out.subvec14.ptr, align 2
2795  %out.subvec15.ptr = getelementptr <3 x i8>, ptr %out.vec.ptr, i64 15
2796  store <3 x i8> %in.subvec, ptr %out.subvec15.ptr, align 1
2797  ret void
2798}
2799
2800define void @vec384_v3i16(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
2801; SCALAR-LABEL: vec384_v3i16:
2802; SCALAR:       # %bb.0:
2803; SCALAR-NEXT:    movq (%rdi), %rax
2804; SCALAR-NEXT:    movq %rax, %rcx
2805; SCALAR-NEXT:    shrq $32, %rcx
2806; SCALAR-NEXT:    notl %ecx
2807; SCALAR-NEXT:    notl %eax
2808; SCALAR-NEXT:    movl %eax, (%rsi)
2809; SCALAR-NEXT:    movw %cx, 4(%rsi)
2810; SCALAR-NEXT:    movw %cx, 4(%rdx)
2811; SCALAR-NEXT:    movl %eax, (%rdx)
2812; SCALAR-NEXT:    movw %cx, 12(%rdx)
2813; SCALAR-NEXT:    movl %eax, 8(%rdx)
2814; SCALAR-NEXT:    movw %cx, 20(%rdx)
2815; SCALAR-NEXT:    movl %eax, 16(%rdx)
2816; SCALAR-NEXT:    movw %cx, 28(%rdx)
2817; SCALAR-NEXT:    movl %eax, 24(%rdx)
2818; SCALAR-NEXT:    movw %cx, 36(%rdx)
2819; SCALAR-NEXT:    movl %eax, 32(%rdx)
2820; SCALAR-NEXT:    movw %cx, 44(%rdx)
2821; SCALAR-NEXT:    movl %eax, 40(%rdx)
2822; SCALAR-NEXT:    movw %cx, 52(%rdx)
2823; SCALAR-NEXT:    movl %eax, 48(%rdx)
2824; SCALAR-NEXT:    movw %cx, 60(%rdx)
2825; SCALAR-NEXT:    movl %eax, 56(%rdx)
2826; SCALAR-NEXT:    retq
2827;
2828; SSE2-ONLY-LABEL: vec384_v3i16:
2829; SSE2-ONLY:       # %bb.0:
2830; SSE2-ONLY-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
2831; SSE2-ONLY-NEXT:    pcmpeqd %xmm1, %xmm1
2832; SSE2-ONLY-NEXT:    pxor %xmm0, %xmm1
2833; SSE2-ONLY-NEXT:    movd %xmm1, (%rsi)
2834; SSE2-ONLY-NEXT:    pextrw $2, %xmm1, %eax
2835; SSE2-ONLY-NEXT:    movw %ax, 4(%rsi)
2836; SSE2-ONLY-NEXT:    movw %ax, 4(%rdx)
2837; SSE2-ONLY-NEXT:    movd %xmm1, (%rdx)
2838; SSE2-ONLY-NEXT:    movw %ax, 12(%rdx)
2839; SSE2-ONLY-NEXT:    movd %xmm1, 8(%rdx)
2840; SSE2-ONLY-NEXT:    movw %ax, 20(%rdx)
2841; SSE2-ONLY-NEXT:    movd %xmm1, 16(%rdx)
2842; SSE2-ONLY-NEXT:    movw %ax, 28(%rdx)
2843; SSE2-ONLY-NEXT:    movd %xmm1, 24(%rdx)
2844; SSE2-ONLY-NEXT:    movw %ax, 36(%rdx)
2845; SSE2-ONLY-NEXT:    movd %xmm1, 32(%rdx)
2846; SSE2-ONLY-NEXT:    movw %ax, 44(%rdx)
2847; SSE2-ONLY-NEXT:    movd %xmm1, 40(%rdx)
2848; SSE2-ONLY-NEXT:    movw %ax, 52(%rdx)
2849; SSE2-ONLY-NEXT:    movd %xmm1, 48(%rdx)
2850; SSE2-ONLY-NEXT:    movw %ax, 60(%rdx)
2851; SSE2-ONLY-NEXT:    movd %xmm1, 56(%rdx)
2852; SSE2-ONLY-NEXT:    retq
2853;
2854; SSE3-LABEL: vec384_v3i16:
2855; SSE3:       # %bb.0:
2856; SSE3-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
2857; SSE3-NEXT:    pcmpeqd %xmm1, %xmm1
2858; SSE3-NEXT:    pxor %xmm0, %xmm1
2859; SSE3-NEXT:    movd %xmm1, (%rsi)
2860; SSE3-NEXT:    pextrw $2, %xmm1, %eax
2861; SSE3-NEXT:    movw %ax, 4(%rsi)
2862; SSE3-NEXT:    movw %ax, 4(%rdx)
2863; SSE3-NEXT:    movd %xmm1, (%rdx)
2864; SSE3-NEXT:    movw %ax, 12(%rdx)
2865; SSE3-NEXT:    movd %xmm1, 8(%rdx)
2866; SSE3-NEXT:    movw %ax, 20(%rdx)
2867; SSE3-NEXT:    movd %xmm1, 16(%rdx)
2868; SSE3-NEXT:    movw %ax, 28(%rdx)
2869; SSE3-NEXT:    movd %xmm1, 24(%rdx)
2870; SSE3-NEXT:    movw %ax, 36(%rdx)
2871; SSE3-NEXT:    movd %xmm1, 32(%rdx)
2872; SSE3-NEXT:    movw %ax, 44(%rdx)
2873; SSE3-NEXT:    movd %xmm1, 40(%rdx)
2874; SSE3-NEXT:    movw %ax, 52(%rdx)
2875; SSE3-NEXT:    movd %xmm1, 48(%rdx)
2876; SSE3-NEXT:    movw %ax, 60(%rdx)
2877; SSE3-NEXT:    movd %xmm1, 56(%rdx)
2878; SSE3-NEXT:    retq
2879;
2880; SSSE3-ONLY-LABEL: vec384_v3i16:
2881; SSSE3-ONLY:       # %bb.0:
2882; SSSE3-ONLY-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
2883; SSSE3-ONLY-NEXT:    pcmpeqd %xmm1, %xmm1
2884; SSSE3-ONLY-NEXT:    pxor %xmm0, %xmm1
2885; SSSE3-ONLY-NEXT:    movd %xmm1, (%rsi)
2886; SSSE3-ONLY-NEXT:    pextrw $2, %xmm1, %eax
2887; SSSE3-ONLY-NEXT:    movw %ax, 4(%rsi)
2888; SSSE3-ONLY-NEXT:    movw %ax, 4(%rdx)
2889; SSSE3-ONLY-NEXT:    movd %xmm1, (%rdx)
2890; SSSE3-ONLY-NEXT:    movw %ax, 12(%rdx)
2891; SSSE3-ONLY-NEXT:    movd %xmm1, 8(%rdx)
2892; SSSE3-ONLY-NEXT:    movw %ax, 20(%rdx)
2893; SSSE3-ONLY-NEXT:    movd %xmm1, 16(%rdx)
2894; SSSE3-ONLY-NEXT:    movw %ax, 28(%rdx)
2895; SSSE3-ONLY-NEXT:    movd %xmm1, 24(%rdx)
2896; SSSE3-ONLY-NEXT:    movw %ax, 36(%rdx)
2897; SSSE3-ONLY-NEXT:    movd %xmm1, 32(%rdx)
2898; SSSE3-ONLY-NEXT:    movw %ax, 44(%rdx)
2899; SSSE3-ONLY-NEXT:    movd %xmm1, 40(%rdx)
2900; SSSE3-ONLY-NEXT:    movw %ax, 52(%rdx)
2901; SSSE3-ONLY-NEXT:    movd %xmm1, 48(%rdx)
2902; SSSE3-ONLY-NEXT:    movw %ax, 60(%rdx)
2903; SSSE3-ONLY-NEXT:    movd %xmm1, 56(%rdx)
2904; SSSE3-ONLY-NEXT:    retq
2905;
2906; SSE41-LABEL: vec384_v3i16:
2907; SSE41:       # %bb.0:
2908; SSE41-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
2909; SSE41-NEXT:    pcmpeqd %xmm1, %xmm1
2910; SSE41-NEXT:    pxor %xmm0, %xmm1
2911; SSE41-NEXT:    pextrw $2, %xmm1, 4(%rsi)
2912; SSE41-NEXT:    movd %xmm1, (%rsi)
2913; SSE41-NEXT:    pextrw $2, %xmm1, 4(%rdx)
2914; SSE41-NEXT:    movd %xmm1, (%rdx)
2915; SSE41-NEXT:    pextrw $2, %xmm1, 12(%rdx)
2916; SSE41-NEXT:    movd %xmm1, 8(%rdx)
2917; SSE41-NEXT:    pextrw $2, %xmm1, 20(%rdx)
2918; SSE41-NEXT:    movd %xmm1, 16(%rdx)
2919; SSE41-NEXT:    pextrw $2, %xmm1, 28(%rdx)
2920; SSE41-NEXT:    movd %xmm1, 24(%rdx)
2921; SSE41-NEXT:    pextrw $2, %xmm1, 36(%rdx)
2922; SSE41-NEXT:    movd %xmm1, 32(%rdx)
2923; SSE41-NEXT:    pextrw $2, %xmm1, 44(%rdx)
2924; SSE41-NEXT:    movd %xmm1, 40(%rdx)
2925; SSE41-NEXT:    pextrw $2, %xmm1, 52(%rdx)
2926; SSE41-NEXT:    movd %xmm1, 48(%rdx)
2927; SSE41-NEXT:    pextrw $2, %xmm1, 60(%rdx)
2928; SSE41-NEXT:    movd %xmm1, 56(%rdx)
2929; SSE41-NEXT:    retq
2930;
2931; SSE42-LABEL: vec384_v3i16:
2932; SSE42:       # %bb.0:
2933; SSE42-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
2934; SSE42-NEXT:    pcmpeqd %xmm1, %xmm1
2935; SSE42-NEXT:    pxor %xmm0, %xmm1
2936; SSE42-NEXT:    pextrw $2, %xmm1, 4(%rsi)
2937; SSE42-NEXT:    movd %xmm1, (%rsi)
2938; SSE42-NEXT:    pextrw $2, %xmm1, 4(%rdx)
2939; SSE42-NEXT:    movd %xmm1, (%rdx)
2940; SSE42-NEXT:    pextrw $2, %xmm1, 12(%rdx)
2941; SSE42-NEXT:    movd %xmm1, 8(%rdx)
2942; SSE42-NEXT:    pextrw $2, %xmm1, 20(%rdx)
2943; SSE42-NEXT:    movd %xmm1, 16(%rdx)
2944; SSE42-NEXT:    pextrw $2, %xmm1, 28(%rdx)
2945; SSE42-NEXT:    movd %xmm1, 24(%rdx)
2946; SSE42-NEXT:    pextrw $2, %xmm1, 36(%rdx)
2947; SSE42-NEXT:    movd %xmm1, 32(%rdx)
2948; SSE42-NEXT:    pextrw $2, %xmm1, 44(%rdx)
2949; SSE42-NEXT:    movd %xmm1, 40(%rdx)
2950; SSE42-NEXT:    pextrw $2, %xmm1, 52(%rdx)
2951; SSE42-NEXT:    movd %xmm1, 48(%rdx)
2952; SSE42-NEXT:    pextrw $2, %xmm1, 60(%rdx)
2953; SSE42-NEXT:    movd %xmm1, 56(%rdx)
2954; SSE42-NEXT:    retq
2955;
2956; AVX1-LABEL: vec384_v3i16:
2957; AVX1:       # %bb.0:
2958; AVX1-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
2959; AVX1-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
2960; AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
2961; AVX1-NEXT:    vpextrw $2, %xmm0, 4(%rsi)
2962; AVX1-NEXT:    vmovd %xmm0, (%rsi)
2963; AVX1-NEXT:    vpextrw $2, %xmm0, 4(%rdx)
2964; AVX1-NEXT:    vmovd %xmm0, (%rdx)
2965; AVX1-NEXT:    vpextrw $2, %xmm0, 12(%rdx)
2966; AVX1-NEXT:    vmovd %xmm0, 8(%rdx)
2967; AVX1-NEXT:    vpextrw $2, %xmm0, 20(%rdx)
2968; AVX1-NEXT:    vmovd %xmm0, 16(%rdx)
2969; AVX1-NEXT:    vpextrw $2, %xmm0, 28(%rdx)
2970; AVX1-NEXT:    vmovd %xmm0, 24(%rdx)
2971; AVX1-NEXT:    vpextrw $2, %xmm0, 36(%rdx)
2972; AVX1-NEXT:    vmovd %xmm0, 32(%rdx)
2973; AVX1-NEXT:    vpextrw $2, %xmm0, 44(%rdx)
2974; AVX1-NEXT:    vmovd %xmm0, 40(%rdx)
2975; AVX1-NEXT:    vpextrw $2, %xmm0, 52(%rdx)
2976; AVX1-NEXT:    vmovd %xmm0, 48(%rdx)
2977; AVX1-NEXT:    vpextrw $2, %xmm0, 60(%rdx)
2978; AVX1-NEXT:    vmovd %xmm0, 56(%rdx)
2979; AVX1-NEXT:    retq
2980;
2981; AVX2-ONLY-LABEL: vec384_v3i16:
2982; AVX2-ONLY:       # %bb.0:
2983; AVX2-ONLY-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
2984; AVX2-ONLY-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
2985; AVX2-ONLY-NEXT:    vpxor %xmm1, %xmm0, %xmm0
2986; AVX2-ONLY-NEXT:    vpextrw $2, %xmm0, 4(%rsi)
2987; AVX2-ONLY-NEXT:    vmovd %xmm0, (%rsi)
2988; AVX2-ONLY-NEXT:    vpextrw $2, %xmm0, 4(%rdx)
2989; AVX2-ONLY-NEXT:    vmovd %xmm0, (%rdx)
2990; AVX2-ONLY-NEXT:    vpextrw $2, %xmm0, 12(%rdx)
2991; AVX2-ONLY-NEXT:    vmovd %xmm0, 8(%rdx)
2992; AVX2-ONLY-NEXT:    vpextrw $2, %xmm0, 20(%rdx)
2993; AVX2-ONLY-NEXT:    vmovd %xmm0, 16(%rdx)
2994; AVX2-ONLY-NEXT:    vpextrw $2, %xmm0, 28(%rdx)
2995; AVX2-ONLY-NEXT:    vmovd %xmm0, 24(%rdx)
2996; AVX2-ONLY-NEXT:    vpextrw $2, %xmm0, 36(%rdx)
2997; AVX2-ONLY-NEXT:    vmovd %xmm0, 32(%rdx)
2998; AVX2-ONLY-NEXT:    vpextrw $2, %xmm0, 44(%rdx)
2999; AVX2-ONLY-NEXT:    vmovd %xmm0, 40(%rdx)
3000; AVX2-ONLY-NEXT:    vpextrw $2, %xmm0, 52(%rdx)
3001; AVX2-ONLY-NEXT:    vmovd %xmm0, 48(%rdx)
3002; AVX2-ONLY-NEXT:    vpextrw $2, %xmm0, 60(%rdx)
3003; AVX2-ONLY-NEXT:    vmovd %xmm0, 56(%rdx)
3004; AVX2-ONLY-NEXT:    retq
3005;
3006; AVX512-LABEL: vec384_v3i16:
3007; AVX512:       # %bb.0:
3008; AVX512-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
3009; AVX512-NEXT:    vpternlogq {{.*#+}} xmm0 = ~xmm0
3010; AVX512-NEXT:    vpextrw $2, %xmm0, 4(%rsi)
3011; AVX512-NEXT:    vmovd %xmm0, (%rsi)
3012; AVX512-NEXT:    vpextrw $2, %xmm0, 4(%rdx)
3013; AVX512-NEXT:    vmovd %xmm0, (%rdx)
3014; AVX512-NEXT:    vpextrw $2, %xmm0, 12(%rdx)
3015; AVX512-NEXT:    vmovd %xmm0, 8(%rdx)
3016; AVX512-NEXT:    vpextrw $2, %xmm0, 20(%rdx)
3017; AVX512-NEXT:    vmovd %xmm0, 16(%rdx)
3018; AVX512-NEXT:    vpextrw $2, %xmm0, 28(%rdx)
3019; AVX512-NEXT:    vmovd %xmm0, 24(%rdx)
3020; AVX512-NEXT:    vpextrw $2, %xmm0, 36(%rdx)
3021; AVX512-NEXT:    vmovd %xmm0, 32(%rdx)
3022; AVX512-NEXT:    vpextrw $2, %xmm0, 44(%rdx)
3023; AVX512-NEXT:    vmovd %xmm0, 40(%rdx)
3024; AVX512-NEXT:    vpextrw $2, %xmm0, 52(%rdx)
3025; AVX512-NEXT:    vmovd %xmm0, 48(%rdx)
3026; AVX512-NEXT:    vpextrw $2, %xmm0, 60(%rdx)
3027; AVX512-NEXT:    vmovd %xmm0, 56(%rdx)
3028; AVX512-NEXT:    retq
3029  %in.subvec.not = load <3 x i16>, ptr %in.subvec.ptr, align 64
3030  %in.subvec = xor <3 x i16> %in.subvec.not, <i16 -1, i16 -1, i16 -1>
3031  store <3 x i16> %in.subvec, ptr %out.subvec.ptr, align 64
3032  %out.subvec0.ptr = getelementptr <3 x i16>, ptr %out.vec.ptr, i64 0
3033  store <3 x i16> %in.subvec, ptr %out.subvec0.ptr, align 64
3034  %out.subvec1.ptr = getelementptr <3 x i16>, ptr %out.vec.ptr, i64 1
3035  store <3 x i16> %in.subvec, ptr %out.subvec1.ptr, align 2
3036  %out.subvec2.ptr = getelementptr <3 x i16>, ptr %out.vec.ptr, i64 2
3037  store <3 x i16> %in.subvec, ptr %out.subvec2.ptr, align 4
3038  %out.subvec3.ptr = getelementptr <3 x i16>, ptr %out.vec.ptr, i64 3
3039  store <3 x i16> %in.subvec, ptr %out.subvec3.ptr, align 2
3040  %out.subvec4.ptr = getelementptr <3 x i16>, ptr %out.vec.ptr, i64 4
3041  store <3 x i16> %in.subvec, ptr %out.subvec4.ptr, align 8
3042  %out.subvec5.ptr = getelementptr <3 x i16>, ptr %out.vec.ptr, i64 5
3043  store <3 x i16> %in.subvec, ptr %out.subvec5.ptr, align 2
3044  %out.subvec6.ptr = getelementptr <3 x i16>, ptr %out.vec.ptr, i64 6
3045  store <3 x i16> %in.subvec, ptr %out.subvec6.ptr, align 4
3046  %out.subvec7.ptr = getelementptr <3 x i16>, ptr %out.vec.ptr, i64 7
3047  store <3 x i16> %in.subvec, ptr %out.subvec7.ptr, align 2
3048  ret void
3049}
3050
3051define void @vec384_v3i32(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
3052; SCALAR-LABEL: vec384_v3i32:
3053; SCALAR:       # %bb.0:
3054; SCALAR-NEXT:    movl 8(%rdi), %eax
3055; SCALAR-NEXT:    movq (%rdi), %rcx
3056; SCALAR-NEXT:    notq %rcx
3057; SCALAR-NEXT:    notl %eax
3058; SCALAR-NEXT:    movl %eax, 8(%rsi)
3059; SCALAR-NEXT:    movq %rcx, (%rsi)
3060; SCALAR-NEXT:    movl %eax, 8(%rdx)
3061; SCALAR-NEXT:    movq %rcx, (%rdx)
3062; SCALAR-NEXT:    movl %eax, 24(%rdx)
3063; SCALAR-NEXT:    movq %rcx, 16(%rdx)
3064; SCALAR-NEXT:    movl %eax, 40(%rdx)
3065; SCALAR-NEXT:    movq %rcx, 32(%rdx)
3066; SCALAR-NEXT:    movl %eax, 56(%rdx)
3067; SCALAR-NEXT:    movq %rcx, 48(%rdx)
3068; SCALAR-NEXT:    retq
3069;
3070; SSE2-ONLY-LABEL: vec384_v3i32:
3071; SSE2-ONLY:       # %bb.0:
3072; SSE2-ONLY-NEXT:    pcmpeqd %xmm0, %xmm0
3073; SSE2-ONLY-NEXT:    pxor (%rdi), %xmm0
3074; SSE2-ONLY-NEXT:    movq %xmm0, (%rsi)
3075; SSE2-ONLY-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
3076; SSE2-ONLY-NEXT:    movd %xmm1, 8(%rsi)
3077; SSE2-ONLY-NEXT:    movd %xmm1, 8(%rdx)
3078; SSE2-ONLY-NEXT:    movq %xmm0, (%rdx)
3079; SSE2-ONLY-NEXT:    movd %xmm1, 24(%rdx)
3080; SSE2-ONLY-NEXT:    movq %xmm0, 16(%rdx)
3081; SSE2-ONLY-NEXT:    movd %xmm1, 40(%rdx)
3082; SSE2-ONLY-NEXT:    movq %xmm0, 32(%rdx)
3083; SSE2-ONLY-NEXT:    movd %xmm1, 56(%rdx)
3084; SSE2-ONLY-NEXT:    movq %xmm0, 48(%rdx)
3085; SSE2-ONLY-NEXT:    retq
3086;
3087; SSE3-LABEL: vec384_v3i32:
3088; SSE3:       # %bb.0:
3089; SSE3-NEXT:    pcmpeqd %xmm0, %xmm0
3090; SSE3-NEXT:    pxor (%rdi), %xmm0
3091; SSE3-NEXT:    movq %xmm0, (%rsi)
3092; SSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
3093; SSE3-NEXT:    movd %xmm1, 8(%rsi)
3094; SSE3-NEXT:    movd %xmm1, 8(%rdx)
3095; SSE3-NEXT:    movq %xmm0, (%rdx)
3096; SSE3-NEXT:    movd %xmm1, 24(%rdx)
3097; SSE3-NEXT:    movq %xmm0, 16(%rdx)
3098; SSE3-NEXT:    movd %xmm1, 40(%rdx)
3099; SSE3-NEXT:    movq %xmm0, 32(%rdx)
3100; SSE3-NEXT:    movd %xmm1, 56(%rdx)
3101; SSE3-NEXT:    movq %xmm0, 48(%rdx)
3102; SSE3-NEXT:    retq
3103;
3104; SSSE3-ONLY-LABEL: vec384_v3i32:
3105; SSSE3-ONLY:       # %bb.0:
3106; SSSE3-ONLY-NEXT:    pcmpeqd %xmm0, %xmm0
3107; SSSE3-ONLY-NEXT:    pxor (%rdi), %xmm0
3108; SSSE3-ONLY-NEXT:    movq %xmm0, (%rsi)
3109; SSSE3-ONLY-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
3110; SSSE3-ONLY-NEXT:    movd %xmm1, 8(%rsi)
3111; SSSE3-ONLY-NEXT:    movd %xmm1, 8(%rdx)
3112; SSSE3-ONLY-NEXT:    movq %xmm0, (%rdx)
3113; SSSE3-ONLY-NEXT:    movd %xmm1, 24(%rdx)
3114; SSSE3-ONLY-NEXT:    movq %xmm0, 16(%rdx)
3115; SSSE3-ONLY-NEXT:    movd %xmm1, 40(%rdx)
3116; SSSE3-ONLY-NEXT:    movq %xmm0, 32(%rdx)
3117; SSSE3-ONLY-NEXT:    movd %xmm1, 56(%rdx)
3118; SSSE3-ONLY-NEXT:    movq %xmm0, 48(%rdx)
3119; SSSE3-ONLY-NEXT:    retq
3120;
3121; SSE41-LABEL: vec384_v3i32:
3122; SSE41:       # %bb.0:
3123; SSE41-NEXT:    pcmpeqd %xmm0, %xmm0
3124; SSE41-NEXT:    pxor (%rdi), %xmm0
3125; SSE41-NEXT:    pextrd $2, %xmm0, 8(%rsi)
3126; SSE41-NEXT:    movq %xmm0, (%rsi)
3127; SSE41-NEXT:    pextrd $2, %xmm0, 8(%rdx)
3128; SSE41-NEXT:    movq %xmm0, (%rdx)
3129; SSE41-NEXT:    pextrd $2, %xmm0, 24(%rdx)
3130; SSE41-NEXT:    movq %xmm0, 16(%rdx)
3131; SSE41-NEXT:    pextrd $2, %xmm0, 40(%rdx)
3132; SSE41-NEXT:    movq %xmm0, 32(%rdx)
3133; SSE41-NEXT:    pextrd $2, %xmm0, 56(%rdx)
3134; SSE41-NEXT:    movq %xmm0, 48(%rdx)
3135; SSE41-NEXT:    retq
3136;
3137; SSE42-LABEL: vec384_v3i32:
3138; SSE42:       # %bb.0:
3139; SSE42-NEXT:    pcmpeqd %xmm0, %xmm0
3140; SSE42-NEXT:    pxor (%rdi), %xmm0
3141; SSE42-NEXT:    pextrd $2, %xmm0, 8(%rsi)
3142; SSE42-NEXT:    movq %xmm0, (%rsi)
3143; SSE42-NEXT:    pextrd $2, %xmm0, 8(%rdx)
3144; SSE42-NEXT:    movq %xmm0, (%rdx)
3145; SSE42-NEXT:    pextrd $2, %xmm0, 24(%rdx)
3146; SSE42-NEXT:    movq %xmm0, 16(%rdx)
3147; SSE42-NEXT:    pextrd $2, %xmm0, 40(%rdx)
3148; SSE42-NEXT:    movq %xmm0, 32(%rdx)
3149; SSE42-NEXT:    pextrd $2, %xmm0, 56(%rdx)
3150; SSE42-NEXT:    movq %xmm0, 48(%rdx)
3151; SSE42-NEXT:    retq
3152;
3153; AVX-LABEL: vec384_v3i32:
3154; AVX:       # %bb.0:
3155; AVX-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
3156; AVX-NEXT:    vpxor (%rdi), %xmm0, %xmm0
3157; AVX-NEXT:    vpextrd $2, %xmm0, 8(%rsi)
3158; AVX-NEXT:    vmovq %xmm0, (%rsi)
3159; AVX-NEXT:    vpextrd $2, %xmm0, 8(%rdx)
3160; AVX-NEXT:    vmovq %xmm0, (%rdx)
3161; AVX-NEXT:    vpextrd $2, %xmm0, 24(%rdx)
3162; AVX-NEXT:    vmovq %xmm0, 16(%rdx)
3163; AVX-NEXT:    vpextrd $2, %xmm0, 40(%rdx)
3164; AVX-NEXT:    vmovq %xmm0, 32(%rdx)
3165; AVX-NEXT:    vpextrd $2, %xmm0, 56(%rdx)
3166; AVX-NEXT:    vmovq %xmm0, 48(%rdx)
3167; AVX-NEXT:    retq
3168  %in.subvec.not = load <3 x i32>, ptr %in.subvec.ptr, align 64
3169  %in.subvec = xor <3 x i32> %in.subvec.not, <i32 -1, i32 -1, i32 -1>
3170  store <3 x i32> %in.subvec, ptr %out.subvec.ptr, align 64
3171  %out.subvec0.ptr = getelementptr <3 x i32>, ptr %out.vec.ptr, i64 0
3172  store <3 x i32> %in.subvec, ptr %out.subvec0.ptr, align 64
3173  %out.subvec1.ptr = getelementptr <3 x i32>, ptr %out.vec.ptr, i64 1
3174  store <3 x i32> %in.subvec, ptr %out.subvec1.ptr, align 4
3175  %out.subvec2.ptr = getelementptr <3 x i32>, ptr %out.vec.ptr, i64 2
3176  store <3 x i32> %in.subvec, ptr %out.subvec2.ptr, align 8
3177  %out.subvec3.ptr = getelementptr <3 x i32>, ptr %out.vec.ptr, i64 3
3178  store <3 x i32> %in.subvec, ptr %out.subvec3.ptr, align 4
3179  ret void
3180}
3181
3182define void @vec384_v3f32(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
3183; SCALAR-LABEL: vec384_v3f32:
3184; SCALAR:       # %bb.0:
3185; SCALAR-NEXT:    movl 8(%rdi), %eax
3186; SCALAR-NEXT:    movq (%rdi), %rcx
3187; SCALAR-NEXT:    notq %rcx
3188; SCALAR-NEXT:    notl %eax
3189; SCALAR-NEXT:    movl %eax, 8(%rsi)
3190; SCALAR-NEXT:    movq %rcx, (%rsi)
3191; SCALAR-NEXT:    movl %eax, 8(%rdx)
3192; SCALAR-NEXT:    movq %rcx, (%rdx)
3193; SCALAR-NEXT:    movl %eax, 24(%rdx)
3194; SCALAR-NEXT:    movq %rcx, 16(%rdx)
3195; SCALAR-NEXT:    movl %eax, 40(%rdx)
3196; SCALAR-NEXT:    movq %rcx, 32(%rdx)
3197; SCALAR-NEXT:    movl %eax, 56(%rdx)
3198; SCALAR-NEXT:    movq %rcx, 48(%rdx)
3199; SCALAR-NEXT:    retq
3200;
3201; SSE2-ONLY-LABEL: vec384_v3f32:
3202; SSE2-ONLY:       # %bb.0:
3203; SSE2-ONLY-NEXT:    pcmpeqd %xmm0, %xmm0
3204; SSE2-ONLY-NEXT:    pxor (%rdi), %xmm0
3205; SSE2-ONLY-NEXT:    movq %xmm0, (%rsi)
3206; SSE2-ONLY-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
3207; SSE2-ONLY-NEXT:    movd %xmm1, 8(%rsi)
3208; SSE2-ONLY-NEXT:    movd %xmm1, 8(%rdx)
3209; SSE2-ONLY-NEXT:    movq %xmm0, (%rdx)
3210; SSE2-ONLY-NEXT:    movd %xmm1, 24(%rdx)
3211; SSE2-ONLY-NEXT:    movq %xmm0, 16(%rdx)
3212; SSE2-ONLY-NEXT:    movd %xmm1, 40(%rdx)
3213; SSE2-ONLY-NEXT:    movq %xmm0, 32(%rdx)
3214; SSE2-ONLY-NEXT:    movd %xmm1, 56(%rdx)
3215; SSE2-ONLY-NEXT:    movq %xmm0, 48(%rdx)
3216; SSE2-ONLY-NEXT:    retq
3217;
3218; SSE3-LABEL: vec384_v3f32:
3219; SSE3:       # %bb.0:
3220; SSE3-NEXT:    pcmpeqd %xmm0, %xmm0
3221; SSE3-NEXT:    pxor (%rdi), %xmm0
3222; SSE3-NEXT:    movq %xmm0, (%rsi)
3223; SSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
3224; SSE3-NEXT:    movd %xmm1, 8(%rsi)
3225; SSE3-NEXT:    movd %xmm1, 8(%rdx)
3226; SSE3-NEXT:    movq %xmm0, (%rdx)
3227; SSE3-NEXT:    movd %xmm1, 24(%rdx)
3228; SSE3-NEXT:    movq %xmm0, 16(%rdx)
3229; SSE3-NEXT:    movd %xmm1, 40(%rdx)
3230; SSE3-NEXT:    movq %xmm0, 32(%rdx)
3231; SSE3-NEXT:    movd %xmm1, 56(%rdx)
3232; SSE3-NEXT:    movq %xmm0, 48(%rdx)
3233; SSE3-NEXT:    retq
3234;
3235; SSSE3-ONLY-LABEL: vec384_v3f32:
3236; SSSE3-ONLY:       # %bb.0:
3237; SSSE3-ONLY-NEXT:    pcmpeqd %xmm0, %xmm0
3238; SSSE3-ONLY-NEXT:    pxor (%rdi), %xmm0
3239; SSSE3-ONLY-NEXT:    movq %xmm0, (%rsi)
3240; SSSE3-ONLY-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
3241; SSSE3-ONLY-NEXT:    movd %xmm1, 8(%rsi)
3242; SSSE3-ONLY-NEXT:    movd %xmm1, 8(%rdx)
3243; SSSE3-ONLY-NEXT:    movq %xmm0, (%rdx)
3244; SSSE3-ONLY-NEXT:    movd %xmm1, 24(%rdx)
3245; SSSE3-ONLY-NEXT:    movq %xmm0, 16(%rdx)
3246; SSSE3-ONLY-NEXT:    movd %xmm1, 40(%rdx)
3247; SSSE3-ONLY-NEXT:    movq %xmm0, 32(%rdx)
3248; SSSE3-ONLY-NEXT:    movd %xmm1, 56(%rdx)
3249; SSSE3-ONLY-NEXT:    movq %xmm0, 48(%rdx)
3250; SSSE3-ONLY-NEXT:    retq
3251;
3252; SSE41-LABEL: vec384_v3f32:
3253; SSE41:       # %bb.0:
3254; SSE41-NEXT:    pcmpeqd %xmm0, %xmm0
3255; SSE41-NEXT:    pxor (%rdi), %xmm0
3256; SSE41-NEXT:    pextrd $2, %xmm0, 8(%rsi)
3257; SSE41-NEXT:    movq %xmm0, (%rsi)
3258; SSE41-NEXT:    pextrd $2, %xmm0, 8(%rdx)
3259; SSE41-NEXT:    movq %xmm0, (%rdx)
3260; SSE41-NEXT:    pextrd $2, %xmm0, 24(%rdx)
3261; SSE41-NEXT:    movq %xmm0, 16(%rdx)
3262; SSE41-NEXT:    pextrd $2, %xmm0, 40(%rdx)
3263; SSE41-NEXT:    movq %xmm0, 32(%rdx)
3264; SSE41-NEXT:    pextrd $2, %xmm0, 56(%rdx)
3265; SSE41-NEXT:    movq %xmm0, 48(%rdx)
3266; SSE41-NEXT:    retq
3267;
3268; SSE42-LABEL: vec384_v3f32:
3269; SSE42:       # %bb.0:
3270; SSE42-NEXT:    pcmpeqd %xmm0, %xmm0
3271; SSE42-NEXT:    pxor (%rdi), %xmm0
3272; SSE42-NEXT:    pextrd $2, %xmm0, 8(%rsi)
3273; SSE42-NEXT:    movq %xmm0, (%rsi)
3274; SSE42-NEXT:    pextrd $2, %xmm0, 8(%rdx)
3275; SSE42-NEXT:    movq %xmm0, (%rdx)
3276; SSE42-NEXT:    pextrd $2, %xmm0, 24(%rdx)
3277; SSE42-NEXT:    movq %xmm0, 16(%rdx)
3278; SSE42-NEXT:    pextrd $2, %xmm0, 40(%rdx)
3279; SSE42-NEXT:    movq %xmm0, 32(%rdx)
3280; SSE42-NEXT:    pextrd $2, %xmm0, 56(%rdx)
3281; SSE42-NEXT:    movq %xmm0, 48(%rdx)
3282; SSE42-NEXT:    retq
3283;
3284; AVX-LABEL: vec384_v3f32:
3285; AVX:       # %bb.0:
3286; AVX-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
3287; AVX-NEXT:    vpxor (%rdi), %xmm0, %xmm0
3288; AVX-NEXT:    vpextrd $2, %xmm0, 8(%rsi)
3289; AVX-NEXT:    vmovq %xmm0, (%rsi)
3290; AVX-NEXT:    vpextrd $2, %xmm0, 8(%rdx)
3291; AVX-NEXT:    vmovq %xmm0, (%rdx)
3292; AVX-NEXT:    vpextrd $2, %xmm0, 24(%rdx)
3293; AVX-NEXT:    vmovq %xmm0, 16(%rdx)
3294; AVX-NEXT:    vpextrd $2, %xmm0, 40(%rdx)
3295; AVX-NEXT:    vmovq %xmm0, 32(%rdx)
3296; AVX-NEXT:    vpextrd $2, %xmm0, 56(%rdx)
3297; AVX-NEXT:    vmovq %xmm0, 48(%rdx)
3298; AVX-NEXT:    retq
3299  %in.subvec.not = load <3 x i32>, ptr %in.subvec.ptr, align 64
3300  %in.subvec.int = xor <3 x i32> %in.subvec.not, <i32 -1, i32 -1, i32 -1>
3301  %in.subvec = bitcast <3 x i32> %in.subvec.int to <3 x float>
3302  store <3 x float> %in.subvec, ptr %out.subvec.ptr, align 64
3303  %out.subvec0.ptr = getelementptr <3 x float>, ptr %out.vec.ptr, i64 0
3304  store <3 x float> %in.subvec, ptr %out.subvec0.ptr, align 64
3305  %out.subvec1.ptr = getelementptr <3 x float>, ptr %out.vec.ptr, i64 1
3306  store <3 x float> %in.subvec, ptr %out.subvec1.ptr, align 4
3307  %out.subvec2.ptr = getelementptr <3 x float>, ptr %out.vec.ptr, i64 2
3308  store <3 x float> %in.subvec, ptr %out.subvec2.ptr, align 8
3309  %out.subvec3.ptr = getelementptr <3 x float>, ptr %out.vec.ptr, i64 3
3310  store <3 x float> %in.subvec, ptr %out.subvec3.ptr, align 4
3311  ret void
3312}
3313
3314define void @vec384_v3i64(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
3315; SCALAR-LABEL: vec384_v3i64:
3316; SCALAR:       # %bb.0:
3317; SCALAR-NEXT:    movq (%rdi), %rax
3318; SCALAR-NEXT:    movq 8(%rdi), %rcx
3319; SCALAR-NEXT:    movq 16(%rdi), %rdi
3320; SCALAR-NEXT:    notq %rdi
3321; SCALAR-NEXT:    notq %rcx
3322; SCALAR-NEXT:    notq %rax
3323; SCALAR-NEXT:    movq %rax, (%rsi)
3324; SCALAR-NEXT:    movq %rcx, 8(%rsi)
3325; SCALAR-NEXT:    movq %rdi, 16(%rsi)
3326; SCALAR-NEXT:    movq %rax, (%rdx)
3327; SCALAR-NEXT:    movq %rcx, 8(%rdx)
3328; SCALAR-NEXT:    movq %rdi, 16(%rdx)
3329; SCALAR-NEXT:    movq %rdi, 48(%rdx)
3330; SCALAR-NEXT:    movq %rcx, 40(%rdx)
3331; SCALAR-NEXT:    movq %rax, 32(%rdx)
3332; SCALAR-NEXT:    retq
3333;
3334; SSE2-LABEL: vec384_v3i64:
3335; SSE2:       # %bb.0:
3336; SSE2-NEXT:    movq 16(%rdi), %rax
3337; SSE2-NEXT:    pcmpeqd %xmm0, %xmm0
3338; SSE2-NEXT:    pxor (%rdi), %xmm0
3339; SSE2-NEXT:    movdqa %xmm0, (%rsi)
3340; SSE2-NEXT:    notq %rax
3341; SSE2-NEXT:    movq %rax, 16(%rsi)
3342; SSE2-NEXT:    movq %rax, 16(%rdx)
3343; SSE2-NEXT:    movdqa %xmm0, (%rdx)
3344; SSE2-NEXT:    movq %rax, 48(%rdx)
3345; SSE2-NEXT:    movdqu %xmm0, 32(%rdx)
3346; SSE2-NEXT:    retq
3347;
3348; AVX1-LABEL: vec384_v3i64:
3349; AVX1:       # %bb.0:
3350; AVX1-NEXT:    vxorps %xmm0, %xmm0, %xmm0
3351; AVX1-NEXT:    vcmptrueps %ymm0, %ymm0, %ymm0
3352; AVX1-NEXT:    vxorps (%rdi), %ymm0, %ymm0
3353; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
3354; AVX1-NEXT:    vmovlps %xmm1, 16(%rsi)
3355; AVX1-NEXT:    vmovaps %xmm0, (%rsi)
3356; AVX1-NEXT:    vmovlps %xmm1, 16(%rdx)
3357; AVX1-NEXT:    vmovaps %xmm0, (%rdx)
3358; AVX1-NEXT:    vmovlps %xmm1, 48(%rdx)
3359; AVX1-NEXT:    vmovups %xmm0, 32(%rdx)
3360; AVX1-NEXT:    vzeroupper
3361; AVX1-NEXT:    retq
3362;
3363; AVX2-LABEL: vec384_v3i64:
3364; AVX2:       # %bb.0:
3365; AVX2-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
3366; AVX2-NEXT:    vpxor (%rdi), %ymm0, %ymm0
3367; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
3368; AVX2-NEXT:    vmovq %xmm1, 16(%rsi)
3369; AVX2-NEXT:    vmovdqa %xmm0, (%rsi)
3370; AVX2-NEXT:    vmovq %xmm1, 16(%rdx)
3371; AVX2-NEXT:    vmovdqa %xmm0, (%rdx)
3372; AVX2-NEXT:    vmovq %xmm1, 48(%rdx)
3373; AVX2-NEXT:    vmovdqu %xmm0, 32(%rdx)
3374; AVX2-NEXT:    vzeroupper
3375; AVX2-NEXT:    retq
3376  %in.subvec.not = load <3 x i64>, ptr %in.subvec.ptr, align 64
3377  %in.subvec = xor <3 x i64> %in.subvec.not, <i64 -1, i64 -1, i64 -1>
3378  store <3 x i64> %in.subvec, ptr %out.subvec.ptr, align 64
3379  %out.subvec0.ptr = getelementptr <3 x i64>, ptr %out.vec.ptr, i64 0
3380  store <3 x i64> %in.subvec, ptr %out.subvec0.ptr, align 64
3381  %out.subvec1.ptr = getelementptr <3 x i64>, ptr %out.vec.ptr, i64 1
3382  store <3 x i64> %in.subvec, ptr %out.subvec1.ptr, align 8
3383  ret void
3384}
3385
3386define void @vec384_v3f64(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
3387; SCALAR-LABEL: vec384_v3f64:
3388; SCALAR:       # %bb.0:
3389; SCALAR-NEXT:    movq (%rdi), %rax
3390; SCALAR-NEXT:    movq 8(%rdi), %rcx
3391; SCALAR-NEXT:    movq 16(%rdi), %rdi
3392; SCALAR-NEXT:    notq %rdi
3393; SCALAR-NEXT:    notq %rcx
3394; SCALAR-NEXT:    notq %rax
3395; SCALAR-NEXT:    movq %rax, (%rsi)
3396; SCALAR-NEXT:    movq %rcx, 8(%rsi)
3397; SCALAR-NEXT:    movq %rdi, 16(%rsi)
3398; SCALAR-NEXT:    movq %rax, (%rdx)
3399; SCALAR-NEXT:    movq %rcx, 8(%rdx)
3400; SCALAR-NEXT:    movq %rdi, 16(%rdx)
3401; SCALAR-NEXT:    movq %rdi, 48(%rdx)
3402; SCALAR-NEXT:    movq %rcx, 40(%rdx)
3403; SCALAR-NEXT:    movq %rax, 32(%rdx)
3404; SCALAR-NEXT:    retq
3405;
3406; SSE2-LABEL: vec384_v3f64:
3407; SSE2:       # %bb.0:
3408; SSE2-NEXT:    movq 16(%rdi), %rax
3409; SSE2-NEXT:    pcmpeqd %xmm0, %xmm0
3410; SSE2-NEXT:    pxor (%rdi), %xmm0
3411; SSE2-NEXT:    movdqa %xmm0, (%rsi)
3412; SSE2-NEXT:    notq %rax
3413; SSE2-NEXT:    movq %rax, 16(%rsi)
3414; SSE2-NEXT:    movq %rax, 16(%rdx)
3415; SSE2-NEXT:    movdqa %xmm0, (%rdx)
3416; SSE2-NEXT:    movq %rax, 48(%rdx)
3417; SSE2-NEXT:    movdqu %xmm0, 32(%rdx)
3418; SSE2-NEXT:    retq
3419;
3420; AVX1-LABEL: vec384_v3f64:
3421; AVX1:       # %bb.0:
3422; AVX1-NEXT:    vxorps %xmm0, %xmm0, %xmm0
3423; AVX1-NEXT:    vcmptrueps %ymm0, %ymm0, %ymm0
3424; AVX1-NEXT:    vxorps (%rdi), %ymm0, %ymm0
3425; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
3426; AVX1-NEXT:    vmovlps %xmm1, 16(%rsi)
3427; AVX1-NEXT:    vmovaps %xmm0, (%rsi)
3428; AVX1-NEXT:    vmovlps %xmm1, 16(%rdx)
3429; AVX1-NEXT:    vmovaps %xmm0, (%rdx)
3430; AVX1-NEXT:    vmovlps %xmm1, 48(%rdx)
3431; AVX1-NEXT:    vmovups %xmm0, 32(%rdx)
3432; AVX1-NEXT:    vzeroupper
3433; AVX1-NEXT:    retq
3434;
3435; AVX2-LABEL: vec384_v3f64:
3436; AVX2:       # %bb.0:
3437; AVX2-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
3438; AVX2-NEXT:    vpxor (%rdi), %ymm0, %ymm0
3439; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
3440; AVX2-NEXT:    vmovq %xmm1, 16(%rsi)
3441; AVX2-NEXT:    vmovdqa %xmm0, (%rsi)
3442; AVX2-NEXT:    vmovq %xmm1, 16(%rdx)
3443; AVX2-NEXT:    vmovdqa %xmm0, (%rdx)
3444; AVX2-NEXT:    vmovq %xmm1, 48(%rdx)
3445; AVX2-NEXT:    vmovdqu %xmm0, 32(%rdx)
3446; AVX2-NEXT:    vzeroupper
3447; AVX2-NEXT:    retq
3448  %in.subvec.not = load <3 x i64>, ptr %in.subvec.ptr, align 64
3449  %in.subvec.int = xor <3 x i64> %in.subvec.not, <i64 -1, i64 -1, i64 -1>
3450  %in.subvec = bitcast <3 x i64> %in.subvec.int to <3 x double>
3451  store <3 x double> %in.subvec, ptr %out.subvec.ptr, align 64
3452  %out.subvec0.ptr = getelementptr <3 x double>, ptr %out.vec.ptr, i64 0
3453  store <3 x double> %in.subvec, ptr %out.subvec0.ptr, align 64
3454  %out.subvec1.ptr = getelementptr <3 x double>, ptr %out.vec.ptr, i64 1
3455  store <3 x double> %in.subvec, ptr %out.subvec1.ptr, align 8
3456  ret void
3457}
3458
3459define void @vec384_v4i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
3460; SCALAR-LABEL: vec384_v4i8:
3461; SCALAR:       # %bb.0:
3462; SCALAR-NEXT:    movzbl 3(%rdi), %r8d
3463; SCALAR-NEXT:    movzbl 2(%rdi), %ecx
3464; SCALAR-NEXT:    movzbl (%rdi), %eax
3465; SCALAR-NEXT:    movzbl 1(%rdi), %edi
3466; SCALAR-NEXT:    notb %al
3467; SCALAR-NEXT:    notb %dil
3468; SCALAR-NEXT:    notb %cl
3469; SCALAR-NEXT:    notb %r8b
3470; SCALAR-NEXT:    movb %r8b, 3(%rsi)
3471; SCALAR-NEXT:    movb %cl, 2(%rsi)
3472; SCALAR-NEXT:    movb %dil, 1(%rsi)
3473; SCALAR-NEXT:    movb %al, (%rsi)
3474; SCALAR-NEXT:    movb %r8b, 3(%rdx)
3475; SCALAR-NEXT:    movb %cl, 2(%rdx)
3476; SCALAR-NEXT:    movb %dil, 1(%rdx)
3477; SCALAR-NEXT:    movb %al, (%rdx)
3478; SCALAR-NEXT:    movb %r8b, 7(%rdx)
3479; SCALAR-NEXT:    movb %cl, 6(%rdx)
3480; SCALAR-NEXT:    movb %dil, 5(%rdx)
3481; SCALAR-NEXT:    movb %al, 4(%rdx)
3482; SCALAR-NEXT:    movb %r8b, 11(%rdx)
3483; SCALAR-NEXT:    movb %cl, 10(%rdx)
3484; SCALAR-NEXT:    movb %dil, 9(%rdx)
3485; SCALAR-NEXT:    movb %al, 8(%rdx)
3486; SCALAR-NEXT:    movb %r8b, 15(%rdx)
3487; SCALAR-NEXT:    movb %cl, 14(%rdx)
3488; SCALAR-NEXT:    movb %dil, 13(%rdx)
3489; SCALAR-NEXT:    movb %al, 12(%rdx)
3490; SCALAR-NEXT:    movb %r8b, 19(%rdx)
3491; SCALAR-NEXT:    movb %cl, 18(%rdx)
3492; SCALAR-NEXT:    movb %dil, 17(%rdx)
3493; SCALAR-NEXT:    movb %al, 16(%rdx)
3494; SCALAR-NEXT:    movb %r8b, 23(%rdx)
3495; SCALAR-NEXT:    movb %cl, 22(%rdx)
3496; SCALAR-NEXT:    movb %dil, 21(%rdx)
3497; SCALAR-NEXT:    movb %al, 20(%rdx)
3498; SCALAR-NEXT:    movb %r8b, 27(%rdx)
3499; SCALAR-NEXT:    movb %cl, 26(%rdx)
3500; SCALAR-NEXT:    movb %dil, 25(%rdx)
3501; SCALAR-NEXT:    movb %al, 24(%rdx)
3502; SCALAR-NEXT:    movb %r8b, 31(%rdx)
3503; SCALAR-NEXT:    movb %cl, 30(%rdx)
3504; SCALAR-NEXT:    movb %dil, 29(%rdx)
3505; SCALAR-NEXT:    movb %al, 28(%rdx)
3506; SCALAR-NEXT:    movb %r8b, 35(%rdx)
3507; SCALAR-NEXT:    movb %cl, 34(%rdx)
3508; SCALAR-NEXT:    movb %dil, 33(%rdx)
3509; SCALAR-NEXT:    movb %al, 32(%rdx)
3510; SCALAR-NEXT:    movb %r8b, 39(%rdx)
3511; SCALAR-NEXT:    movb %cl, 38(%rdx)
3512; SCALAR-NEXT:    movb %dil, 37(%rdx)
3513; SCALAR-NEXT:    movb %al, 36(%rdx)
3514; SCALAR-NEXT:    movb %r8b, 43(%rdx)
3515; SCALAR-NEXT:    movb %cl, 42(%rdx)
3516; SCALAR-NEXT:    movb %dil, 41(%rdx)
3517; SCALAR-NEXT:    movb %al, 40(%rdx)
3518; SCALAR-NEXT:    movb %r8b, 47(%rdx)
3519; SCALAR-NEXT:    movb %cl, 46(%rdx)
3520; SCALAR-NEXT:    movb %dil, 45(%rdx)
3521; SCALAR-NEXT:    movb %al, 44(%rdx)
3522; SCALAR-NEXT:    retq
3523;
3524; SSE2-LABEL: vec384_v4i8:
3525; SSE2:       # %bb.0:
3526; SSE2-NEXT:    pcmpeqd %xmm0, %xmm0
3527; SSE2-NEXT:    pxor (%rdi), %xmm0
3528; SSE2-NEXT:    movd %xmm0, (%rsi)
3529; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
3530; SSE2-NEXT:    movdqa %xmm0, (%rdx)
3531; SSE2-NEXT:    movdqa %xmm0, 16(%rdx)
3532; SSE2-NEXT:    movdqa %xmm0, 32(%rdx)
3533; SSE2-NEXT:    retq
3534;
3535; AVX1-LABEL: vec384_v4i8:
3536; AVX1:       # %bb.0:
3537; AVX1-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
3538; AVX1-NEXT:    vpxor (%rdi), %xmm0, %xmm0
3539; AVX1-NEXT:    vmovd %xmm0, (%rsi)
3540; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
3541; AVX1-NEXT:    vmovdqa %xmm0, 16(%rdx)
3542; AVX1-NEXT:    vmovdqa %xmm0, (%rdx)
3543; AVX1-NEXT:    vmovdqa %xmm0, 32(%rdx)
3544; AVX1-NEXT:    retq
3545;
3546; AVX2-LABEL: vec384_v4i8:
3547; AVX2:       # %bb.0:
3548; AVX2-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
3549; AVX2-NEXT:    vpxor (%rdi), %xmm0, %xmm0
3550; AVX2-NEXT:    vmovd %xmm0, (%rsi)
3551; AVX2-NEXT:    vpbroadcastd %xmm0, %ymm0
3552; AVX2-NEXT:    vmovdqa %ymm0, (%rdx)
3553; AVX2-NEXT:    vmovdqa %xmm0, 32(%rdx)
3554; AVX2-NEXT:    vzeroupper
3555; AVX2-NEXT:    retq
3556  %in.subvec.not = load <4 x i8>, ptr %in.subvec.ptr, align 64
3557  %in.subvec = xor <4 x i8> %in.subvec.not, <i8 -1, i8 -1, i8 -1, i8 -1>
3558  store <4 x i8> %in.subvec, ptr %out.subvec.ptr, align 64
3559  %out.subvec0.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 0
3560  store <4 x i8> %in.subvec, ptr %out.subvec0.ptr, align 64
3561  %out.subvec1.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 1
3562  store <4 x i8> %in.subvec, ptr %out.subvec1.ptr, align 4
3563  %out.subvec2.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 2
3564  store <4 x i8> %in.subvec, ptr %out.subvec2.ptr, align 8
3565  %out.subvec3.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 3
3566  store <4 x i8> %in.subvec, ptr %out.subvec3.ptr, align 4
3567  %out.subvec4.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 4
3568  store <4 x i8> %in.subvec, ptr %out.subvec4.ptr, align 16
3569  %out.subvec5.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 5
3570  store <4 x i8> %in.subvec, ptr %out.subvec5.ptr, align 4
3571  %out.subvec6.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 6
3572  store <4 x i8> %in.subvec, ptr %out.subvec6.ptr, align 8
3573  %out.subvec7.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 7
3574  store <4 x i8> %in.subvec, ptr %out.subvec7.ptr, align 4
3575  %out.subvec8.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 8
3576  store <4 x i8> %in.subvec, ptr %out.subvec8.ptr, align 32
3577  %out.subvec9.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 9
3578  store <4 x i8> %in.subvec, ptr %out.subvec9.ptr, align 4
3579  %out.subvec10.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 10
3580  store <4 x i8> %in.subvec, ptr %out.subvec10.ptr, align 8
3581  %out.subvec11.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 11
3582  store <4 x i8> %in.subvec, ptr %out.subvec11.ptr, align 4
3583  ret void
3584}
3585
3586define void @vec384_v4i16(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
3587; SCALAR-LABEL: vec384_v4i16:
3588; SCALAR:       # %bb.0:
3589; SCALAR-NEXT:    movzwl 6(%rdi), %r8d
3590; SCALAR-NEXT:    movzwl 2(%rdi), %ecx
3591; SCALAR-NEXT:    movl (%rdi), %eax
3592; SCALAR-NEXT:    movl 4(%rdi), %edi
3593; SCALAR-NEXT:    notl %eax
3594; SCALAR-NEXT:    notl %ecx
3595; SCALAR-NEXT:    notl %edi
3596; SCALAR-NEXT:    notl %r8d
3597; SCALAR-NEXT:    movw %r8w, 6(%rsi)
3598; SCALAR-NEXT:    movw %di, 4(%rsi)
3599; SCALAR-NEXT:    movw %cx, 2(%rsi)
3600; SCALAR-NEXT:    movw %ax, (%rsi)
3601; SCALAR-NEXT:    movw %r8w, 6(%rdx)
3602; SCALAR-NEXT:    movw %di, 4(%rdx)
3603; SCALAR-NEXT:    movw %cx, 2(%rdx)
3604; SCALAR-NEXT:    movw %ax, (%rdx)
3605; SCALAR-NEXT:    movw %r8w, 14(%rdx)
3606; SCALAR-NEXT:    movw %di, 12(%rdx)
3607; SCALAR-NEXT:    movw %cx, 10(%rdx)
3608; SCALAR-NEXT:    movw %ax, 8(%rdx)
3609; SCALAR-NEXT:    movw %r8w, 22(%rdx)
3610; SCALAR-NEXT:    movw %di, 20(%rdx)
3611; SCALAR-NEXT:    movw %cx, 18(%rdx)
3612; SCALAR-NEXT:    movw %ax, 16(%rdx)
3613; SCALAR-NEXT:    movw %r8w, 30(%rdx)
3614; SCALAR-NEXT:    movw %di, 28(%rdx)
3615; SCALAR-NEXT:    movw %cx, 26(%rdx)
3616; SCALAR-NEXT:    movw %ax, 24(%rdx)
3617; SCALAR-NEXT:    movw %r8w, 38(%rdx)
3618; SCALAR-NEXT:    movw %di, 36(%rdx)
3619; SCALAR-NEXT:    movw %cx, 34(%rdx)
3620; SCALAR-NEXT:    movw %ax, 32(%rdx)
3621; SCALAR-NEXT:    movw %r8w, 46(%rdx)
3622; SCALAR-NEXT:    movw %di, 44(%rdx)
3623; SCALAR-NEXT:    movw %cx, 42(%rdx)
3624; SCALAR-NEXT:    movw %ax, 40(%rdx)
3625; SCALAR-NEXT:    retq
3626;
3627; SSE2-LABEL: vec384_v4i16:
3628; SSE2:       # %bb.0:
3629; SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
3630; SSE2-NEXT:    pcmpeqd %xmm1, %xmm1
3631; SSE2-NEXT:    pxor %xmm0, %xmm1
3632; SSE2-NEXT:    movq %xmm1, (%rsi)
3633; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,1,0,1]
3634; SSE2-NEXT:    movdqa %xmm0, (%rdx)
3635; SSE2-NEXT:    movdqa %xmm0, 16(%rdx)
3636; SSE2-NEXT:    movdqa %xmm0, 32(%rdx)
3637; SSE2-NEXT:    retq
3638;
3639; AVX1-LABEL: vec384_v4i16:
3640; AVX1:       # %bb.0:
3641; AVX1-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
3642; AVX1-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
3643; AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
3644; AVX1-NEXT:    vmovq %xmm0, (%rsi)
3645; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
3646; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm1
3647; AVX1-NEXT:    vmovaps %ymm1, (%rdx)
3648; AVX1-NEXT:    vmovdqa %xmm0, 32(%rdx)
3649; AVX1-NEXT:    vzeroupper
3650; AVX1-NEXT:    retq
3651;
3652; AVX2-ONLY-LABEL: vec384_v4i16:
3653; AVX2-ONLY:       # %bb.0:
3654; AVX2-ONLY-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
3655; AVX2-ONLY-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
3656; AVX2-ONLY-NEXT:    vpxor %xmm1, %xmm0, %xmm0
3657; AVX2-ONLY-NEXT:    vmovq %xmm0, (%rsi)
3658; AVX2-ONLY-NEXT:    vpbroadcastq %xmm0, %ymm0
3659; AVX2-ONLY-NEXT:    vmovdqa %ymm0, (%rdx)
3660; AVX2-ONLY-NEXT:    vmovdqa %xmm0, 32(%rdx)
3661; AVX2-ONLY-NEXT:    vzeroupper
3662; AVX2-ONLY-NEXT:    retq
3663;
3664; AVX512-LABEL: vec384_v4i16:
3665; AVX512:       # %bb.0:
3666; AVX512-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
3667; AVX512-NEXT:    vpternlogq {{.*#+}} xmm0 = ~xmm0
3668; AVX512-NEXT:    vmovq %xmm0, (%rsi)
3669; AVX512-NEXT:    vpbroadcastq %xmm0, %ymm0
3670; AVX512-NEXT:    vmovdqa %ymm0, (%rdx)
3671; AVX512-NEXT:    vmovdqa %xmm0, 32(%rdx)
3672; AVX512-NEXT:    vzeroupper
3673; AVX512-NEXT:    retq
3674  %in.subvec.not = load <4 x i16>, ptr %in.subvec.ptr, align 64
3675  %in.subvec = xor <4 x i16> %in.subvec.not, <i16 -1, i16 -1, i16 -1, i16 -1>
3676  store <4 x i16> %in.subvec, ptr %out.subvec.ptr, align 64
3677  %out.subvec0.ptr = getelementptr <4 x i16>, ptr %out.vec.ptr, i64 0
3678  store <4 x i16> %in.subvec, ptr %out.subvec0.ptr, align 64
3679  %out.subvec1.ptr = getelementptr <4 x i16>, ptr %out.vec.ptr, i64 1
3680  store <4 x i16> %in.subvec, ptr %out.subvec1.ptr, align 8
3681  %out.subvec2.ptr = getelementptr <4 x i16>, ptr %out.vec.ptr, i64 2
3682  store <4 x i16> %in.subvec, ptr %out.subvec2.ptr, align 16
3683  %out.subvec3.ptr = getelementptr <4 x i16>, ptr %out.vec.ptr, i64 3
3684  store <4 x i16> %in.subvec, ptr %out.subvec3.ptr, align 8
3685  %out.subvec4.ptr = getelementptr <4 x i16>, ptr %out.vec.ptr, i64 4
3686  store <4 x i16> %in.subvec, ptr %out.subvec4.ptr, align 32
3687  %out.subvec5.ptr = getelementptr <4 x i16>, ptr %out.vec.ptr, i64 5
3688  store <4 x i16> %in.subvec, ptr %out.subvec5.ptr, align 8
3689  ret void
3690}
3691
3692define void @vec384_v4i32(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
3693; SCALAR-LABEL: vec384_v4i32:
3694; SCALAR:       # %bb.0:
3695; SCALAR-NEXT:    movaps (%rdi), %xmm0
3696; SCALAR-NEXT:    xorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
3697; SCALAR-NEXT:    movaps %xmm0, (%rsi)
3698; SCALAR-NEXT:    movaps %xmm0, (%rdx)
3699; SCALAR-NEXT:    movaps %xmm0, 16(%rdx)
3700; SCALAR-NEXT:    movaps %xmm0, 32(%rdx)
3701; SCALAR-NEXT:    retq
3702;
3703; SSE2-LABEL: vec384_v4i32:
3704; SSE2:       # %bb.0:
3705; SSE2-NEXT:    pcmpeqd %xmm0, %xmm0
3706; SSE2-NEXT:    pxor (%rdi), %xmm0
3707; SSE2-NEXT:    movdqa %xmm0, (%rsi)
3708; SSE2-NEXT:    movdqa %xmm0, (%rdx)
3709; SSE2-NEXT:    movdqa %xmm0, 16(%rdx)
3710; SSE2-NEXT:    movdqa %xmm0, 32(%rdx)
3711; SSE2-NEXT:    retq
3712;
3713; AVX-LABEL: vec384_v4i32:
3714; AVX:       # %bb.0:
3715; AVX-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
3716; AVX-NEXT:    vpxor (%rdi), %xmm0, %xmm0
3717; AVX-NEXT:    vmovdqa %xmm0, (%rsi)
3718; AVX-NEXT:    vmovdqa %xmm0, (%rdx)
3719; AVX-NEXT:    vmovdqa %xmm0, 16(%rdx)
3720; AVX-NEXT:    vmovdqa %xmm0, 32(%rdx)
3721; AVX-NEXT:    retq
3722  %in.subvec.not = load <4 x i32>, ptr %in.subvec.ptr, align 64
3723  %in.subvec = xor <4 x i32> %in.subvec.not, <i32 -1, i32 -1, i32 -1, i32 -1>
3724  store <4 x i32> %in.subvec, ptr %out.subvec.ptr, align 64
3725  %out.subvec0.ptr = getelementptr <4 x i32>, ptr %out.vec.ptr, i64 0
3726  store <4 x i32> %in.subvec, ptr %out.subvec0.ptr, align 64
3727  %out.subvec1.ptr = getelementptr <4 x i32>, ptr %out.vec.ptr, i64 1
3728  store <4 x i32> %in.subvec, ptr %out.subvec1.ptr, align 16
3729  %out.subvec2.ptr = getelementptr <4 x i32>, ptr %out.vec.ptr, i64 2
3730  store <4 x i32> %in.subvec, ptr %out.subvec2.ptr, align 32
3731  ret void
3732}
3733
3734define void @vec384_v4f32(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
3735; SCALAR-LABEL: vec384_v4f32:
3736; SCALAR:       # %bb.0:
3737; SCALAR-NEXT:    movaps (%rdi), %xmm0
3738; SCALAR-NEXT:    xorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
3739; SCALAR-NEXT:    movaps %xmm0, (%rsi)
3740; SCALAR-NEXT:    movaps %xmm0, (%rdx)
3741; SCALAR-NEXT:    movaps %xmm0, 16(%rdx)
3742; SCALAR-NEXT:    movaps %xmm0, 32(%rdx)
3743; SCALAR-NEXT:    retq
3744;
3745; SSE2-LABEL: vec384_v4f32:
3746; SSE2:       # %bb.0:
3747; SSE2-NEXT:    pcmpeqd %xmm0, %xmm0
3748; SSE2-NEXT:    pxor (%rdi), %xmm0
3749; SSE2-NEXT:    movdqa %xmm0, (%rsi)
3750; SSE2-NEXT:    movdqa %xmm0, (%rdx)
3751; SSE2-NEXT:    movdqa %xmm0, 16(%rdx)
3752; SSE2-NEXT:    movdqa %xmm0, 32(%rdx)
3753; SSE2-NEXT:    retq
3754;
3755; AVX-LABEL: vec384_v4f32:
3756; AVX:       # %bb.0:
3757; AVX-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
3758; AVX-NEXT:    vpxor (%rdi), %xmm0, %xmm0
3759; AVX-NEXT:    vmovdqa %xmm0, (%rsi)
3760; AVX-NEXT:    vmovdqa %xmm0, (%rdx)
3761; AVX-NEXT:    vmovdqa %xmm0, 16(%rdx)
3762; AVX-NEXT:    vmovdqa %xmm0, 32(%rdx)
3763; AVX-NEXT:    retq
3764  %in.subvec.not = load <4 x i32>, ptr %in.subvec.ptr, align 64
3765  %in.subvec.int = xor <4 x i32> %in.subvec.not, <i32 -1, i32 -1, i32 -1, i32 -1>
3766  %in.subvec = bitcast <4 x i32> %in.subvec.int to <4 x float>
3767  store <4 x float> %in.subvec, ptr %out.subvec.ptr, align 64
3768  %out.subvec0.ptr = getelementptr <4 x float>, ptr %out.vec.ptr, i64 0
3769  store <4 x float> %in.subvec, ptr %out.subvec0.ptr, align 64
3770  %out.subvec1.ptr = getelementptr <4 x float>, ptr %out.vec.ptr, i64 1
3771  store <4 x float> %in.subvec, ptr %out.subvec1.ptr, align 16
3772  %out.subvec2.ptr = getelementptr <4 x float>, ptr %out.vec.ptr, i64 2
3773  store <4 x float> %in.subvec, ptr %out.subvec2.ptr, align 32
3774  ret void
3775}
3776
3777define void @vec384_v6i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
3778; SCALAR-LABEL: vec384_v6i8:
3779; SCALAR:       # %bb.0:
3780; SCALAR-NEXT:    movq (%rdi), %rax
3781; SCALAR-NEXT:    movq %rax, %rcx
3782; SCALAR-NEXT:    shrq $32, %rcx
3783; SCALAR-NEXT:    notl %ecx
3784; SCALAR-NEXT:    notl %eax
3785; SCALAR-NEXT:    movl %eax, (%rsi)
3786; SCALAR-NEXT:    movw %cx, 4(%rsi)
3787; SCALAR-NEXT:    movw %cx, 4(%rdx)
3788; SCALAR-NEXT:    movl %eax, (%rdx)
3789; SCALAR-NEXT:    movw %cx, 12(%rdx)
3790; SCALAR-NEXT:    movl %eax, 8(%rdx)
3791; SCALAR-NEXT:    movw %cx, 20(%rdx)
3792; SCALAR-NEXT:    movl %eax, 16(%rdx)
3793; SCALAR-NEXT:    movw %cx, 28(%rdx)
3794; SCALAR-NEXT:    movl %eax, 24(%rdx)
3795; SCALAR-NEXT:    movw %cx, 36(%rdx)
3796; SCALAR-NEXT:    movl %eax, 32(%rdx)
3797; SCALAR-NEXT:    movw %cx, 44(%rdx)
3798; SCALAR-NEXT:    movl %eax, 40(%rdx)
3799; SCALAR-NEXT:    movw %cx, 52(%rdx)
3800; SCALAR-NEXT:    movl %eax, 48(%rdx)
3801; SCALAR-NEXT:    movw %cx, 60(%rdx)
3802; SCALAR-NEXT:    movl %eax, 56(%rdx)
3803; SCALAR-NEXT:    retq
3804;
3805; SSE2-ONLY-LABEL: vec384_v6i8:
3806; SSE2-ONLY:       # %bb.0:
3807; SSE2-ONLY-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
3808; SSE2-ONLY-NEXT:    pcmpeqd %xmm1, %xmm1
3809; SSE2-ONLY-NEXT:    pxor %xmm0, %xmm1
3810; SSE2-ONLY-NEXT:    movd %xmm1, (%rsi)
3811; SSE2-ONLY-NEXT:    pextrw $2, %xmm1, %eax
3812; SSE2-ONLY-NEXT:    movw %ax, 4(%rsi)
3813; SSE2-ONLY-NEXT:    movw %ax, 4(%rdx)
3814; SSE2-ONLY-NEXT:    movd %xmm1, (%rdx)
3815; SSE2-ONLY-NEXT:    movw %ax, 12(%rdx)
3816; SSE2-ONLY-NEXT:    movd %xmm1, 8(%rdx)
3817; SSE2-ONLY-NEXT:    movw %ax, 20(%rdx)
3818; SSE2-ONLY-NEXT:    movd %xmm1, 16(%rdx)
3819; SSE2-ONLY-NEXT:    movw %ax, 28(%rdx)
3820; SSE2-ONLY-NEXT:    movd %xmm1, 24(%rdx)
3821; SSE2-ONLY-NEXT:    movw %ax, 36(%rdx)
3822; SSE2-ONLY-NEXT:    movd %xmm1, 32(%rdx)
3823; SSE2-ONLY-NEXT:    movw %ax, 44(%rdx)
3824; SSE2-ONLY-NEXT:    movd %xmm1, 40(%rdx)
3825; SSE2-ONLY-NEXT:    movw %ax, 52(%rdx)
3826; SSE2-ONLY-NEXT:    movd %xmm1, 48(%rdx)
3827; SSE2-ONLY-NEXT:    movw %ax, 60(%rdx)
3828; SSE2-ONLY-NEXT:    movd %xmm1, 56(%rdx)
3829; SSE2-ONLY-NEXT:    retq
3830;
3831; SSE3-LABEL: vec384_v6i8:
3832; SSE3:       # %bb.0:
3833; SSE3-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
3834; SSE3-NEXT:    pcmpeqd %xmm1, %xmm1
3835; SSE3-NEXT:    pxor %xmm0, %xmm1
3836; SSE3-NEXT:    movd %xmm1, (%rsi)
3837; SSE3-NEXT:    pextrw $2, %xmm1, %eax
3838; SSE3-NEXT:    movw %ax, 4(%rsi)
3839; SSE3-NEXT:    movw %ax, 4(%rdx)
3840; SSE3-NEXT:    movd %xmm1, (%rdx)
3841; SSE3-NEXT:    movw %ax, 12(%rdx)
3842; SSE3-NEXT:    movd %xmm1, 8(%rdx)
3843; SSE3-NEXT:    movw %ax, 20(%rdx)
3844; SSE3-NEXT:    movd %xmm1, 16(%rdx)
3845; SSE3-NEXT:    movw %ax, 28(%rdx)
3846; SSE3-NEXT:    movd %xmm1, 24(%rdx)
3847; SSE3-NEXT:    movw %ax, 36(%rdx)
3848; SSE3-NEXT:    movd %xmm1, 32(%rdx)
3849; SSE3-NEXT:    movw %ax, 44(%rdx)
3850; SSE3-NEXT:    movd %xmm1, 40(%rdx)
3851; SSE3-NEXT:    movw %ax, 52(%rdx)
3852; SSE3-NEXT:    movd %xmm1, 48(%rdx)
3853; SSE3-NEXT:    movw %ax, 60(%rdx)
3854; SSE3-NEXT:    movd %xmm1, 56(%rdx)
3855; SSE3-NEXT:    retq
3856;
3857; SSSE3-ONLY-LABEL: vec384_v6i8:
3858; SSSE3-ONLY:       # %bb.0:
3859; SSSE3-ONLY-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
3860; SSSE3-ONLY-NEXT:    pcmpeqd %xmm1, %xmm1
3861; SSSE3-ONLY-NEXT:    pxor %xmm0, %xmm1
3862; SSSE3-ONLY-NEXT:    movd %xmm1, (%rsi)
3863; SSSE3-ONLY-NEXT:    pextrw $2, %xmm1, %eax
3864; SSSE3-ONLY-NEXT:    movw %ax, 4(%rsi)
3865; SSSE3-ONLY-NEXT:    movw %ax, 4(%rdx)
3866; SSSE3-ONLY-NEXT:    movd %xmm1, (%rdx)
3867; SSSE3-ONLY-NEXT:    movw %ax, 12(%rdx)
3868; SSSE3-ONLY-NEXT:    movd %xmm1, 8(%rdx)
3869; SSSE3-ONLY-NEXT:    movw %ax, 20(%rdx)
3870; SSSE3-ONLY-NEXT:    movd %xmm1, 16(%rdx)
3871; SSSE3-ONLY-NEXT:    movw %ax, 28(%rdx)
3872; SSSE3-ONLY-NEXT:    movd %xmm1, 24(%rdx)
3873; SSSE3-ONLY-NEXT:    movw %ax, 36(%rdx)
3874; SSSE3-ONLY-NEXT:    movd %xmm1, 32(%rdx)
3875; SSSE3-ONLY-NEXT:    movw %ax, 44(%rdx)
3876; SSSE3-ONLY-NEXT:    movd %xmm1, 40(%rdx)
3877; SSSE3-ONLY-NEXT:    movw %ax, 52(%rdx)
3878; SSSE3-ONLY-NEXT:    movd %xmm1, 48(%rdx)
3879; SSSE3-ONLY-NEXT:    movw %ax, 60(%rdx)
3880; SSSE3-ONLY-NEXT:    movd %xmm1, 56(%rdx)
3881; SSSE3-ONLY-NEXT:    retq
3882;
3883; SSE41-LABEL: vec384_v6i8:
3884; SSE41:       # %bb.0:
3885; SSE41-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
3886; SSE41-NEXT:    pcmpeqd %xmm1, %xmm1
3887; SSE41-NEXT:    pxor %xmm0, %xmm1
3888; SSE41-NEXT:    pextrw $2, %xmm1, 4(%rsi)
3889; SSE41-NEXT:    movd %xmm1, (%rsi)
3890; SSE41-NEXT:    pextrw $2, %xmm1, 4(%rdx)
3891; SSE41-NEXT:    movd %xmm1, (%rdx)
3892; SSE41-NEXT:    pextrw $2, %xmm1, 12(%rdx)
3893; SSE41-NEXT:    movd %xmm1, 8(%rdx)
3894; SSE41-NEXT:    pextrw $2, %xmm1, 20(%rdx)
3895; SSE41-NEXT:    movd %xmm1, 16(%rdx)
3896; SSE41-NEXT:    pextrw $2, %xmm1, 28(%rdx)
3897; SSE41-NEXT:    movd %xmm1, 24(%rdx)
3898; SSE41-NEXT:    pextrw $2, %xmm1, 36(%rdx)
3899; SSE41-NEXT:    movd %xmm1, 32(%rdx)
3900; SSE41-NEXT:    pextrw $2, %xmm1, 44(%rdx)
3901; SSE41-NEXT:    movd %xmm1, 40(%rdx)
3902; SSE41-NEXT:    pextrw $2, %xmm1, 52(%rdx)
3903; SSE41-NEXT:    movd %xmm1, 48(%rdx)
3904; SSE41-NEXT:    pextrw $2, %xmm1, 60(%rdx)
3905; SSE41-NEXT:    movd %xmm1, 56(%rdx)
3906; SSE41-NEXT:    retq
3907;
3908; SSE42-LABEL: vec384_v6i8:
3909; SSE42:       # %bb.0:
3910; SSE42-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
3911; SSE42-NEXT:    pcmpeqd %xmm1, %xmm1
3912; SSE42-NEXT:    pxor %xmm0, %xmm1
3913; SSE42-NEXT:    pextrw $2, %xmm1, 4(%rsi)
3914; SSE42-NEXT:    movd %xmm1, (%rsi)
3915; SSE42-NEXT:    pextrw $2, %xmm1, 4(%rdx)
3916; SSE42-NEXT:    movd %xmm1, (%rdx)
3917; SSE42-NEXT:    pextrw $2, %xmm1, 12(%rdx)
3918; SSE42-NEXT:    movd %xmm1, 8(%rdx)
3919; SSE42-NEXT:    pextrw $2, %xmm1, 20(%rdx)
3920; SSE42-NEXT:    movd %xmm1, 16(%rdx)
3921; SSE42-NEXT:    pextrw $2, %xmm1, 28(%rdx)
3922; SSE42-NEXT:    movd %xmm1, 24(%rdx)
3923; SSE42-NEXT:    pextrw $2, %xmm1, 36(%rdx)
3924; SSE42-NEXT:    movd %xmm1, 32(%rdx)
3925; SSE42-NEXT:    pextrw $2, %xmm1, 44(%rdx)
3926; SSE42-NEXT:    movd %xmm1, 40(%rdx)
3927; SSE42-NEXT:    pextrw $2, %xmm1, 52(%rdx)
3928; SSE42-NEXT:    movd %xmm1, 48(%rdx)
3929; SSE42-NEXT:    pextrw $2, %xmm1, 60(%rdx)
3930; SSE42-NEXT:    movd %xmm1, 56(%rdx)
3931; SSE42-NEXT:    retq
3932;
3933; AVX1-LABEL: vec384_v6i8:
3934; AVX1:       # %bb.0:
3935; AVX1-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
3936; AVX1-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
3937; AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
3938; AVX1-NEXT:    vpextrw $2, %xmm0, 4(%rsi)
3939; AVX1-NEXT:    vmovd %xmm0, (%rsi)
3940; AVX1-NEXT:    vpextrw $2, %xmm0, 4(%rdx)
3941; AVX1-NEXT:    vmovd %xmm0, (%rdx)
3942; AVX1-NEXT:    vpextrw $2, %xmm0, 12(%rdx)
3943; AVX1-NEXT:    vmovd %xmm0, 8(%rdx)
3944; AVX1-NEXT:    vpextrw $2, %xmm0, 20(%rdx)
3945; AVX1-NEXT:    vmovd %xmm0, 16(%rdx)
3946; AVX1-NEXT:    vpextrw $2, %xmm0, 28(%rdx)
3947; AVX1-NEXT:    vmovd %xmm0, 24(%rdx)
3948; AVX1-NEXT:    vpextrw $2, %xmm0, 36(%rdx)
3949; AVX1-NEXT:    vmovd %xmm0, 32(%rdx)
3950; AVX1-NEXT:    vpextrw $2, %xmm0, 44(%rdx)
3951; AVX1-NEXT:    vmovd %xmm0, 40(%rdx)
3952; AVX1-NEXT:    vpextrw $2, %xmm0, 52(%rdx)
3953; AVX1-NEXT:    vmovd %xmm0, 48(%rdx)
3954; AVX1-NEXT:    vpextrw $2, %xmm0, 60(%rdx)
3955; AVX1-NEXT:    vmovd %xmm0, 56(%rdx)
3956; AVX1-NEXT:    retq
3957;
3958; AVX2-ONLY-LABEL: vec384_v6i8:
3959; AVX2-ONLY:       # %bb.0:
3960; AVX2-ONLY-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
3961; AVX2-ONLY-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
3962; AVX2-ONLY-NEXT:    vpxor %xmm1, %xmm0, %xmm0
3963; AVX2-ONLY-NEXT:    vpextrw $2, %xmm0, 4(%rsi)
3964; AVX2-ONLY-NEXT:    vmovd %xmm0, (%rsi)
3965; AVX2-ONLY-NEXT:    vpextrw $2, %xmm0, 4(%rdx)
3966; AVX2-ONLY-NEXT:    vmovd %xmm0, (%rdx)
3967; AVX2-ONLY-NEXT:    vpextrw $2, %xmm0, 12(%rdx)
3968; AVX2-ONLY-NEXT:    vmovd %xmm0, 8(%rdx)
3969; AVX2-ONLY-NEXT:    vpextrw $2, %xmm0, 20(%rdx)
3970; AVX2-ONLY-NEXT:    vmovd %xmm0, 16(%rdx)
3971; AVX2-ONLY-NEXT:    vpextrw $2, %xmm0, 28(%rdx)
3972; AVX2-ONLY-NEXT:    vmovd %xmm0, 24(%rdx)
3973; AVX2-ONLY-NEXT:    vpextrw $2, %xmm0, 36(%rdx)
3974; AVX2-ONLY-NEXT:    vmovd %xmm0, 32(%rdx)
3975; AVX2-ONLY-NEXT:    vpextrw $2, %xmm0, 44(%rdx)
3976; AVX2-ONLY-NEXT:    vmovd %xmm0, 40(%rdx)
3977; AVX2-ONLY-NEXT:    vpextrw $2, %xmm0, 52(%rdx)
3978; AVX2-ONLY-NEXT:    vmovd %xmm0, 48(%rdx)
3979; AVX2-ONLY-NEXT:    vpextrw $2, %xmm0, 60(%rdx)
3980; AVX2-ONLY-NEXT:    vmovd %xmm0, 56(%rdx)
3981; AVX2-ONLY-NEXT:    retq
3982;
3983; AVX512-LABEL: vec384_v6i8:
3984; AVX512:       # %bb.0:
3985; AVX512-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
3986; AVX512-NEXT:    vpternlogq {{.*#+}} xmm0 = ~xmm0
3987; AVX512-NEXT:    vpextrw $2, %xmm0, 4(%rsi)
3988; AVX512-NEXT:    vmovd %xmm0, (%rsi)
3989; AVX512-NEXT:    vpextrw $2, %xmm0, 4(%rdx)
3990; AVX512-NEXT:    vmovd %xmm0, (%rdx)
3991; AVX512-NEXT:    vpextrw $2, %xmm0, 12(%rdx)
3992; AVX512-NEXT:    vmovd %xmm0, 8(%rdx)
3993; AVX512-NEXT:    vpextrw $2, %xmm0, 20(%rdx)
3994; AVX512-NEXT:    vmovd %xmm0, 16(%rdx)
3995; AVX512-NEXT:    vpextrw $2, %xmm0, 28(%rdx)
3996; AVX512-NEXT:    vmovd %xmm0, 24(%rdx)
3997; AVX512-NEXT:    vpextrw $2, %xmm0, 36(%rdx)
3998; AVX512-NEXT:    vmovd %xmm0, 32(%rdx)
3999; AVX512-NEXT:    vpextrw $2, %xmm0, 44(%rdx)
4000; AVX512-NEXT:    vmovd %xmm0, 40(%rdx)
4001; AVX512-NEXT:    vpextrw $2, %xmm0, 52(%rdx)
4002; AVX512-NEXT:    vmovd %xmm0, 48(%rdx)
4003; AVX512-NEXT:    vpextrw $2, %xmm0, 60(%rdx)
4004; AVX512-NEXT:    vmovd %xmm0, 56(%rdx)
4005; AVX512-NEXT:    retq
4006  %in.subvec.not = load <6 x i8>, ptr %in.subvec.ptr, align 64
4007  %in.subvec = xor <6 x i8> %in.subvec.not, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
4008  store <6 x i8> %in.subvec, ptr %out.subvec.ptr, align 64
4009  %out.subvec0.ptr = getelementptr <6 x i8>, ptr %out.vec.ptr, i64 0
4010  store <6 x i8> %in.subvec, ptr %out.subvec0.ptr, align 64
4011  %out.subvec1.ptr = getelementptr <6 x i8>, ptr %out.vec.ptr, i64 1
4012  store <6 x i8> %in.subvec, ptr %out.subvec1.ptr, align 2
4013  %out.subvec2.ptr = getelementptr <6 x i8>, ptr %out.vec.ptr, i64 2
4014  store <6 x i8> %in.subvec, ptr %out.subvec2.ptr, align 4
4015  %out.subvec3.ptr = getelementptr <6 x i8>, ptr %out.vec.ptr, i64 3
4016  store <6 x i8> %in.subvec, ptr %out.subvec3.ptr, align 2
4017  %out.subvec4.ptr = getelementptr <6 x i8>, ptr %out.vec.ptr, i64 4
4018  store <6 x i8> %in.subvec, ptr %out.subvec4.ptr, align 8
4019  %out.subvec5.ptr = getelementptr <6 x i8>, ptr %out.vec.ptr, i64 5
4020  store <6 x i8> %in.subvec, ptr %out.subvec5.ptr, align 2
4021  %out.subvec6.ptr = getelementptr <6 x i8>, ptr %out.vec.ptr, i64 6
4022  store <6 x i8> %in.subvec, ptr %out.subvec6.ptr, align 4
4023  %out.subvec7.ptr = getelementptr <6 x i8>, ptr %out.vec.ptr, i64 7
4024  store <6 x i8> %in.subvec, ptr %out.subvec7.ptr, align 2
4025  ret void
4026}
4027
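; <6 x i16> subvector: inverted, stored on its own, then splatted into 4 consecutive subvector slots of the output vector.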
4028define void @vec384_v6i16(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
4029; SCALAR-LABEL: vec384_v6i16:
4030; SCALAR:       # %bb.0:
4031; SCALAR-NEXT:    movq (%rdi), %rax
4032; SCALAR-NEXT:    movl 8(%rdi), %ecx
4033; SCALAR-NEXT:    notl %ecx
4034; SCALAR-NEXT:    notq %rax
4035; SCALAR-NEXT:    movq %rax, (%rsi)
4036; SCALAR-NEXT:    movl %ecx, 8(%rsi)
4037; SCALAR-NEXT:    movl %ecx, 8(%rdx)
4038; SCALAR-NEXT:    movq %rax, (%rdx)
4039; SCALAR-NEXT:    movl %ecx, 24(%rdx)
4040; SCALAR-NEXT:    movq %rax, 16(%rdx)
4041; SCALAR-NEXT:    movl %ecx, 40(%rdx)
4042; SCALAR-NEXT:    movq %rax, 32(%rdx)
4043; SCALAR-NEXT:    movl %ecx, 56(%rdx)
4044; SCALAR-NEXT:    movq %rax, 48(%rdx)
4045; SCALAR-NEXT:    retq
4046;
4047; SSE2-ONLY-LABEL: vec384_v6i16:
4048; SSE2-ONLY:       # %bb.0:
4049; SSE2-ONLY-NEXT:    pcmpeqd %xmm0, %xmm0
4050; SSE2-ONLY-NEXT:    pxor (%rdi), %xmm0
4051; SSE2-ONLY-NEXT:    movq %xmm0, (%rsi)
4052; SSE2-ONLY-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
4053; SSE2-ONLY-NEXT:    movd %xmm1, 8(%rsi)
4054; SSE2-ONLY-NEXT:    movd %xmm1, 8(%rdx)
4055; SSE2-ONLY-NEXT:    movq %xmm0, (%rdx)
4056; SSE2-ONLY-NEXT:    movd %xmm1, 24(%rdx)
4057; SSE2-ONLY-NEXT:    movq %xmm0, 16(%rdx)
4058; SSE2-ONLY-NEXT:    movd %xmm1, 40(%rdx)
4059; SSE2-ONLY-NEXT:    movq %xmm0, 32(%rdx)
4060; SSE2-ONLY-NEXT:    movd %xmm1, 56(%rdx)
4061; SSE2-ONLY-NEXT:    movq %xmm0, 48(%rdx)
4062; SSE2-ONLY-NEXT:    retq
4063;
4064; SSE3-LABEL: vec384_v6i16:
4065; SSE3:       # %bb.0:
4066; SSE3-NEXT:    pcmpeqd %xmm0, %xmm0
4067; SSE3-NEXT:    pxor (%rdi), %xmm0
4068; SSE3-NEXT:    movq %xmm0, (%rsi)
4069; SSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
4070; SSE3-NEXT:    movd %xmm1, 8(%rsi)
4071; SSE3-NEXT:    movd %xmm1, 8(%rdx)
4072; SSE3-NEXT:    movq %xmm0, (%rdx)
4073; SSE3-NEXT:    movd %xmm1, 24(%rdx)
4074; SSE3-NEXT:    movq %xmm0, 16(%rdx)
4075; SSE3-NEXT:    movd %xmm1, 40(%rdx)
4076; SSE3-NEXT:    movq %xmm0, 32(%rdx)
4077; SSE3-NEXT:    movd %xmm1, 56(%rdx)
4078; SSE3-NEXT:    movq %xmm0, 48(%rdx)
4079; SSE3-NEXT:    retq
4080;
4081; SSSE3-ONLY-LABEL: vec384_v6i16:
4082; SSSE3-ONLY:       # %bb.0:
4083; SSSE3-ONLY-NEXT:    pcmpeqd %xmm0, %xmm0
4084; SSSE3-ONLY-NEXT:    pxor (%rdi), %xmm0
4085; SSSE3-ONLY-NEXT:    movq %xmm0, (%rsi)
4086; SSSE3-ONLY-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
4087; SSSE3-ONLY-NEXT:    movd %xmm1, 8(%rsi)
4088; SSSE3-ONLY-NEXT:    movd %xmm1, 8(%rdx)
4089; SSSE3-ONLY-NEXT:    movq %xmm0, (%rdx)
4090; SSSE3-ONLY-NEXT:    movd %xmm1, 24(%rdx)
4091; SSSE3-ONLY-NEXT:    movq %xmm0, 16(%rdx)
4092; SSSE3-ONLY-NEXT:    movd %xmm1, 40(%rdx)
4093; SSSE3-ONLY-NEXT:    movq %xmm0, 32(%rdx)
4094; SSSE3-ONLY-NEXT:    movd %xmm1, 56(%rdx)
4095; SSSE3-ONLY-NEXT:    movq %xmm0, 48(%rdx)
4096; SSSE3-ONLY-NEXT:    retq
4097;
4098; SSE41-LABEL: vec384_v6i16:
4099; SSE41:       # %bb.0:
4100; SSE41-NEXT:    pcmpeqd %xmm0, %xmm0
4101; SSE41-NEXT:    pxor (%rdi), %xmm0
4102; SSE41-NEXT:    pextrd $2, %xmm0, 8(%rsi)
4103; SSE41-NEXT:    movq %xmm0, (%rsi)
4104; SSE41-NEXT:    pextrd $2, %xmm0, 8(%rdx)
4105; SSE41-NEXT:    movq %xmm0, (%rdx)
4106; SSE41-NEXT:    pextrd $2, %xmm0, 24(%rdx)
4107; SSE41-NEXT:    movq %xmm0, 16(%rdx)
4108; SSE41-NEXT:    pextrd $2, %xmm0, 40(%rdx)
4109; SSE41-NEXT:    movq %xmm0, 32(%rdx)
4110; SSE41-NEXT:    pextrd $2, %xmm0, 56(%rdx)
4111; SSE41-NEXT:    movq %xmm0, 48(%rdx)
4112; SSE41-NEXT:    retq
4113;
4114; SSE42-LABEL: vec384_v6i16:
4115; SSE42:       # %bb.0:
4116; SSE42-NEXT:    pcmpeqd %xmm0, %xmm0
4117; SSE42-NEXT:    pxor (%rdi), %xmm0
4118; SSE42-NEXT:    pextrd $2, %xmm0, 8(%rsi)
4119; SSE42-NEXT:    movq %xmm0, (%rsi)
4120; SSE42-NEXT:    pextrd $2, %xmm0, 8(%rdx)
4121; SSE42-NEXT:    movq %xmm0, (%rdx)
4122; SSE42-NEXT:    pextrd $2, %xmm0, 24(%rdx)
4123; SSE42-NEXT:    movq %xmm0, 16(%rdx)
4124; SSE42-NEXT:    pextrd $2, %xmm0, 40(%rdx)
4125; SSE42-NEXT:    movq %xmm0, 32(%rdx)
4126; SSE42-NEXT:    pextrd $2, %xmm0, 56(%rdx)
4127; SSE42-NEXT:    movq %xmm0, 48(%rdx)
4128; SSE42-NEXT:    retq
4129;
4130; AVX-LABEL: vec384_v6i16:
4131; AVX:       # %bb.0:
4132; AVX-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
4133; AVX-NEXT:    vpxor (%rdi), %xmm0, %xmm0
4134; AVX-NEXT:    vpextrd $2, %xmm0, 8(%rsi)
4135; AVX-NEXT:    vmovq %xmm0, (%rsi)
4136; AVX-NEXT:    vpextrd $2, %xmm0, 8(%rdx)
4137; AVX-NEXT:    vmovq %xmm0, (%rdx)
4138; AVX-NEXT:    vpextrd $2, %xmm0, 24(%rdx)
4139; AVX-NEXT:    vmovq %xmm0, 16(%rdx)
4140; AVX-NEXT:    vpextrd $2, %xmm0, 40(%rdx)
4141; AVX-NEXT:    vmovq %xmm0, 32(%rdx)
4142; AVX-NEXT:    vpextrd $2, %xmm0, 56(%rdx)
4143; AVX-NEXT:    vmovq %xmm0, 48(%rdx)
4144; AVX-NEXT:    retq
4145  %in.subvec.not = load <6 x i16>, ptr %in.subvec.ptr, align 64
4146  %in.subvec = xor <6 x i16> %in.subvec.not, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
4147  store <6 x i16> %in.subvec, ptr %out.subvec.ptr, align 64
4148  %out.subvec0.ptr = getelementptr <6 x i16>, ptr %out.vec.ptr, i64 0
4149  store <6 x i16> %in.subvec, ptr %out.subvec0.ptr, align 64
4150  %out.subvec1.ptr = getelementptr <6 x i16>, ptr %out.vec.ptr, i64 1
4151  store <6 x i16> %in.subvec, ptr %out.subvec1.ptr, align 4
4152  %out.subvec2.ptr = getelementptr <6 x i16>, ptr %out.vec.ptr, i64 2
4153  store <6 x i16> %in.subvec, ptr %out.subvec2.ptr, align 8
4154  %out.subvec3.ptr = getelementptr <6 x i16>, ptr %out.vec.ptr, i64 3
4155  store <6 x i16> %in.subvec, ptr %out.subvec3.ptr, align 4
4156  ret void
4157}
4158
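; <6 x i32> subvector: inverted, stored on its own, then splatted into 2 consecutive subvector slots of the output vector.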
4159define void @vec384_v6i32(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
4160; SCALAR-LABEL: vec384_v6i32:
4161; SCALAR:       # %bb.0:
4162; SCALAR-NEXT:    movq (%rdi), %rax
4163; SCALAR-NEXT:    movq 8(%rdi), %rcx
4164; SCALAR-NEXT:    movq 16(%rdi), %rdi
4165; SCALAR-NEXT:    notq %rdi
4166; SCALAR-NEXT:    notq %rcx
4167; SCALAR-NEXT:    notq %rax
4168; SCALAR-NEXT:    movq %rax, (%rsi)
4169; SCALAR-NEXT:    movq %rcx, 8(%rsi)
4170; SCALAR-NEXT:    movq %rdi, 16(%rsi)
4171; SCALAR-NEXT:    movq %rax, (%rdx)
4172; SCALAR-NEXT:    movq %rcx, 8(%rdx)
4173; SCALAR-NEXT:    movq %rdi, 16(%rdx)
4174; SCALAR-NEXT:    movq %rdi, 48(%rdx)
4175; SCALAR-NEXT:    movq %rcx, 40(%rdx)
4176; SCALAR-NEXT:    movq %rax, 32(%rdx)
4177; SCALAR-NEXT:    retq
4178;
4179; SSE2-LABEL: vec384_v6i32:
4180; SSE2:       # %bb.0:
4181; SSE2-NEXT:    pcmpeqd %xmm0, %xmm0
4182; SSE2-NEXT:    movdqa 16(%rdi), %xmm1
4183; SSE2-NEXT:    pxor %xmm0, %xmm1
4184; SSE2-NEXT:    pxor (%rdi), %xmm0
4185; SSE2-NEXT:    movdqa %xmm0, (%rsi)
4186; SSE2-NEXT:    movq %xmm1, 16(%rsi)
4187; SSE2-NEXT:    movq %xmm1, 16(%rdx)
4188; SSE2-NEXT:    movdqa %xmm0, (%rdx)
4189; SSE2-NEXT:    movq %xmm1, 48(%rdx)
4190; SSE2-NEXT:    movdqu %xmm0, 32(%rdx)
4191; SSE2-NEXT:    retq
4192;
4193; AVX1-LABEL: vec384_v6i32:
4194; AVX1:       # %bb.0:
4195; AVX1-NEXT:    vxorps %xmm0, %xmm0, %xmm0
4196; AVX1-NEXT:    vcmptrueps %ymm0, %ymm0, %ymm0
4197; AVX1-NEXT:    vxorps (%rdi), %ymm0, %ymm0
4198; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
4199; AVX1-NEXT:    vmovlps %xmm1, 16(%rsi)
4200; AVX1-NEXT:    vmovaps %xmm0, (%rsi)
4201; AVX1-NEXT:    vmovlps %xmm1, 16(%rdx)
4202; AVX1-NEXT:    vmovaps %xmm0, (%rdx)
4203; AVX1-NEXT:    vmovlps %xmm1, 48(%rdx)
4204; AVX1-NEXT:    vmovups %xmm0, 32(%rdx)
4205; AVX1-NEXT:    vzeroupper
4206; AVX1-NEXT:    retq
4207;
4208; AVX2-LABEL: vec384_v6i32:
4209; AVX2:       # %bb.0:
4210; AVX2-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
4211; AVX2-NEXT:    vpxor (%rdi), %ymm0, %ymm0
4212; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
4213; AVX2-NEXT:    vmovq %xmm1, 16(%rsi)
4214; AVX2-NEXT:    vmovdqa %xmm0, (%rsi)
4215; AVX2-NEXT:    vmovq %xmm1, 16(%rdx)
4216; AVX2-NEXT:    vmovdqa %xmm0, (%rdx)
4217; AVX2-NEXT:    vmovq %xmm1, 48(%rdx)
4218; AVX2-NEXT:    vmovdqu %xmm0, 32(%rdx)
4219; AVX2-NEXT:    vzeroupper
4220; AVX2-NEXT:    retq
4221  %in.subvec.not = load <6 x i32>, ptr %in.subvec.ptr, align 64
4222  %in.subvec = xor <6 x i32> %in.subvec.not, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
4223  store <6 x i32> %in.subvec, ptr %out.subvec.ptr, align 64
4224  %out.subvec0.ptr = getelementptr <6 x i32>, ptr %out.vec.ptr, i64 0
4225  store <6 x i32> %in.subvec, ptr %out.subvec0.ptr, align 64
4226  %out.subvec1.ptr = getelementptr <6 x i32>, ptr %out.vec.ptr, i64 1
4227  store <6 x i32> %in.subvec, ptr %out.subvec1.ptr, align 8
4228  ret void
4229}
4230
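; Same pattern as vec384_v6i32, but the inverted <6 x i32> is bitcast to <6 x float> before the stores.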
4231define void @vec384_v6f32(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
4232; SCALAR-LABEL: vec384_v6f32:
4233; SCALAR:       # %bb.0:
4234; SCALAR-NEXT:    movq (%rdi), %rax
4235; SCALAR-NEXT:    movq 8(%rdi), %rcx
4236; SCALAR-NEXT:    movq 16(%rdi), %rdi
4237; SCALAR-NEXT:    notq %rdi
4238; SCALAR-NEXT:    notq %rcx
4239; SCALAR-NEXT:    notq %rax
4240; SCALAR-NEXT:    movq %rax, (%rsi)
4241; SCALAR-NEXT:    movq %rcx, 8(%rsi)
4242; SCALAR-NEXT:    movq %rdi, 16(%rsi)
4243; SCALAR-NEXT:    movq %rax, (%rdx)
4244; SCALAR-NEXT:    movq %rcx, 8(%rdx)
4245; SCALAR-NEXT:    movq %rdi, 16(%rdx)
4246; SCALAR-NEXT:    movq %rdi, 48(%rdx)
4247; SCALAR-NEXT:    movq %rcx, 40(%rdx)
4248; SCALAR-NEXT:    movq %rax, 32(%rdx)
4249; SCALAR-NEXT:    retq
4250;
4251; SSE2-LABEL: vec384_v6f32:
4252; SSE2:       # %bb.0:
4253; SSE2-NEXT:    pcmpeqd %xmm0, %xmm0
4254; SSE2-NEXT:    movdqa 16(%rdi), %xmm1
4255; SSE2-NEXT:    pxor %xmm0, %xmm1
4256; SSE2-NEXT:    pxor (%rdi), %xmm0
4257; SSE2-NEXT:    movdqa %xmm0, (%rsi)
4258; SSE2-NEXT:    movq %xmm1, 16(%rsi)
4259; SSE2-NEXT:    movq %xmm1, 16(%rdx)
4260; SSE2-NEXT:    movdqa %xmm0, (%rdx)
4261; SSE2-NEXT:    movq %xmm1, 48(%rdx)
4262; SSE2-NEXT:    movdqu %xmm0, 32(%rdx)
4263; SSE2-NEXT:    retq
4264;
4265; AVX1-LABEL: vec384_v6f32:
4266; AVX1:       # %bb.0:
4267; AVX1-NEXT:    vxorps %xmm0, %xmm0, %xmm0
4268; AVX1-NEXT:    vcmptrueps %ymm0, %ymm0, %ymm0
4269; AVX1-NEXT:    vxorps (%rdi), %ymm0, %ymm0
4270; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
4271; AVX1-NEXT:    vmovlps %xmm1, 16(%rsi)
4272; AVX1-NEXT:    vmovaps %xmm0, (%rsi)
4273; AVX1-NEXT:    vmovlps %xmm1, 16(%rdx)
4274; AVX1-NEXT:    vmovaps %xmm0, (%rdx)
4275; AVX1-NEXT:    vmovlps %xmm1, 48(%rdx)
4276; AVX1-NEXT:    vmovups %xmm0, 32(%rdx)
4277; AVX1-NEXT:    vzeroupper
4278; AVX1-NEXT:    retq
4279;
4280; AVX2-LABEL: vec384_v6f32:
4281; AVX2:       # %bb.0:
4282; AVX2-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
4283; AVX2-NEXT:    vpxor (%rdi), %ymm0, %ymm0
4284; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
4285; AVX2-NEXT:    vmovq %xmm1, 16(%rsi)
4286; AVX2-NEXT:    vmovdqa %xmm0, (%rsi)
4287; AVX2-NEXT:    vmovq %xmm1, 16(%rdx)
4288; AVX2-NEXT:    vmovdqa %xmm0, (%rdx)
4289; AVX2-NEXT:    vmovq %xmm1, 48(%rdx)
4290; AVX2-NEXT:    vmovdqu %xmm0, 32(%rdx)
4291; AVX2-NEXT:    vzeroupper
4292; AVX2-NEXT:    retq
4293  %in.subvec.not = load <6 x i32>, ptr %in.subvec.ptr, align 64
4294  %in.subvec.int = xor <6 x i32> %in.subvec.not, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
4295  %in.subvec = bitcast <6 x i32> %in.subvec.int to <6 x float>
4296  store <6 x float> %in.subvec, ptr %out.subvec.ptr, align 64
4297  %out.subvec0.ptr = getelementptr <6 x float>, ptr %out.vec.ptr, i64 0
4298  store <6 x float> %in.subvec, ptr %out.subvec0.ptr, align 64
4299  %out.subvec1.ptr = getelementptr <6 x float>, ptr %out.vec.ptr, i64 1
4300  store <6 x float> %in.subvec, ptr %out.subvec1.ptr, align 8
4301  ret void
4302}
4303
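; <8 x i8> subvector: inverted, stored on its own, then splatted into 6 consecutive subvector slots of the output vector.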
4304define void @vec384_v8i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
4305; SCALAR-LABEL: vec384_v8i8:
4306; SCALAR:       # %bb.0:
4307; SCALAR-NEXT:    pushq %rbx
4308; SCALAR-NEXT:    movzbl 7(%rdi), %ebx
4309; SCALAR-NEXT:    movzbl 6(%rdi), %r11d
4310; SCALAR-NEXT:    movzbl 5(%rdi), %r10d
4311; SCALAR-NEXT:    movzbl 4(%rdi), %r9d
4312; SCALAR-NEXT:    movzbl 3(%rdi), %r8d
4313; SCALAR-NEXT:    movzbl 2(%rdi), %ecx
4314; SCALAR-NEXT:    movzbl (%rdi), %eax
4315; SCALAR-NEXT:    movzbl 1(%rdi), %edi
4316; SCALAR-NEXT:    notb %al
4317; SCALAR-NEXT:    notb %dil
4318; SCALAR-NEXT:    notb %cl
4319; SCALAR-NEXT:    notb %r8b
4320; SCALAR-NEXT:    notb %r9b
4321; SCALAR-NEXT:    notb %r10b
4322; SCALAR-NEXT:    notb %r11b
4323; SCALAR-NEXT:    notb %bl
4324; SCALAR-NEXT:    movb %bl, 7(%rsi)
4325; SCALAR-NEXT:    movb %r11b, 6(%rsi)
4326; SCALAR-NEXT:    movb %r10b, 5(%rsi)
4327; SCALAR-NEXT:    movb %r9b, 4(%rsi)
4328; SCALAR-NEXT:    movb %r8b, 3(%rsi)
4329; SCALAR-NEXT:    movb %cl, 2(%rsi)
4330; SCALAR-NEXT:    movb %dil, 1(%rsi)
4331; SCALAR-NEXT:    movb %al, (%rsi)
4332; SCALAR-NEXT:    movb %bl, 7(%rdx)
4333; SCALAR-NEXT:    movb %r11b, 6(%rdx)
4334; SCALAR-NEXT:    movb %r10b, 5(%rdx)
4335; SCALAR-NEXT:    movb %r9b, 4(%rdx)
4336; SCALAR-NEXT:    movb %r8b, 3(%rdx)
4337; SCALAR-NEXT:    movb %cl, 2(%rdx)
4338; SCALAR-NEXT:    movb %dil, 1(%rdx)
4339; SCALAR-NEXT:    movb %al, (%rdx)
4340; SCALAR-NEXT:    movb %bl, 15(%rdx)
4341; SCALAR-NEXT:    movb %r11b, 14(%rdx)
4342; SCALAR-NEXT:    movb %r10b, 13(%rdx)
4343; SCALAR-NEXT:    movb %r9b, 12(%rdx)
4344; SCALAR-NEXT:    movb %r8b, 11(%rdx)
4345; SCALAR-NEXT:    movb %cl, 10(%rdx)
4346; SCALAR-NEXT:    movb %dil, 9(%rdx)
4347; SCALAR-NEXT:    movb %al, 8(%rdx)
4348; SCALAR-NEXT:    movb %bl, 23(%rdx)
4349; SCALAR-NEXT:    movb %r11b, 22(%rdx)
4350; SCALAR-NEXT:    movb %r10b, 21(%rdx)
4351; SCALAR-NEXT:    movb %r9b, 20(%rdx)
4352; SCALAR-NEXT:    movb %r8b, 19(%rdx)
4353; SCALAR-NEXT:    movb %cl, 18(%rdx)
4354; SCALAR-NEXT:    movb %dil, 17(%rdx)
4355; SCALAR-NEXT:    movb %al, 16(%rdx)
4356; SCALAR-NEXT:    movb %bl, 31(%rdx)
4357; SCALAR-NEXT:    movb %r11b, 30(%rdx)
4358; SCALAR-NEXT:    movb %r10b, 29(%rdx)
4359; SCALAR-NEXT:    movb %r9b, 28(%rdx)
4360; SCALAR-NEXT:    movb %r8b, 27(%rdx)
4361; SCALAR-NEXT:    movb %cl, 26(%rdx)
4362; SCALAR-NEXT:    movb %dil, 25(%rdx)
4363; SCALAR-NEXT:    movb %al, 24(%rdx)
4364; SCALAR-NEXT:    movb %bl, 39(%rdx)
4365; SCALAR-NEXT:    movb %r11b, 38(%rdx)
4366; SCALAR-NEXT:    movb %r10b, 37(%rdx)
4367; SCALAR-NEXT:    movb %r9b, 36(%rdx)
4368; SCALAR-NEXT:    movb %r8b, 35(%rdx)
4369; SCALAR-NEXT:    movb %cl, 34(%rdx)
4370; SCALAR-NEXT:    movb %dil, 33(%rdx)
4371; SCALAR-NEXT:    movb %al, 32(%rdx)
4372; SCALAR-NEXT:    movb %bl, 47(%rdx)
4373; SCALAR-NEXT:    movb %r11b, 46(%rdx)
4374; SCALAR-NEXT:    movb %r10b, 45(%rdx)
4375; SCALAR-NEXT:    movb %r9b, 44(%rdx)
4376; SCALAR-NEXT:    movb %r8b, 43(%rdx)
4377; SCALAR-NEXT:    movb %cl, 42(%rdx)
4378; SCALAR-NEXT:    movb %dil, 41(%rdx)
4379; SCALAR-NEXT:    movb %al, 40(%rdx)
4380; SCALAR-NEXT:    popq %rbx
4381; SCALAR-NEXT:    retq
4382;
4383; SSE2-LABEL: vec384_v8i8:
4384; SSE2:       # %bb.0:
4385; SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
4386; SSE2-NEXT:    pcmpeqd %xmm1, %xmm1
4387; SSE2-NEXT:    pxor %xmm0, %xmm1
4388; SSE2-NEXT:    movq %xmm1, (%rsi)
4389; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,1,0,1]
4390; SSE2-NEXT:    movdqa %xmm0, (%rdx)
4391; SSE2-NEXT:    movdqa %xmm0, 16(%rdx)
4392; SSE2-NEXT:    movdqa %xmm0, 32(%rdx)
4393; SSE2-NEXT:    retq
4394;
4395; AVX1-LABEL: vec384_v8i8:
4396; AVX1:       # %bb.0:
4397; AVX1-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
4398; AVX1-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
4399; AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
4400; AVX1-NEXT:    vmovq %xmm0, (%rsi)
4401; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
4402; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm1
4403; AVX1-NEXT:    vmovaps %ymm1, (%rdx)
4404; AVX1-NEXT:    vmovdqa %xmm0, 32(%rdx)
4405; AVX1-NEXT:    vzeroupper
4406; AVX1-NEXT:    retq
4407;
4408; AVX2-ONLY-LABEL: vec384_v8i8:
4409; AVX2-ONLY:       # %bb.0:
4410; AVX2-ONLY-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
4411; AVX2-ONLY-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
4412; AVX2-ONLY-NEXT:    vpxor %xmm1, %xmm0, %xmm0
4413; AVX2-ONLY-NEXT:    vmovq %xmm0, (%rsi)
4414; AVX2-ONLY-NEXT:    vpbroadcastq %xmm0, %ymm0
4415; AVX2-ONLY-NEXT:    vmovdqa %ymm0, (%rdx)
4416; AVX2-ONLY-NEXT:    vmovdqa %xmm0, 32(%rdx)
4417; AVX2-ONLY-NEXT:    vzeroupper
4418; AVX2-ONLY-NEXT:    retq
4419;
4420; AVX512-LABEL: vec384_v8i8:
4421; AVX512:       # %bb.0:
4422; AVX512-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
4423; AVX512-NEXT:    vpternlogq {{.*#+}} xmm0 = ~xmm0
4424; AVX512-NEXT:    vmovq %xmm0, (%rsi)
4425; AVX512-NEXT:    vpbroadcastq %xmm0, %ymm0
4426; AVX512-NEXT:    vmovdqa %ymm0, (%rdx)
4427; AVX512-NEXT:    vmovdqa %xmm0, 32(%rdx)
4428; AVX512-NEXT:    vzeroupper
4429; AVX512-NEXT:    retq
4430  %in.subvec.not = load <8 x i8>, ptr %in.subvec.ptr, align 64
4431  %in.subvec = xor <8 x i8> %in.subvec.not, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
4432  store <8 x i8> %in.subvec, ptr %out.subvec.ptr, align 64
4433  %out.subvec0.ptr = getelementptr <8 x i8>, ptr %out.vec.ptr, i64 0
4434  store <8 x i8> %in.subvec, ptr %out.subvec0.ptr, align 64
4435  %out.subvec1.ptr = getelementptr <8 x i8>, ptr %out.vec.ptr, i64 1
4436  store <8 x i8> %in.subvec, ptr %out.subvec1.ptr, align 8
4437  %out.subvec2.ptr = getelementptr <8 x i8>, ptr %out.vec.ptr, i64 2
4438  store <8 x i8> %in.subvec, ptr %out.subvec2.ptr, align 16
4439  %out.subvec3.ptr = getelementptr <8 x i8>, ptr %out.vec.ptr, i64 3
4440  store <8 x i8> %in.subvec, ptr %out.subvec3.ptr, align 8
4441  %out.subvec4.ptr = getelementptr <8 x i8>, ptr %out.vec.ptr, i64 4
4442  store <8 x i8> %in.subvec, ptr %out.subvec4.ptr, align 32
4443  %out.subvec5.ptr = getelementptr <8 x i8>, ptr %out.vec.ptr, i64 5
4444  store <8 x i8> %in.subvec, ptr %out.subvec5.ptr, align 8
4445  ret void
4446}
4447
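; <8 x i16> subvector: inverted, stored on its own, then splatted into 3 consecutive subvector slots of the output vector.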
4448define void @vec384_v8i16(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
4449; SCALAR-LABEL: vec384_v8i16:
4450; SCALAR:       # %bb.0:
4451; SCALAR-NEXT:    pushq %rbx
4452; SCALAR-NEXT:    movzwl 14(%rdi), %ebx
4453; SCALAR-NEXT:    movl 12(%rdi), %r11d
4454; SCALAR-NEXT:    movzwl 10(%rdi), %r10d
4455; SCALAR-NEXT:    movl 8(%rdi), %r9d
4456; SCALAR-NEXT:    movzwl 6(%rdi), %r8d
4457; SCALAR-NEXT:    movzwl 2(%rdi), %ecx
4458; SCALAR-NEXT:    movl (%rdi), %eax
4459; SCALAR-NEXT:    movl 4(%rdi), %edi
4460; SCALAR-NEXT:    notl %eax
4461; SCALAR-NEXT:    notl %ecx
4462; SCALAR-NEXT:    notl %edi
4463; SCALAR-NEXT:    notl %r8d
4464; SCALAR-NEXT:    notl %r9d
4465; SCALAR-NEXT:    notl %r10d
4466; SCALAR-NEXT:    notl %r11d
4467; SCALAR-NEXT:    notl %ebx
4468; SCALAR-NEXT:    movw %bx, 14(%rsi)
4469; SCALAR-NEXT:    movw %r11w, 12(%rsi)
4470; SCALAR-NEXT:    movw %r10w, 10(%rsi)
4471; SCALAR-NEXT:    movw %r9w, 8(%rsi)
4472; SCALAR-NEXT:    movw %r8w, 6(%rsi)
4473; SCALAR-NEXT:    movw %di, 4(%rsi)
4474; SCALAR-NEXT:    movw %cx, 2(%rsi)
4475; SCALAR-NEXT:    movw %ax, (%rsi)
4476; SCALAR-NEXT:    movw %bx, 14(%rdx)
4477; SCALAR-NEXT:    movw %r11w, 12(%rdx)
4478; SCALAR-NEXT:    movw %r10w, 10(%rdx)
4479; SCALAR-NEXT:    movw %r9w, 8(%rdx)
4480; SCALAR-NEXT:    movw %r8w, 6(%rdx)
4481; SCALAR-NEXT:    movw %di, 4(%rdx)
4482; SCALAR-NEXT:    movw %cx, 2(%rdx)
4483; SCALAR-NEXT:    movw %ax, (%rdx)
4484; SCALAR-NEXT:    movw %bx, 30(%rdx)
4485; SCALAR-NEXT:    movw %r11w, 28(%rdx)
4486; SCALAR-NEXT:    movw %r10w, 26(%rdx)
4487; SCALAR-NEXT:    movw %r9w, 24(%rdx)
4488; SCALAR-NEXT:    movw %r8w, 22(%rdx)
4489; SCALAR-NEXT:    movw %di, 20(%rdx)
4490; SCALAR-NEXT:    movw %cx, 18(%rdx)
4491; SCALAR-NEXT:    movw %ax, 16(%rdx)
4492; SCALAR-NEXT:    movw %bx, 46(%rdx)
4493; SCALAR-NEXT:    movw %r11w, 44(%rdx)
4494; SCALAR-NEXT:    movw %r10w, 42(%rdx)
4495; SCALAR-NEXT:    movw %r9w, 40(%rdx)
4496; SCALAR-NEXT:    movw %r8w, 38(%rdx)
4497; SCALAR-NEXT:    movw %di, 36(%rdx)
4498; SCALAR-NEXT:    movw %cx, 34(%rdx)
4499; SCALAR-NEXT:    movw %ax, 32(%rdx)
4500; SCALAR-NEXT:    popq %rbx
4501; SCALAR-NEXT:    retq
4502;
4503; SSE2-LABEL: vec384_v8i16:
4504; SSE2:       # %bb.0:
4505; SSE2-NEXT:    pcmpeqd %xmm0, %xmm0
4506; SSE2-NEXT:    pxor (%rdi), %xmm0
4507; SSE2-NEXT:    movdqa %xmm0, (%rsi)
4508; SSE2-NEXT:    movdqa %xmm0, (%rdx)
4509; SSE2-NEXT:    movdqa %xmm0, 16(%rdx)
4510; SSE2-NEXT:    movdqa %xmm0, 32(%rdx)
4511; SSE2-NEXT:    retq
4512;
4513; AVX-LABEL: vec384_v8i16:
4514; AVX:       # %bb.0:
4515; AVX-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
4516; AVX-NEXT:    vpxor (%rdi), %xmm0, %xmm0
4517; AVX-NEXT:    vmovdqa %xmm0, (%rsi)
4518; AVX-NEXT:    vmovdqa %xmm0, (%rdx)
4519; AVX-NEXT:    vmovdqa %xmm0, 16(%rdx)
4520; AVX-NEXT:    vmovdqa %xmm0, 32(%rdx)
4521; AVX-NEXT:    retq
4522  %in.subvec.not = load <8 x i16>, ptr %in.subvec.ptr, align 64
4523  %in.subvec = xor <8 x i16> %in.subvec.not, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
4524  store <8 x i16> %in.subvec, ptr %out.subvec.ptr, align 64
4525  %out.subvec0.ptr = getelementptr <8 x i16>, ptr %out.vec.ptr, i64 0
4526  store <8 x i16> %in.subvec, ptr %out.subvec0.ptr, align 64
4527  %out.subvec1.ptr = getelementptr <8 x i16>, ptr %out.vec.ptr, i64 1
4528  store <8 x i16> %in.subvec, ptr %out.subvec1.ptr, align 16
4529  %out.subvec2.ptr = getelementptr <8 x i16>, ptr %out.vec.ptr, i64 2
4530  store <8 x i16> %in.subvec, ptr %out.subvec2.ptr, align 32
4531  ret void
4532}
4533
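; <12 x i8> subvector: inverted, stored on its own, then splatted into 4 consecutive subvector slots of the output vector.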
4534define void @vec384_v12i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
4535; SCALAR-LABEL: vec384_v12i8:
4536; SCALAR:       # %bb.0:
4537; SCALAR-NEXT:    movq (%rdi), %rax
4538; SCALAR-NEXT:    movl 8(%rdi), %ecx
4539; SCALAR-NEXT:    notl %ecx
4540; SCALAR-NEXT:    notq %rax
4541; SCALAR-NEXT:    movq %rax, (%rsi)
4542; SCALAR-NEXT:    movl %ecx, 8(%rsi)
4543; SCALAR-NEXT:    movl %ecx, 8(%rdx)
4544; SCALAR-NEXT:    movq %rax, (%rdx)
4545; SCALAR-NEXT:    movl %ecx, 24(%rdx)
4546; SCALAR-NEXT:    movq %rax, 16(%rdx)
4547; SCALAR-NEXT:    movl %ecx, 40(%rdx)
4548; SCALAR-NEXT:    movq %rax, 32(%rdx)
4549; SCALAR-NEXT:    movl %ecx, 56(%rdx)
4550; SCALAR-NEXT:    movq %rax, 48(%rdx)
4551; SCALAR-NEXT:    retq
4552;
4553; SSE2-ONLY-LABEL: vec384_v12i8:
4554; SSE2-ONLY:       # %bb.0:
4555; SSE2-ONLY-NEXT:    pcmpeqd %xmm0, %xmm0
4556; SSE2-ONLY-NEXT:    pxor (%rdi), %xmm0
4557; SSE2-ONLY-NEXT:    movq %xmm0, (%rsi)
4558; SSE2-ONLY-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
4559; SSE2-ONLY-NEXT:    movd %xmm1, 8(%rsi)
4560; SSE2-ONLY-NEXT:    movd %xmm1, 8(%rdx)
4561; SSE2-ONLY-NEXT:    movq %xmm0, (%rdx)
4562; SSE2-ONLY-NEXT:    movd %xmm1, 24(%rdx)
4563; SSE2-ONLY-NEXT:    movq %xmm0, 16(%rdx)
4564; SSE2-ONLY-NEXT:    movd %xmm1, 40(%rdx)
4565; SSE2-ONLY-NEXT:    movq %xmm0, 32(%rdx)
4566; SSE2-ONLY-NEXT:    movd %xmm1, 56(%rdx)
4567; SSE2-ONLY-NEXT:    movq %xmm0, 48(%rdx)
4568; SSE2-ONLY-NEXT:    retq
4569;
4570; SSE3-LABEL: vec384_v12i8:
4571; SSE3:       # %bb.0:
4572; SSE3-NEXT:    pcmpeqd %xmm0, %xmm0
4573; SSE3-NEXT:    pxor (%rdi), %xmm0
4574; SSE3-NEXT:    movq %xmm0, (%rsi)
4575; SSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
4576; SSE3-NEXT:    movd %xmm1, 8(%rsi)
4577; SSE3-NEXT:    movd %xmm1, 8(%rdx)
4578; SSE3-NEXT:    movq %xmm0, (%rdx)
4579; SSE3-NEXT:    movd %xmm1, 24(%rdx)
4580; SSE3-NEXT:    movq %xmm0, 16(%rdx)
4581; SSE3-NEXT:    movd %xmm1, 40(%rdx)
4582; SSE3-NEXT:    movq %xmm0, 32(%rdx)
4583; SSE3-NEXT:    movd %xmm1, 56(%rdx)
4584; SSE3-NEXT:    movq %xmm0, 48(%rdx)
4585; SSE3-NEXT:    retq
4586;
4587; SSSE3-ONLY-LABEL: vec384_v12i8:
4588; SSSE3-ONLY:       # %bb.0:
4589; SSSE3-ONLY-NEXT:    pcmpeqd %xmm0, %xmm0
4590; SSSE3-ONLY-NEXT:    pxor (%rdi), %xmm0
4591; SSSE3-ONLY-NEXT:    movq %xmm0, (%rsi)
4592; SSSE3-ONLY-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
4593; SSSE3-ONLY-NEXT:    movd %xmm1, 8(%rsi)
4594; SSSE3-ONLY-NEXT:    movd %xmm1, 8(%rdx)
4595; SSSE3-ONLY-NEXT:    movq %xmm0, (%rdx)
4596; SSSE3-ONLY-NEXT:    movd %xmm1, 24(%rdx)
4597; SSSE3-ONLY-NEXT:    movq %xmm0, 16(%rdx)
4598; SSSE3-ONLY-NEXT:    movd %xmm1, 40(%rdx)
4599; SSSE3-ONLY-NEXT:    movq %xmm0, 32(%rdx)
4600; SSSE3-ONLY-NEXT:    movd %xmm1, 56(%rdx)
4601; SSSE3-ONLY-NEXT:    movq %xmm0, 48(%rdx)
4602; SSSE3-ONLY-NEXT:    retq
4603;
4604; SSE41-LABEL: vec384_v12i8:
4605; SSE41:       # %bb.0:
4606; SSE41-NEXT:    pcmpeqd %xmm0, %xmm0
4607; SSE41-NEXT:    pxor (%rdi), %xmm0
4608; SSE41-NEXT:    pextrd $2, %xmm0, 8(%rsi)
4609; SSE41-NEXT:    movq %xmm0, (%rsi)
4610; SSE41-NEXT:    pextrd $2, %xmm0, 8(%rdx)
4611; SSE41-NEXT:    movq %xmm0, (%rdx)
4612; SSE41-NEXT:    pextrd $2, %xmm0, 24(%rdx)
4613; SSE41-NEXT:    movq %xmm0, 16(%rdx)
4614; SSE41-NEXT:    pextrd $2, %xmm0, 40(%rdx)
4615; SSE41-NEXT:    movq %xmm0, 32(%rdx)
4616; SSE41-NEXT:    pextrd $2, %xmm0, 56(%rdx)
4617; SSE41-NEXT:    movq %xmm0, 48(%rdx)
4618; SSE41-NEXT:    retq
4619;
4620; SSE42-LABEL: vec384_v12i8:
4621; SSE42:       # %bb.0:
4622; SSE42-NEXT:    pcmpeqd %xmm0, %xmm0
4623; SSE42-NEXT:    pxor (%rdi), %xmm0
4624; SSE42-NEXT:    pextrd $2, %xmm0, 8(%rsi)
4625; SSE42-NEXT:    movq %xmm0, (%rsi)
4626; SSE42-NEXT:    pextrd $2, %xmm0, 8(%rdx)
4627; SSE42-NEXT:    movq %xmm0, (%rdx)
4628; SSE42-NEXT:    pextrd $2, %xmm0, 24(%rdx)
4629; SSE42-NEXT:    movq %xmm0, 16(%rdx)
4630; SSE42-NEXT:    pextrd $2, %xmm0, 40(%rdx)
4631; SSE42-NEXT:    movq %xmm0, 32(%rdx)
4632; SSE42-NEXT:    pextrd $2, %xmm0, 56(%rdx)
4633; SSE42-NEXT:    movq %xmm0, 48(%rdx)
4634; SSE42-NEXT:    retq
4635;
4636; AVX-LABEL: vec384_v12i8:
4637; AVX:       # %bb.0:
4638; AVX-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
4639; AVX-NEXT:    vpxor (%rdi), %xmm0, %xmm0
4640; AVX-NEXT:    vpextrd $2, %xmm0, 8(%rsi)
4641; AVX-NEXT:    vmovq %xmm0, (%rsi)
4642; AVX-NEXT:    vpextrd $2, %xmm0, 8(%rdx)
4643; AVX-NEXT:    vmovq %xmm0, (%rdx)
4644; AVX-NEXT:    vpextrd $2, %xmm0, 24(%rdx)
4645; AVX-NEXT:    vmovq %xmm0, 16(%rdx)
4646; AVX-NEXT:    vpextrd $2, %xmm0, 40(%rdx)
4647; AVX-NEXT:    vmovq %xmm0, 32(%rdx)
4648; AVX-NEXT:    vpextrd $2, %xmm0, 56(%rdx)
4649; AVX-NEXT:    vmovq %xmm0, 48(%rdx)
4650; AVX-NEXT:    retq
4651  %in.subvec.not = load <12 x i8>, ptr %in.subvec.ptr, align 64
4652  %in.subvec = xor <12 x i8> %in.subvec.not, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
4653  store <12 x i8> %in.subvec, ptr %out.subvec.ptr, align 64
4654  %out.subvec0.ptr = getelementptr <12 x i8>, ptr %out.vec.ptr, i64 0
4655  store <12 x i8> %in.subvec, ptr %out.subvec0.ptr, align 64
4656  %out.subvec1.ptr = getelementptr <12 x i8>, ptr %out.vec.ptr, i64 1
4657  store <12 x i8> %in.subvec, ptr %out.subvec1.ptr, align 4
4658  %out.subvec2.ptr = getelementptr <12 x i8>, ptr %out.vec.ptr, i64 2
4659  store <12 x i8> %in.subvec, ptr %out.subvec2.ptr, align 8
4660  %out.subvec3.ptr = getelementptr <12 x i8>, ptr %out.vec.ptr, i64 3
4661  store <12 x i8> %in.subvec, ptr %out.subvec3.ptr, align 4
4662  ret void
4663}
4664
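; <12 x i16> subvector: inverted, stored on its own, then splatted into 2 consecutive subvector slots of the output vector.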
4665define void @vec384_v12i16(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
4666; SCALAR-LABEL: vec384_v12i16:
4667; SCALAR:       # %bb.0:
4668; SCALAR-NEXT:    movq (%rdi), %rax
4669; SCALAR-NEXT:    movq 8(%rdi), %rcx
4670; SCALAR-NEXT:    movq 16(%rdi), %rdi
4671; SCALAR-NEXT:    notq %rdi
4672; SCALAR-NEXT:    notq %rcx
4673; SCALAR-NEXT:    notq %rax
4674; SCALAR-NEXT:    movq %rax, (%rsi)
4675; SCALAR-NEXT:    movq %rcx, 8(%rsi)
4676; SCALAR-NEXT:    movq %rdi, 16(%rsi)
4677; SCALAR-NEXT:    movq %rax, (%rdx)
4678; SCALAR-NEXT:    movq %rcx, 8(%rdx)
4679; SCALAR-NEXT:    movq %rdi, 16(%rdx)
4680; SCALAR-NEXT:    movq %rdi, 48(%rdx)
4681; SCALAR-NEXT:    movq %rcx, 40(%rdx)
4682; SCALAR-NEXT:    movq %rax, 32(%rdx)
4683; SCALAR-NEXT:    retq
4684;
4685; SSE2-LABEL: vec384_v12i16:
4686; SSE2:       # %bb.0:
4687; SSE2-NEXT:    pcmpeqd %xmm0, %xmm0
4688; SSE2-NEXT:    movdqa 16(%rdi), %xmm1
4689; SSE2-NEXT:    pxor %xmm0, %xmm1
4690; SSE2-NEXT:    pxor (%rdi), %xmm0
4691; SSE2-NEXT:    movdqa %xmm0, (%rsi)
4692; SSE2-NEXT:    movq %xmm1, 16(%rsi)
4693; SSE2-NEXT:    movq %xmm1, 16(%rdx)
4694; SSE2-NEXT:    movdqa %xmm0, (%rdx)
4695; SSE2-NEXT:    movq %xmm1, 48(%rdx)
4696; SSE2-NEXT:    movdqu %xmm0, 32(%rdx)
4697; SSE2-NEXT:    retq
4698;
4699; AVX1-LABEL: vec384_v12i16:
4700; AVX1:       # %bb.0:
4701; AVX1-NEXT:    vxorps %xmm0, %xmm0, %xmm0
4702; AVX1-NEXT:    vcmptrueps %ymm0, %ymm0, %ymm0
4703; AVX1-NEXT:    vxorps (%rdi), %ymm0, %ymm0
4704; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
4705; AVX1-NEXT:    vmovlps %xmm1, 16(%rsi)
4706; AVX1-NEXT:    vmovaps %xmm0, (%rsi)
4707; AVX1-NEXT:    vmovlps %xmm1, 16(%rdx)
4708; AVX1-NEXT:    vmovaps %xmm0, (%rdx)
4709; AVX1-NEXT:    vmovlps %xmm1, 48(%rdx)
4710; AVX1-NEXT:    vmovups %xmm0, 32(%rdx)
4711; AVX1-NEXT:    vzeroupper
4712; AVX1-NEXT:    retq
4713;
4714; AVX2-LABEL: vec384_v12i16:
4715; AVX2:       # %bb.0:
4716; AVX2-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
4717; AVX2-NEXT:    vpxor (%rdi), %ymm0, %ymm0
4718; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
4719; AVX2-NEXT:    vmovq %xmm1, 16(%rsi)
4720; AVX2-NEXT:    vmovdqa %xmm0, (%rsi)
4721; AVX2-NEXT:    vmovq %xmm1, 16(%rdx)
4722; AVX2-NEXT:    vmovdqa %xmm0, (%rdx)
4723; AVX2-NEXT:    vmovq %xmm1, 48(%rdx)
4724; AVX2-NEXT:    vmovdqu %xmm0, 32(%rdx)
4725; AVX2-NEXT:    vzeroupper
4726; AVX2-NEXT:    retq
4727  %in.subvec.not = load <12 x i16>, ptr %in.subvec.ptr, align 64
4728  %in.subvec = xor <12 x i16> %in.subvec.not, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
4729  store <12 x i16> %in.subvec, ptr %out.subvec.ptr, align 64
4730  %out.subvec0.ptr = getelementptr <12 x i16>, ptr %out.vec.ptr, i64 0
4731  store <12 x i16> %in.subvec, ptr %out.subvec0.ptr, align 64
4732  %out.subvec1.ptr = getelementptr <12 x i16>, ptr %out.vec.ptr, i64 1
4733  store <12 x i16> %in.subvec, ptr %out.subvec1.ptr, align 8
4734  ret void
4735}
4736
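; <16 x i8> subvector: inverted, stored on its own, then splatted into 3 consecutive subvector slots of the output vector.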
4737define void @vec384_v16i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
4738; SCALAR-LABEL: vec384_v16i8:
4739; SCALAR:       # %bb.0:
4740; SCALAR-NEXT:    pushq %rbp
4741; SCALAR-NEXT:    pushq %r15
4742; SCALAR-NEXT:    pushq %r14
4743; SCALAR-NEXT:    pushq %r13
4744; SCALAR-NEXT:    pushq %r12
4745; SCALAR-NEXT:    pushq %rbx
4746; SCALAR-NEXT:    movzbl 15(%rdi), %eax
4747; SCALAR-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
4748; SCALAR-NEXT:    movzbl 14(%rdi), %eax
4749; SCALAR-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
4750; SCALAR-NEXT:    movzbl 13(%rdi), %eax
4751; SCALAR-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
4752; SCALAR-NEXT:    movzbl 12(%rdi), %r11d
4753; SCALAR-NEXT:    movzbl 11(%rdi), %r13d
4754; SCALAR-NEXT:    movzbl 10(%rdi), %r12d
4755; SCALAR-NEXT:    movzbl 9(%rdi), %ebp
4756; SCALAR-NEXT:    movzbl 8(%rdi), %r14d
4757; SCALAR-NEXT:    movzbl 7(%rdi), %ebx
4758; SCALAR-NEXT:    movzbl 6(%rdi), %r10d
4759; SCALAR-NEXT:    movzbl 5(%rdi), %r15d
4760; SCALAR-NEXT:    movzbl 4(%rdi), %r9d
4761; SCALAR-NEXT:    movzbl 3(%rdi), %r8d
4762; SCALAR-NEXT:    movzbl 2(%rdi), %ecx
4763; SCALAR-NEXT:    movzbl (%rdi), %eax
4764; SCALAR-NEXT:    movzbl 1(%rdi), %edi
4765; SCALAR-NEXT:    notb %al
4766; SCALAR-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
4767; SCALAR-NEXT:    notb %dil
4768; SCALAR-NEXT:    movb %dil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
4769; SCALAR-NEXT:    notb %cl
4770; SCALAR-NEXT:    movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
4771; SCALAR-NEXT:    notb %r8b
4772; SCALAR-NEXT:    movb %r8b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
4773; SCALAR-NEXT:    notb %r9b
4774; SCALAR-NEXT:    movb %r9b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
4775; SCALAR-NEXT:    movl %r15d, %r9d
4776; SCALAR-NEXT:    notb %r9b
4777; SCALAR-NEXT:    notb %r10b
4778; SCALAR-NEXT:    notb %bl
4779; SCALAR-NEXT:    movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
4780; SCALAR-NEXT:    notb %r14b
4781; SCALAR-NEXT:    notb %bpl
4782; SCALAR-NEXT:    movl %ebp, %r15d
4783; SCALAR-NEXT:    notb %r12b
4784; SCALAR-NEXT:    movb %r12b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
4785; SCALAR-NEXT:    notb %r13b
4786; SCALAR-NEXT:    movb %r13b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
4787; SCALAR-NEXT:    notb %r11b
4788; SCALAR-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 1-byte Folded Reload
4789; SCALAR-NEXT:    notb %dil
4790; SCALAR-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
4791; SCALAR-NEXT:    notb %cl
4792; SCALAR-NEXT:    movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
4793; SCALAR-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r8d # 1-byte Folded Reload
4794; SCALAR-NEXT:    notb %r8b
4795; SCALAR-NEXT:    movb %r8b, 15(%rsi)
4796; SCALAR-NEXT:    movb %cl, 14(%rsi)
4797; SCALAR-NEXT:    movl %edi, %eax
4798; SCALAR-NEXT:    movb %dil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
4799; SCALAR-NEXT:    movb %dil, 13(%rsi)
4800; SCALAR-NEXT:    movb %r11b, 12(%rsi)
4801; SCALAR-NEXT:    movl %r11d, %ebp
4802; SCALAR-NEXT:    movb %r11b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
4803; SCALAR-NEXT:    movb %r13b, 11(%rsi)
4804; SCALAR-NEXT:    movb %r12b, 10(%rsi)
4805; SCALAR-NEXT:    movb %r15b, 9(%rsi)
4806; SCALAR-NEXT:    movb %r14b, 8(%rsi)
4807; SCALAR-NEXT:    movb %bl, 7(%rsi)
4808; SCALAR-NEXT:    movb %r10b, 6(%rsi)
4809; SCALAR-NEXT:    movl %r10d, %ebx
4810; SCALAR-NEXT:    movb %r10b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
4811; SCALAR-NEXT:    movb %r9b, 5(%rsi)
4812; SCALAR-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r11d # 1-byte Folded Reload
4813; SCALAR-NEXT:    movb %r11b, 4(%rsi)
4814; SCALAR-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r12d # 1-byte Folded Reload
4815; SCALAR-NEXT:    movb %r12b, 3(%rsi)
4816; SCALAR-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
4817; SCALAR-NEXT:    movb %cl, 2(%rsi)
4818; SCALAR-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r13d # 1-byte Folded Reload
4819; SCALAR-NEXT:    movb %r13b, 1(%rsi)
4820; SCALAR-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r10d # 1-byte Folded Reload
4821; SCALAR-NEXT:    movb %r10b, (%rsi)
4822; SCALAR-NEXT:    movb %r8b, 15(%rdx)
4823; SCALAR-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 1-byte Folded Reload
4824; SCALAR-NEXT:    movb %dil, 14(%rdx)
4825; SCALAR-NEXT:    movb %al, 13(%rdx)
4826; SCALAR-NEXT:    movb %bpl, 12(%rdx)
4827; SCALAR-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
4828; SCALAR-NEXT:    movb %al, 11(%rdx)
4829; SCALAR-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
4830; SCALAR-NEXT:    movb %al, 10(%rdx)
4831; SCALAR-NEXT:    movb %r15b, 9(%rdx)
4832; SCALAR-NEXT:    movb %r14b, 8(%rdx)
4833; SCALAR-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebp # 1-byte Folded Reload
4834; SCALAR-NEXT:    movb %bpl, 7(%rdx)
4835; SCALAR-NEXT:    movb %bl, 6(%rdx)
4836; SCALAR-NEXT:    movb %r9b, 5(%rdx)
4837; SCALAR-NEXT:    movb %r9b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
4838; SCALAR-NEXT:    movb %r11b, 4(%rdx)
4839; SCALAR-NEXT:    movb %r12b, 3(%rdx)
4840; SCALAR-NEXT:    movb %cl, 2(%rdx)
4841; SCALAR-NEXT:    movl %r13d, %ebx
4842; SCALAR-NEXT:    movb %r13b, 1(%rdx)
4843; SCALAR-NEXT:    movl %r10d, %esi
4844; SCALAR-NEXT:    movb %r10b, (%rdx)
4845; SCALAR-NEXT:    movb %r8b, 31(%rdx)
4846; SCALAR-NEXT:    movb %dil, 30(%rdx)
4847; SCALAR-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
4848; SCALAR-NEXT:    movb %al, 29(%rdx)
4849; SCALAR-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r11d # 1-byte Folded Reload
4850; SCALAR-NEXT:    movb %r11b, 28(%rdx)
4851; SCALAR-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r13d # 1-byte Folded Reload
4852; SCALAR-NEXT:    movb %r13b, 27(%rdx)
4853; SCALAR-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r12d # 1-byte Folded Reload
4854; SCALAR-NEXT:    movb %r12b, 26(%rdx)
4855; SCALAR-NEXT:    movb %r15b, 25(%rdx)
4856; SCALAR-NEXT:    movb %r14b, 24(%rdx)
4857; SCALAR-NEXT:    movb %bpl, 23(%rdx)
4858; SCALAR-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r10d # 1-byte Folded Reload
4859; SCALAR-NEXT:    movb %r10b, 22(%rdx)
4860; SCALAR-NEXT:    movb %r9b, 21(%rdx)
4861; SCALAR-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r9d # 1-byte Folded Reload
4862; SCALAR-NEXT:    movb %r9b, 20(%rdx)
4863; SCALAR-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 1-byte Folded Reload
4864; SCALAR-NEXT:    movb %dil, 19(%rdx)
4865; SCALAR-NEXT:    movb %cl, 18(%rdx)
4866; SCALAR-NEXT:    movb %bl, 17(%rdx)
4867; SCALAR-NEXT:    movb %sil, 16(%rdx)
4868; SCALAR-NEXT:    movb %r8b, 47(%rdx)
4869; SCALAR-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r8d # 1-byte Folded Reload
4870; SCALAR-NEXT:    movb %r8b, 46(%rdx)
4871; SCALAR-NEXT:    movb %al, 45(%rdx)
4872; SCALAR-NEXT:    movb %r11b, 44(%rdx)
4873; SCALAR-NEXT:    movb %r13b, 43(%rdx)
4874; SCALAR-NEXT:    movb %r12b, 42(%rdx)
4875; SCALAR-NEXT:    movb %r15b, 41(%rdx)
4876; SCALAR-NEXT:    movb %r14b, 40(%rdx)
4877; SCALAR-NEXT:    movb %bpl, 39(%rdx)
4878; SCALAR-NEXT:    movb %r10b, 38(%rdx)
4879; SCALAR-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
4880; SCALAR-NEXT:    movb %al, 37(%rdx)
4881; SCALAR-NEXT:    movb %r9b, 36(%rdx)
4882; SCALAR-NEXT:    movb %dil, 35(%rdx)
4883; SCALAR-NEXT:    movb %cl, 34(%rdx)
4884; SCALAR-NEXT:    movb %bl, 33(%rdx)
4885; SCALAR-NEXT:    movb %sil, 32(%rdx)
4886; SCALAR-NEXT:    popq %rbx
4887; SCALAR-NEXT:    popq %r12
4888; SCALAR-NEXT:    popq %r13
4889; SCALAR-NEXT:    popq %r14
4890; SCALAR-NEXT:    popq %r15
4891; SCALAR-NEXT:    popq %rbp
4892; SCALAR-NEXT:    retq
4893;
4894; SSE2-LABEL: vec384_v16i8:
4895; SSE2:       # %bb.0:
4896; SSE2-NEXT:    pcmpeqd %xmm0, %xmm0
4897; SSE2-NEXT:    pxor (%rdi), %xmm0
4898; SSE2-NEXT:    movdqa %xmm0, (%rsi)
4899; SSE2-NEXT:    movdqa %xmm0, (%rdx)
4900; SSE2-NEXT:    movdqa %xmm0, 16(%rdx)
4901; SSE2-NEXT:    movdqa %xmm0, 32(%rdx)
4902; SSE2-NEXT:    retq
4903;
4904; AVX-LABEL: vec384_v16i8:
4905; AVX:       # %bb.0:
4906; AVX-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
4907; AVX-NEXT:    vpxor (%rdi), %xmm0, %xmm0
4908; AVX-NEXT:    vmovdqa %xmm0, (%rsi)
4909; AVX-NEXT:    vmovdqa %xmm0, (%rdx)
4910; AVX-NEXT:    vmovdqa %xmm0, 16(%rdx)
4911; AVX-NEXT:    vmovdqa %xmm0, 32(%rdx)
4912; AVX-NEXT:    retq
4913  %in.subvec.not = load <16 x i8>, ptr %in.subvec.ptr, align 64
4914  %in.subvec = xor <16 x i8> %in.subvec.not, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
4915  store <16 x i8> %in.subvec, ptr %out.subvec.ptr, align 64
4916  %out.subvec0.ptr = getelementptr <16 x i8>, ptr %out.vec.ptr, i64 0
4917  store <16 x i8> %in.subvec, ptr %out.subvec0.ptr, align 64
4918  %out.subvec1.ptr = getelementptr <16 x i8>, ptr %out.vec.ptr, i64 1
4919  store <16 x i8> %in.subvec, ptr %out.subvec1.ptr, align 16
4920  %out.subvec2.ptr = getelementptr <16 x i8>, ptr %out.vec.ptr, i64 2
4921  store <16 x i8> %in.subvec, ptr %out.subvec2.ptr, align 32
4922  ret void
4923}
4924
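; <24 x i8> subvector: inverted, stored on its own, then splatted into 2 consecutive subvector slots of the output vector.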
4925define void @vec384_v24i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
4926; SCALAR-LABEL: vec384_v24i8:
4927; SCALAR:       # %bb.0:
4928; SCALAR-NEXT:    movq (%rdi), %rax
4929; SCALAR-NEXT:    movq 8(%rdi), %rcx
4930; SCALAR-NEXT:    movq 16(%rdi), %rdi
4931; SCALAR-NEXT:    notq %rdi
4932; SCALAR-NEXT:    notq %rcx
4933; SCALAR-NEXT:    notq %rax
4934; SCALAR-NEXT:    movq %rax, (%rsi)
4935; SCALAR-NEXT:    movq %rcx, 8(%rsi)
4936; SCALAR-NEXT:    movq %rdi, 16(%rsi)
4937; SCALAR-NEXT:    movq %rax, (%rdx)
4938; SCALAR-NEXT:    movq %rcx, 8(%rdx)
4939; SCALAR-NEXT:    movq %rdi, 16(%rdx)
4940; SCALAR-NEXT:    movq %rdi, 48(%rdx)
4941; SCALAR-NEXT:    movq %rcx, 40(%rdx)
4942; SCALAR-NEXT:    movq %rax, 32(%rdx)
4943; SCALAR-NEXT:    retq
4944;
4945; SSE2-LABEL: vec384_v24i8:
4946; SSE2:       # %bb.0:
4947; SSE2-NEXT:    pcmpeqd %xmm0, %xmm0
4948; SSE2-NEXT:    movdqa 16(%rdi), %xmm1
4949; SSE2-NEXT:    pxor %xmm0, %xmm1
4950; SSE2-NEXT:    pxor (%rdi), %xmm0
4951; SSE2-NEXT:    movdqa %xmm0, (%rsi)
4952; SSE2-NEXT:    movq %xmm1, 16(%rsi)
4953; SSE2-NEXT:    movq %xmm1, 16(%rdx)
4954; SSE2-NEXT:    movdqa %xmm0, (%rdx)
4955; SSE2-NEXT:    movq %xmm1, 48(%rdx)
4956; SSE2-NEXT:    movdqu %xmm0, 32(%rdx)
4957; SSE2-NEXT:    retq
4958;
4959; AVX1-LABEL: vec384_v24i8:
4960; AVX1:       # %bb.0:
4961; AVX1-NEXT:    vxorps %xmm0, %xmm0, %xmm0
4962; AVX1-NEXT:    vcmptrueps %ymm0, %ymm0, %ymm0
4963; AVX1-NEXT:    vxorps (%rdi), %ymm0, %ymm0
4964; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
4965; AVX1-NEXT:    vmovlps %xmm1, 16(%rsi)
4966; AVX1-NEXT:    vmovaps %xmm0, (%rsi)
4967; AVX1-NEXT:    vmovlps %xmm1, 16(%rdx)
4968; AVX1-NEXT:    vmovaps %xmm0, (%rdx)
4969; AVX1-NEXT:    vmovlps %xmm1, 48(%rdx)
4970; AVX1-NEXT:    vmovups %xmm0, 32(%rdx)
4971; AVX1-NEXT:    vzeroupper
4972; AVX1-NEXT:    retq
4973;
4974; AVX2-LABEL: vec384_v24i8:
4975; AVX2:       # %bb.0:
4976; AVX2-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
4977; AVX2-NEXT:    vpxor (%rdi), %ymm0, %ymm0
4978; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
4979; AVX2-NEXT:    vmovq %xmm1, 16(%rsi)
4980; AVX2-NEXT:    vmovdqa %xmm0, (%rsi)
4981; AVX2-NEXT:    vmovq %xmm1, 16(%rdx)
4982; AVX2-NEXT:    vmovdqa %xmm0, (%rdx)
4983; AVX2-NEXT:    vmovq %xmm1, 48(%rdx)
4984; AVX2-NEXT:    vmovdqu %xmm0, 32(%rdx)
4985; AVX2-NEXT:    vzeroupper
4986; AVX2-NEXT:    retq
4987  %in.subvec.not = load <24 x i8>, ptr %in.subvec.ptr, align 64
4988  %in.subvec = xor <24 x i8> %in.subvec.not, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
4989  store <24 x i8> %in.subvec, ptr %out.subvec.ptr, align 64
4990  %out.subvec0.ptr = getelementptr <24 x i8>, ptr %out.vec.ptr, i64 0
4991  store <24 x i8> %in.subvec, ptr %out.subvec0.ptr, align 64
4992  %out.subvec1.ptr = getelementptr <24 x i8>, ptr %out.vec.ptr, i64 1
4993  store <24 x i8> %in.subvec, ptr %out.subvec1.ptr, align 8
4994  ret void
4995}
4996
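; <2 x i8> subvector: inverted, stored on its own, then splatted into 32 consecutive subvector slots of the 512-bit output vector.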
4997define void @vec512_v2i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
4998; SCALAR-LABEL: vec512_v2i8:
4999; SCALAR:       # %bb.0:
5000; SCALAR-NEXT:    movzbl (%rdi), %eax
5001; SCALAR-NEXT:    movzbl 1(%rdi), %ecx
5002; SCALAR-NEXT:    notb %al
5003; SCALAR-NEXT:    notb %cl
5004; SCALAR-NEXT:    movb %cl, 1(%rsi)
5005; SCALAR-NEXT:    movb %al, (%rsi)
5006; SCALAR-NEXT:    movb %cl, 1(%rdx)
5007; SCALAR-NEXT:    movb %al, (%rdx)
5008; SCALAR-NEXT:    movb %cl, 3(%rdx)
5009; SCALAR-NEXT:    movb %al, 2(%rdx)
5010; SCALAR-NEXT:    movb %cl, 5(%rdx)
5011; SCALAR-NEXT:    movb %al, 4(%rdx)
5012; SCALAR-NEXT:    movb %cl, 7(%rdx)
5013; SCALAR-NEXT:    movb %al, 6(%rdx)
5014; SCALAR-NEXT:    movb %cl, 9(%rdx)
5015; SCALAR-NEXT:    movb %al, 8(%rdx)
5016; SCALAR-NEXT:    movb %cl, 11(%rdx)
5017; SCALAR-NEXT:    movb %al, 10(%rdx)
5018; SCALAR-NEXT:    movb %cl, 13(%rdx)
5019; SCALAR-NEXT:    movb %al, 12(%rdx)
5020; SCALAR-NEXT:    movb %cl, 15(%rdx)
5021; SCALAR-NEXT:    movb %al, 14(%rdx)
5022; SCALAR-NEXT:    movb %cl, 17(%rdx)
5023; SCALAR-NEXT:    movb %al, 16(%rdx)
5024; SCALAR-NEXT:    movb %cl, 19(%rdx)
5025; SCALAR-NEXT:    movb %al, 18(%rdx)
5026; SCALAR-NEXT:    movb %cl, 21(%rdx)
5027; SCALAR-NEXT:    movb %al, 20(%rdx)
5028; SCALAR-NEXT:    movb %cl, 23(%rdx)
5029; SCALAR-NEXT:    movb %al, 22(%rdx)
5030; SCALAR-NEXT:    movb %cl, 25(%rdx)
5031; SCALAR-NEXT:    movb %al, 24(%rdx)
5032; SCALAR-NEXT:    movb %cl, 27(%rdx)
5033; SCALAR-NEXT:    movb %al, 26(%rdx)
5034; SCALAR-NEXT:    movb %cl, 29(%rdx)
5035; SCALAR-NEXT:    movb %al, 28(%rdx)
5036; SCALAR-NEXT:    movb %cl, 31(%rdx)
5037; SCALAR-NEXT:    movb %al, 30(%rdx)
5038; SCALAR-NEXT:    movb %cl, 33(%rdx)
5039; SCALAR-NEXT:    movb %al, 32(%rdx)
5040; SCALAR-NEXT:    movb %cl, 35(%rdx)
5041; SCALAR-NEXT:    movb %al, 34(%rdx)
5042; SCALAR-NEXT:    movb %cl, 37(%rdx)
5043; SCALAR-NEXT:    movb %al, 36(%rdx)
5044; SCALAR-NEXT:    movb %cl, 39(%rdx)
5045; SCALAR-NEXT:    movb %al, 38(%rdx)
5046; SCALAR-NEXT:    movb %cl, 41(%rdx)
5047; SCALAR-NEXT:    movb %al, 40(%rdx)
5048; SCALAR-NEXT:    movb %cl, 43(%rdx)
5049; SCALAR-NEXT:    movb %al, 42(%rdx)
5050; SCALAR-NEXT:    movb %cl, 45(%rdx)
5051; SCALAR-NEXT:    movb %al, 44(%rdx)
5052; SCALAR-NEXT:    movb %cl, 47(%rdx)
5053; SCALAR-NEXT:    movb %al, 46(%rdx)
5054; SCALAR-NEXT:    movb %cl, 49(%rdx)
5055; SCALAR-NEXT:    movb %al, 48(%rdx)
5056; SCALAR-NEXT:    movb %cl, 51(%rdx)
5057; SCALAR-NEXT:    movb %al, 50(%rdx)
5058; SCALAR-NEXT:    movb %cl, 53(%rdx)
5059; SCALAR-NEXT:    movb %al, 52(%rdx)
5060; SCALAR-NEXT:    movb %cl, 55(%rdx)
5061; SCALAR-NEXT:    movb %al, 54(%rdx)
5062; SCALAR-NEXT:    movb %cl, 57(%rdx)
5063; SCALAR-NEXT:    movb %al, 56(%rdx)
5064; SCALAR-NEXT:    movb %cl, 59(%rdx)
5065; SCALAR-NEXT:    movb %al, 58(%rdx)
5066; SCALAR-NEXT:    movb %cl, 61(%rdx)
5067; SCALAR-NEXT:    movb %al, 60(%rdx)
5068; SCALAR-NEXT:    movb %cl, 63(%rdx)
5069; SCALAR-NEXT:    movb %al, 62(%rdx)
5070; SCALAR-NEXT:    retq
5071;
5072; SSE2-ONLY-LABEL: vec512_v2i8:
5073; SSE2-ONLY:       # %bb.0:
5074; SSE2-ONLY-NEXT:    pcmpeqd %xmm0, %xmm0
5075; SSE2-ONLY-NEXT:    pxor (%rdi), %xmm0
5076; SSE2-ONLY-NEXT:    movd %xmm0, %eax
5077; SSE2-ONLY-NEXT:    movw %ax, (%rsi)
5078; SSE2-ONLY-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
5079; SSE2-ONLY-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
5080; SSE2-ONLY-NEXT:    movdqa %xmm0, (%rdx)
5081; SSE2-ONLY-NEXT:    movdqa %xmm0, 16(%rdx)
5082; SSE2-ONLY-NEXT:    movdqa %xmm0, 32(%rdx)
5083; SSE2-ONLY-NEXT:    movdqa %xmm0, 48(%rdx)
5084; SSE2-ONLY-NEXT:    retq
5085;
5086; SSE3-LABEL: vec512_v2i8:
5087; SSE3:       # %bb.0:
5088; SSE3-NEXT:    pcmpeqd %xmm0, %xmm0
5089; SSE3-NEXT:    pxor (%rdi), %xmm0
5090; SSE3-NEXT:    movd %xmm0, %eax
5091; SSE3-NEXT:    movw %ax, (%rsi)
5092; SSE3-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
5093; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
5094; SSE3-NEXT:    movdqa %xmm0, (%rdx)
5095; SSE3-NEXT:    movdqa %xmm0, 16(%rdx)
5096; SSE3-NEXT:    movdqa %xmm0, 32(%rdx)
5097; SSE3-NEXT:    movdqa %xmm0, 48(%rdx)
5098; SSE3-NEXT:    retq
5099;
5100; SSSE3-ONLY-LABEL: vec512_v2i8:
5101; SSSE3-ONLY:       # %bb.0:
5102; SSSE3-ONLY-NEXT:    pcmpeqd %xmm0, %xmm0
5103; SSSE3-ONLY-NEXT:    pxor (%rdi), %xmm0
5104; SSSE3-ONLY-NEXT:    movd %xmm0, %eax
5105; SSSE3-ONLY-NEXT:    movw %ax, (%rsi)
5106; SSSE3-ONLY-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
5107; SSSE3-ONLY-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
5108; SSSE3-ONLY-NEXT:    movdqa %xmm0, (%rdx)
5109; SSSE3-ONLY-NEXT:    movdqa %xmm0, 16(%rdx)
5110; SSSE3-ONLY-NEXT:    movdqa %xmm0, 32(%rdx)
5111; SSSE3-ONLY-NEXT:    movdqa %xmm0, 48(%rdx)
5112; SSSE3-ONLY-NEXT:    retq
5113;
5114; SSE41-LABEL: vec512_v2i8:
5115; SSE41:       # %bb.0:
5116; SSE41-NEXT:    pcmpeqd %xmm0, %xmm0
5117; SSE41-NEXT:    pxor (%rdi), %xmm0
5118; SSE41-NEXT:    pextrw $0, %xmm0, (%rsi)
5119; SSE41-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
5120; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
5121; SSE41-NEXT:    movdqa %xmm0, (%rdx)
5122; SSE41-NEXT:    movdqa %xmm0, 16(%rdx)
5123; SSE41-NEXT:    movdqa %xmm0, 32(%rdx)
5124; SSE41-NEXT:    movdqa %xmm0, 48(%rdx)
5125; SSE41-NEXT:    retq
5126;
5127; SSE42-LABEL: vec512_v2i8:
5128; SSE42:       # %bb.0:
5129; SSE42-NEXT:    pcmpeqd %xmm0, %xmm0
5130; SSE42-NEXT:    pxor (%rdi), %xmm0
5131; SSE42-NEXT:    pextrw $0, %xmm0, (%rsi)
5132; SSE42-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
5133; SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
5134; SSE42-NEXT:    movdqa %xmm0, (%rdx)
5135; SSE42-NEXT:    movdqa %xmm0, 16(%rdx)
5136; SSE42-NEXT:    movdqa %xmm0, 32(%rdx)
5137; SSE42-NEXT:    movdqa %xmm0, 48(%rdx)
5138; SSE42-NEXT:    retq
5139;
5140; AVX1-LABEL: vec512_v2i8:
5141; AVX1:       # %bb.0:
5142; AVX1-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
5143; AVX1-NEXT:    vpxor (%rdi), %xmm0, %xmm0
5144; AVX1-NEXT:    vpextrw $0, %xmm0, (%rsi)
5145; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
5146; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
5147; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
5148; AVX1-NEXT:    vmovaps %ymm0, (%rdx)
5149; AVX1-NEXT:    vmovaps %ymm0, 32(%rdx)
5150; AVX1-NEXT:    vzeroupper
5151; AVX1-NEXT:    retq
5152;
5153; AVX2-ONLY-LABEL: vec512_v2i8:
5154; AVX2-ONLY:       # %bb.0:
5155; AVX2-ONLY-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
5156; AVX2-ONLY-NEXT:    vpxor (%rdi), %xmm0, %xmm0
5157; AVX2-ONLY-NEXT:    vpextrw $0, %xmm0, (%rsi)
5158; AVX2-ONLY-NEXT:    vpbroadcastw %xmm0, %ymm0
5159; AVX2-ONLY-NEXT:    vmovdqa %ymm0, (%rdx)
5160; AVX2-ONLY-NEXT:    vmovdqa %ymm0, 32(%rdx)
5161; AVX2-ONLY-NEXT:    vzeroupper
5162; AVX2-ONLY-NEXT:    retq
5163;
5164; AVX512F-LABEL: vec512_v2i8:
5165; AVX512F:       # %bb.0:
5166; AVX512F-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
5167; AVX512F-NEXT:    vpxor (%rdi), %xmm0, %xmm0
5168; AVX512F-NEXT:    vpextrw $0, %xmm0, (%rsi)
5169; AVX512F-NEXT:    vpbroadcastw %xmm0, %ymm0
5170; AVX512F-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
5171; AVX512F-NEXT:    vmovdqa64 %zmm0, (%rdx)
5172; AVX512F-NEXT:    vzeroupper
5173; AVX512F-NEXT:    retq
5174;
5175; AVX512BW-LABEL: vec512_v2i8:
5176; AVX512BW:       # %bb.0:
5177; AVX512BW-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
5178; AVX512BW-NEXT:    vpxor (%rdi), %xmm0, %xmm0
5179; AVX512BW-NEXT:    vpextrw $0, %xmm0, (%rsi)
5180; AVX512BW-NEXT:    vpbroadcastw %xmm0, %zmm0
5181; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rdx)
5182; AVX512BW-NEXT:    vzeroupper
5183; AVX512BW-NEXT:    retq
5184  %in.subvec.not = load <2 x i8>, ptr %in.subvec.ptr, align 64
5185  %in.subvec = xor <2 x i8> %in.subvec.not, <i8 -1, i8 -1>
5186  store <2 x i8> %in.subvec, ptr %out.subvec.ptr, align 64
5187  %out.subvec0.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 0
5188  store <2 x i8> %in.subvec, ptr %out.subvec0.ptr, align 64
5189  %out.subvec1.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 1
5190  store <2 x i8> %in.subvec, ptr %out.subvec1.ptr, align 2
5191  %out.subvec2.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 2
5192  store <2 x i8> %in.subvec, ptr %out.subvec2.ptr, align 4
5193  %out.subvec3.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 3
5194  store <2 x i8> %in.subvec, ptr %out.subvec3.ptr, align 2
5195  %out.subvec4.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 4
5196  store <2 x i8> %in.subvec, ptr %out.subvec4.ptr, align 8
5197  %out.subvec5.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 5
5198  store <2 x i8> %in.subvec, ptr %out.subvec5.ptr, align 2
5199  %out.subvec6.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 6
5200  store <2 x i8> %in.subvec, ptr %out.subvec6.ptr, align 4
5201  %out.subvec7.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 7
5202  store <2 x i8> %in.subvec, ptr %out.subvec7.ptr, align 2
5203  %out.subvec8.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 8
5204  store <2 x i8> %in.subvec, ptr %out.subvec8.ptr, align 16
5205  %out.subvec9.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 9
5206  store <2 x i8> %in.subvec, ptr %out.subvec9.ptr, align 2
5207  %out.subvec10.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 10
5208  store <2 x i8> %in.subvec, ptr %out.subvec10.ptr, align 4
5209  %out.subvec11.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 11
5210  store <2 x i8> %in.subvec, ptr %out.subvec11.ptr, align 2
5211  %out.subvec12.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 12
5212  store <2 x i8> %in.subvec, ptr %out.subvec12.ptr, align 8
5213  %out.subvec13.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 13
5214  store <2 x i8> %in.subvec, ptr %out.subvec13.ptr, align 2
5215  %out.subvec14.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 14
5216  store <2 x i8> %in.subvec, ptr %out.subvec14.ptr, align 4
5217  %out.subvec15.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 15
5218  store <2 x i8> %in.subvec, ptr %out.subvec15.ptr, align 2
5219  %out.subvec16.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 16
5220  store <2 x i8> %in.subvec, ptr %out.subvec16.ptr, align 32
5221  %out.subvec17.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 17
5222  store <2 x i8> %in.subvec, ptr %out.subvec17.ptr, align 2
5223  %out.subvec18.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 18
5224  store <2 x i8> %in.subvec, ptr %out.subvec18.ptr, align 4
5225  %out.subvec19.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 19
5226  store <2 x i8> %in.subvec, ptr %out.subvec19.ptr, align 2
5227  %out.subvec20.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 20
5228  store <2 x i8> %in.subvec, ptr %out.subvec20.ptr, align 8
5229  %out.subvec21.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 21
5230  store <2 x i8> %in.subvec, ptr %out.subvec21.ptr, align 2
5231  %out.subvec22.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 22
5232  store <2 x i8> %in.subvec, ptr %out.subvec22.ptr, align 4
5233  %out.subvec23.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 23
5234  store <2 x i8> %in.subvec, ptr %out.subvec23.ptr, align 2
5235  %out.subvec24.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 24
5236  store <2 x i8> %in.subvec, ptr %out.subvec24.ptr, align 16
5237  %out.subvec25.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 25
5238  store <2 x i8> %in.subvec, ptr %out.subvec25.ptr, align 2
5239  %out.subvec26.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 26
5240  store <2 x i8> %in.subvec, ptr %out.subvec26.ptr, align 4
5241  %out.subvec27.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 27
5242  store <2 x i8> %in.subvec, ptr %out.subvec27.ptr, align 2
5243  %out.subvec28.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 28
5244  store <2 x i8> %in.subvec, ptr %out.subvec28.ptr, align 8
5245  %out.subvec29.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 29
5246  store <2 x i8> %in.subvec, ptr %out.subvec29.ptr, align 2
5247  %out.subvec30.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 30
5248  store <2 x i8> %in.subvec, ptr %out.subvec30.ptr, align 4
5249  %out.subvec31.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 31
5250  store <2 x i8> %in.subvec, ptr %out.subvec31.ptr, align 2
5251  ret void
5252}
5253
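; <2 x i16>: the negated 4-byte subvector is stored once to %out.subvec.ptr and
; splatted into sixteen consecutive slots of the 64-byte %out.vec.ptr. SSE2
; broadcasts the dword with pshufd, AVX2/AVX512 with vpbroadcastd; the scalar
; run falls back to sixteen pairs of 16-bit stores.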
5254define void @vec512_v2i16(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
5255; SCALAR-LABEL: vec512_v2i16:
5256; SCALAR:       # %bb.0:
5257; SCALAR-NEXT:    movzwl 2(%rdi), %ecx
5258; SCALAR-NEXT:    movl (%rdi), %eax
5259; SCALAR-NEXT:    notl %eax
5260; SCALAR-NEXT:    notl %ecx
5261; SCALAR-NEXT:    movw %cx, 2(%rsi)
5262; SCALAR-NEXT:    movw %ax, (%rsi)
5263; SCALAR-NEXT:    movw %cx, 2(%rdx)
5264; SCALAR-NEXT:    movw %ax, (%rdx)
5265; SCALAR-NEXT:    movw %cx, 6(%rdx)
5266; SCALAR-NEXT:    movw %ax, 4(%rdx)
5267; SCALAR-NEXT:    movw %cx, 10(%rdx)
5268; SCALAR-NEXT:    movw %ax, 8(%rdx)
5269; SCALAR-NEXT:    movw %cx, 14(%rdx)
5270; SCALAR-NEXT:    movw %ax, 12(%rdx)
5271; SCALAR-NEXT:    movw %cx, 18(%rdx)
5272; SCALAR-NEXT:    movw %ax, 16(%rdx)
5273; SCALAR-NEXT:    movw %cx, 22(%rdx)
5274; SCALAR-NEXT:    movw %ax, 20(%rdx)
5275; SCALAR-NEXT:    movw %cx, 26(%rdx)
5276; SCALAR-NEXT:    movw %ax, 24(%rdx)
5277; SCALAR-NEXT:    movw %cx, 30(%rdx)
5278; SCALAR-NEXT:    movw %ax, 28(%rdx)
5279; SCALAR-NEXT:    movw %cx, 34(%rdx)
5280; SCALAR-NEXT:    movw %ax, 32(%rdx)
5281; SCALAR-NEXT:    movw %cx, 38(%rdx)
5282; SCALAR-NEXT:    movw %ax, 36(%rdx)
5283; SCALAR-NEXT:    movw %cx, 42(%rdx)
5284; SCALAR-NEXT:    movw %ax, 40(%rdx)
5285; SCALAR-NEXT:    movw %cx, 46(%rdx)
5286; SCALAR-NEXT:    movw %ax, 44(%rdx)
5287; SCALAR-NEXT:    movw %cx, 50(%rdx)
5288; SCALAR-NEXT:    movw %ax, 48(%rdx)
5289; SCALAR-NEXT:    movw %cx, 54(%rdx)
5290; SCALAR-NEXT:    movw %ax, 52(%rdx)
5291; SCALAR-NEXT:    movw %cx, 58(%rdx)
5292; SCALAR-NEXT:    movw %ax, 56(%rdx)
5293; SCALAR-NEXT:    movw %cx, 62(%rdx)
5294; SCALAR-NEXT:    movw %ax, 60(%rdx)
5295; SCALAR-NEXT:    retq
5296;
5297; SSE2-LABEL: vec512_v2i16:
5298; SSE2:       # %bb.0:
5299; SSE2-NEXT:    pcmpeqd %xmm0, %xmm0
5300; SSE2-NEXT:    pxor (%rdi), %xmm0
5301; SSE2-NEXT:    movd %xmm0, (%rsi)
5302; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
5303; SSE2-NEXT:    movdqa %xmm0, (%rdx)
5304; SSE2-NEXT:    movdqa %xmm0, 16(%rdx)
5305; SSE2-NEXT:    movdqa %xmm0, 32(%rdx)
5306; SSE2-NEXT:    movdqa %xmm0, 48(%rdx)
5307; SSE2-NEXT:    retq
5308;
5309; AVX1-LABEL: vec512_v2i16:
5310; AVX1:       # %bb.0:
5311; AVX1-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
5312; AVX1-NEXT:    vpxor (%rdi), %xmm0, %xmm0
5313; AVX1-NEXT:    vmovd %xmm0, (%rsi)
5314; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
5315; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
5316; AVX1-NEXT:    vmovaps %ymm0, (%rdx)
5317; AVX1-NEXT:    vmovaps %ymm0, 32(%rdx)
5318; AVX1-NEXT:    vzeroupper
5319; AVX1-NEXT:    retq
5320;
5321; AVX2-ONLY-LABEL: vec512_v2i16:
5322; AVX2-ONLY:       # %bb.0:
5323; AVX2-ONLY-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
5324; AVX2-ONLY-NEXT:    vpxor (%rdi), %xmm0, %xmm0
5325; AVX2-ONLY-NEXT:    vmovd %xmm0, (%rsi)
5326; AVX2-ONLY-NEXT:    vpbroadcastd %xmm0, %ymm0
5327; AVX2-ONLY-NEXT:    vmovdqa %ymm0, (%rdx)
5328; AVX2-ONLY-NEXT:    vmovdqa %ymm0, 32(%rdx)
5329; AVX2-ONLY-NEXT:    vzeroupper
5330; AVX2-ONLY-NEXT:    retq
5331;
5332; AVX512-LABEL: vec512_v2i16:
5333; AVX512:       # %bb.0:
5334; AVX512-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
5335; AVX512-NEXT:    vpxor (%rdi), %xmm0, %xmm0
5336; AVX512-NEXT:    vmovd %xmm0, (%rsi)
5337; AVX512-NEXT:    vpbroadcastd %xmm0, %zmm0
5338; AVX512-NEXT:    vmovdqa64 %zmm0, (%rdx)
5339; AVX512-NEXT:    vzeroupper
5340; AVX512-NEXT:    retq
5341  %in.subvec.not = load <2 x i16>, ptr %in.subvec.ptr, align 64
5342  %in.subvec = xor <2 x i16> %in.subvec.not, <i16 -1, i16 -1>
5343  store <2 x i16> %in.subvec, ptr %out.subvec.ptr, align 64
5344  %out.subvec0.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 0
5345  store <2 x i16> %in.subvec, ptr %out.subvec0.ptr, align 64
5346  %out.subvec1.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 1
5347  store <2 x i16> %in.subvec, ptr %out.subvec1.ptr, align 4
5348  %out.subvec2.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 2
5349  store <2 x i16> %in.subvec, ptr %out.subvec2.ptr, align 8
5350  %out.subvec3.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 3
5351  store <2 x i16> %in.subvec, ptr %out.subvec3.ptr, align 4
5352  %out.subvec4.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 4
5353  store <2 x i16> %in.subvec, ptr %out.subvec4.ptr, align 16
5354  %out.subvec5.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 5
5355  store <2 x i16> %in.subvec, ptr %out.subvec5.ptr, align 4
5356  %out.subvec6.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 6
5357  store <2 x i16> %in.subvec, ptr %out.subvec6.ptr, align 8
5358  %out.subvec7.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 7
5359  store <2 x i16> %in.subvec, ptr %out.subvec7.ptr, align 4
5360  %out.subvec8.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 8
5361  store <2 x i16> %in.subvec, ptr %out.subvec8.ptr, align 32
5362  %out.subvec9.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 9
5363  store <2 x i16> %in.subvec, ptr %out.subvec9.ptr, align 4
5364  %out.subvec10.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 10
5365  store <2 x i16> %in.subvec, ptr %out.subvec10.ptr, align 8
5366  %out.subvec11.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 11
5367  store <2 x i16> %in.subvec, ptr %out.subvec11.ptr, align 4
5368  %out.subvec12.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 12
5369  store <2 x i16> %in.subvec, ptr %out.subvec12.ptr, align 16
5370  %out.subvec13.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 13
5371  store <2 x i16> %in.subvec, ptr %out.subvec13.ptr, align 4
5372  %out.subvec14.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 14
5373  store <2 x i16> %in.subvec, ptr %out.subvec14.ptr, align 8
5374  %out.subvec15.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 15
5375  store <2 x i16> %in.subvec, ptr %out.subvec15.ptr, align 4
5376  ret void
5377}
5378
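; <2 x i32>: vector targets negate with pcmpeqd+pxor (folded into vpternlogq on
; AVX512) and broadcast the resulting qword; the scalar run emits eight pairs of
; 32-bit stores.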
5379define void @vec512_v2i32(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
5380; SCALAR-LABEL: vec512_v2i32:
5381; SCALAR:       # %bb.0:
5382; SCALAR-NEXT:    movl (%rdi), %eax
5383; SCALAR-NEXT:    movl 4(%rdi), %ecx
5384; SCALAR-NEXT:    notl %eax
5385; SCALAR-NEXT:    notl %ecx
5386; SCALAR-NEXT:    movl %ecx, 4(%rsi)
5387; SCALAR-NEXT:    movl %eax, (%rsi)
5388; SCALAR-NEXT:    movl %ecx, 4(%rdx)
5389; SCALAR-NEXT:    movl %eax, (%rdx)
5390; SCALAR-NEXT:    movl %ecx, 12(%rdx)
5391; SCALAR-NEXT:    movl %eax, 8(%rdx)
5392; SCALAR-NEXT:    movl %ecx, 20(%rdx)
5393; SCALAR-NEXT:    movl %eax, 16(%rdx)
5394; SCALAR-NEXT:    movl %ecx, 28(%rdx)
5395; SCALAR-NEXT:    movl %eax, 24(%rdx)
5396; SCALAR-NEXT:    movl %ecx, 36(%rdx)
5397; SCALAR-NEXT:    movl %eax, 32(%rdx)
5398; SCALAR-NEXT:    movl %ecx, 44(%rdx)
5399; SCALAR-NEXT:    movl %eax, 40(%rdx)
5400; SCALAR-NEXT:    movl %ecx, 52(%rdx)
5401; SCALAR-NEXT:    movl %eax, 48(%rdx)
5402; SCALAR-NEXT:    movl %ecx, 60(%rdx)
5403; SCALAR-NEXT:    movl %eax, 56(%rdx)
5404; SCALAR-NEXT:    retq
5405;
5406; SSE2-LABEL: vec512_v2i32:
5407; SSE2:       # %bb.0:
5408; SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
5409; SSE2-NEXT:    pcmpeqd %xmm1, %xmm1
5410; SSE2-NEXT:    pxor %xmm0, %xmm1
5411; SSE2-NEXT:    movq %xmm1, (%rsi)
5412; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,1,0,1]
5413; SSE2-NEXT:    movdqa %xmm0, (%rdx)
5414; SSE2-NEXT:    movdqa %xmm0, 16(%rdx)
5415; SSE2-NEXT:    movdqa %xmm0, 32(%rdx)
5416; SSE2-NEXT:    movdqa %xmm0, 48(%rdx)
5417; SSE2-NEXT:    retq
5418;
5419; AVX1-LABEL: vec512_v2i32:
5420; AVX1:       # %bb.0:
5421; AVX1-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
5422; AVX1-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
5423; AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
5424; AVX1-NEXT:    vmovq %xmm0, (%rsi)
5425; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
5426; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
5427; AVX1-NEXT:    vmovaps %ymm0, (%rdx)
5428; AVX1-NEXT:    vmovaps %ymm0, 32(%rdx)
5429; AVX1-NEXT:    vzeroupper
5430; AVX1-NEXT:    retq
5431;
5432; AVX2-ONLY-LABEL: vec512_v2i32:
5433; AVX2-ONLY:       # %bb.0:
5434; AVX2-ONLY-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
5435; AVX2-ONLY-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
5436; AVX2-ONLY-NEXT:    vpxor %xmm1, %xmm0, %xmm0
5437; AVX2-ONLY-NEXT:    vmovq %xmm0, (%rsi)
5438; AVX2-ONLY-NEXT:    vpbroadcastq %xmm0, %ymm0
5439; AVX2-ONLY-NEXT:    vmovdqa %ymm0, (%rdx)
5440; AVX2-ONLY-NEXT:    vmovdqa %ymm0, 32(%rdx)
5441; AVX2-ONLY-NEXT:    vzeroupper
5442; AVX2-ONLY-NEXT:    retq
5443;
5444; AVX512-LABEL: vec512_v2i32:
5445; AVX512:       # %bb.0:
5446; AVX512-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
5447; AVX512-NEXT:    vpternlogq {{.*#+}} xmm0 = ~xmm0
5448; AVX512-NEXT:    vmovq %xmm0, (%rsi)
5449; AVX512-NEXT:    vpbroadcastq %xmm0, %zmm0
5450; AVX512-NEXT:    vmovdqa64 %zmm0, (%rdx)
5451; AVX512-NEXT:    vzeroupper
5452; AVX512-NEXT:    retq
5453  %in.subvec.not = load <2 x i32>, ptr %in.subvec.ptr, align 64
5454  %in.subvec = xor <2 x i32> %in.subvec.not, <i32 -1, i32 -1>
5455  store <2 x i32> %in.subvec, ptr %out.subvec.ptr, align 64
5456  %out.subvec0.ptr = getelementptr <2 x i32>, ptr %out.vec.ptr, i64 0
5457  store <2 x i32> %in.subvec, ptr %out.subvec0.ptr, align 64
5458  %out.subvec1.ptr = getelementptr <2 x i32>, ptr %out.vec.ptr, i64 1
5459  store <2 x i32> %in.subvec, ptr %out.subvec1.ptr, align 8
5460  %out.subvec2.ptr = getelementptr <2 x i32>, ptr %out.vec.ptr, i64 2
5461  store <2 x i32> %in.subvec, ptr %out.subvec2.ptr, align 16
5462  %out.subvec3.ptr = getelementptr <2 x i32>, ptr %out.vec.ptr, i64 3
5463  store <2 x i32> %in.subvec, ptr %out.subvec3.ptr, align 8
5464  %out.subvec4.ptr = getelementptr <2 x i32>, ptr %out.vec.ptr, i64 4
5465  store <2 x i32> %in.subvec, ptr %out.subvec4.ptr, align 32
5466  %out.subvec5.ptr = getelementptr <2 x i32>, ptr %out.vec.ptr, i64 5
5467  store <2 x i32> %in.subvec, ptr %out.subvec5.ptr, align 8
5468  %out.subvec6.ptr = getelementptr <2 x i32>, ptr %out.vec.ptr, i64 6
5469  store <2 x i32> %in.subvec, ptr %out.subvec6.ptr, align 16
5470  %out.subvec7.ptr = getelementptr <2 x i32>, ptr %out.vec.ptr, i64 7
5471  store <2 x i32> %in.subvec, ptr %out.subvec7.ptr, align 8
5472  ret void
5473}
5474
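; <2 x float> performs the NOT on the <2 x i32> bitcast, so the generated code
; is identical to vec512_v2i32.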
5475define void @vec512_v2f32(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
5476; SCALAR-LABEL: vec512_v2f32:
5477; SCALAR:       # %bb.0:
5478; SCALAR-NEXT:    movl (%rdi), %eax
5479; SCALAR-NEXT:    movl 4(%rdi), %ecx
5480; SCALAR-NEXT:    notl %eax
5481; SCALAR-NEXT:    notl %ecx
5482; SCALAR-NEXT:    movl %ecx, 4(%rsi)
5483; SCALAR-NEXT:    movl %eax, (%rsi)
5484; SCALAR-NEXT:    movl %ecx, 4(%rdx)
5485; SCALAR-NEXT:    movl %eax, (%rdx)
5486; SCALAR-NEXT:    movl %ecx, 12(%rdx)
5487; SCALAR-NEXT:    movl %eax, 8(%rdx)
5488; SCALAR-NEXT:    movl %ecx, 20(%rdx)
5489; SCALAR-NEXT:    movl %eax, 16(%rdx)
5490; SCALAR-NEXT:    movl %ecx, 28(%rdx)
5491; SCALAR-NEXT:    movl %eax, 24(%rdx)
5492; SCALAR-NEXT:    movl %ecx, 36(%rdx)
5493; SCALAR-NEXT:    movl %eax, 32(%rdx)
5494; SCALAR-NEXT:    movl %ecx, 44(%rdx)
5495; SCALAR-NEXT:    movl %eax, 40(%rdx)
5496; SCALAR-NEXT:    movl %ecx, 52(%rdx)
5497; SCALAR-NEXT:    movl %eax, 48(%rdx)
5498; SCALAR-NEXT:    movl %ecx, 60(%rdx)
5499; SCALAR-NEXT:    movl %eax, 56(%rdx)
5500; SCALAR-NEXT:    retq
5501;
5502; SSE2-LABEL: vec512_v2f32:
5503; SSE2:       # %bb.0:
5504; SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
5505; SSE2-NEXT:    pcmpeqd %xmm1, %xmm1
5506; SSE2-NEXT:    pxor %xmm0, %xmm1
5507; SSE2-NEXT:    movq %xmm1, (%rsi)
5508; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,1,0,1]
5509; SSE2-NEXT:    movdqa %xmm0, (%rdx)
5510; SSE2-NEXT:    movdqa %xmm0, 16(%rdx)
5511; SSE2-NEXT:    movdqa %xmm0, 32(%rdx)
5512; SSE2-NEXT:    movdqa %xmm0, 48(%rdx)
5513; SSE2-NEXT:    retq
5514;
5515; AVX1-LABEL: vec512_v2f32:
5516; AVX1:       # %bb.0:
5517; AVX1-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
5518; AVX1-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
5519; AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
5520; AVX1-NEXT:    vmovq %xmm0, (%rsi)
5521; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
5522; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
5523; AVX1-NEXT:    vmovaps %ymm0, (%rdx)
5524; AVX1-NEXT:    vmovaps %ymm0, 32(%rdx)
5525; AVX1-NEXT:    vzeroupper
5526; AVX1-NEXT:    retq
5527;
5528; AVX2-ONLY-LABEL: vec512_v2f32:
5529; AVX2-ONLY:       # %bb.0:
5530; AVX2-ONLY-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
5531; AVX2-ONLY-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
5532; AVX2-ONLY-NEXT:    vpxor %xmm1, %xmm0, %xmm0
5533; AVX2-ONLY-NEXT:    vmovq %xmm0, (%rsi)
5534; AVX2-ONLY-NEXT:    vpbroadcastq %xmm0, %ymm0
5535; AVX2-ONLY-NEXT:    vmovdqa %ymm0, (%rdx)
5536; AVX2-ONLY-NEXT:    vmovdqa %ymm0, 32(%rdx)
5537; AVX2-ONLY-NEXT:    vzeroupper
5538; AVX2-ONLY-NEXT:    retq
5539;
5540; AVX512-LABEL: vec512_v2f32:
5541; AVX512:       # %bb.0:
5542; AVX512-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
5543; AVX512-NEXT:    vpternlogq {{.*#+}} xmm0 = ~xmm0
5544; AVX512-NEXT:    vmovq %xmm0, (%rsi)
5545; AVX512-NEXT:    vpbroadcastq %xmm0, %zmm0
5546; AVX512-NEXT:    vmovdqa64 %zmm0, (%rdx)
5547; AVX512-NEXT:    vzeroupper
5548; AVX512-NEXT:    retq
5549  %in.subvec.not = load <2 x i32>, ptr %in.subvec.ptr, align 64
5550  %in.subvec.int = xor <2 x i32> %in.subvec.not, <i32 -1, i32 -1>
5551  %in.subvec = bitcast <2 x i32> %in.subvec.int to <2 x float>
5552  store <2 x float> %in.subvec, ptr %out.subvec.ptr, align 64
5553  %out.subvec0.ptr = getelementptr <2 x float>, ptr %out.vec.ptr, i64 0
5554  store <2 x float> %in.subvec, ptr %out.subvec0.ptr, align 64
5555  %out.subvec1.ptr = getelementptr <2 x float>, ptr %out.vec.ptr, i64 1
5556  store <2 x float> %in.subvec, ptr %out.subvec1.ptr, align 8
5557  %out.subvec2.ptr = getelementptr <2 x float>, ptr %out.vec.ptr, i64 2
5558  store <2 x float> %in.subvec, ptr %out.subvec2.ptr, align 16
5559  %out.subvec3.ptr = getelementptr <2 x float>, ptr %out.vec.ptr, i64 3
5560  store <2 x float> %in.subvec, ptr %out.subvec3.ptr, align 8
5561  %out.subvec4.ptr = getelementptr <2 x float>, ptr %out.vec.ptr, i64 4
5562  store <2 x float> %in.subvec, ptr %out.subvec4.ptr, align 32
5563  %out.subvec5.ptr = getelementptr <2 x float>, ptr %out.vec.ptr, i64 5
5564  store <2 x float> %in.subvec, ptr %out.subvec5.ptr, align 8
5565  %out.subvec6.ptr = getelementptr <2 x float>, ptr %out.vec.ptr, i64 6
5566  store <2 x float> %in.subvec, ptr %out.subvec6.ptr, align 16
5567  %out.subvec7.ptr = getelementptr <2 x float>, ptr %out.vec.ptr, i64 7
5568  store <2 x float> %in.subvec, ptr %out.subvec7.ptr, align 8
5569  ret void
5570}
5571
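; <2 x i64>: the subvector already fills an xmm register, so the splat is just
; four aligned 16-byte stores on every vector configuration.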
5572define void @vec512_v2i64(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
5573; SCALAR-LABEL: vec512_v2i64:
5574; SCALAR:       # %bb.0:
5575; SCALAR-NEXT:    movq (%rdi), %rax
5576; SCALAR-NEXT:    movq 8(%rdi), %rcx
5577; SCALAR-NEXT:    notq %rax
5578; SCALAR-NEXT:    notq %rcx
5579; SCALAR-NEXT:    movq %rcx, 8(%rsi)
5580; SCALAR-NEXT:    movq %rax, (%rsi)
5581; SCALAR-NEXT:    movq %rcx, 8(%rdx)
5582; SCALAR-NEXT:    movq %rax, (%rdx)
5583; SCALAR-NEXT:    movq %rcx, 24(%rdx)
5584; SCALAR-NEXT:    movq %rax, 16(%rdx)
5585; SCALAR-NEXT:    movq %rcx, 40(%rdx)
5586; SCALAR-NEXT:    movq %rax, 32(%rdx)
5587; SCALAR-NEXT:    movq %rcx, 56(%rdx)
5588; SCALAR-NEXT:    movq %rax, 48(%rdx)
5589; SCALAR-NEXT:    retq
5590;
5591; SSE2-LABEL: vec512_v2i64:
5592; SSE2:       # %bb.0:
5593; SSE2-NEXT:    pcmpeqd %xmm0, %xmm0
5594; SSE2-NEXT:    pxor (%rdi), %xmm0
5595; SSE2-NEXT:    movdqa %xmm0, (%rsi)
5596; SSE2-NEXT:    movdqa %xmm0, (%rdx)
5597; SSE2-NEXT:    movdqa %xmm0, 16(%rdx)
5598; SSE2-NEXT:    movdqa %xmm0, 32(%rdx)
5599; SSE2-NEXT:    movdqa %xmm0, 48(%rdx)
5600; SSE2-NEXT:    retq
5601;
5602; AVX-LABEL: vec512_v2i64:
5603; AVX:       # %bb.0:
5604; AVX-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
5605; AVX-NEXT:    vpxor (%rdi), %xmm0, %xmm0
5606; AVX-NEXT:    vmovdqa %xmm0, (%rsi)
5607; AVX-NEXT:    vmovdqa %xmm0, (%rdx)
5608; AVX-NEXT:    vmovdqa %xmm0, 16(%rdx)
5609; AVX-NEXT:    vmovdqa %xmm0, 32(%rdx)
5610; AVX-NEXT:    vmovdqa %xmm0, 48(%rdx)
5611; AVX-NEXT:    retq
5612  %in.subvec.not = load <2 x i64>, ptr %in.subvec.ptr, align 64
5613  %in.subvec = xor <2 x i64> %in.subvec.not, <i64 -1, i64 -1>
5614  store <2 x i64> %in.subvec, ptr %out.subvec.ptr, align 64
5615  %out.subvec0.ptr = getelementptr <2 x i64>, ptr %out.vec.ptr, i64 0
5616  store <2 x i64> %in.subvec, ptr %out.subvec0.ptr, align 64
5617  %out.subvec1.ptr = getelementptr <2 x i64>, ptr %out.vec.ptr, i64 1
5618  store <2 x i64> %in.subvec, ptr %out.subvec1.ptr, align 16
5619  %out.subvec2.ptr = getelementptr <2 x i64>, ptr %out.vec.ptr, i64 2
5620  store <2 x i64> %in.subvec, ptr %out.subvec2.ptr, align 32
5621  %out.subvec3.ptr = getelementptr <2 x i64>, ptr %out.vec.ptr, i64 3
5622  store <2 x i64> %in.subvec, ptr %out.subvec3.ptr, align 16
5623  ret void
5624}
5625
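; <2 x double> mirrors vec512_v2i64; the inversion is done on the <2 x i64>
; bitcast before the stores.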
5626define void @vec512_v2f64(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
5627; SCALAR-LABEL: vec512_v2f64:
5628; SCALAR:       # %bb.0:
5629; SCALAR-NEXT:    movq (%rdi), %rax
5630; SCALAR-NEXT:    movq 8(%rdi), %rcx
5631; SCALAR-NEXT:    notq %rax
5632; SCALAR-NEXT:    notq %rcx
5633; SCALAR-NEXT:    movq %rcx, 8(%rsi)
5634; SCALAR-NEXT:    movq %rax, (%rsi)
5635; SCALAR-NEXT:    movq %rcx, 8(%rdx)
5636; SCALAR-NEXT:    movq %rax, (%rdx)
5637; SCALAR-NEXT:    movq %rcx, 24(%rdx)
5638; SCALAR-NEXT:    movq %rax, 16(%rdx)
5639; SCALAR-NEXT:    movq %rcx, 40(%rdx)
5640; SCALAR-NEXT:    movq %rax, 32(%rdx)
5641; SCALAR-NEXT:    movq %rcx, 56(%rdx)
5642; SCALAR-NEXT:    movq %rax, 48(%rdx)
5643; SCALAR-NEXT:    retq
5644;
5645; SSE2-LABEL: vec512_v2f64:
5646; SSE2:       # %bb.0:
5647; SSE2-NEXT:    pcmpeqd %xmm0, %xmm0
5648; SSE2-NEXT:    pxor (%rdi), %xmm0
5649; SSE2-NEXT:    movdqa %xmm0, (%rsi)
5650; SSE2-NEXT:    movdqa %xmm0, (%rdx)
5651; SSE2-NEXT:    movdqa %xmm0, 16(%rdx)
5652; SSE2-NEXT:    movdqa %xmm0, 32(%rdx)
5653; SSE2-NEXT:    movdqa %xmm0, 48(%rdx)
5654; SSE2-NEXT:    retq
5655;
5656; AVX-LABEL: vec512_v2f64:
5657; AVX:       # %bb.0:
5658; AVX-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
5659; AVX-NEXT:    vpxor (%rdi), %xmm0, %xmm0
5660; AVX-NEXT:    vmovdqa %xmm0, (%rsi)
5661; AVX-NEXT:    vmovdqa %xmm0, (%rdx)
5662; AVX-NEXT:    vmovdqa %xmm0, 16(%rdx)
5663; AVX-NEXT:    vmovdqa %xmm0, 32(%rdx)
5664; AVX-NEXT:    vmovdqa %xmm0, 48(%rdx)
5665; AVX-NEXT:    retq
5666  %in.subvec.not = load <2 x i64>, ptr %in.subvec.ptr, align 64
5667  %in.subvec.int = xor <2 x i64> %in.subvec.not, <i64 -1, i64 -1>
5668  %in.subvec = bitcast <2 x i64> %in.subvec.int to <2 x double>
5669  store <2 x double> %in.subvec, ptr %out.subvec.ptr, align 64
5670  %out.subvec0.ptr = getelementptr <2 x double>, ptr %out.vec.ptr, i64 0
5671  store <2 x double> %in.subvec, ptr %out.subvec0.ptr, align 64
5672  %out.subvec1.ptr = getelementptr <2 x double>, ptr %out.vec.ptr, i64 1
5673  store <2 x double> %in.subvec, ptr %out.subvec1.ptr, align 16
5674  %out.subvec2.ptr = getelementptr <2 x double>, ptr %out.vec.ptr, i64 2
5675  store <2 x double> %in.subvec, ptr %out.subvec2.ptr, align 32
5676  %out.subvec3.ptr = getelementptr <2 x double>, ptr %out.vec.ptr, i64 3
5677  store <2 x double> %in.subvec, ptr %out.subvec3.ptr, align 16
5678  ret void
5679}
5680
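; <2 x i128> has no legal vector type, so every run line shares one scalar
; sequence (ALL prefix) of 64-bit not/store pairs.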
5681define void @vec512_v2i128(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
5682; ALL-LABEL: vec512_v2i128:
5683; ALL:       # %bb.0:
5684; ALL-NEXT:    movq 16(%rdi), %rax
5685; ALL-NEXT:    movq 24(%rdi), %rcx
5686; ALL-NEXT:    movq (%rdi), %r8
5687; ALL-NEXT:    movq 8(%rdi), %rdi
5688; ALL-NEXT:    notq %rdi
5689; ALL-NEXT:    notq %r8
5690; ALL-NEXT:    notq %rcx
5691; ALL-NEXT:    notq %rax
5692; ALL-NEXT:    movq %rax, 16(%rsi)
5693; ALL-NEXT:    movq %rcx, 24(%rsi)
5694; ALL-NEXT:    movq %r8, (%rsi)
5695; ALL-NEXT:    movq %rdi, 8(%rsi)
5696; ALL-NEXT:    movq %rax, 16(%rdx)
5697; ALL-NEXT:    movq %rcx, 24(%rdx)
5698; ALL-NEXT:    movq %r8, (%rdx)
5699; ALL-NEXT:    movq %rdi, 8(%rdx)
5700; ALL-NEXT:    movq %rax, 48(%rdx)
5701; ALL-NEXT:    movq %rcx, 56(%rdx)
5702; ALL-NEXT:    movq %r8, 32(%rdx)
5703; ALL-NEXT:    movq %rdi, 40(%rdx)
5704; ALL-NEXT:    retq
5705  %in.subvec.not = load <2 x i128>, ptr %in.subvec.ptr, align 64
5706  %in.subvec = xor <2 x i128> %in.subvec.not, <i128 -1, i128 -1>
5707  store <2 x i128> %in.subvec, ptr %out.subvec.ptr, align 64
5708  %out.subvec0.ptr = getelementptr <2 x i128>, ptr %out.vec.ptr, i64 0
5709  store <2 x i128> %in.subvec, ptr %out.subvec0.ptr, align 64
5710  %out.subvec1.ptr = getelementptr <2 x i128>, ptr %out.vec.ptr, i64 1
5711  store <2 x i128> %in.subvec, ptr %out.subvec1.ptr, align 32
5712  ret void
5713}
5714
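; <4 x i8>: same 4-byte broadcast pattern as vec512_v2i16; the scalar run stores
; each of the four negated bytes into all sixteen copies individually.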
5715define void @vec512_v4i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
5716; SCALAR-LABEL: vec512_v4i8:
5717; SCALAR:       # %bb.0:
5718; SCALAR-NEXT:    movzbl 3(%rdi), %r8d
5719; SCALAR-NEXT:    movzbl 2(%rdi), %ecx
5720; SCALAR-NEXT:    movzbl (%rdi), %eax
5721; SCALAR-NEXT:    movzbl 1(%rdi), %edi
5722; SCALAR-NEXT:    notb %al
5723; SCALAR-NEXT:    notb %dil
5724; SCALAR-NEXT:    notb %cl
5725; SCALAR-NEXT:    notb %r8b
5726; SCALAR-NEXT:    movb %r8b, 3(%rsi)
5727; SCALAR-NEXT:    movb %cl, 2(%rsi)
5728; SCALAR-NEXT:    movb %dil, 1(%rsi)
5729; SCALAR-NEXT:    movb %al, (%rsi)
5730; SCALAR-NEXT:    movb %r8b, 3(%rdx)
5731; SCALAR-NEXT:    movb %cl, 2(%rdx)
5732; SCALAR-NEXT:    movb %dil, 1(%rdx)
5733; SCALAR-NEXT:    movb %al, (%rdx)
5734; SCALAR-NEXT:    movb %r8b, 7(%rdx)
5735; SCALAR-NEXT:    movb %cl, 6(%rdx)
5736; SCALAR-NEXT:    movb %dil, 5(%rdx)
5737; SCALAR-NEXT:    movb %al, 4(%rdx)
5738; SCALAR-NEXT:    movb %r8b, 11(%rdx)
5739; SCALAR-NEXT:    movb %cl, 10(%rdx)
5740; SCALAR-NEXT:    movb %dil, 9(%rdx)
5741; SCALAR-NEXT:    movb %al, 8(%rdx)
5742; SCALAR-NEXT:    movb %r8b, 15(%rdx)
5743; SCALAR-NEXT:    movb %cl, 14(%rdx)
5744; SCALAR-NEXT:    movb %dil, 13(%rdx)
5745; SCALAR-NEXT:    movb %al, 12(%rdx)
5746; SCALAR-NEXT:    movb %r8b, 19(%rdx)
5747; SCALAR-NEXT:    movb %cl, 18(%rdx)
5748; SCALAR-NEXT:    movb %dil, 17(%rdx)
5749; SCALAR-NEXT:    movb %al, 16(%rdx)
5750; SCALAR-NEXT:    movb %r8b, 23(%rdx)
5751; SCALAR-NEXT:    movb %cl, 22(%rdx)
5752; SCALAR-NEXT:    movb %dil, 21(%rdx)
5753; SCALAR-NEXT:    movb %al, 20(%rdx)
5754; SCALAR-NEXT:    movb %r8b, 27(%rdx)
5755; SCALAR-NEXT:    movb %cl, 26(%rdx)
5756; SCALAR-NEXT:    movb %dil, 25(%rdx)
5757; SCALAR-NEXT:    movb %al, 24(%rdx)
5758; SCALAR-NEXT:    movb %r8b, 31(%rdx)
5759; SCALAR-NEXT:    movb %cl, 30(%rdx)
5760; SCALAR-NEXT:    movb %dil, 29(%rdx)
5761; SCALAR-NEXT:    movb %al, 28(%rdx)
5762; SCALAR-NEXT:    movb %r8b, 35(%rdx)
5763; SCALAR-NEXT:    movb %cl, 34(%rdx)
5764; SCALAR-NEXT:    movb %dil, 33(%rdx)
5765; SCALAR-NEXT:    movb %al, 32(%rdx)
5766; SCALAR-NEXT:    movb %r8b, 39(%rdx)
5767; SCALAR-NEXT:    movb %cl, 38(%rdx)
5768; SCALAR-NEXT:    movb %dil, 37(%rdx)
5769; SCALAR-NEXT:    movb %al, 36(%rdx)
5770; SCALAR-NEXT:    movb %r8b, 43(%rdx)
5771; SCALAR-NEXT:    movb %cl, 42(%rdx)
5772; SCALAR-NEXT:    movb %dil, 41(%rdx)
5773; SCALAR-NEXT:    movb %al, 40(%rdx)
5774; SCALAR-NEXT:    movb %r8b, 47(%rdx)
5775; SCALAR-NEXT:    movb %cl, 46(%rdx)
5776; SCALAR-NEXT:    movb %dil, 45(%rdx)
5777; SCALAR-NEXT:    movb %al, 44(%rdx)
5778; SCALAR-NEXT:    movb %r8b, 51(%rdx)
5779; SCALAR-NEXT:    movb %cl, 50(%rdx)
5780; SCALAR-NEXT:    movb %dil, 49(%rdx)
5781; SCALAR-NEXT:    movb %al, 48(%rdx)
5782; SCALAR-NEXT:    movb %r8b, 55(%rdx)
5783; SCALAR-NEXT:    movb %cl, 54(%rdx)
5784; SCALAR-NEXT:    movb %dil, 53(%rdx)
5785; SCALAR-NEXT:    movb %al, 52(%rdx)
5786; SCALAR-NEXT:    movb %r8b, 59(%rdx)
5787; SCALAR-NEXT:    movb %cl, 58(%rdx)
5788; SCALAR-NEXT:    movb %dil, 57(%rdx)
5789; SCALAR-NEXT:    movb %al, 56(%rdx)
5790; SCALAR-NEXT:    movb %r8b, 63(%rdx)
5791; SCALAR-NEXT:    movb %cl, 62(%rdx)
5792; SCALAR-NEXT:    movb %dil, 61(%rdx)
5793; SCALAR-NEXT:    movb %al, 60(%rdx)
5794; SCALAR-NEXT:    retq
5795;
5796; SSE2-LABEL: vec512_v4i8:
5797; SSE2:       # %bb.0:
5798; SSE2-NEXT:    pcmpeqd %xmm0, %xmm0
5799; SSE2-NEXT:    pxor (%rdi), %xmm0
5800; SSE2-NEXT:    movd %xmm0, (%rsi)
5801; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
5802; SSE2-NEXT:    movdqa %xmm0, (%rdx)
5803; SSE2-NEXT:    movdqa %xmm0, 16(%rdx)
5804; SSE2-NEXT:    movdqa %xmm0, 32(%rdx)
5805; SSE2-NEXT:    movdqa %xmm0, 48(%rdx)
5806; SSE2-NEXT:    retq
5807;
5808; AVX1-LABEL: vec512_v4i8:
5809; AVX1:       # %bb.0:
5810; AVX1-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
5811; AVX1-NEXT:    vpxor (%rdi), %xmm0, %xmm0
5812; AVX1-NEXT:    vmovd %xmm0, (%rsi)
5813; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
5814; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
5815; AVX1-NEXT:    vmovaps %ymm0, (%rdx)
5816; AVX1-NEXT:    vmovaps %ymm0, 32(%rdx)
5817; AVX1-NEXT:    vzeroupper
5818; AVX1-NEXT:    retq
5819;
5820; AVX2-ONLY-LABEL: vec512_v4i8:
5821; AVX2-ONLY:       # %bb.0:
5822; AVX2-ONLY-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
5823; AVX2-ONLY-NEXT:    vpxor (%rdi), %xmm0, %xmm0
5824; AVX2-ONLY-NEXT:    vmovd %xmm0, (%rsi)
5825; AVX2-ONLY-NEXT:    vpbroadcastd %xmm0, %ymm0
5826; AVX2-ONLY-NEXT:    vmovdqa %ymm0, (%rdx)
5827; AVX2-ONLY-NEXT:    vmovdqa %ymm0, 32(%rdx)
5828; AVX2-ONLY-NEXT:    vzeroupper
5829; AVX2-ONLY-NEXT:    retq
5830;
5831; AVX512-LABEL: vec512_v4i8:
5832; AVX512:       # %bb.0:
5833; AVX512-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
5834; AVX512-NEXT:    vpxor (%rdi), %xmm0, %xmm0
5835; AVX512-NEXT:    vmovd %xmm0, (%rsi)
5836; AVX512-NEXT:    vpbroadcastd %xmm0, %zmm0
5837; AVX512-NEXT:    vmovdqa64 %zmm0, (%rdx)
5838; AVX512-NEXT:    vzeroupper
5839; AVX512-NEXT:    retq
5840  %in.subvec.not = load <4 x i8>, ptr %in.subvec.ptr, align 64
5841  %in.subvec = xor <4 x i8> %in.subvec.not, <i8 -1, i8 -1, i8 -1, i8 -1>
5842  store <4 x i8> %in.subvec, ptr %out.subvec.ptr, align 64
5843  %out.subvec0.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 0
5844  store <4 x i8> %in.subvec, ptr %out.subvec0.ptr, align 64
5845  %out.subvec1.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 1
5846  store <4 x i8> %in.subvec, ptr %out.subvec1.ptr, align 4
5847  %out.subvec2.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 2
5848  store <4 x i8> %in.subvec, ptr %out.subvec2.ptr, align 8
5849  %out.subvec3.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 3
5850  store <4 x i8> %in.subvec, ptr %out.subvec3.ptr, align 4
5851  %out.subvec4.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 4
5852  store <4 x i8> %in.subvec, ptr %out.subvec4.ptr, align 16
5853  %out.subvec5.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 5
5854  store <4 x i8> %in.subvec, ptr %out.subvec5.ptr, align 4
5855  %out.subvec6.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 6
5856  store <4 x i8> %in.subvec, ptr %out.subvec6.ptr, align 8
5857  %out.subvec7.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 7
5858  store <4 x i8> %in.subvec, ptr %out.subvec7.ptr, align 4
5859  %out.subvec8.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 8
5860  store <4 x i8> %in.subvec, ptr %out.subvec8.ptr, align 32
5861  %out.subvec9.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 9
5862  store <4 x i8> %in.subvec, ptr %out.subvec9.ptr, align 4
5863  %out.subvec10.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 10
5864  store <4 x i8> %in.subvec, ptr %out.subvec10.ptr, align 8
5865  %out.subvec11.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 11
5866  store <4 x i8> %in.subvec, ptr %out.subvec11.ptr, align 4
5867  %out.subvec12.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 12
5868  store <4 x i8> %in.subvec, ptr %out.subvec12.ptr, align 16
5869  %out.subvec13.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 13
5870  store <4 x i8> %in.subvec, ptr %out.subvec13.ptr, align 4
5871  %out.subvec14.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 14
5872  store <4 x i8> %in.subvec, ptr %out.subvec14.ptr, align 8
5873  %out.subvec15.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 15
5874  store <4 x i8> %in.subvec, ptr %out.subvec15.ptr, align 4
5875  ret void
5876}
5877
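; <4 x i16>: same 8-byte broadcast pattern as vec512_v2i32; the scalar run keeps
; the four lanes in separate registers and repeats eight groups of 16-bit stores.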
5878define void @vec512_v4i16(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
5879; SCALAR-LABEL: vec512_v4i16:
5880; SCALAR:       # %bb.0:
5881; SCALAR-NEXT:    movzwl 6(%rdi), %r8d
5882; SCALAR-NEXT:    movzwl 2(%rdi), %ecx
5883; SCALAR-NEXT:    movl (%rdi), %eax
5884; SCALAR-NEXT:    movl 4(%rdi), %edi
5885; SCALAR-NEXT:    notl %eax
5886; SCALAR-NEXT:    notl %ecx
5887; SCALAR-NEXT:    notl %edi
5888; SCALAR-NEXT:    notl %r8d
5889; SCALAR-NEXT:    movw %r8w, 6(%rsi)
5890; SCALAR-NEXT:    movw %di, 4(%rsi)
5891; SCALAR-NEXT:    movw %cx, 2(%rsi)
5892; SCALAR-NEXT:    movw %ax, (%rsi)
5893; SCALAR-NEXT:    movw %r8w, 6(%rdx)
5894; SCALAR-NEXT:    movw %di, 4(%rdx)
5895; SCALAR-NEXT:    movw %cx, 2(%rdx)
5896; SCALAR-NEXT:    movw %ax, (%rdx)
5897; SCALAR-NEXT:    movw %r8w, 14(%rdx)
5898; SCALAR-NEXT:    movw %di, 12(%rdx)
5899; SCALAR-NEXT:    movw %cx, 10(%rdx)
5900; SCALAR-NEXT:    movw %ax, 8(%rdx)
5901; SCALAR-NEXT:    movw %r8w, 22(%rdx)
5902; SCALAR-NEXT:    movw %di, 20(%rdx)
5903; SCALAR-NEXT:    movw %cx, 18(%rdx)
5904; SCALAR-NEXT:    movw %ax, 16(%rdx)
5905; SCALAR-NEXT:    movw %r8w, 30(%rdx)
5906; SCALAR-NEXT:    movw %di, 28(%rdx)
5907; SCALAR-NEXT:    movw %cx, 26(%rdx)
5908; SCALAR-NEXT:    movw %ax, 24(%rdx)
5909; SCALAR-NEXT:    movw %r8w, 38(%rdx)
5910; SCALAR-NEXT:    movw %di, 36(%rdx)
5911; SCALAR-NEXT:    movw %cx, 34(%rdx)
5912; SCALAR-NEXT:    movw %ax, 32(%rdx)
5913; SCALAR-NEXT:    movw %r8w, 46(%rdx)
5914; SCALAR-NEXT:    movw %di, 44(%rdx)
5915; SCALAR-NEXT:    movw %cx, 42(%rdx)
5916; SCALAR-NEXT:    movw %ax, 40(%rdx)
5917; SCALAR-NEXT:    movw %r8w, 54(%rdx)
5918; SCALAR-NEXT:    movw %di, 52(%rdx)
5919; SCALAR-NEXT:    movw %cx, 50(%rdx)
5920; SCALAR-NEXT:    movw %ax, 48(%rdx)
5921; SCALAR-NEXT:    movw %r8w, 62(%rdx)
5922; SCALAR-NEXT:    movw %di, 60(%rdx)
5923; SCALAR-NEXT:    movw %cx, 58(%rdx)
5924; SCALAR-NEXT:    movw %ax, 56(%rdx)
5925; SCALAR-NEXT:    retq
5926;
5927; SSE2-LABEL: vec512_v4i16:
5928; SSE2:       # %bb.0:
5929; SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
5930; SSE2-NEXT:    pcmpeqd %xmm1, %xmm1
5931; SSE2-NEXT:    pxor %xmm0, %xmm1
5932; SSE2-NEXT:    movq %xmm1, (%rsi)
5933; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,1,0,1]
5934; SSE2-NEXT:    movdqa %xmm0, (%rdx)
5935; SSE2-NEXT:    movdqa %xmm0, 16(%rdx)
5936; SSE2-NEXT:    movdqa %xmm0, 32(%rdx)
5937; SSE2-NEXT:    movdqa %xmm0, 48(%rdx)
5938; SSE2-NEXT:    retq
5939;
5940; AVX1-LABEL: vec512_v4i16:
5941; AVX1:       # %bb.0:
5942; AVX1-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
5943; AVX1-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
5944; AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
5945; AVX1-NEXT:    vmovq %xmm0, (%rsi)
5946; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
5947; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
5948; AVX1-NEXT:    vmovaps %ymm0, (%rdx)
5949; AVX1-NEXT:    vmovaps %ymm0, 32(%rdx)
5950; AVX1-NEXT:    vzeroupper
5951; AVX1-NEXT:    retq
5952;
5953; AVX2-ONLY-LABEL: vec512_v4i16:
5954; AVX2-ONLY:       # %bb.0:
5955; AVX2-ONLY-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
5956; AVX2-ONLY-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
5957; AVX2-ONLY-NEXT:    vpxor %xmm1, %xmm0, %xmm0
5958; AVX2-ONLY-NEXT:    vmovq %xmm0, (%rsi)
5959; AVX2-ONLY-NEXT:    vpbroadcastq %xmm0, %ymm0
5960; AVX2-ONLY-NEXT:    vmovdqa %ymm0, (%rdx)
5961; AVX2-ONLY-NEXT:    vmovdqa %ymm0, 32(%rdx)
5962; AVX2-ONLY-NEXT:    vzeroupper
5963; AVX2-ONLY-NEXT:    retq
5964;
5965; AVX512-LABEL: vec512_v4i16:
5966; AVX512:       # %bb.0:
5967; AVX512-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
5968; AVX512-NEXT:    vpternlogq {{.*#+}} xmm0 = ~xmm0
5969; AVX512-NEXT:    vmovq %xmm0, (%rsi)
5970; AVX512-NEXT:    vpbroadcastq %xmm0, %zmm0
5971; AVX512-NEXT:    vmovdqa64 %zmm0, (%rdx)
5972; AVX512-NEXT:    vzeroupper
5973; AVX512-NEXT:    retq
5974  %in.subvec.not = load <4 x i16>, ptr %in.subvec.ptr, align 64
5975  %in.subvec = xor <4 x i16> %in.subvec.not, <i16 -1, i16 -1, i16 -1, i16 -1>
5976  store <4 x i16> %in.subvec, ptr %out.subvec.ptr, align 64
5977  %out.subvec0.ptr = getelementptr <4 x i16>, ptr %out.vec.ptr, i64 0
5978  store <4 x i16> %in.subvec, ptr %out.subvec0.ptr, align 64
5979  %out.subvec1.ptr = getelementptr <4 x i16>, ptr %out.vec.ptr, i64 1
5980  store <4 x i16> %in.subvec, ptr %out.subvec1.ptr, align 8
5981  %out.subvec2.ptr = getelementptr <4 x i16>, ptr %out.vec.ptr, i64 2
5982  store <4 x i16> %in.subvec, ptr %out.subvec2.ptr, align 16
5983  %out.subvec3.ptr = getelementptr <4 x i16>, ptr %out.vec.ptr, i64 3
5984  store <4 x i16> %in.subvec, ptr %out.subvec3.ptr, align 8
5985  %out.subvec4.ptr = getelementptr <4 x i16>, ptr %out.vec.ptr, i64 4
5986  store <4 x i16> %in.subvec, ptr %out.subvec4.ptr, align 32
5987  %out.subvec5.ptr = getelementptr <4 x i16>, ptr %out.vec.ptr, i64 5
5988  store <4 x i16> %in.subvec, ptr %out.subvec5.ptr, align 8
5989  %out.subvec6.ptr = getelementptr <4 x i16>, ptr %out.vec.ptr, i64 6
5990  store <4 x i16> %in.subvec, ptr %out.subvec6.ptr, align 16
5991  %out.subvec7.ptr = getelementptr <4 x i16>, ptr %out.vec.ptr, i64 7
5992  store <4 x i16> %in.subvec, ptr %out.subvec7.ptr, align 8
5993  ret void
5994}
5995
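; <4 x i32>: even the nominally scalar run (-sse2) still has SSE1, so it negates
; with an xorps against a constant-pool mask instead of scalar notl instructions.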
5996define void @vec512_v4i32(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
5997; SCALAR-LABEL: vec512_v4i32:
5998; SCALAR:       # %bb.0:
5999; SCALAR-NEXT:    movaps (%rdi), %xmm0
6000; SCALAR-NEXT:    xorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
6001; SCALAR-NEXT:    movaps %xmm0, (%rsi)
6002; SCALAR-NEXT:    movaps %xmm0, (%rdx)
6003; SCALAR-NEXT:    movaps %xmm0, 16(%rdx)
6004; SCALAR-NEXT:    movaps %xmm0, 32(%rdx)
6005; SCALAR-NEXT:    movaps %xmm0, 48(%rdx)
6006; SCALAR-NEXT:    retq
6007;
6008; SSE2-LABEL: vec512_v4i32:
6009; SSE2:       # %bb.0:
6010; SSE2-NEXT:    pcmpeqd %xmm0, %xmm0
6011; SSE2-NEXT:    pxor (%rdi), %xmm0
6012; SSE2-NEXT:    movdqa %xmm0, (%rsi)
6013; SSE2-NEXT:    movdqa %xmm0, (%rdx)
6014; SSE2-NEXT:    movdqa %xmm0, 16(%rdx)
6015; SSE2-NEXT:    movdqa %xmm0, 32(%rdx)
6016; SSE2-NEXT:    movdqa %xmm0, 48(%rdx)
6017; SSE2-NEXT:    retq
6018;
6019; AVX-LABEL: vec512_v4i32:
6020; AVX:       # %bb.0:
6021; AVX-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
6022; AVX-NEXT:    vpxor (%rdi), %xmm0, %xmm0
6023; AVX-NEXT:    vmovdqa %xmm0, (%rsi)
6024; AVX-NEXT:    vmovdqa %xmm0, (%rdx)
6025; AVX-NEXT:    vmovdqa %xmm0, 16(%rdx)
6026; AVX-NEXT:    vmovdqa %xmm0, 32(%rdx)
6027; AVX-NEXT:    vmovdqa %xmm0, 48(%rdx)
6028; AVX-NEXT:    retq
6029  %in.subvec.not = load <4 x i32>, ptr %in.subvec.ptr, align 64
6030  %in.subvec = xor <4 x i32> %in.subvec.not, <i32 -1, i32 -1, i32 -1, i32 -1>
6031  store <4 x i32> %in.subvec, ptr %out.subvec.ptr, align 64
6032  %out.subvec0.ptr = getelementptr <4 x i32>, ptr %out.vec.ptr, i64 0
6033  store <4 x i32> %in.subvec, ptr %out.subvec0.ptr, align 64
6034  %out.subvec1.ptr = getelementptr <4 x i32>, ptr %out.vec.ptr, i64 1
6035  store <4 x i32> %in.subvec, ptr %out.subvec1.ptr, align 16
6036  %out.subvec2.ptr = getelementptr <4 x i32>, ptr %out.vec.ptr, i64 2
6037  store <4 x i32> %in.subvec, ptr %out.subvec2.ptr, align 32
6038  %out.subvec3.ptr = getelementptr <4 x i32>, ptr %out.vec.ptr, i64 3
6039  store <4 x i32> %in.subvec, ptr %out.subvec3.ptr, align 16
6040  ret void
6041}
6042
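; <4 x float> matches vec512_v4i32 exactly; the NOT happens on the integer
; bitcast.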
6043define void @vec512_v4f32(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
6044; SCALAR-LABEL: vec512_v4f32:
6045; SCALAR:       # %bb.0:
6046; SCALAR-NEXT:    movaps (%rdi), %xmm0
6047; SCALAR-NEXT:    xorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
6048; SCALAR-NEXT:    movaps %xmm0, (%rsi)
6049; SCALAR-NEXT:    movaps %xmm0, (%rdx)
6050; SCALAR-NEXT:    movaps %xmm0, 16(%rdx)
6051; SCALAR-NEXT:    movaps %xmm0, 32(%rdx)
6052; SCALAR-NEXT:    movaps %xmm0, 48(%rdx)
6053; SCALAR-NEXT:    retq
6054;
6055; SSE2-LABEL: vec512_v4f32:
6056; SSE2:       # %bb.0:
6057; SSE2-NEXT:    pcmpeqd %xmm0, %xmm0
6058; SSE2-NEXT:    pxor (%rdi), %xmm0
6059; SSE2-NEXT:    movdqa %xmm0, (%rsi)
6060; SSE2-NEXT:    movdqa %xmm0, (%rdx)
6061; SSE2-NEXT:    movdqa %xmm0, 16(%rdx)
6062; SSE2-NEXT:    movdqa %xmm0, 32(%rdx)
6063; SSE2-NEXT:    movdqa %xmm0, 48(%rdx)
6064; SSE2-NEXT:    retq
6065;
6066; AVX-LABEL: vec512_v4f32:
6067; AVX:       # %bb.0:
6068; AVX-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
6069; AVX-NEXT:    vpxor (%rdi), %xmm0, %xmm0
6070; AVX-NEXT:    vmovdqa %xmm0, (%rsi)
6071; AVX-NEXT:    vmovdqa %xmm0, (%rdx)
6072; AVX-NEXT:    vmovdqa %xmm0, 16(%rdx)
6073; AVX-NEXT:    vmovdqa %xmm0, 32(%rdx)
6074; AVX-NEXT:    vmovdqa %xmm0, 48(%rdx)
6075; AVX-NEXT:    retq
6076  %in.subvec.not = load <4 x i32>, ptr %in.subvec.ptr, align 64
6077  %in.subvec.int = xor <4 x i32> %in.subvec.not, <i32 -1, i32 -1, i32 -1, i32 -1>
6078  %in.subvec = bitcast <4 x i32> %in.subvec.int to <4 x float>
6079  store <4 x float> %in.subvec, ptr %out.subvec.ptr, align 64
6080  %out.subvec0.ptr = getelementptr <4 x float>, ptr %out.vec.ptr, i64 0
6081  store <4 x float> %in.subvec, ptr %out.subvec0.ptr, align 64
6082  %out.subvec1.ptr = getelementptr <4 x float>, ptr %out.vec.ptr, i64 1
6083  store <4 x float> %in.subvec, ptr %out.subvec1.ptr, align 16
6084  %out.subvec2.ptr = getelementptr <4 x float>, ptr %out.vec.ptr, i64 2
6085  store <4 x float> %in.subvec, ptr %out.subvec2.ptr, align 32
6086  %out.subvec3.ptr = getelementptr <4 x float>, ptr %out.vec.ptr, i64 3
6087  store <4 x float> %in.subvec, ptr %out.subvec3.ptr, align 16
6088  ret void
6089}
6090
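; <4 x i64>: AVX1 lacks 256-bit integer xor, so it materializes the all-ones
; mask with vcmptrueps and uses vxorps; AVX2 and later use vpcmpeqd/vpxor on ymm.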
6091define void @vec512_v4i64(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
6092; SCALAR-LABEL: vec512_v4i64:
6093; SCALAR:       # %bb.0:
6094; SCALAR-NEXT:    movq 24(%rdi), %rax
6095; SCALAR-NEXT:    movq 16(%rdi), %rcx
6096; SCALAR-NEXT:    movq (%rdi), %r8
6097; SCALAR-NEXT:    movq 8(%rdi), %rdi
6098; SCALAR-NEXT:    notq %r8
6099; SCALAR-NEXT:    notq %rdi
6100; SCALAR-NEXT:    notq %rcx
6101; SCALAR-NEXT:    notq %rax
6102; SCALAR-NEXT:    movq %rax, 24(%rsi)
6103; SCALAR-NEXT:    movq %rcx, 16(%rsi)
6104; SCALAR-NEXT:    movq %rdi, 8(%rsi)
6105; SCALAR-NEXT:    movq %r8, (%rsi)
6106; SCALAR-NEXT:    movq %rax, 24(%rdx)
6107; SCALAR-NEXT:    movq %rcx, 16(%rdx)
6108; SCALAR-NEXT:    movq %rdi, 8(%rdx)
6109; SCALAR-NEXT:    movq %r8, (%rdx)
6110; SCALAR-NEXT:    movq %rax, 56(%rdx)
6111; SCALAR-NEXT:    movq %rcx, 48(%rdx)
6112; SCALAR-NEXT:    movq %rdi, 40(%rdx)
6113; SCALAR-NEXT:    movq %r8, 32(%rdx)
6114; SCALAR-NEXT:    retq
6115;
6116; SSE2-LABEL: vec512_v4i64:
6117; SSE2:       # %bb.0:
6118; SSE2-NEXT:    pcmpeqd %xmm0, %xmm0
6119; SSE2-NEXT:    movdqa 16(%rdi), %xmm1
6120; SSE2-NEXT:    pxor %xmm0, %xmm1
6121; SSE2-NEXT:    pxor (%rdi), %xmm0
6122; SSE2-NEXT:    movdqa %xmm0, (%rsi)
6123; SSE2-NEXT:    movdqa %xmm1, 16(%rsi)
6124; SSE2-NEXT:    movdqa %xmm0, (%rdx)
6125; SSE2-NEXT:    movdqa %xmm1, 16(%rdx)
6126; SSE2-NEXT:    movdqa %xmm1, 48(%rdx)
6127; SSE2-NEXT:    movdqa %xmm0, 32(%rdx)
6128; SSE2-NEXT:    retq
6129;
6130; AVX1-LABEL: vec512_v4i64:
6131; AVX1:       # %bb.0:
6132; AVX1-NEXT:    vxorps %xmm0, %xmm0, %xmm0
6133; AVX1-NEXT:    vcmptrueps %ymm0, %ymm0, %ymm0
6134; AVX1-NEXT:    vxorps (%rdi), %ymm0, %ymm0
6135; AVX1-NEXT:    vmovaps %ymm0, (%rsi)
6136; AVX1-NEXT:    vmovaps %ymm0, (%rdx)
6137; AVX1-NEXT:    vmovaps %ymm0, 32(%rdx)
6138; AVX1-NEXT:    vzeroupper
6139; AVX1-NEXT:    retq
6140;
6141; AVX2-LABEL: vec512_v4i64:
6142; AVX2:       # %bb.0:
6143; AVX2-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
6144; AVX2-NEXT:    vpxor (%rdi), %ymm0, %ymm0
6145; AVX2-NEXT:    vmovdqa %ymm0, (%rsi)
6146; AVX2-NEXT:    vmovdqa %ymm0, (%rdx)
6147; AVX2-NEXT:    vmovdqa %ymm0, 32(%rdx)
6148; AVX2-NEXT:    vzeroupper
6149; AVX2-NEXT:    retq
6150  %in.subvec.not = load <4 x i64>, ptr %in.subvec.ptr, align 64
6151  %in.subvec = xor <4 x i64> %in.subvec.not, <i64 -1, i64 -1, i64 -1, i64 -1>
6152  store <4 x i64> %in.subvec, ptr %out.subvec.ptr, align 64
6153  %out.subvec0.ptr = getelementptr <4 x i64>, ptr %out.vec.ptr, i64 0
6154  store <4 x i64> %in.subvec, ptr %out.subvec0.ptr, align 64
6155  %out.subvec1.ptr = getelementptr <4 x i64>, ptr %out.vec.ptr, i64 1
6156  store <4 x i64> %in.subvec, ptr %out.subvec1.ptr, align 32
6157  ret void
6158}
6159
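; <4 x double> mirrors vec512_v4i64 via the <4 x i64> bitcast.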
6160define void @vec512_v4f64(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
6161; SCALAR-LABEL: vec512_v4f64:
6162; SCALAR:       # %bb.0:
6163; SCALAR-NEXT:    movq 24(%rdi), %rax
6164; SCALAR-NEXT:    movq 16(%rdi), %rcx
6165; SCALAR-NEXT:    movq (%rdi), %r8
6166; SCALAR-NEXT:    movq 8(%rdi), %rdi
6167; SCALAR-NEXT:    notq %r8
6168; SCALAR-NEXT:    notq %rdi
6169; SCALAR-NEXT:    notq %rcx
6170; SCALAR-NEXT:    notq %rax
6171; SCALAR-NEXT:    movq %rax, 24(%rsi)
6172; SCALAR-NEXT:    movq %rcx, 16(%rsi)
6173; SCALAR-NEXT:    movq %rdi, 8(%rsi)
6174; SCALAR-NEXT:    movq %r8, (%rsi)
6175; SCALAR-NEXT:    movq %rax, 24(%rdx)
6176; SCALAR-NEXT:    movq %rcx, 16(%rdx)
6177; SCALAR-NEXT:    movq %rdi, 8(%rdx)
6178; SCALAR-NEXT:    movq %r8, (%rdx)
6179; SCALAR-NEXT:    movq %rax, 56(%rdx)
6180; SCALAR-NEXT:    movq %rcx, 48(%rdx)
6181; SCALAR-NEXT:    movq %rdi, 40(%rdx)
6182; SCALAR-NEXT:    movq %r8, 32(%rdx)
6183; SCALAR-NEXT:    retq
6184;
6185; SSE2-LABEL: vec512_v4f64:
6186; SSE2:       # %bb.0:
6187; SSE2-NEXT:    pcmpeqd %xmm0, %xmm0
6188; SSE2-NEXT:    movdqa 16(%rdi), %xmm1
6189; SSE2-NEXT:    pxor %xmm0, %xmm1
6190; SSE2-NEXT:    pxor (%rdi), %xmm0
6191; SSE2-NEXT:    movdqa %xmm0, (%rsi)
6192; SSE2-NEXT:    movdqa %xmm1, 16(%rsi)
6193; SSE2-NEXT:    movdqa %xmm0, (%rdx)
6194; SSE2-NEXT:    movdqa %xmm1, 16(%rdx)
6195; SSE2-NEXT:    movdqa %xmm1, 48(%rdx)
6196; SSE2-NEXT:    movdqa %xmm0, 32(%rdx)
6197; SSE2-NEXT:    retq
6198;
6199; AVX1-LABEL: vec512_v4f64:
6200; AVX1:       # %bb.0:
6201; AVX1-NEXT:    vxorps %xmm0, %xmm0, %xmm0
6202; AVX1-NEXT:    vcmptrueps %ymm0, %ymm0, %ymm0
6203; AVX1-NEXT:    vxorps (%rdi), %ymm0, %ymm0
6204; AVX1-NEXT:    vmovaps %ymm0, (%rsi)
6205; AVX1-NEXT:    vmovaps %ymm0, (%rdx)
6206; AVX1-NEXT:    vmovaps %ymm0, 32(%rdx)
6207; AVX1-NEXT:    vzeroupper
6208; AVX1-NEXT:    retq
6209;
6210; AVX2-LABEL: vec512_v4f64:
6211; AVX2:       # %bb.0:
6212; AVX2-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
6213; AVX2-NEXT:    vpxor (%rdi), %ymm0, %ymm0
6214; AVX2-NEXT:    vmovdqa %ymm0, (%rsi)
6215; AVX2-NEXT:    vmovdqa %ymm0, (%rdx)
6216; AVX2-NEXT:    vmovdqa %ymm0, 32(%rdx)
6217; AVX2-NEXT:    vzeroupper
6218; AVX2-NEXT:    retq
6219  %in.subvec.not = load <4 x i64>, ptr %in.subvec.ptr, align 64
6220  %in.subvec.int = xor <4 x i64> %in.subvec.not, <i64 -1, i64 -1, i64 -1, i64 -1>
6221  %in.subvec = bitcast <4 x i64> %in.subvec.int to <4 x double>
6222  store <4 x double> %in.subvec, ptr %out.subvec.ptr, align 64
6223  %out.subvec0.ptr = getelementptr <4 x double>, ptr %out.vec.ptr, i64 0
6224  store <4 x double> %in.subvec, ptr %out.subvec0.ptr, align 64
6225  %out.subvec1.ptr = getelementptr <4 x double>, ptr %out.vec.ptr, i64 1
6226  store <4 x double> %in.subvec, ptr %out.subvec1.ptr, align 32
6227  ret void
6228}
6229
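; <8 x i8>: the scalar run needs a callee-saved register (%rbx) for the eighth
; byte, hence the push/pop; vector targets reuse the qword broadcast from
; vec512_v2i32.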
6230define void @vec512_v8i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
6231; SCALAR-LABEL: vec512_v8i8:
6232; SCALAR:       # %bb.0:
6233; SCALAR-NEXT:    pushq %rbx
6234; SCALAR-NEXT:    movzbl 7(%rdi), %ebx
6235; SCALAR-NEXT:    movzbl 6(%rdi), %r11d
6236; SCALAR-NEXT:    movzbl 5(%rdi), %r10d
6237; SCALAR-NEXT:    movzbl 4(%rdi), %r9d
6238; SCALAR-NEXT:    movzbl 3(%rdi), %r8d
6239; SCALAR-NEXT:    movzbl 2(%rdi), %ecx
6240; SCALAR-NEXT:    movzbl (%rdi), %eax
6241; SCALAR-NEXT:    movzbl 1(%rdi), %edi
6242; SCALAR-NEXT:    notb %al
6243; SCALAR-NEXT:    notb %dil
6244; SCALAR-NEXT:    notb %cl
6245; SCALAR-NEXT:    notb %r8b
6246; SCALAR-NEXT:    notb %r9b
6247; SCALAR-NEXT:    notb %r10b
6248; SCALAR-NEXT:    notb %r11b
6249; SCALAR-NEXT:    notb %bl
6250; SCALAR-NEXT:    movb %bl, 7(%rsi)
6251; SCALAR-NEXT:    movb %r11b, 6(%rsi)
6252; SCALAR-NEXT:    movb %r10b, 5(%rsi)
6253; SCALAR-NEXT:    movb %r9b, 4(%rsi)
6254; SCALAR-NEXT:    movb %r8b, 3(%rsi)
6255; SCALAR-NEXT:    movb %cl, 2(%rsi)
6256; SCALAR-NEXT:    movb %dil, 1(%rsi)
6257; SCALAR-NEXT:    movb %al, (%rsi)
6258; SCALAR-NEXT:    movb %bl, 7(%rdx)
6259; SCALAR-NEXT:    movb %r11b, 6(%rdx)
6260; SCALAR-NEXT:    movb %r10b, 5(%rdx)
6261; SCALAR-NEXT:    movb %r9b, 4(%rdx)
6262; SCALAR-NEXT:    movb %r8b, 3(%rdx)
6263; SCALAR-NEXT:    movb %cl, 2(%rdx)
6264; SCALAR-NEXT:    movb %dil, 1(%rdx)
6265; SCALAR-NEXT:    movb %al, (%rdx)
6266; SCALAR-NEXT:    movb %bl, 15(%rdx)
6267; SCALAR-NEXT:    movb %r11b, 14(%rdx)
6268; SCALAR-NEXT:    movb %r10b, 13(%rdx)
6269; SCALAR-NEXT:    movb %r9b, 12(%rdx)
6270; SCALAR-NEXT:    movb %r8b, 11(%rdx)
6271; SCALAR-NEXT:    movb %cl, 10(%rdx)
6272; SCALAR-NEXT:    movb %dil, 9(%rdx)
6273; SCALAR-NEXT:    movb %al, 8(%rdx)
6274; SCALAR-NEXT:    movb %bl, 23(%rdx)
6275; SCALAR-NEXT:    movb %r11b, 22(%rdx)
6276; SCALAR-NEXT:    movb %r10b, 21(%rdx)
6277; SCALAR-NEXT:    movb %r9b, 20(%rdx)
6278; SCALAR-NEXT:    movb %r8b, 19(%rdx)
6279; SCALAR-NEXT:    movb %cl, 18(%rdx)
6280; SCALAR-NEXT:    movb %dil, 17(%rdx)
6281; SCALAR-NEXT:    movb %al, 16(%rdx)
6282; SCALAR-NEXT:    movb %bl, 31(%rdx)
6283; SCALAR-NEXT:    movb %r11b, 30(%rdx)
6284; SCALAR-NEXT:    movb %r10b, 29(%rdx)
6285; SCALAR-NEXT:    movb %r9b, 28(%rdx)
6286; SCALAR-NEXT:    movb %r8b, 27(%rdx)
6287; SCALAR-NEXT:    movb %cl, 26(%rdx)
6288; SCALAR-NEXT:    movb %dil, 25(%rdx)
6289; SCALAR-NEXT:    movb %al, 24(%rdx)
6290; SCALAR-NEXT:    movb %bl, 39(%rdx)
6291; SCALAR-NEXT:    movb %r11b, 38(%rdx)
6292; SCALAR-NEXT:    movb %r10b, 37(%rdx)
6293; SCALAR-NEXT:    movb %r9b, 36(%rdx)
6294; SCALAR-NEXT:    movb %r8b, 35(%rdx)
6295; SCALAR-NEXT:    movb %cl, 34(%rdx)
6296; SCALAR-NEXT:    movb %dil, 33(%rdx)
6297; SCALAR-NEXT:    movb %al, 32(%rdx)
6298; SCALAR-NEXT:    movb %bl, 47(%rdx)
6299; SCALAR-NEXT:    movb %r11b, 46(%rdx)
6300; SCALAR-NEXT:    movb %r10b, 45(%rdx)
6301; SCALAR-NEXT:    movb %r9b, 44(%rdx)
6302; SCALAR-NEXT:    movb %r8b, 43(%rdx)
6303; SCALAR-NEXT:    movb %cl, 42(%rdx)
6304; SCALAR-NEXT:    movb %dil, 41(%rdx)
6305; SCALAR-NEXT:    movb %al, 40(%rdx)
6306; SCALAR-NEXT:    movb %bl, 55(%rdx)
6307; SCALAR-NEXT:    movb %r11b, 54(%rdx)
6308; SCALAR-NEXT:    movb %r10b, 53(%rdx)
6309; SCALAR-NEXT:    movb %r9b, 52(%rdx)
6310; SCALAR-NEXT:    movb %r8b, 51(%rdx)
6311; SCALAR-NEXT:    movb %cl, 50(%rdx)
6312; SCALAR-NEXT:    movb %dil, 49(%rdx)
6313; SCALAR-NEXT:    movb %al, 48(%rdx)
6314; SCALAR-NEXT:    movb %bl, 63(%rdx)
6315; SCALAR-NEXT:    movb %r11b, 62(%rdx)
6316; SCALAR-NEXT:    movb %r10b, 61(%rdx)
6317; SCALAR-NEXT:    movb %r9b, 60(%rdx)
6318; SCALAR-NEXT:    movb %r8b, 59(%rdx)
6319; SCALAR-NEXT:    movb %cl, 58(%rdx)
6320; SCALAR-NEXT:    movb %dil, 57(%rdx)
6321; SCALAR-NEXT:    movb %al, 56(%rdx)
6322; SCALAR-NEXT:    popq %rbx
6323; SCALAR-NEXT:    retq
6324;
6325; SSE2-LABEL: vec512_v8i8:
6326; SSE2:       # %bb.0:
6327; SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
6328; SSE2-NEXT:    pcmpeqd %xmm1, %xmm1
6329; SSE2-NEXT:    pxor %xmm0, %xmm1
6330; SSE2-NEXT:    movq %xmm1, (%rsi)
6331; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,1,0,1]
6332; SSE2-NEXT:    movdqa %xmm0, (%rdx)
6333; SSE2-NEXT:    movdqa %xmm0, 16(%rdx)
6334; SSE2-NEXT:    movdqa %xmm0, 32(%rdx)
6335; SSE2-NEXT:    movdqa %xmm0, 48(%rdx)
6336; SSE2-NEXT:    retq
6337;
6338; AVX1-LABEL: vec512_v8i8:
6339; AVX1:       # %bb.0:
6340; AVX1-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
6341; AVX1-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
6342; AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
6343; AVX1-NEXT:    vmovq %xmm0, (%rsi)
6344; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
6345; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
6346; AVX1-NEXT:    vmovaps %ymm0, (%rdx)
6347; AVX1-NEXT:    vmovaps %ymm0, 32(%rdx)
6348; AVX1-NEXT:    vzeroupper
6349; AVX1-NEXT:    retq
6350;
6351; AVX2-ONLY-LABEL: vec512_v8i8:
6352; AVX2-ONLY:       # %bb.0:
6353; AVX2-ONLY-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
6354; AVX2-ONLY-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
6355; AVX2-ONLY-NEXT:    vpxor %xmm1, %xmm0, %xmm0
6356; AVX2-ONLY-NEXT:    vmovq %xmm0, (%rsi)
6357; AVX2-ONLY-NEXT:    vpbroadcastq %xmm0, %ymm0
6358; AVX2-ONLY-NEXT:    vmovdqa %ymm0, (%rdx)
6359; AVX2-ONLY-NEXT:    vmovdqa %ymm0, 32(%rdx)
6360; AVX2-ONLY-NEXT:    vzeroupper
6361; AVX2-ONLY-NEXT:    retq
6362;
6363; AVX512-LABEL: vec512_v8i8:
6364; AVX512:       # %bb.0:
6365; AVX512-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
6366; AVX512-NEXT:    vpternlogq {{.*#+}} xmm0 = ~xmm0
6367; AVX512-NEXT:    vmovq %xmm0, (%rsi)
6368; AVX512-NEXT:    vpbroadcastq %xmm0, %zmm0
6369; AVX512-NEXT:    vmovdqa64 %zmm0, (%rdx)
6370; AVX512-NEXT:    vzeroupper
6371; AVX512-NEXT:    retq
6372  %in.subvec.not = load <8 x i8>, ptr %in.subvec.ptr, align 64
6373  %in.subvec = xor <8 x i8> %in.subvec.not, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
6374  store <8 x i8> %in.subvec, ptr %out.subvec.ptr, align 64
6375  %out.subvec0.ptr = getelementptr <8 x i8>, ptr %out.vec.ptr, i64 0
6376  store <8 x i8> %in.subvec, ptr %out.subvec0.ptr, align 64
6377  %out.subvec1.ptr = getelementptr <8 x i8>, ptr %out.vec.ptr, i64 1
6378  store <8 x i8> %in.subvec, ptr %out.subvec1.ptr, align 8
6379  %out.subvec2.ptr = getelementptr <8 x i8>, ptr %out.vec.ptr, i64 2
6380  store <8 x i8> %in.subvec, ptr %out.subvec2.ptr, align 16
6381  %out.subvec3.ptr = getelementptr <8 x i8>, ptr %out.vec.ptr, i64 3
6382  store <8 x i8> %in.subvec, ptr %out.subvec3.ptr, align 8
6383  %out.subvec4.ptr = getelementptr <8 x i8>, ptr %out.vec.ptr, i64 4
6384  store <8 x i8> %in.subvec, ptr %out.subvec4.ptr, align 32
6385  %out.subvec5.ptr = getelementptr <8 x i8>, ptr %out.vec.ptr, i64 5
6386  store <8 x i8> %in.subvec, ptr %out.subvec5.ptr, align 8
6387  %out.subvec6.ptr = getelementptr <8 x i8>, ptr %out.vec.ptr, i64 6
6388  store <8 x i8> %in.subvec, ptr %out.subvec6.ptr, align 16
6389  %out.subvec7.ptr = getelementptr <8 x i8>, ptr %out.vec.ptr, i64 7
6390  store <8 x i8> %in.subvec, ptr %out.subvec7.ptr, align 8
6391  ret void
6392}
6393
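; <8 x i16>: vector targets store the full xmm four times, as for vec512_v2i64;
; the scalar run again saves %rbx to hold all eight lanes.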
6394define void @vec512_v8i16(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
6395; SCALAR-LABEL: vec512_v8i16:
6396; SCALAR:       # %bb.0:
6397; SCALAR-NEXT:    pushq %rbx
6398; SCALAR-NEXT:    movzwl 14(%rdi), %ebx
6399; SCALAR-NEXT:    movl 12(%rdi), %r11d
6400; SCALAR-NEXT:    movzwl 10(%rdi), %r10d
6401; SCALAR-NEXT:    movl 8(%rdi), %r9d
6402; SCALAR-NEXT:    movzwl 6(%rdi), %r8d
6403; SCALAR-NEXT:    movzwl 2(%rdi), %ecx
6404; SCALAR-NEXT:    movl (%rdi), %eax
6405; SCALAR-NEXT:    movl 4(%rdi), %edi
6406; SCALAR-NEXT:    notl %eax
6407; SCALAR-NEXT:    notl %ecx
6408; SCALAR-NEXT:    notl %edi
6409; SCALAR-NEXT:    notl %r8d
6410; SCALAR-NEXT:    notl %r9d
6411; SCALAR-NEXT:    notl %r10d
6412; SCALAR-NEXT:    notl %r11d
6413; SCALAR-NEXT:    notl %ebx
6414; SCALAR-NEXT:    movw %bx, 14(%rsi)
6415; SCALAR-NEXT:    movw %r11w, 12(%rsi)
6416; SCALAR-NEXT:    movw %r10w, 10(%rsi)
6417; SCALAR-NEXT:    movw %r9w, 8(%rsi)
6418; SCALAR-NEXT:    movw %r8w, 6(%rsi)
6419; SCALAR-NEXT:    movw %di, 4(%rsi)
6420; SCALAR-NEXT:    movw %cx, 2(%rsi)
6421; SCALAR-NEXT:    movw %ax, (%rsi)
6422; SCALAR-NEXT:    movw %bx, 14(%rdx)
6423; SCALAR-NEXT:    movw %r11w, 12(%rdx)
6424; SCALAR-NEXT:    movw %r10w, 10(%rdx)
6425; SCALAR-NEXT:    movw %r9w, 8(%rdx)
6426; SCALAR-NEXT:    movw %r8w, 6(%rdx)
6427; SCALAR-NEXT:    movw %di, 4(%rdx)
6428; SCALAR-NEXT:    movw %cx, 2(%rdx)
6429; SCALAR-NEXT:    movw %ax, (%rdx)
6430; SCALAR-NEXT:    movw %bx, 30(%rdx)
6431; SCALAR-NEXT:    movw %r11w, 28(%rdx)
6432; SCALAR-NEXT:    movw %r10w, 26(%rdx)
6433; SCALAR-NEXT:    movw %r9w, 24(%rdx)
6434; SCALAR-NEXT:    movw %r8w, 22(%rdx)
6435; SCALAR-NEXT:    movw %di, 20(%rdx)
6436; SCALAR-NEXT:    movw %cx, 18(%rdx)
6437; SCALAR-NEXT:    movw %ax, 16(%rdx)
6438; SCALAR-NEXT:    movw %bx, 46(%rdx)
6439; SCALAR-NEXT:    movw %r11w, 44(%rdx)
6440; SCALAR-NEXT:    movw %r10w, 42(%rdx)
6441; SCALAR-NEXT:    movw %r9w, 40(%rdx)
6442; SCALAR-NEXT:    movw %r8w, 38(%rdx)
6443; SCALAR-NEXT:    movw %di, 36(%rdx)
6444; SCALAR-NEXT:    movw %cx, 34(%rdx)
6445; SCALAR-NEXT:    movw %ax, 32(%rdx)
6446; SCALAR-NEXT:    movw %bx, 62(%rdx)
6447; SCALAR-NEXT:    movw %r11w, 60(%rdx)
6448; SCALAR-NEXT:    movw %r10w, 58(%rdx)
6449; SCALAR-NEXT:    movw %r9w, 56(%rdx)
6450; SCALAR-NEXT:    movw %r8w, 54(%rdx)
6451; SCALAR-NEXT:    movw %di, 52(%rdx)
6452; SCALAR-NEXT:    movw %cx, 50(%rdx)
6453; SCALAR-NEXT:    movw %ax, 48(%rdx)
6454; SCALAR-NEXT:    popq %rbx
6455; SCALAR-NEXT:    retq
6456;
6457; SSE2-LABEL: vec512_v8i16:
6458; SSE2:       # %bb.0:
6459; SSE2-NEXT:    pcmpeqd %xmm0, %xmm0
6460; SSE2-NEXT:    pxor (%rdi), %xmm0
6461; SSE2-NEXT:    movdqa %xmm0, (%rsi)
6462; SSE2-NEXT:    movdqa %xmm0, (%rdx)
6463; SSE2-NEXT:    movdqa %xmm0, 16(%rdx)
6464; SSE2-NEXT:    movdqa %xmm0, 32(%rdx)
6465; SSE2-NEXT:    movdqa %xmm0, 48(%rdx)
6466; SSE2-NEXT:    retq
6467;
6468; AVX-LABEL: vec512_v8i16:
6469; AVX:       # %bb.0:
6470; AVX-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
6471; AVX-NEXT:    vpxor (%rdi), %xmm0, %xmm0
6472; AVX-NEXT:    vmovdqa %xmm0, (%rsi)
6473; AVX-NEXT:    vmovdqa %xmm0, (%rdx)
6474; AVX-NEXT:    vmovdqa %xmm0, 16(%rdx)
6475; AVX-NEXT:    vmovdqa %xmm0, 32(%rdx)
6476; AVX-NEXT:    vmovdqa %xmm0, 48(%rdx)
6477; AVX-NEXT:    retq
6478  %in.subvec.not = load <8 x i16>, ptr %in.subvec.ptr, align 64
6479  %in.subvec = xor <8 x i16> %in.subvec.not, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
6480  store <8 x i16> %in.subvec, ptr %out.subvec.ptr, align 64
6481  %out.subvec0.ptr = getelementptr <8 x i16>, ptr %out.vec.ptr, i64 0
6482  store <8 x i16> %in.subvec, ptr %out.subvec0.ptr, align 64
6483  %out.subvec1.ptr = getelementptr <8 x i16>, ptr %out.vec.ptr, i64 1
6484  store <8 x i16> %in.subvec, ptr %out.subvec1.ptr, align 16
6485  %out.subvec2.ptr = getelementptr <8 x i16>, ptr %out.vec.ptr, i64 2
6486  store <8 x i16> %in.subvec, ptr %out.subvec2.ptr, align 32
6487  %out.subvec3.ptr = getelementptr <8 x i16>, ptr %out.vec.ptr, i64 3
6488  store <8 x i16> %in.subvec, ptr %out.subvec3.ptr, align 16
6489  ret void
6490}
6491
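; <8 x i32>: SSE2 splits the 32-byte subvector into two xmm halves, negates
; each, and stores each half twice into the destination; AVX1 again needs
; vcmptrueps to build the ymm all-ones mask.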
6492define void @vec512_v8i32(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
6493; SCALAR-LABEL: vec512_v8i32:
6494; SCALAR:       # %bb.0:
6495; SCALAR-NEXT:    pushq %rbx
6496; SCALAR-NEXT:    movl 28(%rdi), %ebx
6497; SCALAR-NEXT:    movl 24(%rdi), %r11d
6498; SCALAR-NEXT:    movl 20(%rdi), %r10d
6499; SCALAR-NEXT:    movl 16(%rdi), %r9d
6500; SCALAR-NEXT:    movl 12(%rdi), %r8d
6501; SCALAR-NEXT:    movl 8(%rdi), %ecx
6502; SCALAR-NEXT:    movl (%rdi), %eax
6503; SCALAR-NEXT:    movl 4(%rdi), %edi
6504; SCALAR-NEXT:    notl %eax
6505; SCALAR-NEXT:    notl %edi
6506; SCALAR-NEXT:    notl %ecx
6507; SCALAR-NEXT:    notl %r8d
6508; SCALAR-NEXT:    notl %r9d
6509; SCALAR-NEXT:    notl %r10d
6510; SCALAR-NEXT:    notl %r11d
6511; SCALAR-NEXT:    notl %ebx
6512; SCALAR-NEXT:    movl %ebx, 28(%rsi)
6513; SCALAR-NEXT:    movl %r11d, 24(%rsi)
6514; SCALAR-NEXT:    movl %r10d, 20(%rsi)
6515; SCALAR-NEXT:    movl %r9d, 16(%rsi)
6516; SCALAR-NEXT:    movl %r8d, 12(%rsi)
6517; SCALAR-NEXT:    movl %ecx, 8(%rsi)
6518; SCALAR-NEXT:    movl %edi, 4(%rsi)
6519; SCALAR-NEXT:    movl %eax, (%rsi)
6520; SCALAR-NEXT:    movl %ebx, 28(%rdx)
6521; SCALAR-NEXT:    movl %r11d, 24(%rdx)
6522; SCALAR-NEXT:    movl %r10d, 20(%rdx)
6523; SCALAR-NEXT:    movl %r9d, 16(%rdx)
6524; SCALAR-NEXT:    movl %r8d, 12(%rdx)
6525; SCALAR-NEXT:    movl %ecx, 8(%rdx)
6526; SCALAR-NEXT:    movl %edi, 4(%rdx)
6527; SCALAR-NEXT:    movl %eax, (%rdx)
6528; SCALAR-NEXT:    movl %ebx, 60(%rdx)
6529; SCALAR-NEXT:    movl %r11d, 56(%rdx)
6530; SCALAR-NEXT:    movl %r10d, 52(%rdx)
6531; SCALAR-NEXT:    movl %r9d, 48(%rdx)
6532; SCALAR-NEXT:    movl %r8d, 44(%rdx)
6533; SCALAR-NEXT:    movl %ecx, 40(%rdx)
6534; SCALAR-NEXT:    movl %edi, 36(%rdx)
6535; SCALAR-NEXT:    movl %eax, 32(%rdx)
6536; SCALAR-NEXT:    popq %rbx
6537; SCALAR-NEXT:    retq
6538;
6539; SSE2-LABEL: vec512_v8i32:
6540; SSE2:       # %bb.0:
6541; SSE2-NEXT:    pcmpeqd %xmm0, %xmm0
6542; SSE2-NEXT:    movdqa 16(%rdi), %xmm1
6543; SSE2-NEXT:    pxor %xmm0, %xmm1
6544; SSE2-NEXT:    pxor (%rdi), %xmm0
6545; SSE2-NEXT:    movdqa %xmm0, (%rsi)
6546; SSE2-NEXT:    movdqa %xmm1, 16(%rsi)
6547; SSE2-NEXT:    movdqa %xmm0, (%rdx)
6548; SSE2-NEXT:    movdqa %xmm1, 16(%rdx)
6549; SSE2-NEXT:    movdqa %xmm1, 48(%rdx)
6550; SSE2-NEXT:    movdqa %xmm0, 32(%rdx)
6551; SSE2-NEXT:    retq
6552;
6553; AVX1-LABEL: vec512_v8i32:
6554; AVX1:       # %bb.0:
6555; AVX1-NEXT:    vxorps %xmm0, %xmm0, %xmm0
6556; AVX1-NEXT:    vcmptrueps %ymm0, %ymm0, %ymm0
6557; AVX1-NEXT:    vxorps (%rdi), %ymm0, %ymm0
6558; AVX1-NEXT:    vmovaps %ymm0, (%rsi)
6559; AVX1-NEXT:    vmovaps %ymm0, (%rdx)
6560; AVX1-NEXT:    vmovaps %ymm0, 32(%rdx)
6561; AVX1-NEXT:    vzeroupper
6562; AVX1-NEXT:    retq
6563;
6564; AVX2-LABEL: vec512_v8i32:
6565; AVX2:       # %bb.0:
6566; AVX2-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
6567; AVX2-NEXT:    vpxor (%rdi), %ymm0, %ymm0
6568; AVX2-NEXT:    vmovdqa %ymm0, (%rsi)
6569; AVX2-NEXT:    vmovdqa %ymm0, (%rdx)
6570; AVX2-NEXT:    vmovdqa %ymm0, 32(%rdx)
6571; AVX2-NEXT:    vzeroupper
6572; AVX2-NEXT:    retq
6573  %in.subvec.not = load <8 x i32>, ptr %in.subvec.ptr, align 64
6574  %in.subvec = xor <8 x i32> %in.subvec.not, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
6575  store <8 x i32> %in.subvec, ptr %out.subvec.ptr, align 64
6576  %out.subvec0.ptr = getelementptr <8 x i32>, ptr %out.vec.ptr, i64 0
6577  store <8 x i32> %in.subvec, ptr %out.subvec0.ptr, align 64
6578  %out.subvec1.ptr = getelementptr <8 x i32>, ptr %out.vec.ptr, i64 1
6579  store <8 x i32> %in.subvec, ptr %out.subvec1.ptr, align 32
6580  ret void
6581}
6582
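; Same as vec512_v8i32, except the NOT'ed subvector is bitcast to <8 x float> before the stores.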
6583define void @vec512_v8f32(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
6584; SCALAR-LABEL: vec512_v8f32:
6585; SCALAR:       # %bb.0:
6586; SCALAR-NEXT:    pushq %rbx
6587; SCALAR-NEXT:    movl 28(%rdi), %ebx
6588; SCALAR-NEXT:    movl 24(%rdi), %r11d
6589; SCALAR-NEXT:    movl 20(%rdi), %r10d
6590; SCALAR-NEXT:    movl 16(%rdi), %r9d
6591; SCALAR-NEXT:    movl 12(%rdi), %r8d
6592; SCALAR-NEXT:    movl 8(%rdi), %ecx
6593; SCALAR-NEXT:    movl (%rdi), %eax
6594; SCALAR-NEXT:    movl 4(%rdi), %edi
6595; SCALAR-NEXT:    notl %eax
6596; SCALAR-NEXT:    notl %edi
6597; SCALAR-NEXT:    notl %ecx
6598; SCALAR-NEXT:    notl %r8d
6599; SCALAR-NEXT:    notl %r9d
6600; SCALAR-NEXT:    notl %r10d
6601; SCALAR-NEXT:    notl %r11d
6602; SCALAR-NEXT:    notl %ebx
6603; SCALAR-NEXT:    movl %ebx, 28(%rsi)
6604; SCALAR-NEXT:    movl %r11d, 24(%rsi)
6605; SCALAR-NEXT:    movl %r10d, 20(%rsi)
6606; SCALAR-NEXT:    movl %r9d, 16(%rsi)
6607; SCALAR-NEXT:    movl %r8d, 12(%rsi)
6608; SCALAR-NEXT:    movl %ecx, 8(%rsi)
6609; SCALAR-NEXT:    movl %edi, 4(%rsi)
6610; SCALAR-NEXT:    movl %eax, (%rsi)
6611; SCALAR-NEXT:    movl %ebx, 28(%rdx)
6612; SCALAR-NEXT:    movl %r11d, 24(%rdx)
6613; SCALAR-NEXT:    movl %r10d, 20(%rdx)
6614; SCALAR-NEXT:    movl %r9d, 16(%rdx)
6615; SCALAR-NEXT:    movl %r8d, 12(%rdx)
6616; SCALAR-NEXT:    movl %ecx, 8(%rdx)
6617; SCALAR-NEXT:    movl %edi, 4(%rdx)
6618; SCALAR-NEXT:    movl %eax, (%rdx)
6619; SCALAR-NEXT:    movl %ebx, 60(%rdx)
6620; SCALAR-NEXT:    movl %r11d, 56(%rdx)
6621; SCALAR-NEXT:    movl %r10d, 52(%rdx)
6622; SCALAR-NEXT:    movl %r9d, 48(%rdx)
6623; SCALAR-NEXT:    movl %r8d, 44(%rdx)
6624; SCALAR-NEXT:    movl %ecx, 40(%rdx)
6625; SCALAR-NEXT:    movl %edi, 36(%rdx)
6626; SCALAR-NEXT:    movl %eax, 32(%rdx)
6627; SCALAR-NEXT:    popq %rbx
6628; SCALAR-NEXT:    retq
6629;
6630; SSE2-LABEL: vec512_v8f32:
6631; SSE2:       # %bb.0:
6632; SSE2-NEXT:    pcmpeqd %xmm0, %xmm0
6633; SSE2-NEXT:    movdqa 16(%rdi), %xmm1
6634; SSE2-NEXT:    pxor %xmm0, %xmm1
6635; SSE2-NEXT:    pxor (%rdi), %xmm0
6636; SSE2-NEXT:    movdqa %xmm0, (%rsi)
6637; SSE2-NEXT:    movdqa %xmm1, 16(%rsi)
6638; SSE2-NEXT:    movdqa %xmm0, (%rdx)
6639; SSE2-NEXT:    movdqa %xmm1, 16(%rdx)
6640; SSE2-NEXT:    movdqa %xmm1, 48(%rdx)
6641; SSE2-NEXT:    movdqa %xmm0, 32(%rdx)
6642; SSE2-NEXT:    retq
6643;
6644; AVX1-LABEL: vec512_v8f32:
6645; AVX1:       # %bb.0:
6646; AVX1-NEXT:    vxorps %xmm0, %xmm0, %xmm0
6647; AVX1-NEXT:    vcmptrueps %ymm0, %ymm0, %ymm0
6648; AVX1-NEXT:    vxorps (%rdi), %ymm0, %ymm0
6649; AVX1-NEXT:    vmovaps %ymm0, (%rsi)
6650; AVX1-NEXT:    vmovaps %ymm0, (%rdx)
6651; AVX1-NEXT:    vmovaps %ymm0, 32(%rdx)
6652; AVX1-NEXT:    vzeroupper
6653; AVX1-NEXT:    retq
6654;
6655; AVX2-LABEL: vec512_v8f32:
6656; AVX2:       # %bb.0:
6657; AVX2-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
6658; AVX2-NEXT:    vpxor (%rdi), %ymm0, %ymm0
6659; AVX2-NEXT:    vmovdqa %ymm0, (%rsi)
6660; AVX2-NEXT:    vmovdqa %ymm0, (%rdx)
6661; AVX2-NEXT:    vmovdqa %ymm0, 32(%rdx)
6662; AVX2-NEXT:    vzeroupper
6663; AVX2-NEXT:    retq
6664  %in.subvec.not = load <8 x i32>, ptr %in.subvec.ptr, align 64
6665  %in.subvec.int = xor <8 x i32> %in.subvec.not, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
6666  %in.subvec = bitcast <8 x i32> %in.subvec.int to <8 x float>
6667  store <8 x float> %in.subvec, ptr %out.subvec.ptr, align 64
6668  %out.subvec0.ptr = getelementptr <8 x float>, ptr %out.vec.ptr, i64 0
6669  store <8 x float> %in.subvec, ptr %out.subvec0.ptr, align 64
6670  %out.subvec1.ptr = getelementptr <8 x float>, ptr %out.vec.ptr, i64 1
6671  store <8 x float> %in.subvec, ptr %out.subvec1.ptr, align 32
6672  ret void
6673}
6674
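; Splat test: load a <16 x i8> (16-byte) subvector, bitwise-NOT it, store it to %out.subvec.ptr, then store four copies (offsets 0, 16, 32, 48) into the 64-byte %out.vec.ptr.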
6675define void @vec512_v16i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
6676; SCALAR-LABEL: vec512_v16i8:
6677; SCALAR:       # %bb.0:
6678; SCALAR-NEXT:    pushq %rbp
6679; SCALAR-NEXT:    pushq %r15
6680; SCALAR-NEXT:    pushq %r14
6681; SCALAR-NEXT:    pushq %r13
6682; SCALAR-NEXT:    pushq %r12
6683; SCALAR-NEXT:    pushq %rbx
6684; SCALAR-NEXT:    movzbl 15(%rdi), %eax
6685; SCALAR-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
6686; SCALAR-NEXT:    movzbl 14(%rdi), %eax
6687; SCALAR-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
6688; SCALAR-NEXT:    movzbl 13(%rdi), %eax
6689; SCALAR-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
6690; SCALAR-NEXT:    movzbl 12(%rdi), %r10d
6691; SCALAR-NEXT:    movzbl 11(%rdi), %r13d
6692; SCALAR-NEXT:    movzbl 10(%rdi), %r12d
6693; SCALAR-NEXT:    movzbl 9(%rdi), %r15d
6694; SCALAR-NEXT:    movzbl 8(%rdi), %r14d
6695; SCALAR-NEXT:    movzbl 7(%rdi), %ebp
6696; SCALAR-NEXT:    movzbl 6(%rdi), %r11d
6697; SCALAR-NEXT:    movzbl 5(%rdi), %ebx
6698; SCALAR-NEXT:    movzbl 4(%rdi), %r9d
6699; SCALAR-NEXT:    movzbl 3(%rdi), %r8d
6700; SCALAR-NEXT:    movzbl 2(%rdi), %ecx
6701; SCALAR-NEXT:    movzbl (%rdi), %eax
6702; SCALAR-NEXT:    movzbl 1(%rdi), %edi
6703; SCALAR-NEXT:    notb %al
6704; SCALAR-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
6705; SCALAR-NEXT:    notb %dil
6706; SCALAR-NEXT:    movb %dil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
6707; SCALAR-NEXT:    notb %cl
6708; SCALAR-NEXT:    movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
6709; SCALAR-NEXT:    notb %r8b
6710; SCALAR-NEXT:    movb %r8b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
6711; SCALAR-NEXT:    notb %r9b
6712; SCALAR-NEXT:    movb %r9b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
6713; SCALAR-NEXT:    movl %ebx, %r9d
6714; SCALAR-NEXT:    notb %r9b
6715; SCALAR-NEXT:    notb %r11b
6716; SCALAR-NEXT:    movl %r11d, %ebx
6717; SCALAR-NEXT:    notb %bpl
6718; SCALAR-NEXT:    notb %r14b
6719; SCALAR-NEXT:    notb %r15b
6720; SCALAR-NEXT:    movb %r15b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
6721; SCALAR-NEXT:    notb %r12b
6722; SCALAR-NEXT:    movb %r12b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
6723; SCALAR-NEXT:    notb %r13b
6724; SCALAR-NEXT:    notb %r10b
6725; SCALAR-NEXT:    movb %r10b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
6726; SCALAR-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 1-byte Folded Reload
6727; SCALAR-NEXT:    notb %dil
6728; SCALAR-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r8d # 1-byte Folded Reload
6729; SCALAR-NEXT:    notb %r8b
6730; SCALAR-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r11d # 1-byte Folded Reload
6731; SCALAR-NEXT:    notb %r11b
6732; SCALAR-NEXT:    movb %r11b, 15(%rsi)
6733; SCALAR-NEXT:    movb %r11b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
6734; SCALAR-NEXT:    movb %r8b, 14(%rsi)
6735; SCALAR-NEXT:    movb %r8b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
6736; SCALAR-NEXT:    movl %edi, %eax
6737; SCALAR-NEXT:    movb %dil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
6738; SCALAR-NEXT:    movb %dil, 13(%rsi)
6739; SCALAR-NEXT:    movb %r10b, 12(%rsi)
6740; SCALAR-NEXT:    movb %r13b, 11(%rsi)
6741; SCALAR-NEXT:    movb %r12b, 10(%rsi)
6742; SCALAR-NEXT:    movb %r15b, 9(%rsi)
6743; SCALAR-NEXT:    movb %r14b, 8(%rsi)
6744; SCALAR-NEXT:    movl %r14d, %r12d
6745; SCALAR-NEXT:    movb %bpl, 7(%rsi)
6746; SCALAR-NEXT:    movl %ebp, %r14d
6747; SCALAR-NEXT:    movb %bpl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
6748; SCALAR-NEXT:    movb %bl, 6(%rsi)
6749; SCALAR-NEXT:    movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
6750; SCALAR-NEXT:    movb %r9b, 5(%rsi)
6751; SCALAR-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
6752; SCALAR-NEXT:    movb %cl, 4(%rsi)
6753; SCALAR-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebp # 1-byte Folded Reload
6754; SCALAR-NEXT:    movb %bpl, 3(%rsi)
6755; SCALAR-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 1-byte Folded Reload
6756; SCALAR-NEXT:    movb %dil, 2(%rsi)
6757; SCALAR-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
6758; SCALAR-NEXT:    movb %cl, 1(%rsi)
6759; SCALAR-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r10d # 1-byte Folded Reload
6760; SCALAR-NEXT:    movb %r10b, (%rsi)
6761; SCALAR-NEXT:    movb %r11b, 15(%rdx)
6762; SCALAR-NEXT:    movb %r8b, 14(%rdx)
6763; SCALAR-NEXT:    movb %al, 13(%rdx)
6764; SCALAR-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
6765; SCALAR-NEXT:    movb %al, 12(%rdx)
6766; SCALAR-NEXT:    movb %r13b, 11(%rdx)
6767; SCALAR-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r15d # 1-byte Folded Reload
6768; SCALAR-NEXT:    movb %r15b, 10(%rdx)
6769; SCALAR-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload
6770; SCALAR-NEXT:    movb %sil, 9(%rdx)
6771; SCALAR-NEXT:    movb %r12b, 8(%rdx)
6772; SCALAR-NEXT:    movb %r14b, 7(%rdx)
6773; SCALAR-NEXT:    movb %bl, 6(%rdx)
6774; SCALAR-NEXT:    movb %r9b, 5(%rdx)
6775; SCALAR-NEXT:    movl %r9d, %r11d
6776; SCALAR-NEXT:    movb %r9b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
6777; SCALAR-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r8d # 1-byte Folded Reload
6778; SCALAR-NEXT:    movb %r8b, 4(%rdx)
6779; SCALAR-NEXT:    movb %bpl, 3(%rdx)
6780; SCALAR-NEXT:    movb %dil, 2(%rdx)
6781; SCALAR-NEXT:    movb %cl, 1(%rdx)
6782; SCALAR-NEXT:    movl %ecx, %r14d
6783; SCALAR-NEXT:    movl %r10d, %esi
6784; SCALAR-NEXT:    movb %r10b, (%rdx)
6785; SCALAR-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
6786; SCALAR-NEXT:    movb %cl, 31(%rdx)
6787; SCALAR-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r9d # 1-byte Folded Reload
6788; SCALAR-NEXT:    movb %r9b, 30(%rdx)
6789; SCALAR-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 1-byte Folded Reload
6790; SCALAR-NEXT:    movb %dil, 29(%rdx)
6791; SCALAR-NEXT:    movb %al, 28(%rdx)
6792; SCALAR-NEXT:    movl %eax, %r10d
6793; SCALAR-NEXT:    movb %r13b, 27(%rdx)
6794; SCALAR-NEXT:    movb %r15b, 26(%rdx)
6795; SCALAR-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r15d # 1-byte Folded Reload
6796; SCALAR-NEXT:    movb %r15b, 25(%rdx)
6797; SCALAR-NEXT:    movl %r12d, %ebp
6798; SCALAR-NEXT:    movb %r12b, 24(%rdx)
6799; SCALAR-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebx # 1-byte Folded Reload
6800; SCALAR-NEXT:    movb %bl, 23(%rdx)
6801; SCALAR-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
6802; SCALAR-NEXT:    movb %al, 22(%rdx)
6803; SCALAR-NEXT:    movb %r11b, 21(%rdx)
6804; SCALAR-NEXT:    movb %r8b, 20(%rdx)
6805; SCALAR-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r8d # 1-byte Folded Reload
6806; SCALAR-NEXT:    movb %r8b, 19(%rdx)
6807; SCALAR-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r8d # 1-byte Folded Reload
6808; SCALAR-NEXT:    movb %r8b, 18(%rdx)
6809; SCALAR-NEXT:    movb %r14b, 17(%rdx)
6810; SCALAR-NEXT:    movb %sil, 16(%rdx)
6811; SCALAR-NEXT:    movl %esi, %r11d
6812; SCALAR-NEXT:    movb %cl, 47(%rdx)
6813; SCALAR-NEXT:    movb %r9b, 46(%rdx)
6814; SCALAR-NEXT:    movb %dil, 45(%rdx)
6815; SCALAR-NEXT:    movb %r10b, 44(%rdx)
6816; SCALAR-NEXT:    movb %r13b, 43(%rdx)
6817; SCALAR-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r12d # 1-byte Folded Reload
6818; SCALAR-NEXT:    movb %r12b, 42(%rdx)
6819; SCALAR-NEXT:    movb %r15b, 41(%rdx)
6820; SCALAR-NEXT:    movl %ebp, %r14d
6821; SCALAR-NEXT:    movb %bpl, 40(%rdx)
6822; SCALAR-NEXT:    movl %ebx, %ebp
6823; SCALAR-NEXT:    movb %bl, 39(%rdx)
6824; SCALAR-NEXT:    movl %eax, %ebx
6825; SCALAR-NEXT:    movb %al, 38(%rdx)
6826; SCALAR-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
6827; SCALAR-NEXT:    movb %cl, 37(%rdx)
6828; SCALAR-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
6829; SCALAR-NEXT:    movb %al, 36(%rdx)
6830; SCALAR-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload
6831; SCALAR-NEXT:    movb %sil, 35(%rdx)
6832; SCALAR-NEXT:    movb %r8b, 34(%rdx)
6833; SCALAR-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r9d # 1-byte Folded Reload
6834; SCALAR-NEXT:    movb %r9b, 33(%rdx)
6835; SCALAR-NEXT:    movb %r11b, 32(%rdx)
6836; SCALAR-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r11d # 1-byte Folded Reload
6837; SCALAR-NEXT:    movb %r11b, 63(%rdx)
6838; SCALAR-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r11d # 1-byte Folded Reload
6839; SCALAR-NEXT:    movb %r11b, 62(%rdx)
6840; SCALAR-NEXT:    movb %dil, 61(%rdx)
6841; SCALAR-NEXT:    movb %r10b, 60(%rdx)
6842; SCALAR-NEXT:    movb %r13b, 59(%rdx)
6843; SCALAR-NEXT:    movb %r12b, 58(%rdx)
6844; SCALAR-NEXT:    movb %r15b, 57(%rdx)
6845; SCALAR-NEXT:    movb %r14b, 56(%rdx)
6846; SCALAR-NEXT:    movb %bpl, 55(%rdx)
6847; SCALAR-NEXT:    movb %bl, 54(%rdx)
6848; SCALAR-NEXT:    movb %cl, 53(%rdx)
6849; SCALAR-NEXT:    movb %al, 52(%rdx)
6850; SCALAR-NEXT:    movb %sil, 51(%rdx)
6851; SCALAR-NEXT:    movb %r8b, 50(%rdx)
6852; SCALAR-NEXT:    movb %r9b, 49(%rdx)
6853; SCALAR-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
6854; SCALAR-NEXT:    movb %al, 48(%rdx)
6855; SCALAR-NEXT:    popq %rbx
6856; SCALAR-NEXT:    popq %r12
6857; SCALAR-NEXT:    popq %r13
6858; SCALAR-NEXT:    popq %r14
6859; SCALAR-NEXT:    popq %r15
6860; SCALAR-NEXT:    popq %rbp
6861; SCALAR-NEXT:    retq
6862;
6863; SSE2-LABEL: vec512_v16i8:
6864; SSE2:       # %bb.0:
6865; SSE2-NEXT:    pcmpeqd %xmm0, %xmm0
6866; SSE2-NEXT:    pxor (%rdi), %xmm0
6867; SSE2-NEXT:    movdqa %xmm0, (%rsi)
6868; SSE2-NEXT:    movdqa %xmm0, (%rdx)
6869; SSE2-NEXT:    movdqa %xmm0, 16(%rdx)
6870; SSE2-NEXT:    movdqa %xmm0, 32(%rdx)
6871; SSE2-NEXT:    movdqa %xmm0, 48(%rdx)
6872; SSE2-NEXT:    retq
6873;
6874; AVX-LABEL: vec512_v16i8:
6875; AVX:       # %bb.0:
6876; AVX-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
6877; AVX-NEXT:    vpxor (%rdi), %xmm0, %xmm0
6878; AVX-NEXT:    vmovdqa %xmm0, (%rsi)
6879; AVX-NEXT:    vmovdqa %xmm0, (%rdx)
6880; AVX-NEXT:    vmovdqa %xmm0, 16(%rdx)
6881; AVX-NEXT:    vmovdqa %xmm0, 32(%rdx)
6882; AVX-NEXT:    vmovdqa %xmm0, 48(%rdx)
6883; AVX-NEXT:    retq
6884  %in.subvec.not = load <16 x i8>, ptr %in.subvec.ptr, align 64
6885  %in.subvec = xor <16 x i8> %in.subvec.not, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
6886  store <16 x i8> %in.subvec, ptr %out.subvec.ptr, align 64
6887  %out.subvec0.ptr = getelementptr <16 x i8>, ptr %out.vec.ptr, i64 0
6888  store <16 x i8> %in.subvec, ptr %out.subvec0.ptr, align 64
6889  %out.subvec1.ptr = getelementptr <16 x i8>, ptr %out.vec.ptr, i64 1
6890  store <16 x i8> %in.subvec, ptr %out.subvec1.ptr, align 16
6891  %out.subvec2.ptr = getelementptr <16 x i8>, ptr %out.vec.ptr, i64 2
6892  store <16 x i8> %in.subvec, ptr %out.subvec2.ptr, align 32
6893  %out.subvec3.ptr = getelementptr <16 x i8>, ptr %out.vec.ptr, i64 3
6894  store <16 x i8> %in.subvec, ptr %out.subvec3.ptr, align 16
6895  ret void
6896}
6897
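; Splat test: load a <16 x i16> (32-byte) subvector, bitwise-NOT it, store it to %out.subvec.ptr, then store two copies (offsets 0 and 32) into the 64-byte %out.vec.ptr.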
6898define void @vec512_v16i16(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
6899; SCALAR-LABEL: vec512_v16i16:
6900; SCALAR:       # %bb.0:
6901; SCALAR-NEXT:    pushq %rbp
6902; SCALAR-NEXT:    pushq %r15
6903; SCALAR-NEXT:    pushq %r14
6904; SCALAR-NEXT:    pushq %r13
6905; SCALAR-NEXT:    pushq %r12
6906; SCALAR-NEXT:    pushq %rbx
6907; SCALAR-NEXT:    movzwl 30(%rdi), %eax
6908; SCALAR-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
6909; SCALAR-NEXT:    movl 28(%rdi), %eax
6910; SCALAR-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
6911; SCALAR-NEXT:    movzwl 26(%rdi), %eax
6912; SCALAR-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
6913; SCALAR-NEXT:    movl 24(%rdi), %r13d
6914; SCALAR-NEXT:    movzwl 22(%rdi), %r12d
6915; SCALAR-NEXT:    movl 20(%rdi), %r15d
6916; SCALAR-NEXT:    movzwl 18(%rdi), %r14d
6917; SCALAR-NEXT:    movl 16(%rdi), %ebx
6918; SCALAR-NEXT:    movzwl 14(%rdi), %r11d
6919; SCALAR-NEXT:    movl 12(%rdi), %r10d
6920; SCALAR-NEXT:    movzwl 10(%rdi), %r9d
6921; SCALAR-NEXT:    movl 8(%rdi), %r8d
6922; SCALAR-NEXT:    movzwl 6(%rdi), %ecx
6923; SCALAR-NEXT:    movzwl 2(%rdi), %ebp
6924; SCALAR-NEXT:    movl (%rdi), %eax
6925; SCALAR-NEXT:    movl 4(%rdi), %edi
6926; SCALAR-NEXT:    notl %eax
6927; SCALAR-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
6928; SCALAR-NEXT:    notl %ebp
6929; SCALAR-NEXT:    movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
6930; SCALAR-NEXT:    notl %edi
6931; SCALAR-NEXT:    movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
6932; SCALAR-NEXT:    notl %ecx
6933; SCALAR-NEXT:    movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
6934; SCALAR-NEXT:    notl %r8d
6935; SCALAR-NEXT:    movl %r8d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
6936; SCALAR-NEXT:    notl %r9d
6937; SCALAR-NEXT:    movl %r9d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
6938; SCALAR-NEXT:    movl %r10d, %edi
6939; SCALAR-NEXT:    notl %edi
6940; SCALAR-NEXT:    movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
6941; SCALAR-NEXT:    notl %r11d
6942; SCALAR-NEXT:    movl %r11d, %r9d
6943; SCALAR-NEXT:    notl %ebx
6944; SCALAR-NEXT:    movl %ebx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
6945; SCALAR-NEXT:    notl %r14d
6946; SCALAR-NEXT:    notl %r15d
6947; SCALAR-NEXT:    notl %r12d
6948; SCALAR-NEXT:    notl %r13d
6949; SCALAR-NEXT:    movl %r13d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
6950; SCALAR-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %r10d # 4-byte Reload
6951; SCALAR-NEXT:    notl %r10d
6952; SCALAR-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %r11d # 4-byte Reload
6953; SCALAR-NEXT:    notl %r11d
6954; SCALAR-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %r8d # 4-byte Reload
6955; SCALAR-NEXT:    notl %r8d
6956; SCALAR-NEXT:    movl %r8d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
6957; SCALAR-NEXT:    movw %r8w, 30(%rsi)
6958; SCALAR-NEXT:    movw %r11w, 28(%rsi)
6959; SCALAR-NEXT:    movl %r11d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
6960; SCALAR-NEXT:    movw %r10w, 26(%rsi)
6961; SCALAR-NEXT:    movl %r10d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
6962; SCALAR-NEXT:    movw %r13w, 24(%rsi)
6963; SCALAR-NEXT:    movw %r12w, 22(%rsi)
6964; SCALAR-NEXT:    movw %r15w, 20(%rsi)
6965; SCALAR-NEXT:    movw %r14w, 18(%rsi)
6966; SCALAR-NEXT:    movw %bx, 16(%rsi)
6967; SCALAR-NEXT:    movw %r9w, 14(%rsi)
6968; SCALAR-NEXT:    movw %di, 12(%rsi)
6969; SCALAR-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %ebp # 4-byte Reload
6970; SCALAR-NEXT:    movw %bp, 10(%rsi)
6971; SCALAR-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload
6972; SCALAR-NEXT:    movw %di, 8(%rsi)
6973; SCALAR-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Reload
6974; SCALAR-NEXT:    movw %cx, 6(%rsi)
6975; SCALAR-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %r8d # 4-byte Reload
6976; SCALAR-NEXT:    movw %r8w, 4(%rsi)
6977; SCALAR-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
6978; SCALAR-NEXT:    movw %ax, 2(%rsi)
6979; SCALAR-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %ebx # 4-byte Reload
6980; SCALAR-NEXT:    movw %bx, (%rsi)
6981; SCALAR-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %r13d # 4-byte Reload
6982; SCALAR-NEXT:    movw %r13w, 30(%rdx)
6983; SCALAR-NEXT:    movw %r11w, 28(%rdx)
6984; SCALAR-NEXT:    movw %r10w, 26(%rdx)
6985; SCALAR-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 4-byte Reload
6986; SCALAR-NEXT:    movw %si, 24(%rdx)
6987; SCALAR-NEXT:    movw %r12w, 22(%rdx)
6988; SCALAR-NEXT:    movw %r15w, 20(%rdx)
6989; SCALAR-NEXT:    movw %r14w, 18(%rdx)
6990; SCALAR-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %r11d # 4-byte Reload
6991; SCALAR-NEXT:    movw %r11w, 16(%rdx)
6992; SCALAR-NEXT:    movw %r9w, 14(%rdx)
6993; SCALAR-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %r10d # 4-byte Reload
6994; SCALAR-NEXT:    movw %r10w, 12(%rdx)
6995; SCALAR-NEXT:    movw %bp, 10(%rdx)
6996; SCALAR-NEXT:    movw %di, 8(%rdx)
6997; SCALAR-NEXT:    movw %cx, 6(%rdx)
6998; SCALAR-NEXT:    movw %r8w, 4(%rdx)
6999; SCALAR-NEXT:    movw %ax, 2(%rdx)
7000; SCALAR-NEXT:    movl %ebx, %esi
7001; SCALAR-NEXT:    movw %si, (%rdx)
7002; SCALAR-NEXT:    movw %r13w, 62(%rdx)
7003; SCALAR-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %ebx # 4-byte Reload
7004; SCALAR-NEXT:    movw %bx, 60(%rdx)
7005; SCALAR-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %ebx # 4-byte Reload
7006; SCALAR-NEXT:    movw %bx, 58(%rdx)
7007; SCALAR-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %ebx # 4-byte Reload
7008; SCALAR-NEXT:    movw %bx, 56(%rdx)
7009; SCALAR-NEXT:    movw %r12w, 54(%rdx)
7010; SCALAR-NEXT:    movw %r15w, 52(%rdx)
7011; SCALAR-NEXT:    movw %r14w, 50(%rdx)
7012; SCALAR-NEXT:    movw %r11w, 48(%rdx)
7013; SCALAR-NEXT:    movw %r9w, 46(%rdx)
7014; SCALAR-NEXT:    movw %r10w, 44(%rdx)
7015; SCALAR-NEXT:    movw %bp, 42(%rdx)
7016; SCALAR-NEXT:    movw %di, 40(%rdx)
7017; SCALAR-NEXT:    movw %cx, 38(%rdx)
7018; SCALAR-NEXT:    movw %r8w, 36(%rdx)
7019; SCALAR-NEXT:    movw %ax, 34(%rdx)
7020; SCALAR-NEXT:    movw %si, 32(%rdx)
7021; SCALAR-NEXT:    popq %rbx
7022; SCALAR-NEXT:    popq %r12
7023; SCALAR-NEXT:    popq %r13
7024; SCALAR-NEXT:    popq %r14
7025; SCALAR-NEXT:    popq %r15
7026; SCALAR-NEXT:    popq %rbp
7027; SCALAR-NEXT:    retq
7028;
7029; SSE2-LABEL: vec512_v16i16:
7030; SSE2:       # %bb.0:
7031; SSE2-NEXT:    pcmpeqd %xmm0, %xmm0
7032; SSE2-NEXT:    movdqa 16(%rdi), %xmm1
7033; SSE2-NEXT:    pxor %xmm0, %xmm1
7034; SSE2-NEXT:    pxor (%rdi), %xmm0
7035; SSE2-NEXT:    movdqa %xmm0, (%rsi)
7036; SSE2-NEXT:    movdqa %xmm1, 16(%rsi)
7037; SSE2-NEXT:    movdqa %xmm0, (%rdx)
7038; SSE2-NEXT:    movdqa %xmm1, 16(%rdx)
7039; SSE2-NEXT:    movdqa %xmm1, 48(%rdx)
7040; SSE2-NEXT:    movdqa %xmm0, 32(%rdx)
7041; SSE2-NEXT:    retq
7042;
7043; AVX1-LABEL: vec512_v16i16:
7044; AVX1:       # %bb.0:
7045; AVX1-NEXT:    vxorps %xmm0, %xmm0, %xmm0
7046; AVX1-NEXT:    vcmptrueps %ymm0, %ymm0, %ymm0
7047; AVX1-NEXT:    vxorps (%rdi), %ymm0, %ymm0
7048; AVX1-NEXT:    vmovaps %ymm0, (%rsi)
7049; AVX1-NEXT:    vmovaps %ymm0, (%rdx)
7050; AVX1-NEXT:    vmovaps %ymm0, 32(%rdx)
7051; AVX1-NEXT:    vzeroupper
7052; AVX1-NEXT:    retq
7053;
7054; AVX2-LABEL: vec512_v16i16:
7055; AVX2:       # %bb.0:
7056; AVX2-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
7057; AVX2-NEXT:    vpxor (%rdi), %ymm0, %ymm0
7058; AVX2-NEXT:    vmovdqa %ymm0, (%rsi)
7059; AVX2-NEXT:    vmovdqa %ymm0, (%rdx)
7060; AVX2-NEXT:    vmovdqa %ymm0, 32(%rdx)
7061; AVX2-NEXT:    vzeroupper
7062; AVX2-NEXT:    retq
7063  %in.subvec.not = load <16 x i16>, ptr %in.subvec.ptr, align 64
7064  %in.subvec = xor <16 x i16> %in.subvec.not, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
7065  store <16 x i16> %in.subvec, ptr %out.subvec.ptr, align 64
7066  %out.subvec0.ptr = getelementptr <16 x i16>, ptr %out.vec.ptr, i64 0
7067  store <16 x i16> %in.subvec, ptr %out.subvec0.ptr, align 64
7068  %out.subvec1.ptr = getelementptr <16 x i16>, ptr %out.vec.ptr, i64 1
7069  store <16 x i16> %in.subvec, ptr %out.subvec1.ptr, align 32
7070  ret void
7071}
7072
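; Splat test: load a <32 x i8> (32-byte) subvector, bitwise-NOT it, store it to %out.subvec.ptr, then store two copies (offsets 0 and 32) into the 64-byte %out.vec.ptr.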
7073define void @vec512_v32i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
7074; SCALAR-LABEL: vec512_v32i8:
7075; SCALAR:       # %bb.0:
7076; SCALAR-NEXT:    pushq %rbp
7077; SCALAR-NEXT:    pushq %r15
7078; SCALAR-NEXT:    pushq %r14
7079; SCALAR-NEXT:    pushq %r13
7080; SCALAR-NEXT:    pushq %r12
7081; SCALAR-NEXT:    pushq %rbx
7082; SCALAR-NEXT:    movzbl 16(%rdi), %eax
7083; SCALAR-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
7084; SCALAR-NEXT:    movzbl 15(%rdi), %eax
7085; SCALAR-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
7086; SCALAR-NEXT:    movzbl 14(%rdi), %eax
7087; SCALAR-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
7088; SCALAR-NEXT:    movzbl 13(%rdi), %eax
7089; SCALAR-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
7090; SCALAR-NEXT:    movzbl 12(%rdi), %r13d
7091; SCALAR-NEXT:    movzbl 11(%rdi), %eax
7092; SCALAR-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
7093; SCALAR-NEXT:    movzbl 10(%rdi), %r12d
7094; SCALAR-NEXT:    movzbl 9(%rdi), %r15d
7095; SCALAR-NEXT:    movzbl 8(%rdi), %r14d
7096; SCALAR-NEXT:    movzbl 7(%rdi), %ebp
7097; SCALAR-NEXT:    movzbl 6(%rdi), %ebx
7098; SCALAR-NEXT:    movzbl 5(%rdi), %r11d
7099; SCALAR-NEXT:    movzbl 4(%rdi), %r10d
7100; SCALAR-NEXT:    movzbl 3(%rdi), %r9d
7101; SCALAR-NEXT:    movzbl 2(%rdi), %r8d
7102; SCALAR-NEXT:    movzbl (%rdi), %eax
7103; SCALAR-NEXT:    movzbl 1(%rdi), %ecx
7104; SCALAR-NEXT:    notb %al
7105; SCALAR-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
7106; SCALAR-NEXT:    notb %cl
7107; SCALAR-NEXT:    movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
7108; SCALAR-NEXT:    notb %r8b
7109; SCALAR-NEXT:    movb %r8b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
7110; SCALAR-NEXT:    notb %r9b
7111; SCALAR-NEXT:    movb %r9b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
7112; SCALAR-NEXT:    notb %r10b
7113; SCALAR-NEXT:    movb %r10b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
7114; SCALAR-NEXT:    notb %r11b
7115; SCALAR-NEXT:    movb %r11b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
7116; SCALAR-NEXT:    notb %bl
7117; SCALAR-NEXT:    movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
7118; SCALAR-NEXT:    notb %bpl
7119; SCALAR-NEXT:    movb %bpl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
7120; SCALAR-NEXT:    notb %r14b
7121; SCALAR-NEXT:    movb %r14b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
7122; SCALAR-NEXT:    notb %r15b
7123; SCALAR-NEXT:    movb %r15b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
7124; SCALAR-NEXT:    notb %r12b
7125; SCALAR-NEXT:    movb %r12b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
7126; SCALAR-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r11d # 1-byte Folded Reload
7127; SCALAR-NEXT:    notb %r11b
7128; SCALAR-NEXT:    movb %r11b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
7129; SCALAR-NEXT:    notb %r13b
7130; SCALAR-NEXT:    movb %r13b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
7131; SCALAR-NEXT:    notb {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill
7132; SCALAR-NEXT:    notb {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill
7133; SCALAR-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r8d # 1-byte Folded Reload
7134; SCALAR-NEXT:    notb %r8b
7135; SCALAR-NEXT:    notb {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill
7136; SCALAR-NEXT:    movzbl 17(%rdi), %eax
7137; SCALAR-NEXT:    notb %al
7138; SCALAR-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
7139; SCALAR-NEXT:    movzbl 18(%rdi), %eax
7140; SCALAR-NEXT:    notb %al
7141; SCALAR-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
7142; SCALAR-NEXT:    movzbl 19(%rdi), %eax
7143; SCALAR-NEXT:    notb %al
7144; SCALAR-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
7145; SCALAR-NEXT:    movzbl 20(%rdi), %eax
7146; SCALAR-NEXT:    notb %al
7147; SCALAR-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
7148; SCALAR-NEXT:    movzbl 21(%rdi), %ebp
7149; SCALAR-NEXT:    notb %bpl
7150; SCALAR-NEXT:    movb %bpl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
7151; SCALAR-NEXT:    movzbl 22(%rdi), %ebx
7152; SCALAR-NEXT:    notb %bl
7153; SCALAR-NEXT:    movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
7154; SCALAR-NEXT:    movzbl 23(%rdi), %r10d
7155; SCALAR-NEXT:    notb %r10b
7156; SCALAR-NEXT:    movb %r10b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
7157; SCALAR-NEXT:    movzbl 24(%rdi), %r9d
7158; SCALAR-NEXT:    notb %r9b
7159; SCALAR-NEXT:    movb %r9b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
7160; SCALAR-NEXT:    movzbl 25(%rdi), %ecx
7161; SCALAR-NEXT:    notb %cl
7162; SCALAR-NEXT:    movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
7163; SCALAR-NEXT:    movzbl 26(%rdi), %r14d
7164; SCALAR-NEXT:    notb %r14b
7165; SCALAR-NEXT:    movb %r14b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
7166; SCALAR-NEXT:    movzbl 27(%rdi), %r15d
7167; SCALAR-NEXT:    notb %r15b
7168; SCALAR-NEXT:    movb %r15b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
7169; SCALAR-NEXT:    movzbl 28(%rdi), %r12d
7170; SCALAR-NEXT:    notb %r12b
7171; SCALAR-NEXT:    movb %r12b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
7172; SCALAR-NEXT:    movzbl 29(%rdi), %r13d
7173; SCALAR-NEXT:    notb %r13b
7174; SCALAR-NEXT:    movb %r13b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
7175; SCALAR-NEXT:    movzbl 30(%rdi), %eax
7176; SCALAR-NEXT:    notb %al
7177; SCALAR-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
7178; SCALAR-NEXT:    movzbl 31(%rdi), %edi
7179; SCALAR-NEXT:    notb %dil
7180; SCALAR-NEXT:    movb %dil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
7181; SCALAR-NEXT:    movb %dil, 31(%rsi)
7182; SCALAR-NEXT:    movb %al, 30(%rsi)
7183; SCALAR-NEXT:    movb %r13b, 29(%rsi)
7184; SCALAR-NEXT:    movb %r12b, 28(%rsi)
7185; SCALAR-NEXT:    movb %r15b, 27(%rsi)
7186; SCALAR-NEXT:    movb %r14b, 26(%rsi)
7187; SCALAR-NEXT:    movb %cl, 25(%rsi)
7188; SCALAR-NEXT:    movb %r9b, 24(%rsi)
7189; SCALAR-NEXT:    movb %r10b, 23(%rsi)
7190; SCALAR-NEXT:    movb %bl, 22(%rsi)
7191; SCALAR-NEXT:    movb %bpl, 21(%rsi)
7192; SCALAR-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebp # 1-byte Folded Reload
7193; SCALAR-NEXT:    movb %bpl, 20(%rsi)
7194; SCALAR-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
7195; SCALAR-NEXT:    movb %al, 19(%rsi)
7196; SCALAR-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
7197; SCALAR-NEXT:    movb %al, 18(%rsi)
7198; SCALAR-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
7199; SCALAR-NEXT:    movb %al, 17(%rsi)
7200; SCALAR-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
7201; SCALAR-NEXT:    movb %cl, 16(%rsi)
7202; SCALAR-NEXT:    movb %r8b, 15(%rsi)
7203; SCALAR-NEXT:    movl %r8d, %r14d
7204; SCALAR-NEXT:    movb %r8b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
7205; SCALAR-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebx # 1-byte Folded Reload
7206; SCALAR-NEXT:    movb %bl, 14(%rsi)
7207; SCALAR-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
7208; SCALAR-NEXT:    movb %al, 13(%rsi)
7209; SCALAR-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
7210; SCALAR-NEXT:    movb %al, 12(%rsi)
7211; SCALAR-NEXT:    movb %r11b, 11(%rsi)
7212; SCALAR-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 1-byte Folded Reload
7213; SCALAR-NEXT:    movb %dil, 10(%rsi)
7214; SCALAR-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 1-byte Folded Reload
7215; SCALAR-NEXT:    movb %dil, 9(%rsi)
7216; SCALAR-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 1-byte Folded Reload
7217; SCALAR-NEXT:    movb %dil, 8(%rsi)
7218; SCALAR-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r11d # 1-byte Folded Reload
7219; SCALAR-NEXT:    movb %r11b, 7(%rsi)
7220; SCALAR-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r13d # 1-byte Folded Reload
7221; SCALAR-NEXT:    movb %r13b, 6(%rsi)
7222; SCALAR-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r10d # 1-byte Folded Reload
7223; SCALAR-NEXT:    movb %r10b, 5(%rsi)
7224; SCALAR-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r12d # 1-byte Folded Reload
7225; SCALAR-NEXT:    movb %r12b, 4(%rsi)
7226; SCALAR-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r9d # 1-byte Folded Reload
7227; SCALAR-NEXT:    movb %r9b, 3(%rsi)
7228; SCALAR-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r15d # 1-byte Folded Reload
7229; SCALAR-NEXT:    movb %r15b, 2(%rsi)
7230; SCALAR-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r8d # 1-byte Folded Reload
7231; SCALAR-NEXT:    movb %r8b, 1(%rsi)
7232; SCALAR-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 1-byte Folded Reload
7233; SCALAR-NEXT:    movb %dil, (%rsi)
7234; SCALAR-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload
7235; SCALAR-NEXT:    movb %sil, 31(%rdx)
7236; SCALAR-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload
7237; SCALAR-NEXT:    movb %sil, 30(%rdx)
7238; SCALAR-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload
7239; SCALAR-NEXT:    movb %sil, 29(%rdx)
7240; SCALAR-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload
7241; SCALAR-NEXT:    movb %sil, 28(%rdx)
7242; SCALAR-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload
7243; SCALAR-NEXT:    movb %sil, 27(%rdx)
7244; SCALAR-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload
7245; SCALAR-NEXT:    movb %sil, 26(%rdx)
7246; SCALAR-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload
7247; SCALAR-NEXT:    movb %sil, 25(%rdx)
7248; SCALAR-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload
7249; SCALAR-NEXT:    movb %sil, 24(%rdx)
7250; SCALAR-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload
7251; SCALAR-NEXT:    movb %sil, 23(%rdx)
7252; SCALAR-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload
7253; SCALAR-NEXT:    movb %sil, 22(%rdx)
7254; SCALAR-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload
7255; SCALAR-NEXT:    movb %sil, 21(%rdx)
7256; SCALAR-NEXT:    movb %bpl, 20(%rdx)
7257; SCALAR-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload
7258; SCALAR-NEXT:    movb %sil, 19(%rdx)
7259; SCALAR-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload
7260; SCALAR-NEXT:    movb %sil, 18(%rdx)
7261; SCALAR-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload
7262; SCALAR-NEXT:    movb %sil, 17(%rdx)
7263; SCALAR-NEXT:    movb %cl, 16(%rdx)
7264; SCALAR-NEXT:    movb %r14b, 15(%rdx)
7265; SCALAR-NEXT:    movb %bl, 14(%rdx)
7266; SCALAR-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
7267; SCALAR-NEXT:    movb %cl, 13(%rdx)
7268; SCALAR-NEXT:    movb %al, 12(%rdx)
7269; SCALAR-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload
7270; SCALAR-NEXT:    movb %sil, 11(%rdx)
7271; SCALAR-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebx # 1-byte Folded Reload
7272; SCALAR-NEXT:    movb %bl, 10(%rdx)
7273; SCALAR-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r14d # 1-byte Folded Reload
7274; SCALAR-NEXT:    movb %r14b, 9(%rdx)
7275; SCALAR-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebp # 1-byte Folded Reload
7276; SCALAR-NEXT:    movb %bpl, 8(%rdx)
7277; SCALAR-NEXT:    movb %r11b, 7(%rdx)
7278; SCALAR-NEXT:    movb %r13b, 6(%rdx)
7279; SCALAR-NEXT:    movb %r10b, 5(%rdx)
7280; SCALAR-NEXT:    movb %r12b, 4(%rdx)
7281; SCALAR-NEXT:    movb %r9b, 3(%rdx)
7282; SCALAR-NEXT:    movb %r15b, 2(%rdx)
7283; SCALAR-NEXT:    movb %r8b, 1(%rdx)
7284; SCALAR-NEXT:    movb %dil, (%rdx)
7285; SCALAR-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
7286; SCALAR-NEXT:    movb %al, 63(%rdx)
7287; SCALAR-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
7288; SCALAR-NEXT:    movb %al, 62(%rdx)
7289; SCALAR-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
7290; SCALAR-NEXT:    movb %al, 61(%rdx)
7291; SCALAR-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
7292; SCALAR-NEXT:    movb %al, 60(%rdx)
7293; SCALAR-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
7294; SCALAR-NEXT:    movb %al, 59(%rdx)
7295; SCALAR-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
7296; SCALAR-NEXT:    movb %al, 58(%rdx)
7297; SCALAR-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
7298; SCALAR-NEXT:    movb %al, 57(%rdx)
7299; SCALAR-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
7300; SCALAR-NEXT:    movb %al, 56(%rdx)
7301; SCALAR-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
7302; SCALAR-NEXT:    movb %al, 55(%rdx)
7303; SCALAR-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
7304; SCALAR-NEXT:    movb %al, 54(%rdx)
7305; SCALAR-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
7306; SCALAR-NEXT:    movb %al, 53(%rdx)
7307; SCALAR-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
7308; SCALAR-NEXT:    movb %al, 52(%rdx)
7309; SCALAR-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
7310; SCALAR-NEXT:    movb %al, 51(%rdx)
7311; SCALAR-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
7312; SCALAR-NEXT:    movb %al, 50(%rdx)
7313; SCALAR-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
7314; SCALAR-NEXT:    movb %al, 49(%rdx)
7315; SCALAR-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
7316; SCALAR-NEXT:    movb %al, 48(%rdx)
7317; SCALAR-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
7318; SCALAR-NEXT:    movb %al, 47(%rdx)
7319; SCALAR-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
7320; SCALAR-NEXT:    movb %al, 46(%rdx)
7321; SCALAR-NEXT:    movb %cl, 45(%rdx)
7322; SCALAR-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
7323; SCALAR-NEXT:    movb %al, 44(%rdx)
7324; SCALAR-NEXT:    movb %sil, 43(%rdx)
7325; SCALAR-NEXT:    movb %bl, 42(%rdx)
7326; SCALAR-NEXT:    movb %r14b, 41(%rdx)
7327; SCALAR-NEXT:    movb %bpl, 40(%rdx)
7328; SCALAR-NEXT:    movb %r11b, 39(%rdx)
7329; SCALAR-NEXT:    movb %r13b, 38(%rdx)
7330; SCALAR-NEXT:    movb %r10b, 37(%rdx)
7331; SCALAR-NEXT:    movb %r12b, 36(%rdx)
7332; SCALAR-NEXT:    movb %r9b, 35(%rdx)
7333; SCALAR-NEXT:    movb %r15b, 34(%rdx)
7334; SCALAR-NEXT:    movb %r8b, 33(%rdx)
7335; SCALAR-NEXT:    movb %dil, 32(%rdx)
7336; SCALAR-NEXT:    popq %rbx
7337; SCALAR-NEXT:    popq %r12
7338; SCALAR-NEXT:    popq %r13
7339; SCALAR-NEXT:    popq %r14
7340; SCALAR-NEXT:    popq %r15
7341; SCALAR-NEXT:    popq %rbp
7342; SCALAR-NEXT:    retq
7343;
7344; SSE2-LABEL: vec512_v32i8:
7345; SSE2:       # %bb.0:
7346; SSE2-NEXT:    pcmpeqd %xmm0, %xmm0
7347; SSE2-NEXT:    movdqa 16(%rdi), %xmm1
7348; SSE2-NEXT:    pxor %xmm0, %xmm1
7349; SSE2-NEXT:    pxor (%rdi), %xmm0
7350; SSE2-NEXT:    movdqa %xmm0, (%rsi)
7351; SSE2-NEXT:    movdqa %xmm1, 16(%rsi)
7352; SSE2-NEXT:    movdqa %xmm0, (%rdx)
7353; SSE2-NEXT:    movdqa %xmm1, 16(%rdx)
7354; SSE2-NEXT:    movdqa %xmm1, 48(%rdx)
7355; SSE2-NEXT:    movdqa %xmm0, 32(%rdx)
7356; SSE2-NEXT:    retq
7357;
7358; AVX1-LABEL: vec512_v32i8:
7359; AVX1:       # %bb.0:
7360; AVX1-NEXT:    vxorps %xmm0, %xmm0, %xmm0
7361; AVX1-NEXT:    vcmptrueps %ymm0, %ymm0, %ymm0
7362; AVX1-NEXT:    vxorps (%rdi), %ymm0, %ymm0
7363; AVX1-NEXT:    vmovaps %ymm0, (%rsi)
7364; AVX1-NEXT:    vmovaps %ymm0, (%rdx)
7365; AVX1-NEXT:    vmovaps %ymm0, 32(%rdx)
7366; AVX1-NEXT:    vzeroupper
7367; AVX1-NEXT:    retq
7368;
7369; AVX2-LABEL: vec512_v32i8:
7370; AVX2:       # %bb.0:
7371; AVX2-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
7372; AVX2-NEXT:    vpxor (%rdi), %ymm0, %ymm0
7373; AVX2-NEXT:    vmovdqa %ymm0, (%rsi)
7374; AVX2-NEXT:    vmovdqa %ymm0, (%rdx)
7375; AVX2-NEXT:    vmovdqa %ymm0, 32(%rdx)
7376; AVX2-NEXT:    vzeroupper
7377; AVX2-NEXT:    retq
7378  %in.subvec.not = load <32 x i8>, ptr %in.subvec.ptr, align 64
7379  %in.subvec = xor <32 x i8> %in.subvec.not, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
7380  store <32 x i8> %in.subvec, ptr %out.subvec.ptr, align 64
7381  %out.subvec0.ptr = getelementptr <32 x i8>, ptr %out.vec.ptr, i64 0
7382  store <32 x i8> %in.subvec, ptr %out.subvec0.ptr, align 64
7383  %out.subvec1.ptr = getelementptr <32 x i8>, ptr %out.vec.ptr, i64 1
7384  store <32 x i8> %in.subvec, ptr %out.subvec1.ptr, align 32
7385  ret void
7386}
7387;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
7388; SSSE3: {{.*}}
7389