; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-- -mattr=+sse2 | FileCheck %s --check-prefixes=SSE
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX,AVX512

declare void @use_v8i1(<8 x i1>)
declare void @use_v8i8(<8 x i8>)

define <8 x i16> @cmp_ne_load_const(ptr %x) nounwind {
; SSE-LABEL: cmp_ne_load_const:
; SSE:       # %bb.0:
; SSE-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE-NEXT:    pxor %xmm1, %xmm1
; SSE-NEXT:    pcmpeqb %xmm0, %xmm1
; SSE-NEXT:    pcmpeqd %xmm0, %xmm0
; SSE-NEXT:    pxor %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: cmp_ne_load_const:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmovzxbw {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
; AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vpcmpgtw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %loadx = load <8 x i8>, ptr %x
  %icmp = icmp ne <8 x i8> %loadx, zeroinitializer
  %sext = sext <8 x i1> %icmp to <8 x i16>
  ret <8 x i16> %sext
}

; negative test - simple loads only

define <8 x i16> @cmp_ne_load_const_volatile(ptr %x) nounwind {
; SSE-LABEL: cmp_ne_load_const_volatile:
; SSE:       # %bb.0:
; SSE-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE-NEXT:    pxor %xmm1, %xmm1
; SSE-NEXT:    pcmpeqb %xmm0, %xmm1
; SSE-NEXT:    pcmpeqd %xmm0, %xmm0
; SSE-NEXT:    pxor %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX2-LABEL: cmp_ne_load_const_volatile:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT:    vpcmpeqb %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpmovsxbw %xmm0, %xmm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: cmp_ne_load_const_volatile:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX512-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512-NEXT:    vpcmpeqb %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
; AVX512-NEXT:    vpmovsxbw %xmm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %loadx = load volatile <8 x i8>, ptr %x
  %icmp = icmp ne <8 x i8> %loadx, zeroinitializer
  %sext = sext <8 x i1> %icmp to <8 x i16>
  ret <8 x i16> %sext
}

; negative test - don't create extra load

define <8 x i16> @cmp_ne_load_const_extra_use1(ptr %x) nounwind {
; SSE-LABEL: cmp_ne_load_const_extra_use1:
; SSE:       # %bb.0:
; SSE-NEXT:    subq $24, %rsp
; SSE-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
; SSE-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
; SSE-NEXT:    callq use_v8i8@PLT
; SSE-NEXT:    movdqa (%rsp), %xmm0 # 16-byte Reload
; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE-NEXT:    pxor %xmm1, %xmm1
; SSE-NEXT:    pcmpeqb %xmm0, %xmm1
; SSE-NEXT:    pcmpeqd %xmm0, %xmm0
; SSE-NEXT:    pxor %xmm1, %xmm0
; SSE-NEXT:    addq $24, %rsp
; SSE-NEXT:    retq
;
; AVX2-LABEL: cmp_ne_load_const_extra_use1:
; AVX2:       # %bb.0:
; AVX2-NEXT:    subq $24, %rsp
; AVX2-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX2-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT:    callq use_v8i8@PLT
; AVX2-NEXT:    vpxor %xmm0, %xmm0, %xmm0
; AVX2-NEXT:    vpcmpeqb (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpmovsxbw %xmm0, %xmm0
; AVX2-NEXT:    addq $24, %rsp
; AVX2-NEXT:    retq
;
; AVX512-LABEL: cmp_ne_load_const_extra_use1:
; AVX512:       # %bb.0:
; AVX512-NEXT:    subq $24, %rsp
; AVX512-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX512-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
; AVX512-NEXT:    callq use_v8i8@PLT
; AVX512-NEXT:    vpxor %xmm0, %xmm0, %xmm0
; AVX512-NEXT:    vpcmpeqb (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX512-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
; AVX512-NEXT:    vpmovsxbw %xmm0, %xmm0
; AVX512-NEXT:    addq $24, %rsp
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %loadx = load <8 x i8>, ptr %x
  call void @use_v8i8(<8 x i8> %loadx)
  %icmp = icmp ne <8 x i8> %loadx, zeroinitializer
  %sext = sext <8 x i1> %icmp to <8 x i16>
  ret <8 x i16> %sext
}

; negative test - don't create extra compare

define <8 x i16> @cmp_ne_load_const_extra_use2(ptr %x) nounwind {
; SSE-LABEL: cmp_ne_load_const_extra_use2:
; SSE:       # %bb.0:
; SSE-NEXT:    subq $24, %rsp
; SSE-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
; SSE-NEXT:    pxor %xmm1, %xmm1
; SSE-NEXT:    pcmpeqb %xmm0, %xmm1
; SSE-NEXT:    pcmpeqd %xmm0, %xmm0
; SSE-NEXT:    pxor %xmm1, %xmm0
; SSE-NEXT:    movdqa %xmm0, (%rsp) # 16-byte Spill
; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE-NEXT:    callq use_v8i1@PLT
; SSE-NEXT:    movdqa (%rsp), %xmm0 # 16-byte Reload
; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE-NEXT:    addq $24, %rsp
; SSE-NEXT:    retq
;
; AVX2-LABEL: cmp_ne_load_const_extra_use2:
; AVX2:       # %bb.0:
; AVX2-NEXT:    subq $24, %rsp
; AVX2-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT:    vpcmpeqb %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX2-NEXT:    callq use_v8i1@PLT
; AVX2-NEXT:    vpmovsxbw (%rsp), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT:    addq $24, %rsp
; AVX2-NEXT:    retq
;
; AVX512-LABEL: cmp_ne_load_const_extra_use2:
; AVX512:       # %bb.0:
; AVX512-NEXT:    subq $72, %rsp
; AVX512-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX512-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512-NEXT:    vpcmpeqb %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
; AVX512-NEXT:    vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
; AVX512-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    callq use_v8i1@PLT
; AVX512-NEXT:    vpmovsxbw (%rsp), %xmm0 # 16-byte Folded Reload
; AVX512-NEXT:    addq $72, %rsp
; AVX512-NEXT:    retq
  %loadx = load <8 x i8>, ptr %x
  %icmp = icmp ne <8 x i8> %loadx, zeroinitializer
  call void @use_v8i1(<8 x i1> %icmp)
  %sext = sext <8 x i1> %icmp to <8 x i16>
  ret <8 x i16> %sext
}

; negative test - extend is not free

define <8 x i16> @cmp_ne_no_load_const(i64 %x) nounwind {
; SSE-LABEL: cmp_ne_no_load_const:
; SSE:       # %bb.0:
; SSE-NEXT:    movq %rdi, %xmm0
; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE-NEXT:    pxor %xmm1, %xmm1
; SSE-NEXT:    pcmpeqb %xmm0, %xmm1
; SSE-NEXT:    pcmpeqd %xmm0, %xmm0
; SSE-NEXT:    pxor %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX2-LABEL: cmp_ne_no_load_const:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovq %rdi, %xmm0
; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT:    vpcmpeqb %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpmovsxbw %xmm0, %xmm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: cmp_ne_no_load_const:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovq %rdi, %xmm0
; AVX512-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512-NEXT:    vpcmpeqb %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
; AVX512-NEXT:    vpmovsxbw %xmm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %t = bitcast i64 %x to <8 x i8>
  %icmp = icmp ne <8 x i8> %t, zeroinitializer
  %sext = sext <8 x i1> %icmp to <8 x i16>
  ret <8 x i16> %sext
}

define <4 x i32> @cmp_ult_load_const(ptr %x) nounwind {
; SSE-LABEL: cmp_ult_load_const:
; SSE:       # %bb.0:
; SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE-NEXT:    movd {{.*#+}} xmm1 = [42,214,0,255,0,0,0,0,0,0,0,0,0,0,0,0]
; SSE-NEXT:    pmaxub %xmm0, %xmm1
; SSE-NEXT:    pcmpeqb %xmm0, %xmm1
; SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3]
; SSE-NEXT:    pcmpeqd %xmm0, %xmm0
; SSE-NEXT:    pxor %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: cmp_ult_load_const:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX-NEXT:    vpmovzxbd {{.*#+}} xmm1 = [42,214,0,255]
; AVX-NEXT:    vpcmpgtd %xmm0, %xmm1, %xmm0
; AVX-NEXT:    retq
  %loadx = load <4 x i8>, ptr %x
  %icmp = icmp ult <4 x i8> %loadx, <i8 42, i8 -42, i8 0, i8 -1>
  %sext = sext <4 x i1> %icmp to <4 x i32>
  ret <4 x i32> %sext
}

; negative test - type must be legal

define <3 x i32> @cmp_ult_load_const_bad_type(ptr %x) nounwind {
; SSE-LABEL: cmp_ult_load_const_bad_type:
; SSE:       # %bb.0:
; SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE-NEXT:    movd {{.*#+}} xmm1 = [42,214,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; SSE-NEXT:    pmaxub %xmm0, %xmm1
; SSE-NEXT:    pcmpeqb %xmm0, %xmm1
; SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3]
; SSE-NEXT:    pcmpeqd %xmm0, %xmm0
; SSE-NEXT:    pxor %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX2-LABEL: cmp_ult_load_const_bad_type:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX2-NEXT:    vpmaxub {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; AVX2-NEXT:    vpcmpeqb %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpmovsxbd %xmm0, %xmm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: cmp_ult_load_const_bad_type:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX512-NEXT:    vpmaxub {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; AVX512-NEXT:    vpcmpeqb %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
; AVX512-NEXT:    vpmovsxbd %xmm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %loadx = load <3 x i8>, ptr %x
  %icmp = icmp ult <3 x i8> %loadx, <i8 42, i8 -42, i8 0>
  %sext = sext <3 x i1> %icmp to <3 x i32>
  ret <3 x i32> %sext
}

; Signed compare needs signed extend.

define <4 x i32> @cmp_slt_load_const(ptr %x) nounwind {
; SSE-LABEL: cmp_slt_load_const:
; SSE:       # %bb.0:
; SSE-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3]
; SSE-NEXT:    movdqa {{.*#+}} xmm0 = [10794,10794,54998,54998,0,0,65535,65535]
; SSE-NEXT:    pcmpgtb %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: cmp_slt_load_const:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmovsxbd (%rdi), %xmm0
; AVX-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [42,4294967254,0,4294967295]
; AVX-NEXT:    vpcmpgtd %xmm0, %xmm1, %xmm0
; AVX-NEXT:    retq
  %loadx = load <4 x i8>, ptr %x
  %icmp = icmp slt <4 x i8> %loadx, <i8 42, i8 -42, i8 0, i8 -1>
  %sext = sext <4 x i1> %icmp to <4 x i32>
  ret <4 x i32> %sext
}

define <2 x i64> @cmp_ne_zextload(ptr %x, ptr %y) nounwind {
; SSE-LABEL: cmp_ne_zextload:
; SSE:       # %bb.0:
; SSE-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
; SSE-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
; SSE-NEXT:    pcmpeqd %xmm0, %xmm1
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
; SSE-NEXT:    pcmpeqd %xmm0, %xmm0
; SSE-NEXT:    pxor %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX2-LABEL: cmp_ne_zextload:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero
; AVX2-NEXT:    vpmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero
; AVX2-NEXT:    vpcmpeqq %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: cmp_ne_zextload:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero
; AVX512-NEXT:    vpmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero
; AVX512-NEXT:    vpcmpeqq %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %loadx = load <2 x i32>, ptr %x
  %loady = load <2 x i32>, ptr %y
  %icmp = icmp ne <2 x i32> %loadx, %loady
  %sext = sext <2 x i1> %icmp to <2 x i64>
  ret <2 x i64> %sext
}

define <8 x i16> @cmp_ugt_zextload(ptr %x, ptr %y) nounwind {
; SSE-LABEL: cmp_ugt_zextload:
; SSE:       # %bb.0:
; SSE-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
; SSE-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
; SSE-NEXT:    pminub %xmm0, %xmm1
; SSE-NEXT:    pcmpeqb %xmm0, %xmm1
; SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE-NEXT:    pcmpeqd %xmm0, %xmm0
; SSE-NEXT:    pxor %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: cmp_ugt_zextload:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmovzxbw {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
; AVX-NEXT:    vpmovzxbw {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
; AVX-NEXT:    vpcmpgtw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %loadx = load <8 x i8>, ptr %x
  %loady = load <8 x i8>, ptr %y
  %icmp = icmp ugt <8 x i8> %loadx, %loady
  %sext = sext <8 x i1> %icmp to <8 x i16>
  ret <8 x i16> %sext
}

; Signed compare needs signed extends.

define <8 x i16> @cmp_sgt_zextload(ptr %x, ptr %y) nounwind {
; SSE-LABEL: cmp_sgt_zextload:
; SSE:       # %bb.0:
; SSE-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
; SSE-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
; SSE-NEXT:    pcmpgtb %xmm1, %xmm0
; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE-NEXT:    retq
;
; AVX-LABEL: cmp_sgt_zextload:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmovsxbw (%rdi), %xmm0
; AVX-NEXT:    vpmovsxbw (%rsi), %xmm1
; AVX-NEXT:    vpcmpgtw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %loadx = load <8 x i8>, ptr %x
  %loady = load <8 x i8>, ptr %y
  %icmp = icmp sgt <8 x i8> %loadx, %loady
  %sext = sext <8 x i1> %icmp to <8 x i16>
  ret <8 x i16> %sext
}

; negative test - don't change a legal op
; TODO: Or should we? We can eliminate the vpmovsxwd at the cost of a 256-bit ymm vpcmpeqw.

define <8 x i32> @cmp_ne_zextload_from_legal_op(ptr %x, ptr %y) {
; SSE-LABEL: cmp_ne_zextload_from_legal_op:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa (%rdi), %xmm0
; SSE-NEXT:    pcmpeqw (%rsi), %xmm0
; SSE-NEXT:    pcmpeqd %xmm1, %xmm1
; SSE-NEXT:    pxor %xmm0, %xmm1
; SSE-NEXT:    movdqa %xmm1, %xmm0
; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
; SSE-NEXT:    retq
;
; AVX2-LABEL: cmp_ne_zextload_from_legal_op:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqa (%rdi), %xmm0
; AVX2-NEXT:    vpcmpeqw (%rsi), %xmm0, %xmm0
; AVX2-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpmovsxwd %xmm0, %ymm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: cmp_ne_zextload_from_legal_op:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512-NEXT:    vpcmpeqw (%rsi), %xmm0, %xmm0
; AVX512-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
; AVX512-NEXT:    vpmovsxwd %xmm0, %ymm0
; AVX512-NEXT:    retq
  %loadx = load <8 x i16>, ptr %x
  %loady = load <8 x i16>, ptr %y
  %icmp = icmp ne <8 x i16> %loadx, %loady
  %sext = sext <8 x i1> %icmp to <8 x i32>
  ret <8 x i32> %sext
}

; Both uses of the load can be absorbed by the zext-load, so we eliminate the explicit casts.

define <8 x i32> @PR50055(ptr %src, ptr %dst) nounwind {
; SSE-LABEL: PR50055:
; SSE:       # %bb.0:
; SSE-NEXT:    movq {{.*#+}} xmm2 = mem[0],zero
; SSE-NEXT:    pxor %xmm3, %xmm3
; SSE-NEXT:    movdqa %xmm2, %xmm1
; SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
; SSE-NEXT:    movdqa %xmm1, %xmm0
; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
; SSE-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
; SSE-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE-NEXT:    pcmpeqb %xmm3, %xmm2
; SSE-NEXT:    pcmpeqd %xmm3, %xmm3
; SSE-NEXT:    pxor %xmm2, %xmm3
; SSE-NEXT:    movdqa %xmm3, %xmm2
; SSE-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; SSE-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4,4,5,5,6,6,7,7]
; SSE-NEXT:    movdqa %xmm3, 16(%rsi)
; SSE-NEXT:    movdqa %xmm2, (%rsi)
; SSE-NEXT:    retq
;
; AVX-LABEL: PR50055:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
; AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vpcmpgtd %ymm1, %ymm0, %ymm1
; AVX-NEXT:    vmovdqa %ymm1, (%rsi)
; AVX-NEXT:    retq
  %load = load <8 x i8>, ptr %src
  %zext = zext <8 x i8> %load to <8 x i32>
  %icmp = icmp ne <8 x i8> %load, zeroinitializer
  %sext = sext <8 x i1> %icmp to <8 x i32>
  store <8 x i32> %sext, ptr %dst
  ret <8 x i32> %zext
}

; negative test - extra uses must be absorbable by a zext-load.

define <8 x i16> @multi_use_narrower_size(ptr %src, ptr %dst) nounwind {
; SSE-LABEL: multi_use_narrower_size:
; SSE:       # %bb.0:
; SSE-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
; SSE-NEXT:    pxor %xmm2, %xmm2
; SSE-NEXT:    movdqa %xmm1, %xmm0
; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
; SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE-NEXT:    pcmpeqb %xmm2, %xmm1
; SSE-NEXT:    movdqa %xmm1, %xmm2
; SSE-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; SSE-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
; SSE-NEXT:    movdqa %xmm1, 16(%rsi)
; SSE-NEXT:    movdqa %xmm2, (%rsi)
; SSE-NEXT:    retq
;
; AVX-LABEL: multi_use_narrower_size:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX-NEXT:    vpcmpeqb %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vpmovsxbd %xmm1, %ymm1
; AVX-NEXT:    vmovdqa %ymm1, (%rsi)
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
  %load = load <8 x i8>, ptr %src
  %zext = zext <8 x i8> %load to <8 x i16>
  %icmp = icmp eq <8 x i8> %load, zeroinitializer
  %sext = sext <8 x i1> %icmp to <8 x i32>
  store <8 x i32> %sext, ptr %dst
  ret <8 x i16> %zext
}

; negative test - extra uses must be absorbable by a zext-load.

define <8 x i32> @multi_use_wider_size(ptr %src, ptr %dst) nounwind {
; SSE-LABEL: multi_use_wider_size:
; SSE:       # %bb.0:
; SSE-NEXT:    movq {{.*#+}} xmm2 = mem[0],zero
; SSE-NEXT:    pxor %xmm3, %xmm3
; SSE-NEXT:    movdqa %xmm2, %xmm1
; SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
; SSE-NEXT:    movdqa %xmm1, %xmm0
; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
; SSE-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
; SSE-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE-NEXT:    pcmpeqb %xmm3, %xmm2
; SSE-NEXT:    movdqa %xmm2, (%rsi)
; SSE-NEXT:    retq
;
; AVX-LABEL: multi_use_wider_size:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX-NEXT:    vpmovzxbd {{.*#+}} ymm0 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero
; AVX-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX-NEXT:    vpcmpeqb %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vpmovsxbw %xmm1, %xmm1
; AVX-NEXT:    vmovdqa %xmm1, (%rsi)
; AVX-NEXT:    retq
  %load = load <8 x i8>, ptr %src
  %zext = zext <8 x i8> %load to <8 x i32>
  %icmp = icmp eq <8 x i8> %load, zeroinitializer
  %sext = sext <8 x i1> %icmp to <8 x i16>
  store <8 x i16> %sext, ptr %dst
  ret <8 x i32> %zext
}

define <4 x i64> @PR50055_signed(ptr %src, ptr %dst) {
; SSE-LABEL: PR50055_signed:
; SSE:       # %bb.0:
; SSE-NEXT:    movq {{.*#+}} xmm2 = mem[0],zero
; SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE-NEXT:    psrad $24, %xmm0
; SSE-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
; SSE-NEXT:    psrad $24, %xmm1
; SSE-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE-NEXT:    pxor %xmm3, %xmm3
; SSE-NEXT:    pcmpgtb %xmm3, %xmm2
; SSE-NEXT:    movdqa %xmm2, %xmm3
; SSE-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
; SSE-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7]
; SSE-NEXT:    movdqa %xmm2, 16(%rsi)
; SSE-NEXT:    movdqa %xmm3, (%rsi)
; SSE-NEXT:    retq
;
; AVX-LABEL: PR50055_signed:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmovsxbd (%rdi), %ymm0
; AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vpcmpgtd %ymm1, %ymm0, %ymm1
; AVX-NEXT:    vmovdqa %ymm1, (%rsi)
; AVX-NEXT:    retq
  %t1 = load <8 x i8>, ptr %src, align 1
  %conv = sext <8 x i8> %t1 to <8 x i32>
  %t2 = bitcast <8 x i32> %conv to <4 x i64>
  %cmp = icmp sgt <8 x i8> %t1, zeroinitializer
  %sext = sext <8 x i1> %cmp to <8 x i32>
  store <8 x i32> %sext, ptr %dst, align 32
  ret <4 x i64> %t2
}

define <8 x i32> @PR63946(<8 x i32> %a0, <8 x i32> %b0) nounwind {
; SSE-LABEL: PR63946:
; SSE:       # %bb.0: # %entry
; SSE-NEXT:    movdqa %xmm1, %xmm4
; SSE-NEXT:    movdqa %xmm0, %xmm13
; SSE-NEXT:    pshufd {{.*#+}} xmm7 = xmm2[1,2,3,0]
; SSE-NEXT:    pshufd {{.*#+}} xmm9 = xmm3[1,2,3,0]
; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm3[2,3,0,1]
; SSE-NEXT:    pshufd {{.*#+}} xmm8 = xmm2[2,3,0,1]
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[3,0,1,2]
; SSE-NEXT:    pshufd {{.*#+}} xmm6 = xmm2[3,0,1,2]
; SSE-NEXT:    pcmpeqd %xmm2, %xmm0
; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    movdqa %xmm9, %xmm11
; SSE-NEXT:    pcmpeqd %xmm4, %xmm11
; SSE-NEXT:    movdqa %xmm7, %xmm12
; SSE-NEXT:    movdqa %xmm8, %xmm10
; SSE-NEXT:    movdqa %xmm5, %xmm15
; SSE-NEXT:    pcmpeqd %xmm4, %xmm15
; SSE-NEXT:    movdqa %xmm1, %xmm14
; SSE-NEXT:    pcmpeqd %xmm4, %xmm14
; SSE-NEXT:    pcmpeqd %xmm4, %xmm2
; SSE-NEXT:    pcmpeqd %xmm4, %xmm7
; SSE-NEXT:    pcmpeqd %xmm4, %xmm8
; SSE-NEXT:    movdqa %xmm6, %xmm0
; SSE-NEXT:    pcmpeqd %xmm4, %xmm6
; SSE-NEXT:    pcmpeqd %xmm3, %xmm4
; SSE-NEXT:    por %xmm4, %xmm11
; SSE-NEXT:    pcmpeqd %xmm13, %xmm12
; SSE-NEXT:    por {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload
; SSE-NEXT:    pcmpeqd %xmm13, %xmm10
; SSE-NEXT:    pcmpeqd %xmm13, %xmm0
; SSE-NEXT:    por %xmm15, %xmm2
; SSE-NEXT:    por %xmm11, %xmm2
; SSE-NEXT:    pcmpeqd %xmm13, %xmm3
; SSE-NEXT:    por %xmm3, %xmm10
; SSE-NEXT:    por %xmm12, %xmm10
; SSE-NEXT:    por %xmm14, %xmm7
; SSE-NEXT:    pcmpeqd %xmm13, %xmm9
; SSE-NEXT:    por %xmm0, %xmm9
; SSE-NEXT:    pcmpeqd %xmm13, %xmm5
; SSE-NEXT:    por %xmm9, %xmm5
; SSE-NEXT:    por %xmm10, %xmm5
; SSE-NEXT:    por %xmm7, %xmm8
; SSE-NEXT:    por %xmm2, %xmm8
; SSE-NEXT:    packssdw %xmm8, %xmm5
; SSE-NEXT:    pcmpeqd %xmm13, %xmm1
; SSE-NEXT:    packssdw %xmm6, %xmm1
; SSE-NEXT:    por %xmm5, %xmm1
; SSE-NEXT:    movdqa %xmm1, %xmm0
; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSE-NEXT:    pslld $31, %xmm0
; SSE-NEXT:    psrad $31, %xmm0
; SSE-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
; SSE-NEXT:    pslld $31, %xmm1
; SSE-NEXT:    psrad $31, %xmm1
; SSE-NEXT:    retq
;
; AVX2-LABEL: PR63946:
; AVX2:       # %bb.0: # %entry
; AVX2-NEXT:    vpshufd {{.*#+}} ymm2 = ymm1[1,2,3,0,5,6,7,4]
; AVX2-NEXT:    vpshufd {{.*#+}} ymm3 = ymm1[2,3,0,1,6,7,4,5]
; AVX2-NEXT:    vpshufd {{.*#+}} ymm4 = ymm1[3,0,1,2,7,4,5,6]
; AVX2-NEXT:    vpermq {{.*#+}} ymm5 = ymm1[2,3,0,1]
; AVX2-NEXT:    vpermq {{.*#+}} ymm6 = ymm2[2,3,0,1]
; AVX2-NEXT:    vpermq {{.*#+}} ymm7 = ymm3[2,3,0,1]
; AVX2-NEXT:    vpermq {{.*#+}} ymm8 = ymm4[2,3,0,1]
; AVX2-NEXT:    vpcmpeqd %ymm1, %ymm0, %ymm1
; AVX2-NEXT:    vpcmpeqd %ymm0, %ymm2, %ymm2
; AVX2-NEXT:    vpor %ymm1, %ymm2, %ymm1
; AVX2-NEXT:    vpcmpeqd %ymm0, %ymm3, %ymm2
; AVX2-NEXT:    vpcmpeqd %ymm0, %ymm4, %ymm3
; AVX2-NEXT:    vpcmpeqd %ymm0, %ymm5, %ymm4
; AVX2-NEXT:    vpor %ymm4, %ymm2, %ymm2
; AVX2-NEXT:    vpor %ymm2, %ymm1, %ymm1
; AVX2-NEXT:    vpcmpeqd %ymm0, %ymm6, %ymm2
; AVX2-NEXT:    vpor %ymm2, %ymm3, %ymm2
; AVX2-NEXT:    vpor %ymm2, %ymm1, %ymm1
; AVX2-NEXT:    vpcmpeqd %ymm0, %ymm7, %ymm2
; AVX2-NEXT:    vpcmpeqd %ymm0, %ymm8, %ymm0
; AVX2-NEXT:    vpor %ymm0, %ymm2, %ymm0
; AVX2-NEXT:    vpor %ymm0, %ymm1, %ymm0
; AVX2-NEXT:    vpslld $31, %ymm0, %ymm0
; AVX2-NEXT:    vpsrad $31, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: PR63946:
; AVX512:       # %bb.0: # %entry
; AVX512-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512-NEXT:    vpshufd {{.*#+}} ymm2 = ymm1[1,2,3,0,5,6,7,4]
; AVX512-NEXT:    vpshufd {{.*#+}} ymm3 = ymm1[2,3,0,1,6,7,4,5]
; AVX512-NEXT:    vpshufd {{.*#+}} ymm4 = ymm1[3,0,1,2,7,4,5,6]
; AVX512-NEXT:    vpermq {{.*#+}} ymm5 = ymm1[2,3,0,1]
; AVX512-NEXT:    vpermq {{.*#+}} ymm6 = ymm2[2,3,0,1]
; AVX512-NEXT:    vpermq {{.*#+}} ymm7 = ymm3[2,3,0,1]
; AVX512-NEXT:    vpermq {{.*#+}} ymm8 = ymm4[2,3,0,1]
; AVX512-NEXT:    vpcmpeqd %zmm1, %zmm0, %k0
; AVX512-NEXT:    vpcmpeqd %zmm0, %zmm2, %k1
; AVX512-NEXT:    vpcmpeqd %zmm0, %zmm3, %k3
; AVX512-NEXT:    vpcmpeqd %zmm0, %zmm4, %k2
; AVX512-NEXT:    vpcmpeqd %zmm0, %zmm5, %k4
; AVX512-NEXT:    vpcmpeqd %zmm0, %zmm6, %k5
; AVX512-NEXT:    vpcmpeqd %zmm0, %zmm7, %k6
; AVX512-NEXT:    vpcmpeqd %zmm0, %zmm8, %k7
; AVX512-NEXT:    korw %k0, %k1, %k0
; AVX512-NEXT:    korw %k3, %k0, %k0
; AVX512-NEXT:    korw %k4, %k0, %k0
; AVX512-NEXT:    korw %k2, %k0, %k0
; AVX512-NEXT:    korw %k5, %k0, %k0
; AVX512-NEXT:    korw %k6, %k0, %k0
; AVX512-NEXT:    korw %k7, %k0, %k1
; AVX512-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512-NEXT:    retq
entry:
  %shuffle = shufflevector <8 x i32> %b0, <8 x i32> poison, <8 x i32> <i32 1, i32 2, i32 3, i32 0, i32 5, i32 6, i32 7, i32 4>
  %shuffle1 = shufflevector <8 x i32> %b0, <8 x i32> poison, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 6, i32 7, i32 4, i32 5>
  %shuffle2 = shufflevector <8 x i32> %shuffle, <8 x i32> poison, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 6, i32 7, i32 4, i32 5>
  %shuffle3 = shufflevector <8 x i32> %b0, <8 x i32> poison, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
  %shuffle4 = shufflevector <8 x i32> %shuffle, <8 x i32> poison, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
  %shuffle5 = shufflevector <8 x i32> %shuffle1, <8 x i32> poison, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
  %shuffle6 = shufflevector <8 x i32> %shuffle2, <8 x i32> poison, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
  %cmp = icmp eq <8 x i32> %a0, %b0
  %cmp7 = icmp eq <8 x i32> %shuffle, %a0
  %cmp9 = icmp eq <8 x i32> %shuffle1, %a0
  %cmp11 = icmp eq <8 x i32> %shuffle2, %a0
  %cmp13 = icmp eq <8 x i32> %shuffle3, %a0
  %cmp15 = icmp eq <8 x i32> %shuffle4, %a0
  %cmp17 = icmp eq <8 x i32> %shuffle5, %a0
  %cmp19 = icmp eq <8 x i32> %shuffle6, %a0
  %or2365 = or <8 x i1> %cmp7, %cmp
  %or2264 = or <8 x i1> %or2365, %cmp9
  %or2567 = or <8 x i1> %or2264, %cmp13
  %or2163 = or <8 x i1> %or2567, %cmp11
  %or62 = or <8 x i1> %or2163, %cmp15
  %or2466 = or <8 x i1> %or62, %cmp17
  %or2668 = or <8 x i1> %or2466, %cmp19
  %or26 = sext <8 x i1> %or2668 to <8 x i32>
  ret <8 x i32> %or26
}