; xref: /llvm-project/llvm/test/CodeGen/X86/combine-movmsk.ll (revision b0bea80ab479e9bb016fcdb62d7d0eceec2b28e3)
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.2 | FileCheck %s --check-prefixes=SSE,SSE42
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+prefer-movmsk-over-vtest | FileCheck %s --check-prefixes=ADL

declare i32 @llvm.x86.sse.movmsk.ps(<4 x float>)
declare i32 @llvm.x86.sse2.movmsk.pd(<2 x double>)
declare i32 @llvm.x86.sse2.pmovmskb.128(<16 x i8>)

; Use widest possible vector for movmsk comparisons (PR37087)

define i1 @movmskps_noneof_bitcast_v2f64(<2 x double> %a0) {
; SSE-LABEL: movmskps_noneof_bitcast_v2f64:
; SSE:       # %bb.0:
; SSE-NEXT:    xorpd %xmm1, %xmm1
; SSE-NEXT:    cmpeqpd %xmm0, %xmm1
; SSE-NEXT:    movmskpd %xmm1, %eax
; SSE-NEXT:    testl %eax, %eax
; SSE-NEXT:    sete %al
; SSE-NEXT:    retq
;
; AVX-LABEL: movmskps_noneof_bitcast_v2f64:
; AVX:       # %bb.0:
; AVX-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vcmpeqpd %xmm0, %xmm1, %xmm0
; AVX-NEXT:    vtestpd %xmm0, %xmm0
; AVX-NEXT:    sete %al
; AVX-NEXT:    retq
;
; ADL-LABEL: movmskps_noneof_bitcast_v2f64:
; ADL:       # %bb.0:
; ADL-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
; ADL-NEXT:    vcmpeqpd %xmm0, %xmm1, %xmm0
; ADL-NEXT:    vmovmskpd %xmm0, %eax
; ADL-NEXT:    testl %eax, %eax
; ADL-NEXT:    sete %al
; ADL-NEXT:    retq
  ; "none of": the 4-lane ps mask of the bitcast <2 x i64> compare is tested
  ; against 0, so codegen may use the narrower pd mask / vtestpd instead.
  %1 = fcmp oeq <2 x double> zeroinitializer, %a0
  %2 = sext <2 x i1> %1 to <2 x i64>
  %3 = bitcast <2 x i64> %2 to <4 x float>
  %4 = tail call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %3)
  %5 = icmp eq i32 %4, 0
  ret i1 %5
}
define i1 @movmskps_allof_bitcast_v2f64(<2 x double> %a0) {
; SSE-LABEL: movmskps_allof_bitcast_v2f64:
; SSE:       # %bb.0:
; SSE-NEXT:    xorpd %xmm1, %xmm1
; SSE-NEXT:    cmpeqpd %xmm0, %xmm1
; SSE-NEXT:    movmskpd %xmm1, %eax
; SSE-NEXT:    cmpl $3, %eax
; SSE-NEXT:    sete %al
; SSE-NEXT:    retq
;
; AVX-LABEL: movmskps_allof_bitcast_v2f64:
; AVX:       # %bb.0:
; AVX-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vcmpeqpd %xmm0, %xmm1, %xmm0
; AVX-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vtestpd %xmm1, %xmm0
; AVX-NEXT:    setb %al
; AVX-NEXT:    retq
;
; ADL-LABEL: movmskps_allof_bitcast_v2f64:
; ADL:       # %bb.0:
; ADL-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
; ADL-NEXT:    vcmpeqpd %xmm0, %xmm1, %xmm0
; ADL-NEXT:    vmovmskpd %xmm0, %eax
; ADL-NEXT:    cmpl $3, %eax
; ADL-NEXT:    sete %al
; ADL-NEXT:    retq
  ; "all of": ps mask == 15 (all 4 float lanes set) is equivalent to the
  ; 2-lane pd mask == 3, so codegen compares the narrower mask.
  %1 = fcmp oeq <2 x double> zeroinitializer, %a0
  %2 = sext <2 x i1> %1 to <2 x i64>
  %3 = bitcast <2 x i64> %2 to <4 x float>
  %4 = tail call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %3)
  %5 = icmp eq i32 %4, 15
  ret i1 %5
}
define i1 @pmovmskb_noneof_bitcast_v2i64(<2 x i64> %a0) {
; SSE2-LABEL: pmovmskb_noneof_bitcast_v2i64:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSE2-NEXT:    movmskps %xmm0, %eax
; SSE2-NEXT:    testl %eax, %eax
; SSE2-NEXT:    sete %al
; SSE2-NEXT:    retq
;
; SSE42-LABEL: pmovmskb_noneof_bitcast_v2i64:
; SSE42:       # %bb.0:
; SSE42-NEXT:    movmskpd %xmm0, %eax
; SSE42-NEXT:    testl %eax, %eax
; SSE42-NEXT:    sete %al
; SSE42-NEXT:    retq
;
; ADL-LABEL: pmovmskb_noneof_bitcast_v2i64:
; ADL:       # %bb.0:
; ADL-NEXT:    vmovmskpd %xmm0, %eax
; ADL-NEXT:    testl %eax, %eax
; ADL-NEXT:    sete %al
; ADL-NEXT:    retq
  ; "none of": sgt-0 compare just reads the i64 sign bits, so the 16-lane
  ; byte mask == 0 check reduces to a 2-lane pd mask (no compare needed).
  %1 = icmp sgt <2 x i64> zeroinitializer, %a0
  %2 = sext <2 x i1> %1 to <2 x i64>
  %3 = bitcast <2 x i64> %2 to <16 x i8>
  %4 = tail call i32 @llvm.x86.sse2.pmovmskb.128(<16 x i8> %3)
  %5 = icmp eq i32 %4, 0
  ret i1 %5
}
define i1 @pmovmskb_allof_bitcast_v2i64(<2 x i64> %a0) {
; SSE2-LABEL: pmovmskb_allof_bitcast_v2i64:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSE2-NEXT:    movmskps %xmm0, %eax
; SSE2-NEXT:    cmpl $15, %eax
; SSE2-NEXT:    sete %al
; SSE2-NEXT:    retq
;
; SSE42-LABEL: pmovmskb_allof_bitcast_v2i64:
; SSE42:       # %bb.0:
; SSE42-NEXT:    movmskpd %xmm0, %eax
; SSE42-NEXT:    cmpl $3, %eax
; SSE42-NEXT:    sete %al
; SSE42-NEXT:    retq
;
; AVX-LABEL: pmovmskb_allof_bitcast_v2i64:
; AVX:       # %bb.0:
; AVX-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vtestpd %xmm1, %xmm0
; AVX-NEXT:    setb %al
; AVX-NEXT:    retq
;
; ADL-LABEL: pmovmskb_allof_bitcast_v2i64:
; ADL:       # %bb.0:
; ADL-NEXT:    vmovmskpd %xmm0, %eax
; ADL-NEXT:    cmpl $3, %eax
; ADL-NEXT:    sete %al
; ADL-NEXT:    retq
  ; "all of": byte mask == 0xFFFF (all 16 lanes) reduces to pd mask == 3,
  ; since the mask bytes come from sign-extended i64 compare results.
  %1 = icmp sgt <2 x i64> zeroinitializer, %a0
  %2 = sext <2 x i1> %1 to <2 x i64>
  %3 = bitcast <2 x i64> %2 to <16 x i8>
  %4 = tail call i32 @llvm.x86.sse2.pmovmskb.128(<16 x i8> %3)
  %5 = icmp eq i32 %4, 65535
  ret i1 %5
}
define i1 @pmovmskb_noneof_bitcast_v4f32(<4 x float> %a0) {
; SSE-LABEL: pmovmskb_noneof_bitcast_v4f32:
; SSE:       # %bb.0:
; SSE-NEXT:    xorps %xmm1, %xmm1
; SSE-NEXT:    cmpeqps %xmm0, %xmm1
; SSE-NEXT:    movmskps %xmm1, %eax
; SSE-NEXT:    testl %eax, %eax
; SSE-NEXT:    sete %al
; SSE-NEXT:    retq
;
; AVX-LABEL: pmovmskb_noneof_bitcast_v4f32:
; AVX:       # %bb.0:
; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vcmpeqps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vtestps %xmm0, %xmm0
; AVX-NEXT:    sete %al
; AVX-NEXT:    retq
;
; ADL-LABEL: pmovmskb_noneof_bitcast_v4f32:
; ADL:       # %bb.0:
; ADL-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; ADL-NEXT:    vcmpeqps %xmm1, %xmm0, %xmm0
; ADL-NEXT:    vmovmskps %xmm0, %eax
; ADL-NEXT:    testl %eax, %eax
; ADL-NEXT:    sete %al
; ADL-NEXT:    retq
  ; "none of": 16-lane byte mask == 0 on a bitcast <4 x i32> compare
  ; reduces to the 4-lane ps mask / vtestps.
  %1 = fcmp oeq <4 x float> %a0, zeroinitializer
  %2 = sext <4 x i1> %1 to <4 x i32>
  %3 = bitcast <4 x i32> %2 to <16 x i8>
  %4 = tail call i32 @llvm.x86.sse2.pmovmskb.128(<16 x i8> %3)
  %5 = icmp eq i32 %4, 0
  ret i1 %5
}
define i1 @pmovmskb_allof_bitcast_v4f32(<4 x float> %a0) {
; SSE-LABEL: pmovmskb_allof_bitcast_v4f32:
; SSE:       # %bb.0:
; SSE-NEXT:    xorps %xmm1, %xmm1
; SSE-NEXT:    cmpeqps %xmm0, %xmm1
; SSE-NEXT:    movmskps %xmm1, %eax
; SSE-NEXT:    cmpl $15, %eax
; SSE-NEXT:    sete %al
; SSE-NEXT:    retq
;
; AVX-LABEL: pmovmskb_allof_bitcast_v4f32:
; AVX:       # %bb.0:
; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vcmpeqps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vtestps %xmm1, %xmm0
; AVX-NEXT:    setb %al
; AVX-NEXT:    retq
;
; ADL-LABEL: pmovmskb_allof_bitcast_v4f32:
; ADL:       # %bb.0:
; ADL-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; ADL-NEXT:    vcmpeqps %xmm1, %xmm0, %xmm0
; ADL-NEXT:    vmovmskps %xmm0, %eax
; ADL-NEXT:    cmpl $15, %eax
; ADL-NEXT:    sete %al
; ADL-NEXT:    retq
  ; "all of": byte mask == 0xFFFF reduces to the 4-lane ps mask == 15
  ; (or vtestps against all-ones with the CF/setb idiom on AVX).
  %1 = fcmp oeq <4 x float> %a0, zeroinitializer
  %2 = sext <4 x i1> %1 to <4 x i32>
  %3 = bitcast <4 x i32> %2 to <16 x i8>
  %4 = tail call i32 @llvm.x86.sse2.pmovmskb.128(<16 x i8> %3)
  %5 = icmp eq i32 %4, 65535
  ret i1 %5
}

; MOVMSK(ICMP_SGT(X,-1)) -> NOT(MOVMSK(X))
define i1 @movmskps_allof_v4i32_positive(<4 x i32> %a0) {
; SSE-LABEL: movmskps_allof_v4i32_positive:
; SSE:       # %bb.0:
; SSE-NEXT:    movmskps %xmm0, %eax
; SSE-NEXT:    xorl $15, %eax
; SSE-NEXT:    cmpl $15, %eax
; SSE-NEXT:    sete %al
; SSE-NEXT:    retq
;
; AVX-LABEL: movmskps_allof_v4i32_positive:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovmskps %xmm0, %eax
; AVX-NEXT:    xorl $15, %eax
; AVX-NEXT:    cmpl $15, %eax
; AVX-NEXT:    sete %al
; AVX-NEXT:    retq
;
; ADL-LABEL: movmskps_allof_v4i32_positive:
; ADL:       # %bb.0:
; ADL-NEXT:    vmovmskps %xmm0, %eax
; ADL-NEXT:    xorl $15, %eax
; ADL-NEXT:    cmpl $15, %eax
; ADL-NEXT:    sete %al
; ADL-NEXT:    retq
  ; sgt -1 is "sign bit clear", so the compare's mask is just the inverted
  ; movmsk of %a0 (xorl $15) — no vector compare is emitted.
  %1 = icmp sgt <4 x i32> %a0, <i32 -1, i32 -1, i32 -1, i32 -1>
  %2 = sext <4 x i1> %1 to <4 x i32>
  %3 = bitcast <4 x i32> %2 to <4 x float>
  %4 = tail call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %3)
  %5 = icmp eq i32 %4, 15
  ret i1 %5
}
define i1 @pmovmskb_noneof_v16i8_positive(<16 x i8> %a0) {
; SSE-LABEL: pmovmskb_noneof_v16i8_positive:
; SSE:       # %bb.0:
; SSE-NEXT:    pmovmskb %xmm0, %eax
; SSE-NEXT:    xorl $65535, %eax # imm = 0xFFFF
; SSE-NEXT:    sete %al
; SSE-NEXT:    retq
;
; AVX-LABEL: pmovmskb_noneof_v16i8_positive:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmovmskb %xmm0, %eax
; AVX-NEXT:    xorl $65535, %eax # imm = 0xFFFF
; AVX-NEXT:    sete %al
; AVX-NEXT:    retq
;
; ADL-LABEL: pmovmskb_noneof_v16i8_positive:
; ADL:       # %bb.0:
; ADL-NEXT:    vpmovmskb %xmm0, %eax
; ADL-NEXT:    xorl $65535, %eax # imm = 0xFFFF
; ADL-NEXT:    sete %al
; ADL-NEXT:    retq
  ; sgt -1 compare folds to NOT(pmovmskb(%a0)): mask of "positive" bytes is
  ; the xor of the raw byte mask with 0xFFFF, then tested for zero.
  %1 = icmp sgt <16 x i8> %a0, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
  %2 = sext <16 x i1> %1 to <16 x i8>
  %3 = tail call i32 @llvm.x86.sse2.pmovmskb.128(<16 x i8> %2)
  %4 = icmp eq i32 %3, 0
  ret i1 %4
}

; MOVMSK(CMPEQ(AND(X,C1),0)) -> MOVMSK(NOT(SHL(X,C2)))
define i32 @movmskpd_pow2_mask(<2 x i64> %a0) {
; SSE2-LABEL: movmskpd_pow2_mask:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT:    pxor %xmm1, %xmm1
; SSE2-NEXT:    pcmpeqd %xmm0, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,0,3,2]
; SSE2-NEXT:    pand %xmm1, %xmm0
; SSE2-NEXT:    movmskpd %xmm0, %eax
; SSE2-NEXT:    retq
;
; SSE42-LABEL: movmskpd_pow2_mask:
; SSE42:       # %bb.0:
; SSE42-NEXT:    movmskpd %xmm0, %eax
; SSE42-NEXT:    xorl $3, %eax
; SSE42-NEXT:    retq
;
; AVX-LABEL: movmskpd_pow2_mask:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovmskpd %xmm0, %eax
; AVX-NEXT:    xorl $3, %eax
; AVX-NEXT:    retq
;
; ADL-LABEL: movmskpd_pow2_mask:
; ADL:       # %bb.0:
; ADL-NEXT:    vmovmskpd %xmm0, %eax
; ADL-NEXT:    xorl $3, %eax
; ADL-NEXT:    retq
  ; The AND constant is exactly the i64 sign bit, so "== 0" inverts the sign
  ; mask: movmskpd + xorl $3 (no shift needed; SSE2 lacks pcmpeqq and keeps
  ; the cmp+shuffle expansion).
  %1 = and <2 x i64> %a0, <i64 -9223372036854775808, i64 -9223372036854775808>
  %2 = icmp eq <2 x i64> %1, zeroinitializer
  %3 = sext <2 x i1> %2 to <2 x i64>
  %4 = bitcast <2 x i64> %3 to <2 x double>
  %5 = tail call i32 @llvm.x86.sse2.movmsk.pd(<2 x double> %4)
  ret i32 %5
}
define i32 @movmskps_pow2_mask(<4 x i32> %a0) {
; SSE-LABEL: movmskps_pow2_mask:
; SSE:       # %bb.0:
; SSE-NEXT:    pslld $29, %xmm0
; SSE-NEXT:    movmskps %xmm0, %eax
; SSE-NEXT:    xorl $15, %eax
; SSE-NEXT:    retq
;
; AVX-LABEL: movmskps_pow2_mask:
; AVX:       # %bb.0:
; AVX-NEXT:    vpslld $29, %xmm0, %xmm0
; AVX-NEXT:    vmovmskps %xmm0, %eax
; AVX-NEXT:    xorl $15, %eax
; AVX-NEXT:    retq
;
; ADL-LABEL: movmskps_pow2_mask:
; ADL:       # %bb.0:
; ADL-NEXT:    vpslld $29, %xmm0, %xmm0
; ADL-NEXT:    vmovmskps %xmm0, %eax
; ADL-NEXT:    xorl $15, %eax
; ADL-NEXT:    retq
  ; AND with 4 (bit 2) + "== 0": shift bit 2 into the sign position
  ; (pslld $29), movmsk, then invert with xorl $15.
  %1 = and <4 x i32> %a0, <i32 4, i32 4, i32 4, i32 4>
  %2 = icmp eq <4 x i32> %1, zeroinitializer
  %3 = sext <4 x i1> %2 to <4 x i32>
  %4 = bitcast <4 x i32> %3 to <4 x float>
  %5 = tail call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %4)
  ret i32 %5
}
define i32 @pmovmskb_pow2_mask(<16 x i8> %a0) {
; SSE-LABEL: pmovmskb_pow2_mask:
; SSE:       # %bb.0:
; SSE-NEXT:    psllw $7, %xmm0
; SSE-NEXT:    pmovmskb %xmm0, %eax
; SSE-NEXT:    xorl $65535, %eax # imm = 0xFFFF
; SSE-NEXT:    retq
;
; AVX-LABEL: pmovmskb_pow2_mask:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsllw $7, %xmm0, %xmm0
; AVX-NEXT:    vpmovmskb %xmm0, %eax
; AVX-NEXT:    xorl $65535, %eax # imm = 0xFFFF
; AVX-NEXT:    retq
;
; ADL-LABEL: pmovmskb_pow2_mask:
; ADL:       # %bb.0:
; ADL-NEXT:    vpsllw $7, %xmm0, %xmm0
; ADL-NEXT:    vpmovmskb %xmm0, %eax
; ADL-NEXT:    xorl $65535, %eax # imm = 0xFFFF
; ADL-NEXT:    retq
  ; AND with 1 (bit 0) + "== 0": shift bit 0 into each byte's sign position
  ; (psllw $7), pmovmskb, then invert with xorl $0xFFFF.
  %1 = and <16 x i8> %a0, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
  %2 = icmp eq <16 x i8> %1, zeroinitializer
  %3 = sext <16 x i1> %2 to <16 x i8>
  %4 = tail call i32 @llvm.x86.sse2.pmovmskb.128(<16 x i8> %3)
  ret i32 %4
}

; AND(MOVMSK(X),MOVMSK(Y)) -> MOVMSK(AND(X,Y))
; XOR(MOVMSK(X),MOVMSK(Y)) -> MOVMSK(XOR(X,Y))
; OR(MOVMSK(X),MOVMSK(Y)) -> MOVMSK(OR(X,Y))
; if the elements are the same width.

define i32 @and_movmskpd_movmskpd(<2 x double> %a0, <2 x i64> %a1) {
; SSE-LABEL: and_movmskpd_movmskpd:
; SSE:       # %bb.0:
; SSE-NEXT:    xorpd %xmm2, %xmm2
; SSE-NEXT:    cmpeqpd %xmm0, %xmm2
; SSE-NEXT:    andpd %xmm1, %xmm2
; SSE-NEXT:    movmskpd %xmm2, %eax
; SSE-NEXT:    retq
;
; AVX-LABEL: and_movmskpd_movmskpd:
; AVX:       # %bb.0:
; AVX-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
; AVX-NEXT:    vcmpeqpd %xmm0, %xmm2, %xmm0
; AVX-NEXT:    vandpd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmovmskpd %xmm0, %eax
; AVX-NEXT:    retq
;
; ADL-LABEL: and_movmskpd_movmskpd:
; ADL:       # %bb.0:
; ADL-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
; ADL-NEXT:    vcmpeqpd %xmm0, %xmm2, %xmm0
; ADL-NEXT:    vandpd %xmm1, %xmm0, %xmm0
; ADL-NEXT:    vmovmskpd %xmm0, %eax
; ADL-NEXT:    retq
  ; Two same-width sign masks ANDed as scalars fold into a single vector
  ; AND + one movmskpd; the %a1 sgt-0 mask is read directly from its sign bits.
  %1 = fcmp oeq <2 x double> zeroinitializer, %a0
  %2 = sext <2 x i1> %1 to <2 x i64>
  %3 = bitcast <2 x i64> %2 to <2 x double>
  %4 = tail call i32 @llvm.x86.sse2.movmsk.pd(<2 x double> %3)
  %5 = icmp sgt <2 x i64> zeroinitializer, %a1
  %6 = bitcast <2 x i1> %5 to i2
  %7 = zext i2 %6 to i32
  %8 = and i32 %4, %7
  ret i32 %8
}
define i32 @xor_movmskps_movmskps(<4 x float> %a0, <4 x i32> %a1) {
; SSE-LABEL: xor_movmskps_movmskps:
; SSE:       # %bb.0:
; SSE-NEXT:    xorps %xmm2, %xmm2
; SSE-NEXT:    cmpeqps %xmm0, %xmm2
; SSE-NEXT:    xorps %xmm1, %xmm2
; SSE-NEXT:    movmskps %xmm2, %eax
; SSE-NEXT:    retq
;
; AVX-LABEL: xor_movmskps_movmskps:
; AVX:       # %bb.0:
; AVX-NEXT:    vxorps %xmm2, %xmm2, %xmm2
; AVX-NEXT:    vcmpeqps %xmm0, %xmm2, %xmm0
; AVX-NEXT:    vxorps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmovmskps %xmm0, %eax
; AVX-NEXT:    retq
;
; ADL-LABEL: xor_movmskps_movmskps:
; ADL:       # %bb.0:
; ADL-NEXT:    vxorps %xmm2, %xmm2, %xmm2
; ADL-NEXT:    vcmpeqps %xmm0, %xmm2, %xmm0
; ADL-NEXT:    vxorps %xmm1, %xmm0, %xmm0
; ADL-NEXT:    vmovmskps %xmm0, %eax
; ADL-NEXT:    retq
  ; Scalar XOR of two ps sign masks folds to a vector XOR + one movmskps;
  ; the ashr-31 of %a1 is redundant with reading its sign bits directly.
  %1 = fcmp oeq <4 x float> zeroinitializer, %a0
  %2 = sext <4 x i1> %1 to <4 x i32>
  %3 = bitcast <4 x i32> %2 to <4 x float>
  %4 = tail call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %3)
  %5 = ashr <4 x i32> %a1, <i32 31, i32 31, i32 31, i32 31>
  %6 = bitcast <4 x i32> %5 to <4 x float>
  %7 = tail call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %6)
  %8 = xor i32 %4, %7
  ret i32 %8
}
define i32 @or_pmovmskb_pmovmskb(<16 x i8> %a0, <8 x i16> %a1) {
; SSE-LABEL: or_pmovmskb_pmovmskb:
; SSE:       # %bb.0:
; SSE-NEXT:    pxor %xmm2, %xmm2
; SSE-NEXT:    pcmpeqb %xmm0, %xmm2
; SSE-NEXT:    psraw $15, %xmm1
; SSE-NEXT:    por %xmm2, %xmm1
; SSE-NEXT:    pmovmskb %xmm1, %eax
; SSE-NEXT:    retq
;
; AVX-LABEL: or_pmovmskb_pmovmskb:
; AVX:       # %bb.0:
; AVX-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX-NEXT:    vpcmpeqb %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vpsraw $15, %xmm1, %xmm1
; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpmovmskb %xmm0, %eax
; AVX-NEXT:    retq
;
; ADL-LABEL: or_pmovmskb_pmovmskb:
; ADL:       # %bb.0:
; ADL-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; ADL-NEXT:    vpcmpeqb %xmm2, %xmm0, %xmm0
; ADL-NEXT:    vpsraw $15, %xmm1, %xmm1
; ADL-NEXT:    vpor %xmm1, %xmm0, %xmm0
; ADL-NEXT:    vpmovmskb %xmm0, %eax
; ADL-NEXT:    retq
  ; Scalar OR of two byte masks folds to a vector OR + one pmovmskb; the
  ; psraw $15 splats each i16 sign bit across its byte pair first.
  %1 = icmp eq <16 x i8> zeroinitializer, %a0
  %2 = sext <16 x i1> %1 to <16 x i8>
  %3 = tail call i32 @llvm.x86.sse2.pmovmskb.128(<16 x i8> %2)
  %4 = ashr <8 x i16> %a1, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
  %5 = bitcast <8 x i16> %4 to <16 x i8>
  %6 = tail call i32 @llvm.x86.sse2.pmovmskb.128(<16 x i8> %5)
  %7 = or i32 %3, %6
  ret i32 %7
}

; We can't fold to ptest if we're not checking every pcmpeq result
define i32 @movmskps_ptest_numelts_mismatch(<16 x i8> %a0) {
; SSE-LABEL: movmskps_ptest_numelts_mismatch:
; SSE:       # %bb.0:
; SSE-NEXT:    pxor %xmm1, %xmm1
; SSE-NEXT:    pcmpeqb %xmm0, %xmm1
; SSE-NEXT:    movmskps %xmm1, %ecx
; SSE-NEXT:    xorl %eax, %eax
; SSE-NEXT:    cmpl $15, %ecx
; SSE-NEXT:    sete %al
; SSE-NEXT:    negl %eax
; SSE-NEXT:    retq
;
; AVX-LABEL: movmskps_ptest_numelts_mismatch:
; AVX:       # %bb.0:
; AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vpcmpeqb %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX-NEXT:    xorl %eax, %eax
; AVX-NEXT:    vtestps %xmm1, %xmm0
; AVX-NEXT:    sbbl %eax, %eax
; AVX-NEXT:    retq
;
; ADL-LABEL: movmskps_ptest_numelts_mismatch:
; ADL:       # %bb.0:
; ADL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; ADL-NEXT:    vpcmpeqb %xmm1, %xmm0, %xmm0
; ADL-NEXT:    vmovmskps %xmm0, %ecx
; ADL-NEXT:    xorl %eax, %eax
; ADL-NEXT:    cmpl $15, %ecx
; ADL-NEXT:    sete %al
; ADL-NEXT:    negl %eax
; ADL-NEXT:    retq
  ; A 16-lane byte compare read through the 4-lane ps mask: only 4 of the 16
  ; compare results are observed, so the mask stays on the ps element width.
  %1 = icmp eq <16 x i8> %a0, zeroinitializer
  %2 = sext <16 x i1> %1 to <16 x i8>
  %3 = bitcast <16 x i8> %2 to <4 x float>
  %4 = tail call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %3)
  %5 = icmp eq i32 %4, 15
  %6 = sext i1 %5 to i32
  ret i32 %6
}