; xref: /llvm-project/llvm/test/CodeGen/X86/ushl_sat_vec.ll (revision 7b3bbd83c0c24087072ec5b22a76799ab31f87d5)
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-linux | FileCheck %s --check-prefix=X64
; RUN: llc < %s -mtriple=x86_64-linux -mattr=+avx2 | FileCheck %s --check-prefix=X64-AVX2
; RUN: llc < %s -mtriple=i686 -mattr=cmov | FileCheck %s --check-prefix=X86

; Saturating unsigned shift-left intrinsic declarations, one per vector
; element type exercised by the tests below.
declare <2 x i64> @llvm.ushl.sat.v2i64(<2 x i64>, <2 x i64>)
declare <4 x i32> @llvm.ushl.sat.v4i32(<4 x i32>, <4 x i32>)
declare <8 x i16> @llvm.ushl.sat.v8i16(<8 x i16>, <8 x i16>)
declare <16 x i8> @llvm.ushl.sat.v16i8(<16 x i8>, <16 x i8>)

; ushl.sat <2 x i64>: shift each lane left, shift the result back right, and
; compare with the original value; lanes that do not round-trip overflowed and
; are saturated to all-ones. SSE2 splits the variable shift per element
; (psllq/psrlq + pshufd); AVX2 uses vpsllvq/vpsrlvq with a blend; i686
; scalarizes each i64 into 32-bit shld/shrd pairs with cmov selects.
define <2 x i64> @vec_v2i64(<2 x i64> %x, <2 x i64> %y) nounwind {
; X64-LABEL: vec_v2i64:
; X64:       # %bb.0:
; X64-NEXT:    movdqa %xmm0, %xmm2
; X64-NEXT:    psllq %xmm1, %xmm2
; X64-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
; X64-NEXT:    movdqa %xmm0, %xmm4
; X64-NEXT:    psllq %xmm3, %xmm4
; X64-NEXT:    movdqa %xmm4, %xmm5
; X64-NEXT:    movsd {{.*#+}} xmm4 = xmm2[0],xmm4[1]
; X64-NEXT:    psrlq %xmm1, %xmm2
; X64-NEXT:    psrlq %xmm3, %xmm5
; X64-NEXT:    movsd {{.*#+}} xmm5 = xmm2[0],xmm5[1]
; X64-NEXT:    pcmpeqd %xmm5, %xmm0
; X64-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,0,3,2]
; X64-NEXT:    pand %xmm1, %xmm0
; X64-NEXT:    pcmpeqd %xmm1, %xmm1
; X64-NEXT:    pxor %xmm1, %xmm0
; X64-NEXT:    por %xmm4, %xmm0
; X64-NEXT:    retq
;
; X64-AVX2-LABEL: vec_v2i64:
; X64-AVX2:       # %bb.0:
; X64-AVX2-NEXT:    vpsllvq %xmm1, %xmm0, %xmm2
; X64-AVX2-NEXT:    vpsrlvq %xmm1, %xmm2, %xmm1
; X64-AVX2-NEXT:    vpcmpeqq %xmm1, %xmm0, %xmm0
; X64-AVX2-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
; X64-AVX2-NEXT:    vblendvpd %xmm0, %xmm2, %xmm1, %xmm0
; X64-AVX2-NEXT:    retq
;
; X86-LABEL: vec_v2i64:
; X86:       # %bb.0:
; X86-NEXT:    pushl %ebp
; X86-NEXT:    pushl %ebx
; X86-NEXT:    pushl %edi
; X86-NEXT:    pushl %esi
; X86-NEXT:    subl $16, %esp
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-NEXT:    movl %esi, %eax
; X86-NEXT:    shll %cl, %eax
; X86-NEXT:    shldl %cl, %esi, %edx
; X86-NEXT:    xorl %ebx, %ebx
; X86-NEXT:    testb $32, %cl
; X86-NEXT:    cmovnel %eax, %edx
; X86-NEXT:    cmovnel %ebx, %eax
; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
; X86-NEXT:    movl %edx, %eax
; X86-NEXT:    movl %edx, %ebp
; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT:    shrl %cl, %eax
; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT:    testb $32, %cl
; X86-NEXT:    cmovnel %ebx, %eax
; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT:    movb {{[0-9]+}}(%esp), %ch
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl %eax, %esi
; X86-NEXT:    movb %ch, %cl
; X86-NEXT:    shll %cl, %esi
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-NEXT:    shldl %cl, %eax, %edx
; X86-NEXT:    testb $32, %ch
; X86-NEXT:    cmovnel %esi, %edx
; X86-NEXT:    cmovnel %ebx, %esi
; X86-NEXT:    movl %edx, %edi
; X86-NEXT:    shrl %cl, %edi
; X86-NEXT:    testb $32, %ch
; X86-NEXT:    cmovel %edi, %ebx
; X86-NEXT:    movl (%esp), %eax # 4-byte Reload
; X86-NEXT:    movb {{[0-9]+}}(%esp), %cl
; X86-NEXT:    shrdl %cl, %ebp, %eax
; X86-NEXT:    testb $32, %cl
; X86-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
; X86-NEXT:    movl %esi, %ebp
; X86-NEXT:    movb %ch, %cl
; X86-NEXT:    shrdl %cl, %edx, %ebp
; X86-NEXT:    testb $32, %ch
; X86-NEXT:    cmovnel %edi, %ebp
; X86-NEXT:    xorl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NEXT:    xorl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    orl %eax, %ecx
; X86-NEXT:    movl $-1, %ecx
; X86-NEXT:    movl (%esp), %edi # 4-byte Reload
; X86-NEXT:    cmovnel %ecx, %edi
; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT:    cmovnel %ecx, %eax
; X86-NEXT:    xorl {{[0-9]+}}(%esp), %ebp
; X86-NEXT:    xorl {{[0-9]+}}(%esp), %ebx
; X86-NEXT:    orl %ebp, %ebx
; X86-NEXT:    cmovnel %ecx, %esi
; X86-NEXT:    cmovnel %ecx, %edx
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    movl %edx, 12(%ecx)
; X86-NEXT:    movl %esi, 8(%ecx)
; X86-NEXT:    movl %eax, 4(%ecx)
; X86-NEXT:    movl %edi, (%ecx)
; X86-NEXT:    movl %ecx, %eax
; X86-NEXT:    addl $16, %esp
; X86-NEXT:    popl %esi
; X86-NEXT:    popl %edi
; X86-NEXT:    popl %ebx
; X86-NEXT:    popl %ebp
; X86-NEXT:    retl $4
  %tmp = call <2 x i64> @llvm.ushl.sat.v2i64(<2 x i64> %x, <2 x i64> %y)
  ret <2 x i64> %tmp
}

; ushl.sat <4 x i32>: SSE2 builds 2^y via the pslld $23 / paddd / cvttps2dq
; float-exponent trick, multiplies, then shifts back with per-lane psrld and
; compares against %x to detect overflow; AVX2 uses vpsllvd/vpsrlvd with a
; blend; i686 scalarizes to four shll/shrl round-trips with cmov saturation.
define <4 x i32> @vec_v4i32(<4 x i32> %x, <4 x i32> %y) nounwind {
; X64-LABEL: vec_v4i32:
; X64:       # %bb.0:
; X64-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
; X64-NEXT:    pshuflw {{.*#+}} xmm3 = xmm1[2,3,3,3,4,5,6,7]
; X64-NEXT:    pshuflw {{.*#+}} xmm4 = xmm1[0,1,1,1,4,5,6,7]
; X64-NEXT:    pslld $23, %xmm1
; X64-NEXT:    paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; X64-NEXT:    cvttps2dq %xmm1, %xmm1
; X64-NEXT:    movdqa %xmm0, %xmm5
; X64-NEXT:    pmuludq %xmm1, %xmm5
; X64-NEXT:    pshufd {{.*#+}} xmm6 = xmm5[0,2,2,3]
; X64-NEXT:    pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3]
; X64-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; X64-NEXT:    pmuludq %xmm7, %xmm1
; X64-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; X64-NEXT:    punpckldq {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1]
; X64-NEXT:    pshuflw {{.*#+}} xmm1 = xmm2[2,3,3,3,4,5,6,7]
; X64-NEXT:    movdqa %xmm6, %xmm7
; X64-NEXT:    psrld %xmm1, %xmm7
; X64-NEXT:    pshuflw {{.*#+}} xmm1 = xmm2[0,1,1,1,4,5,6,7]
; X64-NEXT:    movdqa %xmm5, %xmm2
; X64-NEXT:    psrld %xmm1, %xmm2
; X64-NEXT:    punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm7[1]
; X64-NEXT:    movdqa %xmm6, %xmm1
; X64-NEXT:    psrld %xmm3, %xmm1
; X64-NEXT:    psrld %xmm4, %xmm5
; X64-NEXT:    punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm1[0]
; X64-NEXT:    shufps {{.*#+}} xmm5 = xmm5[0,3],xmm2[0,3]
; X64-NEXT:    pcmpeqd %xmm5, %xmm0
; X64-NEXT:    pcmpeqd %xmm1, %xmm1
; X64-NEXT:    pxor %xmm1, %xmm0
; X64-NEXT:    por %xmm6, %xmm0
; X64-NEXT:    retq
;
; X64-AVX2-LABEL: vec_v4i32:
; X64-AVX2:       # %bb.0:
; X64-AVX2-NEXT:    vpsllvd %xmm1, %xmm0, %xmm2
; X64-AVX2-NEXT:    vpsrlvd %xmm1, %xmm2, %xmm1
; X64-AVX2-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
; X64-AVX2-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
; X64-AVX2-NEXT:    vblendvps %xmm0, %xmm2, %xmm1, %xmm0
; X64-AVX2-NEXT:    retq
;
; X86-LABEL: vec_v4i32:
; X86:       # %bb.0:
; X86-NEXT:    pushl %ebp
; X86-NEXT:    pushl %ebx
; X86-NEXT:    pushl %edi
; X86-NEXT:    pushl %esi
; X86-NEXT:    movb {{[0-9]+}}(%esp), %ch
; X86-NEXT:    movb {{[0-9]+}}(%esp), %ah
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
; X86-NEXT:    movb {{[0-9]+}}(%esp), %cl
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
; X86-NEXT:    movl %ebx, %esi
; X86-NEXT:    shll %cl, %esi
; X86-NEXT:    movl %esi, %ebp
; X86-NEXT:    shrl %cl, %ebp
; X86-NEXT:    cmpl %ebp, %ebx
; X86-NEXT:    movl $-1, %edx
; X86-NEXT:    cmovnel %edx, %esi
; X86-NEXT:    movl $-1, %ebx
; X86-NEXT:    movl %edi, %edx
; X86-NEXT:    movb %ah, %cl
; X86-NEXT:    shll %cl, %edx
; X86-NEXT:    movl %edx, %ebp
; X86-NEXT:    shrl %cl, %ebp
; X86-NEXT:    cmpl %ebp, %edi
; X86-NEXT:    cmovnel %ebx, %edx
; X86-NEXT:    movl $-1, %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
; X86-NEXT:    movb %ch, %cl
; X86-NEXT:    shll %cl, %edi
; X86-NEXT:    movl %edi, %ebp
; X86-NEXT:    shrl %cl, %ebp
; X86-NEXT:    cmpl %ebp, {{[0-9]+}}(%esp)
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
; X86-NEXT:    cmovnel %eax, %edi
; X86-NEXT:    movl %ebx, %ebp
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    shll %cl, %ebp
; X86-NEXT:    movl %ebp, %eax
; X86-NEXT:    shrl %cl, %eax
; X86-NEXT:    cmpl %eax, %ebx
; X86-NEXT:    movl $-1, %eax
; X86-NEXT:    cmovnel %eax, %ebp
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl %ebp, 12(%eax)
; X86-NEXT:    movl %edi, 8(%eax)
; X86-NEXT:    movl %edx, 4(%eax)
; X86-NEXT:    movl %esi, (%eax)
; X86-NEXT:    popl %esi
; X86-NEXT:    popl %edi
; X86-NEXT:    popl %ebx
; X86-NEXT:    popl %ebp
; X86-NEXT:    retl $4
  %tmp = call <4 x i32> @llvm.ushl.sat.v4i32(<4 x i32> %x, <4 x i32> %y)
  ret <4 x i32> %tmp
}

; ushl.sat <8 x i16>: SSE2 computes 2^y per half via pslld $23 + paddd +
; cvttps2dq, packs and multiplies (pmullw), then emulates the variable right
; shift with a psllw $12 / psraw $15 bit-test ladder before the round-trip
; compare; AVX2 widens to <8 x i32> for vpsllvd/vpsrlvd; i686 scalarizes to
; eight shll/shrl round-trips with cmov saturation to 0xFFFF.
define <8 x i16> @vec_v8i16(<8 x i16> %x, <8 x i16> %y) nounwind {
; X64-LABEL: vec_v8i16:
; X64:       # %bb.0:
; X64-NEXT:    movdqa %xmm1, %xmm2
; X64-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7]
; X64-NEXT:    pslld $23, %xmm2
; X64-NEXT:    movdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216]
; X64-NEXT:    paddd %xmm3, %xmm2
; X64-NEXT:    cvttps2dq %xmm2, %xmm4
; X64-NEXT:    pslld $16, %xmm4
; X64-NEXT:    psrad $16, %xmm4
; X64-NEXT:    movdqa %xmm1, %xmm2
; X64-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3]
; X64-NEXT:    pslld $23, %xmm2
; X64-NEXT:    paddd %xmm3, %xmm2
; X64-NEXT:    cvttps2dq %xmm2, %xmm2
; X64-NEXT:    pslld $16, %xmm2
; X64-NEXT:    psrad $16, %xmm2
; X64-NEXT:    packssdw %xmm4, %xmm2
; X64-NEXT:    pmullw %xmm0, %xmm2
; X64-NEXT:    psllw $12, %xmm1
; X64-NEXT:    movdqa %xmm1, %xmm3
; X64-NEXT:    psraw $15, %xmm3
; X64-NEXT:    movdqa %xmm2, %xmm4
; X64-NEXT:    psrlw $8, %xmm4
; X64-NEXT:    pand %xmm3, %xmm4
; X64-NEXT:    pandn %xmm2, %xmm3
; X64-NEXT:    por %xmm4, %xmm3
; X64-NEXT:    paddw %xmm1, %xmm1
; X64-NEXT:    movdqa %xmm1, %xmm4
; X64-NEXT:    psraw $15, %xmm4
; X64-NEXT:    movdqa %xmm4, %xmm5
; X64-NEXT:    pandn %xmm3, %xmm5
; X64-NEXT:    psrlw $4, %xmm3
; X64-NEXT:    pand %xmm4, %xmm3
; X64-NEXT:    por %xmm5, %xmm3
; X64-NEXT:    paddw %xmm1, %xmm1
; X64-NEXT:    movdqa %xmm1, %xmm4
; X64-NEXT:    psraw $15, %xmm4
; X64-NEXT:    movdqa %xmm4, %xmm5
; X64-NEXT:    pandn %xmm3, %xmm5
; X64-NEXT:    psrlw $2, %xmm3
; X64-NEXT:    pand %xmm4, %xmm3
; X64-NEXT:    por %xmm5, %xmm3
; X64-NEXT:    paddw %xmm1, %xmm1
; X64-NEXT:    psraw $15, %xmm1
; X64-NEXT:    movdqa %xmm1, %xmm4
; X64-NEXT:    pandn %xmm3, %xmm4
; X64-NEXT:    psrlw $1, %xmm3
; X64-NEXT:    pand %xmm1, %xmm3
; X64-NEXT:    por %xmm4, %xmm3
; X64-NEXT:    pcmpeqw %xmm3, %xmm0
; X64-NEXT:    pcmpeqd %xmm1, %xmm1
; X64-NEXT:    pxor %xmm1, %xmm0
; X64-NEXT:    por %xmm2, %xmm0
; X64-NEXT:    retq
;
; X64-AVX2-LABEL: vec_v8i16:
; X64-AVX2:       # %bb.0:
; X64-AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; X64-AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; X64-AVX2-NEXT:    vpsllvd %ymm1, %ymm2, %ymm2
; X64-AVX2-NEXT:    vpshufb {{.*#+}} ymm2 = ymm2[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
; X64-AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
; X64-AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
; X64-AVX2-NEXT:    vpsrlvd %ymm1, %ymm3, %ymm1
; X64-AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm3
; X64-AVX2-NEXT:    vpackusdw %xmm3, %xmm1, %xmm1
; X64-AVX2-NEXT:    vpcmpeqw %xmm1, %xmm0, %xmm0
; X64-AVX2-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
; X64-AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
; X64-AVX2-NEXT:    vpor %xmm2, %xmm0, %xmm0
; X64-AVX2-NEXT:    vzeroupper
; X64-AVX2-NEXT:    retq
;
; X86-LABEL: vec_v8i16:
; X86:       # %bb.0:
; X86-NEXT:    pushl %ebp
; X86-NEXT:    pushl %ebx
; X86-NEXT:    pushl %edi
; X86-NEXT:    pushl %esi
; X86-NEXT:    subl $12, %esp
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    movl %eax, %ebx
; X86-NEXT:    shll %cl, %ebx
; X86-NEXT:    movzwl %bx, %edi
; X86-NEXT:    shrl %cl, %edi
; X86-NEXT:    cmpw %di, %ax
; X86-NEXT:    movl $65535, %eax # imm = 0xFFFF
; X86-NEXT:    cmovnel %eax, %ebx
; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT:    movl %esi, %eax
; X86-NEXT:    movl %edx, %ecx
; X86-NEXT:    shll %cl, %eax
; X86-NEXT:    movzwl %ax, %edi
; X86-NEXT:    shrl %cl, %edi
; X86-NEXT:    cmpw %di, %si
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    movl $65535, %esi # imm = 0xFFFF
; X86-NEXT:    cmovnel %esi, %eax
; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT:    movl %ebp, %eax
; X86-NEXT:    shll %cl, %eax
; X86-NEXT:    movzwl %ax, %edx
; X86-NEXT:    shrl %cl, %edx
; X86-NEXT:    cmpw %dx, %bp
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    cmovnel %esi, %eax
; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
; X86-NEXT:    movl $65535, %eax # imm = 0xFFFF
; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
; X86-NEXT:    movl %esi, %ebp
; X86-NEXT:    shll %cl, %ebp
; X86-NEXT:    movzwl %bp, %edx
; X86-NEXT:    shrl %cl, %edx
; X86-NEXT:    cmpw %dx, %si
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    cmovnel %eax, %ebp
; X86-NEXT:    movl %edx, %ebx
; X86-NEXT:    shll %cl, %ebx
; X86-NEXT:    movzwl %bx, %esi
; X86-NEXT:    shrl %cl, %esi
; X86-NEXT:    cmpw %si, %dx
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-NEXT:    movl $65535, %esi # imm = 0xFFFF
; X86-NEXT:    cmovnel %esi, %ebx
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    movl %edx, %edi
; X86-NEXT:    shll %cl, %edi
; X86-NEXT:    movzwl %di, %eax
; X86-NEXT:    shrl %cl, %eax
; X86-NEXT:    cmpw %ax, %dx
; X86-NEXT:    cmovnel %esi, %edi
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-NEXT:    movl %edx, %esi
; X86-NEXT:    shll %cl, %esi
; X86-NEXT:    movzwl %si, %eax
; X86-NEXT:    shrl %cl, %eax
; X86-NEXT:    cmpw %ax, %dx
; X86-NEXT:    movl $65535, %eax # imm = 0xFFFF
; X86-NEXT:    cmovnel %eax, %esi
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    shll %cl, %eax
; X86-NEXT:    movzwl %ax, %edx
; X86-NEXT:    shrl %cl, %edx
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    cmpw %dx, %cx
; X86-NEXT:    movl $65535, %ecx # imm = 0xFFFF
; X86-NEXT:    cmovnel %ecx, %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    movw %ax, 14(%ecx)
; X86-NEXT:    movw %si, 12(%ecx)
; X86-NEXT:    movw %di, 10(%ecx)
; X86-NEXT:    movw %bx, 8(%ecx)
; X86-NEXT:    movw %bp, 6(%ecx)
; X86-NEXT:    movl (%esp), %eax # 4-byte Reload
; X86-NEXT:    movw %ax, 4(%ecx)
; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT:    movw %ax, 2(%ecx)
; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT:    movw %ax, (%ecx)
; X86-NEXT:    movl %ecx, %eax
; X86-NEXT:    addl $12, %esp
; X86-NEXT:    popl %esi
; X86-NEXT:    popl %edi
; X86-NEXT:    popl %ebx
; X86-NEXT:    popl %ebp
; X86-NEXT:    retl $4
  %tmp = call <8 x i16> @llvm.ushl.sat.v8i16(<8 x i16> %x, <8 x i16> %y)
  ret <8 x i16> %tmp
}

; ushl.sat <16 x i8>: no byte-wise variable shifts on x86, so both SSE2 and
; AVX2 use the psllw $5 / sign-bit blend ladder (one stage per shift bit) for
; the left shift and again for the verification right shift, then the
; round-trip compare saturates mismatched lanes; i686 scalarizes all 16 bytes
; into shlb/shrb round-trips with cmov saturation to 0xFF.
define <16 x i8> @vec_v16i8(<16 x i8> %x, <16 x i8> %y) nounwind {
; X64-LABEL: vec_v16i8:
; X64:       # %bb.0:
; X64-NEXT:    psllw $5, %xmm1
; X64-NEXT:    pxor %xmm3, %xmm3
; X64-NEXT:    pxor %xmm4, %xmm4
; X64-NEXT:    pcmpgtb %xmm1, %xmm4
; X64-NEXT:    movdqa %xmm4, %xmm5
; X64-NEXT:    pandn %xmm0, %xmm5
; X64-NEXT:    movdqa %xmm0, %xmm2
; X64-NEXT:    psllw $4, %xmm2
; X64-NEXT:    pand %xmm4, %xmm2
; X64-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; X64-NEXT:    por %xmm5, %xmm2
; X64-NEXT:    paddb %xmm1, %xmm1
; X64-NEXT:    pxor %xmm5, %xmm5
; X64-NEXT:    pcmpgtb %xmm1, %xmm5
; X64-NEXT:    movdqa %xmm5, %xmm6
; X64-NEXT:    pandn %xmm2, %xmm6
; X64-NEXT:    psllw $2, %xmm2
; X64-NEXT:    pand %xmm5, %xmm2
; X64-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; X64-NEXT:    por %xmm6, %xmm2
; X64-NEXT:    paddb %xmm1, %xmm1
; X64-NEXT:    pcmpgtb %xmm1, %xmm3
; X64-NEXT:    movdqa %xmm3, %xmm1
; X64-NEXT:    pandn %xmm2, %xmm1
; X64-NEXT:    paddb %xmm2, %xmm2
; X64-NEXT:    pand %xmm3, %xmm2
; X64-NEXT:    por %xmm1, %xmm2
; X64-NEXT:    movdqa %xmm2, %xmm1
; X64-NEXT:    psrlw $4, %xmm1
; X64-NEXT:    pand %xmm4, %xmm1
; X64-NEXT:    pandn %xmm2, %xmm4
; X64-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; X64-NEXT:    por %xmm4, %xmm1
; X64-NEXT:    movdqa %xmm5, %xmm4
; X64-NEXT:    pandn %xmm1, %xmm4
; X64-NEXT:    psrlw $2, %xmm1
; X64-NEXT:    pand %xmm5, %xmm1
; X64-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; X64-NEXT:    por %xmm4, %xmm1
; X64-NEXT:    movdqa %xmm3, %xmm4
; X64-NEXT:    pandn %xmm1, %xmm4
; X64-NEXT:    psrlw $1, %xmm1
; X64-NEXT:    pand %xmm3, %xmm1
; X64-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; X64-NEXT:    por %xmm4, %xmm1
; X64-NEXT:    pcmpeqb %xmm1, %xmm0
; X64-NEXT:    pcmpeqd %xmm1, %xmm1
; X64-NEXT:    pxor %xmm1, %xmm0
; X64-NEXT:    por %xmm2, %xmm0
; X64-NEXT:    retq
;
; X64-AVX2-LABEL: vec_v16i8:
; X64-AVX2:       # %bb.0:
; X64-AVX2-NEXT:    vpsllw $5, %xmm1, %xmm1
; X64-AVX2-NEXT:    vpsllw $4, %xmm0, %xmm2
; X64-AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; X64-AVX2-NEXT:    vpblendvb %xmm1, %xmm2, %xmm0, %xmm2
; X64-AVX2-NEXT:    vpsllw $2, %xmm2, %xmm3
; X64-AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3
; X64-AVX2-NEXT:    vpaddb %xmm1, %xmm1, %xmm4
; X64-AVX2-NEXT:    vpblendvb %xmm4, %xmm3, %xmm2, %xmm2
; X64-AVX2-NEXT:    vpaddb %xmm2, %xmm2, %xmm3
; X64-AVX2-NEXT:    vpaddb %xmm4, %xmm4, %xmm5
; X64-AVX2-NEXT:    vpblendvb %xmm5, %xmm3, %xmm2, %xmm2
; X64-AVX2-NEXT:    vpsrlw $4, %xmm2, %xmm3
; X64-AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3
; X64-AVX2-NEXT:    vpblendvb %xmm1, %xmm3, %xmm2, %xmm1
; X64-AVX2-NEXT:    vpsrlw $2, %xmm1, %xmm3
; X64-AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3
; X64-AVX2-NEXT:    vpblendvb %xmm4, %xmm3, %xmm1, %xmm1
; X64-AVX2-NEXT:    vpsrlw $1, %xmm1, %xmm3
; X64-AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3
; X64-AVX2-NEXT:    vpblendvb %xmm5, %xmm3, %xmm1, %xmm1
; X64-AVX2-NEXT:    vpcmpeqb %xmm1, %xmm0, %xmm0
; X64-AVX2-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
; X64-AVX2-NEXT:    vpblendvb %xmm0, %xmm2, %xmm1, %xmm0
; X64-AVX2-NEXT:    retq
;
; X86-LABEL: vec_v16i8:
; X86:       # %bb.0:
; X86-NEXT:    pushl %ebp
; X86-NEXT:    pushl %ebx
; X86-NEXT:    pushl %edi
; X86-NEXT:    pushl %esi
; X86-NEXT:    subl $48, %esp
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
; X86-NEXT:    movb {{[0-9]+}}(%esp), %ch
; X86-NEXT:    movb {{[0-9]+}}(%esp), %ah
; X86-NEXT:    movb {{[0-9]+}}(%esp), %dh
; X86-NEXT:    movb {{[0-9]+}}(%esp), %cl
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ebx
; X86-NEXT:    movb %bl, %bh
; X86-NEXT:    shlb %cl, %bh
; X86-NEXT:    movzbl %bh, %edi
; X86-NEXT:    shrb %cl, %bh
; X86-NEXT:    cmpb %bh, %bl
; X86-NEXT:    movl $255, %esi
; X86-NEXT:    cmovnel %esi, %edi
; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT:    movb %dh, %bl
; X86-NEXT:    movb %ah, %cl
; X86-NEXT:    shlb %cl, %bl
; X86-NEXT:    movzbl %bl, %edi
; X86-NEXT:    shrb %cl, %bl
; X86-NEXT:    cmpb %bl, %dh
; X86-NEXT:    cmovnel %esi, %edi
; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT:    movb %ch, %ah
; X86-NEXT:    movb %dl, %cl
; X86-NEXT:    shlb %cl, %ah
; X86-NEXT:    movzbl %ah, %edi
; X86-NEXT:    shrb %cl, %ah
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
; X86-NEXT:    cmpb %ah, %ch
; X86-NEXT:    cmovnel %esi, %edi
; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT:    movb %dl, %ah
; X86-NEXT:    movl %eax, %ecx
; X86-NEXT:    shlb %cl, %ah
; X86-NEXT:    movzbl %ah, %edi
; X86-NEXT:    shrb %cl, %ah
; X86-NEXT:    cmpb %ah, %dl
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    cmovnel %esi, %edi
; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT:    movl %eax, %edx
; X86-NEXT:    shlb %cl, %dl
; X86-NEXT:    movzbl %dl, %edi
; X86-NEXT:    shrb %cl, %dl
; X86-NEXT:    cmpb %dl, %al
; X86-NEXT:    cmovnel %esi, %edi
; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl %eax, %edx
; X86-NEXT:    shlb %cl, %dl
; X86-NEXT:    movzbl %dl, %edi
; X86-NEXT:    shrb %cl, %dl
; X86-NEXT:    cmpb %dl, %al
; X86-NEXT:    cmovnel %esi, %edi
; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl %eax, %edx
; X86-NEXT:    shlb %cl, %dl
; X86-NEXT:    movzbl %dl, %edi
; X86-NEXT:    shrb %cl, %dl
; X86-NEXT:    cmpb %dl, %al
; X86-NEXT:    cmovnel %esi, %edi
; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl %eax, %edx
; X86-NEXT:    shlb %cl, %dl
; X86-NEXT:    movzbl %dl, %edi
; X86-NEXT:    shrb %cl, %dl
; X86-NEXT:    cmpb %dl, %al
; X86-NEXT:    cmovnel %esi, %edi
; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl %eax, %edx
; X86-NEXT:    shlb %cl, %dl
; X86-NEXT:    movzbl %dl, %edi
; X86-NEXT:    shrb %cl, %dl
; X86-NEXT:    cmpb %dl, %al
; X86-NEXT:    cmovnel %esi, %edi
; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl %eax, %edx
; X86-NEXT:    shlb %cl, %dl
; X86-NEXT:    movzbl %dl, %edi
; X86-NEXT:    shrb %cl, %dl
; X86-NEXT:    cmpb %dl, %al
; X86-NEXT:    cmovnel %esi, %edi
; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl %eax, %edx
; X86-NEXT:    shlb %cl, %dl
; X86-NEXT:    movzbl %dl, %edi
; X86-NEXT:    shrb %cl, %dl
; X86-NEXT:    cmpb %dl, %al
; X86-NEXT:    cmovnel %esi, %edi
; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl %eax, %edx
; X86-NEXT:    shlb %cl, %dl
; X86-NEXT:    movzbl %dl, %edi
; X86-NEXT:    shrb %cl, %dl
; X86-NEXT:    cmpb %dl, %al
; X86-NEXT:    cmovnel %esi, %edi
; X86-NEXT:    movl %edi, (%esp) # 4-byte Spill
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl %eax, %edx
; X86-NEXT:    shlb %cl, %dl
; X86-NEXT:    movzbl %dl, %ebp
; X86-NEXT:    shrb %cl, %dl
; X86-NEXT:    cmpb %dl, %al
; X86-NEXT:    cmovnel %esi, %ebp
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl %eax, %edx
; X86-NEXT:    shlb %cl, %dl
; X86-NEXT:    movzbl %dl, %edi
; X86-NEXT:    shrb %cl, %dl
; X86-NEXT:    cmpb %dl, %al
; X86-NEXT:    cmovnel %esi, %edi
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl %eax, %edx
; X86-NEXT:    shlb %cl, %dl
; X86-NEXT:    movzbl %dl, %ebx
; X86-NEXT:    shrb %cl, %dl
; X86-NEXT:    cmpb %dl, %al
; X86-NEXT:    cmovnel %esi, %ebx
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movb %al, %ah
; X86-NEXT:    shlb %cl, %ah
; X86-NEXT:    movzbl %ah, %edx
; X86-NEXT:    shrb %cl, %ah
; X86-NEXT:    cmpb %ah, %al
; X86-NEXT:    cmovnel %esi, %edx
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movb %dl, 15(%eax)
; X86-NEXT:    movb %bl, 14(%eax)
; X86-NEXT:    movl %edi, %ecx
; X86-NEXT:    movb %cl, 13(%eax)
; X86-NEXT:    movl %ebp, %ecx
; X86-NEXT:    movb %cl, 12(%eax)
; X86-NEXT:    movl (%esp), %ecx # 4-byte Reload
; X86-NEXT:    movb %cl, 11(%eax)
; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NEXT:    movb %cl, 10(%eax)
; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NEXT:    movb %cl, 9(%eax)
; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NEXT:    movb %cl, 8(%eax)
; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NEXT:    movb %cl, 7(%eax)
; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NEXT:    movb %cl, 6(%eax)
; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NEXT:    movb %cl, 5(%eax)
; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NEXT:    movb %cl, 4(%eax)
; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NEXT:    movb %cl, 3(%eax)
; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NEXT:    movb %cl, 2(%eax)
; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NEXT:    movb %cl, 1(%eax)
; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NEXT:    movb %cl, (%eax)
; X86-NEXT:    addl $48, %esp
; X86-NEXT:    popl %esi
; X86-NEXT:    popl %edi
; X86-NEXT:    popl %ebx
; X86-NEXT:    popl %ebp
; X86-NEXT:    retl $4
  %tmp = call <16 x i8> @llvm.ushl.sat.v16i8(<16 x i8> %x, <16 x i8> %y)
  ret <16 x i8> %tmp
}
