; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=CHECK,SSE,SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx  | FileCheck %s --check-prefixes=CHECK,AVX,AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+xop | FileCheck %s --check-prefixes=CHECK,XOP

; fold (udiv x, 1) -> x
define i32 @combine_udiv_by_one(i32 %x) {
; CHECK-LABEL: combine_udiv_by_one:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movl %edi, %eax
; CHECK-NEXT:    retq
  %1 = udiv i32 %x, 1
  ret i32 %1
}

define <4 x i32> @combine_vec_udiv_by_one(<4 x i32> %x) {
; CHECK-LABEL: combine_vec_udiv_by_one:
; CHECK:       # %bb.0:
; CHECK-NEXT:    retq
  %1 = udiv <4 x i32> %x, <i32 1, i32 1, i32 1, i32 1>
  ret <4 x i32> %1
}

; fold (udiv x, -1) -> select((icmp eq x, -1), 1, 0)
define i32 @combine_udiv_by_negone(i32 %x) {
; CHECK-LABEL: combine_udiv_by_negone:
; CHECK:       # %bb.0:
; CHECK-NEXT:    xorl %eax, %eax
; CHECK-NEXT:    cmpl $-1, %edi
; CHECK-NEXT:    sete %al
; CHECK-NEXT:    retq
  %1 = udiv i32 %x, -1
  ret i32 %1
}

define <4 x i32> @combine_vec_udiv_by_negone(<4 x i32> %x) {
; SSE-LABEL: combine_vec_udiv_by_negone:
; SSE:       # %bb.0:
; SSE-NEXT:    pcmpeqd %xmm1, %xmm1
; SSE-NEXT:    pcmpeqd %xmm1, %xmm0
; SSE-NEXT:    psrld $31, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_udiv_by_negone:
; AVX:       # %bb.0:
; AVX-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpsrld $31, %xmm0, %xmm0
; AVX-NEXT:    retq
;
; XOP-LABEL: combine_vec_udiv_by_negone:
; XOP:       # %bb.0:
; XOP-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
; XOP-NEXT:    vpcomeqd %xmm1, %xmm0, %xmm0
; XOP-NEXT:    vpsrld $31, %xmm0, %xmm0
; XOP-NEXT:    retq
  %1 = udiv <4 x i32> %x, <i32 -1, i32 -1, i32 -1, i32 -1>
  ret <4 x i32> %1
}

; fold (udiv x, INT_MIN) -> (srl x, 31)
define i32 @combine_udiv_by_minsigned(i32 %x) {
; CHECK-LABEL: combine_udiv_by_minsigned:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movl %edi, %eax
; CHECK-NEXT:    shrl $31, %eax
; CHECK-NEXT:    retq
  %1 = udiv i32 %x, -2147483648
  ret i32 %1
}

define <4 x i32> @combine_vec_udiv_by_minsigned(<4 x i32> %x) {
; SSE-LABEL: combine_vec_udiv_by_minsigned:
; SSE:       # %bb.0:
; SSE-NEXT:    psrld $31, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_udiv_by_minsigned:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsrld $31, %xmm0, %xmm0
; AVX-NEXT:    retq
;
; XOP-LABEL: combine_vec_udiv_by_minsigned:
; XOP:       # %bb.0:
; XOP-NEXT:    vpsrld $31, %xmm0, %xmm0
; XOP-NEXT:    retq
  %1 = udiv <4 x i32> %x, <i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648>
  ret <4 x i32> %1
}

; fold (udiv 0, x) -> 0
define i32 @combine_udiv_zero(i32 %x) {
; CHECK-LABEL: combine_udiv_zero:
; CHECK:       # %bb.0:
; CHECK-NEXT:    xorl %eax, %eax
; CHECK-NEXT:    retq
  %1 = udiv i32 0, %x
  ret i32 %1
}

define <4 x i32> @combine_vec_udiv_zero(<4 x i32> %x) {
; SSE-LABEL: combine_vec_udiv_zero:
; SSE:       # %bb.0:
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_udiv_zero:
; AVX:       # %bb.0:
; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
;
; XOP-LABEL: combine_vec_udiv_zero:
; XOP:       # %bb.0:
; XOP-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; XOP-NEXT:    retq
  %1 = udiv <4 x i32> zeroinitializer, %x
  ret <4 x i32> %1
}

; fold (udiv x, x) -> 1
define i32 @combine_udiv_dupe(i32 %x) {
; CHECK-LABEL: combine_udiv_dupe:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movl $1, %eax
; CHECK-NEXT:    retq
  %1 = udiv i32 %x, %x
  ret i32 %1
}

define <4 x i32> @combine_vec_udiv_dupe(<4 x i32> %x) {
; SSE-LABEL: combine_vec_udiv_dupe:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps {{.*#+}} xmm0 = [1,1,1,1]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_udiv_dupe:
; AVX:       # %bb.0:
; AVX-NEXT:    vbroadcastss {{.*#+}} xmm0 = [1,1,1,1]
; AVX-NEXT:    retq
;
; XOP-LABEL: combine_vec_udiv_dupe:
; XOP:       # %bb.0:
; XOP-NEXT:    vbroadcastss {{.*#+}} xmm0 = [1,1,1,1]
; XOP-NEXT:    retq
  %1 = udiv <4 x i32> %x, %x
  ret <4 x i32> %1
}

; fold (udiv x, (1 << c)) -> x >>u c
define <4 x i32> @combine_vec_udiv_by_pow2a(<4 x i32> %x) {
; SSE-LABEL: combine_vec_udiv_by_pow2a:
; SSE:       # %bb.0:
; SSE-NEXT:    psrld $2, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_udiv_by_pow2a:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsrld $2, %xmm0, %xmm0
; AVX-NEXT:    retq
;
; XOP-LABEL: combine_vec_udiv_by_pow2a:
; XOP:       # %bb.0:
; XOP-NEXT:    vpsrld $2, %xmm0, %xmm0
; XOP-NEXT:    retq
  %1 = udiv <4 x i32> %x, <i32 4, i32 4, i32 4, i32 4>
  ret <4 x i32> %1
}

define <4 x i32> @combine_vec_udiv_by_pow2b(<4 x i32> %x) {
; SSE2-LABEL: combine_vec_udiv_by_pow2b:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psrld $4, %xmm1
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    psrld $3, %xmm2
; SSE2-NEXT:    punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm1[1]
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psrld $2, %xmm1
; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,3],xmm2[0,3]
; SSE2-NEXT:    retq
;
; SSE41-LABEL: combine_vec_udiv_by_pow2b:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa %xmm0, %xmm1
; SSE41-NEXT:    psrld $4, %xmm1
; SSE41-NEXT:    movdqa %xmm0, %xmm2
; SSE41-NEXT:    psrld $2, %xmm2
; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7]
; SSE41-NEXT:    movdqa %xmm0, %xmm1
; SSE41-NEXT:    psrld $3, %xmm1
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; SSE41-NEXT:    retq
;
; AVX1-LABEL: combine_vec_udiv_by_pow2b:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpsrld $4, %xmm0, %xmm1
; AVX1-NEXT:    vpsrld $2, %xmm0, %xmm2
; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
; AVX1-NEXT:    vpsrld $3, %xmm0, %xmm2
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: combine_vec_udiv_by_pow2b:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-NEXT:    retq
;
; XOP-LABEL: combine_vec_udiv_by_pow2b:
; XOP:       # %bb.0:
; XOP-NEXT:    vpshld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; XOP-NEXT:    retq
  %1 = udiv <4 x i32> %x, <i32 1, i32 4, i32 8, i32 16>
  ret <4 x i32> %1
}

define <4 x i32> @combine_vec_udiv_by_pow2c(<4 x i32> %x, <4 x i32> %y) {
; SSE2-LABEL: combine_vec_udiv_by_pow2c:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7]
; SSE2-NEXT:    movdqa %xmm0, %xmm3
; SSE2-NEXT:    psrld %xmm2, %xmm3
; SSE2-NEXT:    pshuflw {{.*#+}} xmm4 = xmm1[0,1,1,1,4,5,6,7]
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    psrld %xmm4, %xmm2
; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm3 = xmm1[2,3,3,3,4,5,6,7]
; SSE2-NEXT:    movdqa %xmm0, %xmm4
; SSE2-NEXT:    psrld %xmm3, %xmm4
; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7]
; SSE2-NEXT:    psrld %xmm1, %xmm0
; SSE2-NEXT:    punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm4[1]
; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[0,3]
; SSE2-NEXT:    movaps %xmm2, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: combine_vec_udiv_by_pow2c:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7]
; SSE41-NEXT:    movdqa %xmm0, %xmm3
; SSE41-NEXT:    psrld %xmm2, %xmm3
; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
; SSE41-NEXT:    pshuflw {{.*#+}} xmm4 = xmm2[2,3,3,3,4,5,6,7]
; SSE41-NEXT:    movdqa %xmm0, %xmm5
; SSE41-NEXT:    psrld %xmm4, %xmm5
; SSE41-NEXT:    pblendw {{.*#+}} xmm5 = xmm3[0,1,2,3],xmm5[4,5,6,7]
; SSE41-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7]
; SSE41-NEXT:    movdqa %xmm0, %xmm3
; SSE41-NEXT:    psrld %xmm1, %xmm3
; SSE41-NEXT:    pshuflw {{.*#+}} xmm1 = xmm2[0,1,1,1,4,5,6,7]
; SSE41-NEXT:    psrld %xmm1, %xmm0
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7]
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3],xmm0[4,5],xmm5[6,7]
; SSE41-NEXT:    retq
;
; AVX1-LABEL: combine_vec_udiv_by_pow2c:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpsrldq {{.*#+}} xmm2 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT:    vpsrld %xmm2, %xmm0, %xmm2
; AVX1-NEXT:    vpsrlq $32, %xmm1, %xmm3
; AVX1-NEXT:    vpsrld %xmm3, %xmm0, %xmm3
; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm3 = xmm1[2],xmm3[2],xmm1[3],xmm3[3]
; AVX1-NEXT:    vpsrld %xmm3, %xmm0, %xmm3
; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; AVX1-NEXT:    vpsrld %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7]
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: combine_vec_udiv_by_pow2c:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsrlvd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    retq
;
; XOP-LABEL: combine_vec_udiv_by_pow2c:
; XOP:       # %bb.0:
; XOP-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; XOP-NEXT:    vpsubd %xmm1, %xmm2, %xmm1
; XOP-NEXT:    vpshld %xmm1, %xmm0, %xmm0
; XOP-NEXT:    retq
  %1 = shl <4 x i32> <i32 1, i32 1, i32 1, i32 1>, %y
  %2 = udiv <4 x i32> %x, %1
  ret <4 x i32> %2
}

; fold (udiv x, (shl c, y)) -> x >>u (log2(c)+y) iff c is power of 2
define <4 x i32> @combine_vec_udiv_by_shl_pow2a(<4 x i32> %x, <4 x i32> %y) {
; SSE2-LABEL: combine_vec_udiv_by_shl_pow2a:
; SSE2:       # %bb.0:
; SSE2-NEXT:    paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7]
; SSE2-NEXT:    movdqa %xmm0, %xmm3
; SSE2-NEXT:    psrld %xmm2, %xmm3
; SSE2-NEXT:    pshuflw {{.*#+}} xmm4 = xmm1[0,1,1,1,4,5,6,7]
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    psrld %xmm4, %xmm2
; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm3 = xmm1[2,3,3,3,4,5,6,7]
; SSE2-NEXT:    movdqa %xmm0, %xmm4
; SSE2-NEXT:    psrld %xmm3, %xmm4
; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7]
; SSE2-NEXT:    psrld %xmm1, %xmm0
; SSE2-NEXT:    punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm4[1]
; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[0,3]
; SSE2-NEXT:    movaps %xmm2, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: combine_vec_udiv_by_shl_pow2a:
; SSE41:       # %bb.0:
; SSE41-NEXT:    paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE41-NEXT:    pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7]
; SSE41-NEXT:    movdqa %xmm0, %xmm3
; SSE41-NEXT:    psrld %xmm2, %xmm3
; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
; SSE41-NEXT:    pshuflw {{.*#+}} xmm4 = xmm2[2,3,3,3,4,5,6,7]
; SSE41-NEXT:    movdqa %xmm0, %xmm5
; SSE41-NEXT:    psrld %xmm4, %xmm5
; SSE41-NEXT:    pblendw {{.*#+}} xmm5 = xmm3[0,1,2,3],xmm5[4,5,6,7]
; SSE41-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7]
; SSE41-NEXT:    movdqa %xmm0, %xmm3
; SSE41-NEXT:    psrld %xmm1, %xmm3
; SSE41-NEXT:    pshuflw {{.*#+}} xmm1 = xmm2[0,1,1,1,4,5,6,7]
; SSE41-NEXT:    psrld %xmm1, %xmm0
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7]
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3],xmm0[4,5],xmm5[6,7]
; SSE41-NEXT:    retq
;
; AVX1-LABEL: combine_vec_udiv_by_shl_pow2a:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX1-NEXT:    vpsrldq {{.*#+}} xmm2 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT:    vpsrld %xmm2, %xmm0, %xmm2
; AVX1-NEXT:    vpsrlq $32, %xmm1, %xmm3
; AVX1-NEXT:    vpsrld %xmm3, %xmm0, %xmm3
; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm3 = xmm1[2],xmm3[2],xmm1[3],xmm3[3]
; AVX1-NEXT:    vpsrld %xmm3, %xmm0, %xmm3
; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; AVX1-NEXT:    vpsrld %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7]
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: combine_vec_udiv_by_shl_pow2a:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [2,2,2,2]
; AVX2-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vpsrlvd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    retq
;
; XOP-LABEL: combine_vec_udiv_by_shl_pow2a:
; XOP:       # %bb.0:
; XOP-NEXT:    vbroadcastss {{.*#+}} xmm2 = [4294967294,4294967294,4294967294,4294967294]
; XOP-NEXT:    vpsubd %xmm1, %xmm2, %xmm1
; XOP-NEXT:    vpshld %xmm1, %xmm0, %xmm0
; XOP-NEXT:    retq
  %1 = shl <4 x i32> <i32 4, i32 4, i32 4, i32 4>, %y
  %2 = udiv <4 x i32> %x, %1
  ret <4 x i32> %2
}

define <4 x i32> @combine_vec_udiv_by_shl_pow2b(<4 x i32> %x, <4 x i32> %y) {
; SSE2-LABEL: combine_vec_udiv_by_shl_pow2b:
; SSE2:       # %bb.0:
; SSE2-NEXT:    paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7]
; SSE2-NEXT:    movdqa %xmm0, %xmm3
; SSE2-NEXT:    psrld %xmm2, %xmm3
; SSE2-NEXT:    pshuflw {{.*#+}} xmm4 = xmm1[0,1,1,1,4,5,6,7]
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    psrld %xmm4, %xmm2
; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm3 = xmm1[2,3,3,3,4,5,6,7]
; SSE2-NEXT:    movdqa %xmm0, %xmm4
; SSE2-NEXT:    psrld %xmm3, %xmm4
; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7]
; SSE2-NEXT:    psrld %xmm1, %xmm0
; SSE2-NEXT:    punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm4[1]
; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[0,3]
; SSE2-NEXT:    movaps %xmm2, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: combine_vec_udiv_by_shl_pow2b:
; SSE41:       # %bb.0:
; SSE41-NEXT:    paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE41-NEXT:    pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7]
; SSE41-NEXT:    movdqa %xmm0, %xmm3
; SSE41-NEXT:    psrld %xmm2, %xmm3
; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
; SSE41-NEXT:    pshuflw {{.*#+}} xmm4 = xmm2[2,3,3,3,4,5,6,7]
; SSE41-NEXT:    movdqa %xmm0, %xmm5
; SSE41-NEXT:    psrld %xmm4, %xmm5
; SSE41-NEXT:    pblendw {{.*#+}} xmm5 = xmm3[0,1,2,3],xmm5[4,5,6,7]
; SSE41-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7]
; SSE41-NEXT:    movdqa %xmm0, %xmm3
; SSE41-NEXT:    psrld %xmm1, %xmm3
; SSE41-NEXT:    pshuflw {{.*#+}} xmm1 = xmm2[0,1,1,1,4,5,6,7]
; SSE41-NEXT:    psrld %xmm1, %xmm0
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7]
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3],xmm0[4,5],xmm5[6,7]
; SSE41-NEXT:    retq
;
; AVX1-LABEL: combine_vec_udiv_by_shl_pow2b:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2
; AVX1-NEXT:    vpsrldq {{.*#+}} xmm3 = xmm2[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT:    vpsrld %xmm3, %xmm0, %xmm3
; AVX1-NEXT:    vpsrlq $32, %xmm2, %xmm4
; AVX1-NEXT:    vpsrld %xmm4, %xmm0, %xmm4
; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; AVX1-NEXT:    vpsrld %xmm1, %xmm0, %xmm1
; AVX1-NEXT:    vpxor %xmm4, %xmm4, %xmm4
; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm2 = xmm2[2],xmm4[2],xmm2[3],xmm4[3]
; AVX1-NEXT:    vpsrld %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5],xmm3[6,7]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: combine_vec_udiv_by_shl_pow2b:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX2-NEXT:    vpsrlvd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    retq
;
; XOP-LABEL: combine_vec_udiv_by_shl_pow2b:
; XOP:       # %bb.0:
; XOP-NEXT:    vpmovsxbd {{.*#+}} xmm2 = [0,4294967294,4294967293,4294967292]
; XOP-NEXT:    vpsubd %xmm1, %xmm2, %xmm1
; XOP-NEXT:    vpshld %xmm1, %xmm0, %xmm0
; XOP-NEXT:    retq
  %1 = shl <4 x i32> <i32 1, i32 4, i32 8, i32 16>, %y
  %2 = udiv <4 x i32> %x, %1
  ret <4 x i32> %2
}

; fold (udiv x, c1) -> multiply-high by a magic constant plus shifts, for a general constant c1
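; In the scalar case below, the magic constant 0xB21642C9 = 2987803337 equals
; ceil(2^36 / 23), so (zext(x) * 2987803337) >> 36 == x udiv 23 for every
; 32-bit x; the imulq/shrq pair in the checks is this round-up magic-number
; scheme (see Hacker's Delight, "Integer Division by Constants").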
define i32 @combine_udiv_uniform(i32 %x) {
; CHECK-LABEL: combine_udiv_uniform:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movl %edi, %ecx
; CHECK-NEXT:    movl $2987803337, %eax # imm = 0xB21642C9
; CHECK-NEXT:    imulq %rcx, %rax
; CHECK-NEXT:    shrq $36, %rax
; CHECK-NEXT:    # kill: def $eax killed $eax killed $rax
; CHECK-NEXT:    retq
  %1 = udiv i32 %x, 23
  ret i32 %1
}

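; For the <8 x i16> splat of 23, the magic constant ceil(2^21 / 23) = 91181
; does not fit in 16 bits, so the expansion splits it: with t = mulhu(x, 25645),
; ((x - t) >> 1) + t recovers floor(x * 91181 / 2^17) without overflowing
; 16 bits, and the final shift by 4 then yields x udiv 23.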
define <8 x i16> @combine_vec_udiv_uniform(<8 x i16> %x) {
; SSE-LABEL: combine_vec_udiv_uniform:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa {{.*#+}} xmm1 = [25645,25645,25645,25645,25645,25645,25645,25645]
; SSE-NEXT:    pmulhuw %xmm0, %xmm1
; SSE-NEXT:    psubw %xmm1, %xmm0
; SSE-NEXT:    psrlw $1, %xmm0
; SSE-NEXT:    paddw %xmm1, %xmm0
; SSE-NEXT:    psrlw $4, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_udiv_uniform:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [25645,25645,25645,25645,25645,25645,25645,25645]
; AVX-NEXT:    vpsubw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpsrlw $1, %xmm0, %xmm0
; AVX-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpsrlw $4, %xmm0, %xmm0
; AVX-NEXT:    retq
;
; XOP-LABEL: combine_vec_udiv_uniform:
; XOP:       # %bb.0:
; XOP-NEXT:    vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [25645,25645,25645,25645,25645,25645,25645,25645]
; XOP-NEXT:    vpsubw %xmm1, %xmm0, %xmm0
; XOP-NEXT:    vpsrlw $1, %xmm0, %xmm0
; XOP-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; XOP-NEXT:    vpsrlw $4, %xmm0, %xmm0
; XOP-NEXT:    retq
  %1 = udiv <8 x i16> %x, <i16 23, i16 23, i16 23, i16 23, i16 23, i16 23, i16 23, i16 23>
  ret <8 x i16> %1
}

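; Non-uniform constant divisors use the same magic-number scheme, but the
; pre-shift, multiplier and post-shift are chosen per lane, and lanes that
; need different handling are blended back together.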
define <8 x i16> @combine_vec_udiv_nonuniform(<8 x i16> %x) {
; SSE2-LABEL: combine_vec_udiv_nonuniform:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [65535,65535,65535,0,65535,65535,65535,65535]
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    pand %xmm1, %xmm2
; SSE2-NEXT:    movdqa %xmm0, %xmm3
; SSE2-NEXT:    psrlw $3, %xmm3
; SSE2-NEXT:    pandn %xmm3, %xmm1
; SSE2-NEXT:    por %xmm2, %xmm1
; SSE2-NEXT:    pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [25645,61681,8195,9363,512,32769,32897,2]
; SSE2-NEXT:    psubw %xmm1, %xmm0
; SSE2-NEXT:    pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT:    paddw %xmm1, %xmm0
; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [65535,65535,65535,0,0,65535,65535,0]
; SSE2-NEXT:    pandn %xmm0, %xmm1
; SSE2-NEXT:    pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT:    por %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: combine_vec_udiv_nonuniform:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa %xmm0, %xmm1
; SSE41-NEXT:    psrlw $3, %xmm1
; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0,1,2],xmm1[3],xmm0[4,5,6,7]
; SSE41-NEXT:    pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [25645,61681,8195,9363,512,32769,32897,2]
; SSE41-NEXT:    psubw %xmm1, %xmm0
; SSE41-NEXT:    pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE41-NEXT:    paddw %xmm1, %xmm0
; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [4096,2048,8,u,u,2,2,u]
; SSE41-NEXT:    pmulhuw %xmm0, %xmm1
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3,4],xmm1[5,6],xmm0[7]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_vec_udiv_nonuniform:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsrlw $3, %xmm0, %xmm1
; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0,1,2],xmm1[3],xmm0[4,5,6,7]
; AVX-NEXT:    vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [25645,61681,8195,9363,512,32769,32897,2]
; AVX-NEXT:    vpsubw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [4096,2048,8,u,u,2,2,u]
; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3,4],xmm1[5,6],xmm0[7]
; AVX-NEXT:    retq
;
; XOP-LABEL: combine_vec_udiv_nonuniform:
; XOP:       # %bb.0:
; XOP-NEXT:    vpshlw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; XOP-NEXT:    vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [25645,61681,8195,9363,512,32769,32897,2]
; XOP-NEXT:    vpsubw %xmm1, %xmm0, %xmm0
; XOP-NEXT:    vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; XOP-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; XOP-NEXT:    vpshlw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; XOP-NEXT:    retq
  %1 = udiv <8 x i16> %x, <i16 23, i16 34, i16 -23, i16 56, i16 128, i16 -1, i16 -256, i16 -32768>
  ret <8 x i16> %1
}

define <8 x i16> @combine_vec_udiv_nonuniform2(<8 x i16> %x) {
; SSE2-LABEL: combine_vec_udiv_nonuniform2:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [0,65535,65535,65535,65535,65535,65535,65535]
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    pand %xmm1, %xmm2
; SSE2-NEXT:    psrlw $1, %xmm0
; SSE2-NEXT:    pandn %xmm0, %xmm1
; SSE2-NEXT:    por %xmm2, %xmm1
; SSE2-NEXT:    pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [16393,59919,58255,32787,55189,8197,52429,32789]
; SSE2-NEXT:    pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [8,2048,2048,2,2048,8,2048,2]
; SSE2-NEXT:    movdqa %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: combine_vec_udiv_nonuniform2:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa %xmm0, %xmm1
; SSE41-NEXT:    psrlw $1, %xmm1
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4,5,6,7]
; SSE41-NEXT:    pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [16393,59919,58255,32787,55189,8197,52429,32789]
; SSE41-NEXT:    pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [8,2048,2048,2,2048,8,2048,2]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_vec_udiv_nonuniform2:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsrlw $1, %xmm0, %xmm1
; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4,5,6,7]
; AVX-NEXT:    vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [16393,59919,58255,32787,55189,8197,52429,32789]
; AVX-NEXT:    vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [8,2048,2048,2,2048,8,2048,2]
; AVX-NEXT:    retq
;
; XOP-LABEL: combine_vec_udiv_nonuniform2:
; XOP:       # %bb.0:
; XOP-NEXT:    vpshlw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; XOP-NEXT:    vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [16393,59919,58255,32787,55189,8197,52429,32789]
; XOP-NEXT:    vpshlw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; XOP-NEXT:    retq
  %1 = udiv <8 x i16> %x, <i16 -34, i16 35, i16 36, i16 -37, i16 38, i16 -39, i16 40, i16 -41>
  ret <8 x i16> %1
}

define <8 x i16> @combine_vec_udiv_nonuniform3(<8 x i16> %x) {
; SSE-LABEL: combine_vec_udiv_nonuniform3:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa {{.*#+}} xmm1 = [9363,25645,18351,12137,2115,23705,1041,517]
; SSE-NEXT:    pmulhuw %xmm0, %xmm1
; SSE-NEXT:    psubw %xmm1, %xmm0
; SSE-NEXT:    psrlw $1, %xmm0
; SSE-NEXT:    paddw %xmm1, %xmm0
; SSE-NEXT:    pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [16384,4096,4096,4096,4096,2048,2048,1024]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_udiv_nonuniform3:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [9363,25645,18351,12137,2115,23705,1041,517]
; AVX-NEXT:    vpsubw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpsrlw $1, %xmm0, %xmm0
; AVX-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [16384,4096,4096,4096,4096,2048,2048,1024]
; AVX-NEXT:    retq
;
; XOP-LABEL: combine_vec_udiv_nonuniform3:
; XOP:       # %bb.0:
; XOP-NEXT:    vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [9363,25645,18351,12137,2115,23705,1041,517]
; XOP-NEXT:    vpsubw %xmm1, %xmm0, %xmm0
; XOP-NEXT:    vpsrlw $1, %xmm0, %xmm0
; XOP-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; XOP-NEXT:    vpshlw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; XOP-NEXT:    retq
  %1 = udiv <8 x i16> %x, <i16 7, i16 23, i16 25, i16 27, i16 31, i16 47, i16 63, i16 127>
  ret <8 x i16> %1
}

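; Only lane 0 below divides by something other than 1 (-64, i.e. 192 unsigned);
; for an 8-bit input that quotient is 1 exactly when x >= 192, which the
; multiply by 171 plus shift computes, and every other lane is passed straight
; through by the final blend.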
define <16 x i8> @combine_vec_udiv_nonuniform4(<16 x i8> %x) {
; SSE2-LABEL: combine_vec_udiv_nonuniform4:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    pand %xmm1, %xmm2
; SSE2-NEXT:    pxor %xmm3, %xmm3
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
; SSE2-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT:    psrlw $15, %xmm0
; SSE2-NEXT:    pandn %xmm0, %xmm1
; SSE2-NEXT:    por %xmm2, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: combine_vec_udiv_nonuniform4:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa %xmm0, %xmm1
; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; SSE41-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE41-NEXT:    psrlw $8, %xmm2
; SSE41-NEXT:    packuswb %xmm2, %xmm2
; SSE41-NEXT:    psrlw $7, %xmm2
; SSE41-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE41-NEXT:    movaps {{.*#+}} xmm0 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; SSE41-NEXT:    pblendvb %xmm0, %xmm1, %xmm2
; SSE41-NEXT:    movdqa %xmm2, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_vec_udiv_nonuniform4:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX-NEXT:    vpsrlw $8, %xmm1, %xmm1
; AVX-NEXT:    vpackuswb %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vpsrlw $7, %xmm1, %xmm1
; AVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX-NEXT:    vpmovsxwq {{.*#+}} xmm2 = [18446744073709551360,18446744073709551615]
; AVX-NEXT:    vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
; AVX-NEXT:    retq
;
; XOP-LABEL: combine_vec_udiv_nonuniform4:
; XOP:       # %bb.0:
; XOP-NEXT:    movl $171, %eax
; XOP-NEXT:    vmovd %eax, %xmm1
; XOP-NEXT:    vpmovzxbw {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; XOP-NEXT:    vpmullw %xmm1, %xmm2, %xmm1
; XOP-NEXT:    vpsrlw $8, %xmm1, %xmm1
; XOP-NEXT:    movl $249, %eax
; XOP-NEXT:    vmovd %eax, %xmm2
; XOP-NEXT:    vpshlb %xmm2, %xmm1, %xmm1
; XOP-NEXT:    vpmovsxwq {{.*#+}} xmm2 = [18446744073709551360,18446744073709551615]
; XOP-NEXT:    vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
; XOP-NEXT:    retq
  %div = udiv <16 x i8> %x, <i8 -64, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
  ret <16 x i8> %div
}

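; pr38477 covers a non-uniform udiv whose lane 0 divisor is 1: that lane must
; come through from the input unchanged, as the final blend (or mask) of the
; original lane 0 in the checks verifies.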
define <8 x i16> @pr38477(<8 x i16> %a0) {
; SSE2-LABEL: pr38477:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [0,65535,65535,65535,65535,65535,65535,65535]
; SSE2-NEXT:    movdqa %xmm1, %xmm2
; SSE2-NEXT:    pandn %xmm0, %xmm2
; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [u,4957,57457,4103,16385,35545,2048,2115]
; SSE2-NEXT:    pmulhuw %xmm0, %xmm3
; SSE2-NEXT:    psubw %xmm3, %xmm0
; SSE2-NEXT:    pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [u,32768,0,0,0,0,0,32768]
; SSE2-NEXT:    paddw %xmm3, %xmm0
; SSE2-NEXT:    pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
; SSE2-NEXT:    por %xmm3, %xmm0
; SSE2-NEXT:    pand %xmm1, %xmm0
; SSE2-NEXT:    por %xmm2, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: pr38477:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [u,4957,57457,4103,16385,35545,2048,2115]
; SSE41-NEXT:    pmulhuw %xmm0, %xmm1
; SSE41-NEXT:    movdqa %xmm0, %xmm2
; SSE41-NEXT:    psubw %xmm1, %xmm2
; SSE41-NEXT:    pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [u,32768,0,0,0,0,0,32768]
; SSE41-NEXT:    paddw %xmm1, %xmm2
; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [u,1024,1024,16,4,1024,u,4096]
; SSE41-NEXT:    pmulhuw %xmm2, %xmm1
; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm2[6],xmm1[7]
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
; SSE41-NEXT:    retq
;
; AVX-LABEL: pr38477:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [u,4957,57457,4103,16385,35545,2048,2115]
; AVX-NEXT:    vpsubw %xmm1, %xmm0, %xmm2
; AVX-NEXT:    vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 # [u,32768,0,0,0,0,0,32768]
; AVX-NEXT:    vpaddw %xmm1, %xmm2, %xmm1
; AVX-NEXT:    vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2 # [u,1024,1024,16,4,1024,u,4096]
; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,5],xmm1[6],xmm2[7]
; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
; AVX-NEXT:    retq
;
; XOP-LABEL: pr38477:
; XOP:       # %bb.0:
; XOP-NEXT:    vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [u,4957,57457,4103,16385,35545,2048,2115]
; XOP-NEXT:    vpsubw %xmm1, %xmm0, %xmm2
; XOP-NEXT:    vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 # [u,32768,0,0,0,0,0,32768]
; XOP-NEXT:    vpaddw %xmm1, %xmm2, %xmm1
; XOP-NEXT:    vpshlw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; XOP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
; XOP-NEXT:    retq
  %1 = udiv <8 x i16> %a0, <i16 1, i16 119, i16 73, i16 -111, i16 -3, i16 118, i16 32, i16 31>
  ret <8 x i16> %1
}

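; For i1 and vectors of i1 the divisor can only be 1, since udiv by zero is
; undefined, so the division folds away and %x is returned unchanged.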
define i1 @bool_udiv(i1 %x, i1 %y) {
; CHECK-LABEL: bool_udiv:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movl %edi, %eax
; CHECK-NEXT:    # kill: def $al killed $al killed $eax
; CHECK-NEXT:    retq
  %r = udiv i1 %x, %y
  ret i1 %r
}

define <4 x i1> @boolvec_udiv(<4 x i1> %x, <4 x i1> %y) {
; CHECK-LABEL: boolvec_udiv:
; CHECK:       # %bb.0:
; CHECK-NEXT:    retq
  %r = udiv <4 x i1> %x, %y
  ret <4 x i1> %r
}

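; With the operands masked down to 8 bits, the udiv by 7 needs no post-shift
; or fixup: 613566757 = ceil(2^32 / 7) and (x * 613566757) >> 32 == x udiv 7
; for all x < 256, which is why a pair of pmuludq ops and a shuffle suffice.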
define <4 x i32> @vector_div_leading_zeros(<4 x i32> %x) {
; SSE2-LABEL: vector_div_leading_zeros:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [613566757,613566757,613566757,613566757]
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm1, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
; SSE2-NEXT:    pmuludq %xmm1, %xmm2
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT:    retq
;
; SSE41-LABEL: vector_div_leading_zeros:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [613566757,613566757,613566757,613566757]
; SSE41-NEXT:    pmuludq %xmm2, %xmm1
; SSE41-NEXT:    pmuludq %xmm2, %xmm0
; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
; SSE41-NEXT:    retq
;
; AVX1-LABEL: vector_div_leading_zeros:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm2 = [613566757,613566757,613566757,613566757]
; AVX1-NEXT:    vpmuludq %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpmuludq %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: vector_div_leading_zeros:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [613566757,613566757,613566757,613566757]
; AVX2-NEXT:    vpmuludq %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vpmuludq %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
; AVX2-NEXT:    retq
;
; XOP-LABEL: vector_div_leading_zeros:
; XOP:       # %bb.0:
; XOP-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; XOP-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; XOP-NEXT:    vbroadcastss {{.*#+}} xmm2 = [613566757,613566757,613566757,613566757]
; XOP-NEXT:    vpmuludq %xmm2, %xmm1, %xmm1
; XOP-NEXT:    vpmuludq %xmm2, %xmm0, %xmm0
; XOP-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
; XOP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
; XOP-NEXT:    retq
  %a = and <4 x i32> %x, <i32 255, i32 255, i32 255, i32 255>
  %b = udiv <4 x i32> %a, <i32 7, i32 7, i32 7, i32 7>
  ret <4 x i32> %b
}