; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-linux-gnu | FileCheck %s --check-prefixes=X86
; RUN: llc < %s -mtriple=x86_64-linux-gnu | FileCheck %s --check-prefixes=X64

; fold (shl (zext (lshr (A, X))), X) -> (zext (shl (lshr (A, X)), X))

; Canonicalize the sequence shl/zext/lshr so that the zero extend is
; performed as the last instruction of the sequence.
; This helps DAGCombiner identify and then fold the sequence
; of shifts into a single AND.
; This transformation is profitable if the shift amounts are the same
; and if there is only one use of the zext.
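;
; For example (a sketch of the arithmetic, not a FileCheck assertion): in fun1
; below, (shl (zext i16 (lshr i8 %v, 4)), 4) keeps exactly bits 4..7 of %v and
; clears everything else, so it is equivalent to (and (zext i16 %v), 0xF0) and
; lowers to the single 'and' seen in the checks.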

define i16 @fun1(i8 zeroext %v) {
; X86-LABEL: fun1:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    andl $-16, %eax
; X86-NEXT:    # kill: def $ax killed $ax killed $eax
; X86-NEXT:    retl
;
; X64-LABEL: fun1:
; X64:       # %bb.0: # %entry
; X64-NEXT:    movl %edi, %eax
; X64-NEXT:    andl $-16, %eax
; X64-NEXT:    # kill: def $ax killed $ax killed $eax
; X64-NEXT:    retq
entry:
  %shr = lshr i8 %v, 4
  %ext = zext i8 %shr to i16
  %shl = shl i16 %ext, 4
  ret i16 %shl
}

define i32 @fun2(i8 zeroext %v) {
; X86-LABEL: fun2:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    andl $-16, %eax
; X86-NEXT:    retl
;
; X64-LABEL: fun2:
; X64:       # %bb.0: # %entry
; X64-NEXT:    movl %edi, %eax
; X64-NEXT:    andl $-16, %eax
; X64-NEXT:    retq
entry:
  %shr = lshr i8 %v, 4
  %ext = zext i8 %shr to i32
  %shl = shl i32 %ext, 4
  ret i32 %shl
}

define i32 @fun3(i16 zeroext %v) {
; X86-LABEL: fun3:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    andl $-16, %eax
; X86-NEXT:    retl
;
; X64-LABEL: fun3:
; X64:       # %bb.0: # %entry
; X64-NEXT:    movl %edi, %eax
; X64-NEXT:    andl $-16, %eax
; X64-NEXT:    retq
entry:
  %shr = lshr i16 %v, 4
  %ext = zext i16 %shr to i32
  %shl = shl i32 %ext, 4
  ret i32 %shl
}

define i64 @fun4(i8 zeroext %v) {
; X86-LABEL: fun4:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    andl $-16, %eax
; X86-NEXT:    xorl %edx, %edx
; X86-NEXT:    retl
;
; X64-LABEL: fun4:
; X64:       # %bb.0: # %entry
; X64-NEXT:    movl %edi, %eax
; X64-NEXT:    andl $-16, %eax
; X64-NEXT:    retq
entry:
  %shr = lshr i8 %v, 4
  %ext = zext i8 %shr to i64
  %shl = shl i64 %ext, 4
  ret i64 %shl
}

define i64 @fun5(i16 zeroext %v) {
; X86-LABEL: fun5:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    andl $-16, %eax
; X86-NEXT:    xorl %edx, %edx
; X86-NEXT:    retl
;
; X64-LABEL: fun5:
; X64:       # %bb.0: # %entry
; X64-NEXT:    movl %edi, %eax
; X64-NEXT:    andl $-16, %eax
; X64-NEXT:    retq
entry:
  %shr = lshr i16 %v, 4
  %ext = zext i16 %shr to i64
  %shl = shl i64 %ext, 4
  ret i64 %shl
}

define i64 @fun6(i32 zeroext %v) {
; X86-LABEL: fun6:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    andl $-16, %eax
; X86-NEXT:    xorl %edx, %edx
; X86-NEXT:    retl
;
; X64-LABEL: fun6:
; X64:       # %bb.0: # %entry
; X64-NEXT:    movl %edi, %eax
; X64-NEXT:    andl $-16, %eax
; X64-NEXT:    retq
entry:
  %shr = lshr i32 %v, 4
  %ext = zext i32 %shr to i64
  %shl = shl i64 %ext, 4
  ret i64 %shl
}

; Don't fold the pattern if we use arithmetic shifts.
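; (An arithmetic shift fills the vacated bits with copies of the sign bit, so
; the sequence is not equivalent to a simple mask of the original value and
; the fold above does not apply.)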

define i64 @fun7(i8 zeroext %v) {
; X86-LABEL: fun7:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    sarb $4, %al
; X86-NEXT:    movzbl %al, %eax
; X86-NEXT:    shll $4, %eax
; X86-NEXT:    xorl %edx, %edx
; X86-NEXT:    retl
;
; X64-LABEL: fun7:
; X64:       # %bb.0: # %entry
; X64-NEXT:    sarb $4, %dil
; X64-NEXT:    movzbl %dil, %eax
; X64-NEXT:    shll $4, %eax
; X64-NEXT:    retq
entry:
  %shr = ashr i8 %v, 4
  %ext = zext i8 %shr to i64
  %shl = shl i64 %ext, 4
  ret i64 %shl
}

define i64 @fun8(i16 zeroext %v) {
; X86-LABEL: fun8:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movswl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    andl $1048560, %eax # imm = 0xFFFF0
; X86-NEXT:    xorl %edx, %edx
; X86-NEXT:    retl
;
; X64-LABEL: fun8:
; X64:       # %bb.0: # %entry
; X64-NEXT:    movswl %di, %eax
; X64-NEXT:    andl $1048560, %eax # imm = 0xFFFF0
; X64-NEXT:    retq
entry:
  %shr = ashr i16 %v, 4
  %ext = zext i16 %shr to i64
  %shl = shl i64 %ext, 4
  ret i64 %shl
}

define i64 @fun9(i32 zeroext %v) {
; X86-LABEL: fun9:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl %eax, %edx
; X86-NEXT:    sarl $4, %edx
; X86-NEXT:    andl $-16, %eax
; X86-NEXT:    shrl $28, %edx
; X86-NEXT:    retl
;
; X64-LABEL: fun9:
; X64:       # %bb.0: # %entry
; X64-NEXT:    movl %edi, %eax
; X64-NEXT:    sarl $4, %eax
; X64-NEXT:    shlq $4, %rax
; X64-NEXT:    retq
entry:
  %shr = ashr i32 %v, 4
  %ext = zext i32 %shr to i64
  %shl = shl i64 %ext, 4
  ret i64 %shl
}

; Don't fold the pattern if there is more than one use of the operand
; feeding the shift left.
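; In fun10-fun12 below, %ext feeds both the shl and the add, so the zext has
; two uses and the canonicalization is skipped.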

define i64 @fun10(i8 zeroext %v) {
; X86-LABEL: fun10:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    shrb $4, %al
; X86-NEXT:    movzbl %al, %ecx
; X86-NEXT:    movl %ecx, %eax
; X86-NEXT:    shll $4, %eax
; X86-NEXT:    orl %ecx, %eax
; X86-NEXT:    xorl %edx, %edx
; X86-NEXT:    retl
;
; X64-LABEL: fun10:
; X64:       # %bb.0: # %entry
; X64-NEXT:    # kill: def $edi killed $edi def $rdi
; X64-NEXT:    movl %edi, %eax
; X64-NEXT:    shrb $4, %al
; X64-NEXT:    movzbl %al, %eax
; X64-NEXT:    andl $-16, %edi
; X64-NEXT:    orq %rdi, %rax
; X64-NEXT:    retq
entry:
  %shr = lshr i8 %v, 4
  %ext = zext i8 %shr to i64
  %shl = shl i64 %ext, 4
  %add = add i64 %shl, %ext
  ret i64 %add
}

define i64 @fun11(i16 zeroext %v) {
; X86-LABEL: fun11:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl %eax, %ecx
; X86-NEXT:    shrl $4, %ecx
; X86-NEXT:    andl $-16, %eax
; X86-NEXT:    addl %ecx, %eax
; X86-NEXT:    xorl %edx, %edx
; X86-NEXT:    retl
;
; X64-LABEL: fun11:
; X64:       # %bb.0: # %entry
; X64-NEXT:    # kill: def $edi killed $edi def $rdi
; X64-NEXT:    movl %edi, %eax
; X64-NEXT:    shrl $4, %eax
; X64-NEXT:    andl $-16, %edi
; X64-NEXT:    addq %rdi, %rax
; X64-NEXT:    retq
entry:
  %shr = lshr i16 %v, 4
  %ext = zext i16 %shr to i64
  %shl = shl i64 %ext, 4
  %add = add i64 %shl, %ext
  ret i64 %add
}

define i64 @fun12(i32 zeroext %v) {
; X86-LABEL: fun12:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl %eax, %ecx
; X86-NEXT:    shrl $4, %ecx
; X86-NEXT:    andl $-16, %eax
; X86-NEXT:    xorl %edx, %edx
; X86-NEXT:    addl %ecx, %eax
; X86-NEXT:    setb %dl
; X86-NEXT:    retl
;
; X64-LABEL: fun12:
; X64:       # %bb.0: # %entry
; X64-NEXT:    # kill: def $edi killed $edi def $rdi
; X64-NEXT:    movl %edi, %eax
; X64-NEXT:    shrl $4, %eax
; X64-NEXT:    andl $-16, %edi
; X64-NEXT:    addq %rdi, %rax
; X64-NEXT:    retq
entry:
  %shr = lshr i32 %v, 4
  %ext = zext i32 %shr to i64
  %shl = shl i64 %ext, 4
  %add = add i64 %shl, %ext
  ret i64 %add
}

; PR17380
; Make sure that the combined dags are legal when the DAGCombiner runs after
; Legalization has taken place. The add instruction is redundant and increases
; the number of uses of the zext by one. This prevents the transformation from
; firing before the dags are legalized and optimized.
; Once the add is removed, the number of uses becomes one and therefore the
; dags are canonicalized. After Legalization, we need to make sure that the
; value type for the shift count is legal.
; Also verify that we correctly fold the shl-shr sequence into an
; AND with a bitmask.
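; In @g below the whole sequence therefore lowers to a single 'andl $-4'
; before the call on both targets.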

define void @g(i32 %a) nounwind {
; X86-LABEL: g:
; X86:       # %bb.0:
; X86-NEXT:    subl $12, %esp
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    andl $-4, %eax
; X86-NEXT:    subl $8, %esp
; X86-NEXT:    pushl $0
; X86-NEXT:    pushl %eax
; X86-NEXT:    calll f
; X86-NEXT:    addl $28, %esp
; X86-NEXT:    retl
;
; X64-LABEL: g:
; X64:       # %bb.0:
; X64-NEXT:    # kill: def $edi killed $edi def $rdi
; X64-NEXT:    andl $-4, %edi
; X64-NEXT:    jmp f # TAILCALL
  %b = lshr i32 %a, 2
  %c = zext i32 %b to i64
  %d = add i64 %c, 1
  %e = shl i64 %c, 2
  tail call void @f(i64 %e)
  ret void
}

define i32 @shift_zext_shl(i8 zeroext %x) {
; X86-LABEL: shift_zext_shl:
; X86:       # %bb.0:
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    andl $64, %eax
; X86-NEXT:    shll $9, %eax
; X86-NEXT:    retl
;
; X64-LABEL: shift_zext_shl:
; X64:       # %bb.0:
; X64-NEXT:    movl %edi, %eax
; X64-NEXT:    andl $64, %eax
; X64-NEXT:    shll $9, %eax
; X64-NEXT:    retq
  %a = and i8 %x, 64
  %b = zext i8 %a to i16
  %c = shl i16 %b, 9
  %d = zext i16 %c to i32
  ret i32 %d
}

define i32 @shift_zext_shl2(i8 zeroext %x) {
; X86-LABEL: shift_zext_shl2:
; X86:       # %bb.0:
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    andl $64, %eax
; X86-NEXT:    shll $9, %eax
; X86-NEXT:    retl
;
; X64-LABEL: shift_zext_shl2:
; X64:       # %bb.0:
; X64-NEXT:    movl %edi, %eax
; X64-NEXT:    andl $64, %eax
; X64-NEXT:    shll $9, %eax
; X64-NEXT:    retq
  %a = and i8 %x, 64
  %b = zext i8 %a to i32
  %c = shl i32 %b, 9
  ret i32 %c
}

define <4 x i32> @shift_zext_shl_vec(<4 x i8> %x) nounwind {
; X86-LABEL: shift_zext_shl_vec:
; X86:       # %bb.0:
; X86-NEXT:    pushl %edi
; X86-NEXT:    pushl %esi
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %edi
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %esi
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    andl $64, %ecx
; X86-NEXT:    shll $9, %ecx
; X86-NEXT:    andl $63, %edx
; X86-NEXT:    shll $8, %edx
; X86-NEXT:    andl $31, %esi
; X86-NEXT:    shll $7, %esi
; X86-NEXT:    andl $23, %edi
; X86-NEXT:    shll $6, %edi
; X86-NEXT:    movl %edi, 12(%eax)
; X86-NEXT:    movl %esi, 8(%eax)
; X86-NEXT:    movl %edx, 4(%eax)
; X86-NEXT:    movl %ecx, (%eax)
; X86-NEXT:    popl %esi
; X86-NEXT:    popl %edi
; X86-NEXT:    retl $4
;
; X64-LABEL: shift_zext_shl_vec:
; X64:       # %bb.0:
; X64-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; X64-NEXT:    pxor %xmm1, %xmm1
; X64-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; X64-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [512,256,128,64,u,u,u,u]
; X64-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; X64-NEXT:    retq
  %a = and <4 x i8> %x, <i8 64, i8 63, i8 31, i8 23>
  %b = zext <4 x i8> %a to <4 x i16>
  %c = shl <4 x i16> %b, <i16 9, i16 8, i16 7, i16 6>
  %d = zext <4 x i16> %c to <4 x i32>
  ret <4 x i32> %d
}

define <4 x i32> @shift_zext_shl2_vec(<4 x i8> %x) nounwind {
; X86-LABEL: shift_zext_shl2_vec:
; X86:       # %bb.0:
; X86-NEXT:    pushl %edi
; X86-NEXT:    pushl %esi
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %esi
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %edi
; X86-NEXT:    andl $23, %edi
; X86-NEXT:    andl $31, %esi
; X86-NEXT:    andl $63, %edx
; X86-NEXT:    andl $64, %ecx
; X86-NEXT:    shll $9, %ecx
; X86-NEXT:    shll $8, %edx
; X86-NEXT:    shll $7, %esi
; X86-NEXT:    shll $6, %edi
; X86-NEXT:    movl %edi, 12(%eax)
; X86-NEXT:    movl %esi, 8(%eax)
; X86-NEXT:    movl %edx, 4(%eax)
; X86-NEXT:    movl %ecx, (%eax)
; X86-NEXT:    popl %esi
; X86-NEXT:    popl %edi
; X86-NEXT:    retl $4
;
; X64-LABEL: shift_zext_shl2_vec:
; X64:       # %bb.0:
; X64-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; X64-NEXT:    pxor %xmm1, %xmm1
; X64-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; X64-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; X64-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; X64-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; X64-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X64-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; X64-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; X64-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X64-NEXT:    retq
  %a = and <4 x i8> %x, <i8 64, i8 63, i8 31, i8 23>
  %b = zext <4 x i8> %a to <4 x i32>
  %c = shl <4 x i32> %b, <i32 9, i32 8, i32 7, i32 6>
  ret <4 x i32> %c
}

declare dso_local void @f(i64)