; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-linux-gnu | FileCheck %s --check-prefixes=X86
; RUN: llc < %s -mtriple=x86_64-linux-gnu | FileCheck %s --check-prefixes=X64

; fold (shl (zext (lshr (A, X))), X) -> (zext (shl (lshr (A, X)), X))

; Canonicalize the sequence shl/zext/lshr so that the zero extend is
; performed as the last instruction of the sequence.
; This helps DAGCombiner identify and then fold the sequence of shifts
; into a single AND.
; This transformation is profitable if the shift amounts are the same
; and if there is only one use of the zext.

define i16 @fun1(i8 zeroext %v) {
; X86-LABEL: fun1:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    andl $-16, %eax
; X86-NEXT:    # kill: def $ax killed $ax killed $eax
; X86-NEXT:    retl
;
; X64-LABEL: fun1:
; X64:       # %bb.0: # %entry
; X64-NEXT:    movl %edi, %eax
; X64-NEXT:    andl $-16, %eax
; X64-NEXT:    # kill: def $ax killed $ax killed $eax
; X64-NEXT:    retq
entry:
  %shr = lshr i8 %v, 4
  %ext = zext i8 %shr to i16
  %shl = shl i16 %ext, 4
  ret i16 %shl
}

define i32 @fun2(i8 zeroext %v) {
; X86-LABEL: fun2:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    andl $-16, %eax
; X86-NEXT:    retl
;
; X64-LABEL: fun2:
; X64:       # %bb.0: # %entry
; X64-NEXT:    movl %edi, %eax
; X64-NEXT:    andl $-16, %eax
; X64-NEXT:    retq
entry:
  %shr = lshr i8 %v, 4
  %ext = zext i8 %shr to i32
  %shl = shl i32 %ext, 4
  ret i32 %shl
}

define i32 @fun3(i16 zeroext %v) {
; X86-LABEL: fun3:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    andl $-16, %eax
; X86-NEXT:    retl
;
; X64-LABEL: fun3:
; X64:       # %bb.0: # %entry
; X64-NEXT:    movl %edi, %eax
; X64-NEXT:    andl $-16, %eax
; X64-NEXT:    retq
entry:
  %shr = lshr i16 %v, 4
  %ext = zext i16 %shr to i32
  %shl = shl i32 %ext, 4
  ret i32 %shl
}

define i64 @fun4(i8 zeroext %v) {
; X86-LABEL: fun4:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    andl $-16, %eax
; X86-NEXT:    xorl %edx, %edx
; X86-NEXT:    retl
;
; X64-LABEL: fun4:
; X64:       # %bb.0: # %entry
; X64-NEXT:    movl %edi, %eax
; X64-NEXT:    andl $-16, %eax
; X64-NEXT:    retq
entry:
  %shr = lshr i8 %v, 4
  %ext = zext i8 %shr to i64
  %shl = shl i64 %ext, 4
  ret i64 %shl
}

define i64 @fun5(i16 zeroext %v) {
; X86-LABEL: fun5:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    andl $-16, %eax
; X86-NEXT:    xorl %edx, %edx
; X86-NEXT:    retl
;
; X64-LABEL: fun5:
; X64:       # %bb.0: # %entry
; X64-NEXT:    movl %edi, %eax
; X64-NEXT:    andl $-16, %eax
; X64-NEXT:    retq
entry:
  %shr = lshr i16 %v, 4
  %ext = zext i16 %shr to i64
  %shl = shl i64 %ext, 4
  ret i64 %shl
}

define i64 @fun6(i32 zeroext %v) {
; X86-LABEL: fun6:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    andl $-16, %eax
; X86-NEXT:    xorl %edx, %edx
; X86-NEXT:    retl
;
; X64-LABEL: fun6:
; X64:       # %bb.0: # %entry
; X64-NEXT:    movl %edi, %eax
; X64-NEXT:    andl $-16, %eax
; X64-NEXT:    retq
entry:
  %shr = lshr i32 %v, 4
  %ext = zext i32 %shr to i64
  %shl = shl i64 %ext, 4
  ret i64 %shl
}

; Don't fold the pattern if we use arithmetic shifts.
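; A quick worked example of why (illustrative values, not part of the
; autogenerated checks): with %v = 0x80, (ashr i8 0x80, 4) is 0xf8, so
; zext + shl by 4 yields 0xf80, whereas an AND of (zext %v) with -16 would
; yield 0x80. The shifted-in sign bits make the two forms differ, so the
; fold must not fire for ashr.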

define i64 @fun7(i8 zeroext %v) {
; X86-LABEL: fun7:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    sarb $4, %al
; X86-NEXT:    movzbl %al, %eax
; X86-NEXT:    shll $4, %eax
; X86-NEXT:    xorl %edx, %edx
; X86-NEXT:    retl
;
; X64-LABEL: fun7:
; X64:       # %bb.0: # %entry
; X64-NEXT:    sarb $4, %dil
; X64-NEXT:    movzbl %dil, %eax
; X64-NEXT:    shll $4, %eax
; X64-NEXT:    retq
entry:
  %shr = ashr i8 %v, 4
  %ext = zext i8 %shr to i64
  %shl = shl i64 %ext, 4
  ret i64 %shl
}

define i64 @fun8(i16 zeroext %v) {
; X86-LABEL: fun8:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movswl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    andl $1048560, %eax # imm = 0xFFFF0
; X86-NEXT:    xorl %edx, %edx
; X86-NEXT:    retl
;
; X64-LABEL: fun8:
; X64:       # %bb.0: # %entry
; X64-NEXT:    movswl %di, %eax
; X64-NEXT:    andl $1048560, %eax # imm = 0xFFFF0
; X64-NEXT:    retq
entry:
  %shr = ashr i16 %v, 4
  %ext = zext i16 %shr to i64
  %shl = shl i64 %ext, 4
  ret i64 %shl
}

define i64 @fun9(i32 zeroext %v) {
; X86-LABEL: fun9:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl %eax, %edx
; X86-NEXT:    sarl $4, %edx
; X86-NEXT:    andl $-16, %eax
; X86-NEXT:    shrl $28, %edx
; X86-NEXT:    retl
;
; X64-LABEL: fun9:
; X64:       # %bb.0: # %entry
; X64-NEXT:    movl %edi, %eax
; X64-NEXT:    sarl $4, %eax
; X64-NEXT:    shlq $4, %rax
; X64-NEXT:    retq
entry:
  %shr = ashr i32 %v, 4
  %ext = zext i32 %shr to i64
  %shl = shl i64 %ext, 4
  ret i64 %shl
}

; Don't fold the pattern if the operand feeding the shift left has
; more than one use.

define i64 @fun10(i8 zeroext %v) {
; X86-LABEL: fun10:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    shrb $4, %al
; X86-NEXT:    movzbl %al, %ecx
; X86-NEXT:    movl %ecx, %eax
; X86-NEXT:    shll $4, %eax
; X86-NEXT:    orl %ecx, %eax
; X86-NEXT:    xorl %edx, %edx
; X86-NEXT:    retl
;
; X64-LABEL: fun10:
; X64:       # %bb.0: # %entry
; X64-NEXT:    # kill: def $edi killed $edi def $rdi
; X64-NEXT:    movl %edi, %eax
; X64-NEXT:    shrb $4, %al
; X64-NEXT:    movzbl %al, %eax
; X64-NEXT:    andl $-16, %edi
; X64-NEXT:    orq %rdi, %rax
; X64-NEXT:    retq
entry:
  %shr = lshr i8 %v, 4
  %ext = zext i8 %shr to i64
  %shl = shl i64 %ext, 4
  %add = add i64 %shl, %ext
  ret i64 %add
}

define i64 @fun11(i16 zeroext %v) {
; X86-LABEL: fun11:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl %eax, %ecx
; X86-NEXT:    shrl $4, %ecx
; X86-NEXT:    andl $-16, %eax
; X86-NEXT:    addl %ecx, %eax
; X86-NEXT:    xorl %edx, %edx
; X86-NEXT:    retl
;
; X64-LABEL: fun11:
; X64:       # %bb.0: # %entry
; X64-NEXT:    # kill: def $edi killed $edi def $rdi
; X64-NEXT:    movl %edi, %eax
; X64-NEXT:    shrl $4, %eax
; X64-NEXT:    andl $-16, %edi
; X64-NEXT:    addq %rdi, %rax
; X64-NEXT:    retq
entry:
  %shr = lshr i16 %v, 4
  %ext = zext i16 %shr to i64
  %shl = shl i64 %ext, 4
  %add = add i64 %shl, %ext
  ret i64 %add
}

define i64 @fun12(i32 zeroext %v) {
; X86-LABEL: fun12:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl %eax, %ecx
; X86-NEXT:    shrl $4, %ecx
; X86-NEXT:    andl $-16, %eax
; X86-NEXT:    xorl %edx, %edx
; X86-NEXT:    addl %ecx, %eax
; X86-NEXT:    setb %dl
; X86-NEXT:    retl
;
; X64-LABEL: fun12:
; X64:       # %bb.0: # %entry
; X64-NEXT:    # kill: def $edi killed $edi def $rdi
; X64-NEXT:    movl %edi, %eax
; X64-NEXT:    shrl $4, %eax
; X64-NEXT:    andl $-16, %edi
; X64-NEXT:    addq %rdi, %rax
; X64-NEXT:    retq
entry:
  %shr = lshr i32 %v, 4
  %ext = zext i32 %shr to i64
  %shl = shl i64 %ext, 4
  %add = add i64 %shl, %ext
  ret i64 %add
}

; PR17380
; Make sure that the combined DAGs are legal when the DAGCombiner runs after
; legalization has taken place. The add instruction is redundant; it exists
; only to increase the number of uses of the zext by one. This prevents the
; transformation from firing before the DAGs are legalized and optimized.
; Once the add is removed, the number of uses drops back to one and the DAGs
; are canonicalized. After legalization, we need to make sure that the value
; type for the shift count is legal.
; Also verify that we correctly fold the shl/lshr sequence into an AND with
; a bitmask.

define void @g(i32 %a) nounwind {
; X86-LABEL: g:
; X86:       # %bb.0:
; X86-NEXT:    subl $12, %esp
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    andl $-4, %eax
; X86-NEXT:    subl $8, %esp
; X86-NEXT:    pushl $0
; X86-NEXT:    pushl %eax
; X86-NEXT:    calll f
; X86-NEXT:    addl $28, %esp
; X86-NEXT:    retl
;
; X64-LABEL: g:
; X64:       # %bb.0:
; X64-NEXT:    # kill: def $edi killed $edi def $rdi
; X64-NEXT:    andl $-4, %edi
; X64-NEXT:    jmp f # TAILCALL
  %b = lshr i32 %a, 2
  %c = zext i32 %b to i64
  %d = add i64 %c, 1
  %e = shl i64 %c, 2
  tail call void @f(i64 %e)
  ret void
}

define i32 @shift_zext_shl(i8 zeroext %x) {
; X86-LABEL: shift_zext_shl:
; X86:       # %bb.0:
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    andl $64, %eax
; X86-NEXT:    shll $9, %eax
; X86-NEXT:    retl
;
; X64-LABEL: shift_zext_shl:
; X64:       # %bb.0:
; X64-NEXT:    movl %edi, %eax
; X64-NEXT:    andl $64, %eax
; X64-NEXT:    shll $9, %eax
; X64-NEXT:    retq
  %a = and i8 %x, 64
  %b = zext i8 %a to i16
  %c = shl i16 %b, 9
  %d = zext i16 %c to i32
  ret i32 %d
}

define i32 @shift_zext_shl2(i8 zeroext %x) {
; X86-LABEL: shift_zext_shl2:
; X86:       # %bb.0:
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    andl $64, %eax
; X86-NEXT:    shll $9, %eax
; X86-NEXT:    retl
;
; X64-LABEL: shift_zext_shl2:
; X64:       # %bb.0:
; X64-NEXT:    movl %edi, %eax
; X64-NEXT:    andl $64, %eax
; X64-NEXT:    shll $9, %eax
; X64-NEXT:    retq
  %a = and i8 %x, 64
  %b = zext i8 %a to i32
  %c = shl i32 %b, 9
  ret i32 %c
}

define <4 x i32> @shift_zext_shl_vec(<4 x i8> %x) nounwind {
; X86-LABEL: shift_zext_shl_vec:
; X86:       # %bb.0:
; X86-NEXT:    pushl %edi
; X86-NEXT:    pushl %esi
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %edi
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %esi
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    andl $64, %ecx
; X86-NEXT:    shll $9, %ecx
; X86-NEXT:    andl $63, %edx
; X86-NEXT:    shll $8, %edx
; X86-NEXT:    andl $31, %esi
; X86-NEXT:    shll $7, %esi
; X86-NEXT:    andl $23, %edi
; X86-NEXT:    shll $6, %edi
; X86-NEXT:    movl %edi, 12(%eax)
; X86-NEXT:    movl %esi, 8(%eax)
; X86-NEXT:    movl %edx, 4(%eax)
; X86-NEXT:    movl %ecx, (%eax)
; X86-NEXT:    popl %esi
; X86-NEXT:    popl %edi
; X86-NEXT:    retl $4
;
; X64-LABEL: shift_zext_shl_vec:
; X64:       # %bb.0:
; X64-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; X64-NEXT:    pxor %xmm1, %xmm1
; X64-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; X64-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [512,256,128,64,u,u,u,u]
; X64-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; X64-NEXT:    retq
  %a = and <4 x i8> %x, <i8 64, i8 63, i8 31, i8 23>
  %b = zext <4 x i8> %a to <4 x i16>
  %c = shl <4 x i16> %b, <i16 9, i16 8, i16 7, i16 6>
  %d = zext <4 x i16> %c to <4 x i32>
  ret <4 x i32> %d
}

define <4 x i32> @shift_zext_shl2_vec(<4 x i8> %x) nounwind {
; X86-LABEL: shift_zext_shl2_vec:
; X86:       # %bb.0:
; X86-NEXT:    pushl %edi
; X86-NEXT:    pushl %esi
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %esi
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %edi
; X86-NEXT:    andl $23, %edi
; X86-NEXT:    andl $31, %esi
; X86-NEXT:    andl $63, %edx
; X86-NEXT:    andl $64, %ecx
; X86-NEXT:    shll $9, %ecx
; X86-NEXT:    shll $8, %edx
; X86-NEXT:    shll $7, %esi
; X86-NEXT:    shll $6, %edi
; X86-NEXT:    movl %edi, 12(%eax)
; X86-NEXT:    movl %esi, 8(%eax)
; X86-NEXT:    movl %edx, 4(%eax)
; X86-NEXT:    movl %ecx, (%eax)
; X86-NEXT:    popl %esi
; X86-NEXT:    popl %edi
; X86-NEXT:    retl $4
;
; X64-LABEL: shift_zext_shl2_vec:
; X64:       # %bb.0:
; X64-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; X64-NEXT:    pxor %xmm1, %xmm1
; X64-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; X64-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; X64-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; X64-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; X64-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X64-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; X64-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; X64-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X64-NEXT:    retq
  %a = and <4 x i8> %x, <i8 64, i8 63, i8 31, i8 23>
  %b = zext <4 x i8> %a to <4 x i32>
  %c = shl <4 x i32> %b, <i32 9, i32 8, i32 7, i32 6>
  ret <4 x i32> %c
}

declare dso_local void @f(i64)