; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-unknown -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefixes=CHECK,X86
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefixes=CHECK,X64

; Check that under certain conditions we can factor out a rotate
; from the following idioms:
;   (a*c0) >> s1 | (a*c1)
;   (a/c0) << s1 | (a/c1)
; This targets cases where instcombine has folded a shl/srl/mul/udiv
; with one of the shifts from the rotate idiom
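;
; For example, in @vroll_v4i32_extract_shl below, (i << 3) >> 25 | (i << 10)
; should lower to a shift left by 3 followed by a rotate left by 7, since
; 10 == 3 + 7 and 25 == 32 - 7 for the i32 elements.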

define <4 x i32> @vroll_v4i32_extract_shl(<4 x i32> %i) {
; CHECK-LABEL: vroll_v4i32_extract_shl:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpslld $3, %xmm0, %xmm0
; CHECK-NEXT:    vprold $7, %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %lhs_mul = shl <4 x i32> %i, <i32 3, i32 3, i32 3, i32 3>
  %rhs_mul = shl <4 x i32> %i, <i32 10, i32 10, i32 10, i32 10>
  %lhs_shift = lshr <4 x i32> %lhs_mul, <i32 25, i32 25, i32 25, i32 25>
  %out = or <4 x i32> %lhs_shift, %rhs_mul
  ret <4 x i32> %out
}

define <4 x i64> @vrolq_v4i64_extract_shrl(<4 x i64> %i) nounwind {
; CHECK-LABEL: vrolq_v4i64_extract_shrl:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsrlq $5, %ymm0, %ymm0
; CHECK-NEXT:    vprolq $29, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %lhs_div = lshr <4 x i64> %i, <i64 40, i64 40, i64 40, i64 40>
  %rhs_div = lshr <4 x i64> %i, <i64 5, i64 5, i64 5, i64 5>
  %rhs_shift = shl <4 x i64> %rhs_div, <i64 29, i64 29, i64 29, i64 29>
  %out = or <4 x i64> %lhs_div, %rhs_shift
  ret <4 x i64> %out
}

define <8 x i32> @vroll_extract_mul(<8 x i32> %i) nounwind {
; X86-LABEL: vroll_extract_mul:
; X86:       # %bb.0:
; X86-NEXT:    vpmulld {{\.?LCPI[0-9]+_[0-9]+}}{1to8}, %ymm0, %ymm0
; X86-NEXT:    vprold $6, %ymm0, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: vroll_extract_mul:
; X64:       # %bb.0:
; X64-NEXT:    vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0
; X64-NEXT:    vprold $6, %ymm0, %ymm0
; X64-NEXT:    retq
  %lhs_mul = mul <8 x i32> %i, <i32 640, i32 640, i32 640, i32 640, i32 640, i32 640, i32 640, i32 640>
  %rhs_mul = mul <8 x i32> %i, <i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10>
  %rhs_shift = lshr <8 x i32> %rhs_mul, <i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26>
  %out = or <8 x i32> %lhs_mul, %rhs_shift
  ret <8 x i32> %out
}

define <2 x i64> @vrolq_extract_udiv(<2 x i64> %i) nounwind {
; X86-LABEL: vrolq_extract_udiv:
; X86:       # %bb.0:
; X86-NEXT:    subl $32, %esp
; X86-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; X86-NEXT:    vextractps $1, %xmm0, {{[0-9]+}}(%esp)
; X86-NEXT:    vmovss %xmm0, (%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $3, {{[0-9]+}}(%esp)
; X86-NEXT:    calll __udivdi3
; X86-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
; X86-NEXT:    vextractps $3, %xmm0, {{[0-9]+}}(%esp)
; X86-NEXT:    vextractps $2, %xmm0, (%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $3, {{[0-9]+}}(%esp)
; X86-NEXT:    vmovd %eax, %xmm0
; X86-NEXT:    vpinsrd $1, %edx, %xmm0, %xmm0
; X86-NEXT:    vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; X86-NEXT:    calll __udivdi3
; X86-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
; X86-NEXT:    vpinsrd $2, %eax, %xmm0, %xmm0
; X86-NEXT:    vpinsrd $3, %edx, %xmm0, %xmm0
; X86-NEXT:    vprolq $57, %xmm0, %xmm0
; X86-NEXT:    addl $32, %esp
; X86-NEXT:    retl
;
; X64-LABEL: vrolq_extract_udiv:
; X64:       # %bb.0:
; X64-NEXT:    vpextrq $1, %xmm0, %rax
; X64-NEXT:    movabsq $-6148914691236517205, %rcx # imm = 0xAAAAAAAAAAAAAAAB
; X64-NEXT:    mulq %rcx
; X64-NEXT:    vmovq %rdx, %xmm1
; X64-NEXT:    vmovq %xmm0, %rax
; X64-NEXT:    mulq %rcx
; X64-NEXT:    vmovq %rdx, %xmm0
; X64-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; X64-NEXT:    vpsrlq $1, %xmm0, %xmm0
; X64-NEXT:    vprolq $57, %xmm0, %xmm0
; X64-NEXT:    retq
  %lhs_div = udiv <2 x i64> %i, <i64 3, i64 3>
  %rhs_div = udiv <2 x i64> %i, <i64 384, i64 384>
  %lhs_shift = shl <2 x i64> %lhs_div, <i64 57, i64 57>
  %out = or <2 x i64> %lhs_shift, %rhs_div
  ret <2 x i64> %out
}

define <4 x i32> @vrolw_extract_mul_with_mask(<4 x i32> %i) nounwind {
; X86-LABEL: vrolw_extract_mul_with_mask:
; X86:       # %bb.0:
; X86-NEXT:    vpslld $3, %xmm0, %xmm1
; X86-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; X86-NEXT:    vprold $7, %xmm0, %xmm0
; X86-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}{1to4}, %xmm0, %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: vrolw_extract_mul_with_mask:
; X64:       # %bb.0:
; X64-NEXT:    vpslld $3, %xmm0, %xmm1
; X64-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; X64-NEXT:    vprold $7, %xmm0, %xmm0
; X64-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
; X64-NEXT:    retq
  %lhs_mul = mul <4 x i32> %i, <i32 1152, i32 1152, i32 1152, i32 1152>
  %rhs_mul = mul <4 x i32> %i, <i32 9, i32 9, i32 9, i32 9>
  %lhs_and = and <4 x i32> %lhs_mul, <i32 160, i32 160, i32 160, i32 160>
  %rhs_shift = lshr <4 x i32> %rhs_mul, <i32 25, i32 25, i32 25, i32 25>
  %out = or <4 x i32> %lhs_and, %rhs_shift
  ret <4 x i32> %out
}

define <32 x i16> @illegal_no_extract_mul(<32 x i16> %i) nounwind {
; X86-LABEL: illegal_no_extract_mul:
; X86:       # %bb.0:
; X86-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}, %zmm0, %zmm0 # [10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10]
; X86-NEXT:    vpsrlw $10, %zmm0, %zmm1
; X86-NEXT:    vpsllw $6, %zmm0, %zmm0
; X86-NEXT:    vporq %zmm1, %zmm0, %zmm0
; X86-NEXT:    retl
;
; X64-LABEL: illegal_no_extract_mul:
; X64:       # %bb.0:
; X64-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 # [10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10]
; X64-NEXT:    vpsrlw $10, %zmm0, %zmm1
; X64-NEXT:    vpsllw $6, %zmm0, %zmm0
; X64-NEXT:    vporq %zmm1, %zmm0, %zmm0
; X64-NEXT:    retq
  %lhs_mul = mul <32 x i16> %i, <i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640>
  %rhs_mul = mul <32 x i16> %i, <i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10>
  %rhs_shift = lshr <32 x i16> %rhs_mul, <i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10>
  %out = or <32 x i16> %lhs_mul, %rhs_shift
  ret <32 x i16> %out
}

; Result would undershift
define <4 x i64> @no_extract_shl(<4 x i64> %i) nounwind {
; X86-LABEL: no_extract_shl:
; X86:       # %bb.0:
; X86-NEXT:    vpsllq $24, %ymm0, %ymm1
; X86-NEXT:    vpsrlq $39, %ymm0, %ymm0
; X86-NEXT:    vpternlogq $236, {{\.?LCPI[0-9]+_[0-9]+}}{1to4}, %ymm1, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: no_extract_shl:
; X64:       # %bb.0:
; X64-NEXT:    vpsllq $24, %ymm0, %ymm1
; X64-NEXT:    vpsrlq $39, %ymm0, %ymm0
; X64-NEXT:    vpternlogq $236, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm1, %ymm0
; X64-NEXT:    retq
  %lhs_mul = shl <4 x i64> %i, <i64 11, i64 11, i64 11, i64 11>
  %rhs_mul = shl <4 x i64> %i, <i64 24, i64 24, i64 24, i64 24>
  %lhs_shift = lshr <4 x i64> %lhs_mul, <i64 50, i64 50, i64 50, i64 50>
  %out = or <4 x i64> %lhs_shift, %rhs_mul
  ret <4 x i64> %out
}

; Result would overshift
define <4 x i32> @no_extract_shrl(<4 x i32> %i) nounwind {
; X86-LABEL: no_extract_shrl:
; X86:       # %bb.0:
; X86-NEXT:    vpsrld $9, %xmm0, %xmm1
; X86-NEXT:    vpslld $25, %xmm0, %xmm0
; X86-NEXT:    vpternlogd $236, {{\.?LCPI[0-9]+_[0-9]+}}{1to4}, %xmm1, %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: no_extract_shrl:
; X64:       # %bb.0:
; X64-NEXT:    vpsrld $9, %xmm0, %xmm1
; X64-NEXT:    vpslld $25, %xmm0, %xmm0
; X64-NEXT:    vpternlogd $236, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm0
; X64-NEXT:    retq
  %lhs_div = lshr <4 x i32> %i, <i32 3, i32 3, i32 3, i32 3>
  %rhs_div = lshr <4 x i32> %i, <i32 9, i32 9, i32 9, i32 9>
  %lhs_shift = shl <4 x i32> %lhs_div, <i32 28, i32 28, i32 28, i32 28>
  %out = or <4 x i32> %lhs_shift, %rhs_div
  ret <4 x i32> %out
}

; Can factor 512 from 1536, but result is 3 instead of 9
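; (the lshr by 23 on i32 elements pairs with a shl by 32 - 23 == 9, i.e. a
; factor of 512; 1536 == 3 * 512, so the factored-out multiplier is 3, which
; does not match the multiply by 9 feeding the lshr)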
define <8 x i32> @no_extract_mul(<8 x i32> %i) nounwind {
; X86-LABEL: no_extract_mul:
; X86:       # %bb.0:
; X86-NEXT:    vpmulld {{\.?LCPI[0-9]+_[0-9]+}}{1to8}, %ymm0, %ymm1
; X86-NEXT:    vpslld $3, %ymm0, %ymm2
; X86-NEXT:    vpaddd %ymm2, %ymm0, %ymm0
; X86-NEXT:    vpsrld $23, %ymm0, %ymm0
; X86-NEXT:    vpor %ymm0, %ymm1, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: no_extract_mul:
; X64:       # %bb.0:
; X64-NEXT:    vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm1
; X64-NEXT:    vpslld $3, %ymm0, %ymm2
; X64-NEXT:    vpaddd %ymm2, %ymm0, %ymm0
; X64-NEXT:    vpsrld $23, %ymm0, %ymm0
; X64-NEXT:    vpor %ymm0, %ymm1, %ymm0
; X64-NEXT:    retq
  %lhs_mul = mul <8 x i32> %i, <i32 1536, i32 1536, i32 1536, i32 1536, i32 1536, i32 1536, i32 1536, i32 1536>
  %rhs_mul = mul <8 x i32> %i, <i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9>
  %rhs_shift = lshr <8 x i32> %rhs_mul, <i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23>
  %out = or <8 x i32> %lhs_mul, %rhs_shift
  ret <8 x i32> %out
}

; Can't evenly factor 256 from 770
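; (the shl by 56 on i64 elements pairs with a divide by 1 << 8 == 256, so a
; rotate would need the unshifted divisor to be 3 * 256 == 768; 770 is not a
; multiple of 256)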
define <2 x i64> @no_extract_udiv(<2 x i64> %i) nounwind {
; X86-LABEL: no_extract_udiv:
; X86:       # %bb.0:
; X86-NEXT:    subl $48, %esp
; X86-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; X86-NEXT:    vextractps $1, %xmm0, {{[0-9]+}}(%esp)
; X86-NEXT:    vmovss %xmm0, (%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $3, {{[0-9]+}}(%esp)
; X86-NEXT:    calll __udivdi3
; X86-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
; X86-NEXT:    vextractps $3, %xmm0, {{[0-9]+}}(%esp)
; X86-NEXT:    vextractps $2, %xmm0, (%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $3, {{[0-9]+}}(%esp)
; X86-NEXT:    vmovd %eax, %xmm0
; X86-NEXT:    vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; X86-NEXT:    calll __udivdi3
; X86-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
; X86-NEXT:    vpinsrd $2, %eax, %xmm0, %xmm0
; X86-NEXT:    vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; X86-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
; X86-NEXT:    vextractps $1, %xmm0, {{[0-9]+}}(%esp)
; X86-NEXT:    vmovss %xmm0, (%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $770, {{[0-9]+}}(%esp) # imm = 0x302
; X86-NEXT:    calll __udivdi3
; X86-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
; X86-NEXT:    vextractps $3, %xmm0, {{[0-9]+}}(%esp)
; X86-NEXT:    vextractps $2, %xmm0, (%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $770, {{[0-9]+}}(%esp) # imm = 0x302
; X86-NEXT:    vmovd %eax, %xmm0
; X86-NEXT:    vpinsrd $1, %edx, %xmm0, %xmm0
; X86-NEXT:    vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; X86-NEXT:    calll __udivdi3
; X86-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
; X86-NEXT:    vpinsrd $2, %eax, %xmm0, %xmm0
; X86-NEXT:    vpinsrd $3, %edx, %xmm0, %xmm0
; X86-NEXT:    vpsllq $56, {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; X86-NEXT:    vpor %xmm0, %xmm1, %xmm0
; X86-NEXT:    addl $48, %esp
; X86-NEXT:    retl
;
; X64-LABEL: no_extract_udiv:
; X64:       # %bb.0:
; X64-NEXT:    vpextrq $1, %xmm0, %rcx
; X64-NEXT:    movabsq $-6148914691236517205, %rdi # imm = 0xAAAAAAAAAAAAAAAB
; X64-NEXT:    movq %rcx, %rax
; X64-NEXT:    mulq %rdi
; X64-NEXT:    vmovq %rdx, %xmm1
; X64-NEXT:    vmovq %xmm0, %rsi
; X64-NEXT:    movq %rsi, %rax
; X64-NEXT:    mulq %rdi
; X64-NEXT:    vmovq %rdx, %xmm0
; X64-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; X64-NEXT:    vpsrlq $1, %xmm0, %xmm0
; X64-NEXT:    movabsq $-6180857105216966645, %rdi # imm = 0xAA392F35DC17F00B
; X64-NEXT:    movq %rcx, %rax
; X64-NEXT:    mulq %rdi
; X64-NEXT:    vmovq %rdx, %xmm1
; X64-NEXT:    movq %rsi, %rax
; X64-NEXT:    mulq %rdi
; X64-NEXT:    vmovq %rdx, %xmm2
; X64-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; X64-NEXT:    vpsrlq $9, %xmm1, %xmm1
; X64-NEXT:    vpsllq $56, %xmm0, %xmm0
; X64-NEXT:    vpor %xmm1, %xmm0, %xmm0
; X64-NEXT:    retq
  %lhs_div = udiv <2 x i64> %i, <i64 3, i64 3>
  %rhs_div = udiv <2 x i64> %i, <i64 770, i64 770>
  %lhs_shift = shl <2 x i64> %lhs_div, <i64 56, i64 56>
  %out = or <2 x i64> %lhs_shift, %rhs_div
  ret <2 x i64> %out
}

; DAGCombiner transforms shl X, 1 into add X, X.
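; Treating add %i, %i as a shift left by 1, or'ing it with a lshr by 31 forms
; a 32-bit rotate by 1 (vprold $1). In @no_extract_add_1 the lshr amount is
; 27, and 1 + 27 != 32, so no rotate can be extracted there.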
define <4 x i32> @extract_add_1(<4 x i32> %i) nounwind {
; CHECK-LABEL: extract_add_1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vprold $1, %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %ii = add <4 x i32> %i, %i
  %rhs = lshr <4 x i32> %i, <i32 31, i32 31, i32 31, i32 31>
  %out = or <4 x i32> %ii, %rhs
  ret <4 x i32> %out
}

define <4 x i32> @extract_add_1_comut(<4 x i32> %i) nounwind {
; CHECK-LABEL: extract_add_1_comut:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vprold $1, %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %ii = add <4 x i32> %i, %i
  %lhs = lshr <4 x i32> %i, <i32 31, i32 31, i32 31, i32 31>
  %out = or <4 x i32> %lhs, %ii
  ret <4 x i32> %out
}

define <4 x i32> @no_extract_add_1(<4 x i32> %i) nounwind {
; CHECK-LABEL: no_extract_add_1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpaddd %xmm0, %xmm0, %xmm1
; CHECK-NEXT:    vpsrld $27, %xmm0, %xmm0
; CHECK-NEXT:    vpor %xmm0, %xmm1, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %ii = add <4 x i32> %i, %i
  %rhs = lshr <4 x i32> %i, <i32 27, i32 27, i32 27, i32 27>
  %out = or <4 x i32> %ii, %rhs
  ret <4 x i32> %out
}