; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=CHECK,SSE,SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2,AVX2-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=CHECK,AVX,AVX2,AVX2-FAST-ALL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=CHECK,AVX,AVX2,AVX2-FAST-PERLANE
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=CHECK,AVX,AVX512

; fold (shl 0, x) -> 0
define <4 x i32> @combine_vec_shl_zero(<4 x i32> %x) {
; SSE-LABEL: combine_vec_shl_zero:
; SSE: # %bb.0:
; SSE-NEXT: xorps %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_shl_zero:
; AVX: # %bb.0:
; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
  %1 = shl <4 x i32> zeroinitializer, %x
  ret <4 x i32> %1
}

; fold (shl x, c >= size(x)) -> undef
define <4 x i32> @combine_vec_shl_outofrange0(<4 x i32> %x) {
; CHECK-LABEL: combine_vec_shl_outofrange0:
; CHECK: # %bb.0:
; CHECK-NEXT: retq
  %1 = shl <4 x i32> %x, <i32 33, i32 33, i32 33, i32 33>
  ret <4 x i32> %1
}

define <4 x i32> @combine_vec_shl_outofrange1(<4 x i32> %x) {
; CHECK-LABEL: combine_vec_shl_outofrange1:
; CHECK: # %bb.0:
; CHECK-NEXT: retq
  %1 = shl <4 x i32> %x, <i32 33, i32 34, i32 35, i32 36>
  ret <4 x i32> %1
}

define <4 x i32> @combine_vec_shl_outofrange2(<4 x i32> %a0) {
; CHECK-LABEL: combine_vec_shl_outofrange2:
; CHECK: # %bb.0:
; CHECK-NEXT: retq
  %1 = and <4 x i32> %a0, <i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647>
  %2 = shl <4 x i32> %1, <i32 33, i32 33, i32 33, i32 33>
  ret <4 x i32> %2
}

define <4 x i32> @combine_vec_shl_outofrange3(<4 x i32> %a0) {
; CHECK-LABEL: combine_vec_shl_outofrange3:
; CHECK: # %bb.0:
; CHECK-NEXT: retq
  %1 = shl <4 x i32> %a0, <i32 33, i32 34, i32 35, i32 undef>
  ret <4 x i32> %1
}

; fold (shl x, 0) -> x
define <4 x i32> @combine_vec_shl_by_zero(<4 x i32> %x) {
; CHECK-LABEL: combine_vec_shl_by_zero:
; CHECK: # %bb.0:
; CHECK-NEXT: retq
  %1 = shl <4 x i32> %x, zeroinitializer
  ret <4 x i32> %1
}

; if (shl x, c) is known to be zero, return 0
define <4 x i32> @combine_vec_shl_known_zero0(<4 x i32> %x) {
; SSE-LABEL: combine_vec_shl_known_zero0:
; SSE: # %bb.0:
; SSE-NEXT: xorps %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_shl_known_zero0:
; AVX: # %bb.0:
; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
  %1 = and <4 x i32> %x, <i32 4294901760, i32 4294901760, i32 4294901760, i32 4294901760>
  %2 = shl <4 x i32> %1, <i32 16, i32 16, i32 16, i32 16>
  ret <4 x i32> %2
}

define <4 x i32> @combine_vec_shl_known_zero1(<4 x i32> %x) {
; SSE2-LABEL: combine_vec_shl_known_zero1:
; SSE2: # %bb.0:
; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65536,32768,16384,8192]
; SSE2-NEXT: pmuludq %xmm0, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: combine_vec_shl_known_zero1:
; SSE41: # %bb.0:
; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: combine_vec_shl_known_zero1:
; AVX: # %bb.0:
; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT: retq
  %1 = and <4 x i32> %x, <i32 4294901760, i32 8589803520, i32 17179607040, i32 34359214080>
  %2 = shl <4 x i32> %1, <i32 16, i32 15, i32 14, i32 13>
  ret <4 x i32> %2
}

; fold (shl x, (trunc (and y, c))) -> (shl x, (and (trunc y), (trunc c))).
define <4 x i32> @combine_vec_shl_trunc_and(<4 x i32> %x, <4 x i64> %y) {
; SSE2-LABEL: combine_vec_shl_trunc_and:
; SSE2: # %bb.0:
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
; SSE2-NEXT: pslld $23, %xmm1
; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE2-NEXT: cvttps2dq %xmm1, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm1, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm2, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: retq
;
; SSE41-LABEL: combine_vec_shl_trunc_and:
; SSE41: # %bb.0:
; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
; SSE41-NEXT: pslld $23, %xmm1
; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE41-NEXT: cvttps2dq %xmm1, %xmm1
; SSE41-NEXT: pmulld %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX2-SLOW-LABEL: combine_vec_shl_trunc_and:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
; AVX2-SLOW-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX2-SLOW-NEXT: vpsllvd %xmm1, %xmm0, %xmm0
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-ALL-LABEL: combine_vec_shl_trunc_and:
; AVX2-FAST-ALL: # %bb.0:
; AVX2-FAST-ALL-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,2,4,6,0,0,0,0]
; AVX2-FAST-ALL-NEXT: vpermd %ymm1, %ymm2, %ymm1
; AVX2-FAST-ALL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX2-FAST-ALL-NEXT: vpsllvd %xmm1, %xmm0, %xmm0
; AVX2-FAST-ALL-NEXT: vzeroupper
; AVX2-FAST-ALL-NEXT: retq
;
; AVX2-FAST-PERLANE-LABEL: combine_vec_shl_trunc_and:
; AVX2-FAST-PERLANE: # %bb.0:
; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
; AVX2-FAST-PERLANE-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX2-FAST-PERLANE-NEXT: vpsllvd %xmm1, %xmm0, %xmm0
; AVX2-FAST-PERLANE-NEXT: vzeroupper
; AVX2-FAST-PERLANE-NEXT: retq
;
; AVX512-LABEL: combine_vec_shl_trunc_and:
; AVX512: # %bb.0:
; AVX512-NEXT: vpmovqd %ymm1, %xmm1
; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX512-NEXT: vpsllvd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = and <4 x i64> %y, <i64 15, i64 255, i64 4095, i64 65535>
  %2 = trunc <4 x i64> %1 to <4 x i32>
  %3 = shl <4 x i32> %x, %2
  ret <4 x i32> %3
}

; fold (shl (shl x, c1), c2) -> (shl x, (add c1, c2))
define <4 x i32> @combine_vec_shl_shl0(<4 x i32> %x) {
; SSE-LABEL: combine_vec_shl_shl0:
; SSE: # %bb.0:
; SSE-NEXT: pslld $6, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_shl_shl0:
; AVX: # %bb.0:
; AVX-NEXT: vpslld $6, %xmm0, %xmm0
; AVX-NEXT: retq
  %1 = shl <4 x i32> %x, <i32 2, i32 2, i32 2, i32 2>
  %2 = shl <4 x i32> %1, <i32 4, i32 4, i32 4, i32 4>
  ret <4 x i32> %2
}

define <4 x i32> @combine_vec_shl_shl1(<4 x i32> %x) {
; SSE2-LABEL: combine_vec_shl_shl1:
; SSE2: # %bb.0:
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: retq
;
; SSE41-LABEL: combine_vec_shl_shl1:
; SSE41: # %bb.0:
; SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: combine_vec_shl_shl1:
; AVX: # %bb.0:
; AVX-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT: retq
  %1 = shl <4 x i32> %x, <i32 0, i32 1, i32 2, i32 3>
  %2 = shl <4 x i32> %1, <i32 4, i32 5, i32 6, i32 7>
  ret <4 x i32> %2
}

; fold (shl (shl x, c1), c2) -> 0
define <4 x i32> @combine_vec_shl_shlr_zero0(<4 x i32> %x) {
; SSE-LABEL: combine_vec_shl_shlr_zero0:
; SSE: # %bb.0:
; SSE-NEXT: xorps %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_shl_shlr_zero0:
; AVX: # %bb.0:
; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
  %1 = shl <4 x i32> %x, <i32 16, i32 16, i32 16, i32 16>
  %2 = shl <4 x i32> %1, <i32 20, i32 20, i32 20, i32 20>
  ret <4 x i32> %2
}

define <4 x i32> @combine_vec_shl_shl_zero1(<4 x i32> %x) {
; SSE-LABEL: combine_vec_shl_shl_zero1:
; SSE: # %bb.0:
; SSE-NEXT: xorps %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_shl_shl_zero1:
; AVX: # %bb.0:
; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
  %1 = shl <4 x i32> %x, <i32 17, i32 18, i32 19, i32 20>
  %2 = shl <4 x i32> %1, <i32 25, i32 26, i32 27, i32 28>
  ret <4 x i32> %2
}

; fold (shl (ext (shl x, c1)), c2) -> (shl (ext x), (add c1, c2))
define <8 x i32> @combine_vec_shl_ext_shl0(<8 x i16> %x) {
; SSE2-LABEL: combine_vec_shl_ext_shl0:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSE2-NEXT: pslld $20, %xmm0
; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
; SSE2-NEXT: pslld $20, %xmm1
; SSE2-NEXT: retq
;
; SSE41-LABEL: combine_vec_shl_ext_shl0:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm1
; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SSE41-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
; SSE41-NEXT: pslld $20, %xmm1
; SSE41-NEXT: pslld $20, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: combine_vec_shl_ext_shl0:
; AVX: # %bb.0:
; AVX-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX-NEXT: vpslld $20, %ymm0, %ymm0
; AVX-NEXT: retq
  %1 = shl <8 x i16> %x, <i16 4, i16 4, i16 4, i16 4, i16 4, i16 4, i16 4, i16 4>
  %2 = sext <8 x i16> %1 to <8 x i32>
  %3 = shl <8 x i32> %2, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  ret <8 x i32> %3
}

define <8 x i32> @combine_vec_shl_ext_shl1(<8 x i16> %x) {
; SSE-LABEL: combine_vec_shl_ext_shl1:
; SSE: # %bb.0:
; SSE-NEXT: xorps %xmm0, %xmm0
; SSE-NEXT: xorps %xmm1, %xmm1
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_shl_ext_shl1:
; AVX: # %bb.0:
; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
  %1 = shl <8 x i16> %x, <i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8>
  %2 = sext <8 x i16> %1 to <8 x i32>
  %3 = shl <8 x i32> %2, <i32 31, i32 31, i32 30, i32 30, i32 29, i32 29, i32 28, i32 28>
  ret <8 x i32> %3
}

define <8 x i32> @combine_vec_shl_ext_shl2(<8 x i16> %x) {
; SSE2-LABEL: combine_vec_shl_ext_shl2:
; SSE2: # %bb.0:
; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; SSE2-NEXT: psrad $16, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,2,2,3]
; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
; SSE2-NEXT: psrad $16, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,2,2,3]
; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE2-NEXT: movdqa %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: combine_vec_shl_ext_shl2:
; SSE41: # %bb.0:
; SSE41-NEXT: pmovsxwd %xmm0, %xmm2
; SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE41-NEXT: pmovsxwd %xmm0, %xmm1
; SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: combine_vec_shl_ext_shl2:
; AVX: # %bb.0:
; AVX-NEXT: vpmovsxwd %xmm0, %ymm0
; AVX-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX-NEXT: retq
  %1 = shl <8 x i16> %x, <i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8>
  %2 = sext <8 x i16> %1 to <8 x i32>
  %3 = shl <8 x i32> %2, <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
  ret <8 x i32> %3
}

; fold (shl (zext (srl x, C)), C) -> (zext (shl (srl x, C), C))
define <8 x i32> @combine_vec_shl_zext_lshr0(<8 x i16> %x) {
; SSE2-LABEL: combine_vec_shl_zext_lshr0:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: pxor %xmm2, %xmm2
; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; SSE2-NEXT: retq
;
; SSE41-LABEL: combine_vec_shl_zext_lshr0:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm1
; SSE41-NEXT: pxor %xmm2, %xmm2
; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; SSE41-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; SSE41-NEXT: retq
;
; AVX2-LABEL: combine_vec_shl_zext_lshr0:
; AVX2: # %bb.0:
; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX2-NEXT: retq
;
; AVX512-LABEL: combine_vec_shl_zext_lshr0:
; AVX512: # %bb.0:
; AVX512-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX512-NEXT: retq
  %1 = lshr <8 x i16> %x, <i16 4, i16 4, i16 4, i16 4, i16 4, i16 4, i16 4, i16 4>
  %2 = zext <8 x i16> %1 to <8 x i32>
  %3 = shl <8 x i32> %2, <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>
  ret <8 x i32> %3
}

define <8 x i32> @combine_vec_shl_zext_lshr1(<8 x i16> %x) {
; SSE2-LABEL: combine_vec_shl_zext_lshr1:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: pxor %xmm2, %xmm2
; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; SSE2-NEXT: retq
;
; SSE41-LABEL: combine_vec_shl_zext_lshr1:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm1
; SSE41-NEXT: pxor %xmm2, %xmm2
; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; SSE41-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; SSE41-NEXT: retq
;
; AVX-LABEL: combine_vec_shl_zext_lshr1:
; AVX: # %bb.0:
; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX-NEXT: retq
  %1 = lshr <8 x i16> %x, <i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 15>
  %2 = zext <8 x i16> %1 to <8 x i32>
  %3 = shl <8 x i32> %2, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 15>
  ret <8 x i32> %3
}

; fold (shl (sr[la] exact X, C1), C2) -> (shl X, (C2-C1)) if C1 <= C2
define <4 x i32> @combine_vec_shl_ge_ashr_exact0(<4 x i32> %x) {
; SSE-LABEL: combine_vec_shl_ge_ashr_exact0:
; SSE: # %bb.0:
; SSE-NEXT: pslld $2, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_shl_ge_ashr_exact0:
; AVX: # %bb.0:
; AVX-NEXT: vpslld $2, %xmm0, %xmm0
; AVX-NEXT: retq
  %1 = ashr exact <4 x i32> %x, <i32 3, i32 3, i32 3, i32 3>
  %2 = shl <4 x i32> %1, <i32 5, i32 5, i32 5, i32 5>
  ret <4 x i32> %2
}

define <4 x i32> @combine_vec_shl_ge_ashr_exact1(<4 x i32> %x) {
; SSE2-LABEL: combine_vec_shl_ge_ashr_exact1:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: pslld $2, %xmm1
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[2,0]
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0]
; SSE2-NEXT: movaps %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: combine_vec_shl_ge_ashr_exact1:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm1
; SSE41-NEXT: pslld $2, %xmm1
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7]
; SSE41-NEXT: retq
;
; AVX-LABEL: combine_vec_shl_ge_ashr_exact1:
; AVX: # %bb.0:
; AVX-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT: retq
  %1 = ashr exact <4 x i32> %x, <i32 3, i32 4, i32 5, i32 8>
  %2 = shl <4 x i32> %1, <i32 5, i32 6, i32 7, i32 8>
  ret <4 x i32> %2
}

; fold (shl (sr[la] exact SEL(X,Y), C1), C2) -> (shl SEL(X,Y), (C2-C1)) if C1 <= C2
define i32 @combine_shl_ge_sel_ashr_exact0(i32 %x, i32 %y, i32 %z) {
; CHECK-LABEL: combine_shl_ge_sel_ashr_exact0:
; CHECK: # %bb.0:
; CHECK-NEXT: # kill: def $edi killed $edi def $rdi
; CHECK-NEXT: testl %edx, %edx
; CHECK-NEXT: cmovel %esi, %edi
; CHECK-NEXT: leal (,%rdi,4), %eax
; CHECK-NEXT: retq
  %cmp = icmp ne i32 %z, 0
  %ashrx = ashr exact i32 %x, 3
  %ashry = ashr exact i32 %y, 3
  %sel = select i1 %cmp, i32 %ashrx, i32 %ashry
  %shl = shl i32 %sel, 5
  ret i32 %shl
}

; fold (shl (sr[la] exact X, C1), C2) -> (sr[la] X, (C2-C1)) if C1 > C2
define <4 x i32> @combine_vec_shl_lt_ashr_exact0(<4 x i32> %x) {
; SSE-LABEL: combine_vec_shl_lt_ashr_exact0:
; SSE: # %bb.0:
; SSE-NEXT: psrad $2, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_shl_lt_ashr_exact0:
; AVX: # %bb.0:
; AVX-NEXT: vpsrad $2, %xmm0, %xmm0
; AVX-NEXT: retq
  %1 = ashr exact <4 x i32> %x, <i32 5, i32 5, i32 5, i32 5>
  %2 = shl <4 x i32> %1, <i32 3, i32 3, i32 3, i32 3>
  ret <4 x i32> %2
}

define <4 x i32> @combine_vec_shl_lt_ashr_exact1(<4 x i32> %x) {
; SSE2-LABEL: combine_vec_shl_lt_ashr_exact1:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrad $2, %xmm1
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[2,0]
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0]
; SSE2-NEXT: movaps %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: combine_vec_shl_lt_ashr_exact1:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm1
; SSE41-NEXT: psrad $2, %xmm1
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7]
; SSE41-NEXT: retq
;
; AVX-LABEL: combine_vec_shl_lt_ashr_exact1:
; AVX: # %bb.0:
; AVX-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT: retq
  %1 = ashr exact <4 x i32> %x, <i32 5, i32 6, i32 7, i32 8>
  %2 = shl <4 x i32> %1, <i32 3, i32 4, i32 5, i32 8>
  ret <4 x i32> %2
}

; fold (shl (srl x, c1), c2) -> (and (shl x, (sub c2, c1), MASK) if C2 > C1
define <4 x i32> @combine_vec_shl_gt_lshr0(<4 x i32> %x) {
; SSE-LABEL: combine_vec_shl_gt_lshr0:
; SSE: # %bb.0:
; SSE-NEXT: pslld $2, %xmm0
; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX2-LABEL: combine_vec_shl_gt_lshr0:
; AVX2: # %bb.0:
; AVX2-NEXT: vpslld $2, %xmm0, %xmm0
; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [4294967264,4294967264,4294967264,4294967264]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: combine_vec_shl_gt_lshr0:
; AVX512: # %bb.0:
; AVX512-NEXT: vpslld $2, %xmm0, %xmm0
; AVX512-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
; AVX512-NEXT: retq
  %1 = lshr <4 x i32> %x, <i32 3, i32 3, i32 3, i32 3>
  %2 = shl <4 x i32> %1, <i32 5, i32 5, i32 5, i32 5>
  ret <4 x i32> %2
}

define <4 x i32> @combine_vec_shl_gt_lshr1(<4 x i32> %x) {
; SSE-LABEL: combine_vec_shl_gt_lshr1:
; SSE: # %bb.0:
; SSE-NEXT: pslld $2, %xmm0
; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_shl_gt_lshr1:
; AVX: # %bb.0:
; AVX-NEXT: vpslld $2, %xmm0, %xmm0
; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT: retq
  %1 = lshr <4 x i32> %x, <i32 3, i32 4, i32 5, i32 29>
  %2 = shl <4 x i32> %1, <i32 5, i32 6, i32 7, i32 31>
  ret <4 x i32> %2
}

; fold (shl (srl x, c1), c2) -> (and (srl x, (sub c1, c2), MASK) if C1 >= C2
define <4 x i32> @combine_vec_shl_le_lshr0(<4 x i32> %x) {
; SSE-LABEL: combine_vec_shl_le_lshr0:
; SSE: # %bb.0:
; SSE-NEXT: psrld $2, %xmm0
; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX2-LABEL: combine_vec_shl_le_lshr0:
; AVX2: # %bb.0:
; AVX2-NEXT: vpsrld $2, %xmm0, %xmm0
; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1073741816,1073741816,1073741816,1073741816]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: combine_vec_shl_le_lshr0:
; AVX512: # %bb.0:
; AVX512-NEXT: vpsrld $2, %xmm0, %xmm0
; AVX512-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
; AVX512-NEXT: retq
  %1 = lshr <4 x i32> %x, <i32 5, i32 5, i32 5, i32 5>
  %2 = shl <4 x i32> %1, <i32 3, i32 3, i32 3, i32 3>
  ret <4 x i32> %2
}

define <4 x i32> @combine_vec_shl_le_lshr1(<4 x i32> %x) {
; SSE2-LABEL: combine_vec_shl_le_lshr1:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrld $2, %xmm1
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[2,0]
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0]
; SSE2-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE2-NEXT: movaps %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: combine_vec_shl_le_lshr1:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm1
; SSE41-NEXT: psrld $2, %xmm1
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7]
; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: combine_vec_shl_le_lshr1:
; AVX: # %bb.0:
; AVX-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT: retq
  %1 = lshr <4 x i32> %x, <i32 5, i32 6, i32 7, i32 8>
  %2 = shl <4 x i32> %1, <i32 3, i32 4, i32 5, i32 8>
  ret <4 x i32> %2
}

; fold (shl (sra x, c1), c1) -> (and x, (shl -1, c1))
define <4 x i32> @combine_vec_shl_ashr0(<4 x i32> %x) {
; SSE-LABEL: combine_vec_shl_ashr0:
; SSE: # %bb.0:
; SSE-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX2-LABEL: combine_vec_shl_ashr0:
; AVX2: # %bb.0:
; AVX2-NEXT: vbroadcastss {{.*#+}} xmm1 = [4294967264,4294967264,4294967264,4294967264]
; AVX2-NEXT: vandps %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: combine_vec_shl_ashr0:
; AVX512: # %bb.0:
; AVX512-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
; AVX512-NEXT: retq
  %1 = ashr <4 x i32> %x, <i32 5, i32 5, i32 5, i32 5>
  %2 = shl <4 x i32> %1, <i32 5, i32 5, i32 5, i32 5>
  ret <4 x i32> %2
}

define <4 x i32> @combine_vec_shl_ashr1(<4 x i32> %x) {
; SSE-LABEL: combine_vec_shl_ashr1:
; SSE: # %bb.0:
; SSE-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_shl_ashr1:
; AVX: # %bb.0:
; AVX-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT: retq
  %1 = ashr <4 x i32> %x, <i32 5, i32 6, i32 7, i32 8>
  %2 = shl <4 x i32> %1, <i32 5, i32 6, i32 7, i32 8>
  ret <4 x i32> %2
}

; fold (shl (add x, c1), c2) -> (add (shl x, c2), c1 << c2)
define <4 x i32> @combine_vec_shl_add0(<4 x i32> %x) {
; SSE-LABEL: combine_vec_shl_add0:
; SSE: # %bb.0:
; SSE-NEXT: pslld $2, %xmm0
; SSE-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX2-LABEL: combine_vec_shl_add0:
; AVX2: # %bb.0:
; AVX2-NEXT: vpslld $2, %xmm0, %xmm0
; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [20,20,20,20]
; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: combine_vec_shl_add0:
; AVX512: # %bb.0:
; AVX512-NEXT: vpslld $2, %xmm0, %xmm0
; AVX512-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
; AVX512-NEXT: retq
  %1 = add <4 x i32> %x, <i32 5, i32 5, i32 5, i32 5>
  %2 = shl <4 x i32> %1, <i32 2, i32 2, i32 2, i32 2>
  ret <4 x i32> %2
}

define <4 x i32> @combine_vec_shl_add1(<4 x i32> %x) {
; SSE2-LABEL: combine_vec_shl_add1:
; SSE2: # %bb.0:
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: combine_vec_shl_add1:
; SSE41: # %bb.0:
; SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: combine_vec_shl_add1:
; AVX: # %bb.0:
; AVX-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT: retq
  %1 = add <4 x i32> %x, <i32 5, i32 6, i32 7, i32 8>
  %2 = shl <4 x i32> %1, <i32 1, i32 2, i32 3, i32 4>
  ret <4 x i32> %2
}

; fold (shl (or x, c1), c2) -> (or (shl x, c2), c1 << c2)
define <4 x i32> @combine_vec_shl_or0(<4 x i32> %x) {
; SSE-LABEL: combine_vec_shl_or0:
; SSE: # %bb.0:
; SSE-NEXT: pslld $2, %xmm0
; SSE-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX2-LABEL: combine_vec_shl_or0:
; AVX2: # %bb.0:
; AVX2-NEXT: vpslld $2, %xmm0, %xmm0
; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [20,20,20,20]
; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: combine_vec_shl_or0:
; AVX512: # %bb.0:
; AVX512-NEXT: vpslld $2, %xmm0, %xmm0
; AVX512-NEXT: vpord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
; AVX512-NEXT: retq
  %1 = or <4 x i32> %x, <i32 5, i32 5, i32 5, i32 5>
  %2 = shl <4 x i32> %1, <i32 2, i32 2, i32 2, i32 2>
  ret <4 x i32> %2
}

define <4 x i32> @combine_vec_shl_or1(<4 x i32> %x) {
; SSE2-LABEL: combine_vec_shl_or1:
; SSE2: # %bb.0:
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: combine_vec_shl_or1:
; SSE41: # %bb.0:
; SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE41-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: combine_vec_shl_or1:
; AVX: # %bb.0:
; AVX-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT: retq
  %1 = or <4 x i32> %x, <i32 5, i32 6, i32 7, i32 8>
  %2 = shl <4 x i32> %1, <i32 1, i32 2, i32 3, i32 4>
  ret <4 x i32> %2
}

; fold (shl (mul x, c1), c2) -> (mul x, c1 << c2)
define <4 x i32> @combine_vec_shl_mul0(<4 x i32> %x) {
; SSE2-LABEL: combine_vec_shl_mul0:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [20,20,20,20]
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm1, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT: pmuludq %xmm1, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: retq
;
; SSE41-LABEL: combine_vec_shl_mul0:
; SSE41: # %bb.0:
; SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE41-NEXT: retq
;
; AVX2-LABEL: combine_vec_shl_mul0:
; AVX2: # %bb.0:
; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [20,20,20,20]
; AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: combine_vec_shl_mul0:
; AVX512: # %bb.0:
; AVX512-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
; AVX512-NEXT: retq
  %1 = mul <4 x i32> %x, <i32 5, i32 5, i32 5, i32 5>
  %2 = shl <4 x i32> %1, <i32 2, i32 2, i32 2, i32 2>
  ret <4 x i32> %2
}

define <4 x i32> @combine_vec_shl_mul1(<4 x i32> %x) {
; SSE2-LABEL: combine_vec_shl_mul1:
; SSE2: # %bb.0:
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: retq
;
; SSE41-LABEL: combine_vec_shl_mul1:
; SSE41: # %bb.0:
; SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: combine_vec_shl_mul1:
; AVX: # %bb.0:
; AVX-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT: retq
  %1 = mul <4 x i32> %x, <i32 5, i32 6, i32 7, i32 8>
  %2 = shl <4 x i32> %1, <i32 1, i32 2, i32 3, i32 4>
  ret <4 x i32> %2
}

; fold (add (shl x, c1), c2) -> (or (shl x, c1), c2)
define <4 x i32> @combine_vec_add_shl_nonsplat(<4 x i32> %a0) {
; SSE2-LABEL: combine_vec_add_shl_nonsplat:
; SSE2: # %bb.0:
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: combine_vec_add_shl_nonsplat:
; SSE41: # %bb.0:
; SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE41-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE41-NEXT: retq
;
; AVX2-LABEL: combine_vec_add_shl_nonsplat:
; AVX2: # %bb.0:
; AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [3,3,3,3]
; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: combine_vec_add_shl_nonsplat:
; AVX512: # %bb.0:
; AVX512-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: vpord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
; AVX512-NEXT: retq
  %1 = shl <4 x i32> %a0, <i32 2, i32 3, i32 4, i32 5>
  %2 = add <4 x i32> %1, <i32 3, i32 3, i32 3, i32 3>
  ret <4 x i32> %2
}

define <4 x i32> @combine_vec_add_shl_and_nonsplat(<4 x i32> %a0) {
; SSE2-LABEL: combine_vec_add_shl_and_nonsplat:
; SSE2: # %bb.0:
; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [4,8,16,32]
; SSE2-NEXT: pmuludq %xmm0, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE2-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: combine_vec_add_shl_and_nonsplat:
; SSE41: # %bb.0:
; SSE41-NEXT: pxor %xmm1, %xmm1
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3],xmm1[4],xmm0[5],xmm1[6],xmm0[7]
; SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE41-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE41-NEXT: retq
;
; AVX2-LABEL: combine_vec_add_shl_and_nonsplat:
; AVX2: # %bb.0:
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3],xmm1[4],xmm0[5],xmm1[6],xmm0[7]
; AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [15,15,15,15]
; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: combine_vec_add_shl_and_nonsplat:
; AVX512: # %bb.0:
; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3],xmm1[4],xmm0[5],xmm1[6],xmm0[7]
; AVX512-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: vpord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
; AVX512-NEXT: retq
  %1 = and <4 x i32> %a0, <i32 4294901760, i32 4294901760, i32 4294901760, i32 4294901760>
  %2 = shl <4 x i32> %1, <i32 2, i32 3, i32 4, i32 5>
  %3 = add <4 x i32> %2, <i32 15, i32 15, i32 15, i32 15>
  ret <4 x i32> %3
}

define <4 x i32> @combine_vec_add_shuffle_shl(<4 x i32> %a0) {
; SSE2-LABEL: combine_vec_add_shuffle_shl:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: pslld $3, %xmm1
; SSE2-NEXT: pslld $2, %xmm0
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,3,0]
; SSE2-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: combine_vec_add_shuffle_shl:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm1
; SSE41-NEXT: pslld $3, %xmm1
; SSE41-NEXT: pslld $2, %xmm0
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,0]
; SSE41-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE41-NEXT: retq
;
; AVX2-LABEL: combine_vec_add_shuffle_shl:
; AVX2: # %bb.0:
; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,1,0]
; AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [3,3,3,3]
; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: combine_vec_add_shuffle_shl:
; AVX512: # %bb.0:
; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,1,0]
; AVX512-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: vpord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
; AVX512-NEXT: retq
  %1 = shl <4 x i32> %a0, <i32 2, i32 3, i32 0, i32 1>
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 1, i32 0>
  %3 = add <4 x i32> %2, <i32 3, i32 3, i32 3, i32 3>
  ret <4 x i32> %3
}

define <4 x i32> @combine_vec_shl_clamped1(<4 x i32> %sh, <4 x i32> %amt) {
; SSE2-LABEL: combine_vec_shl_clamped1:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
; SSE2-NEXT: pxor %xmm1, %xmm2
; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE2-NEXT: pslld $23, %xmm1
; SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE2-NEXT: cvttps2dq %xmm1, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm1, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm3, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: pandn %xmm0, %xmm2
; SSE2-NEXT: movdqa %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: combine_vec_shl_clamped1:
; SSE41: # %bb.0:
; SSE41-NEXT: pmovsxbd {{.*#+}} xmm2 = [31,31,31,31]
; SSE41-NEXT: pminud %xmm1, %xmm2
; SSE41-NEXT: pcmpeqd %xmm1, %xmm2
; SSE41-NEXT: pslld $23, %xmm1
; SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE41-NEXT: cvttps2dq %xmm1, %xmm1
; SSE41-NEXT: pmulld %xmm1, %xmm0
; SSE41-NEXT: pand %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: combine_vec_shl_clamped1:
; AVX: # %bb.0:
; AVX-NEXT: vpsllvd %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
  %cmp.i = icmp ult <4 x i32> %amt, <i32 32, i32 32, i32 32, i32 32>
  %shl = shl <4 x i32> %sh, %amt
  %1 = select <4 x i1> %cmp.i, <4 x i32> %shl, <4 x i32> zeroinitializer
  ret <4 x i32> %1
}

define <4 x i32> @combine_vec_shl_clamped2(<4 x i32> %sh, <4 x i32> %amt) {
; SSE2-LABEL: combine_vec_shl_clamped2:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
; SSE2-NEXT: pxor %xmm1, %xmm2
; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE2-NEXT: pandn %xmm0, %xmm2
; SSE2-NEXT: pslld $23, %xmm1
; SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE2-NEXT: cvttps2dq %xmm1, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm1, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm3, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: retq
;
; SSE41-LABEL: combine_vec_shl_clamped2:
; SSE41: # %bb.0:
; SSE41-NEXT: pmovsxbd {{.*#+}} xmm2 = [31,31,31,31]
; SSE41-NEXT: pminud %xmm1, %xmm2
; SSE41-NEXT: pcmpeqd %xmm1, %xmm2
; SSE41-NEXT: pand %xmm2, %xmm0
; SSE41-NEXT: pslld $23, %xmm1
; SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE41-NEXT: cvttps2dq %xmm1, %xmm1
; SSE41-NEXT: pmulld %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: combine_vec_shl_clamped2:
; AVX: # %bb.0:
; AVX-NEXT: vpsllvd %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
  %cmp.i = icmp ult <4 x i32> %amt, <i32 32, i32 32, i32 32, i32 32>
  %1 = select <4 x i1> %cmp.i, <4 x i32> %sh, <4 x i32> zeroinitializer
  %shl = shl <4 x i32> %1, %amt
  ret <4 x i32> %shl
}

define <4 x i32> @combine_vec_shl_commuted_clamped(<4 x i32> %sh, <4 x i32> %amt) {
; SSE2-LABEL: combine_vec_shl_commuted_clamped:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
; SSE2-NEXT: pxor %xmm1, %xmm2
; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE2-NEXT: pandn %xmm0, %xmm2
; SSE2-NEXT: pslld $23, %xmm1
; SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE2-NEXT: cvttps2dq %xmm1, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm1, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm3, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: retq
;
; SSE41-LABEL: combine_vec_shl_commuted_clamped:
; SSE41: # %bb.0:
; SSE41-NEXT: pmovsxbd {{.*#+}} xmm2 = [31,31,31,31]
; SSE41-NEXT: pminud %xmm1, %xmm2
; SSE41-NEXT: pcmpeqd %xmm1, %xmm2
; SSE41-NEXT: pand %xmm2, %xmm0
; SSE41-NEXT: pslld $23, %xmm1
; SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE41-NEXT: cvttps2dq %xmm1, %xmm1
; SSE41-NEXT: pmulld %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: combine_vec_shl_commuted_clamped:
; AVX: # %bb.0:
; AVX-NEXT: vpsllvd %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
  %cmp.i = icmp uge <4 x i32> %amt, <i32 32, i32 32, i32 32, i32 32>
  %1 = select <4 x i1> %cmp.i, <4 x i32> zeroinitializer, <4 x i32> %sh
  %shl = shl <4 x i32> %1, %amt
  ret <4 x i32> %shl
}

define <4 x i32> @combine_vec_shl_commuted_clamped1(<4 x i32> %sh, <4 x i32> %amt) {
; SSE2-LABEL: combine_vec_shl_commuted_clamped1:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: pslld $23, %xmm2
; SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE2-NEXT: cvttps2dq %xmm2, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm2, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm3, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE2-NEXT: pandn %xmm0, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: combine_vec_shl_commuted_clamped1:
; SSE41: # %bb.0:
; SSE41-NEXT: pmovsxbd {{.*#+}} xmm2 = [31,31,31,31]
; SSE41-NEXT: pminud %xmm1, %xmm2
; SSE41-NEXT: pcmpeqd %xmm1, %xmm2
; SSE41-NEXT: pslld $23, %xmm1
; SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE41-NEXT: cvttps2dq %xmm1, %xmm1
; SSE41-NEXT: pmulld %xmm1, %xmm0
; SSE41-NEXT: pand %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: combine_vec_shl_commuted_clamped1:
; AVX: # %bb.0:
; AVX-NEXT: vpsllvd %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
  %cmp.i = icmp uge <4 x i32> %amt, <i32 32, i32 32, i32 32, i32 32>
  %shl = shl <4 x i32> %sh, %amt
  %1 = select <4 x i1> %cmp.i, <4 x i32> zeroinitializer, <4 x i32> %shl
  ret <4 x i32> %1
}