; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=CHECK,SSE,SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2,AVX2-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=CHECK,AVX,AVX2,AVX2-FAST-ALL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=CHECK,AVX,AVX2,AVX2-FAST-PERLANE
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=CHECK,AVX,AVX512

; fold (sra 0, x) -> 0
define <4 x i32> @combine_vec_ashr_zero(<4 x i32> %x) {
; SSE-LABEL: combine_vec_ashr_zero:
; SSE: # %bb.0:
; SSE-NEXT: xorps %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_ashr_zero:
; AVX: # %bb.0:
; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
  %1 = ashr <4 x i32> zeroinitializer, %x
  ret <4 x i32> %1
}

; fold (sra -1, x) -> -1
define <4 x i32> @combine_vec_ashr_allones(<4 x i32> %x) {
; SSE-LABEL: combine_vec_ashr_allones:
; SSE: # %bb.0:
; SSE-NEXT: pcmpeqd %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_ashr_allones:
; AVX: # %bb.0:
; AVX-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
  %1 = ashr <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, %x
  ret <4 x i32> %1
}

; fold (sra x, c >= size(x)) -> undef
define <4 x i32> @combine_vec_ashr_outofrange0(<4 x i32> %x) {
; CHECK-LABEL: combine_vec_ashr_outofrange0:
; CHECK: # %bb.0:
; CHECK-NEXT: retq
  %1 = ashr <4 x i32> %x, <i32 33, i32 33, i32 33, i32 33>
  ret <4 x i32> %1
}

define <4 x i32> @combine_vec_ashr_outofrange1(<4 x i32> %x) {
; CHECK-LABEL: combine_vec_ashr_outofrange1:
; CHECK: # %bb.0:
; CHECK-NEXT: retq
  %1 = ashr <4 x i32> %x, <i32 33, i32 34, i32 35, i32 36>
  ret <4 x i32> %1
}

define <4 x i32> @combine_vec_ashr_outofrange2(<4 x i32> %x) {
; CHECK-LABEL: combine_vec_ashr_outofrange2:
; CHECK: # %bb.0:
; CHECK-NEXT: retq
  %1 = ashr <4 x i32> %x, <i32 33, i32 34, i32 35, i32 undef>
  ret <4 x i32> %1
}

; fold (sra x, 0) -> x
define <4 x i32> @combine_vec_ashr_by_zero(<4 x i32> %x) {
; CHECK-LABEL: combine_vec_ashr_by_zero:
; CHECK: # %bb.0:
; CHECK-NEXT: retq
  %1 = ashr <4 x i32> %x, zeroinitializer
  ret <4 x i32> %1
}

; fold (sra (sra x, c1), c2) -> (sra x, (add c1, c2))
define <4 x i32> @combine_vec_ashr_ashr0(<4 x i32> %x) {
; SSE-LABEL: combine_vec_ashr_ashr0:
; SSE: # %bb.0:
; SSE-NEXT: psrad $6, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_ashr_ashr0:
; AVX: # %bb.0:
; AVX-NEXT: vpsrad $6, %xmm0, %xmm0
; AVX-NEXT: retq
  %1 = ashr <4 x i32> %x, <i32 2, i32 2, i32 2, i32 2>
  %2 = ashr <4 x i32> %1, <i32 4, i32 4, i32 4, i32 4>
  ret <4 x i32> %2
}

define <4 x i32> @combine_vec_ashr_ashr1(<4 x i32> %x) {
; SSE2-LABEL: combine_vec_ashr_ashr1:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrad $10, %xmm1
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: psrad $8, %xmm2
; SSE2-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm1[1]
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrad $6, %xmm1
; SSE2-NEXT: psrad $4, %xmm0
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm2[0,3]
; SSE2-NEXT: retq
;
; SSE41-LABEL: combine_vec_ashr_ashr1:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm1
; SSE41-NEXT: psrad $10, %xmm1
; SSE41-NEXT: movdqa %xmm0, %xmm2
; SSE41-NEXT: psrad $6, %xmm2
; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7]
; SSE41-NEXT: movdqa %xmm0, %xmm1
; SSE41-NEXT: psrad $8, %xmm1
; SSE41-NEXT: psrad $4, %xmm0
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; SSE41-NEXT: retq
;
; AVX-LABEL: combine_vec_ashr_ashr1:
; AVX: # %bb.0:
; AVX-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT: retq
  %1 = ashr <4 x i32> %x, <i32 0, i32 1, i32 2, i32 3>
  %2 = ashr <4 x i32> %1, <i32 4, i32 5, i32 6, i32 7>
  ret <4 x i32> %2
}

define <4 x i32> @combine_vec_ashr_ashr2(<4 x i32> %x) {
; SSE-LABEL: combine_vec_ashr_ashr2:
; SSE: # %bb.0:
; SSE-NEXT: psrad $31, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_ashr_ashr2:
; AVX: # %bb.0:
; AVX-NEXT: vpsrad $31, %xmm0, %xmm0
; AVX-NEXT: retq
  %1 = ashr <4 x i32> %x, <i32 17, i32 18, i32 19, i32 20>
  %2 = ashr <4 x i32> %1, <i32 25, i32 26, i32 27, i32 28>
  ret <4 x i32> %2
}

define <4 x i32> @combine_vec_ashr_ashr3(<4 x i32> %x) {
; SSE2-LABEL: combine_vec_ashr_ashr3:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: psrad $27, %xmm2
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrad $31, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm3
; SSE2-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm2[1]
; SSE2-NEXT: psrad $15, %xmm0
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm3[0,3]
; SSE2-NEXT: movaps %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: combine_vec_ashr_ashr3:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm1
; SSE41-NEXT: psrad $27, %xmm1
; SSE41-NEXT: movdqa %xmm0, %xmm2
; SSE41-NEXT: psrad $15, %xmm2
; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7]
; SSE41-NEXT: psrad $31, %xmm0
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; SSE41-NEXT: retq
;
; AVX-LABEL: combine_vec_ashr_ashr3:
; AVX: # %bb.0:
; AVX-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT: retq
  %1 = ashr <4 x i32> %x, <i32 1, i32 5, i32 50, i32 27>
  %2 = ashr <4 x i32> %1, <i32 33, i32 10, i32 33, i32 0>
  ret <4 x i32> %2
}

; fold (sra x, (trunc (and y, c))) -> (sra x, (and (trunc y), (trunc c))).
define <4 x i32> @combine_vec_ashr_trunc_and(<4 x i32> %x, <4 x i64> %y) {
; SSE2-LABEL: combine_vec_ashr_trunc_and:
; SSE2: # %bb.0:
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
; SSE2-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7]
; SSE2-NEXT: movdqa %xmm0, %xmm3
; SSE2-NEXT: psrad %xmm2, %xmm3
; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm1[0,1,1,1,4,5,6,7]
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: psrad %xmm4, %xmm2
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[2,3,3,3,4,5,6,7]
; SSE2-NEXT: movdqa %xmm0, %xmm4
; SSE2-NEXT: psrad %xmm3, %xmm4
; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7]
; SSE2-NEXT: psrad %xmm1, %xmm0
; SSE2-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm4[1]
; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[0,3]
; SSE2-NEXT: movaps %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: combine_vec_ashr_trunc_and:
; SSE41: # %bb.0:
; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
; SSE41-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE41-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7]
; SSE41-NEXT: movdqa %xmm0, %xmm3
; SSE41-NEXT: psrad %xmm2, %xmm3
; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
; SSE41-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[2,3,3,3,4,5,6,7]
; SSE41-NEXT: movdqa %xmm0, %xmm5
; SSE41-NEXT: psrad %xmm4, %xmm5
; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm3[0,1,2,3],xmm5[4,5,6,7]
; SSE41-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7]
; SSE41-NEXT: movdqa %xmm0, %xmm3
; SSE41-NEXT: psrad %xmm1, %xmm3
; SSE41-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,1,1,1,4,5,6,7]
; SSE41-NEXT: psrad %xmm1, %xmm0
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7]
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3],xmm0[4,5],xmm5[6,7]
; SSE41-NEXT: retq
;
; AVX2-SLOW-LABEL: combine_vec_ashr_trunc_and:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
; AVX2-SLOW-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX2-SLOW-NEXT: vpsravd %xmm1, %xmm0, %xmm0
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-ALL-LABEL: combine_vec_ashr_trunc_and:
; AVX2-FAST-ALL: # %bb.0:
; AVX2-FAST-ALL-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,2,4,6,0,0,0,0]
; AVX2-FAST-ALL-NEXT: vpermd %ymm1, %ymm2, %ymm1
; AVX2-FAST-ALL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX2-FAST-ALL-NEXT: vpsravd %xmm1, %xmm0, %xmm0
; AVX2-FAST-ALL-NEXT: vzeroupper
; AVX2-FAST-ALL-NEXT: retq
;
; AVX2-FAST-PERLANE-LABEL: combine_vec_ashr_trunc_and:
; AVX2-FAST-PERLANE: # %bb.0:
; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
; AVX2-FAST-PERLANE-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX2-FAST-PERLANE-NEXT: vpsravd %xmm1, %xmm0, %xmm0
; AVX2-FAST-PERLANE-NEXT: vzeroupper
; AVX2-FAST-PERLANE-NEXT: retq
;
; AVX512-LABEL: combine_vec_ashr_trunc_and:
; AVX512: # %bb.0:
; AVX512-NEXT: vpmovqd %ymm1, %xmm1
; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX512-NEXT: vpsravd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = and <4 x i64> %y, <i64 15, i64 255, i64 4095, i64 65535>
  %2 = trunc <4 x i64> %1 to <4 x i32>
  %3 = ashr <4 x i32> %x, %2
  ret <4 x i32> %3
}

; fold (sra (trunc (srl x, c1)), c2) -> (trunc (sra x, c1 + c2))
; if c1 is equal to the number of bits the trunc removes
define <4 x i32> @combine_vec_ashr_trunc_lshr(<4 x i64> %x) {
; SSE2-LABEL: combine_vec_ashr_trunc_lshr:
; SSE2: # %bb.0:
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
; SSE2-NEXT: movaps %xmm0, %xmm1
; SSE2-NEXT: psrad $3, %xmm1
; SSE2-NEXT: movaps %xmm0, %xmm2
; SSE2-NEXT: psrad $2, %xmm2
; SSE2-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm1[1]
; SSE2-NEXT: movaps %xmm0, %xmm1
; SSE2-NEXT: psrad $1, %xmm1
; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm2[0,3]
; SSE2-NEXT: retq
;
; SSE41-LABEL: combine_vec_ashr_trunc_lshr:
; SSE41: # %bb.0:
; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
; SSE41-NEXT: movaps %xmm0, %xmm2
; SSE41-NEXT: psrad $2, %xmm2
; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1,2,3],xmm2[4,5,6,7]
; SSE41-NEXT: psrad $1, %xmm0
; SSE41-NEXT: psrad $3, %xmm1
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3],xmm2[4,5],xmm0[6,7]
; SSE41-NEXT: retq
;
; AVX2-SLOW-LABEL: combine_vec_ashr_trunc_lshr:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
; AVX2-SLOW-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-ALL-LABEL: combine_vec_ashr_trunc_lshr:
; AVX2-FAST-ALL: # %bb.0:
; AVX2-FAST-ALL-NEXT: vpmovsxbd {{.*#+}} xmm1 = [1,3,5,7]
; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0
; AVX2-FAST-ALL-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-FAST-ALL-NEXT: vzeroupper
; AVX2-FAST-ALL-NEXT: retq
;
; AVX2-FAST-PERLANE-LABEL: combine_vec_ashr_trunc_lshr:
; AVX2-FAST-PERLANE: # %bb.0:
; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
; AVX2-FAST-PERLANE-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-FAST-PERLANE-NEXT: vzeroupper
; AVX2-FAST-PERLANE-NEXT: retq
;
; AVX512-LABEL: combine_vec_ashr_trunc_lshr:
; AVX512: # %bb.0:
; AVX512-NEXT: vpsrlq $32, %ymm0, %ymm0
; AVX512-NEXT: vpmovqd %ymm0, %xmm0
; AVX512-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = lshr <4 x i64> %x, <i64 32, i64 32, i64 32, i64 32>
  %2 = trunc <4 x i64> %1 to <4 x i32>
  %3 = ashr <4 x i32> %2, <i32 0, i32 1, i32 2, i32 3>
  ret <4 x i32> %3
}

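; as above, but with splat shift amounts and a v16i32 -> v16i8 truncation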
define <16 x i8> @combine_vec_ashr_trunc_lshr_splat(<16 x i32> %x) {
; SSE-LABEL: combine_vec_ashr_trunc_lshr_splat:
; SSE: # %bb.0:
; SSE-NEXT: psrad $26, %xmm3
; SSE-NEXT: psrad $26, %xmm2
; SSE-NEXT: packssdw %xmm3, %xmm2
; SSE-NEXT: psrad $26, %xmm1
; SSE-NEXT: psrad $26, %xmm0
; SSE-NEXT: packssdw %xmm1, %xmm0
; SSE-NEXT: packsswb %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX2-LABEL: combine_vec_ashr_trunc_lshr_splat:
; AVX2: # %bb.0:
; AVX2-NEXT: vpsrad $26, %ymm1, %ymm1
; AVX2-NEXT: vpsrad $26, %ymm0, %ymm0
; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: combine_vec_ashr_trunc_lshr_splat:
; AVX512: # %bb.0:
; AVX512-NEXT: vpsrad $26, %zmm0, %zmm0
; AVX512-NEXT: vpmovdb %zmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = lshr <16 x i32> %x, <i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24>
  %2 = trunc <16 x i32> %1 to <16 x i8>
  %3 = ashr <16 x i8> %2, <i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2>
  ret <16 x i8> %3
}

; fold (sra (trunc (sra x, c1)), c2) -> (trunc (sra x, c1 + c2))
; if c1 is equal to the number of bits the trunc removes
define <4 x i32> @combine_vec_ashr_trunc_ashr(<4 x i64> %x) {
; SSE2-LABEL: combine_vec_ashr_trunc_ashr:
; SSE2: # %bb.0:
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
; SSE2-NEXT: movaps %xmm0, %xmm1
; SSE2-NEXT: psrad $3, %xmm1
; SSE2-NEXT: movaps %xmm0, %xmm2
; SSE2-NEXT: psrad $2, %xmm2
; SSE2-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm1[1]
; SSE2-NEXT: movaps %xmm0, %xmm1
; SSE2-NEXT: psrad $1, %xmm1
; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm2[0,3]
; SSE2-NEXT: retq
;
; SSE41-LABEL: combine_vec_ashr_trunc_ashr:
; SSE41: # %bb.0:
; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
; SSE41-NEXT: movaps %xmm0, %xmm2
; SSE41-NEXT: psrad $2, %xmm2
; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1,2,3],xmm2[4,5,6,7]
; SSE41-NEXT: psrad $1, %xmm0
; SSE41-NEXT: psrad $3, %xmm1
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3],xmm2[4,5],xmm0[6,7]
; SSE41-NEXT: retq
;
; AVX2-SLOW-LABEL: combine_vec_ashr_trunc_ashr:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
; AVX2-SLOW-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-ALL-LABEL: combine_vec_ashr_trunc_ashr:
; AVX2-FAST-ALL: # %bb.0:
; AVX2-FAST-ALL-NEXT: vpmovsxbd {{.*#+}} xmm1 = [1,3,5,7]
; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0
; AVX2-FAST-ALL-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-FAST-ALL-NEXT: vzeroupper
; AVX2-FAST-ALL-NEXT: retq
;
; AVX2-FAST-PERLANE-LABEL: combine_vec_ashr_trunc_ashr:
; AVX2-FAST-PERLANE: # %bb.0:
; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
; AVX2-FAST-PERLANE-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-FAST-PERLANE-NEXT: vzeroupper
; AVX2-FAST-PERLANE-NEXT: retq
;
; AVX512-LABEL: combine_vec_ashr_trunc_ashr:
; AVX512: # %bb.0:
; AVX512-NEXT: vpsrlq $32, %ymm0, %ymm0
; AVX512-NEXT: vpmovqd %ymm0, %xmm0
; AVX512-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = ashr <4 x i64> %x, <i64 32, i64 32, i64 32, i64 32>
  %2 = trunc <4 x i64> %1 to <4 x i32>
  %3 = ashr <4 x i32> %2, <i32 0, i32 1, i32 2, i32 3>
  ret <4 x i32> %3
}

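; as above, but with splat shift amounts and a v8i32 -> v8i16 truncation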
define <8 x i16> @combine_vec_ashr_trunc_ashr_splat(<8 x i32> %x) {
; SSE-LABEL: combine_vec_ashr_trunc_ashr_splat:
; SSE: # %bb.0:
; SSE-NEXT: psrad $19, %xmm1
; SSE-NEXT: psrad $19, %xmm0
; SSE-NEXT: packssdw %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX2-LABEL: combine_vec_ashr_trunc_ashr_splat:
; AVX2: # %bb.0:
; AVX2-NEXT: vpsrad $19, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: combine_vec_ashr_trunc_ashr_splat:
; AVX512: # %bb.0:
; AVX512-NEXT: vpsrad $19, %ymm0, %ymm0
; AVX512-NEXT: vpmovdw %ymm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = ashr <8 x i32> %x, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  %2 = trunc <8 x i32> %1 to <8 x i16>
  %3 = ashr <8 x i16> %2, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
  ret <8 x i16> %3
}

; If the sign bit is known to be zero, switch this to a SRL.
define <4 x i32> @combine_vec_ashr_positive(<4 x i32> %x, <4 x i32> %y) {
; SSE2-LABEL: combine_vec_ashr_positive:
; SSE2: # %bb.0:
; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7]
; SSE2-NEXT: movdqa %xmm0, %xmm3
; SSE2-NEXT: psrld %xmm2, %xmm3
; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm1[0,1,1,1,4,5,6,7]
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: psrld %xmm4, %xmm2
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[2,3,3,3,4,5,6,7]
; SSE2-NEXT: movdqa %xmm0, %xmm4
; SSE2-NEXT: psrld %xmm3, %xmm4
; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7]
; SSE2-NEXT: psrld %xmm1, %xmm0
; SSE2-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm4[1]
; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[0,3]
; SSE2-NEXT: movaps %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: combine_vec_ashr_positive:
; SSE41: # %bb.0:
; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE41-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7]
; SSE41-NEXT: movdqa %xmm0, %xmm3
; SSE41-NEXT: psrld %xmm2, %xmm3
; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
; SSE41-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[2,3,3,3,4,5,6,7]
; SSE41-NEXT: movdqa %xmm0, %xmm5
; SSE41-NEXT: psrld %xmm4, %xmm5
; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm3[0,1,2,3],xmm5[4,5,6,7]
; SSE41-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7]
; SSE41-NEXT: movdqa %xmm0, %xmm3
; SSE41-NEXT: psrld %xmm1, %xmm3
; SSE41-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,1,1,1,4,5,6,7]
; SSE41-NEXT: psrld %xmm1, %xmm0
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7]
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3],xmm0[4,5],xmm5[6,7]
; SSE41-NEXT: retq
;
; AVX-LABEL: combine_vec_ashr_positive:
; AVX: # %bb.0:
; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
  %1 = and <4 x i32> %x, <i32 15, i32 255, i32 4095, i32 65535>
  %2 = ashr <4 x i32> %1, %y
  ret <4 x i32> %2
}

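; with a splat mask and shift amount every masked bit is shifted out, so the result folds to zero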
define <4 x i32> @combine_vec_ashr_positive_splat(<4 x i32> %x, <4 x i32> %y) {
; SSE-LABEL: combine_vec_ashr_positive_splat:
; SSE: # %bb.0:
; SSE-NEXT: xorps %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_ashr_positive_splat:
; AVX: # %bb.0:
; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
  %1 = and <4 x i32> %x, <i32 1023, i32 1023, i32 1023, i32 1023>
  %2 = ashr <4 x i32> %1, <i32 10, i32 10, i32 10, i32 10>
  ret <4 x i32> %2
}

define <8 x i16> @combine_vec8i16_ashr_clamped(<8 x i16> %x, <8 x i16> %y) {
; SSE2-LABEL: combine_vec8i16_ashr_clamped:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: psubusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE2-NEXT: psubw %xmm2, %xmm1
; SSE2-NEXT: psllw $12, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: psraw $15, %xmm2
; SSE2-NEXT: movdqa %xmm2, %xmm3
; SSE2-NEXT: pandn %xmm0, %xmm3
; SSE2-NEXT: psraw $8, %xmm0
; SSE2-NEXT: pand %xmm2, %xmm0
; SSE2-NEXT: por %xmm3, %xmm0
; SSE2-NEXT: paddw %xmm1, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: psraw $15, %xmm2
; SSE2-NEXT: movdqa %xmm2, %xmm3
; SSE2-NEXT: pandn %xmm0, %xmm3
; SSE2-NEXT: psraw $4, %xmm0
; SSE2-NEXT: pand %xmm2, %xmm0
; SSE2-NEXT: por %xmm3, %xmm0
; SSE2-NEXT: paddw %xmm1, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: psraw $15, %xmm2
; SSE2-NEXT: movdqa %xmm2, %xmm3
; SSE2-NEXT: pandn %xmm0, %xmm3
; SSE2-NEXT: psraw $2, %xmm0
; SSE2-NEXT: pand %xmm2, %xmm0
; SSE2-NEXT: por %xmm3, %xmm0
; SSE2-NEXT: paddw %xmm1, %xmm1
; SSE2-NEXT: psraw $15, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: pandn %xmm0, %xmm2
; SSE2-NEXT: psraw $1, %xmm0
; SSE2-NEXT: pand %xmm1, %xmm0
; SSE2-NEXT: por %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: combine_vec8i16_ashr_clamped:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm2
; SSE41-NEXT: pminuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: psllw $12, %xmm0
; SSE41-NEXT: psllw $4, %xmm1
; SSE41-NEXT: por %xmm1, %xmm0
; SSE41-NEXT: movdqa %xmm0, %xmm1
; SSE41-NEXT: paddw %xmm0, %xmm1
; SSE41-NEXT: movdqa %xmm2, %xmm3
; SSE41-NEXT: psraw $8, %xmm3
; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2
; SSE41-NEXT: movdqa %xmm2, %xmm3
; SSE41-NEXT: psraw $4, %xmm3
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2
; SSE41-NEXT: movdqa %xmm2, %xmm3
; SSE41-NEXT: psraw $2, %xmm3
; SSE41-NEXT: paddw %xmm1, %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2
; SSE41-NEXT: movdqa %xmm2, %xmm3
; SSE41-NEXT: psraw $1, %xmm3
; SSE41-NEXT: paddw %xmm1, %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2
; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX2-LABEL: combine_vec8i16_ashr_clamped:
; AVX2: # %bb.0:
; AVX2-NEXT: vpminuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX2-NEXT: vpsravd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: combine_vec8i16_ashr_clamped:
; AVX512: # %bb.0:
; AVX512-NEXT: vpsravw %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
  %1 = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> %y, <8 x i16> <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>)
  %2 = ashr <8 x i16> %x, %1
  ret <8 x i16> %2
}

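; the umin clamp keeps shift amounts in range (<= 31); AVX targets can drop it since vpsravd already sign-fills for out-of-range counts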
define <4 x i32> @combine_vec4i32_ashr_clamped(<4 x i32> %x, <4 x i32> %y) {
; SSE2-LABEL: combine_vec4i32_ashr_clamped:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
; SSE2-NEXT: pxor %xmm1, %xmm2
; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE2-NEXT: movdqa %xmm2, %xmm3
; SSE2-NEXT: pandn %xmm1, %xmm3
; SSE2-NEXT: psrld $27, %xmm2
; SSE2-NEXT: por %xmm3, %xmm2
; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[2,3,3,3,4,5,6,7]
; SSE2-NEXT: movdqa %xmm0, %xmm3
; SSE2-NEXT: psrad %xmm1, %xmm3
; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[0,1,1,1,4,5,6,7]
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrad %xmm4, %xmm1
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm2[2,3,3,3,4,5,6,7]
; SSE2-NEXT: movdqa %xmm0, %xmm4
; SSE2-NEXT: psrad %xmm3, %xmm4
; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,1,1,4,5,6,7]
; SSE2-NEXT: psrad %xmm2, %xmm0
; SSE2-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm4[1]
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[0,3]
; SSE2-NEXT: movaps %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: combine_vec4i32_ashr_clamped:
; SSE41: # %bb.0:
; SSE41-NEXT: pminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE41-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7]
; SSE41-NEXT: movdqa %xmm0, %xmm3
; SSE41-NEXT: psrad %xmm2, %xmm3
; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
; SSE41-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[2,3,3,3,4,5,6,7]
; SSE41-NEXT: movdqa %xmm0, %xmm5
; SSE41-NEXT: psrad %xmm4, %xmm5
; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm3[0,1,2,3],xmm5[4,5,6,7]
; SSE41-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7]
; SSE41-NEXT: movdqa %xmm0, %xmm3
; SSE41-NEXT: psrad %xmm1, %xmm3
; SSE41-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,1,1,1,4,5,6,7]
; SSE41-NEXT: psrad %xmm1, %xmm0
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7]
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3],xmm0[4,5],xmm5[6,7]
; SSE41-NEXT: retq
;
; AVX-LABEL: combine_vec4i32_ashr_clamped:
; AVX: # %bb.0:
; AVX-NEXT: vpsravd %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
  %1 = tail call <4 x i32> @llvm.umin.v4i32(<4 x i32> %y, <4 x i32> <i32 31, i32 31, i32 31, i32 31>)
  %2 = ashr <4 x i32> %x, %1
  ret <4 x i32> %2
}

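; v4i64 version: only AVX512 has a native variable 64-bit arithmetic shift (vpsravq); earlier targets expand via logical shifts plus an xor/sub sign correction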
define <4 x i64> @combine_vec4i64_ashr_clamped(<4 x i64> %x, <4 x i64> %y) {
; SSE2-LABEL: combine_vec4i64_ashr_clamped:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002259456,9223372039002259456]
; SSE2-NEXT: movdqa %xmm3, %xmm4
; SSE2-NEXT: pxor %xmm5, %xmm4
; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [2147483711,2147483711,2147483711,2147483711]
; SSE2-NEXT: movdqa %xmm7, %xmm8
; SSE2-NEXT: pcmpgtd %xmm6, %xmm8
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
; SSE2-NEXT: pcmpeqd %xmm5, %xmm4
; SSE2-NEXT: pand %xmm8, %xmm4
; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [63,63]
; SSE2-NEXT: pand %xmm4, %xmm3
; SSE2-NEXT: pandn %xmm6, %xmm4
; SSE2-NEXT: por %xmm3, %xmm4
; SSE2-NEXT: movdqa %xmm2, %xmm3
; SSE2-NEXT: pxor %xmm5, %xmm3
; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm3[0,0,2,2]
; SSE2-NEXT: pcmpgtd %xmm8, %xmm7
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
; SSE2-NEXT: pcmpeqd %xmm5, %xmm3
; SSE2-NEXT: pand %xmm7, %xmm3
; SSE2-NEXT: pand %xmm3, %xmm2
; SSE2-NEXT: pandn %xmm6, %xmm3
; SSE2-NEXT: por %xmm2, %xmm3
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
; SSE2-NEXT: movdqa %xmm2, %xmm5
; SSE2-NEXT: psrlq %xmm3, %xmm5
; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm3[2,3,2,3]
; SSE2-NEXT: movdqa %xmm2, %xmm7
; SSE2-NEXT: psrlq %xmm6, %xmm7
; SSE2-NEXT: movsd {{.*#+}} xmm7 = xmm5[0],xmm7[1]
; SSE2-NEXT: movdqa %xmm0, %xmm5
; SSE2-NEXT: psrlq %xmm3, %xmm5
; SSE2-NEXT: psrlq %xmm6, %xmm0
; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm5[0],xmm0[1]
; SSE2-NEXT: xorpd %xmm7, %xmm0
; SSE2-NEXT: psubq %xmm7, %xmm0
; SSE2-NEXT: movdqa %xmm2, %xmm3
; SSE2-NEXT: psrlq %xmm4, %xmm3
; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[2,3,2,3]
; SSE2-NEXT: psrlq %xmm5, %xmm2
; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm3[0],xmm2[1]
; SSE2-NEXT: movdqa %xmm1, %xmm3
; SSE2-NEXT: psrlq %xmm4, %xmm3
; SSE2-NEXT: psrlq %xmm5, %xmm1
; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm3[0],xmm1[1]
; SSE2-NEXT: xorpd %xmm2, %xmm1
; SSE2-NEXT: psubq %xmm2, %xmm1
; SSE2-NEXT: retq
;
; SSE41-LABEL: combine_vec4i64_ashr_clamped:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm4
; SSE41-NEXT: movdqa {{.*#+}} xmm7 = [9223372039002259456,9223372039002259456]
; SSE41-NEXT: movdqa %xmm3, %xmm6
; SSE41-NEXT: pxor %xmm7, %xmm6
; SSE41-NEXT: movdqa {{.*#+}} xmm8 = [9223372039002259519,9223372039002259519]
; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm6[0,0,2,2]
; SSE41-NEXT: pcmpeqd %xmm8, %xmm6
; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [2147483711,2147483711,2147483711,2147483711]
; SSE41-NEXT: movdqa %xmm5, %xmm0
; SSE41-NEXT: pcmpgtd %xmm9, %xmm0
; SSE41-NEXT: pand %xmm6, %xmm0
; SSE41-NEXT: movapd {{.*#+}} xmm9 = [63,63]
; SSE41-NEXT: movapd %xmm9, %xmm6
; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm6
; SSE41-NEXT: pxor %xmm2, %xmm7
; SSE41-NEXT: pcmpeqd %xmm7, %xmm8
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2]
; SSE41-NEXT: pcmpgtd %xmm0, %xmm5
; SSE41-NEXT: pand %xmm8, %xmm5
; SSE41-NEXT: movdqa %xmm5, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm9
; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [9223372036854775808,9223372036854775808]
; SSE41-NEXT: movdqa %xmm0, %xmm2
; SSE41-NEXT: psrlq %xmm9, %xmm2
; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm9[2,3,2,3]
; SSE41-NEXT: movdqa %xmm0, %xmm5
; SSE41-NEXT: psrlq %xmm3, %xmm5
; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm2[0,1,2,3],xmm5[4,5,6,7]
; SSE41-NEXT: movdqa %xmm4, %xmm2
; SSE41-NEXT: psrlq %xmm9, %xmm2
; SSE41-NEXT: psrlq %xmm3, %xmm4
; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm2[0,1,2,3],xmm4[4,5,6,7]
; SSE41-NEXT: pxor %xmm5, %xmm4
; SSE41-NEXT: psubq %xmm5, %xmm4
; SSE41-NEXT: movdqa %xmm0, %xmm2
; SSE41-NEXT: psrlq %xmm6, %xmm2
; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm6[2,3,2,3]
; SSE41-NEXT: psrlq %xmm3, %xmm0
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7]
; SSE41-NEXT: movdqa %xmm1, %xmm2
; SSE41-NEXT: psrlq %xmm6, %xmm2
; SSE41-NEXT: psrlq %xmm3, %xmm1
; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
; SSE41-NEXT: pxor %xmm0, %xmm1
; SSE41-NEXT: psubq %xmm0, %xmm1
; SSE41-NEXT: movdqa %xmm4, %xmm0
; SSE41-NEXT: retq
;
; AVX2-LABEL: combine_vec4i64_ashr_clamped:
; AVX2: # %bb.0:
; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
; AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm3
; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [9223372036854775870,9223372036854775870,9223372036854775870,9223372036854775870]
; AVX2-NEXT: vpcmpgtq %ymm4, %ymm3, %ymm3
; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm4 = [63,63,63,63]
; AVX2-NEXT: vblendvpd %ymm3, %ymm4, %ymm1, %ymm1
; AVX2-NEXT: vpsrlvq %ymm1, %ymm2, %ymm2
; AVX2-NEXT: vpsrlvq %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpsubq %ymm2, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: combine_vec4i64_ashr_clamped:
; AVX512: # %bb.0:
; AVX512-NEXT: vpsravq %ymm1, %ymm0, %ymm0
; AVX512-NEXT: retq
  %1 = tail call <4 x i64> @llvm.umin.v4i64(<4 x i64> %y, <4 x i64> <i64 63, i64 63, i64 63, i64 63>)
  %2 = ashr <4 x i64> %x, %1
  ret <4 x i64> %2
}
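
; Declarations for the umin intrinsics used by the clamped tests above
; (recent IR parsers may accept the calls without them).
declare <8 x i16> @llvm.umin.v8i16(<8 x i16>, <8 x i16>)
declare <4 x i32> @llvm.umin.v4i32(<4 x i32>, <4 x i32>)
declare <4 x i64> @llvm.umin.v4i64(<4 x i64>, <4 x i64>)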