; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,X86,X86-SLOW
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=CHECK,X86,X86-FAST-ALL
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=CHECK,X86,X86-FAST-PERLANE
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,X64,X64-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=CHECK,X64,X64-FAST-ALL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=CHECK,X64,X64-FAST-PERLANE

; AVX2 Logical Shift Left

define <16 x i16> @test_sllw_1(<16 x i16> %InVec) {
; CHECK-LABEL: test_sllw_1:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %shl = shl <16 x i16> %InVec, <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>
  ret <16 x i16> %shl
}

define <16 x i16> @test_sllw_2(<16 x i16> %InVec) {
; CHECK-LABEL: test_sllw_2:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vpaddw %ymm0, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %shl = shl <16 x i16> %InVec, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  ret <16 x i16> %shl
}

define <16 x i16> @test_sllw_3(<16 x i16> %InVec) {
; CHECK-LABEL: test_sllw_3:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vpsllw $15, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %shl = shl <16 x i16> %InVec, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
  ret <16 x i16> %shl
}

define <8 x i32> @test_slld_1(<8 x i32> %InVec) {
; CHECK-LABEL: test_slld_1:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %shl = shl <8 x i32> %InVec, <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
  ret <8 x i32> %shl
}

define <8 x i32> @test_slld_2(<8 x i32> %InVec) {
; CHECK-LABEL: test_slld_2:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vpaddd %ymm0, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %shl = shl <8 x i32> %InVec, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  ret <8 x i32> %shl
}

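; A non-constant shift amount inserted into element 0 is treated as a uniform
; shift: it selects the shift-by-xmm-count form (vmovd + vpslld) rather than a
; per-element vpsllvd.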
define <8 x i32> @test_vpslld_var(i32 %shift) {
; X86-LABEL: test_vpslld_var:
; X86:       # %bb.0:
; X86-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-NEXT:    vpmovzxbd {{.*#+}} ymm1 = [192,193,194,195,196,197,198,199]
; X86-NEXT:    vpslld %xmm0, %ymm1, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_vpslld_var:
; X64:       # %bb.0:
; X64-NEXT:    vmovd %edi, %xmm0
; X64-NEXT:    vpmovzxbd {{.*#+}} ymm1 = [192,193,194,195,196,197,198,199]
; X64-NEXT:    vpslld %xmm0, %ymm1, %ymm0
; X64-NEXT:    retq
  %amt = insertelement <8 x i32> undef, i32 %shift, i32 0
  %tmp = shl <8 x i32> <i32 192, i32 193, i32 194, i32 195, i32 196, i32 197, i32 198, i32 199>, %amt
  ret <8 x i32> %tmp
}

define <8 x i32> @test_slld_3(<8 x i32> %InVec) {
; CHECK-LABEL: test_slld_3:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vpslld $31, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %shl = shl <8 x i32> %InVec, <i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
  ret <8 x i32> %shl
}

define <4 x i64> @test_sllq_1(<4 x i64> %InVec) {
; CHECK-LABEL: test_sllq_1:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %shl = shl <4 x i64> %InVec, <i64 0, i64 0, i64 0, i64 0>
  ret <4 x i64> %shl
}

define <4 x i64> @test_sllq_2(<4 x i64> %InVec) {
; CHECK-LABEL: test_sllq_2:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vpaddq %ymm0, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %shl = shl <4 x i64> %InVec, <i64 1, i64 1, i64 1, i64 1>
  ret <4 x i64> %shl
}

define <4 x i64> @test_sllq_3(<4 x i64> %InVec) {
; CHECK-LABEL: test_sllq_3:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vpsllq $63, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %shl = shl <4 x i64> %InVec, <i64 63, i64 63, i64 63, i64 63>
  ret <4 x i64> %shl
}

; AVX2 Arithmetic Shift

define <16 x i16> @test_sraw_1(<16 x i16> %InVec) {
; CHECK-LABEL: test_sraw_1:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %shl = ashr <16 x i16> %InVec, <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>
  ret <16 x i16> %shl
}

define <16 x i16> @test_sraw_2(<16 x i16> %InVec) {
; CHECK-LABEL: test_sraw_2:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vpsraw $1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %shl = ashr <16 x i16> %InVec, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  ret <16 x i16> %shl
}

define <16 x i16> @test_sraw_3(<16 x i16> %InVec) {
; CHECK-LABEL: test_sraw_3:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vpsraw $15, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %shl = ashr <16 x i16> %InVec, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
  ret <16 x i16> %shl
}

define <8 x i32> @test_srad_1(<8 x i32> %InVec) {
; CHECK-LABEL: test_srad_1:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %shl = ashr <8 x i32> %InVec, <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
  ret <8 x i32> %shl
}

define <8 x i32> @test_srad_2(<8 x i32> %InVec) {
; CHECK-LABEL: test_srad_2:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vpsrad $1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %shl = ashr <8 x i32> %InVec, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  ret <8 x i32> %shl
}

define <8 x i32> @test_srad_3(<8 x i32> %InVec) {
; CHECK-LABEL: test_srad_3:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vpsrad $31, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %shl = ashr <8 x i32> %InVec, <i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
  ret <8 x i32> %shl
}

; AVX2 Logical Shift Right

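; Logical right shifts by splat constants: a zero shift folds away; all other
; amounts use the immediate vpsrlw/vpsrld/vpsrlq forms.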
define <16 x i16> @test_srlw_1(<16 x i16> %InVec) {
; CHECK-LABEL: test_srlw_1:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %shl = lshr <16 x i16> %InVec, <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>
  ret <16 x i16> %shl
}

define <16 x i16> @test_srlw_2(<16 x i16> %InVec) {
; CHECK-LABEL: test_srlw_2:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vpsrlw $1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %shl = lshr <16 x i16> %InVec, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  ret <16 x i16> %shl
}

define <16 x i16> @test_srlw_3(<16 x i16> %InVec) {
; CHECK-LABEL: test_srlw_3:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vpsrlw $15, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %shl = lshr <16 x i16> %InVec, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
  ret <16 x i16> %shl
}

define <8 x i32> @test_srld_1(<8 x i32> %InVec) {
; CHECK-LABEL: test_srld_1:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %shl = lshr <8 x i32> %InVec, <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
  ret <8 x i32> %shl
}

define <8 x i32> @test_srld_2(<8 x i32> %InVec) {
; CHECK-LABEL: test_srld_2:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vpsrld $1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %shl = lshr <8 x i32> %InVec, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  ret <8 x i32> %shl
}

define <8 x i32> @test_srld_3(<8 x i32> %InVec) {
; CHECK-LABEL: test_srld_3:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vpsrld $31, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %shl = lshr <8 x i32> %InVec, <i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
  ret <8 x i32> %shl
}

define <4 x i64> @test_srlq_1(<4 x i64> %InVec) {
; CHECK-LABEL: test_srlq_1:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %shl = lshr <4 x i64> %InVec, <i64 0, i64 0, i64 0, i64 0>
  ret <4 x i64> %shl
}

define <4 x i64> @test_srlq_2(<4 x i64> %InVec) {
; CHECK-LABEL: test_srlq_2:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vpsrlq $1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %shl = lshr <4 x i64> %InVec, <i64 1, i64 1, i64 1, i64 1>
  ret <4 x i64> %shl
}

define <4 x i64> @test_srlq_3(<4 x i64> %InVec) {
; CHECK-LABEL: test_srlq_3:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vpsrlq $63, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %shl = lshr <4 x i64> %InVec, <i64 63, i64 63, i64 63, i64 63>
  ret <4 x i64> %shl
}

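; The mask-and-truncate of the 64-bit shift amounts should be narrowed to a
; 32-bit operation feeding vpsrlvd; targets with fast cross-lane shuffles
; truncate with vpermd, the others with vextractf128 + vshufps.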
define <4 x i32> @srl_trunc_and_v4i64(<4 x i32> %x, <4 x i64> %y) nounwind {
; X86-SLOW-LABEL: srl_trunc_and_v4i64:
; X86-SLOW:       # %bb.0:
; X86-SLOW-NEXT:    vextractf128 $1, %ymm1, %xmm2
; X86-SLOW-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
; X86-SLOW-NEXT:    vbroadcastss {{.*#+}} xmm2 = [8,8,8,8]
; X86-SLOW-NEXT:    vandps %xmm2, %xmm1, %xmm1
; X86-SLOW-NEXT:    vpsrlvd %xmm1, %xmm0, %xmm0
; X86-SLOW-NEXT:    vzeroupper
; X86-SLOW-NEXT:    retl
;
; X86-FAST-ALL-LABEL: srl_trunc_and_v4i64:
; X86-FAST-ALL:       # %bb.0:
; X86-FAST-ALL-NEXT:    vpmovsxbd {{.*#+}} ymm2 = [0,2,4,6,0,0,0,0]
; X86-FAST-ALL-NEXT:    vpermd %ymm1, %ymm2, %ymm1
; X86-FAST-ALL-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [8,8,8,8]
; X86-FAST-ALL-NEXT:    vpand %xmm2, %xmm1, %xmm1
; X86-FAST-ALL-NEXT:    vpsrlvd %xmm1, %xmm0, %xmm0
; X86-FAST-ALL-NEXT:    vzeroupper
; X86-FAST-ALL-NEXT:    retl
;
; X86-FAST-PERLANE-LABEL: srl_trunc_and_v4i64:
; X86-FAST-PERLANE:       # %bb.0:
; X86-FAST-PERLANE-NEXT:    vextractf128 $1, %ymm1, %xmm2
; X86-FAST-PERLANE-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
; X86-FAST-PERLANE-NEXT:    vbroadcastss {{.*#+}} xmm2 = [8,8,8,8]
; X86-FAST-PERLANE-NEXT:    vandps %xmm2, %xmm1, %xmm1
; X86-FAST-PERLANE-NEXT:    vpsrlvd %xmm1, %xmm0, %xmm0
; X86-FAST-PERLANE-NEXT:    vzeroupper
; X86-FAST-PERLANE-NEXT:    retl
;
; X64-SLOW-LABEL: srl_trunc_and_v4i64:
; X64-SLOW:       # %bb.0:
; X64-SLOW-NEXT:    vextractf128 $1, %ymm1, %xmm2
; X64-SLOW-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
; X64-SLOW-NEXT:    vbroadcastss {{.*#+}} xmm2 = [8,8,8,8]
; X64-SLOW-NEXT:    vandps %xmm2, %xmm1, %xmm1
; X64-SLOW-NEXT:    vpsrlvd %xmm1, %xmm0, %xmm0
; X64-SLOW-NEXT:    vzeroupper
; X64-SLOW-NEXT:    retq
;
; X64-FAST-ALL-LABEL: srl_trunc_and_v4i64:
; X64-FAST-ALL:       # %bb.0:
; X64-FAST-ALL-NEXT:    vpmovsxbd {{.*#+}} ymm2 = [0,2,4,6,0,0,0,0]
; X64-FAST-ALL-NEXT:    vpermd %ymm1, %ymm2, %ymm1
; X64-FAST-ALL-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [8,8,8,8]
; X64-FAST-ALL-NEXT:    vpand %xmm2, %xmm1, %xmm1
; X64-FAST-ALL-NEXT:    vpsrlvd %xmm1, %xmm0, %xmm0
; X64-FAST-ALL-NEXT:    vzeroupper
; X64-FAST-ALL-NEXT:    retq
;
; X64-FAST-PERLANE-LABEL: srl_trunc_and_v4i64:
; X64-FAST-PERLANE:       # %bb.0:
; X64-FAST-PERLANE-NEXT:    vextractf128 $1, %ymm1, %xmm2
; X64-FAST-PERLANE-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
; X64-FAST-PERLANE-NEXT:    vbroadcastss {{.*#+}} xmm2 = [8,8,8,8]
; X64-FAST-PERLANE-NEXT:    vandps %xmm2, %xmm1, %xmm1
; X64-FAST-PERLANE-NEXT:    vpsrlvd %xmm1, %xmm0, %xmm0
; X64-FAST-PERLANE-NEXT:    vzeroupper
; X64-FAST-PERLANE-NEXT:    retq
  %and = and <4 x i64> %y, <i64 8, i64 8, i64 8, i64 8>
  %trunc = trunc <4 x i64> %and to <4 x i32>
  %sra = lshr <4 x i32> %x, %trunc
  ret <4 x i32> %sra
}

;
; Vectorized byte shifts
;

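; AVX2 has no variable per-element shifts narrower than 32 bits: the i16 cases
; below are widened to 32-bit lanes, shifted with vpsllvd/vpsrlvd/vpsravd and
; narrowed back, while the i8 cases are lowered to a ladder of immediate shifts
; selected by vpblendvb.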
define <8 x i16> @shl_8i16(<8 x i16> %r, <8 x i16> %a) nounwind {
; CHECK-LABEL: shl_8i16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; CHECK-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; CHECK-NEXT:    vpsllvd %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
; CHECK-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    ret{{[l|q]}}
  %shl = shl <8 x i16> %r, %a
  ret <8 x i16> %shl
}

define <16 x i16> @shl_16i16(<16 x i16> %r, <16 x i16> %a) nounwind {
; CHECK-LABEL: shl_16i16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
; CHECK-NEXT:    vpunpckhwd {{.*#+}} ymm4 = ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15]
; CHECK-NEXT:    vpsllvd %ymm3, %ymm4, %ymm3
; CHECK-NEXT:    vpsrld $16, %ymm3, %ymm3
; CHECK-NEXT:    vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11]
; CHECK-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11]
; CHECK-NEXT:    vpsllvd %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    vpsrld $16, %ymm0, %ymm0
; CHECK-NEXT:    vpackusdw %ymm3, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %shl = shl <16 x i16> %r, %a
  ret <16 x i16> %shl
}

define <32 x i8> @shl_32i8(<32 x i8> %r, <32 x i8> %a) nounwind {
; X86-LABEL: shl_32i8:
; X86:       # %bb.0:
; X86-NEXT:    vpsllw $5, %ymm1, %ymm1
; X86-NEXT:    vpsllw $4, %ymm0, %ymm2
; X86-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}, %ymm2, %ymm2
; X86-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; X86-NEXT:    vpsllw $2, %ymm0, %ymm2
; X86-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}, %ymm2, %ymm2
; X86-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
; X86-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; X86-NEXT:    vpaddb %ymm0, %ymm0, %ymm2
; X86-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
; X86-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: shl_32i8:
; X64:       # %bb.0:
; X64-NEXT:    vpsllw $5, %ymm1, %ymm1
; X64-NEXT:    vpsllw $4, %ymm0, %ymm2
; X64-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
; X64-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; X64-NEXT:    vpsllw $2, %ymm0, %ymm2
; X64-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
; X64-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
; X64-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; X64-NEXT:    vpaddb %ymm0, %ymm0, %ymm2
; X64-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
; X64-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; X64-NEXT:    retq
  %shl = shl <32 x i8> %r, %a
  ret <32 x i8> %shl
}

define <8 x i16> @ashr_8i16(<8 x i16> %r, <8 x i16> %a) nounwind {
; CHECK-LABEL: ashr_8i16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; CHECK-NEXT:    vpmovsxwd %xmm0, %ymm0
; CHECK-NEXT:    vpsravd %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm1
; CHECK-NEXT:    vpackssdw %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    ret{{[l|q]}}
  %ashr = ashr <8 x i16> %r, %a
  ret <8 x i16> %ashr
}

define <16 x i16> @ashr_16i16(<16 x i16> %r, <16 x i16> %a) nounwind {
; CHECK-LABEL: ashr_16i16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
; CHECK-NEXT:    vpunpckhwd {{.*#+}} ymm4 = ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15]
; CHECK-NEXT:    vpsravd %ymm3, %ymm4, %ymm3
; CHECK-NEXT:    vpsrld $16, %ymm3, %ymm3
; CHECK-NEXT:    vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11]
; CHECK-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11]
; CHECK-NEXT:    vpsravd %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    vpsrld $16, %ymm0, %ymm0
; CHECK-NEXT:    vpackusdw %ymm3, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %ashr = ashr <16 x i16> %r, %a
  ret <16 x i16> %ashr
}

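; The arithmetic byte shift splits the input into high and low byte halves
; (vpunpckhbw/vpunpcklbw), emulates each half with vpsraw steps selected by
; vpblendvb, and repacks the result with vpackuswb.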
define <32 x i8> @ashr_32i8(<32 x i8> %r, <32 x i8> %a) nounwind {
; CHECK-LABEL: ashr_32i8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsllw $5, %ymm1, %ymm1
; CHECK-NEXT:    vpunpckhbw {{.*#+}} ymm2 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; CHECK-NEXT:    vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; CHECK-NEXT:    vpsraw $4, %ymm3, %ymm4
; CHECK-NEXT:    vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
; CHECK-NEXT:    vpsraw $2, %ymm3, %ymm4
; CHECK-NEXT:    vpaddw %ymm2, %ymm2, %ymm2
; CHECK-NEXT:    vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
; CHECK-NEXT:    vpsraw $1, %ymm3, %ymm4
; CHECK-NEXT:    vpaddw %ymm2, %ymm2, %ymm2
; CHECK-NEXT:    vpblendvb %ymm2, %ymm4, %ymm3, %ymm2
; CHECK-NEXT:    vpsrlw $8, %ymm2, %ymm2
; CHECK-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; CHECK-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; CHECK-NEXT:    vpsraw $4, %ymm0, %ymm3
; CHECK-NEXT:    vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
; CHECK-NEXT:    vpsraw $2, %ymm0, %ymm3
; CHECK-NEXT:    vpaddw %ymm1, %ymm1, %ymm1
; CHECK-NEXT:    vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
; CHECK-NEXT:    vpsraw $1, %ymm0, %ymm3
; CHECK-NEXT:    vpaddw %ymm1, %ymm1, %ymm1
; CHECK-NEXT:    vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
; CHECK-NEXT:    vpsrlw $8, %ymm0, %ymm0
; CHECK-NEXT:    vpackuswb %ymm2, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %ashr = ashr <32 x i8> %r, %a
  ret <32 x i8> %ashr
}

define <8 x i16> @lshr_8i16(<8 x i16> %r, <8 x i16> %a) nounwind {
; CHECK-LABEL: lshr_8i16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; CHECK-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; CHECK-NEXT:    vpsrlvd %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm1
; CHECK-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    ret{{[l|q]}}
  %lshr = lshr <8 x i16> %r, %a
  ret <8 x i16> %lshr
}

define <16 x i16> @lshr_16i16(<16 x i16> %r, <16 x i16> %a) nounwind {
; CHECK-LABEL: lshr_16i16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
; CHECK-NEXT:    vpunpckhwd {{.*#+}} ymm4 = ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15]
; CHECK-NEXT:    vpsrlvd %ymm3, %ymm4, %ymm3
; CHECK-NEXT:    vpsrld $16, %ymm3, %ymm3
; CHECK-NEXT:    vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11]
; CHECK-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11]
; CHECK-NEXT:    vpsrlvd %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    vpsrld $16, %ymm0, %ymm0
; CHECK-NEXT:    vpackusdw %ymm3, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %lshr = lshr <16 x i16> %r, %a
  ret <16 x i16> %lshr
}

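; Same vpblendvb ladder as shl_32i8, but built from vpsrlw $4/$2/$1 plus
; masking; the constant-pool mask operands are rip-relative on x86-64 and
; absolute on i686.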
define <32 x i8> @lshr_32i8(<32 x i8> %r, <32 x i8> %a) nounwind {
; X86-LABEL: lshr_32i8:
; X86:       # %bb.0:
; X86-NEXT:    vpsllw $5, %ymm1, %ymm1
; X86-NEXT:    vpsrlw $4, %ymm0, %ymm2
; X86-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}, %ymm2, %ymm2
; X86-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; X86-NEXT:    vpsrlw $2, %ymm0, %ymm2
; X86-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}, %ymm2, %ymm2
; X86-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
; X86-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; X86-NEXT:    vpsrlw $1, %ymm0, %ymm2
; X86-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}, %ymm2, %ymm2
; X86-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
; X86-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: lshr_32i8:
; X64:       # %bb.0:
; X64-NEXT:    vpsllw $5, %ymm1, %ymm1
; X64-NEXT:    vpsrlw $4, %ymm0, %ymm2
; X64-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
; X64-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; X64-NEXT:    vpsrlw $2, %ymm0, %ymm2
; X64-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
; X64-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
; X64-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; X64-NEXT:    vpsrlw $1, %ymm0, %ymm2
; X64-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
; X64-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
; X64-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; X64-NEXT:    retq
  %lshr = lshr <32 x i8> %r, %a
  ret <32 x i8> %lshr
}