1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2 3; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41 4; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1 5; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2 6; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512,AVX512NOVLX,AVX512F 7; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VLX,AVX512VL 8; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512NOVLX,AVX512BW 9; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VLX,AVX512VLBW 10; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vbmi,+avx512vbmi2 | FileCheck %s --check-prefixes=AVX512,AVX512NOVLX,AVX512VBMI2 11; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vbmi,+avx512vbmi2,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VLX,AVX512VLVBMI2 12; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefixes=XOP,XOPAVX1 13; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefixes=XOP,XOPAVX2 14 15; Just one 32-bit run to make sure we do reasonable things for i64 rotates. 16; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=X86-SSE2 17 18; 19; Variable Rotates 20; 21 22define <2 x i64> @var_rotate_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind { 23; SSE2-LABEL: var_rotate_v2i64: 24; SSE2: # %bb.0: 25; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [64,64] 26; SSE2-NEXT: psubq %xmm1, %xmm2 27; SSE2-NEXT: movdqa %xmm0, %xmm3 28; SSE2-NEXT: psllq %xmm1, %xmm3 29; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] 30; SSE2-NEXT: movdqa %xmm0, %xmm4 31; SSE2-NEXT: psllq %xmm1, %xmm4 32; SSE2-NEXT: movsd {{.*#+}} xmm4 = xmm3[0],xmm4[1] 33; SSE2-NEXT: movdqa %xmm0, %xmm1 34; SSE2-NEXT: psrlq %xmm2, %xmm1 35; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] 36; SSE2-NEXT: psrlq %xmm2, %xmm0 37; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] 38; SSE2-NEXT: orpd %xmm4, %xmm0 39; SSE2-NEXT: retq 40; 41; SSE41-LABEL: var_rotate_v2i64: 42; SSE41: # %bb.0: 43; SSE41-NEXT: pmovsxbq {{.*#+}} xmm2 = [64,64] 44; SSE41-NEXT: psubq %xmm1, %xmm2 45; SSE41-NEXT: movdqa %xmm0, %xmm3 46; SSE41-NEXT: psllq %xmm1, %xmm3 47; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] 48; SSE41-NEXT: movdqa %xmm0, %xmm4 49; SSE41-NEXT: psllq %xmm1, %xmm4 50; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm3[0,1,2,3],xmm4[4,5,6,7] 51; SSE41-NEXT: movdqa %xmm0, %xmm1 52; SSE41-NEXT: psrlq %xmm2, %xmm1 53; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] 54; SSE41-NEXT: psrlq %xmm2, %xmm0 55; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] 56; SSE41-NEXT: por %xmm4, %xmm0 57; SSE41-NEXT: retq 58; 59; AVX1-LABEL: var_rotate_v2i64: 60; AVX1: # %bb.0: 61; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm2 = [64,64] 62; AVX1-NEXT: vpsubq %xmm1, %xmm2, %xmm2 63; AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm3 64; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] 65; AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm1 66; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4,5,6,7] 67; 
AVX1-NEXT: vpsrlq %xmm2, %xmm0, %xmm3 68; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] 69; AVX1-NEXT: vpsrlq %xmm2, %xmm0, %xmm0 70; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7] 71; AVX1-NEXT: vpor %xmm0, %xmm1, %xmm0 72; AVX1-NEXT: retq 73; 74; AVX2-LABEL: var_rotate_v2i64: 75; AVX2: # %bb.0: 76; AVX2-NEXT: vpmovsxbq {{.*#+}} xmm2 = [64,64] 77; AVX2-NEXT: vpsubq %xmm1, %xmm2, %xmm2 78; AVX2-NEXT: vpsllvq %xmm1, %xmm0, %xmm1 79; AVX2-NEXT: vpsrlvq %xmm2, %xmm0, %xmm0 80; AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0 81; AVX2-NEXT: retq 82; 83; AVX512NOVLX-LABEL: var_rotate_v2i64: 84; AVX512NOVLX: # %bb.0: 85; AVX512NOVLX-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 86; AVX512NOVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 87; AVX512NOVLX-NEXT: vprolvq %zmm1, %zmm0, %zmm0 88; AVX512NOVLX-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 89; AVX512NOVLX-NEXT: vzeroupper 90; AVX512NOVLX-NEXT: retq 91; 92; AVX512VLX-LABEL: var_rotate_v2i64: 93; AVX512VLX: # %bb.0: 94; AVX512VLX-NEXT: vprolvq %xmm1, %xmm0, %xmm0 95; AVX512VLX-NEXT: retq 96; 97; XOP-LABEL: var_rotate_v2i64: 98; XOP: # %bb.0: 99; XOP-NEXT: vprotq %xmm1, %xmm0, %xmm0 100; XOP-NEXT: retq 101; 102; X86-SSE2-LABEL: var_rotate_v2i64: 103; X86-SSE2: # %bb.0: 104; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [64,0,64,0] 105; X86-SSE2-NEXT: psubq %xmm1, %xmm2 106; X86-SSE2-NEXT: movdqa %xmm0, %xmm3 107; X86-SSE2-NEXT: psllq %xmm1, %xmm3 108; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] 109; X86-SSE2-NEXT: movdqa %xmm0, %xmm4 110; X86-SSE2-NEXT: psllq %xmm1, %xmm4 111; X86-SSE2-NEXT: movsd {{.*#+}} xmm4 = xmm3[0],xmm4[1] 112; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 113; X86-SSE2-NEXT: psrlq %xmm2, %xmm1 114; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] 115; X86-SSE2-NEXT: psrlq %xmm2, %xmm0 116; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] 117; X86-SSE2-NEXT: orpd %xmm4, %xmm0 118; X86-SSE2-NEXT: retl 119 %b64 = sub <2 x i64> <i64 64, i64 64>, %b 120 %shl = shl <2 x i64> %a, %b 121 %lshr = lshr <2 x i64> %a, %b64 122 %or = or <2 x i64> %shl, %lshr 123 ret <2 x i64> %or 124} 125 126define <4 x i32> @var_rotate_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind { 127; SSE2-LABEL: var_rotate_v4i32: 128; SSE2: # %bb.0: 129; SSE2-NEXT: pslld $23, %xmm1 130; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 131; SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 132; SSE2-NEXT: cvttps2dq %xmm1, %xmm1 133; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] 134; SSE2-NEXT: pmuludq %xmm1, %xmm0 135; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3] 136; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] 137; SSE2-NEXT: pmuludq %xmm2, %xmm1 138; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3] 139; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] 140; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 141; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 142; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 143; SSE2-NEXT: por %xmm3, %xmm0 144; SSE2-NEXT: retq 145; 146; SSE41-LABEL: var_rotate_v4i32: 147; SSE41: # %bb.0: 148; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] 149; SSE41-NEXT: pslld $23, %xmm1 150; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 151; SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 152; SSE41-NEXT: cvttps2dq %xmm1, %xmm1 153; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3] 154; SSE41-NEXT: pmuludq %xmm2, %xmm3 155; SSE41-NEXT: pmuludq %xmm1, %xmm0 156; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] 157; SSE41-NEXT: pblendw {{.*#+}} xmm1 = 
xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] 158; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2] 159; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] 160; SSE41-NEXT: por %xmm1, %xmm0 161; SSE41-NEXT: retq 162; 163; AVX1-LABEL: var_rotate_v4i32: 164; AVX1: # %bb.0: 165; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] 166; AVX1-NEXT: vpslld $23, %xmm1, %xmm1 167; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 168; AVX1-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 169; AVX1-NEXT: vcvttps2dq %xmm1, %xmm1 170; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[1,1,3,3] 171; AVX1-NEXT: vpmuludq %xmm3, %xmm2, %xmm2 172; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 173; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] 174; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] 175; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,2,2] 176; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] 177; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 178; AVX1-NEXT: retq 179; 180; AVX2-LABEL: var_rotate_v4i32: 181; AVX2: # %bb.0: 182; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [31,31,31,31] 183; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1 184; AVX2-NEXT: vpsllvd %xmm1, %xmm0, %xmm2 185; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [32,32,32,32] 186; AVX2-NEXT: vpsubd %xmm1, %xmm3, %xmm1 187; AVX2-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0 188; AVX2-NEXT: vpor %xmm0, %xmm2, %xmm0 189; AVX2-NEXT: retq 190; 191; AVX512NOVLX-LABEL: var_rotate_v4i32: 192; AVX512NOVLX: # %bb.0: 193; AVX512NOVLX-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 194; AVX512NOVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 195; AVX512NOVLX-NEXT: vprolvd %zmm1, %zmm0, %zmm0 196; AVX512NOVLX-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 197; AVX512NOVLX-NEXT: vzeroupper 198; AVX512NOVLX-NEXT: retq 199; 200; AVX512VLX-LABEL: var_rotate_v4i32: 201; AVX512VLX: # %bb.0: 202; AVX512VLX-NEXT: vprolvd %xmm1, %xmm0, %xmm0 203; AVX512VLX-NEXT: retq 204; 205; XOP-LABEL: var_rotate_v4i32: 206; XOP: # %bb.0: 207; XOP-NEXT: vprotd %xmm1, %xmm0, %xmm0 208; XOP-NEXT: retq 209; 210; X86-SSE2-LABEL: var_rotate_v4i32: 211; X86-SSE2: # %bb.0: 212; X86-SSE2-NEXT: pslld $23, %xmm1 213; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 214; X86-SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 215; X86-SSE2-NEXT: cvttps2dq %xmm1, %xmm1 216; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] 217; X86-SSE2-NEXT: pmuludq %xmm1, %xmm0 218; X86-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3] 219; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] 220; X86-SSE2-NEXT: pmuludq %xmm2, %xmm1 221; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3] 222; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] 223; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 224; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 225; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 226; X86-SSE2-NEXT: por %xmm3, %xmm0 227; X86-SSE2-NEXT: retl 228 %b32 = sub <4 x i32> <i32 32, i32 32, i32 32, i32 32>, %b 229 %shl = shl <4 x i32> %a, %b 230 %lshr = lshr <4 x i32> %a, %b32 231 %or = or <4 x i32> %shl, %lshr 232 ret <4 x i32> %or 233} 234 235define <8 x i16> @var_rotate_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind { 236; SSE2-LABEL: var_rotate_v8i16: 237; SSE2: # %bb.0: 238; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 239; SSE2-NEXT: movdqa %xmm1, %xmm2 240; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7] 241; SSE2-NEXT: pslld $23, %xmm2 242; SSE2-NEXT: movdqa {{.*#+}} 
xmm3 = [1065353216,1065353216,1065353216,1065353216] 243; SSE2-NEXT: paddd %xmm3, %xmm2 244; SSE2-NEXT: cvttps2dq %xmm2, %xmm2 245; SSE2-NEXT: pslld $16, %xmm2 246; SSE2-NEXT: psrad $16, %xmm2 247; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3] 248; SSE2-NEXT: pslld $23, %xmm1 249; SSE2-NEXT: paddd %xmm3, %xmm1 250; SSE2-NEXT: cvttps2dq %xmm1, %xmm1 251; SSE2-NEXT: pslld $16, %xmm1 252; SSE2-NEXT: psrad $16, %xmm1 253; SSE2-NEXT: packssdw %xmm2, %xmm1 254; SSE2-NEXT: movdqa %xmm0, %xmm2 255; SSE2-NEXT: pmulhuw %xmm1, %xmm2 256; SSE2-NEXT: pmullw %xmm1, %xmm0 257; SSE2-NEXT: por %xmm2, %xmm0 258; SSE2-NEXT: retq 259; 260; SSE41-LABEL: var_rotate_v8i16: 261; SSE41: # %bb.0: 262; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 263; SSE41-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero 264; SSE41-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] 265; SSE41-NEXT: pslld $23, %xmm1 266; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216] 267; SSE41-NEXT: paddd %xmm3, %xmm1 268; SSE41-NEXT: cvttps2dq %xmm1, %xmm1 269; SSE41-NEXT: pslld $23, %xmm2 270; SSE41-NEXT: paddd %xmm3, %xmm2 271; SSE41-NEXT: cvttps2dq %xmm2, %xmm2 272; SSE41-NEXT: packusdw %xmm1, %xmm2 273; SSE41-NEXT: movdqa %xmm0, %xmm1 274; SSE41-NEXT: pmulhuw %xmm2, %xmm1 275; SSE41-NEXT: pmullw %xmm2, %xmm0 276; SSE41-NEXT: por %xmm1, %xmm0 277; SSE41-NEXT: retq 278; 279; AVX1-LABEL: var_rotate_v8i16: 280; AVX1: # %bb.0: 281; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 282; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm1[4,4,5,5,6,6,7,7] 283; AVX1-NEXT: vpslld $23, %xmm2, %xmm2 284; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216] 285; AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2 286; AVX1-NEXT: vcvttps2dq %xmm2, %xmm2 287; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero 288; AVX1-NEXT: vpslld $23, %xmm1, %xmm1 289; AVX1-NEXT: vpaddd %xmm3, %xmm1, %xmm1 290; AVX1-NEXT: vcvttps2dq %xmm1, %xmm1 291; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 292; AVX1-NEXT: vpmulhuw %xmm1, %xmm0, %xmm2 293; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0 294; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0 295; AVX1-NEXT: retq 296; 297; AVX2-LABEL: var_rotate_v8i16: 298; AVX2: # %bb.0: 299; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 300; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 301; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] 302; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm0[4,4,5,5,6,6,7,7] 303; AVX2-NEXT: vpsllvd %xmm2, %xmm3, %xmm2 304; AVX2-NEXT: vpsrld $16, %xmm2, %xmm2 305; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] 306; AVX2-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero 307; AVX2-NEXT: vpsllvd %xmm1, %xmm0, %xmm0 308; AVX2-NEXT: vpsrld $16, %xmm0, %xmm0 309; AVX2-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 310; AVX2-NEXT: retq 311; 312; AVX512F-LABEL: var_rotate_v8i16: 313; AVX512F: # %bb.0: 314; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 315; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 316; AVX512F-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] 317; AVX512F-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm0[4,4,5,5,6,6,7,7] 318; AVX512F-NEXT: vpsllvd %xmm2, %xmm3, %xmm2 319; AVX512F-NEXT: vpsrld $16, %xmm2, %xmm2 320; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] 321; AVX512F-NEXT: vpmovzxwd {{.*#+}} xmm1 = 
xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero 322; AVX512F-NEXT: vpsllvd %xmm1, %xmm0, %xmm0 323; AVX512F-NEXT: vpsrld $16, %xmm0, %xmm0 324; AVX512F-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 325; AVX512F-NEXT: retq 326; 327; AVX512VL-LABEL: var_rotate_v8i16: 328; AVX512VL: # %bb.0: 329; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1 330; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2 331; AVX512VL-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] 332; AVX512VL-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm0[4,4,5,5,6,6,7,7] 333; AVX512VL-NEXT: vpsllvd %xmm2, %xmm3, %xmm2 334; AVX512VL-NEXT: vpsrld $16, %xmm2, %xmm2 335; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] 336; AVX512VL-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero 337; AVX512VL-NEXT: vpsllvd %xmm1, %xmm0, %xmm0 338; AVX512VL-NEXT: vpsrld $16, %xmm0, %xmm0 339; AVX512VL-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 340; AVX512VL-NEXT: retq 341; 342; AVX512BW-LABEL: var_rotate_v8i16: 343; AVX512BW: # %bb.0: 344; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 345; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 346; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm2 347; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16] 348; AVX512BW-NEXT: vpsubw %xmm1, %xmm3, %xmm1 349; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0 350; AVX512BW-NEXT: vpor %xmm0, %xmm2, %xmm0 351; AVX512BW-NEXT: vzeroupper 352; AVX512BW-NEXT: retq 353; 354; AVX512VLBW-LABEL: var_rotate_v8i16: 355; AVX512VLBW: # %bb.0: 356; AVX512VLBW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1 357; AVX512VLBW-NEXT: vpsllvw %xmm1, %xmm0, %xmm2 358; AVX512VLBW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16] 359; AVX512VLBW-NEXT: vpsubw %xmm1, %xmm3, %xmm1 360; AVX512VLBW-NEXT: vpsrlvw %xmm1, %xmm0, %xmm0 361; AVX512VLBW-NEXT: vpor %xmm0, %xmm2, %xmm0 362; AVX512VLBW-NEXT: retq 363; 364; AVX512VBMI2-LABEL: var_rotate_v8i16: 365; AVX512VBMI2: # %bb.0: 366; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 367; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 368; AVX512VBMI2-NEXT: vpshldvw %zmm1, %zmm0, %zmm0 369; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 370; AVX512VBMI2-NEXT: vzeroupper 371; AVX512VBMI2-NEXT: retq 372; 373; AVX512VLVBMI2-LABEL: var_rotate_v8i16: 374; AVX512VLVBMI2: # %bb.0: 375; AVX512VLVBMI2-NEXT: vpshldvw %xmm1, %xmm0, %xmm0 376; AVX512VLVBMI2-NEXT: retq 377; 378; XOP-LABEL: var_rotate_v8i16: 379; XOP: # %bb.0: 380; XOP-NEXT: vprotw %xmm1, %xmm0, %xmm0 381; XOP-NEXT: retq 382; 383; X86-SSE2-LABEL: var_rotate_v8i16: 384; X86-SSE2: # %bb.0: 385; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 386; X86-SSE2-NEXT: movdqa %xmm1, %xmm2 387; X86-SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7] 388; X86-SSE2-NEXT: pslld $23, %xmm2 389; X86-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216] 390; X86-SSE2-NEXT: paddd %xmm3, %xmm2 391; X86-SSE2-NEXT: cvttps2dq %xmm2, %xmm2 392; X86-SSE2-NEXT: pslld $16, %xmm2 393; X86-SSE2-NEXT: psrad $16, %xmm2 394; X86-SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3] 395; X86-SSE2-NEXT: pslld $23, %xmm1 396; X86-SSE2-NEXT: paddd %xmm3, %xmm1 397; X86-SSE2-NEXT: cvttps2dq %xmm1, %xmm1 398; X86-SSE2-NEXT: pslld $16, %xmm1 399; X86-SSE2-NEXT: psrad $16, %xmm1 400; X86-SSE2-NEXT: packssdw %xmm2, %xmm1 401; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 402; X86-SSE2-NEXT: pmulhuw %xmm1, %xmm2 
403; X86-SSE2-NEXT: pmullw %xmm1, %xmm0 404; X86-SSE2-NEXT: por %xmm2, %xmm0 405; X86-SSE2-NEXT: retl 406 %b16 = sub <8 x i16> <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>, %b 407 %shl = shl <8 x i16> %a, %b 408 %lshr = lshr <8 x i16> %a, %b16 409 %or = or <8 x i16> %shl, %lshr 410 ret <8 x i16> %or 411} 412 413define <16 x i8> @var_rotate_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind { 414; SSE2-LABEL: var_rotate_v16i8: 415; SSE2: # %bb.0: 416; SSE2-NEXT: movdqa %xmm0, %xmm2 417; SSE2-NEXT: psllw $5, %xmm1 418; SSE2-NEXT: pxor %xmm0, %xmm0 419; SSE2-NEXT: pxor %xmm3, %xmm3 420; SSE2-NEXT: pcmpgtb %xmm1, %xmm3 421; SSE2-NEXT: movdqa %xmm2, %xmm4 422; SSE2-NEXT: psrlw $4, %xmm4 423; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 424; SSE2-NEXT: movdqa %xmm2, %xmm5 425; SSE2-NEXT: psllw $4, %xmm5 426; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm5 427; SSE2-NEXT: por %xmm4, %xmm5 428; SSE2-NEXT: pand %xmm3, %xmm5 429; SSE2-NEXT: pandn %xmm2, %xmm3 430; SSE2-NEXT: por %xmm5, %xmm3 431; SSE2-NEXT: movdqa %xmm3, %xmm2 432; SSE2-NEXT: psrlw $6, %xmm2 433; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 434; SSE2-NEXT: movdqa %xmm3, %xmm4 435; SSE2-NEXT: psllw $2, %xmm4 436; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 437; SSE2-NEXT: por %xmm2, %xmm4 438; SSE2-NEXT: paddb %xmm1, %xmm1 439; SSE2-NEXT: pxor %xmm2, %xmm2 440; SSE2-NEXT: pcmpgtb %xmm1, %xmm2 441; SSE2-NEXT: pand %xmm2, %xmm4 442; SSE2-NEXT: pandn %xmm3, %xmm2 443; SSE2-NEXT: por %xmm4, %xmm2 444; SSE2-NEXT: movdqa %xmm2, %xmm3 445; SSE2-NEXT: psrlw $7, %xmm3 446; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 447; SSE2-NEXT: movdqa %xmm2, %xmm4 448; SSE2-NEXT: paddb %xmm2, %xmm4 449; SSE2-NEXT: por %xmm3, %xmm4 450; SSE2-NEXT: paddb %xmm1, %xmm1 451; SSE2-NEXT: pcmpgtb %xmm1, %xmm0 452; SSE2-NEXT: pand %xmm0, %xmm4 453; SSE2-NEXT: pandn %xmm2, %xmm0 454; SSE2-NEXT: por %xmm4, %xmm0 455; SSE2-NEXT: retq 456; 457; SSE41-LABEL: var_rotate_v16i8: 458; SSE41: # %bb.0: 459; SSE41-NEXT: movdqa %xmm1, %xmm2 460; SSE41-NEXT: movdqa %xmm0, %xmm1 461; SSE41-NEXT: psrlw $4, %xmm0 462; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 463; SSE41-NEXT: movdqa %xmm1, %xmm3 464; SSE41-NEXT: psllw $4, %xmm3 465; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 466; SSE41-NEXT: por %xmm0, %xmm3 467; SSE41-NEXT: psllw $5, %xmm2 468; SSE41-NEXT: movdqa %xmm2, %xmm0 469; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm1 470; SSE41-NEXT: movdqa %xmm1, %xmm0 471; SSE41-NEXT: psrlw $6, %xmm0 472; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 473; SSE41-NEXT: movdqa %xmm1, %xmm3 474; SSE41-NEXT: psllw $2, %xmm3 475; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 476; SSE41-NEXT: por %xmm0, %xmm3 477; SSE41-NEXT: paddb %xmm2, %xmm2 478; SSE41-NEXT: movdqa %xmm2, %xmm0 479; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm1 480; SSE41-NEXT: movdqa %xmm1, %xmm0 481; SSE41-NEXT: psrlw $7, %xmm0 482; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 483; SSE41-NEXT: movdqa %xmm1, %xmm3 484; SSE41-NEXT: paddb %xmm1, %xmm3 485; SSE41-NEXT: por %xmm0, %xmm3 486; SSE41-NEXT: paddb %xmm2, %xmm2 487; SSE41-NEXT: movdqa %xmm2, %xmm0 488; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm1 489; SSE41-NEXT: movdqa %xmm1, %xmm0 490; SSE41-NEXT: retq 491; 492; AVX-LABEL: var_rotate_v16i8: 493; AVX: # %bb.0: 494; AVX-NEXT: vpsrlw $4, %xmm0, %xmm2 495; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 496; AVX-NEXT: vpsllw $4, %xmm0, %xmm3 497; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 498; 
AVX-NEXT: vpor %xmm2, %xmm3, %xmm2 499; AVX-NEXT: vpsllw $5, %xmm1, %xmm1 500; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 501; AVX-NEXT: vpsrlw $6, %xmm0, %xmm2 502; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 503; AVX-NEXT: vpsllw $2, %xmm0, %xmm3 504; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 505; AVX-NEXT: vpor %xmm2, %xmm3, %xmm2 506; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1 507; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 508; AVX-NEXT: vpsrlw $7, %xmm0, %xmm2 509; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 510; AVX-NEXT: vpaddb %xmm0, %xmm0, %xmm3 511; AVX-NEXT: vpor %xmm2, %xmm3, %xmm2 512; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1 513; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 514; AVX-NEXT: retq 515; 516; AVX512F-LABEL: var_rotate_v16i8: 517; AVX512F: # %bb.0: 518; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero 519; AVX512F-NEXT: vpslld $8, %zmm0, %zmm2 520; AVX512F-NEXT: vpord %zmm2, %zmm0, %zmm0 521; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 522; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero 523; AVX512F-NEXT: vpsllvd %zmm1, %zmm0, %zmm0 524; AVX512F-NEXT: vpsrld $8, %zmm0, %zmm0 525; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 526; AVX512F-NEXT: vzeroupper 527; AVX512F-NEXT: retq 528; 529; AVX512VL-LABEL: var_rotate_v16i8: 530; AVX512VL: # %bb.0: 531; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero 532; AVX512VL-NEXT: vpslld $8, %zmm0, %zmm2 533; AVX512VL-NEXT: vpord %zmm2, %zmm0, %zmm0 534; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1 535; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero 536; AVX512VL-NEXT: vpsllvd %zmm1, %zmm0, %zmm0 537; AVX512VL-NEXT: vpsrld $8, %zmm0, %zmm0 538; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0 539; AVX512VL-NEXT: vzeroupper 540; AVX512VL-NEXT: retq 541; 542; AVX512BW-LABEL: var_rotate_v16i8: 543; AVX512BW: # %bb.0: 544; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 545; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 546; AVX512BW-NEXT: vpxor %xmm3, 
%xmm3, %xmm3 547; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm3[8],xmm1[9],xmm3[9],xmm1[10],xmm3[10],xmm1[11],xmm3[11],xmm1[12],xmm3[12],xmm1[13],xmm3[13],xmm1[14],xmm3[14],xmm1[15],xmm3[15] 548; AVX512BW-NEXT: vpsllvw %zmm3, %zmm2, %zmm2 549; AVX512BW-NEXT: vpsrlw $8, %xmm2, %xmm2 550; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 551; AVX512BW-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero 552; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0 553; AVX512BW-NEXT: vpsrlw $8, %xmm0, %xmm0 554; AVX512BW-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 555; AVX512BW-NEXT: vzeroupper 556; AVX512BW-NEXT: retq 557; 558; AVX512VLBW-LABEL: var_rotate_v16i8: 559; AVX512VLBW: # %bb.0: 560; AVX512VLBW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1 561; AVX512VLBW-NEXT: vpxor %xmm2, %xmm2, %xmm2 562; AVX512VLBW-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] 563; AVX512VLBW-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 564; AVX512VLBW-NEXT: vpsllvw %xmm2, %xmm3, %xmm2 565; AVX512VLBW-NEXT: vpsrlw $8, %xmm2, %xmm2 566; AVX512VLBW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 567; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero 568; AVX512VLBW-NEXT: vpsllvw %xmm1, %xmm0, %xmm0 569; AVX512VLBW-NEXT: vpsrlw $8, %xmm0, %xmm0 570; AVX512VLBW-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 571; AVX512VLBW-NEXT: retq 572; 573; AVX512VBMI2-LABEL: var_rotate_v16i8: 574; AVX512VBMI2: # %bb.0: 575; AVX512VBMI2-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 576; AVX512VBMI2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 577; AVX512VBMI2-NEXT: vpxor %xmm3, %xmm3, %xmm3 578; AVX512VBMI2-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm3[8],xmm1[9],xmm3[9],xmm1[10],xmm3[10],xmm1[11],xmm3[11],xmm1[12],xmm3[12],xmm1[13],xmm3[13],xmm1[14],xmm3[14],xmm1[15],xmm3[15] 579; AVX512VBMI2-NEXT: vpsllvw %zmm3, %zmm2, %zmm2 580; AVX512VBMI2-NEXT: vpsrlw $8, %xmm2, %xmm2 581; AVX512VBMI2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 582; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero 583; AVX512VBMI2-NEXT: vpsllvw %zmm1, %zmm0, %zmm0 584; AVX512VBMI2-NEXT: vpsrlw $8, %xmm0, %xmm0 585; AVX512VBMI2-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 586; AVX512VBMI2-NEXT: vzeroupper 587; AVX512VBMI2-NEXT: retq 588; 589; AVX512VLVBMI2-LABEL: var_rotate_v16i8: 590; AVX512VLVBMI2: # %bb.0: 591; AVX512VLVBMI2-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1 592; AVX512VLVBMI2-NEXT: vpxor %xmm2, %xmm2, %xmm2 593; AVX512VLVBMI2-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] 594; AVX512VLVBMI2-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 595; AVX512VLVBMI2-NEXT: vpsllvw %xmm2, %xmm3, %xmm2 596; AVX512VLVBMI2-NEXT: vpsrlw $8, %xmm2, %xmm2 597; AVX512VLVBMI2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 598; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} xmm1 = 
xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero 599; AVX512VLVBMI2-NEXT: vpsllvw %xmm1, %xmm0, %xmm0 600; AVX512VLVBMI2-NEXT: vpsrlw $8, %xmm0, %xmm0 601; AVX512VLVBMI2-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 602; AVX512VLVBMI2-NEXT: retq 603; 604; XOP-LABEL: var_rotate_v16i8: 605; XOP: # %bb.0: 606; XOP-NEXT: vprotb %xmm1, %xmm0, %xmm0 607; XOP-NEXT: retq 608; 609; X86-SSE2-LABEL: var_rotate_v16i8: 610; X86-SSE2: # %bb.0: 611; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 612; X86-SSE2-NEXT: psllw $5, %xmm1 613; X86-SSE2-NEXT: pxor %xmm0, %xmm0 614; X86-SSE2-NEXT: pxor %xmm3, %xmm3 615; X86-SSE2-NEXT: pcmpgtb %xmm1, %xmm3 616; X86-SSE2-NEXT: movdqa %xmm2, %xmm4 617; X86-SSE2-NEXT: psrlw $4, %xmm4 618; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm4 619; X86-SSE2-NEXT: movdqa %xmm2, %xmm5 620; X86-SSE2-NEXT: psllw $4, %xmm5 621; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm5 622; X86-SSE2-NEXT: por %xmm4, %xmm5 623; X86-SSE2-NEXT: pand %xmm3, %xmm5 624; X86-SSE2-NEXT: pandn %xmm2, %xmm3 625; X86-SSE2-NEXT: por %xmm5, %xmm3 626; X86-SSE2-NEXT: movdqa %xmm3, %xmm2 627; X86-SSE2-NEXT: psrlw $6, %xmm2 628; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2 629; X86-SSE2-NEXT: movdqa %xmm3, %xmm4 630; X86-SSE2-NEXT: psllw $2, %xmm4 631; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm4 632; X86-SSE2-NEXT: por %xmm2, %xmm4 633; X86-SSE2-NEXT: paddb %xmm1, %xmm1 634; X86-SSE2-NEXT: pxor %xmm2, %xmm2 635; X86-SSE2-NEXT: pcmpgtb %xmm1, %xmm2 636; X86-SSE2-NEXT: pand %xmm2, %xmm4 637; X86-SSE2-NEXT: pandn %xmm3, %xmm2 638; X86-SSE2-NEXT: por %xmm4, %xmm2 639; X86-SSE2-NEXT: movdqa %xmm2, %xmm3 640; X86-SSE2-NEXT: psrlw $7, %xmm3 641; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm3 642; X86-SSE2-NEXT: movdqa %xmm2, %xmm4 643; X86-SSE2-NEXT: paddb %xmm2, %xmm4 644; X86-SSE2-NEXT: por %xmm3, %xmm4 645; X86-SSE2-NEXT: paddb %xmm1, %xmm1 646; X86-SSE2-NEXT: pcmpgtb %xmm1, %xmm0 647; X86-SSE2-NEXT: pand %xmm0, %xmm4 648; X86-SSE2-NEXT: pandn %xmm2, %xmm0 649; X86-SSE2-NEXT: por %xmm4, %xmm0 650; X86-SSE2-NEXT: retl 651 %b8 = sub <16 x i8> <i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8>, %b 652 %shl = shl <16 x i8> %a, %b 653 %lshr = lshr <16 x i8> %a, %b8 654 %or = or <16 x i8> %shl, %lshr 655 ret <16 x i8> %or 656} 657 658; 659; Uniform Variable Rotates 660; 661 662define <2 x i64> @splatvar_rotate_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind { 663; SSE2-LABEL: splatvar_rotate_v2i64: 664; SSE2: # %bb.0: 665; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [64,64] 666; SSE2-NEXT: psubq %xmm1, %xmm2 667; SSE2-NEXT: movdqa %xmm0, %xmm3 668; SSE2-NEXT: psllq %xmm1, %xmm3 669; SSE2-NEXT: psrlq %xmm2, %xmm0 670; SSE2-NEXT: por %xmm3, %xmm0 671; SSE2-NEXT: retq 672; 673; SSE41-LABEL: splatvar_rotate_v2i64: 674; SSE41: # %bb.0: 675; SSE41-NEXT: pmovsxbq {{.*#+}} xmm2 = [64,64] 676; SSE41-NEXT: psubq %xmm1, %xmm2 677; SSE41-NEXT: movdqa %xmm0, %xmm3 678; SSE41-NEXT: psllq %xmm1, %xmm3 679; SSE41-NEXT: psrlq %xmm2, %xmm0 680; SSE41-NEXT: por %xmm3, %xmm0 681; SSE41-NEXT: retq 682; 683; AVX-LABEL: splatvar_rotate_v2i64: 684; AVX: # %bb.0: 685; AVX-NEXT: vpmovsxbq {{.*#+}} xmm2 = [64,64] 686; AVX-NEXT: vpsubq %xmm1, %xmm2, %xmm2 687; AVX-NEXT: vpsllq %xmm1, %xmm0, %xmm1 688; AVX-NEXT: vpsrlq %xmm2, %xmm0, %xmm0 689; AVX-NEXT: vpor %xmm0, %xmm1, %xmm0 690; AVX-NEXT: retq 691; 692; AVX512NOVLX-LABEL: splatvar_rotate_v2i64: 693; AVX512NOVLX: # %bb.0: 694; AVX512NOVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 695; 
AVX512NOVLX-NEXT: vpbroadcastq %xmm1, %xmm1 696; AVX512NOVLX-NEXT: vprolvq %zmm1, %zmm0, %zmm0 697; AVX512NOVLX-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 698; AVX512NOVLX-NEXT: vzeroupper 699; AVX512NOVLX-NEXT: retq 700; 701; AVX512VLX-LABEL: splatvar_rotate_v2i64: 702; AVX512VLX: # %bb.0: 703; AVX512VLX-NEXT: vpbroadcastq %xmm1, %xmm1 704; AVX512VLX-NEXT: vprolvq %xmm1, %xmm0, %xmm0 705; AVX512VLX-NEXT: retq 706; 707; XOPAVX1-LABEL: splatvar_rotate_v2i64: 708; XOPAVX1: # %bb.0: 709; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] 710; XOPAVX1-NEXT: vprotq %xmm1, %xmm0, %xmm0 711; XOPAVX1-NEXT: retq 712; 713; XOPAVX2-LABEL: splatvar_rotate_v2i64: 714; XOPAVX2: # %bb.0: 715; XOPAVX2-NEXT: vpbroadcastq %xmm1, %xmm1 716; XOPAVX2-NEXT: vprotq %xmm1, %xmm0, %xmm0 717; XOPAVX2-NEXT: retq 718; 719; X86-SSE2-LABEL: splatvar_rotate_v2i64: 720; X86-SSE2: # %bb.0: 721; X86-SSE2-NEXT: movd {{.*#+}} xmm2 = [64,0,0,0] 722; X86-SSE2-NEXT: psubq %xmm1, %xmm2 723; X86-SSE2-NEXT: movdqa %xmm0, %xmm3 724; X86-SSE2-NEXT: psllq %xmm1, %xmm3 725; X86-SSE2-NEXT: psrlq %xmm2, %xmm0 726; X86-SSE2-NEXT: por %xmm3, %xmm0 727; X86-SSE2-NEXT: retl 728 %splat = shufflevector <2 x i64> %b, <2 x i64> undef, <2 x i32> zeroinitializer 729 %splat64 = sub <2 x i64> <i64 64, i64 64>, %splat 730 %shl = shl <2 x i64> %a, %splat 731 %lshr = lshr <2 x i64> %a, %splat64 732 %or = or <2 x i64> %shl, %lshr 733 ret <2 x i64> %or 734} 735 736define <4 x i32> @splatvar_rotate_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind { 737; SSE-LABEL: splatvar_rotate_v4i32: 738; SSE: # %bb.0: 739; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 740; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,2,3,3] 741; SSE-NEXT: psllq %xmm1, %xmm2 742; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] 743; SSE-NEXT: psllq %xmm1, %xmm0 744; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm2[1,3] 745; SSE-NEXT: retq 746; 747; AVX-LABEL: splatvar_rotate_v4i32: 748; AVX: # %bb.0: 749; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 750; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,2,3,3] 751; AVX-NEXT: vpsllq %xmm1, %xmm2, %xmm2 752; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] 753; AVX-NEXT: vpsllq %xmm1, %xmm0, %xmm0 754; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm2[1,3] 755; AVX-NEXT: retq 756; 757; AVX512NOVLX-LABEL: splatvar_rotate_v4i32: 758; AVX512NOVLX: # %bb.0: 759; AVX512NOVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 760; AVX512NOVLX-NEXT: vpbroadcastd %xmm1, %xmm1 761; AVX512NOVLX-NEXT: vprolvd %zmm1, %zmm0, %zmm0 762; AVX512NOVLX-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 763; AVX512NOVLX-NEXT: vzeroupper 764; AVX512NOVLX-NEXT: retq 765; 766; AVX512VLX-LABEL: splatvar_rotate_v4i32: 767; AVX512VLX: # %bb.0: 768; AVX512VLX-NEXT: vpbroadcastd %xmm1, %xmm1 769; AVX512VLX-NEXT: vprolvd %xmm1, %xmm0, %xmm0 770; AVX512VLX-NEXT: retq 771; 772; XOPAVX1-LABEL: splatvar_rotate_v4i32: 773; XOPAVX1: # %bb.0: 774; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] 775; XOPAVX1-NEXT: vprotd %xmm1, %xmm0, %xmm0 776; XOPAVX1-NEXT: retq 777; 778; XOPAVX2-LABEL: splatvar_rotate_v4i32: 779; XOPAVX2: # %bb.0: 780; XOPAVX2-NEXT: vpbroadcastd %xmm1, %xmm1 781; XOPAVX2-NEXT: vprotd %xmm1, %xmm0, %xmm0 782; XOPAVX2-NEXT: retq 783; 784; X86-SSE2-LABEL: splatvar_rotate_v4i32: 785; X86-SSE2: # %bb.0: 786; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 787; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,2,3,3] 788; X86-SSE2-NEXT: psllq %xmm1, %xmm2 789; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] 790; X86-SSE2-NEXT: psllq %xmm1, %xmm0 791; 
X86-SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm2[1,3] 792; X86-SSE2-NEXT: retl 793 %splat = shufflevector <4 x i32> %b, <4 x i32> undef, <4 x i32> zeroinitializer 794 %splat32 = sub <4 x i32> <i32 32, i32 32, i32 32, i32 32>, %splat 795 %shl = shl <4 x i32> %a, %splat 796 %lshr = lshr <4 x i32> %a, %splat32 797 %or = or <4 x i32> %shl, %lshr 798 ret <4 x i32> %or 799} 800 801define <8 x i16> @splatvar_rotate_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind { 802; SSE2-LABEL: splatvar_rotate_v8i16: 803; SSE2: # %bb.0: 804; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 805; SSE2-NEXT: movdqa %xmm0, %xmm2 806; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] 807; SSE2-NEXT: pslld %xmm1, %xmm2 808; SSE2-NEXT: psrad $16, %xmm2 809; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] 810; SSE2-NEXT: pslld %xmm1, %xmm0 811; SSE2-NEXT: psrad $16, %xmm0 812; SSE2-NEXT: packssdw %xmm2, %xmm0 813; SSE2-NEXT: retq 814; 815; SSE41-LABEL: splatvar_rotate_v8i16: 816; SSE41: # %bb.0: 817; SSE41-NEXT: pmovsxbq {{.*#+}} xmm2 = [15,0] 818; SSE41-NEXT: movdqa %xmm1, %xmm3 819; SSE41-NEXT: pandn %xmm2, %xmm3 820; SSE41-NEXT: movdqa %xmm0, %xmm4 821; SSE41-NEXT: psrlw $1, %xmm4 822; SSE41-NEXT: psrlw %xmm3, %xmm4 823; SSE41-NEXT: pand %xmm2, %xmm1 824; SSE41-NEXT: psllw %xmm1, %xmm0 825; SSE41-NEXT: por %xmm4, %xmm0 826; SSE41-NEXT: retq 827; 828; AVX-LABEL: splatvar_rotate_v8i16: 829; AVX: # %bb.0: 830; AVX-NEXT: vpmovsxbq {{.*#+}} xmm2 = [15,0] 831; AVX-NEXT: vpandn %xmm2, %xmm1, %xmm3 832; AVX-NEXT: vpsrlw $1, %xmm0, %xmm4 833; AVX-NEXT: vpsrlw %xmm3, %xmm4, %xmm3 834; AVX-NEXT: vpand %xmm2, %xmm1, %xmm1 835; AVX-NEXT: vpsllw %xmm1, %xmm0, %xmm0 836; AVX-NEXT: vpor %xmm3, %xmm0, %xmm0 837; AVX-NEXT: retq 838; 839; AVX512F-LABEL: splatvar_rotate_v8i16: 840; AVX512F: # %bb.0: 841; AVX512F-NEXT: vpmovsxbq {{.*#+}} xmm2 = [15,0] 842; AVX512F-NEXT: vpandn %xmm2, %xmm1, %xmm3 843; AVX512F-NEXT: vpsrlw $1, %xmm0, %xmm4 844; AVX512F-NEXT: vpsrlw %xmm3, %xmm4, %xmm3 845; AVX512F-NEXT: vpand %xmm2, %xmm1, %xmm1 846; AVX512F-NEXT: vpsllw %xmm1, %xmm0, %xmm0 847; AVX512F-NEXT: vpor %xmm3, %xmm0, %xmm0 848; AVX512F-NEXT: retq 849; 850; AVX512VL-LABEL: splatvar_rotate_v8i16: 851; AVX512VL: # %bb.0: 852; AVX512VL-NEXT: vpmovsxbq {{.*#+}} xmm2 = [15,0] 853; AVX512VL-NEXT: vpandn %xmm2, %xmm1, %xmm3 854; AVX512VL-NEXT: vpsrlw $1, %xmm0, %xmm4 855; AVX512VL-NEXT: vpsrlw %xmm3, %xmm4, %xmm3 856; AVX512VL-NEXT: vpand %xmm2, %xmm1, %xmm1 857; AVX512VL-NEXT: vpsllw %xmm1, %xmm0, %xmm0 858; AVX512VL-NEXT: vpor %xmm3, %xmm0, %xmm0 859; AVX512VL-NEXT: retq 860; 861; AVX512BW-LABEL: splatvar_rotate_v8i16: 862; AVX512BW: # %bb.0: 863; AVX512BW-NEXT: vpmovsxbq {{.*#+}} xmm2 = [15,0] 864; AVX512BW-NEXT: vpandn %xmm2, %xmm1, %xmm3 865; AVX512BW-NEXT: vpsrlw $1, %xmm0, %xmm4 866; AVX512BW-NEXT: vpsrlw %xmm3, %xmm4, %xmm3 867; AVX512BW-NEXT: vpand %xmm2, %xmm1, %xmm1 868; AVX512BW-NEXT: vpsllw %xmm1, %xmm0, %xmm0 869; AVX512BW-NEXT: vpor %xmm3, %xmm0, %xmm0 870; AVX512BW-NEXT: retq 871; 872; AVX512VLBW-LABEL: splatvar_rotate_v8i16: 873; AVX512VLBW: # %bb.0: 874; AVX512VLBW-NEXT: vpmovsxbq {{.*#+}} xmm2 = [15,0] 875; AVX512VLBW-NEXT: vpandn %xmm2, %xmm1, %xmm3 876; AVX512VLBW-NEXT: vpsrlw $1, %xmm0, %xmm4 877; AVX512VLBW-NEXT: vpsrlw %xmm3, %xmm4, %xmm3 878; AVX512VLBW-NEXT: vpand %xmm2, %xmm1, %xmm1 879; AVX512VLBW-NEXT: vpsllw %xmm1, %xmm0, %xmm0 880; AVX512VLBW-NEXT: vpor %xmm3, %xmm0, %xmm0 881; AVX512VLBW-NEXT: retq 882; 883; AVX512VBMI2-LABEL: splatvar_rotate_v8i16: 884; 
AVX512VBMI2: # %bb.0: 885; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 886; AVX512VBMI2-NEXT: vpbroadcastw %xmm1, %xmm1 887; AVX512VBMI2-NEXT: vpshldvw %zmm1, %zmm0, %zmm0 888; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 889; AVX512VBMI2-NEXT: vzeroupper 890; AVX512VBMI2-NEXT: retq 891; 892; AVX512VLVBMI2-LABEL: splatvar_rotate_v8i16: 893; AVX512VLVBMI2: # %bb.0: 894; AVX512VLVBMI2-NEXT: vpbroadcastw %xmm1, %xmm1 895; AVX512VLVBMI2-NEXT: vpshldvw %xmm1, %xmm0, %xmm0 896; AVX512VLVBMI2-NEXT: retq 897; 898; XOPAVX1-LABEL: splatvar_rotate_v8i16: 899; XOPAVX1: # %bb.0: 900; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] 901; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] 902; XOPAVX1-NEXT: vprotw %xmm1, %xmm0, %xmm0 903; XOPAVX1-NEXT: retq 904; 905; XOPAVX2-LABEL: splatvar_rotate_v8i16: 906; XOPAVX2: # %bb.0: 907; XOPAVX2-NEXT: vpbroadcastw %xmm1, %xmm1 908; XOPAVX2-NEXT: vprotw %xmm1, %xmm0, %xmm0 909; XOPAVX2-NEXT: retq 910; 911; X86-SSE2-LABEL: splatvar_rotate_v8i16: 912; X86-SSE2: # %bb.0: 913; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 914; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 915; X86-SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] 916; X86-SSE2-NEXT: pslld %xmm1, %xmm2 917; X86-SSE2-NEXT: psrad $16, %xmm2 918; X86-SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] 919; X86-SSE2-NEXT: pslld %xmm1, %xmm0 920; X86-SSE2-NEXT: psrad $16, %xmm0 921; X86-SSE2-NEXT: packssdw %xmm2, %xmm0 922; X86-SSE2-NEXT: retl 923 %splat = shufflevector <8 x i16> %b, <8 x i16> undef, <8 x i32> zeroinitializer 924 %splat16 = sub <8 x i16> <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>, %splat 925 %shl = shl <8 x i16> %a, %splat 926 %lshr = lshr <8 x i16> %a, %splat16 927 %or = or <8 x i16> %shl, %lshr 928 ret <8 x i16> %or 929} 930 931define <16 x i8> @splatvar_rotate_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind { 932; SSE-LABEL: splatvar_rotate_v16i8: 933; SSE: # %bb.0: 934; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 935; SSE-NEXT: movdqa %xmm0, %xmm2 936; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] 937; SSE-NEXT: psllw %xmm1, %xmm2 938; SSE-NEXT: psrlw $8, %xmm2 939; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 940; SSE-NEXT: psllw %xmm1, %xmm0 941; SSE-NEXT: psrlw $8, %xmm0 942; SSE-NEXT: packuswb %xmm2, %xmm0 943; SSE-NEXT: retq 944; 945; AVX-LABEL: splatvar_rotate_v16i8: 946; AVX: # %bb.0: 947; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 948; AVX-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 949; AVX-NEXT: vpsllw %xmm1, %xmm2, %xmm2 950; AVX-NEXT: vpsrlw $8, %xmm2, %xmm2 951; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 952; AVX-NEXT: vpsllw %xmm1, %xmm0, %xmm0 953; AVX-NEXT: vpsrlw $8, %xmm0, %xmm0 954; AVX-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 955; AVX-NEXT: retq 956; 957; AVX512-LABEL: splatvar_rotate_v16i8: 958; AVX512: # %bb.0: 959; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 960; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 961; AVX512-NEXT: vpsllw %xmm1, %xmm2, %xmm2 962; AVX512-NEXT: vpsrlw $8, %xmm2, %xmm2 963; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 964; AVX512-NEXT: vpsllw %xmm1, %xmm0, %xmm0 965; AVX512-NEXT: 
vpsrlw $8, %xmm0, %xmm0 966; AVX512-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 967; AVX512-NEXT: retq 968; 969; XOPAVX1-LABEL: splatvar_rotate_v16i8: 970; XOPAVX1: # %bb.0: 971; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 972; XOPAVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 973; XOPAVX1-NEXT: vprotb %xmm1, %xmm0, %xmm0 974; XOPAVX1-NEXT: retq 975; 976; XOPAVX2-LABEL: splatvar_rotate_v16i8: 977; XOPAVX2: # %bb.0: 978; XOPAVX2-NEXT: vpbroadcastb %xmm1, %xmm1 979; XOPAVX2-NEXT: vprotb %xmm1, %xmm0, %xmm0 980; XOPAVX2-NEXT: retq 981; 982; X86-SSE2-LABEL: splatvar_rotate_v16i8: 983; X86-SSE2: # %bb.0: 984; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 985; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 986; X86-SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] 987; X86-SSE2-NEXT: psllw %xmm1, %xmm2 988; X86-SSE2-NEXT: psrlw $8, %xmm2 989; X86-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 990; X86-SSE2-NEXT: psllw %xmm1, %xmm0 991; X86-SSE2-NEXT: psrlw $8, %xmm0 992; X86-SSE2-NEXT: packuswb %xmm2, %xmm0 993; X86-SSE2-NEXT: retl 994 %splat = shufflevector <16 x i8> %b, <16 x i8> undef, <16 x i32> zeroinitializer 995 %splat8 = sub <16 x i8> <i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8>, %splat 996 %shl = shl <16 x i8> %a, %splat 997 %lshr = lshr <16 x i8> %a, %splat8 998 %or = or <16 x i8> %shl, %lshr 999 ret <16 x i8> %or 1000} 1001 1002; 1003; Constant Rotates 1004; 1005 1006define <2 x i64> @constant_rotate_v2i64(<2 x i64> %a) nounwind { 1007; SSE2-LABEL: constant_rotate_v2i64: 1008; SSE2: # %bb.0: 1009; SSE2-NEXT: movdqa %xmm0, %xmm1 1010; SSE2-NEXT: psrlq $60, %xmm1 1011; SSE2-NEXT: movdqa %xmm0, %xmm2 1012; SSE2-NEXT: psrlq $50, %xmm2 1013; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] 1014; SSE2-NEXT: movdqa %xmm0, %xmm1 1015; SSE2-NEXT: psllq $4, %xmm1 1016; SSE2-NEXT: psllq $14, %xmm0 1017; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] 1018; SSE2-NEXT: orpd %xmm2, %xmm0 1019; SSE2-NEXT: retq 1020; 1021; SSE41-LABEL: constant_rotate_v2i64: 1022; SSE41: # %bb.0: 1023; SSE41-NEXT: movdqa %xmm0, %xmm1 1024; SSE41-NEXT: psrlq $50, %xmm1 1025; SSE41-NEXT: movdqa %xmm0, %xmm2 1026; SSE41-NEXT: psrlq $60, %xmm2 1027; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7] 1028; SSE41-NEXT: movdqa %xmm0, %xmm1 1029; SSE41-NEXT: psllq $14, %xmm1 1030; SSE41-NEXT: psllq $4, %xmm0 1031; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] 1032; SSE41-NEXT: por %xmm2, %xmm0 1033; SSE41-NEXT: retq 1034; 1035; AVX1-LABEL: constant_rotate_v2i64: 1036; AVX1: # %bb.0: 1037; AVX1-NEXT: vpsrlq $50, %xmm0, %xmm1 1038; AVX1-NEXT: vpsrlq $60, %xmm0, %xmm2 1039; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7] 1040; AVX1-NEXT: vpsllq $14, %xmm0, %xmm2 1041; AVX1-NEXT: vpsllq $4, %xmm0, %xmm0 1042; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7] 1043; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 1044; AVX1-NEXT: retq 1045; 1046; AVX2-LABEL: constant_rotate_v2i64: 1047; AVX2: # %bb.0: 1048; AVX2-NEXT: vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 1049; AVX2-NEXT: vpsllvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1050; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 1051; AVX2-NEXT: retq 1052; 1053; AVX512NOVLX-LABEL: constant_rotate_v2i64: 1054; AVX512NOVLX: # %bb.0: 1055; AVX512NOVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 1056; AVX512NOVLX-NEXT: vpmovsxbq {{.*#+}} xmm1 = [4,14] 
1057; AVX512NOVLX-NEXT: vprolvq %zmm1, %zmm0, %zmm0 1058; AVX512NOVLX-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 1059; AVX512NOVLX-NEXT: vzeroupper 1060; AVX512NOVLX-NEXT: retq 1061; 1062; AVX512VLX-LABEL: constant_rotate_v2i64: 1063; AVX512VLX: # %bb.0: 1064; AVX512VLX-NEXT: vprolvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1065; AVX512VLX-NEXT: retq 1066; 1067; XOP-LABEL: constant_rotate_v2i64: 1068; XOP: # %bb.0: 1069; XOP-NEXT: vprotq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1070; XOP-NEXT: retq 1071; 1072; X86-SSE2-LABEL: constant_rotate_v2i64: 1073; X86-SSE2: # %bb.0: 1074; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 1075; X86-SSE2-NEXT: psrlq $60, %xmm1 1076; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 1077; X86-SSE2-NEXT: psrlq $50, %xmm2 1078; X86-SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] 1079; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 1080; X86-SSE2-NEXT: psllq $4, %xmm1 1081; X86-SSE2-NEXT: psllq $14, %xmm0 1082; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] 1083; X86-SSE2-NEXT: orpd %xmm2, %xmm0 1084; X86-SSE2-NEXT: retl 1085 %shl = shl <2 x i64> %a, <i64 4, i64 14> 1086 %lshr = lshr <2 x i64> %a, <i64 60, i64 50> 1087 %or = or <2 x i64> %shl, %lshr 1088 ret <2 x i64> %or 1089} 1090 1091define <4 x i32> @constant_rotate_v4i32(<4 x i32> %a) nounwind { 1092; SSE2-LABEL: constant_rotate_v4i32: 1093; SSE2: # %bb.0: 1094; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] 1095; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 1096; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3] 1097; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 1098; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3] 1099; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] 1100; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 1101; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 1102; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 1103; SSE2-NEXT: por %xmm2, %xmm0 1104; SSE2-NEXT: retq 1105; 1106; SSE41-LABEL: constant_rotate_v4i32: 1107; SSE41: # %bb.0: 1108; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] 1109; SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 1110; SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 1111; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] 1112; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] 1113; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] 1114; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] 1115; SSE41-NEXT: por %xmm2, %xmm0 1116; SSE41-NEXT: retq 1117; 1118; AVX1-LABEL: constant_rotate_v4i32: 1119; AVX1: # %bb.0: 1120; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] 1121; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 1122; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1123; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] 1124; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] 1125; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] 1126; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] 1127; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0 1128; AVX1-NEXT: retq 1129; 1130; AVX2-LABEL: constant_rotate_v4i32: 1131; AVX2: # %bb.0: 1132; AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 1133; AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1134; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 1135; AVX2-NEXT: retq 1136; 1137; AVX512NOVLX-LABEL: constant_rotate_v4i32: 1138; AVX512NOVLX: # %bb.0: 1139; AVX512NOVLX-NEXT: # kill: def $xmm0 killed $xmm0 def 
$zmm0 1140; AVX512NOVLX-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,5,6,7] 1141; AVX512NOVLX-NEXT: vprolvd %zmm1, %zmm0, %zmm0 1142; AVX512NOVLX-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 1143; AVX512NOVLX-NEXT: vzeroupper 1144; AVX512NOVLX-NEXT: retq 1145; 1146; AVX512VLX-LABEL: constant_rotate_v4i32: 1147; AVX512VLX: # %bb.0: 1148; AVX512VLX-NEXT: vprolvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1149; AVX512VLX-NEXT: retq 1150; 1151; XOP-LABEL: constant_rotate_v4i32: 1152; XOP: # %bb.0: 1153; XOP-NEXT: vprotd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1154; XOP-NEXT: retq 1155; 1156; X86-SSE2-LABEL: constant_rotate_v4i32: 1157; X86-SSE2: # %bb.0: 1158; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] 1159; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 1160; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3] 1161; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 1162; X86-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3] 1163; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] 1164; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 1165; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 1166; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 1167; X86-SSE2-NEXT: por %xmm2, %xmm0 1168; X86-SSE2-NEXT: retl 1169 %shl = shl <4 x i32> %a, <i32 4, i32 5, i32 6, i32 7> 1170 %lshr = lshr <4 x i32> %a, <i32 28, i32 27, i32 26, i32 25> 1171 %or = or <4 x i32> %shl, %lshr 1172 ret <4 x i32> %or 1173} 1174 1175define <8 x i16> @constant_rotate_v8i16(<8 x i16> %a) nounwind { 1176; SSE2-LABEL: constant_rotate_v8i16: 1177; SSE2: # %bb.0: 1178; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128] 1179; SSE2-NEXT: movdqa %xmm0, %xmm2 1180; SSE2-NEXT: pmulhuw %xmm1, %xmm2 1181; SSE2-NEXT: pmullw %xmm1, %xmm0 1182; SSE2-NEXT: por %xmm2, %xmm0 1183; SSE2-NEXT: retq 1184; 1185; SSE41-LABEL: constant_rotate_v8i16: 1186; SSE41: # %bb.0: 1187; SSE41-NEXT: pmovzxbw {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128] 1188; SSE41-NEXT: movdqa %xmm0, %xmm2 1189; SSE41-NEXT: pmulhuw %xmm1, %xmm2 1190; SSE41-NEXT: pmullw %xmm1, %xmm0 1191; SSE41-NEXT: por %xmm2, %xmm0 1192; SSE41-NEXT: retq 1193; 1194; AVX-LABEL: constant_rotate_v8i16: 1195; AVX: # %bb.0: 1196; AVX-NEXT: vpmovzxbw {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128] 1197; AVX-NEXT: vpmulhuw %xmm1, %xmm0, %xmm2 1198; AVX-NEXT: vpmullw %xmm1, %xmm0, %xmm0 1199; AVX-NEXT: vpor %xmm2, %xmm0, %xmm0 1200; AVX-NEXT: retq 1201; 1202; AVX512F-LABEL: constant_rotate_v8i16: 1203; AVX512F: # %bb.0: 1204; AVX512F-NEXT: vpmovzxbw {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128] 1205; AVX512F-NEXT: vpmulhuw %xmm1, %xmm0, %xmm2 1206; AVX512F-NEXT: vpmullw %xmm1, %xmm0, %xmm0 1207; AVX512F-NEXT: vpor %xmm2, %xmm0, %xmm0 1208; AVX512F-NEXT: retq 1209; 1210; AVX512VL-LABEL: constant_rotate_v8i16: 1211; AVX512VL: # %bb.0: 1212; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128] 1213; AVX512VL-NEXT: vpmulhuw %xmm1, %xmm0, %xmm2 1214; AVX512VL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 1215; AVX512VL-NEXT: vpor %xmm2, %xmm0, %xmm0 1216; AVX512VL-NEXT: retq 1217; 1218; AVX512BW-LABEL: constant_rotate_v8i16: 1219; AVX512BW: # %bb.0: 1220; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 1221; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7] 1222; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm2 = [16,15,14,13,12,11,10,9] 1223; AVX512BW-NEXT: vpsrlvw %zmm2, %zmm0, %zmm2 1224; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0 1225; AVX512BW-NEXT: vpor %xmm2, %xmm0, %xmm0 1226; AVX512BW-NEXT: vzeroupper 1227; AVX512BW-NEXT: retq 1228; 
1229; AVX512VLBW-LABEL: constant_rotate_v8i16: 1230; AVX512VLBW: # %bb.0: 1231; AVX512VLBW-NEXT: vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 1232; AVX512VLBW-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1233; AVX512VLBW-NEXT: vpor %xmm1, %xmm0, %xmm0 1234; AVX512VLBW-NEXT: retq 1235; 1236; AVX512VBMI2-LABEL: constant_rotate_v8i16: 1237; AVX512VBMI2: # %bb.0: 1238; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 1239; AVX512VBMI2-NEXT: vpmovsxbw {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7] 1240; AVX512VBMI2-NEXT: vpshldvw %zmm1, %zmm0, %zmm0 1241; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 1242; AVX512VBMI2-NEXT: vzeroupper 1243; AVX512VBMI2-NEXT: retq 1244; 1245; AVX512VLVBMI2-LABEL: constant_rotate_v8i16: 1246; AVX512VLVBMI2: # %bb.0: 1247; AVX512VLVBMI2-NEXT: vpshldvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1248; AVX512VLVBMI2-NEXT: retq 1249; 1250; XOP-LABEL: constant_rotate_v8i16: 1251; XOP: # %bb.0: 1252; XOP-NEXT: vprotw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1253; XOP-NEXT: retq 1254; 1255; X86-SSE2-LABEL: constant_rotate_v8i16: 1256; X86-SSE2: # %bb.0: 1257; X86-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128] 1258; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 1259; X86-SSE2-NEXT: pmulhuw %xmm1, %xmm2 1260; X86-SSE2-NEXT: pmullw %xmm1, %xmm0 1261; X86-SSE2-NEXT: por %xmm2, %xmm0 1262; X86-SSE2-NEXT: retl 1263 %shl = shl <8 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7> 1264 %lshr = lshr <8 x i16> %a, <i16 16, i16 15, i16 14, i16 13, i16 12, i16 11, i16 10, i16 9> 1265 %or = or <8 x i16> %shl, %lshr 1266 ret <8 x i16> %or 1267} 1268 1269define <16 x i8> @constant_rotate_v16i8(<16 x i8> %a) nounwind { 1270; SSE-LABEL: constant_rotate_v16i8: 1271; SSE: # %bb.0: 1272; SSE-NEXT: movdqa %xmm0, %xmm1 1273; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] 1274; SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [1,128,64,32,16,8,4,2] 1275; SSE-NEXT: psrlw $8, %xmm1 1276; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1277; SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [1,2,4,8,16,32,64,128] 1278; SSE-NEXT: psrlw $8, %xmm0 1279; SSE-NEXT: packuswb %xmm1, %xmm0 1280; SSE-NEXT: retq 1281; 1282; AVX-LABEL: constant_rotate_v16i8: 1283; AVX: # %bb.0: 1284; AVX-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 1285; AVX-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [1,128,64,32,16,8,4,2] 1286; AVX-NEXT: vpsrlw $8, %xmm1, %xmm1 1287; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1288; AVX-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [1,2,4,8,16,32,64,128] 1289; AVX-NEXT: vpsrlw $8, %xmm0, %xmm0 1290; AVX-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 1291; AVX-NEXT: retq 1292; 1293; AVX512F-LABEL: constant_rotate_v16i8: 1294; AVX512F: # %bb.0: 1295; AVX512F-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 1296; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [1,128,64,32,16,8,4,2] 1297; AVX512F-NEXT: vpsrlw $8, %xmm1, %xmm1 1298; AVX512F-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1299; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [1,2,4,8,16,32,64,128] 1300; AVX512F-NEXT: vpsrlw $8, %xmm0, %xmm0 1301; AVX512F-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 1302; AVX512F-NEXT: retq 
1303; 1304; AVX512VL-LABEL: constant_rotate_v16i8: 1305; AVX512VL: # %bb.0: 1306; AVX512VL-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 1307; AVX512VL-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [1,128,64,32,16,8,4,2] 1308; AVX512VL-NEXT: vpsrlw $8, %xmm1, %xmm1 1309; AVX512VL-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1310; AVX512VL-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [1,2,4,8,16,32,64,128] 1311; AVX512VL-NEXT: vpsrlw $8, %xmm0, %xmm0 1312; AVX512VL-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 1313; AVX512VL-NEXT: retq 1314; 1315; AVX512BW-LABEL: constant_rotate_v16i8: 1316; AVX512BW: # %bb.0: 1317; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm1 = [0,7,6,5,4,3,2,1] 1318; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 1319; AVX512BW-NEXT: vpsllvw %zmm1, %zmm2, %zmm1 1320; AVX512BW-NEXT: vpsrlw $8, %xmm1, %xmm1 1321; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7] 1322; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1323; AVX512BW-NEXT: vpsllvw %zmm2, %zmm0, %zmm0 1324; AVX512BW-NEXT: vpsrlw $8, %xmm0, %xmm0 1325; AVX512BW-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 1326; AVX512BW-NEXT: vzeroupper 1327; AVX512BW-NEXT: retq 1328; 1329; AVX512VLBW-LABEL: constant_rotate_v16i8: 1330; AVX512VLBW: # %bb.0: 1331; AVX512VLBW-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 1332; AVX512VLBW-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 1333; AVX512VLBW-NEXT: vpsrlw $8, %xmm1, %xmm1 1334; AVX512VLBW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1335; AVX512VLBW-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1336; AVX512VLBW-NEXT: vpsrlw $8, %xmm0, %xmm0 1337; AVX512VLBW-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 1338; AVX512VLBW-NEXT: retq 1339; 1340; AVX512VBMI2-LABEL: constant_rotate_v16i8: 1341; AVX512VBMI2: # %bb.0: 1342; AVX512VBMI2-NEXT: vpmovsxbw {{.*#+}} xmm1 = [0,7,6,5,4,3,2,1] 1343; AVX512VBMI2-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 1344; AVX512VBMI2-NEXT: vpsllvw %zmm1, %zmm2, %zmm1 1345; AVX512VBMI2-NEXT: vpsrlw $8, %xmm1, %xmm1 1346; AVX512VBMI2-NEXT: vpmovsxbw {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7] 1347; AVX512VBMI2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1348; AVX512VBMI2-NEXT: vpsllvw %zmm2, %zmm0, %zmm0 1349; AVX512VBMI2-NEXT: vpsrlw $8, %xmm0, %xmm0 1350; AVX512VBMI2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 1351; AVX512VBMI2-NEXT: vzeroupper 1352; AVX512VBMI2-NEXT: retq 1353; 1354; AVX512VLVBMI2-LABEL: constant_rotate_v16i8: 1355; AVX512VLVBMI2: # %bb.0: 1356; AVX512VLVBMI2-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 1357; AVX512VLVBMI2-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 1358; AVX512VLVBMI2-NEXT: vpsrlw $8, %xmm1, %xmm1 1359; AVX512VLVBMI2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1360; AVX512VLVBMI2-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1361; AVX512VLVBMI2-NEXT: vpsrlw $8, %xmm0, %xmm0 1362; AVX512VLVBMI2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 1363; AVX512VLVBMI2-NEXT: retq 1364; 1365; XOP-LABEL: constant_rotate_v16i8: 1366; XOP: # %bb.0: 1367; XOP-NEXT: vprotb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1368; XOP-NEXT: retq 1369; 1370; X86-SSE2-LABEL: constant_rotate_v16i8: 1371; X86-SSE2: # %bb.0: 1372; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 1373; X86-SSE2-NEXT: 
; X86-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 # [1,128,64,32,16,8,4,2]
; X86-SSE2-NEXT: psrlw $8, %xmm1
; X86-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; X86-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [1,2,4,8,16,32,64,128]
; X86-SSE2-NEXT: psrlw $8, %xmm0
; X86-SSE2-NEXT: packuswb %xmm1, %xmm0
; X86-SSE2-NEXT: retl
  %shl = shl <16 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1>
  %lshr = lshr <16 x i8> %a, <i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>
  %or = or <16 x i8> %shl, %lshr
  ret <16 x i8> %or
}

;
; Uniform Constant Rotates
;

define <2 x i64> @splatconstant_rotate_v2i64(<2 x i64> %a) nounwind {
; SSE-LABEL: splatconstant_rotate_v2i64:
; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm0, %xmm1
; SSE-NEXT: psrlq $50, %xmm1
; SSE-NEXT: psllq $14, %xmm0
; SSE-NEXT: por %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: splatconstant_rotate_v2i64:
; AVX: # %bb.0:
; AVX-NEXT: vpsrlq $50, %xmm0, %xmm1
; AVX-NEXT: vpsllq $14, %xmm0, %xmm0
; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
;
; AVX512NOVLX-LABEL: splatconstant_rotate_v2i64:
; AVX512NOVLX: # %bb.0:
; AVX512NOVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512NOVLX-NEXT: vprolq $14, %zmm0, %zmm0
; AVX512NOVLX-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512NOVLX-NEXT: vzeroupper
; AVX512NOVLX-NEXT: retq
;
; AVX512VLX-LABEL: splatconstant_rotate_v2i64:
; AVX512VLX: # %bb.0:
; AVX512VLX-NEXT: vprolq $14, %xmm0, %xmm0
; AVX512VLX-NEXT: retq
;
; XOP-LABEL: splatconstant_rotate_v2i64:
; XOP: # %bb.0:
; XOP-NEXT: vprotq $14, %xmm0, %xmm0
; XOP-NEXT: retq
;
; X86-SSE2-LABEL: splatconstant_rotate_v2i64:
; X86-SSE2: # %bb.0:
; X86-SSE2-NEXT: movdqa %xmm0, %xmm1
; X86-SSE2-NEXT: psrlq $50, %xmm1
; X86-SSE2-NEXT: psllq $14, %xmm0
; X86-SSE2-NEXT: por %xmm1, %xmm0
; X86-SSE2-NEXT: retl
  %shl = shl <2 x i64> %a, <i64 14, i64 14>
  %lshr = lshr <2 x i64> %a, <i64 50, i64 50>
  %or = or <2 x i64> %shl, %lshr
  ret <2 x i64> %or
}

define <4 x i32> @splatconstant_rotate_v4i32(<4 x i32> %a) nounwind {
; SSE-LABEL: splatconstant_rotate_v4i32:
; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm0, %xmm1
; SSE-NEXT: psrld $28, %xmm1
; SSE-NEXT: pslld $4, %xmm0
; SSE-NEXT: por %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: splatconstant_rotate_v4i32:
; AVX: # %bb.0:
; AVX-NEXT: vpsrld $28, %xmm0, %xmm1
; AVX-NEXT: vpslld $4, %xmm0, %xmm0
; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
;
; AVX512NOVLX-LABEL: splatconstant_rotate_v4i32:
; AVX512NOVLX: # %bb.0:
; AVX512NOVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512NOVLX-NEXT: vprold $4, %zmm0, %zmm0
; AVX512NOVLX-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512NOVLX-NEXT: vzeroupper
; AVX512NOVLX-NEXT: retq
;
; AVX512VLX-LABEL: splatconstant_rotate_v4i32:
; AVX512VLX: # %bb.0:
; AVX512VLX-NEXT: vprold $4, %xmm0, %xmm0
; AVX512VLX-NEXT: retq
;
; XOP-LABEL: splatconstant_rotate_v4i32:
; XOP: # %bb.0:
; XOP-NEXT: vprotd $4, %xmm0, %xmm0
; XOP-NEXT: retq
;
; X86-SSE2-LABEL: splatconstant_rotate_v4i32:
; X86-SSE2: # %bb.0:
; X86-SSE2-NEXT: movdqa %xmm0, %xmm1
; X86-SSE2-NEXT: psrld $28, %xmm1
; X86-SSE2-NEXT: pslld $4, %xmm0
; X86-SSE2-NEXT: por %xmm1, %xmm0
; X86-SSE2-NEXT: retl
  %shl = shl <4 x i32> %a, <i32 4, i32 4, i32 4, i32 4>
  %lshr = lshr <4 x i32> %a, <i32 28, i32 28, i32 28, i32 28>
  %or = or <4 x i32> %shl, %lshr
  ret <4 x i32> %or
}

define <8 x i16> @splatconstant_rotate_v8i16(<8 x i16> %a) nounwind {
; SSE-LABEL: splatconstant_rotate_v8i16:
; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm0, %xmm1
; SSE-NEXT: psrlw $9, %xmm1
; SSE-NEXT: psllw $7, %xmm0
; SSE-NEXT: por %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: splatconstant_rotate_v8i16:
; AVX: # %bb.0:
; AVX-NEXT: vpsrlw $9, %xmm0, %xmm1
; AVX-NEXT: vpsllw $7, %xmm0, %xmm0
; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
;
; AVX512F-LABEL: splatconstant_rotate_v8i16:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpsrlw $9, %xmm0, %xmm1
; AVX512F-NEXT: vpsllw $7, %xmm0, %xmm0
; AVX512F-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: splatconstant_rotate_v8i16:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpsrlw $9, %xmm0, %xmm1
; AVX512VL-NEXT: vpsllw $7, %xmm0, %xmm0
; AVX512VL-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: splatconstant_rotate_v8i16:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpsrlw $9, %xmm0, %xmm1
; AVX512BW-NEXT: vpsllw $7, %xmm0, %xmm0
; AVX512BW-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT: retq
;
; AVX512VLBW-LABEL: splatconstant_rotate_v8i16:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpsrlw $9, %xmm0, %xmm1
; AVX512VLBW-NEXT: vpsllw $7, %xmm0, %xmm0
; AVX512VLBW-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX512VLBW-NEXT: retq
;
; AVX512VBMI2-LABEL: splatconstant_rotate_v8i16:
; AVX512VBMI2: # %bb.0:
; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512VBMI2-NEXT: vpshldw $7, %zmm0, %zmm0, %zmm0
; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512VBMI2-NEXT: vzeroupper
; AVX512VBMI2-NEXT: retq
;
; AVX512VLVBMI2-LABEL: splatconstant_rotate_v8i16:
; AVX512VLVBMI2: # %bb.0:
; AVX512VLVBMI2-NEXT: vpshldw $7, %xmm0, %xmm0, %xmm0
; AVX512VLVBMI2-NEXT: retq
;
; XOP-LABEL: splatconstant_rotate_v8i16:
; XOP: # %bb.0:
; XOP-NEXT: vprotw $7, %xmm0, %xmm0
; XOP-NEXT: retq
;
; X86-SSE2-LABEL: splatconstant_rotate_v8i16:
; X86-SSE2: # %bb.0:
; X86-SSE2-NEXT: movdqa %xmm0, %xmm1
; X86-SSE2-NEXT: psrlw $9, %xmm1
; X86-SSE2-NEXT: psllw $7, %xmm0
; X86-SSE2-NEXT: por %xmm1, %xmm0
; X86-SSE2-NEXT: retl
  %shl = shl <8 x i16> %a, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
  %lshr = lshr <8 x i16> %a, <i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9>
  %or = or <8 x i16> %shl, %lshr
  ret <8 x i16> %or
}

define <16 x i8> @splatconstant_rotate_v16i8(<16 x i8> %a) nounwind {
; SSE-LABEL: splatconstant_rotate_v16i8:
; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm0, %xmm1
; SSE-NEXT: psrlw $4, %xmm1
; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE-NEXT: psllw $4, %xmm0
; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT: por %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: splatconstant_rotate_v16i8:
; AVX: # %bb.0:
; AVX-NEXT: vpsrlw $4, %xmm0, %xmm1
; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX-NEXT: vpsllw $4, %xmm0, %xmm0
; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
;
; AVX512NOVLX-LABEL: splatconstant_rotate_v16i8:
; AVX512NOVLX: # %bb.0:
; AVX512NOVLX-NEXT: vpsllw $4, %xmm0, %xmm1
; AVX512NOVLX-NEXT: vpsrlw $4, %xmm0, %xmm0
; AVX512NOVLX-NEXT: vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm1))
; AVX512NOVLX-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512NOVLX-NEXT: vzeroupper
; AVX512NOVLX-NEXT: retq
;
; AVX512VLX-LABEL: splatconstant_rotate_v16i8:
; AVX512VLX: # %bb.0:
; AVX512VLX-NEXT: vpsllw $4, %xmm0, %xmm1
; AVX512VLX-NEXT: vpsrlw $4, %xmm0, %xmm0
; AVX512VLX-NEXT: vpternlogd {{.*#+}} xmm0 = xmm0 ^ (mem & (xmm0 ^ xmm1))
; AVX512VLX-NEXT: retq
;
; XOP-LABEL: splatconstant_rotate_v16i8:
; XOP: # %bb.0:
; XOP-NEXT: vprotb $4, %xmm0, %xmm0
; XOP-NEXT: retq
;
; X86-SSE2-LABEL: splatconstant_rotate_v16i8:
; X86-SSE2: # %bb.0:
; X86-SSE2-NEXT: movdqa %xmm0, %xmm1
; X86-SSE2-NEXT: psrlw $4, %xmm1
; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
; X86-SSE2-NEXT: psllw $4, %xmm0
; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-SSE2-NEXT: por %xmm1, %xmm0
; X86-SSE2-NEXT: retl
  %shl = shl <16 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>
  %lshr = lshr <16 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>
  %or = or <16 x i8> %shl, %lshr
  ret <16 x i8> %or
}

;
; Masked Uniform Constant Rotates
;

define <2 x i64> @splatconstant_rotate_mask_v2i64(<2 x i64> %a) nounwind {
; SSE-LABEL: splatconstant_rotate_mask_v2i64:
; SSE: # %bb.0:
; SSE-NEXT: psrlq $49, %xmm0
; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: splatconstant_rotate_mask_v2i64:
; AVX: # %bb.0:
; AVX-NEXT: vpsrlq $49, %xmm0, %xmm0
; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT: retq
;
; AVX512-LABEL: splatconstant_rotate_mask_v2i64:
; AVX512: # %bb.0:
; AVX512-NEXT: vpsrlq $49, %xmm0, %xmm0
; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: retq
;
; XOP-LABEL: splatconstant_rotate_mask_v2i64:
; XOP: # %bb.0:
; XOP-NEXT: vpsrlq $49, %xmm0, %xmm0
; XOP-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; XOP-NEXT: retq
;
; X86-SSE2-LABEL: splatconstant_rotate_mask_v2i64:
; X86-SSE2: # %bb.0:
; X86-SSE2-NEXT: psrlq $49, %xmm0
; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-SSE2-NEXT: retl
  %shl = shl <2 x i64> %a, <i64 15, i64 15>
  %lshr = lshr <2 x i64> %a, <i64 49, i64 49>
  %rmask = and <2 x i64> %lshr, <i64 255, i64 127>
  %lmask = and <2 x i64> %shl, <i64 65, i64 33>
  %or = or <2 x i64> %lmask, %rmask
  ret <2 x i64> %or
}

define <4 x i32> @splatconstant_rotate_mask_v4i32(<4 x i32> %a) nounwind {
; SSE-LABEL: splatconstant_rotate_mask_v4i32:
; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm0, %xmm1
; SSE-NEXT: psrld $28, %xmm1
; SSE-NEXT: pslld $4, %xmm0
; SSE-NEXT: por %xmm1, %xmm0
; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: splatconstant_rotate_mask_v4i32:
; AVX: # %bb.0:
; AVX-NEXT: vpsrld $28, %xmm0, %xmm1
; AVX-NEXT: vpslld $4, %xmm0, %xmm0
; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT: retq
;
; AVX512NOVLX-LABEL: splatconstant_rotate_mask_v4i32:
; AVX512NOVLX: # %bb.0:
; AVX512NOVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512NOVLX-NEXT: vprold $4, %zmm0, %zmm0
; AVX512NOVLX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512NOVLX-NEXT: vzeroupper
; AVX512NOVLX-NEXT: retq
;
; AVX512VLX-LABEL: splatconstant_rotate_mask_v4i32:
; AVX512VLX: # %bb.0:
; AVX512VLX-NEXT: vprold $4, %xmm0, %xmm0
; AVX512VLX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512VLX-NEXT: retq
;
; XOP-LABEL: splatconstant_rotate_mask_v4i32:
; XOP: # %bb.0:
; XOP-NEXT: vprotd $4, %xmm0, %xmm0
; XOP-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; XOP-NEXT: retq
;
; X86-SSE2-LABEL: splatconstant_rotate_mask_v4i32:
; X86-SSE2: # %bb.0:
; X86-SSE2-NEXT: movdqa %xmm0, %xmm1
; X86-SSE2-NEXT: psrld $28, %xmm1
; X86-SSE2-NEXT: pslld $4, %xmm0
; X86-SSE2-NEXT: por %xmm1, %xmm0
; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-SSE2-NEXT: retl
  %shl = shl <4 x i32> %a, <i32 4, i32 4, i32 4, i32 4>
  %lshr = lshr <4 x i32> %a, <i32 28, i32 28, i32 28, i32 28>
  %rmask = and <4 x i32> %lshr, <i32 127, i32 255, i32 511, i32 1023>
  %lmask = and <4 x i32> %shl, <i32 1023, i32 511, i32 255, i32 127>
  %or = or <4 x i32> %lmask, %rmask
  ret <4 x i32> %or
}

define <8 x i16> @splatconstant_rotate_mask_v8i16(<8 x i16> %a) nounwind {
; SSE-LABEL: splatconstant_rotate_mask_v8i16:
; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm0, %xmm1
; SSE-NEXT: psrlw $11, %xmm1
; SSE-NEXT: psllw $5, %xmm0
; SSE-NEXT: por %xmm1, %xmm0
; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: splatconstant_rotate_mask_v8i16:
; AVX: # %bb.0:
; AVX-NEXT: vpsrlw $11, %xmm0, %xmm1
; AVX-NEXT: vpsllw $5, %xmm0, %xmm0
; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT: retq
;
; AVX512F-LABEL: splatconstant_rotate_mask_v8i16:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpsrlw $11, %xmm0, %xmm1
; AVX512F-NEXT: vpsllw $5, %xmm0, %xmm0
; AVX512F-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: splatconstant_rotate_mask_v8i16:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpsllw $5, %xmm0, %xmm1
; AVX512VL-NEXT: vpsrlw $11, %xmm0, %xmm0
; AVX512VL-NEXT: vpternlogd {{.*#+}} xmm0 = mem & (xmm0 | xmm1)
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: splatconstant_rotate_mask_v8i16:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpsrlw $11, %xmm0, %xmm1
; AVX512BW-NEXT: vpsllw $5, %xmm0, %xmm0
; AVX512BW-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512BW-NEXT: retq
;
; AVX512VLBW-LABEL: splatconstant_rotate_mask_v8i16:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpsllw $5, %xmm0, %xmm1
; AVX512VLBW-NEXT: vpsrlw $11, %xmm0, %xmm0
; AVX512VLBW-NEXT: vpternlogd {{.*#+}} xmm0 = mem & (xmm0 | xmm1)
; AVX512VLBW-NEXT: retq
;
; AVX512VBMI2-LABEL: splatconstant_rotate_mask_v8i16:
; AVX512VBMI2: # %bb.0:
; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512VBMI2-NEXT: vpshldw $5, %zmm0, %zmm0, %zmm0
; AVX512VBMI2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512VBMI2-NEXT: vzeroupper
; AVX512VBMI2-NEXT: retq
;
; AVX512VLVBMI2-LABEL: splatconstant_rotate_mask_v8i16:
; AVX512VLVBMI2: # %bb.0:
; AVX512VLVBMI2-NEXT: vpshldw $5, %xmm0, %xmm0, %xmm0
; AVX512VLVBMI2-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
; AVX512VLVBMI2-NEXT: retq
;
; XOP-LABEL: splatconstant_rotate_mask_v8i16:
; XOP: # %bb.0:
; XOP-NEXT: vprotw $5, %xmm0, %xmm0
; XOP-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; XOP-NEXT: retq
;
; X86-SSE2-LABEL: splatconstant_rotate_mask_v8i16:
; X86-SSE2: # %bb.0:
; X86-SSE2-NEXT: movdqa %xmm0, %xmm1
; X86-SSE2-NEXT: psrlw $11, %xmm1
; X86-SSE2-NEXT: psllw $5, %xmm0
; X86-SSE2-NEXT: por %xmm1, %xmm0
; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-SSE2-NEXT: retl
  %shl = shl <8 x i16> %a, <i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5>
  %lshr = lshr <8 x i16> %a, <i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11>
  %rmask = and <8 x i16> %lshr, <i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55>
  %lmask = and <8 x i16> %shl, <i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33>
  %or = or <8 x i16> %lmask, %rmask
  ret <8 x i16> %or
}

define <16 x i8> @splatconstant_rotate_mask_v16i8(<16 x i8> %a) nounwind {
; SSE-LABEL: splatconstant_rotate_mask_v16i8:
; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm0, %xmm1
; SSE-NEXT: psrlw $4, %xmm1
; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE-NEXT: psllw $4, %xmm0
; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT: por %xmm1, %xmm0
; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: splatconstant_rotate_mask_v16i8:
; AVX: # %bb.0:
; AVX-NEXT: vpsrlw $4, %xmm0, %xmm1
; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX-NEXT: vpsllw $4, %xmm0, %xmm0
; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT: retq
;
; AVX512NOVLX-LABEL: splatconstant_rotate_mask_v16i8:
; AVX512NOVLX: # %bb.0:
; AVX512NOVLX-NEXT: vpsllw $4, %xmm0, %xmm1
; AVX512NOVLX-NEXT: vpsrlw $4, %xmm0, %xmm0
; AVX512NOVLX-NEXT: vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm1))
; AVX512NOVLX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512NOVLX-NEXT: vzeroupper
; AVX512NOVLX-NEXT: retq
;
; AVX512VLX-LABEL: splatconstant_rotate_mask_v16i8:
; AVX512VLX: # %bb.0:
; AVX512VLX-NEXT: vpsllw $4, %xmm0, %xmm1
; AVX512VLX-NEXT: vpsrlw $4, %xmm0, %xmm0
; AVX512VLX-NEXT: vpternlogd {{.*#+}} xmm0 = xmm0 ^ (mem & (xmm0 ^ xmm1))
; AVX512VLX-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
; AVX512VLX-NEXT: retq
;
; XOP-LABEL: splatconstant_rotate_mask_v16i8:
; XOP: # %bb.0:
; XOP-NEXT: vprotb $4, %xmm0, %xmm0
; XOP-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; XOP-NEXT: retq
;
; X86-SSE2-LABEL: splatconstant_rotate_mask_v16i8:
; X86-SSE2: # %bb.0:
; X86-SSE2-NEXT: movdqa %xmm0, %xmm1
; X86-SSE2-NEXT: psrlw $4, %xmm1
; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
; X86-SSE2-NEXT: psllw $4, %xmm0
; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-SSE2-NEXT: por %xmm1, %xmm0
; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-SSE2-NEXT: retl
  %shl = shl <16 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>
  %lshr = lshr <16 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>
  %rmask = and <16 x i8> %lshr, <i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55>
  %lmask = and <16 x i8> %shl, <i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33>
  %or = or <16 x i8> %lmask, %rmask
  ret <16 x i8> %or
}

define <4 x i32> @rot16_demandedbits(<4 x i32> %x, <4 x i32> %y) nounwind {
; X86-LABEL: rot16_demandedbits:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl %eax, %ecx
; X86-NEXT: shrl $11, %ecx
; X86-NEXT: shll $5, %eax
; X86-NEXT: orl %ecx, %eax
; X86-NEXT: andl $65536, %eax # imm = 0x10000
; X86-NEXT: retl
;
; X64-LABEL: rot16_demandedbits:
; X64: # %bb.0:
; X64-NEXT: movl %edi, %eax
; X64-NEXT: movl %edi, %ecx
; X64-NEXT: shrl $11, %ecx
; X64-NEXT: shll $5, %eax
; X64-NEXT: orl %ecx, %eax
; X64-NEXT: andl $65536, %eax # imm = 0x10000
; X64-NEXT: retq
; SSE2-LABEL: rot16_demandedbits:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrld $11, %xmm1
; SSE2-NEXT: pslld $11, %xmm0
; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: rot16_demandedbits:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm1
; SSE41-NEXT: psrld $11, %xmm1
; SSE41-NEXT: pslld $11, %xmm0
; SSE41-NEXT: por %xmm1, %xmm0
; SSE41-NEXT: pxor %xmm1, %xmm1
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
; SSE41-NEXT: retq
;
; AVX-LABEL: rot16_demandedbits:
; AVX: # %bb.0:
; AVX-NEXT: vpsrld $11, %xmm0, %xmm1
; AVX-NEXT: vpslld $11, %xmm0, %xmm0
; AVX-NEXT: vpor %xmm0, %xmm1, %xmm0
; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
; AVX-NEXT: retq
;
; AVX512-LABEL: rot16_demandedbits:
; AVX512: # %bb.0:
; AVX512-NEXT: vpsrld $11, %xmm0, %xmm1
; AVX512-NEXT: vpslld $11, %xmm0, %xmm0
; AVX512-NEXT: vpor %xmm0, %xmm1, %xmm0
; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
; AVX512-NEXT: retq
;
; XOP-LABEL: rot16_demandedbits:
; XOP: # %bb.0:
; XOP-NEXT: vpsrld $11, %xmm0, %xmm1
; XOP-NEXT: vpslld $11, %xmm0, %xmm0
; XOP-NEXT: vpor %xmm0, %xmm1, %xmm0
; XOP-NEXT: vpxor %xmm1, %xmm1, %xmm1
; XOP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
; XOP-NEXT: retq
;
; X86-SSE2-LABEL: rot16_demandedbits:
; X86-SSE2: # %bb.0:
; X86-SSE2-NEXT: movdqa %xmm0, %xmm1
; X86-SSE2-NEXT: psrld $11, %xmm1
; X86-SSE2-NEXT: pslld $11, %xmm0
; X86-SSE2-NEXT: por %xmm1, %xmm0
; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-SSE2-NEXT: retl
  %t0 = lshr <4 x i32> %x, <i32 11, i32 11, i32 11, i32 11>
  %t1 = shl <4 x i32> %x, <i32 11, i32 11, i32 11, i32 11>
  %t2 = or <4 x i32> %t0, %t1
  %t3 = and <4 x i32> %t2, <i32 65535, i32 65535, i32 65535, i32 65535>
  ret <4 x i32> %t3
}

define <4 x i16> @rot16_trunc(<4 x i32> %x, <4 x i32> %y) nounwind {
; SSE2-LABEL: rot16_trunc:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrld $11, %xmm1
; SSE2-NEXT: pslld $5, %xmm0
; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT: retq
;
; SSE41-LABEL: rot16_trunc:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm1
; SSE41-NEXT: psrld $11, %xmm1
; SSE41-NEXT: pslld $5, %xmm0
; SSE41-NEXT: por %xmm1, %xmm0
; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; SSE41-NEXT: retq
;
; AVX-LABEL: rot16_trunc:
; AVX: # %bb.0:
; AVX-NEXT: vpsrld $11, %xmm0, %xmm1
; AVX-NEXT: vpslld $5, %xmm0, %xmm0
; AVX-NEXT: vpor %xmm0, %xmm1, %xmm0
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX-NEXT: retq
;
; AVX512NOVLX-LABEL: rot16_trunc:
; AVX512NOVLX: # %bb.0:
; AVX512NOVLX-NEXT: vpsrld $11, %xmm0, %xmm1
; AVX512NOVLX-NEXT: vpslld $5, %xmm0, %xmm0
; AVX512NOVLX-NEXT: vpor %xmm0, %xmm1, %xmm0
; AVX512NOVLX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX512NOVLX-NEXT: retq
;
; AVX512VLX-LABEL: rot16_trunc:
; AVX512VLX: # %bb.0:
; AVX512VLX-NEXT: vpsrld $11, %xmm0, %xmm1
; AVX512VLX-NEXT: vpslld $5, %xmm0, %xmm0
; AVX512VLX-NEXT: vpor %xmm0, %xmm1, %xmm0
; AVX512VLX-NEXT: vpmovdw %xmm0, %xmm0
; AVX512VLX-NEXT: retq
;
; XOP-LABEL: rot16_trunc:
; XOP: # %bb.0:
; XOP-NEXT: vpsrld $11, %xmm0, %xmm1
; XOP-NEXT: vpslld $5, %xmm0, %xmm0
; XOP-NEXT: vpor %xmm0, %xmm1, %xmm0
; XOP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; XOP-NEXT: retq
;
; X86-SSE2-LABEL: rot16_trunc:
; X86-SSE2: # %bb.0:
; X86-SSE2-NEXT: movdqa %xmm0, %xmm1
; X86-SSE2-NEXT: psrld $11, %xmm1
; X86-SSE2-NEXT: pslld $5, %xmm0
; X86-SSE2-NEXT: por %xmm1, %xmm0
; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; X86-SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X86-SSE2-NEXT: retl
  %t0 = lshr <4 x i32> %x, <i32 11, i32 11, i32 11, i32 11>
  %t1 = shl <4 x i32> %x, <i32 5, i32 5, i32 5, i32 5>
  %t2 = or <4 x i32> %t0, %t1
  %t3 = trunc <4 x i32> %t2 to <4 x i16>
  ret <4 x i16> %t3
}