1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX1 3; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX2 4; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512,AVX512NOVLX,AVX512F 5; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VLX,AVX512VL 6; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512NOVLX,AVX512BW 7; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VLX,AVX512VLBW 8; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vbmi,+avx512vbmi2 | FileCheck %s --check-prefixes=AVX512,AVX512NOVLX,AVX512VBMI2 9; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vbmi,+avx512vbmi2,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VLX,AVX512VLVBMI2 10; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefix=XOPAVX1 11; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefix=XOPAVX2 12 13; 14; Variable Rotates 15; 16 17define <4 x i64> @var_rotate_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind { 18; AVX1-LABEL: var_rotate_v4i64: 19; AVX1: # %bb.0: 20; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm2 = [64,64] 21; AVX1-NEXT: vpsubq %xmm1, %xmm2, %xmm3 22; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 23; AVX1-NEXT: vpsubq %xmm4, %xmm2, %xmm2 24; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 25; AVX1-NEXT: vpsllq %xmm4, %xmm5, %xmm6 26; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,3,2,3] 27; AVX1-NEXT: vpsllq %xmm4, %xmm5, %xmm4 28; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1,2,3],xmm4[4,5,6,7] 29; AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm6 30; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] 31; AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm1 32; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm6[0,1,2,3],xmm1[4,5,6,7] 33; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1 34; AVX1-NEXT: vpsrlq %xmm2, %xmm5, %xmm4 35; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] 36; AVX1-NEXT: vpsrlq %xmm2, %xmm5, %xmm2 37; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1,2,3],xmm2[4,5,6,7] 38; AVX1-NEXT: vpsrlq %xmm3, %xmm0, %xmm4 39; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,2,3] 40; AVX1-NEXT: vpsrlq %xmm3, %xmm0, %xmm0 41; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0,1,2,3],xmm0[4,5,6,7] 42; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 43; AVX1-NEXT: vorps %ymm0, %ymm1, %ymm0 44; AVX1-NEXT: retq 45; 46; AVX2-LABEL: var_rotate_v4i64: 47; AVX2: # %bb.0: 48; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [64,64,64,64] 49; AVX2-NEXT: vpsubq %ymm1, %ymm2, %ymm2 50; AVX2-NEXT: vpsllvq %ymm1, %ymm0, %ymm1 51; AVX2-NEXT: vpsrlvq %ymm2, %ymm0, %ymm0 52; AVX2-NEXT: vpor %ymm0, %ymm1, %ymm0 53; AVX2-NEXT: retq 54; 55; AVX512NOVLX-LABEL: var_rotate_v4i64: 56; AVX512NOVLX: # %bb.0: 57; AVX512NOVLX-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 58; AVX512NOVLX-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 59; AVX512NOVLX-NEXT: vprolvq %zmm1, %zmm0, %zmm0 60; AVX512NOVLX-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 61; AVX512NOVLX-NEXT: retq 62; 63; AVX512VLX-LABEL: var_rotate_v4i64: 64; AVX512VLX: # %bb.0: 65; AVX512VLX-NEXT: vprolvq %ymm1, %ymm0, %ymm0 66; AVX512VLX-NEXT: retq 67; 68; XOPAVX1-LABEL: var_rotate_v4i64: 69; XOPAVX1: # %bb.0: 
70; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 71; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 72; XOPAVX1-NEXT: vprotq %xmm2, %xmm3, %xmm2 73; XOPAVX1-NEXT: vprotq %xmm1, %xmm0, %xmm0 74; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 75; XOPAVX1-NEXT: retq 76; 77; XOPAVX2-LABEL: var_rotate_v4i64: 78; XOPAVX2: # %bb.0: 79; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 80; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm3 81; XOPAVX2-NEXT: vprotq %xmm2, %xmm3, %xmm2 82; XOPAVX2-NEXT: vprotq %xmm1, %xmm0, %xmm0 83; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 84; XOPAVX2-NEXT: retq 85 %b64 = sub <4 x i64> <i64 64, i64 64, i64 64, i64 64>, %b 86 %shl = shl <4 x i64> %a, %b 87 %lshr = lshr <4 x i64> %a, %b64 88 %or = or <4 x i64> %shl, %lshr 89 ret <4 x i64> %or 90} 91 92define <8 x i32> @var_rotate_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind { 93; AVX1-LABEL: var_rotate_v8i32: 94; AVX1: # %bb.0: 95; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 96; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [31,31,31,31] 97; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 98; AVX1-NEXT: vpslld $23, %xmm2, %xmm2 99; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216] 100; AVX1-NEXT: vpaddd %xmm4, %xmm2, %xmm2 101; AVX1-NEXT: vcvttps2dq %xmm2, %xmm2 102; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[1,1,3,3] 103; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6 104; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[1,1,3,3] 105; AVX1-NEXT: vpmuludq %xmm5, %xmm7, %xmm5 106; AVX1-NEXT: vpmuludq %xmm2, %xmm6, %xmm2 107; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm2[1,1,3,3] 108; AVX1-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm5[2,3],xmm6[4,5],xmm5[6,7] 109; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,0,2,2] 110; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm5[2,3],xmm2[4,5],xmm5[6,7] 111; AVX1-NEXT: vpor %xmm6, %xmm2, %xmm2 112; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 113; AVX1-NEXT: vpslld $23, %xmm1, %xmm1 114; AVX1-NEXT: vpaddd %xmm4, %xmm1, %xmm1 115; AVX1-NEXT: vcvttps2dq %xmm1, %xmm1 116; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[1,1,3,3] 117; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] 118; AVX1-NEXT: vpmuludq %xmm3, %xmm4, %xmm3 119; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 120; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] 121; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] 122; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,0,2,2] 123; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5],xmm3[6,7] 124; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 125; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 126; AVX1-NEXT: retq 127; 128; AVX2-LABEL: var_rotate_v8i32: 129; AVX2: # %bb.0: 130; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [31,31,31,31,31,31,31,31] 131; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1 132; AVX2-NEXT: vpsllvd %ymm1, %ymm0, %ymm2 133; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm3 = [32,32,32,32,32,32,32,32] 134; AVX2-NEXT: vpsubd %ymm1, %ymm3, %ymm1 135; AVX2-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0 136; AVX2-NEXT: vpor %ymm0, %ymm2, %ymm0 137; AVX2-NEXT: retq 138; 139; AVX512NOVLX-LABEL: var_rotate_v8i32: 140; AVX512NOVLX: # %bb.0: 141; AVX512NOVLX-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 142; AVX512NOVLX-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 143; AVX512NOVLX-NEXT: vprolvd %zmm1, %zmm0, %zmm0 144; AVX512NOVLX-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 145; AVX512NOVLX-NEXT: retq 146; 147; AVX512VLX-LABEL: var_rotate_v8i32: 148; AVX512VLX: # %bb.0: 149; AVX512VLX-NEXT: vprolvd %ymm1, %ymm0, %ymm0 150; AVX512VLX-NEXT: retq 151; 152; XOPAVX1-LABEL: var_rotate_v8i32: 153; XOPAVX1: 
# %bb.0: 154; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 155; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 156; XOPAVX1-NEXT: vprotd %xmm2, %xmm3, %xmm2 157; XOPAVX1-NEXT: vprotd %xmm1, %xmm0, %xmm0 158; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 159; XOPAVX1-NEXT: retq 160; 161; XOPAVX2-LABEL: var_rotate_v8i32: 162; XOPAVX2: # %bb.0: 163; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 164; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm3 165; XOPAVX2-NEXT: vprotd %xmm2, %xmm3, %xmm2 166; XOPAVX2-NEXT: vprotd %xmm1, %xmm0, %xmm0 167; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 168; XOPAVX2-NEXT: retq 169 %b32 = sub <8 x i32> <i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32>, %b 170 %shl = shl <8 x i32> %a, %b 171 %lshr = lshr <8 x i32> %a, %b32 172 %or = or <8 x i32> %shl, %lshr 173 ret <8 x i32> %or 174} 175 176define <16 x i16> @var_rotate_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind { 177; AVX1-LABEL: var_rotate_v16i16: 178; AVX1: # %bb.0: 179; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 180; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15] 181; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 182; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm2[4,4,5,5,6,6,7,7] 183; AVX1-NEXT: vpslld $23, %xmm4, %xmm4 184; AVX1-NEXT: vbroadcastss {{.*#+}} xmm5 = [1065353216,1065353216,1065353216,1065353216] 185; AVX1-NEXT: vpaddd %xmm5, %xmm4, %xmm4 186; AVX1-NEXT: vcvttps2dq %xmm4, %xmm4 187; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero 188; AVX1-NEXT: vpslld $23, %xmm2, %xmm2 189; AVX1-NEXT: vpaddd %xmm5, %xmm2, %xmm2 190; AVX1-NEXT: vcvttps2dq %xmm2, %xmm2 191; AVX1-NEXT: vpackusdw %xmm4, %xmm2, %xmm2 192; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 193; AVX1-NEXT: vpmulhuw %xmm2, %xmm4, %xmm6 194; AVX1-NEXT: vpmullw %xmm2, %xmm4, %xmm2 195; AVX1-NEXT: vpor %xmm6, %xmm2, %xmm2 196; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 197; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm1[4,4,5,5,6,6,7,7] 198; AVX1-NEXT: vpslld $23, %xmm3, %xmm3 199; AVX1-NEXT: vpaddd %xmm5, %xmm3, %xmm3 200; AVX1-NEXT: vcvttps2dq %xmm3, %xmm3 201; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero 202; AVX1-NEXT: vpslld $23, %xmm1, %xmm1 203; AVX1-NEXT: vpaddd %xmm5, %xmm1, %xmm1 204; AVX1-NEXT: vcvttps2dq %xmm1, %xmm1 205; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 206; AVX1-NEXT: vpmulhuw %xmm1, %xmm0, %xmm3 207; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0 208; AVX1-NEXT: vpor %xmm3, %xmm0, %xmm0 209; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 210; AVX1-NEXT: retq 211; 212; AVX2-LABEL: var_rotate_v16i16: 213; AVX2: # %bb.0: 214; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 215; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 216; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15] 217; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm0[4,4,5,5,6,6,7,7,12,12,13,13,14,14,15,15] 218; AVX2-NEXT: vpsllvd %ymm3, %ymm4, %ymm3 219; AVX2-NEXT: vpsrld $16, %ymm3, %ymm3 220; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11] 221; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] 222; AVX2-NEXT: vpsllvd %ymm1, %ymm0, %ymm0 223; AVX2-NEXT: vpsrld $16, %ymm0, %ymm0 224; AVX2-NEXT: vpackusdw %ymm3, %ymm0, %ymm0 225; AVX2-NEXT: retq 226; 227; AVX512F-LABEL: var_rotate_v16i16: 228; AVX512F: # 
%bb.0: 229; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 230; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 231; AVX512F-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15] 232; AVX512F-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm0[4,4,5,5,6,6,7,7,12,12,13,13,14,14,15,15] 233; AVX512F-NEXT: vpsllvd %ymm3, %ymm4, %ymm3 234; AVX512F-NEXT: vpsrld $16, %ymm3, %ymm3 235; AVX512F-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11] 236; AVX512F-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] 237; AVX512F-NEXT: vpsllvd %ymm1, %ymm0, %ymm0 238; AVX512F-NEXT: vpsrld $16, %ymm0, %ymm0 239; AVX512F-NEXT: vpackusdw %ymm3, %ymm0, %ymm0 240; AVX512F-NEXT: retq 241; 242; AVX512VL-LABEL: var_rotate_v16i16: 243; AVX512VL: # %bb.0: 244; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm1 245; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2 246; AVX512VL-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15] 247; AVX512VL-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm0[4,4,5,5,6,6,7,7,12,12,13,13,14,14,15,15] 248; AVX512VL-NEXT: vpsllvd %ymm3, %ymm4, %ymm3 249; AVX512VL-NEXT: vpsrld $16, %ymm3, %ymm3 250; AVX512VL-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11] 251; AVX512VL-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] 252; AVX512VL-NEXT: vpsllvd %ymm1, %ymm0, %ymm0 253; AVX512VL-NEXT: vpsrld $16, %ymm0, %ymm0 254; AVX512VL-NEXT: vpackusdw %ymm3, %ymm0, %ymm0 255; AVX512VL-NEXT: retq 256; 257; AVX512BW-LABEL: var_rotate_v16i16: 258; AVX512BW: # %bb.0: 259; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 260; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 261; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm2 262; AVX512BW-NEXT: vpbroadcastw {{.*#+}} ymm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] 263; AVX512BW-NEXT: vpsubw %ymm1, %ymm3, %ymm1 264; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0 265; AVX512BW-NEXT: vpor %ymm0, %ymm2, %ymm0 266; AVX512BW-NEXT: retq 267; 268; AVX512VLBW-LABEL: var_rotate_v16i16: 269; AVX512VLBW: # %bb.0: 270; AVX512VLBW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm1 271; AVX512VLBW-NEXT: vpsllvw %ymm1, %ymm0, %ymm2 272; AVX512VLBW-NEXT: vpbroadcastw {{.*#+}} ymm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] 273; AVX512VLBW-NEXT: vpsubw %ymm1, %ymm3, %ymm1 274; AVX512VLBW-NEXT: vpsrlvw %ymm1, %ymm0, %ymm0 275; AVX512VLBW-NEXT: vpor %ymm0, %ymm2, %ymm0 276; AVX512VLBW-NEXT: retq 277; 278; AVX512VBMI2-LABEL: var_rotate_v16i16: 279; AVX512VBMI2: # %bb.0: 280; AVX512VBMI2-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 281; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 282; AVX512VBMI2-NEXT: vpshldvw %zmm1, %zmm0, %zmm0 283; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 284; AVX512VBMI2-NEXT: retq 285; 286; AVX512VLVBMI2-LABEL: var_rotate_v16i16: 287; AVX512VLVBMI2: # %bb.0: 288; AVX512VLVBMI2-NEXT: vpshldvw %ymm1, %ymm0, %ymm0 289; AVX512VLVBMI2-NEXT: retq 290; 291; XOPAVX1-LABEL: var_rotate_v16i16: 292; XOPAVX1: # %bb.0: 293; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 294; XOPAVX1-NEXT: 
vextractf128 $1, %ymm0, %xmm3 295; XOPAVX1-NEXT: vprotw %xmm2, %xmm3, %xmm2 296; XOPAVX1-NEXT: vprotw %xmm1, %xmm0, %xmm0 297; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 298; XOPAVX1-NEXT: retq 299; 300; XOPAVX2-LABEL: var_rotate_v16i16: 301; XOPAVX2: # %bb.0: 302; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 303; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm3 304; XOPAVX2-NEXT: vprotw %xmm2, %xmm3, %xmm2 305; XOPAVX2-NEXT: vprotw %xmm1, %xmm0, %xmm0 306; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 307; XOPAVX2-NEXT: retq 308 %b16 = sub <16 x i16> <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>, %b 309 %shl = shl <16 x i16> %a, %b 310 %lshr = lshr <16 x i16> %a, %b16 311 %or = or <16 x i16> %shl, %lshr 312 ret <16 x i16> %or 313} 314 315define <32 x i8> @var_rotate_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind { 316; AVX1-LABEL: var_rotate_v32i8: 317; AVX1: # %bb.0: 318; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 319; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm3 320; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] 321; AVX1-NEXT: vpandn %xmm3, %xmm4, %xmm3 322; AVX1-NEXT: vpsllw $4, %xmm2, %xmm5 323; AVX1-NEXT: vpand %xmm4, %xmm5, %xmm5 324; AVX1-NEXT: vpor %xmm3, %xmm5, %xmm3 325; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 326; AVX1-NEXT: vpsllw $5, %xmm5, %xmm5 327; AVX1-NEXT: vpblendvb %xmm5, %xmm3, %xmm2, %xmm2 328; AVX1-NEXT: vpsrlw $6, %xmm2, %xmm3 329; AVX1-NEXT: vbroadcastss {{.*#+}} xmm6 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] 330; AVX1-NEXT: vpandn %xmm3, %xmm6, %xmm3 331; AVX1-NEXT: vpsllw $2, %xmm2, %xmm7 332; AVX1-NEXT: vpand %xmm6, %xmm7, %xmm7 333; AVX1-NEXT: vpor %xmm3, %xmm7, %xmm3 334; AVX1-NEXT: vpaddb %xmm5, %xmm5, %xmm5 335; AVX1-NEXT: vpblendvb %xmm5, %xmm3, %xmm2, %xmm2 336; AVX1-NEXT: vpsrlw $7, %xmm2, %xmm3 337; AVX1-NEXT: vbroadcastss {{.*#+}} xmm7 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] 338; AVX1-NEXT: vpand %xmm7, %xmm3, %xmm3 339; AVX1-NEXT: vpaddb %xmm2, %xmm2, %xmm8 340; AVX1-NEXT: vpor %xmm3, %xmm8, %xmm3 341; AVX1-NEXT: vpaddb %xmm5, %xmm5, %xmm5 342; AVX1-NEXT: vpblendvb %xmm5, %xmm3, %xmm2, %xmm2 343; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm3 344; AVX1-NEXT: vpandn %xmm3, %xmm4, %xmm3 345; AVX1-NEXT: vpsllw $4, %xmm0, %xmm5 346; AVX1-NEXT: vpand %xmm4, %xmm5, %xmm4 347; AVX1-NEXT: vpor %xmm3, %xmm4, %xmm3 348; AVX1-NEXT: vpsllw $5, %xmm1, %xmm1 349; AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 350; AVX1-NEXT: vpsrlw $6, %xmm0, %xmm3 351; AVX1-NEXT: vpandn %xmm3, %xmm6, %xmm3 352; AVX1-NEXT: vpsllw $2, %xmm0, %xmm4 353; AVX1-NEXT: vpand %xmm6, %xmm4, %xmm4 354; AVX1-NEXT: vpor %xmm3, %xmm4, %xmm3 355; AVX1-NEXT: vpaddb %xmm1, %xmm1, %xmm1 356; AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 357; AVX1-NEXT: vpsrlw $7, %xmm0, %xmm3 358; AVX1-NEXT: vpand %xmm7, %xmm3, %xmm3 359; AVX1-NEXT: vpaddb %xmm0, %xmm0, %xmm4 360; AVX1-NEXT: vpor %xmm3, %xmm4, %xmm3 361; AVX1-NEXT: vpaddb %xmm1, %xmm1, %xmm1 362; AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 363; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 364; AVX1-NEXT: retq 365; 366; AVX2-LABEL: var_rotate_v32i8: 367; AVX2: # %bb.0: 368; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm2 369; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 370; AVX2-NEXT: vpsllw $4, %ymm0, %ymm3 371; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 372; AVX2-NEXT: vpor %ymm2, %ymm3, %ymm2 373; AVX2-NEXT: vpsllw $5, %ymm1, %ymm1 374; AVX2-NEXT: vpblendvb %ymm1, %ymm2, 
%ymm0, %ymm0 375; AVX2-NEXT: vpsrlw $6, %ymm0, %ymm2 376; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 377; AVX2-NEXT: vpsllw $2, %ymm0, %ymm3 378; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 379; AVX2-NEXT: vpor %ymm2, %ymm3, %ymm2 380; AVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm1 381; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 382; AVX2-NEXT: vpsrlw $7, %ymm0, %ymm2 383; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 384; AVX2-NEXT: vpaddb %ymm0, %ymm0, %ymm3 385; AVX2-NEXT: vpor %ymm2, %ymm3, %ymm2 386; AVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm1 387; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 388; AVX2-NEXT: retq 389; 390; AVX512F-LABEL: var_rotate_v32i8: 391; AVX512F: # %bb.0: 392; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm2 393; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm3 394; AVX512F-NEXT: vpternlogd {{.*#+}} zmm3 = zmm3 ^ (mem & (zmm3 ^ zmm2)) 395; AVX512F-NEXT: vpsllw $5, %ymm1, %ymm1 396; AVX512F-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 397; AVX512F-NEXT: vpsllw $2, %ymm0, %ymm2 398; AVX512F-NEXT: vpsrlw $6, %ymm0, %ymm3 399; AVX512F-NEXT: vpternlogd {{.*#+}} zmm3 = zmm3 ^ (mem & (zmm3 ^ zmm2)) 400; AVX512F-NEXT: vpaddb %ymm1, %ymm1, %ymm1 401; AVX512F-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 402; AVX512F-NEXT: vpsrlw $7, %ymm0, %ymm2 403; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 404; AVX512F-NEXT: vpaddb %ymm0, %ymm0, %ymm3 405; AVX512F-NEXT: vpor %ymm2, %ymm3, %ymm2 406; AVX512F-NEXT: vpaddb %ymm1, %ymm1, %ymm1 407; AVX512F-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 408; AVX512F-NEXT: retq 409; 410; AVX512VL-LABEL: var_rotate_v32i8: 411; AVX512VL: # %bb.0: 412; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm2 413; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm3 414; AVX512VL-NEXT: vpternlogd {{.*#+}} ymm3 = ymm3 ^ (mem & (ymm3 ^ ymm2)) 415; AVX512VL-NEXT: vpsllw $5, %ymm1, %ymm1 416; AVX512VL-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 417; AVX512VL-NEXT: vpsllw $2, %ymm0, %ymm2 418; AVX512VL-NEXT: vpsrlw $6, %ymm0, %ymm3 419; AVX512VL-NEXT: vpternlogd {{.*#+}} ymm3 = ymm3 ^ (mem & (ymm3 ^ ymm2)) 420; AVX512VL-NEXT: vpaddb %ymm1, %ymm1, %ymm1 421; AVX512VL-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 422; AVX512VL-NEXT: vpsrlw $7, %ymm0, %ymm2 423; AVX512VL-NEXT: vpaddb %ymm0, %ymm0, %ymm3 424; AVX512VL-NEXT: vpternlogd {{.*#+}} ymm3 = ymm3 | (ymm2 & mem) 425; AVX512VL-NEXT: vpaddb %ymm1, %ymm1, %ymm1 426; AVX512VL-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 427; AVX512VL-NEXT: retq 428; 429; AVX512BW-LABEL: var_rotate_v32i8: 430; AVX512BW: # %bb.0: 431; AVX512BW-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] 432; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 433; AVX512BW-NEXT: vpxor %xmm3, %xmm3, %xmm3 434; AVX512BW-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm1[8],ymm3[8],ymm1[9],ymm3[9],ymm1[10],ymm3[10],ymm1[11],ymm3[11],ymm1[12],ymm3[12],ymm1[13],ymm3[13],ymm1[14],ymm3[14],ymm1[15],ymm3[15],ymm1[24],ymm3[24],ymm1[25],ymm3[25],ymm1[26],ymm3[26],ymm1[27],ymm3[27],ymm1[28],ymm3[28],ymm1[29],ymm3[29],ymm1[30],ymm3[30],ymm1[31],ymm3[31] 435; AVX512BW-NEXT: vpsllvw %zmm4, %zmm2, %zmm2 436; AVX512BW-NEXT: vpsrlw $8, %ymm2, %ymm2 437; AVX512BW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] 438; AVX512BW-NEXT: vpunpcklbw {{.*#+}} ymm1 = 
ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[4],ymm3[4],ymm1[5],ymm3[5],ymm1[6],ymm3[6],ymm1[7],ymm3[7],ymm1[16],ymm3[16],ymm1[17],ymm3[17],ymm1[18],ymm3[18],ymm1[19],ymm3[19],ymm1[20],ymm3[20],ymm1[21],ymm3[21],ymm1[22],ymm3[22],ymm1[23],ymm3[23] 439; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0 440; AVX512BW-NEXT: vpsrlw $8, %ymm0, %ymm0 441; AVX512BW-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 442; AVX512BW-NEXT: retq 443; 444; AVX512VLBW-LABEL: var_rotate_v32i8: 445; AVX512VLBW: # %bb.0: 446; AVX512VLBW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm1 447; AVX512VLBW-NEXT: vpxor %xmm2, %xmm2, %xmm2 448; AVX512VLBW-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15],ymm1[24],ymm2[24],ymm1[25],ymm2[25],ymm1[26],ymm2[26],ymm1[27],ymm2[27],ymm1[28],ymm2[28],ymm1[29],ymm2[29],ymm1[30],ymm2[30],ymm1[31],ymm2[31] 449; AVX512VLBW-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] 450; AVX512VLBW-NEXT: vpsllvw %ymm3, %ymm4, %ymm3 451; AVX512VLBW-NEXT: vpsrlw $8, %ymm3, %ymm3 452; AVX512VLBW-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[16],ymm2[16],ymm1[17],ymm2[17],ymm1[18],ymm2[18],ymm1[19],ymm2[19],ymm1[20],ymm2[20],ymm1[21],ymm2[21],ymm1[22],ymm2[22],ymm1[23],ymm2[23] 453; AVX512VLBW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] 454; AVX512VLBW-NEXT: vpsllvw %ymm1, %ymm0, %ymm0 455; AVX512VLBW-NEXT: vpsrlw $8, %ymm0, %ymm0 456; AVX512VLBW-NEXT: vpackuswb %ymm3, %ymm0, %ymm0 457; AVX512VLBW-NEXT: retq 458; 459; AVX512VBMI2-LABEL: var_rotate_v32i8: 460; AVX512VBMI2: # %bb.0: 461; AVX512VBMI2-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] 462; AVX512VBMI2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 463; AVX512VBMI2-NEXT: vpxor %xmm3, %xmm3, %xmm3 464; AVX512VBMI2-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm1[8],ymm3[8],ymm1[9],ymm3[9],ymm1[10],ymm3[10],ymm1[11],ymm3[11],ymm1[12],ymm3[12],ymm1[13],ymm3[13],ymm1[14],ymm3[14],ymm1[15],ymm3[15],ymm1[24],ymm3[24],ymm1[25],ymm3[25],ymm1[26],ymm3[26],ymm1[27],ymm3[27],ymm1[28],ymm3[28],ymm1[29],ymm3[29],ymm1[30],ymm3[30],ymm1[31],ymm3[31] 465; AVX512VBMI2-NEXT: vpsllvw %zmm4, %zmm2, %zmm2 466; AVX512VBMI2-NEXT: vpsrlw $8, %ymm2, %ymm2 467; AVX512VBMI2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] 468; AVX512VBMI2-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[4],ymm3[4],ymm1[5],ymm3[5],ymm1[6],ymm3[6],ymm1[7],ymm3[7],ymm1[16],ymm3[16],ymm1[17],ymm3[17],ymm1[18],ymm3[18],ymm1[19],ymm3[19],ymm1[20],ymm3[20],ymm1[21],ymm3[21],ymm1[22],ymm3[22],ymm1[23],ymm3[23] 469; AVX512VBMI2-NEXT: vpsllvw %zmm1, %zmm0, %zmm0 470; AVX512VBMI2-NEXT: vpsrlw $8, %ymm0, %ymm0 471; AVX512VBMI2-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 472; AVX512VBMI2-NEXT: retq 473; 474; AVX512VLVBMI2-LABEL: var_rotate_v32i8: 475; AVX512VLVBMI2: # %bb.0: 476; AVX512VLVBMI2-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm1 477; AVX512VLVBMI2-NEXT: vpxor %xmm2, %xmm2, %xmm2 478; AVX512VLVBMI2-NEXT: vpunpckhbw {{.*#+}} ymm3 = 
ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15],ymm1[24],ymm2[24],ymm1[25],ymm2[25],ymm1[26],ymm2[26],ymm1[27],ymm2[27],ymm1[28],ymm2[28],ymm1[29],ymm2[29],ymm1[30],ymm2[30],ymm1[31],ymm2[31] 479; AVX512VLVBMI2-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] 480; AVX512VLVBMI2-NEXT: vpsllvw %ymm3, %ymm4, %ymm3 481; AVX512VLVBMI2-NEXT: vpsrlw $8, %ymm3, %ymm3 482; AVX512VLVBMI2-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[16],ymm2[16],ymm1[17],ymm2[17],ymm1[18],ymm2[18],ymm1[19],ymm2[19],ymm1[20],ymm2[20],ymm1[21],ymm2[21],ymm1[22],ymm2[22],ymm1[23],ymm2[23] 483; AVX512VLVBMI2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] 484; AVX512VLVBMI2-NEXT: vpsllvw %ymm1, %ymm0, %ymm0 485; AVX512VLVBMI2-NEXT: vpsrlw $8, %ymm0, %ymm0 486; AVX512VLVBMI2-NEXT: vpackuswb %ymm3, %ymm0, %ymm0 487; AVX512VLVBMI2-NEXT: retq 488; 489; XOPAVX1-LABEL: var_rotate_v32i8: 490; XOPAVX1: # %bb.0: 491; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 492; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 493; XOPAVX1-NEXT: vprotb %xmm2, %xmm3, %xmm2 494; XOPAVX1-NEXT: vprotb %xmm1, %xmm0, %xmm0 495; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 496; XOPAVX1-NEXT: retq 497; 498; XOPAVX2-LABEL: var_rotate_v32i8: 499; XOPAVX2: # %bb.0: 500; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 501; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm3 502; XOPAVX2-NEXT: vprotb %xmm2, %xmm3, %xmm2 503; XOPAVX2-NEXT: vprotb %xmm1, %xmm0, %xmm0 504; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 505; XOPAVX2-NEXT: retq 506 %b8 = sub <32 x i8> <i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8>, %b 507 %shl = shl <32 x i8> %a, %b 508 %lshr = lshr <32 x i8> %a, %b8 509 %or = or <32 x i8> %shl, %lshr 510 ret <32 x i8> %or 511} 512 513; 514; Uniform Variable Rotates 515; 516 517define <4 x i64> @splatvar_rotate_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind { 518; AVX1-LABEL: splatvar_rotate_v4i64: 519; AVX1: # %bb.0: 520; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm2 = [64,64] 521; AVX1-NEXT: vpsubq %xmm1, %xmm2, %xmm2 522; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 523; AVX1-NEXT: vpsllq %xmm1, %xmm3, %xmm4 524; AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm1 525; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1 526; AVX1-NEXT: vpsrlq %xmm2, %xmm3, %xmm3 527; AVX1-NEXT: vpsrlq %xmm2, %xmm0, %xmm0 528; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 529; AVX1-NEXT: vorps %ymm0, %ymm1, %ymm0 530; AVX1-NEXT: retq 531; 532; AVX2-LABEL: splatvar_rotate_v4i64: 533; AVX2: # %bb.0: 534; AVX2-NEXT: vpsllq %xmm1, %ymm0, %ymm2 535; AVX2-NEXT: vpmovsxbq {{.*#+}} xmm3 = [64,64] 536; AVX2-NEXT: vpsubq %xmm1, %xmm3, %xmm1 537; AVX2-NEXT: vpsrlq %xmm1, %ymm0, %ymm0 538; AVX2-NEXT: vpor %ymm0, %ymm2, %ymm0 539; AVX2-NEXT: retq 540; 541; AVX512NOVLX-LABEL: splatvar_rotate_v4i64: 542; AVX512NOVLX: # %bb.0: 543; AVX512NOVLX-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 544; AVX512NOVLX-NEXT: vpbroadcastq %xmm1, %ymm1 545; AVX512NOVLX-NEXT: vprolvq %zmm1, %zmm0, %zmm0 546; AVX512NOVLX-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 547; AVX512NOVLX-NEXT: retq 548; 549; AVX512VLX-LABEL: 
splatvar_rotate_v4i64: 550; AVX512VLX: # %bb.0: 551; AVX512VLX-NEXT: vpbroadcastq %xmm1, %ymm1 552; AVX512VLX-NEXT: vprolvq %ymm1, %ymm0, %ymm0 553; AVX512VLX-NEXT: retq 554; 555; XOPAVX1-LABEL: splatvar_rotate_v4i64: 556; XOPAVX1: # %bb.0: 557; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 558; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] 559; XOPAVX1-NEXT: vprotq %xmm1, %xmm2, %xmm2 560; XOPAVX1-NEXT: vprotq %xmm1, %xmm0, %xmm0 561; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 562; XOPAVX1-NEXT: retq 563; 564; XOPAVX2-LABEL: splatvar_rotate_v4i64: 565; XOPAVX2: # %bb.0: 566; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 567; XOPAVX2-NEXT: vpbroadcastq %xmm1, %xmm1 568; XOPAVX2-NEXT: vprotq %xmm1, %xmm2, %xmm2 569; XOPAVX2-NEXT: vprotq %xmm1, %xmm0, %xmm0 570; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 571; XOPAVX2-NEXT: retq 572 %splat = shufflevector <4 x i64> %b, <4 x i64> undef, <4 x i32> zeroinitializer 573 %splat64 = sub <4 x i64> <i64 64, i64 64, i64 64, i64 64>, %splat 574 %shl = shl <4 x i64> %a, %splat 575 %lshr = lshr <4 x i64> %a, %splat64 576 %or = or <4 x i64> %shl, %lshr 577 ret <4 x i64> %or 578} 579 580define <8 x i32> @splatvar_rotate_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind { 581; AVX1-LABEL: splatvar_rotate_v8i32: 582; AVX1: # %bb.0: 583; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 584; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 585; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[2,2,3,3] 586; AVX1-NEXT: vpsllq %xmm1, %xmm3, %xmm3 587; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[2,2,3,3] 588; AVX1-NEXT: vpsllq %xmm1, %xmm4, %xmm4 589; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 590; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,1,1] 591; AVX1-NEXT: vpsllq %xmm1, %xmm2, %xmm2 592; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] 593; AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm0 594; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 595; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm3[1,3],ymm0[5,7],ymm3[5,7] 596; AVX1-NEXT: retq 597; 598; AVX2-LABEL: splatvar_rotate_v8i32: 599; AVX2: # %bb.0: 600; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm0[2,2,3,3,6,6,7,7] 601; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 602; AVX2-NEXT: vpsllq %xmm1, %ymm2, %ymm2 603; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5] 604; AVX2-NEXT: vpsllq %xmm1, %ymm0, %ymm0 605; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm2[1,3],ymm0[5,7],ymm2[5,7] 606; AVX2-NEXT: retq 607; 608; AVX512NOVLX-LABEL: splatvar_rotate_v8i32: 609; AVX512NOVLX: # %bb.0: 610; AVX512NOVLX-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 611; AVX512NOVLX-NEXT: vpbroadcastd %xmm1, %ymm1 612; AVX512NOVLX-NEXT: vprolvd %zmm1, %zmm0, %zmm0 613; AVX512NOVLX-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 614; AVX512NOVLX-NEXT: retq 615; 616; AVX512VLX-LABEL: splatvar_rotate_v8i32: 617; AVX512VLX: # %bb.0: 618; AVX512VLX-NEXT: vpbroadcastd %xmm1, %ymm1 619; AVX512VLX-NEXT: vprolvd %ymm1, %ymm0, %ymm0 620; AVX512VLX-NEXT: retq 621; 622; XOPAVX1-LABEL: splatvar_rotate_v8i32: 623; XOPAVX1: # %bb.0: 624; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 625; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] 626; XOPAVX1-NEXT: vprotd %xmm1, %xmm2, %xmm2 627; XOPAVX1-NEXT: vprotd %xmm1, %xmm0, %xmm0 628; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 629; XOPAVX1-NEXT: retq 630; 631; XOPAVX2-LABEL: splatvar_rotate_v8i32: 632; XOPAVX2: # %bb.0: 633; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 634; XOPAVX2-NEXT: vpbroadcastd %xmm1, %xmm1 635; XOPAVX2-NEXT: vprotd %xmm1, %xmm2, %xmm2 636; XOPAVX2-NEXT: 
vprotd %xmm1, %xmm0, %xmm0 637; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 638; XOPAVX2-NEXT: retq 639 %splat = shufflevector <8 x i32> %b, <8 x i32> undef, <8 x i32> zeroinitializer 640 %splat32 = sub <8 x i32> <i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32>, %splat 641 %shl = shl <8 x i32> %a, %splat 642 %lshr = lshr <8 x i32> %a, %splat32 643 %or = or <8 x i32> %shl, %lshr 644 ret <8 x i32> %or 645} 646 647define <16 x i16> @splatvar_rotate_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind { 648; AVX1-LABEL: splatvar_rotate_v16i16: 649; AVX1: # %bb.0: 650; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm2 = [15,0] 651; AVX1-NEXT: vpandn %xmm2, %xmm1, %xmm3 652; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 653; AVX1-NEXT: vpsrlw $1, %xmm4, %xmm5 654; AVX1-NEXT: vpsrlw %xmm3, %xmm5, %xmm5 655; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 656; AVX1-NEXT: vpsllw %xmm1, %xmm4, %xmm2 657; AVX1-NEXT: vpor %xmm5, %xmm2, %xmm2 658; AVX1-NEXT: vpsrlw $1, %xmm0, %xmm4 659; AVX1-NEXT: vpsrlw %xmm3, %xmm4, %xmm3 660; AVX1-NEXT: vpsllw %xmm1, %xmm0, %xmm0 661; AVX1-NEXT: vpor %xmm3, %xmm0, %xmm0 662; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 663; AVX1-NEXT: retq 664; 665; AVX2-LABEL: splatvar_rotate_v16i16: 666; AVX2: # %bb.0: 667; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [15,0,0,0,15,0,0,0] 668; AVX2-NEXT: vpandn %xmm2, %xmm1, %xmm3 669; AVX2-NEXT: vpsrlw $1, %ymm0, %ymm4 670; AVX2-NEXT: vpsrlw %xmm3, %ymm4, %ymm3 671; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1 672; AVX2-NEXT: vpsllw %xmm1, %ymm0, %ymm0 673; AVX2-NEXT: vpor %ymm3, %ymm0, %ymm0 674; AVX2-NEXT: retq 675; 676; AVX512F-LABEL: splatvar_rotate_v16i16: 677; AVX512F: # %bb.0: 678; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm2 = [15,0,0,0,15,0,0,0] 679; AVX512F-NEXT: vpandn %xmm2, %xmm1, %xmm3 680; AVX512F-NEXT: vpsrlw $1, %ymm0, %ymm4 681; AVX512F-NEXT: vpsrlw %xmm3, %ymm4, %ymm3 682; AVX512F-NEXT: vpand %xmm2, %xmm1, %xmm1 683; AVX512F-NEXT: vpsllw %xmm1, %ymm0, %ymm0 684; AVX512F-NEXT: vpor %ymm3, %ymm0, %ymm0 685; AVX512F-NEXT: retq 686; 687; AVX512VL-LABEL: splatvar_rotate_v16i16: 688; AVX512VL: # %bb.0: 689; AVX512VL-NEXT: vpbroadcastq {{.*#+}} xmm2 = [15,0,0,0,15,0,0,0] 690; AVX512VL-NEXT: vpandn %xmm2, %xmm1, %xmm3 691; AVX512VL-NEXT: vpsrlw $1, %ymm0, %ymm4 692; AVX512VL-NEXT: vpsrlw %xmm3, %ymm4, %ymm3 693; AVX512VL-NEXT: vpand %xmm2, %xmm1, %xmm1 694; AVX512VL-NEXT: vpsllw %xmm1, %ymm0, %ymm0 695; AVX512VL-NEXT: vpor %ymm3, %ymm0, %ymm0 696; AVX512VL-NEXT: retq 697; 698; AVX512BW-LABEL: splatvar_rotate_v16i16: 699; AVX512BW: # %bb.0: 700; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm2 = [15,0,0,0,15,0,0,0] 701; AVX512BW-NEXT: vpandn %xmm2, %xmm1, %xmm3 702; AVX512BW-NEXT: vpsrlw $1, %ymm0, %ymm4 703; AVX512BW-NEXT: vpsrlw %xmm3, %ymm4, %ymm3 704; AVX512BW-NEXT: vpand %xmm2, %xmm1, %xmm1 705; AVX512BW-NEXT: vpsllw %xmm1, %ymm0, %ymm0 706; AVX512BW-NEXT: vpor %ymm3, %ymm0, %ymm0 707; AVX512BW-NEXT: retq 708; 709; AVX512VLBW-LABEL: splatvar_rotate_v16i16: 710; AVX512VLBW: # %bb.0: 711; AVX512VLBW-NEXT: vpbroadcastq {{.*#+}} xmm2 = [15,0,0,0,15,0,0,0] 712; AVX512VLBW-NEXT: vpandn %xmm2, %xmm1, %xmm3 713; AVX512VLBW-NEXT: vpsrlw $1, %ymm0, %ymm4 714; AVX512VLBW-NEXT: vpsrlw %xmm3, %ymm4, %ymm3 715; AVX512VLBW-NEXT: vpand %xmm2, %xmm1, %xmm1 716; AVX512VLBW-NEXT: vpsllw %xmm1, %ymm0, %ymm0 717; AVX512VLBW-NEXT: vpor %ymm3, %ymm0, %ymm0 718; AVX512VLBW-NEXT: retq 719; 720; AVX512VBMI2-LABEL: splatvar_rotate_v16i16: 721; AVX512VBMI2: # %bb.0: 722; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 723; AVX512VBMI2-NEXT: vpbroadcastw %xmm1, %ymm1 
724; AVX512VBMI2-NEXT: vpshldvw %zmm1, %zmm0, %zmm0 725; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 726; AVX512VBMI2-NEXT: retq 727; 728; AVX512VLVBMI2-LABEL: splatvar_rotate_v16i16: 729; AVX512VLVBMI2: # %bb.0: 730; AVX512VLVBMI2-NEXT: vpbroadcastw %xmm1, %ymm1 731; AVX512VLVBMI2-NEXT: vpshldvw %ymm1, %ymm0, %ymm0 732; AVX512VLVBMI2-NEXT: retq 733; 734; XOPAVX1-LABEL: splatvar_rotate_v16i16: 735; XOPAVX1: # %bb.0: 736; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 737; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] 738; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] 739; XOPAVX1-NEXT: vprotw %xmm1, %xmm2, %xmm2 740; XOPAVX1-NEXT: vprotw %xmm1, %xmm0, %xmm0 741; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 742; XOPAVX1-NEXT: retq 743; 744; XOPAVX2-LABEL: splatvar_rotate_v16i16: 745; XOPAVX2: # %bb.0: 746; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 747; XOPAVX2-NEXT: vpbroadcastw %xmm1, %xmm1 748; XOPAVX2-NEXT: vprotw %xmm1, %xmm2, %xmm2 749; XOPAVX2-NEXT: vprotw %xmm1, %xmm0, %xmm0 750; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 751; XOPAVX2-NEXT: retq 752 %splat = shufflevector <16 x i16> %b, <16 x i16> undef, <16 x i32> zeroinitializer 753 %splat16 = sub <16 x i16> <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>, %splat 754 %shl = shl <16 x i16> %a, %splat 755 %lshr = lshr <16 x i16> %a, %splat16 756 %or = or <16 x i16> %shl, %lshr 757 ret <16 x i16> %or 758} 759 760define <32 x i8> @splatvar_rotate_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind { 761; AVX1-LABEL: splatvar_rotate_v32i8: 762; AVX1: # %bb.0: 763; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 764; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 765; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 766; AVX1-NEXT: vpsllw %xmm1, %xmm3, %xmm3 767; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3 768; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 769; AVX1-NEXT: vpsllw %xmm1, %xmm2, %xmm2 770; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2 771; AVX1-NEXT: vpackuswb %xmm3, %xmm2, %xmm2 772; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 773; AVX1-NEXT: vpsllw %xmm1, %xmm3, %xmm3 774; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3 775; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 776; AVX1-NEXT: vpsllw %xmm1, %xmm0, %xmm0 777; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0 778; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0 779; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 780; AVX1-NEXT: retq 781; 782; AVX2-LABEL: splatvar_rotate_v32i8: 783; AVX2: # %bb.0: 784; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] 785; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 786; AVX2-NEXT: vpsllw %xmm1, %ymm2, %ymm2 787; AVX2-NEXT: vpsrlw $8, %ymm2, %ymm2 788; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] 789; AVX2-NEXT: vpsllw %xmm1, %ymm0, %ymm0 790; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0 791; AVX2-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 792; AVX2-NEXT: retq 793; 794; AVX512-LABEL: splatvar_rotate_v32i8: 795; AVX512: # %bb.0: 796; AVX512-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] 797; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 798; AVX512-NEXT: 
vpsllw %xmm1, %ymm2, %ymm2 799; AVX512-NEXT: vpsrlw $8, %ymm2, %ymm2 800; AVX512-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] 801; AVX512-NEXT: vpsllw %xmm1, %ymm0, %ymm0 802; AVX512-NEXT: vpsrlw $8, %ymm0, %ymm0 803; AVX512-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 804; AVX512-NEXT: retq 805; 806; XOPAVX1-LABEL: splatvar_rotate_v32i8: 807; XOPAVX1: # %bb.0: 808; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 809; XOPAVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 810; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 811; XOPAVX1-NEXT: vprotb %xmm1, %xmm2, %xmm2 812; XOPAVX1-NEXT: vprotb %xmm1, %xmm0, %xmm0 813; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 814; XOPAVX1-NEXT: retq 815; 816; XOPAVX2-LABEL: splatvar_rotate_v32i8: 817; XOPAVX2: # %bb.0: 818; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 819; XOPAVX2-NEXT: vpbroadcastb %xmm1, %xmm1 820; XOPAVX2-NEXT: vprotb %xmm1, %xmm2, %xmm2 821; XOPAVX2-NEXT: vprotb %xmm1, %xmm0, %xmm0 822; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 823; XOPAVX2-NEXT: retq 824 %splat = shufflevector <32 x i8> %b, <32 x i8> undef, <32 x i32> zeroinitializer 825 %splat8 = sub <32 x i8> <i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8>, %splat 826 %shl = shl <32 x i8> %a, %splat 827 %lshr = lshr <32 x i8> %a, %splat8 828 %or = or <32 x i8> %shl, %lshr 829 ret <32 x i8> %or 830} 831 832; 833; Constant Rotates 834; 835 836define <4 x i64> @constant_rotate_v4i64(<4 x i64> %a) nounwind { 837; AVX1-LABEL: constant_rotate_v4i64: 838; AVX1: # %bb.0: 839; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 840; AVX1-NEXT: vpsrlq $4, %xmm1, %xmm2 841; AVX1-NEXT: vpsrlq $14, %xmm1, %xmm3 842; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] 843; AVX1-NEXT: vpsrlq $50, %xmm0, %xmm3 844; AVX1-NEXT: vpsrlq $60, %xmm0, %xmm4 845; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7] 846; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 847; AVX1-NEXT: vpsllq $60, %xmm1, %xmm3 848; AVX1-NEXT: vpsllq $50, %xmm1, %xmm1 849; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5,6,7] 850; AVX1-NEXT: vpsllq $14, %xmm0, %xmm3 851; AVX1-NEXT: vpsllq $4, %xmm0, %xmm0 852; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7] 853; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 854; AVX1-NEXT: vorps %ymm2, %ymm0, %ymm0 855; AVX1-NEXT: retq 856; 857; AVX2-LABEL: constant_rotate_v4i64: 858; AVX2: # %bb.0: 859; AVX2-NEXT: vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1 860; AVX2-NEXT: vpsllvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 861; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 862; AVX2-NEXT: retq 863; 864; AVX512NOVLX-LABEL: constant_rotate_v4i64: 865; AVX512NOVLX: # %bb.0: 866; AVX512NOVLX-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 867; AVX512NOVLX-NEXT: vpmovsxbq {{.*#+}} ymm1 = [4,14,50,60] 868; AVX512NOVLX-NEXT: vprolvq %zmm1, %zmm0, %zmm0 869; AVX512NOVLX-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 870; AVX512NOVLX-NEXT: retq 871; 872; AVX512VLX-LABEL: constant_rotate_v4i64: 873; AVX512VLX: # %bb.0: 874; AVX512VLX-NEXT: vprolvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 875; AVX512VLX-NEXT: retq 876; 877; XOPAVX1-LABEL: constant_rotate_v4i64: 878; XOPAVX1: # %bb.0: 879; XOPAVX1-NEXT: vprotq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 880; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 881; XOPAVX1-NEXT: vprotq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), 
%xmm0, %xmm0 882; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 883; XOPAVX1-NEXT: retq 884; 885; XOPAVX2-LABEL: constant_rotate_v4i64: 886; XOPAVX2: # %bb.0: 887; XOPAVX2-NEXT: vprotq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 888; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 889; XOPAVX2-NEXT: vprotq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 890; XOPAVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 891; XOPAVX2-NEXT: retq 892 %shl = shl <4 x i64> %a, <i64 4, i64 14, i64 50, i64 60> 893 %lshr = lshr <4 x i64> %a, <i64 60, i64 50, i64 14, i64 4> 894 %or = or <4 x i64> %shl, %lshr 895 ret <4 x i64> %or 896} 897 898define <8 x i32> @constant_rotate_v8i32(<8 x i32> %a) nounwind { 899; AVX1-LABEL: constant_rotate_v8i32: 900; AVX1: # %bb.0: 901; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 902; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] 903; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 904; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 905; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[1,1,3,3] 906; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7] 907; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,2,2] 908; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] 909; AVX1-NEXT: vpor %xmm3, %xmm1, %xmm1 910; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] 911; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 912; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 913; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] 914; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7] 915; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,2,2] 916; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] 917; AVX1-NEXT: vpor %xmm3, %xmm0, %xmm0 918; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 919; AVX1-NEXT: retq 920; 921; AVX2-LABEL: constant_rotate_v8i32: 922; AVX2: # %bb.0: 923; AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1 924; AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 925; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 926; AVX2-NEXT: retq 927; 928; AVX512NOVLX-LABEL: constant_rotate_v8i32: 929; AVX512NOVLX: # %bb.0: 930; AVX512NOVLX-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 931; AVX512NOVLX-NEXT: vpmovsxbd {{.*#+}} ymm1 = [4,5,6,7,8,9,10,11] 932; AVX512NOVLX-NEXT: vprolvd %zmm1, %zmm0, %zmm0 933; AVX512NOVLX-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 934; AVX512NOVLX-NEXT: retq 935; 936; AVX512VLX-LABEL: constant_rotate_v8i32: 937; AVX512VLX: # %bb.0: 938; AVX512VLX-NEXT: vprolvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 939; AVX512VLX-NEXT: retq 940; 941; XOPAVX1-LABEL: constant_rotate_v8i32: 942; XOPAVX1: # %bb.0: 943; XOPAVX1-NEXT: vprotd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 944; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 945; XOPAVX1-NEXT: vprotd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 946; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 947; XOPAVX1-NEXT: retq 948; 949; XOPAVX2-LABEL: constant_rotate_v8i32: 950; XOPAVX2: # %bb.0: 951; XOPAVX2-NEXT: vprotd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 952; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 953; XOPAVX2-NEXT: vprotd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 954; XOPAVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 955; XOPAVX2-NEXT: retq 956 %shl = shl <8 x i32> %a, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11> 957 %lshr = lshr <8 x i32> %a, <i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21> 958 %or = 
or <8 x i32> %shl, %lshr 959 ret <8 x i32> %or 960} 961 962define <16 x i16> @constant_rotate_v16i16(<16 x i16> %a) nounwind { 963; AVX1-LABEL: constant_rotate_v16i16: 964; AVX1: # %bb.0: 965; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 966; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [256,512,1024,2048,4096,8192,16384,32768] 967; AVX1-NEXT: vpmulhuw %xmm2, %xmm1, %xmm3 968; AVX1-NEXT: vpmullw %xmm2, %xmm1, %xmm1 969; AVX1-NEXT: vpor %xmm3, %xmm1, %xmm1 970; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = [1,2,4,8,16,32,64,128] 971; AVX1-NEXT: vpmulhuw %xmm2, %xmm0, %xmm3 972; AVX1-NEXT: vpmullw %xmm2, %xmm0, %xmm0 973; AVX1-NEXT: vpor %xmm3, %xmm0, %xmm0 974; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 975; AVX1-NEXT: retq 976; 977; AVX2-LABEL: constant_rotate_v16i16: 978; AVX2: # %bb.0: 979; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768] 980; AVX2-NEXT: vpmulhuw %ymm1, %ymm0, %ymm2 981; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0 982; AVX2-NEXT: vpor %ymm2, %ymm0, %ymm0 983; AVX2-NEXT: retq 984; 985; AVX512F-LABEL: constant_rotate_v16i16: 986; AVX512F: # %bb.0: 987; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768] 988; AVX512F-NEXT: vpmulhuw %ymm1, %ymm0, %ymm2 989; AVX512F-NEXT: vpmullw %ymm1, %ymm0, %ymm0 990; AVX512F-NEXT: vpor %ymm2, %ymm0, %ymm0 991; AVX512F-NEXT: retq 992; 993; AVX512VL-LABEL: constant_rotate_v16i16: 994; AVX512VL: # %bb.0: 995; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768] 996; AVX512VL-NEXT: vpmulhuw %ymm1, %ymm0, %ymm2 997; AVX512VL-NEXT: vpmullw %ymm1, %ymm0, %ymm0 998; AVX512VL-NEXT: vpor %ymm2, %ymm0, %ymm0 999; AVX512VL-NEXT: retq 1000; 1001; AVX512BW-LABEL: constant_rotate_v16i16: 1002; AVX512BW: # %bb.0: 1003; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 1004; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] 1005; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1] 1006; AVX512BW-NEXT: vpsrlvw %zmm2, %zmm0, %zmm2 1007; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0 1008; AVX512BW-NEXT: vpor %ymm2, %ymm0, %ymm0 1009; AVX512BW-NEXT: retq 1010; 1011; AVX512VLBW-LABEL: constant_rotate_v16i16: 1012; AVX512VLBW: # %bb.0: 1013; AVX512VLBW-NEXT: vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1 1014; AVX512VLBW-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 1015; AVX512VLBW-NEXT: vpor %ymm1, %ymm0, %ymm0 1016; AVX512VLBW-NEXT: retq 1017; 1018; AVX512VBMI2-LABEL: constant_rotate_v16i16: 1019; AVX512VBMI2: # %bb.0: 1020; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 1021; AVX512VBMI2-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] 1022; AVX512VBMI2-NEXT: vpshldvw %zmm1, %zmm0, %zmm0 1023; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 1024; AVX512VBMI2-NEXT: retq 1025; 1026; AVX512VLVBMI2-LABEL: constant_rotate_v16i16: 1027; AVX512VLVBMI2: # %bb.0: 1028; AVX512VLVBMI2-NEXT: vpshldvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 1029; AVX512VLVBMI2-NEXT: retq 1030; 1031; XOPAVX1-LABEL: constant_rotate_v16i16: 1032; XOPAVX1: # %bb.0: 1033; XOPAVX1-NEXT: vprotw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 1034; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 1035; XOPAVX1-NEXT: vprotw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1036; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 1037; XOPAVX1-NEXT: retq 1038; 1039; XOPAVX2-LABEL: constant_rotate_v16i16: 1040; XOPAVX2: # %bb.0: 1041; XOPAVX2-NEXT: vprotw 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 1042; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 1043; XOPAVX2-NEXT: vprotw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1044; XOPAVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 1045; XOPAVX2-NEXT: retq 1046 %shl = shl <16 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15> 1047 %lshr = lshr <16 x i16> %a, <i16 16, i16 15, i16 14, i16 13, i16 12, i16 11, i16 10, i16 9, i16 8, i16 7, i16 6, i16 5, i16 4, i16 3, i16 2, i16 1> 1048 %or = or <16 x i16> %shl, %lshr 1049 ret <16 x i16> %or 1050} 1051 1052define <32 x i8> @constant_rotate_v32i8(<32 x i8> %a) nounwind { 1053; AVX1-LABEL: constant_rotate_v32i8: 1054; AVX1: # %bb.0: 1055; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 1056; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 1057; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = [1,128,64,32,16,8,4,2] 1058; AVX1-NEXT: vpmullw %xmm3, %xmm2, %xmm2 1059; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2 1060; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1061; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm4 = [1,2,4,8,16,32,64,128] 1062; AVX1-NEXT: vpmullw %xmm4, %xmm1, %xmm1 1063; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1 1064; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1 1065; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 1066; AVX1-NEXT: vpmullw %xmm3, %xmm2, %xmm2 1067; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2 1068; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1069; AVX1-NEXT: vpmullw %xmm4, %xmm0, %xmm0 1070; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0 1071; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 1072; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 1073; AVX1-NEXT: retq 1074; 1075; AVX2-LABEL: constant_rotate_v32i8: 1076; AVX2: # %bb.0: 1077; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] 1078; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [1,128,64,32,16,8,4,2,1,128,64,32,16,8,4,2] 1079; AVX2-NEXT: vpsrlw $8, %ymm1, %ymm1 1080; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] 1081; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128] 1082; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0 1083; AVX2-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 1084; AVX2-NEXT: retq 1085; 1086; AVX512F-LABEL: constant_rotate_v32i8: 1087; AVX512F: # %bb.0: 1088; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] 1089; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [1,128,64,32,16,8,4,2,1,128,64,32,16,8,4,2] 1090; AVX512F-NEXT: vpsrlw $8, %ymm1, %ymm1 1091; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] 1092; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128] 1093; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0 1094; AVX512F-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 1095; AVX512F-NEXT: retq 1096; 1097; AVX512VL-LABEL: constant_rotate_v32i8: 1098; AVX512VL: # %bb.0: 1099; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] 1100; AVX512VL-NEXT: vpmullw 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [1,128,64,32,16,8,4,2,1,128,64,32,16,8,4,2] 1101; AVX512VL-NEXT: vpsrlw $8, %ymm1, %ymm1 1102; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] 1103; AVX512VL-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128] 1104; AVX512VL-NEXT: vpsrlw $8, %ymm0, %ymm0 1105; AVX512VL-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 1106; AVX512VL-NEXT: retq 1107; 1108; AVX512BW-LABEL: constant_rotate_v32i8: 1109; AVX512BW: # %bb.0: 1110; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,0,7,0,6,0,5,0,4,0,3,0,2,0,1,0,0,0,7,0,6,0,5,0,4,0,3,0,2,0,1,0] 1111; AVX512BW-NEXT: # ymm1 = mem[0,1,0,1] 1112; AVX512BW-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] 1113; AVX512BW-NEXT: vpsllvw %zmm1, %zmm2, %zmm1 1114; AVX512BW-NEXT: vpsrlw $8, %ymm1, %ymm1 1115; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,0,1,0,2,0,3,0,4,0,5,0,6,0,7,0,0,0,1,0,2,0,3,0,4,0,5,0,6,0,7,0] 1116; AVX512BW-NEXT: # ymm2 = mem[0,1,0,1] 1117; AVX512BW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] 1118; AVX512BW-NEXT: vpsllvw %zmm2, %zmm0, %zmm0 1119; AVX512BW-NEXT: vpsrlw $8, %ymm0, %ymm0 1120; AVX512BW-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 1121; AVX512BW-NEXT: retq 1122; 1123; AVX512VLBW-LABEL: constant_rotate_v32i8: 1124; AVX512VLBW: # %bb.0: 1125; AVX512VLBW-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] 1126; AVX512VLBW-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 1127; AVX512VLBW-NEXT: vpsrlw $8, %ymm1, %ymm1 1128; AVX512VLBW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] 1129; AVX512VLBW-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 1130; AVX512VLBW-NEXT: vpsrlw $8, %ymm0, %ymm0 1131; AVX512VLBW-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 1132; AVX512VLBW-NEXT: retq 1133; 1134; AVX512VBMI2-LABEL: constant_rotate_v32i8: 1135; AVX512VBMI2: # %bb.0: 1136; AVX512VBMI2-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,0,7,0,6,0,5,0,4,0,3,0,2,0,1,0,0,0,7,0,6,0,5,0,4,0,3,0,2,0,1,0] 1137; AVX512VBMI2-NEXT: # ymm1 = mem[0,1,0,1] 1138; AVX512VBMI2-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] 1139; AVX512VBMI2-NEXT: vpsllvw %zmm1, %zmm2, %zmm1 1140; AVX512VBMI2-NEXT: vpsrlw $8, %ymm1, %ymm1 1141; AVX512VBMI2-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,0,1,0,2,0,3,0,4,0,5,0,6,0,7,0,0,0,1,0,2,0,3,0,4,0,5,0,6,0,7,0] 1142; AVX512VBMI2-NEXT: # ymm2 = mem[0,1,0,1] 1143; AVX512VBMI2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] 1144; AVX512VBMI2-NEXT: vpsllvw %zmm2, %zmm0, %zmm0 1145; AVX512VBMI2-NEXT: vpsrlw $8, %ymm0, %ymm0 1146; AVX512VBMI2-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 1147; AVX512VBMI2-NEXT: retq 1148; 1149; AVX512VLVBMI2-LABEL: constant_rotate_v32i8: 1150; AVX512VLVBMI2: # %bb.0: 1151; AVX512VLVBMI2-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] 1152; AVX512VLVBMI2-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 1153; AVX512VLVBMI2-NEXT: vpsrlw $8, %ymm1, %ymm1 1154; AVX512VLVBMI2-NEXT: vpunpcklbw {{.*#+}} ymm0 = 
ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] 1155; AVX512VLVBMI2-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 1156; AVX512VLVBMI2-NEXT: vpsrlw $8, %ymm0, %ymm0 1157; AVX512VLVBMI2-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 1158; AVX512VLVBMI2-NEXT: retq 1159; 1160; XOPAVX1-LABEL: constant_rotate_v32i8: 1161; XOPAVX1: # %bb.0: 1162; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 1163; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7,0,7,6,5,4,3,2,1] 1164; XOPAVX1-NEXT: vprotb %xmm2, %xmm1, %xmm1 1165; XOPAVX1-NEXT: vprotb %xmm2, %xmm0, %xmm0 1166; XOPAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 1167; XOPAVX1-NEXT: retq 1168; 1169; XOPAVX2-LABEL: constant_rotate_v32i8: 1170; XOPAVX2: # %bb.0: 1171; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 1172; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7,0,7,6,5,4,3,2,1] 1173; XOPAVX2-NEXT: vprotb %xmm2, %xmm1, %xmm1 1174; XOPAVX2-NEXT: vprotb %xmm2, %xmm0, %xmm0 1175; XOPAVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 1176; XOPAVX2-NEXT: retq 1177 %shl = shl <32 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1> 1178 %lshr = lshr <32 x i8> %a, <i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7> 1179 %or = or <32 x i8> %shl, %lshr 1180 ret <32 x i8> %or 1181} 1182 1183; 1184; Uniform Constant Rotates 1185; 1186 1187define <4 x i64> @splatconstant_rotate_v4i64(<4 x i64> %a) nounwind { 1188; AVX1-LABEL: splatconstant_rotate_v4i64: 1189; AVX1: # %bb.0: 1190; AVX1-NEXT: vpsrlq $50, %xmm0, %xmm1 1191; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 1192; AVX1-NEXT: vpsrlq $50, %xmm2, %xmm3 1193; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 1194; AVX1-NEXT: vpsllq $14, %xmm0, %xmm0 1195; AVX1-NEXT: vpsllq $14, %xmm2, %xmm2 1196; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 1197; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 1198; AVX1-NEXT: retq 1199; 1200; AVX2-LABEL: splatconstant_rotate_v4i64: 1201; AVX2: # %bb.0: 1202; AVX2-NEXT: vpsrlq $50, %ymm0, %ymm1 1203; AVX2-NEXT: vpsllq $14, %ymm0, %ymm0 1204; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 1205; AVX2-NEXT: retq 1206; 1207; AVX512NOVLX-LABEL: splatconstant_rotate_v4i64: 1208; AVX512NOVLX: # %bb.0: 1209; AVX512NOVLX-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 1210; AVX512NOVLX-NEXT: vprolq $14, %zmm0, %zmm0 1211; AVX512NOVLX-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 1212; AVX512NOVLX-NEXT: retq 1213; 1214; AVX512VLX-LABEL: splatconstant_rotate_v4i64: 1215; AVX512VLX: # %bb.0: 1216; AVX512VLX-NEXT: vprolq $14, %ymm0, %ymm0 1217; AVX512VLX-NEXT: retq 1218; 1219; XOPAVX1-LABEL: splatconstant_rotate_v4i64: 1220; XOPAVX1: # %bb.0: 1221; XOPAVX1-NEXT: vprotq $14, %xmm0, %xmm1 1222; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 1223; XOPAVX1-NEXT: vprotq $14, %xmm0, %xmm0 1224; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 1225; XOPAVX1-NEXT: retq 1226; 1227; XOPAVX2-LABEL: splatconstant_rotate_v4i64: 1228; XOPAVX2: # %bb.0: 1229; XOPAVX2-NEXT: vprotq $14, %xmm0, %xmm1 1230; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 1231; XOPAVX2-NEXT: vprotq $14, %xmm0, %xmm0 1232; XOPAVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 1233; XOPAVX2-NEXT: retq 1234 %shl = shl <4 x i64> %a, <i64 14, i64 14, i64 14, i64 14> 1235 %lshr = lshr <4 x i64> %a, <i64 50, i64 50, i64 50, i64 50> 
  %or = or <4 x i64> %shl, %lshr
  ret <4 x i64> %or
}

define <8 x i32> @splatconstant_rotate_v8i32(<8 x i32> %a) nounwind {
; AVX1-LABEL: splatconstant_rotate_v8i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpsrld $28, %xmm1, %xmm2
; AVX1-NEXT: vpslld $4, %xmm1, %xmm1
; AVX1-NEXT: vpor %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpsrld $28, %xmm0, %xmm2
; AVX1-NEXT: vpslld $4, %xmm0, %xmm0
; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: splatconstant_rotate_v8i32:
; AVX2: # %bb.0:
; AVX2-NEXT: vpsrld $28, %ymm0, %ymm1
; AVX2-NEXT: vpslld $4, %ymm0, %ymm0
; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512NOVLX-LABEL: splatconstant_rotate_v8i32:
; AVX512NOVLX: # %bb.0:
; AVX512NOVLX-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512NOVLX-NEXT: vprold $4, %zmm0, %zmm0
; AVX512NOVLX-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512NOVLX-NEXT: retq
;
; AVX512VLX-LABEL: splatconstant_rotate_v8i32:
; AVX512VLX: # %bb.0:
; AVX512VLX-NEXT: vprold $4, %ymm0, %ymm0
; AVX512VLX-NEXT: retq
;
; XOPAVX1-LABEL: splatconstant_rotate_v8i32:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vprotd $4, %xmm0, %xmm1
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; XOPAVX1-NEXT: vprotd $4, %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: splatconstant_rotate_v8i32:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vprotd $4, %xmm0, %xmm1
; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
; XOPAVX2-NEXT: vprotd $4, %xmm0, %xmm0
; XOPAVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX2-NEXT: retq
  %shl = shl <8 x i32> %a, <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>
  %lshr = lshr <8 x i32> %a, <i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28>
  %or = or <8 x i32> %shl, %lshr
  ret <8 x i32> %or
}

define <16 x i16> @splatconstant_rotate_v16i16(<16 x i16> %a) nounwind {
; AVX1-LABEL: splatconstant_rotate_v16i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpsrlw $9, %xmm1, %xmm2
; AVX1-NEXT: vpsllw $7, %xmm1, %xmm1
; AVX1-NEXT: vpor %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpsrlw $9, %xmm0, %xmm2
; AVX1-NEXT: vpsllw $7, %xmm0, %xmm0
; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: splatconstant_rotate_v16i16:
; AVX2: # %bb.0:
; AVX2-NEXT: vpsrlw $9, %ymm0, %ymm1
; AVX2-NEXT: vpsllw $7, %ymm0, %ymm0
; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: splatconstant_rotate_v16i16:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpsrlw $9, %ymm0, %ymm1
; AVX512F-NEXT: vpsllw $7, %ymm0, %ymm0
; AVX512F-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: splatconstant_rotate_v16i16:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpsrlw $9, %ymm0, %ymm1
; AVX512VL-NEXT: vpsllw $7, %ymm0, %ymm0
; AVX512VL-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: splatconstant_rotate_v16i16:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpsrlw $9, %ymm0, %ymm1
; AVX512BW-NEXT: vpsllw $7, %ymm0, %ymm0
; AVX512BW-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512BW-NEXT: retq
;
; AVX512VLBW-LABEL: splatconstant_rotate_v16i16:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpsrlw $9, %ymm0, %ymm1
; AVX512VLBW-NEXT: vpsllw $7, %ymm0, %ymm0
; AVX512VLBW-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512VLBW-NEXT: retq
;
; AVX512VBMI2-LABEL: splatconstant_rotate_v16i16:
; AVX512VBMI2: # %bb.0:
; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512VBMI2-NEXT: vpshldw $7, %zmm0, %zmm0, %zmm0
; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512VBMI2-NEXT: retq
;
; AVX512VLVBMI2-LABEL: splatconstant_rotate_v16i16:
; AVX512VLVBMI2: # %bb.0:
; AVX512VLVBMI2-NEXT: vpshldw $7, %ymm0, %ymm0, %ymm0
; AVX512VLVBMI2-NEXT: retq
;
; XOPAVX1-LABEL: splatconstant_rotate_v16i16:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vprotw $7, %xmm0, %xmm1
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; XOPAVX1-NEXT: vprotw $7, %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: splatconstant_rotate_v16i16:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vprotw $7, %xmm0, %xmm1
; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
; XOPAVX2-NEXT: vprotw $7, %xmm0, %xmm0
; XOPAVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX2-NEXT: retq
  %shl = shl <16 x i16> %a, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
  %lshr = lshr <16 x i16> %a, <i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9>
  %or = or <16 x i16> %shl, %lshr
  ret <16 x i16> %or
}

define <32 x i8> @splatconstant_rotate_v32i8(<32 x i8> %a) nounwind {
; AVX1-LABEL: splatconstant_rotate_v32i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm2
; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
; AVX1-NEXT: vpandn %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpsllw $4, %xmm1, %xmm1
; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpor %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm2
; AVX1-NEXT: vpandn %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpsllw $4, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: splatconstant_rotate_v32i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm1
; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX2-NEXT: vpsllw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512NOVLX-LABEL: splatconstant_rotate_v32i8:
; AVX512NOVLX: # %bb.0:
; AVX512NOVLX-NEXT: vpsllw $4, %ymm0, %ymm1
; AVX512NOVLX-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX512NOVLX-NEXT: vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm1))
; AVX512NOVLX-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512NOVLX-NEXT: retq
;
; AVX512VLX-LABEL: splatconstant_rotate_v32i8:
; AVX512VLX: # %bb.0:
; AVX512VLX-NEXT: vpsllw $4, %ymm0, %ymm1
; AVX512VLX-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX512VLX-NEXT: vpternlogd {{.*#+}} ymm0 = ymm0 ^ (mem & (ymm0 ^ ymm1))
; AVX512VLX-NEXT: retq
;
; XOPAVX1-LABEL: splatconstant_rotate_v32i8:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vprotb $4, %xmm0, %xmm1
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; XOPAVX1-NEXT: vprotb $4, %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: splatconstant_rotate_v32i8:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vprotb $4, %xmm0, %xmm1
; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
; XOPAVX2-NEXT: vprotb $4, %xmm0, %xmm0
; XOPAVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX2-NEXT: retq
  %shl = shl <32 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>
  %lshr = lshr <32 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>
  %or = or <32 x i8> %shl, %lshr
  ret <32 x i8> %or
}

;
; Masked Uniform Constant Rotates
;

define <4 x i64> @splatconstant_rotate_mask_v4i64(<4 x i64> %a) nounwind {
; AVX1-LABEL: splatconstant_rotate_mask_v4i64:
; AVX1: # %bb.0:
; AVX1-NEXT: vpsrlq $49, %xmm0, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpsrlq $49, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: splatconstant_rotate_mask_v4i64:
; AVX2: # %bb.0:
; AVX2-NEXT: vpsrlq $49, %ymm0, %ymm0
; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: splatconstant_rotate_mask_v4i64:
; AVX512: # %bb.0:
; AVX512-NEXT: vpsrlq $49, %ymm0, %ymm0
; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX512-NEXT: retq
;
; XOPAVX1-LABEL: splatconstant_rotate_mask_v4i64:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vpsrlq $49, %xmm0, %xmm1
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; XOPAVX1-NEXT: vpsrlq $49, %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: splatconstant_rotate_mask_v4i64:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vpsrlq $49, %ymm0, %ymm0
; XOPAVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; XOPAVX2-NEXT: retq
  %shl = shl <4 x i64> %a, <i64 15, i64 15, i64 15, i64 15>
  %lshr = lshr <4 x i64> %a, <i64 49, i64 49, i64 49, i64 49>
  %rmask = and <4 x i64> %lshr, <i64 255, i64 127, i64 127, i64 255>
  %lmask = and <4 x i64> %shl, <i64 33, i64 65, i64 129, i64 257>
  %or = or <4 x i64> %lmask, %rmask
  ret <4 x i64> %or
}

define <8 x i32> @splatconstant_rotate_mask_v8i32(<8 x i32> %a) nounwind {
; AVX1-LABEL: splatconstant_rotate_mask_v8i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpsrld $28, %xmm1, %xmm2
; AVX1-NEXT: vpslld $4, %xmm1, %xmm1
; AVX1-NEXT: vpor %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpsrld $28, %xmm0, %xmm2
; AVX1-NEXT: vpslld $4, %xmm0, %xmm0
; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: splatconstant_rotate_mask_v8i32:
; AVX2: # %bb.0:
; AVX2-NEXT: vpsrld $28, %ymm0, %ymm1
; AVX2-NEXT: vpslld $4, %ymm0, %ymm0
; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512NOVLX-LABEL: splatconstant_rotate_mask_v8i32:
; AVX512NOVLX: # %bb.0:
; AVX512NOVLX-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512NOVLX-NEXT: vprold $4, %zmm0, %zmm0
; AVX512NOVLX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX512NOVLX-NEXT: retq
;
; AVX512VLX-LABEL: splatconstant_rotate_mask_v8i32:
; AVX512VLX: # %bb.0:
; AVX512VLX-NEXT: vprold $4, %ymm0, %ymm0
; AVX512VLX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX512VLX-NEXT: retq
;
; XOPAVX1-LABEL: splatconstant_rotate_mask_v8i32:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vprotd $4, %xmm0, %xmm1
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; XOPAVX1-NEXT: vprotd $4, %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: splatconstant_rotate_mask_v8i32:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vprotd $4, %xmm0, %xmm1
; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
; XOPAVX2-NEXT: vprotd $4, %xmm0, %xmm0
; XOPAVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; XOPAVX2-NEXT: retq
  %shl = shl <8 x i32> %a, <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>
  %lshr = lshr <8 x i32> %a, <i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28>
  %rmask = and <8 x i32> %lshr, <i32 3, i32 7, i32 15, i32 31, i32 63, i32 127, i32 255, i32 511>
  %lmask = and <8 x i32> %shl, <i32 511, i32 255, i32 127, i32 63, i32 31, i32 15, i32 7, i32 3>
  %or = or <8 x i32> %lmask, %rmask
  ret <8 x i32> %or
}

define <16 x i16> @splatconstant_rotate_mask_v16i16(<16 x i16> %a) nounwind {
; AVX1-LABEL: splatconstant_rotate_mask_v16i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpsrlw $11, %xmm1, %xmm2
; AVX1-NEXT: vpsllw $5, %xmm1, %xmm1
; AVX1-NEXT: vpor %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpsrlw $11, %xmm0, %xmm2
; AVX1-NEXT: vpsllw $5, %xmm0, %xmm0
; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: splatconstant_rotate_mask_v16i16:
; AVX2: # %bb.0:
; AVX2-NEXT: vpsrlw $11, %ymm0, %ymm1
; AVX2-NEXT: vpsllw $5, %ymm0, %ymm0
; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: splatconstant_rotate_mask_v16i16:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpsrlw $11, %ymm0, %ymm1
; AVX512F-NEXT: vpsllw $5, %ymm0, %ymm0
; AVX512F-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: splatconstant_rotate_mask_v16i16:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpsllw $5, %ymm0, %ymm1
; AVX512VL-NEXT: vpsrlw $11, %ymm0, %ymm0
; AVX512VL-NEXT: vpternlogd {{.*#+}} ymm0 = mem & (ymm0 | ymm1)
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: splatconstant_rotate_mask_v16i16:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpsrlw $11, %ymm0, %ymm1
; AVX512BW-NEXT: vpsllw $5, %ymm0, %ymm0
; AVX512BW-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX512BW-NEXT: retq
;
; AVX512VLBW-LABEL: splatconstant_rotate_mask_v16i16:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpsllw $5, %ymm0, %ymm1
; AVX512VLBW-NEXT: vpsrlw $11, %ymm0, %ymm0
; AVX512VLBW-NEXT: vpternlogd {{.*#+}} ymm0 = mem & (ymm0 | ymm1)
; AVX512VLBW-NEXT: retq
;
; AVX512VBMI2-LABEL: splatconstant_rotate_mask_v16i16:
; AVX512VBMI2: # %bb.0:
; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512VBMI2-NEXT: vpshldw $5, %zmm0, %zmm0, %zmm0
; AVX512VBMI2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX512VBMI2-NEXT: retq
;
; AVX512VLVBMI2-LABEL: splatconstant_rotate_mask_v16i16:
; AVX512VLVBMI2: # %bb.0:
; AVX512VLVBMI2-NEXT: vpshldw $5, %ymm0, %ymm0, %ymm0
; AVX512VLVBMI2-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0
; AVX512VLVBMI2-NEXT: retq
;
; XOPAVX1-LABEL: splatconstant_rotate_mask_v16i16:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vprotw $5, %xmm0, %xmm1
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; XOPAVX1-NEXT: vprotw $5, %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: splatconstant_rotate_mask_v16i16:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vprotw $5, %xmm0, %xmm1
; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
; XOPAVX2-NEXT: vprotw $5, %xmm0, %xmm0
; XOPAVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; XOPAVX2-NEXT: retq
  %shl = shl <16 x i16> %a, <i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5>
  %lshr = lshr <16 x i16> %a, <i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11>
  %rmask = and <16 x i16> %lshr, <i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55>
  %lmask = and <16 x i16> %shl, <i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33>
  %or = or <16 x i16> %lmask, %rmask
  ret <16 x i16> %or
}

define <32 x i8> @splatconstant_rotate_mask_v32i8(<32 x i8> %a) nounwind {
; AVX1-LABEL: splatconstant_rotate_mask_v32i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm2
; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
; AVX1-NEXT: vpandn %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpsllw $4, %xmm1, %xmm1
; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpor %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm2
; AVX1-NEXT: vpandn %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpsllw $4, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: splatconstant_rotate_mask_v32i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm1
; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX2-NEXT: vpsllw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512NOVLX-LABEL: splatconstant_rotate_mask_v32i8:
; AVX512NOVLX: # %bb.0:
; AVX512NOVLX-NEXT: vpsllw $4, %ymm0, %ymm1
; AVX512NOVLX-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX512NOVLX-NEXT: vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm1))
; AVX512NOVLX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX512NOVLX-NEXT: retq
;
; AVX512VLX-LABEL: splatconstant_rotate_mask_v32i8:
; AVX512VLX: # %bb.0:
; AVX512VLX-NEXT: vpsllw $4, %ymm0, %ymm1
; AVX512VLX-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX512VLX-NEXT: vpternlogd {{.*#+}} ymm0 = ymm0 ^ (mem & (ymm0 ^ ymm1))
; AVX512VLX-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0
; AVX512VLX-NEXT: retq
;
; XOPAVX1-LABEL: splatconstant_rotate_mask_v32i8:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vprotb $4, %xmm0, %xmm1
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; XOPAVX1-NEXT: vprotb $4, %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: splatconstant_rotate_mask_v32i8:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vprotb $4, %xmm0, %xmm1
; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
; XOPAVX2-NEXT: vprotb $4, %xmm0, %xmm0
; XOPAVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; XOPAVX2-NEXT: retq
  %shl = shl <32 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>
  %lshr = lshr <32 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>
  %rmask = and <32 x i8> %lshr, <i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55>
  %lmask = and <32 x i8> %shl, <i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33>
  %or = or <32 x i8> %lmask, %rmask
  ret <32 x i8> %or
}