; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefixes=AVX512VL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vbmi,+avx512vbmi2 | FileCheck %s --check-prefixes=AVX512VBMI2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefixes=AVX512VLBW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vbmi,+avx512vbmi2,+avx512vl | FileCheck %s --check-prefixes=AVX512VLVBMI2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx10.1-256 | FileCheck %s --check-prefixes=AVX512VLVBMI2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx10.1-512 | FileCheck %s --check-prefixes=AVX512VLVBMI2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefixes=XOP,XOPAVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefixes=XOP,XOPAVX2

; Just one 32-bit run to make sure we do reasonable things for i64 cases.
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=X86-SSE2

declare <2 x i64> @llvm.fshl.v2i64(<2 x i64>, <2 x i64>, <2 x i64>)
declare <4 x i32> @llvm.fshl.v4i32(<4 x i32>, <4 x i32>, <4 x i32>)
declare <8 x i16> @llvm.fshl.v8i16(<8 x i16>, <8 x i16>, <8 x i16>)
declare <16 x i8> @llvm.fshl.v16i8(<16 x i8>, <16 x i8>, <16 x i8>)

;
; Variable Shifts
;

define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt) nounwind {
; SSE2-LABEL: var_funnnel_v2i64:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [63,63]
; SSE2-NEXT: movdqa %xmm2, %xmm4
; SSE2-NEXT: pandn %xmm3, %xmm4
; SSE2-NEXT: psrlq $1, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm5
; SSE2-NEXT: psrlq %xmm4, %xmm5
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,2,3]
; SSE2-NEXT: psrlq %xmm4, %xmm1
; SSE2-NEXT: shufpd {{.*#+}} xmm5 = xmm5[0],xmm1[1]
; SSE2-NEXT: pand %xmm3, %xmm2
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psllq %xmm2, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
; SSE2-NEXT: psllq %xmm2, %xmm0
; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSE2-NEXT: orpd %xmm5, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: var_funnnel_v2i64:
; SSE41: # %bb.0:
; SSE41-NEXT: pmovsxbq {{.*#+}} xmm3 = [63,63]
; SSE41-NEXT: movdqa %xmm2, %xmm4
; SSE41-NEXT: pandn %xmm3, %xmm4
; SSE41-NEXT: psrlq $1, %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm5
; SSE41-NEXT: psrlq %xmm4, %xmm5
; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,2,3]
; SSE41-NEXT: psrlq %xmm4, %xmm1
; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm1[4,5,6,7]
; SSE41-NEXT: pand %xmm3, %xmm2
; SSE41-NEXT: movdqa %xmm0, %xmm1
; SSE41-NEXT: psllq %xmm2, %xmm1
; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
; SSE41-NEXT: psllq %xmm2, %xmm0
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
; SSE41-NEXT: por %xmm5, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: var_funnnel_v2i64:
; AVX1: # %bb.0:
; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm3 = [63,63]
; AVX1-NEXT: vpandn %xmm3, %xmm2, %xmm4
; AVX1-NEXT: vpsrlq $1, %xmm1, %xmm1
; AVX1-NEXT: vpsrlq %xmm4, %xmm1, %xmm5
; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,3,2,3]
; AVX1-NEXT: vpsrlq %xmm4, %xmm1, %xmm1
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0,1,2,3],xmm1[4,5,6,7]
; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpsllq %xmm2, %xmm0, %xmm3
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
; AVX1-NEXT: vpsllq %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7]
; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: var_funnnel_v2i64:
; AVX2: # %bb.0:
; AVX2-NEXT: vpmovsxbq {{.*#+}} xmm3 = [63,63]
; AVX2-NEXT: vpandn %xmm3, %xmm2, %xmm4
; AVX2-NEXT: vpsrlq $1, %xmm1, %xmm1
; AVX2-NEXT: vpsrlvq %xmm4, %xmm1, %xmm1
; AVX2-NEXT: vpand %xmm3, %xmm2, %xmm2
; AVX2-NEXT: vpsllvq %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: var_funnnel_v2i64:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpmovsxbq {{.*#+}} xmm3 = [63,63]
; AVX512F-NEXT: vpandn %xmm3, %xmm2, %xmm4
; AVX512F-NEXT: vpsrlq $1, %xmm1, %xmm1
; AVX512F-NEXT: vpsrlvq %xmm4, %xmm1, %xmm1
; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm2
; AVX512F-NEXT: vpsllvq %xmm2, %xmm0, %xmm0
; AVX512F-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: var_funnnel_v2i64:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63]
; AVX512VL-NEXT: vpandn %xmm3, %xmm2, %xmm4
; AVX512VL-NEXT: vpsrlq $1, %xmm1, %xmm1
; AVX512VL-NEXT: vpsrlvq %xmm4, %xmm1, %xmm1
; AVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm2
; AVX512VL-NEXT: vpsllvq %xmm2, %xmm0, %xmm0
; AVX512VL-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: var_funnnel_v2i64:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpmovsxbq {{.*#+}} xmm3 = [63,63]
; AVX512BW-NEXT: vpandn %xmm3, %xmm2, %xmm4
; AVX512BW-NEXT: vpsrlq $1, %xmm1, %xmm1
; AVX512BW-NEXT: vpsrlvq %xmm4, %xmm1, %xmm1
; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm2
; AVX512BW-NEXT: vpsllvq %xmm2, %xmm0, %xmm0
; AVX512BW-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT: retq
;
; AVX512VBMI2-LABEL: var_funnnel_v2i64:
; AVX512VBMI2: # %bb.0:
; AVX512VBMI2-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2
; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512VBMI2-NEXT: vpshldvq %zmm2, %zmm1, %zmm0
; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512VBMI2-NEXT: vzeroupper
; AVX512VBMI2-NEXT: retq
;
; AVX512VLBW-LABEL: var_funnnel_v2i64:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63]
; AVX512VLBW-NEXT: vpandn %xmm3, %xmm2, %xmm4
; AVX512VLBW-NEXT: vpsrlq $1, %xmm1, %xmm1
; AVX512VLBW-NEXT: vpsrlvq %xmm4, %xmm1, %xmm1
; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm2
; AVX512VLBW-NEXT: vpsllvq %xmm2, %xmm0, %xmm0
; AVX512VLBW-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX512VLBW-NEXT: retq
;
; AVX512VLVBMI2-LABEL: var_funnnel_v2i64:
; AVX512VLVBMI2: # %bb.0:
; AVX512VLVBMI2-NEXT: vpshldvq %xmm2, %xmm1, %xmm0
; AVX512VLVBMI2-NEXT: retq
;
; XOPAVX1-LABEL: var_funnnel_v2i64:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vpmovsxbq {{.*#+}} xmm3 = [63,63]
; XOPAVX1-NEXT: vpand %xmm3, %xmm2, %xmm4
; XOPAVX1-NEXT: vpshlq %xmm4, %xmm0, %xmm0
; XOPAVX1-NEXT: vpandn %xmm3, %xmm2, %xmm2
; XOPAVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
; XOPAVX1-NEXT: vpsubq %xmm2, %xmm3, %xmm2
; XOPAVX1-NEXT: vpsrlq $1, %xmm1, %xmm1
; XOPAVX1-NEXT: vpshlq %xmm2, %xmm1, %xmm1
; XOPAVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: var_funnnel_v2i64:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vpmovsxbq {{.*#+}} xmm3 = [63,63]
; XOPAVX2-NEXT: vpandn %xmm3, %xmm2, %xmm4
; XOPAVX2-NEXT: vpsrlq $1, %xmm1, %xmm1
; XOPAVX2-NEXT: vpsrlvq %xmm4, %xmm1, %xmm1
; XOPAVX2-NEXT: vpand %xmm3, %xmm2, %xmm2
; XOPAVX2-NEXT: vpsllvq %xmm2, %xmm0, %xmm0
; XOPAVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT: retq
;
; X86-SSE2-LABEL: var_funnnel_v2i64:
; X86-SSE2: # %bb.0:
; X86-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [63,0,63,0]
; X86-SSE2-NEXT: movdqa %xmm2, %xmm5
; X86-SSE2-NEXT: pandn %xmm4, %xmm5
; X86-SSE2-NEXT: psrlq $1, %xmm1
; X86-SSE2-NEXT: movdqa %xmm1, %xmm3
; X86-SSE2-NEXT: psrlq %xmm5, %xmm3
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,3,2,3]
; X86-SSE2-NEXT: psrlq %xmm5, %xmm1
; X86-SSE2-NEXT: shufpd {{.*#+}} xmm3 = xmm3[0],xmm1[1]
; X86-SSE2-NEXT: pand %xmm4, %xmm2
; X86-SSE2-NEXT: movdqa %xmm0, %xmm1
; X86-SSE2-NEXT: psllq %xmm2, %xmm1
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
; X86-SSE2-NEXT: psllq %xmm2, %xmm0
; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; X86-SSE2-NEXT: orpd %xmm3, %xmm0
; X86-SSE2-NEXT: retl
  %res = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt)
  ret <2 x i64> %res
}

define <4 x i32> @var_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %amt) nounwind {
; SSE2-LABEL: var_funnnel_v4i32:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [31,31,31,31]
; SSE2-NEXT: movdqa %xmm2, %xmm5
; SSE2-NEXT: pandn %xmm4, %xmm5
; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm5[2,3,3,3,4,5,6,7]
; SSE2-NEXT: psrld $1, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm6
; SSE2-NEXT: psrld %xmm3, %xmm6
; SSE2-NEXT: pshuflw {{.*#+}} xmm7 = xmm5[0,1,1,1,4,5,6,7]
; SSE2-NEXT: movdqa %xmm1, %xmm3
; SSE2-NEXT: psrld %xmm7, %xmm3
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm6[0]
; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,3,2,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm5[2,3,3,3,4,5,6,7]
; SSE2-NEXT: movdqa %xmm1, %xmm7
; SSE2-NEXT: psrld %xmm6, %xmm7
; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,1,1,4,5,6,7]
; SSE2-NEXT: psrld %xmm5, %xmm1
; SSE2-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm7[1]
; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm1[0,3]
; SSE2-NEXT: pand %xmm4, %xmm2
; SSE2-NEXT: pslld $23, %xmm2
; SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE2-NEXT: cvttps2dq %xmm2, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm1, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm2, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: por %xmm3, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: var_funnnel_v4i32:
;
SSE41: # %bb.0: 242; SSE41-NEXT: pmovsxbd {{.*#+}} xmm3 = [31,31,31,31] 243; SSE41-NEXT: movdqa %xmm2, %xmm4 244; SSE41-NEXT: pandn %xmm3, %xmm4 245; SSE41-NEXT: pshuflw {{.*#+}} xmm5 = xmm4[2,3,3,3,4,5,6,7] 246; SSE41-NEXT: psrld $1, %xmm1 247; SSE41-NEXT: movdqa %xmm1, %xmm6 248; SSE41-NEXT: psrld %xmm5, %xmm6 249; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm4[2,3,2,3] 250; SSE41-NEXT: pshuflw {{.*#+}} xmm7 = xmm5[2,3,3,3,4,5,6,7] 251; SSE41-NEXT: movdqa %xmm1, %xmm8 252; SSE41-NEXT: psrld %xmm7, %xmm8 253; SSE41-NEXT: pblendw {{.*#+}} xmm8 = xmm6[0,1,2,3],xmm8[4,5,6,7] 254; SSE41-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,1,1,4,5,6,7] 255; SSE41-NEXT: movdqa %xmm1, %xmm6 256; SSE41-NEXT: psrld %xmm4, %xmm6 257; SSE41-NEXT: pshuflw {{.*#+}} xmm4 = xmm5[0,1,1,1,4,5,6,7] 258; SSE41-NEXT: psrld %xmm4, %xmm1 259; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm1[4,5,6,7] 260; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1],xmm8[2,3],xmm6[4,5],xmm8[6,7] 261; SSE41-NEXT: pand %xmm3, %xmm2 262; SSE41-NEXT: pslld $23, %xmm2 263; SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 264; SSE41-NEXT: cvttps2dq %xmm2, %xmm1 265; SSE41-NEXT: pmulld %xmm1, %xmm0 266; SSE41-NEXT: por %xmm6, %xmm0 267; SSE41-NEXT: retq 268; 269; AVX1-LABEL: var_funnnel_v4i32: 270; AVX1: # %bb.0: 271; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [31,31,31,31] 272; AVX1-NEXT: vpandn %xmm3, %xmm2, %xmm4 273; AVX1-NEXT: vpsrldq {{.*#+}} xmm5 = xmm4[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 274; AVX1-NEXT: vpsrld $1, %xmm1, %xmm1 275; AVX1-NEXT: vpsrld %xmm5, %xmm1, %xmm5 276; AVX1-NEXT: vpsrlq $32, %xmm4, %xmm6 277; AVX1-NEXT: vpsrld %xmm6, %xmm1, %xmm6 278; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3],xmm5[4,5,6,7] 279; AVX1-NEXT: vpxor %xmm6, %xmm6, %xmm6 280; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm4[2],xmm6[2],xmm4[3],xmm6[3] 281; AVX1-NEXT: vpsrld %xmm6, %xmm1, %xmm6 282; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero 283; AVX1-NEXT: vpsrld %xmm4, %xmm1, %xmm1 284; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm6[4,5,6,7] 285; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm5[2,3],xmm1[4,5],xmm5[6,7] 286; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 287; AVX1-NEXT: vpslld $23, %xmm2, %xmm2 288; AVX1-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 289; AVX1-NEXT: vcvttps2dq %xmm2, %xmm2 290; AVX1-NEXT: vpmulld %xmm2, %xmm0, %xmm0 291; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 292; AVX1-NEXT: retq 293; 294; AVX2-LABEL: var_funnnel_v4i32: 295; AVX2: # %bb.0: 296; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31] 297; AVX2-NEXT: vpandn %xmm3, %xmm2, %xmm4 298; AVX2-NEXT: vpsrld $1, %xmm1, %xmm1 299; AVX2-NEXT: vpsrlvd %xmm4, %xmm1, %xmm1 300; AVX2-NEXT: vpand %xmm3, %xmm2, %xmm2 301; AVX2-NEXT: vpsllvd %xmm2, %xmm0, %xmm0 302; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 303; AVX2-NEXT: retq 304; 305; AVX512F-LABEL: var_funnnel_v4i32: 306; AVX512F: # %bb.0: 307; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31] 308; AVX512F-NEXT: vpandn %xmm3, %xmm2, %xmm4 309; AVX512F-NEXT: vpsrld $1, %xmm1, %xmm1 310; AVX512F-NEXT: vpsrlvd %xmm4, %xmm1, %xmm1 311; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm2 312; AVX512F-NEXT: vpsllvd %xmm2, %xmm0, %xmm0 313; AVX512F-NEXT: vpor %xmm1, %xmm0, %xmm0 314; AVX512F-NEXT: retq 315; 316; AVX512VL-LABEL: var_funnnel_v4i32: 317; AVX512VL: # %bb.0: 318; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31] 319; AVX512VL-NEXT: vpandn %xmm3, %xmm2, %xmm4 320; AVX512VL-NEXT: vpsrld $1, %xmm1, %xmm1 321; AVX512VL-NEXT: vpsrlvd %xmm4, %xmm1, %xmm1 322; 
AVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm2 323; AVX512VL-NEXT: vpsllvd %xmm2, %xmm0, %xmm0 324; AVX512VL-NEXT: vpor %xmm1, %xmm0, %xmm0 325; AVX512VL-NEXT: retq 326; 327; AVX512BW-LABEL: var_funnnel_v4i32: 328; AVX512BW: # %bb.0: 329; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31] 330; AVX512BW-NEXT: vpandn %xmm3, %xmm2, %xmm4 331; AVX512BW-NEXT: vpsrld $1, %xmm1, %xmm1 332; AVX512BW-NEXT: vpsrlvd %xmm4, %xmm1, %xmm1 333; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm2 334; AVX512BW-NEXT: vpsllvd %xmm2, %xmm0, %xmm0 335; AVX512BW-NEXT: vpor %xmm1, %xmm0, %xmm0 336; AVX512BW-NEXT: retq 337; 338; AVX512VBMI2-LABEL: var_funnnel_v4i32: 339; AVX512VBMI2: # %bb.0: 340; AVX512VBMI2-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2 341; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 342; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 343; AVX512VBMI2-NEXT: vpshldvd %zmm2, %zmm1, %zmm0 344; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 345; AVX512VBMI2-NEXT: vzeroupper 346; AVX512VBMI2-NEXT: retq 347; 348; AVX512VLBW-LABEL: var_funnnel_v4i32: 349; AVX512VLBW: # %bb.0: 350; AVX512VLBW-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31] 351; AVX512VLBW-NEXT: vpandn %xmm3, %xmm2, %xmm4 352; AVX512VLBW-NEXT: vpsrld $1, %xmm1, %xmm1 353; AVX512VLBW-NEXT: vpsrlvd %xmm4, %xmm1, %xmm1 354; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm2 355; AVX512VLBW-NEXT: vpsllvd %xmm2, %xmm0, %xmm0 356; AVX512VLBW-NEXT: vpor %xmm1, %xmm0, %xmm0 357; AVX512VLBW-NEXT: retq 358; 359; AVX512VLVBMI2-LABEL: var_funnnel_v4i32: 360; AVX512VLVBMI2: # %bb.0: 361; AVX512VLVBMI2-NEXT: vpshldvd %xmm2, %xmm1, %xmm0 362; AVX512VLVBMI2-NEXT: retq 363; 364; XOPAVX1-LABEL: var_funnnel_v4i32: 365; XOPAVX1: # %bb.0: 366; XOPAVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [31,31,31,31] 367; XOPAVX1-NEXT: vpand %xmm3, %xmm2, %xmm4 368; XOPAVX1-NEXT: vpshld %xmm4, %xmm0, %xmm0 369; XOPAVX1-NEXT: vpandn %xmm3, %xmm2, %xmm2 370; XOPAVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 371; XOPAVX1-NEXT: vpsubd %xmm2, %xmm3, %xmm2 372; XOPAVX1-NEXT: vpsrld $1, %xmm1, %xmm1 373; XOPAVX1-NEXT: vpshld %xmm2, %xmm1, %xmm1 374; XOPAVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 375; XOPAVX1-NEXT: retq 376; 377; XOPAVX2-LABEL: var_funnnel_v4i32: 378; XOPAVX2: # %bb.0: 379; XOPAVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31] 380; XOPAVX2-NEXT: vpandn %xmm3, %xmm2, %xmm4 381; XOPAVX2-NEXT: vpsrld $1, %xmm1, %xmm1 382; XOPAVX2-NEXT: vpsrlvd %xmm4, %xmm1, %xmm1 383; XOPAVX2-NEXT: vpand %xmm3, %xmm2, %xmm2 384; XOPAVX2-NEXT: vpsllvd %xmm2, %xmm0, %xmm0 385; XOPAVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 386; XOPAVX2-NEXT: retq 387; 388; X86-SSE2-LABEL: var_funnnel_v4i32: 389; X86-SSE2: # %bb.0: 390; X86-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [31,31,31,31] 391; X86-SSE2-NEXT: movdqa %xmm2, %xmm5 392; X86-SSE2-NEXT: pandn %xmm4, %xmm5 393; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm5[2,3,3,3,4,5,6,7] 394; X86-SSE2-NEXT: psrld $1, %xmm1 395; X86-SSE2-NEXT: movdqa %xmm1, %xmm6 396; X86-SSE2-NEXT: psrld %xmm3, %xmm6 397; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm7 = xmm5[0,1,1,1,4,5,6,7] 398; X86-SSE2-NEXT: movdqa %xmm1, %xmm3 399; X86-SSE2-NEXT: psrld %xmm7, %xmm3 400; X86-SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm6[0] 401; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,3,2,3] 402; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm5[2,3,3,3,4,5,6,7] 403; X86-SSE2-NEXT: movdqa %xmm1, %xmm7 404; X86-SSE2-NEXT: psrld %xmm6, %xmm7 405; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,1,1,4,5,6,7] 406; X86-SSE2-NEXT: psrld %xmm5, %xmm1 407; X86-SSE2-NEXT: punpckhqdq {{.*#+}} xmm1 = 
xmm1[1],xmm7[1] 408; X86-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm1[0,3] 409; X86-SSE2-NEXT: pand %xmm4, %xmm2 410; X86-SSE2-NEXT: pslld $23, %xmm2 411; X86-SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2 412; X86-SSE2-NEXT: cvttps2dq %xmm2, %xmm1 413; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] 414; X86-SSE2-NEXT: pmuludq %xmm1, %xmm0 415; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 416; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] 417; X86-SSE2-NEXT: pmuludq %xmm2, %xmm1 418; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 419; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 420; X86-SSE2-NEXT: por %xmm3, %xmm0 421; X86-SSE2-NEXT: retl 422 %res = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %amt) 423 ret <4 x i32> %res 424} 425 426define <8 x i16> @var_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %amt) nounwind { 427; SSE2-LABEL: var_funnnel_v8i16: 428; SSE2: # %bb.0: 429; SSE2-NEXT: movdqa %xmm1, %xmm3 430; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] 431; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 432; SSE2-NEXT: movdqa %xmm2, %xmm4 433; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4,4,5,5,6,6,7,7] 434; SSE2-NEXT: pslld $23, %xmm4 435; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [1065353216,1065353216,1065353216,1065353216] 436; SSE2-NEXT: paddd %xmm5, %xmm4 437; SSE2-NEXT: cvttps2dq %xmm4, %xmm4 438; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm3[1,1,3,3] 439; SSE2-NEXT: pmuludq %xmm4, %xmm3 440; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] 441; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] 442; SSE2-NEXT: pmuludq %xmm6, %xmm4 443; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] 444; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] 445; SSE2-NEXT: psrad $16, %xmm3 446; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] 447; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3] 448; SSE2-NEXT: pslld $23, %xmm2 449; SSE2-NEXT: paddd %xmm5, %xmm2 450; SSE2-NEXT: cvttps2dq %xmm2, %xmm2 451; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3] 452; SSE2-NEXT: pmuludq %xmm2, %xmm1 453; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3] 454; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] 455; SSE2-NEXT: pmuludq %xmm4, %xmm1 456; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 457; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 458; SSE2-NEXT: psrad $16, %xmm0 459; SSE2-NEXT: packssdw %xmm3, %xmm0 460; SSE2-NEXT: retq 461; 462; SSE41-LABEL: var_funnnel_v8i16: 463; SSE41: # %bb.0: 464; SSE41-NEXT: movdqa %xmm1, %xmm3 465; SSE41-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] 466; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 467; SSE41-NEXT: pmovzxwd {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero 468; SSE41-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7] 469; SSE41-NEXT: pslld $23, %xmm2 470; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [1065353216,1065353216,1065353216,1065353216] 471; SSE41-NEXT: paddd %xmm5, %xmm2 472; SSE41-NEXT: cvttps2dq %xmm2, %xmm2 473; SSE41-NEXT: pmulld %xmm3, %xmm2 474; SSE41-NEXT: psrld $16, %xmm2 475; SSE41-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] 476; SSE41-NEXT: pslld $23, %xmm4 477; SSE41-NEXT: paddd %xmm5, %xmm4 478; SSE41-NEXT: cvttps2dq %xmm4, %xmm0 479; SSE41-NEXT: pmulld %xmm1, %xmm0 480; SSE41-NEXT: 
psrld $16, %xmm0 481; SSE41-NEXT: packusdw %xmm2, %xmm0 482; SSE41-NEXT: retq 483; 484; AVX1-LABEL: var_funnnel_v8i16: 485; AVX1: # %bb.0: 486; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] 487; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 488; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm2[4,4,5,5,6,6,7,7] 489; AVX1-NEXT: vpslld $23, %xmm4, %xmm4 490; AVX1-NEXT: vbroadcastss {{.*#+}} xmm5 = [1065353216,1065353216,1065353216,1065353216] 491; AVX1-NEXT: vpaddd %xmm5, %xmm4, %xmm4 492; AVX1-NEXT: vcvttps2dq %xmm4, %xmm4 493; AVX1-NEXT: vpmulld %xmm4, %xmm3, %xmm3 494; AVX1-NEXT: vpsrld $16, %xmm3, %xmm3 495; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] 496; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero 497; AVX1-NEXT: vpslld $23, %xmm1, %xmm1 498; AVX1-NEXT: vpaddd %xmm5, %xmm1, %xmm1 499; AVX1-NEXT: vcvttps2dq %xmm1, %xmm1 500; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0 501; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0 502; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 503; AVX1-NEXT: retq 504; 505; AVX2-LABEL: var_funnnel_v8i16: 506; AVX2: # %bb.0: 507; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero 508; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 509; AVX2-NEXT: vpslld $16, %ymm0, %ymm0 510; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7],ymm1[8],ymm0[9],ymm1[10],ymm0[11],ymm1[12],ymm0[13],ymm1[14],ymm0[15] 511; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm1 512; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero 513; AVX2-NEXT: vpsllvd %ymm1, %ymm0, %ymm0 514; AVX2-NEXT: vpsrld $16, %ymm0, %ymm0 515; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 516; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 517; AVX2-NEXT: vzeroupper 518; AVX2-NEXT: retq 519; 520; AVX512F-LABEL: var_funnnel_v8i16: 521; AVX512F: # %bb.0: 522; AVX512F-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero 523; AVX512F-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 524; AVX512F-NEXT: vpslld $16, %ymm0, %ymm0 525; AVX512F-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7],ymm1[8],ymm0[9],ymm1[10],ymm0[11],ymm1[12],ymm0[13],ymm1[14],ymm0[15] 526; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm1 527; AVX512F-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero 528; AVX512F-NEXT: vpsllvd %ymm1, %ymm0, %ymm0 529; AVX512F-NEXT: vpsrld $16, %ymm0, %ymm0 530; AVX512F-NEXT: vpmovdw %zmm0, %ymm0 531; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 532; AVX512F-NEXT: vzeroupper 533; AVX512F-NEXT: retq 534; 535; AVX512VL-LABEL: var_funnnel_v8i16: 536; AVX512VL: # %bb.0: 537; AVX512VL-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero 538; AVX512VL-NEXT: vpmovzxwd {{.*#+}} ymm0 = 
xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 539; AVX512VL-NEXT: vpslld $16, %ymm0, %ymm0 540; AVX512VL-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7],ymm1[8],ymm0[9],ymm1[10],ymm0[11],ymm1[12],ymm0[13],ymm1[14],ymm0[15] 541; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm2, %xmm1 542; AVX512VL-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero 543; AVX512VL-NEXT: vpsllvd %ymm1, %ymm0, %ymm0 544; AVX512VL-NEXT: vpsrld $16, %ymm0, %ymm0 545; AVX512VL-NEXT: vpmovdw %ymm0, %xmm0 546; AVX512VL-NEXT: vzeroupper 547; AVX512VL-NEXT: retq 548; 549; AVX512BW-LABEL: var_funnnel_v8i16: 550; AVX512BW: # %bb.0: 551; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 552; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15] 553; AVX512BW-NEXT: vpandn %xmm3, %xmm2, %xmm4 554; AVX512BW-NEXT: vpsrlw $1, %xmm1, %xmm1 555; AVX512BW-NEXT: vpsrlvw %zmm4, %zmm1, %zmm1 556; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm2 557; AVX512BW-NEXT: vpsllvw %zmm2, %zmm0, %zmm0 558; AVX512BW-NEXT: vpor %xmm1, %xmm0, %xmm0 559; AVX512BW-NEXT: vzeroupper 560; AVX512BW-NEXT: retq 561; 562; AVX512VBMI2-LABEL: var_funnnel_v8i16: 563; AVX512VBMI2: # %bb.0: 564; AVX512VBMI2-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2 565; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 566; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 567; AVX512VBMI2-NEXT: vpshldvw %zmm2, %zmm1, %zmm0 568; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 569; AVX512VBMI2-NEXT: vzeroupper 570; AVX512VBMI2-NEXT: retq 571; 572; AVX512VLBW-LABEL: var_funnnel_v8i16: 573; AVX512VLBW: # %bb.0: 574; AVX512VLBW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15] 575; AVX512VLBW-NEXT: vpandn %xmm3, %xmm2, %xmm4 576; AVX512VLBW-NEXT: vpsrlw $1, %xmm1, %xmm1 577; AVX512VLBW-NEXT: vpsrlvw %xmm4, %xmm1, %xmm1 578; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm2 579; AVX512VLBW-NEXT: vpsllvw %xmm2, %xmm0, %xmm0 580; AVX512VLBW-NEXT: vpor %xmm1, %xmm0, %xmm0 581; AVX512VLBW-NEXT: retq 582; 583; AVX512VLVBMI2-LABEL: var_funnnel_v8i16: 584; AVX512VLVBMI2: # %bb.0: 585; AVX512VLVBMI2-NEXT: vpshldvw %xmm2, %xmm1, %xmm0 586; AVX512VLVBMI2-NEXT: retq 587; 588; XOPAVX1-LABEL: var_funnnel_v8i16: 589; XOPAVX1: # %bb.0: 590; XOPAVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15] 591; XOPAVX1-NEXT: vpand %xmm3, %xmm2, %xmm4 592; XOPAVX1-NEXT: vpshlw %xmm4, %xmm0, %xmm0 593; XOPAVX1-NEXT: vpandn %xmm3, %xmm2, %xmm2 594; XOPAVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 595; XOPAVX1-NEXT: vpsubw %xmm2, %xmm3, %xmm2 596; XOPAVX1-NEXT: vpsrlw $1, %xmm1, %xmm1 597; XOPAVX1-NEXT: vpshlw %xmm2, %xmm1, %xmm1 598; XOPAVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 599; XOPAVX1-NEXT: retq 600; 601; XOPAVX2-LABEL: var_funnnel_v8i16: 602; XOPAVX2: # %bb.0: 603; XOPAVX2-NEXT: vpbroadcastw {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15] 604; XOPAVX2-NEXT: vpand %xmm3, %xmm2, %xmm4 605; XOPAVX2-NEXT: vpshlw %xmm4, %xmm0, %xmm0 606; XOPAVX2-NEXT: vpandn %xmm3, %xmm2, %xmm2 607; XOPAVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 608; XOPAVX2-NEXT: vpsubw %xmm2, %xmm3, %xmm2 609; XOPAVX2-NEXT: vpsrlw $1, %xmm1, %xmm1 610; XOPAVX2-NEXT: vpshlw %xmm2, %xmm1, %xmm1 611; XOPAVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 612; XOPAVX2-NEXT: retq 613; 614; X86-SSE2-LABEL: var_funnnel_v8i16: 615; X86-SSE2: # %bb.0: 616; X86-SSE2-NEXT: movdqa %xmm1, %xmm3 617; X86-SSE2-NEXT: punpckhwd 
{{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] 618; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2 619; X86-SSE2-NEXT: movdqa %xmm2, %xmm5 620; X86-SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4,4,5,5,6,6,7,7] 621; X86-SSE2-NEXT: pslld $23, %xmm5 622; X86-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216] 623; X86-SSE2-NEXT: paddd %xmm4, %xmm5 624; X86-SSE2-NEXT: cvttps2dq %xmm5, %xmm5 625; X86-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm3[1,1,3,3] 626; X86-SSE2-NEXT: pmuludq %xmm5, %xmm3 627; X86-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] 628; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] 629; X86-SSE2-NEXT: pmuludq %xmm6, %xmm5 630; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3] 631; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1] 632; X86-SSE2-NEXT: psrad $16, %xmm3 633; X86-SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] 634; X86-SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3] 635; X86-SSE2-NEXT: pslld $23, %xmm2 636; X86-SSE2-NEXT: paddd %xmm4, %xmm2 637; X86-SSE2-NEXT: cvttps2dq %xmm2, %xmm2 638; X86-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3] 639; X86-SSE2-NEXT: pmuludq %xmm2, %xmm1 640; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3] 641; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] 642; X86-SSE2-NEXT: pmuludq %xmm4, %xmm1 643; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 644; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 645; X86-SSE2-NEXT: psrad $16, %xmm0 646; X86-SSE2-NEXT: packssdw %xmm3, %xmm0 647; X86-SSE2-NEXT: retl 648 %res = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %amt) 649 ret <8 x i16> %res 650} 651 652define <16 x i8> @var_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %amt) nounwind { 653; SSE2-LABEL: var_funnnel_v16i8: 654; SSE2: # %bb.0: 655; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 656; SSE2-NEXT: pxor %xmm5, %xmm5 657; SSE2-NEXT: movdqa %xmm2, %xmm4 658; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm5[8],xmm4[9],xmm5[9],xmm4[10],xmm5[10],xmm4[11],xmm5[11],xmm4[12],xmm5[12],xmm4[13],xmm5[13],xmm4[14],xmm5[14],xmm4[15],xmm5[15] 659; SSE2-NEXT: movdqa %xmm4, %xmm6 660; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4,4,5,5,6,6,7,7] 661; SSE2-NEXT: pslld $23, %xmm6 662; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216] 663; SSE2-NEXT: paddd %xmm3, %xmm6 664; SSE2-NEXT: cvttps2dq %xmm6, %xmm6 665; SSE2-NEXT: pslld $16, %xmm6 666; SSE2-NEXT: psrad $16, %xmm6 667; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3] 668; SSE2-NEXT: pslld $23, %xmm4 669; SSE2-NEXT: paddd %xmm3, %xmm4 670; SSE2-NEXT: cvttps2dq %xmm4, %xmm7 671; SSE2-NEXT: pslld $16, %xmm7 672; SSE2-NEXT: psrad $16, %xmm7 673; SSE2-NEXT: packssdw %xmm6, %xmm7 674; SSE2-NEXT: movdqa %xmm1, %xmm4 675; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15] 676; SSE2-NEXT: pmullw %xmm7, %xmm4 677; SSE2-NEXT: psrlw $8, %xmm4 678; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3],xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7] 679; SSE2-NEXT: movdqa %xmm2, %xmm5 680; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4,4,5,5,6,6,7,7] 681; SSE2-NEXT: pslld $23, %xmm5 682; SSE2-NEXT: paddd %xmm3, %xmm5 683; SSE2-NEXT: cvttps2dq %xmm5, %xmm5 684; 
SSE2-NEXT: pslld $16, %xmm5 685; SSE2-NEXT: psrad $16, %xmm5 686; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3] 687; SSE2-NEXT: pslld $23, %xmm2 688; SSE2-NEXT: paddd %xmm3, %xmm2 689; SSE2-NEXT: cvttps2dq %xmm2, %xmm2 690; SSE2-NEXT: pslld $16, %xmm2 691; SSE2-NEXT: psrad $16, %xmm2 692; SSE2-NEXT: packssdw %xmm5, %xmm2 693; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] 694; SSE2-NEXT: pmullw %xmm1, %xmm2 695; SSE2-NEXT: psrlw $8, %xmm2 696; SSE2-NEXT: packuswb %xmm4, %xmm2 697; SSE2-NEXT: movdqa %xmm2, %xmm0 698; SSE2-NEXT: retq 699; 700; SSE41-LABEL: var_funnnel_v16i8: 701; SSE41: # %bb.0: 702; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 703; SSE41-NEXT: pxor %xmm3, %xmm3 704; SSE41-NEXT: pmovzxbd {{.*#+}} xmm4 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero 705; SSE41-NEXT: pmovzxbw {{.*#+}} xmm5 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero 706; SSE41-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15] 707; SSE41-NEXT: pmovzxwd {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero 708; SSE41-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7] 709; SSE41-NEXT: pslld $23, %xmm2 710; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [1065353216,1065353216,1065353216,1065353216] 711; SSE41-NEXT: paddd %xmm6, %xmm2 712; SSE41-NEXT: cvttps2dq %xmm2, %xmm2 713; SSE41-NEXT: pslld $23, %xmm3 714; SSE41-NEXT: paddd %xmm6, %xmm3 715; SSE41-NEXT: cvttps2dq %xmm3, %xmm3 716; SSE41-NEXT: packusdw %xmm2, %xmm3 717; SSE41-NEXT: movdqa %xmm1, %xmm7 718; SSE41-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm0[8],xmm7[9],xmm0[9],xmm7[10],xmm0[10],xmm7[11],xmm0[11],xmm7[12],xmm0[12],xmm7[13],xmm0[13],xmm7[14],xmm0[14],xmm7[15],xmm0[15] 719; SSE41-NEXT: pmullw %xmm3, %xmm7 720; SSE41-NEXT: psrlw $8, %xmm7 721; SSE41-NEXT: pslld $23, %xmm4 722; SSE41-NEXT: paddd %xmm6, %xmm4 723; SSE41-NEXT: cvttps2dq %xmm4, %xmm2 724; SSE41-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4,4,5,5,6,6,7,7] 725; SSE41-NEXT: pslld $23, %xmm5 726; SSE41-NEXT: paddd %xmm6, %xmm5 727; SSE41-NEXT: cvttps2dq %xmm5, %xmm3 728; SSE41-NEXT: packusdw %xmm3, %xmm2 729; SSE41-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] 730; SSE41-NEXT: pmullw %xmm1, %xmm2 731; SSE41-NEXT: psrlw $8, %xmm2 732; SSE41-NEXT: packuswb %xmm7, %xmm2 733; SSE41-NEXT: movdqa %xmm2, %xmm0 734; SSE41-NEXT: retq 735; 736; AVX1-LABEL: var_funnnel_v16i8: 737; AVX1: # %bb.0: 738; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 739; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 740; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15] 741; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm3[4,4,5,5,6,6,7,7] 742; AVX1-NEXT: vpslld $23, %xmm4, %xmm4 743; AVX1-NEXT: vbroadcastss {{.*#+}} xmm5 = [1065353216,1065353216,1065353216,1065353216] 744; AVX1-NEXT: vpaddd %xmm5, %xmm4, %xmm4 745; AVX1-NEXT: vcvttps2dq %xmm4, %xmm4 746; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero 747; AVX1-NEXT: vpslld $23, %xmm3, %xmm3 748; AVX1-NEXT: vpaddd %xmm5, %xmm3, 
%xmm3 749; AVX1-NEXT: vcvttps2dq %xmm3, %xmm3 750; AVX1-NEXT: vpackusdw %xmm4, %xmm3, %xmm3 751; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] 752; AVX1-NEXT: vpmullw %xmm3, %xmm4, %xmm3 753; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3 754; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero 755; AVX1-NEXT: vpslld $23, %xmm4, %xmm4 756; AVX1-NEXT: vpaddd %xmm5, %xmm4, %xmm4 757; AVX1-NEXT: vcvttps2dq %xmm4, %xmm4 758; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero 759; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7] 760; AVX1-NEXT: vpslld $23, %xmm2, %xmm2 761; AVX1-NEXT: vpaddd %xmm5, %xmm2, %xmm2 762; AVX1-NEXT: vcvttps2dq %xmm2, %xmm2 763; AVX1-NEXT: vpackusdw %xmm2, %xmm4, %xmm2 764; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] 765; AVX1-NEXT: vpmullw %xmm2, %xmm0, %xmm0 766; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0 767; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0 768; AVX1-NEXT: retq 769; 770; AVX2-LABEL: var_funnnel_v16i8: 771; AVX2: # %bb.0: 772; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] 773; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero 774; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 775; AVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4 776; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm2[8],xmm4[8],xmm2[9],xmm4[9],xmm2[10],xmm4[10],xmm2[11],xmm4[11],xmm2[12],xmm4[12],xmm2[13],xmm4[13],xmm2[14],xmm4[14],xmm2[15],xmm4[15] 777; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero 778; AVX2-NEXT: vpsllvd %ymm4, %ymm3, %ymm3 779; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] 780; AVX2-NEXT: vpshufb %ymm4, %ymm3, %ymm3 781; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] 782; AVX2-NEXT: vpsrlw $8, %xmm3, %xmm3 783; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] 784; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 785; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero 786; AVX2-NEXT: vpsllvd %ymm1, %ymm0, %ymm0 787; AVX2-NEXT: vpshufb %ymm4, %ymm0, %ymm0 788; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 789; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm0 790; AVX2-NEXT: vpackuswb %xmm3, %xmm0, %xmm0 791; AVX2-NEXT: vzeroupper 792; AVX2-NEXT: retq 793; 794; AVX512F-LABEL: var_funnnel_v16i8: 795; AVX512F: # %bb.0: 796; AVX512F-NEXT: vpbroadcastb {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] 797; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm4 798; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm4 = 
xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero,xmm4[4],zero,zero,zero,xmm4[5],zero,zero,zero,xmm4[6],zero,zero,zero,xmm4[7],zero,zero,zero,xmm4[8],zero,zero,zero,xmm4[9],zero,zero,zero,xmm4[10],zero,zero,zero,xmm4[11],zero,zero,zero,xmm4[12],zero,zero,zero,xmm4[13],zero,zero,zero,xmm4[14],zero,zero,zero,xmm4[15],zero,zero,zero 799; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero 800; AVX512F-NEXT: vpsllvd %zmm4, %zmm0, %zmm0 801; AVX512F-NEXT: vpandn %xmm3, %xmm2, %xmm2 802; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero 803; AVX512F-NEXT: vpsrlw $1, %xmm1, %xmm1 804; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 805; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero 806; AVX512F-NEXT: vpsrlvd %zmm2, %zmm1, %zmm1 807; AVX512F-NEXT: vpord %zmm1, %zmm0, %zmm0 808; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 809; AVX512F-NEXT: vzeroupper 810; AVX512F-NEXT: retq 811; 812; AVX512VL-LABEL: var_funnnel_v16i8: 813; AVX512VL: # %bb.0: 814; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] 815; AVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm4 816; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero,xmm4[4],zero,zero,zero,xmm4[5],zero,zero,zero,xmm4[6],zero,zero,zero,xmm4[7],zero,zero,zero,xmm4[8],zero,zero,zero,xmm4[9],zero,zero,zero,xmm4[10],zero,zero,zero,xmm4[11],zero,zero,zero,xmm4[12],zero,zero,zero,xmm4[13],zero,zero,zero,xmm4[14],zero,zero,zero,xmm4[15],zero,zero,zero 817; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero 818; AVX512VL-NEXT: vpsllvd %zmm4, %zmm0, %zmm0 819; AVX512VL-NEXT: vpandn %xmm3, %xmm2, %xmm2 820; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm2 = 
xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero 821; AVX512VL-NEXT: vpsrlw $1, %xmm1, %xmm1 822; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1 823; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero 824; AVX512VL-NEXT: vpsrlvd %zmm2, %zmm1, %zmm1 825; AVX512VL-NEXT: vpord %zmm1, %zmm0, %zmm0 826; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0 827; AVX512VL-NEXT: vzeroupper 828; AVX512VL-NEXT: retq 829; 830; AVX512BW-LABEL: var_funnnel_v16i8: 831; AVX512BW: # %bb.0: 832; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 833; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero 834; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero 835; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero 836; AVX512BW-NEXT: vpsllw $8, %ymm0, %ymm0 837; AVX512BW-NEXT: vpor %ymm1, %ymm0, %ymm0 838; AVX512BW-NEXT: vpsllvw %zmm2, %zmm0, %zmm0 839; AVX512BW-NEXT: vpsrlw $8, %ymm0, %ymm0 840; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 841; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 842; AVX512BW-NEXT: vzeroupper 843; AVX512BW-NEXT: retq 844; 845; AVX512VBMI2-LABEL: var_funnnel_v16i8: 846; AVX512VBMI2: # %bb.0: 847; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 848; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 849; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,64,1,65,2,66,3,67,4,68,5,69,6,70,7,71,8,72,9,73,10,74,11,75,12,76,13,77,14,78,15,79] 850; AVX512VBMI2-NEXT: vpermt2b %zmm0, %zmm3, %zmm1 851; AVX512VBMI2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm0 852; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero 853; AVX512VBMI2-NEXT: vpsllvw %zmm0, %zmm1, %zmm0 854; AVX512VBMI2-NEXT: vpsrlw $8, %ymm0, %ymm0 855; AVX512VBMI2-NEXT: vpmovwb %zmm0, %ymm0 856; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 857; AVX512VBMI2-NEXT: vzeroupper 858; AVX512VBMI2-NEXT: retq 859; 860; AVX512VLBW-LABEL: var_funnnel_v16i8: 861; AVX512VLBW: # %bb.0: 862; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm1 = 
xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero 863; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero 864; AVX512VLBW-NEXT: vpsllw $8, %ymm0, %ymm0 865; AVX512VLBW-NEXT: vpor %ymm1, %ymm0, %ymm0 866; AVX512VLBW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm2, %xmm1 867; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero 868; AVX512VLBW-NEXT: vpsllvw %ymm1, %ymm0, %ymm0 869; AVX512VLBW-NEXT: vpsrlw $8, %ymm0, %ymm0 870; AVX512VLBW-NEXT: vpmovwb %ymm0, %xmm0 871; AVX512VLBW-NEXT: vzeroupper 872; AVX512VLBW-NEXT: retq 873; 874; AVX512VLVBMI2-LABEL: var_funnnel_v16i8: 875; AVX512VLVBMI2: # %bb.0: 876; AVX512VLVBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1 877; AVX512VLVBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 878; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,32,1,33,2,34,3,35,4,36,5,37,6,38,7,39,8,40,9,41,10,42,11,43,12,44,13,45,14,46,15,47] 879; AVX512VLVBMI2-NEXT: vpermi2b %ymm0, %ymm1, %ymm3 880; AVX512VLVBMI2-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm2, %xmm0 881; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero 882; AVX512VLVBMI2-NEXT: vpsllvw %ymm0, %ymm3, %ymm0 883; AVX512VLVBMI2-NEXT: vpsrlw $8, %ymm0, %ymm0 884; AVX512VLVBMI2-NEXT: vpmovwb %ymm0, %xmm0 885; AVX512VLVBMI2-NEXT: vzeroupper 886; AVX512VLVBMI2-NEXT: retq 887; 888; XOPAVX1-LABEL: var_funnnel_v16i8: 889; XOPAVX1: # %bb.0: 890; XOPAVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 891; XOPAVX1-NEXT: vpshlb %xmm3, %xmm1, %xmm1 892; XOPAVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] 893; XOPAVX1-NEXT: vpandn %xmm3, %xmm2, %xmm4 894; XOPAVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5 895; XOPAVX1-NEXT: vpsubb %xmm4, %xmm5, %xmm4 896; XOPAVX1-NEXT: vpshlb %xmm4, %xmm1, %xmm1 897; XOPAVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 898; XOPAVX1-NEXT: vpshlb %xmm2, %xmm0, %xmm0 899; XOPAVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 900; XOPAVX1-NEXT: retq 901; 902; XOPAVX2-LABEL: var_funnnel_v16i8: 903; XOPAVX2: # %bb.0: 904; XOPAVX2-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 905; XOPAVX2-NEXT: vpshlb %xmm3, %xmm1, %xmm1 906; XOPAVX2-NEXT: vpbroadcastb {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] 907; XOPAVX2-NEXT: vpandn %xmm3, %xmm2, %xmm4 908; XOPAVX2-NEXT: vpxor %xmm5, %xmm5, %xmm5 909; XOPAVX2-NEXT: vpsubb %xmm4, %xmm5, %xmm4 910; XOPAVX2-NEXT: vpshlb %xmm4, %xmm1, %xmm1 911; XOPAVX2-NEXT: vpand %xmm3, %xmm2, %xmm2 912; XOPAVX2-NEXT: vpshlb %xmm2, %xmm0, %xmm0 913; XOPAVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 914; XOPAVX2-NEXT: retq 915; 916; X86-SSE2-LABEL: var_funnnel_v16i8: 917; X86-SSE2: # %bb.0: 918; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2 919; X86-SSE2-NEXT: pxor %xmm5, %xmm5 920; X86-SSE2-NEXT: movdqa %xmm2, %xmm4 921; X86-SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = 
xmm4[8],xmm5[8],xmm4[9],xmm5[9],xmm4[10],xmm5[10],xmm4[11],xmm5[11],xmm4[12],xmm5[12],xmm4[13],xmm5[13],xmm4[14],xmm5[14],xmm4[15],xmm5[15] 922; X86-SSE2-NEXT: movdqa %xmm4, %xmm6 923; X86-SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4,4,5,5,6,6,7,7] 924; X86-SSE2-NEXT: pslld $23, %xmm6 925; X86-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216] 926; X86-SSE2-NEXT: paddd %xmm3, %xmm6 927; X86-SSE2-NEXT: cvttps2dq %xmm6, %xmm6 928; X86-SSE2-NEXT: pslld $16, %xmm6 929; X86-SSE2-NEXT: psrad $16, %xmm6 930; X86-SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3] 931; X86-SSE2-NEXT: pslld $23, %xmm4 932; X86-SSE2-NEXT: paddd %xmm3, %xmm4 933; X86-SSE2-NEXT: cvttps2dq %xmm4, %xmm7 934; X86-SSE2-NEXT: pslld $16, %xmm7 935; X86-SSE2-NEXT: psrad $16, %xmm7 936; X86-SSE2-NEXT: packssdw %xmm6, %xmm7 937; X86-SSE2-NEXT: movdqa %xmm1, %xmm4 938; X86-SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15] 939; X86-SSE2-NEXT: pmullw %xmm7, %xmm4 940; X86-SSE2-NEXT: psrlw $8, %xmm4 941; X86-SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3],xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7] 942; X86-SSE2-NEXT: movdqa %xmm2, %xmm5 943; X86-SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4,4,5,5,6,6,7,7] 944; X86-SSE2-NEXT: pslld $23, %xmm5 945; X86-SSE2-NEXT: paddd %xmm3, %xmm5 946; X86-SSE2-NEXT: cvttps2dq %xmm5, %xmm5 947; X86-SSE2-NEXT: pslld $16, %xmm5 948; X86-SSE2-NEXT: psrad $16, %xmm5 949; X86-SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3] 950; X86-SSE2-NEXT: pslld $23, %xmm2 951; X86-SSE2-NEXT: paddd %xmm3, %xmm2 952; X86-SSE2-NEXT: cvttps2dq %xmm2, %xmm2 953; X86-SSE2-NEXT: pslld $16, %xmm2 954; X86-SSE2-NEXT: psrad $16, %xmm2 955; X86-SSE2-NEXT: packssdw %xmm5, %xmm2 956; X86-SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] 957; X86-SSE2-NEXT: pmullw %xmm1, %xmm2 958; X86-SSE2-NEXT: psrlw $8, %xmm2 959; X86-SSE2-NEXT: packuswb %xmm4, %xmm2 960; X86-SSE2-NEXT: movdqa %xmm2, %xmm0 961; X86-SSE2-NEXT: retl 962 %res = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %amt) 963 ret <16 x i8> %res 964} 965 966; 967; Uniform Variable Shifts 968; 969 970define <2 x i64> @splatvar_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt) nounwind { 971; SSE2-LABEL: splatvar_funnnel_v2i64: 972; SSE2: # %bb.0: 973; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [63,63] 974; SSE2-NEXT: movdqa %xmm2, %xmm4 975; SSE2-NEXT: pandn %xmm3, %xmm4 976; SSE2-NEXT: psrlq $1, %xmm1 977; SSE2-NEXT: psrlq %xmm4, %xmm1 978; SSE2-NEXT: pand %xmm3, %xmm2 979; SSE2-NEXT: psllq %xmm2, %xmm0 980; SSE2-NEXT: por %xmm1, %xmm0 981; SSE2-NEXT: retq 982; 983; SSE41-LABEL: splatvar_funnnel_v2i64: 984; SSE41: # %bb.0: 985; SSE41-NEXT: pmovsxbq {{.*#+}} xmm3 = [63,63] 986; SSE41-NEXT: movdqa %xmm2, %xmm4 987; SSE41-NEXT: pandn %xmm3, %xmm4 988; SSE41-NEXT: psrlq $1, %xmm1 989; SSE41-NEXT: psrlq %xmm4, %xmm1 990; SSE41-NEXT: pand %xmm3, %xmm2 991; SSE41-NEXT: psllq %xmm2, %xmm0 992; SSE41-NEXT: por %xmm1, %xmm0 993; SSE41-NEXT: retq 994; 995; AVX-LABEL: splatvar_funnnel_v2i64: 996; AVX: # %bb.0: 997; AVX-NEXT: vpmovsxbq {{.*#+}} xmm3 = [63,63] 998; AVX-NEXT: vpandn %xmm3, %xmm2, %xmm4 999; AVX-NEXT: vpsrlq $1, %xmm1, %xmm1 1000; AVX-NEXT: vpsrlq %xmm4, %xmm1, %xmm1 1001; AVX-NEXT: vpand %xmm3, 
%xmm2, %xmm2 1002; AVX-NEXT: vpsllq %xmm2, %xmm0, %xmm0 1003; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 1004; AVX-NEXT: retq 1005; 1006; AVX512F-LABEL: splatvar_funnnel_v2i64: 1007; AVX512F: # %bb.0: 1008; AVX512F-NEXT: vpmovsxbq {{.*#+}} xmm3 = [63,63] 1009; AVX512F-NEXT: vpandn %xmm3, %xmm2, %xmm4 1010; AVX512F-NEXT: vpsrlq $1, %xmm1, %xmm1 1011; AVX512F-NEXT: vpsrlq %xmm4, %xmm1, %xmm1 1012; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm2 1013; AVX512F-NEXT: vpsllq %xmm2, %xmm0, %xmm0 1014; AVX512F-NEXT: vpor %xmm1, %xmm0, %xmm0 1015; AVX512F-NEXT: retq 1016; 1017; AVX512VL-LABEL: splatvar_funnnel_v2i64: 1018; AVX512VL: # %bb.0: 1019; AVX512VL-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63] 1020; AVX512VL-NEXT: vpandn %xmm3, %xmm2, %xmm4 1021; AVX512VL-NEXT: vpsrlq $1, %xmm1, %xmm1 1022; AVX512VL-NEXT: vpsrlq %xmm4, %xmm1, %xmm1 1023; AVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm2 1024; AVX512VL-NEXT: vpsllq %xmm2, %xmm0, %xmm0 1025; AVX512VL-NEXT: vpor %xmm1, %xmm0, %xmm0 1026; AVX512VL-NEXT: retq 1027; 1028; AVX512BW-LABEL: splatvar_funnnel_v2i64: 1029; AVX512BW: # %bb.0: 1030; AVX512BW-NEXT: vpmovsxbq {{.*#+}} xmm3 = [63,63] 1031; AVX512BW-NEXT: vpandn %xmm3, %xmm2, %xmm4 1032; AVX512BW-NEXT: vpsrlq $1, %xmm1, %xmm1 1033; AVX512BW-NEXT: vpsrlq %xmm4, %xmm1, %xmm1 1034; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm2 1035; AVX512BW-NEXT: vpsllq %xmm2, %xmm0, %xmm0 1036; AVX512BW-NEXT: vpor %xmm1, %xmm0, %xmm0 1037; AVX512BW-NEXT: retq 1038; 1039; AVX512VBMI2-LABEL: splatvar_funnnel_v2i64: 1040; AVX512VBMI2: # %bb.0: 1041; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 1042; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 1043; AVX512VBMI2-NEXT: vpbroadcastq %xmm2, %xmm2 1044; AVX512VBMI2-NEXT: vpshldvq %zmm2, %zmm1, %zmm0 1045; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 1046; AVX512VBMI2-NEXT: vzeroupper 1047; AVX512VBMI2-NEXT: retq 1048; 1049; AVX512VLBW-LABEL: splatvar_funnnel_v2i64: 1050; AVX512VLBW: # %bb.0: 1051; AVX512VLBW-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63] 1052; AVX512VLBW-NEXT: vpandn %xmm3, %xmm2, %xmm4 1053; AVX512VLBW-NEXT: vpsrlq $1, %xmm1, %xmm1 1054; AVX512VLBW-NEXT: vpsrlq %xmm4, %xmm1, %xmm1 1055; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm2 1056; AVX512VLBW-NEXT: vpsllq %xmm2, %xmm0, %xmm0 1057; AVX512VLBW-NEXT: vpor %xmm1, %xmm0, %xmm0 1058; AVX512VLBW-NEXT: retq 1059; 1060; AVX512VLVBMI2-LABEL: splatvar_funnnel_v2i64: 1061; AVX512VLVBMI2: # %bb.0: 1062; AVX512VLVBMI2-NEXT: vpbroadcastq %xmm2, %xmm2 1063; AVX512VLVBMI2-NEXT: vpshldvq %xmm2, %xmm1, %xmm0 1064; AVX512VLVBMI2-NEXT: retq 1065; 1066; XOP-LABEL: splatvar_funnnel_v2i64: 1067; XOP: # %bb.0: 1068; XOP-NEXT: vpmovsxbq {{.*#+}} xmm3 = [63,63] 1069; XOP-NEXT: vpandn %xmm3, %xmm2, %xmm4 1070; XOP-NEXT: vpsrlq $1, %xmm1, %xmm1 1071; XOP-NEXT: vpsrlq %xmm4, %xmm1, %xmm1 1072; XOP-NEXT: vpand %xmm3, %xmm2, %xmm2 1073; XOP-NEXT: vpsllq %xmm2, %xmm0, %xmm0 1074; XOP-NEXT: vpor %xmm1, %xmm0, %xmm0 1075; XOP-NEXT: retq 1076; 1077; X86-SSE2-LABEL: splatvar_funnnel_v2i64: 1078; X86-SSE2: # %bb.0: 1079; X86-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [63,0,63,0] 1080; X86-SSE2-NEXT: movdqa %xmm2, %xmm4 1081; X86-SSE2-NEXT: pandn %xmm3, %xmm4 1082; X86-SSE2-NEXT: psrlq $1, %xmm1 1083; X86-SSE2-NEXT: psrlq %xmm4, %xmm1 1084; X86-SSE2-NEXT: pand %xmm3, %xmm2 1085; X86-SSE2-NEXT: psllq %xmm2, %xmm0 1086; X86-SSE2-NEXT: por %xmm1, %xmm0 1087; X86-SSE2-NEXT: retl 1088 %splat = shufflevector <2 x i64> %amt, <2 x i64> undef, <2 x i32> zeroinitializer 1089 %res = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %x, <2 x i64> %y, <2 x 
i64> %splat) 1090 ret <2 x i64> %res 1091} 1092 1093define <4 x i32> @splatvar_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %amt) nounwind { 1094; SSE-LABEL: splatvar_funnnel_v4i32: 1095; SSE: # %bb.0: 1096; SSE-NEXT: movdqa %xmm1, %xmm3 1097; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm0[2],xmm3[3],xmm0[3] 1098; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 1099; SSE-NEXT: psllq %xmm2, %xmm3 1100; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 1101; SSE-NEXT: psllq %xmm2, %xmm1 1102; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,3],xmm3[1,3] 1103; SSE-NEXT: movaps %xmm1, %xmm0 1104; SSE-NEXT: retq 1105; 1106; AVX-LABEL: splatvar_funnnel_v4i32: 1107; AVX: # %bb.0: 1108; AVX-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] 1109; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 1110; AVX-NEXT: vpsllq %xmm2, %xmm3, %xmm3 1111; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 1112; AVX-NEXT: vpsllq %xmm2, %xmm0, %xmm0 1113; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm3[1,3] 1114; AVX-NEXT: retq 1115; 1116; AVX512F-LABEL: splatvar_funnnel_v4i32: 1117; AVX512F: # %bb.0: 1118; AVX512F-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] 1119; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 1120; AVX512F-NEXT: vpsllq %xmm2, %xmm3, %xmm3 1121; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 1122; AVX512F-NEXT: vpsllq %xmm2, %xmm0, %xmm0 1123; AVX512F-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm3[1,3] 1124; AVX512F-NEXT: retq 1125; 1126; AVX512VL-LABEL: splatvar_funnnel_v4i32: 1127; AVX512VL: # %bb.0: 1128; AVX512VL-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] 1129; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 1130; AVX512VL-NEXT: vpsllq %xmm2, %xmm3, %xmm3 1131; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 1132; AVX512VL-NEXT: vpsllq %xmm2, %xmm0, %xmm0 1133; AVX512VL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm3[1,3] 1134; AVX512VL-NEXT: retq 1135; 1136; AVX512BW-LABEL: splatvar_funnnel_v4i32: 1137; AVX512BW: # %bb.0: 1138; AVX512BW-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] 1139; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 1140; AVX512BW-NEXT: vpsllq %xmm2, %xmm3, %xmm3 1141; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 1142; AVX512BW-NEXT: vpsllq %xmm2, %xmm0, %xmm0 1143; AVX512BW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm3[1,3] 1144; AVX512BW-NEXT: retq 1145; 1146; AVX512VBMI2-LABEL: splatvar_funnnel_v4i32: 1147; AVX512VBMI2: # %bb.0: 1148; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 1149; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 1150; AVX512VBMI2-NEXT: vpbroadcastd %xmm2, %xmm2 1151; AVX512VBMI2-NEXT: vpshldvd %zmm2, %zmm1, %zmm0 1152; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 1153; AVX512VBMI2-NEXT: vzeroupper 1154; AVX512VBMI2-NEXT: retq 1155; 1156; AVX512VLBW-LABEL: splatvar_funnnel_v4i32: 1157; AVX512VLBW: # %bb.0: 1158; AVX512VLBW-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] 1159; AVX512VLBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 1160; AVX512VLBW-NEXT: vpsllq %xmm2, %xmm3, %xmm3 1161; AVX512VLBW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 1162; AVX512VLBW-NEXT: vpsllq %xmm2, %xmm0, %xmm0 1163; AVX512VLBW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm3[1,3] 1164; AVX512VLBW-NEXT: retq 1165; 1166; 
AVX512VLVBMI2-LABEL: splatvar_funnnel_v4i32: 1167; AVX512VLVBMI2: # %bb.0: 1168; AVX512VLVBMI2-NEXT: vpbroadcastd %xmm2, %xmm2 1169; AVX512VLVBMI2-NEXT: vpshldvd %xmm2, %xmm1, %xmm0 1170; AVX512VLVBMI2-NEXT: retq 1171; 1172; XOP-LABEL: splatvar_funnnel_v4i32: 1173; XOP: # %bb.0: 1174; XOP-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] 1175; XOP-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 1176; XOP-NEXT: vpsllq %xmm2, %xmm3, %xmm3 1177; XOP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 1178; XOP-NEXT: vpsllq %xmm2, %xmm0, %xmm0 1179; XOP-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm3[1,3] 1180; XOP-NEXT: retq 1181; 1182; X86-SSE2-LABEL: splatvar_funnnel_v4i32: 1183; X86-SSE2: # %bb.0: 1184; X86-SSE2-NEXT: movdqa %xmm1, %xmm3 1185; X86-SSE2-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm0[2],xmm3[3],xmm0[3] 1186; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2 1187; X86-SSE2-NEXT: psllq %xmm2, %xmm3 1188; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 1189; X86-SSE2-NEXT: psllq %xmm2, %xmm1 1190; X86-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,3],xmm3[1,3] 1191; X86-SSE2-NEXT: movaps %xmm1, %xmm0 1192; X86-SSE2-NEXT: retl 1193 %splat = shufflevector <4 x i32> %amt, <4 x i32> undef, <4 x i32> zeroinitializer 1194 %res = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %splat) 1195 ret <4 x i32> %res 1196} 1197 1198define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %amt) nounwind { 1199; SSE2-LABEL: splatvar_funnnel_v8i16: 1200; SSE2: # %bb.0: 1201; SSE2-NEXT: movd {{.*#+}} xmm3 = [15,0,0,0] 1202; SSE2-NEXT: movdqa %xmm2, %xmm4 1203; SSE2-NEXT: pandn %xmm3, %xmm4 1204; SSE2-NEXT: psrlw $1, %xmm1 1205; SSE2-NEXT: psrlw %xmm4, %xmm1 1206; SSE2-NEXT: pand %xmm3, %xmm2 1207; SSE2-NEXT: psllw %xmm2, %xmm0 1208; SSE2-NEXT: por %xmm1, %xmm0 1209; SSE2-NEXT: retq 1210; 1211; SSE41-LABEL: splatvar_funnnel_v8i16: 1212; SSE41: # %bb.0: 1213; SSE41-NEXT: pmovsxbq {{.*#+}} xmm3 = [15,0] 1214; SSE41-NEXT: movdqa %xmm2, %xmm4 1215; SSE41-NEXT: pandn %xmm3, %xmm4 1216; SSE41-NEXT: psrlw $1, %xmm1 1217; SSE41-NEXT: psrlw %xmm4, %xmm1 1218; SSE41-NEXT: pand %xmm3, %xmm2 1219; SSE41-NEXT: psllw %xmm2, %xmm0 1220; SSE41-NEXT: por %xmm1, %xmm0 1221; SSE41-NEXT: retq 1222; 1223; AVX-LABEL: splatvar_funnnel_v8i16: 1224; AVX: # %bb.0: 1225; AVX-NEXT: vpmovsxbq {{.*#+}} xmm3 = [15,0] 1226; AVX-NEXT: vpandn %xmm3, %xmm2, %xmm4 1227; AVX-NEXT: vpsrlw $1, %xmm1, %xmm1 1228; AVX-NEXT: vpsrlw %xmm4, %xmm1, %xmm1 1229; AVX-NEXT: vpand %xmm3, %xmm2, %xmm2 1230; AVX-NEXT: vpsllw %xmm2, %xmm0, %xmm0 1231; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 1232; AVX-NEXT: retq 1233; 1234; AVX512F-LABEL: splatvar_funnnel_v8i16: 1235; AVX512F: # %bb.0: 1236; AVX512F-NEXT: vpmovsxbq {{.*#+}} xmm3 = [15,0] 1237; AVX512F-NEXT: vpandn %xmm3, %xmm2, %xmm4 1238; AVX512F-NEXT: vpsrlw $1, %xmm1, %xmm1 1239; AVX512F-NEXT: vpsrlw %xmm4, %xmm1, %xmm1 1240; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm2 1241; AVX512F-NEXT: vpsllw %xmm2, %xmm0, %xmm0 1242; AVX512F-NEXT: vpor %xmm1, %xmm0, %xmm0 1243; AVX512F-NEXT: retq 1244; 1245; AVX512VL-LABEL: splatvar_funnnel_v8i16: 1246; AVX512VL: # %bb.0: 1247; AVX512VL-NEXT: vpmovsxbq {{.*#+}} xmm3 = [15,0] 1248; AVX512VL-NEXT: vpandn %xmm3, %xmm2, %xmm4 1249; AVX512VL-NEXT: vpsrlw $1, %xmm1, %xmm1 1250; AVX512VL-NEXT: vpsrlw %xmm4, %xmm1, %xmm1 1251; AVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm2 1252; AVX512VL-NEXT: vpsllw %xmm2, %xmm0, %xmm0 1253; AVX512VL-NEXT: vpor %xmm1, %xmm0, %xmm0 1254; AVX512VL-NEXT: 
retq 1255; 1256; AVX512BW-LABEL: splatvar_funnnel_v8i16: 1257; AVX512BW: # %bb.0: 1258; AVX512BW-NEXT: vpmovsxbq {{.*#+}} xmm3 = [15,0] 1259; AVX512BW-NEXT: vpandn %xmm3, %xmm2, %xmm4 1260; AVX512BW-NEXT: vpsrlw $1, %xmm1, %xmm1 1261; AVX512BW-NEXT: vpsrlw %xmm4, %xmm1, %xmm1 1262; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm2 1263; AVX512BW-NEXT: vpsllw %xmm2, %xmm0, %xmm0 1264; AVX512BW-NEXT: vpor %xmm1, %xmm0, %xmm0 1265; AVX512BW-NEXT: retq 1266; 1267; AVX512VBMI2-LABEL: splatvar_funnnel_v8i16: 1268; AVX512VBMI2: # %bb.0: 1269; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 1270; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 1271; AVX512VBMI2-NEXT: vpbroadcastw %xmm2, %xmm2 1272; AVX512VBMI2-NEXT: vpshldvw %zmm2, %zmm1, %zmm0 1273; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 1274; AVX512VBMI2-NEXT: vzeroupper 1275; AVX512VBMI2-NEXT: retq 1276; 1277; AVX512VLBW-LABEL: splatvar_funnnel_v8i16: 1278; AVX512VLBW: # %bb.0: 1279; AVX512VLBW-NEXT: vpmovsxbq {{.*#+}} xmm3 = [15,0] 1280; AVX512VLBW-NEXT: vpandn %xmm3, %xmm2, %xmm4 1281; AVX512VLBW-NEXT: vpsrlw $1, %xmm1, %xmm1 1282; AVX512VLBW-NEXT: vpsrlw %xmm4, %xmm1, %xmm1 1283; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm2 1284; AVX512VLBW-NEXT: vpsllw %xmm2, %xmm0, %xmm0 1285; AVX512VLBW-NEXT: vpor %xmm1, %xmm0, %xmm0 1286; AVX512VLBW-NEXT: retq 1287; 1288; AVX512VLVBMI2-LABEL: splatvar_funnnel_v8i16: 1289; AVX512VLVBMI2: # %bb.0: 1290; AVX512VLVBMI2-NEXT: vpbroadcastw %xmm2, %xmm2 1291; AVX512VLVBMI2-NEXT: vpshldvw %xmm2, %xmm1, %xmm0 1292; AVX512VLVBMI2-NEXT: retq 1293; 1294; XOP-LABEL: splatvar_funnnel_v8i16: 1295; XOP: # %bb.0: 1296; XOP-NEXT: vpmovsxbq {{.*#+}} xmm3 = [15,0] 1297; XOP-NEXT: vpandn %xmm3, %xmm2, %xmm4 1298; XOP-NEXT: vpsrlw $1, %xmm1, %xmm1 1299; XOP-NEXT: vpsrlw %xmm4, %xmm1, %xmm1 1300; XOP-NEXT: vpand %xmm3, %xmm2, %xmm2 1301; XOP-NEXT: vpsllw %xmm2, %xmm0, %xmm0 1302; XOP-NEXT: vpor %xmm1, %xmm0, %xmm0 1303; XOP-NEXT: retq 1304; 1305; X86-SSE2-LABEL: splatvar_funnnel_v8i16: 1306; X86-SSE2: # %bb.0: 1307; X86-SSE2-NEXT: movd {{.*#+}} xmm3 = [15,0,0,0] 1308; X86-SSE2-NEXT: movdqa %xmm2, %xmm4 1309; X86-SSE2-NEXT: pandn %xmm3, %xmm4 1310; X86-SSE2-NEXT: psrlw $1, %xmm1 1311; X86-SSE2-NEXT: psrlw %xmm4, %xmm1 1312; X86-SSE2-NEXT: pand %xmm3, %xmm2 1313; X86-SSE2-NEXT: psllw %xmm2, %xmm0 1314; X86-SSE2-NEXT: por %xmm1, %xmm0 1315; X86-SSE2-NEXT: retl 1316 %splat = shufflevector <8 x i16> %amt, <8 x i16> undef, <8 x i32> zeroinitializer 1317 %res = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %splat) 1318 ret <8 x i16> %res 1319} 1320 1321define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %amt) nounwind { 1322; SSE-LABEL: splatvar_funnnel_v16i8: 1323; SSE: # %bb.0: 1324; SSE-NEXT: movdqa %xmm1, %xmm3 1325; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15] 1326; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 1327; SSE-NEXT: psllw %xmm2, %xmm3 1328; SSE-NEXT: psrlw $8, %xmm3 1329; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] 1330; SSE-NEXT: psllw %xmm2, %xmm1 1331; SSE-NEXT: psrlw $8, %xmm1 1332; SSE-NEXT: packuswb %xmm3, %xmm1 1333; SSE-NEXT: movdqa %xmm1, %xmm0 1334; SSE-NEXT: retq 1335; 1336; AVX-LABEL: splatvar_funnnel_v16i8: 1337; AVX: # %bb.0: 1338; AVX-NEXT: vpunpckhbw {{.*#+}} xmm3 = 
xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] 1339; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 1340; AVX-NEXT: vpsllw %xmm2, %xmm3, %xmm3 1341; AVX-NEXT: vpsrlw $8, %xmm3, %xmm3 1342; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] 1343; AVX-NEXT: vpsllw %xmm2, %xmm0, %xmm0 1344; AVX-NEXT: vpsrlw $8, %xmm0, %xmm0 1345; AVX-NEXT: vpackuswb %xmm3, %xmm0, %xmm0 1346; AVX-NEXT: retq 1347; 1348; AVX512F-LABEL: splatvar_funnnel_v16i8: 1349; AVX512F: # %bb.0: 1350; AVX512F-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] 1351; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 1352; AVX512F-NEXT: vpsllw %xmm2, %xmm3, %xmm3 1353; AVX512F-NEXT: vpsrlw $8, %xmm3, %xmm3 1354; AVX512F-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] 1355; AVX512F-NEXT: vpsllw %xmm2, %xmm0, %xmm0 1356; AVX512F-NEXT: vpsrlw $8, %xmm0, %xmm0 1357; AVX512F-NEXT: vpackuswb %xmm3, %xmm0, %xmm0 1358; AVX512F-NEXT: retq 1359; 1360; AVX512VL-LABEL: splatvar_funnnel_v16i8: 1361; AVX512VL: # %bb.0: 1362; AVX512VL-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] 1363; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 1364; AVX512VL-NEXT: vpsllw %xmm2, %xmm3, %xmm3 1365; AVX512VL-NEXT: vpsrlw $8, %xmm3, %xmm3 1366; AVX512VL-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] 1367; AVX512VL-NEXT: vpsllw %xmm2, %xmm0, %xmm0 1368; AVX512VL-NEXT: vpsrlw $8, %xmm0, %xmm0 1369; AVX512VL-NEXT: vpackuswb %xmm3, %xmm0, %xmm0 1370; AVX512VL-NEXT: retq 1371; 1372; AVX512BW-LABEL: splatvar_funnnel_v16i8: 1373; AVX512BW: # %bb.0: 1374; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] 1375; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 1376; AVX512BW-NEXT: vpsllw %xmm2, %xmm3, %xmm3 1377; AVX512BW-NEXT: vpsrlw $8, %xmm3, %xmm3 1378; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] 1379; AVX512BW-NEXT: vpsllw %xmm2, %xmm0, %xmm0 1380; AVX512BW-NEXT: vpsrlw $8, %xmm0, %xmm0 1381; AVX512BW-NEXT: vpackuswb %xmm3, %xmm0, %xmm0 1382; AVX512BW-NEXT: retq 1383; 1384; AVX512VBMI2-LABEL: splatvar_funnnel_v16i8: 1385; AVX512VBMI2: # %bb.0: 1386; AVX512VBMI2-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] 1387; AVX512VBMI2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 1388; AVX512VBMI2-NEXT: vpsllw %xmm2, %xmm3, %xmm3 1389; AVX512VBMI2-NEXT: vpsrlw $8, %xmm3, %xmm3 1390; AVX512VBMI2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] 
1391; AVX512VBMI2-NEXT: vpsllw %xmm2, %xmm0, %xmm0 1392; AVX512VBMI2-NEXT: vpsrlw $8, %xmm0, %xmm0 1393; AVX512VBMI2-NEXT: vpackuswb %xmm3, %xmm0, %xmm0 1394; AVX512VBMI2-NEXT: retq 1395; 1396; AVX512VLBW-LABEL: splatvar_funnnel_v16i8: 1397; AVX512VLBW: # %bb.0: 1398; AVX512VLBW-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] 1399; AVX512VLBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 1400; AVX512VLBW-NEXT: vpsllw %xmm2, %xmm3, %xmm3 1401; AVX512VLBW-NEXT: vpsrlw $8, %xmm3, %xmm3 1402; AVX512VLBW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] 1403; AVX512VLBW-NEXT: vpsllw %xmm2, %xmm0, %xmm0 1404; AVX512VLBW-NEXT: vpsrlw $8, %xmm0, %xmm0 1405; AVX512VLBW-NEXT: vpackuswb %xmm3, %xmm0, %xmm0 1406; AVX512VLBW-NEXT: retq 1407; 1408; AVX512VLVBMI2-LABEL: splatvar_funnnel_v16i8: 1409; AVX512VLVBMI2: # %bb.0: 1410; AVX512VLVBMI2-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] 1411; AVX512VLVBMI2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 1412; AVX512VLVBMI2-NEXT: vpsllw %xmm2, %xmm3, %xmm3 1413; AVX512VLVBMI2-NEXT: vpsrlw $8, %xmm3, %xmm3 1414; AVX512VLVBMI2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] 1415; AVX512VLVBMI2-NEXT: vpsllw %xmm2, %xmm0, %xmm0 1416; AVX512VLVBMI2-NEXT: vpsrlw $8, %xmm0, %xmm0 1417; AVX512VLVBMI2-NEXT: vpackuswb %xmm3, %xmm0, %xmm0 1418; AVX512VLVBMI2-NEXT: retq 1419; 1420; XOP-LABEL: splatvar_funnnel_v16i8: 1421; XOP: # %bb.0: 1422; XOP-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] 1423; XOP-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 1424; XOP-NEXT: vpsllw %xmm2, %xmm3, %xmm3 1425; XOP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] 1426; XOP-NEXT: vpsllw %xmm2, %xmm0, %xmm0 1427; XOP-NEXT: vpperm {{.*#+}} xmm0 = xmm0[1,3,5,7,9,11,13,15],xmm3[1,3,5,7,9,11,13,15] 1428; XOP-NEXT: retq 1429; 1430; X86-SSE2-LABEL: splatvar_funnnel_v16i8: 1431; X86-SSE2: # %bb.0: 1432; X86-SSE2-NEXT: movdqa %xmm1, %xmm3 1433; X86-SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15] 1434; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2 1435; X86-SSE2-NEXT: psllw %xmm2, %xmm3 1436; X86-SSE2-NEXT: psrlw $8, %xmm3 1437; X86-SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] 1438; X86-SSE2-NEXT: psllw %xmm2, %xmm1 1439; X86-SSE2-NEXT: psrlw $8, %xmm1 1440; X86-SSE2-NEXT: packuswb %xmm3, %xmm1 1441; X86-SSE2-NEXT: movdqa %xmm1, %xmm0 1442; X86-SSE2-NEXT: retl 1443 %splat = shufflevector <16 x i8> %amt, <16 x i8> undef, <16 x i32> zeroinitializer 1444 %res = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %splat) 1445 ret <16 x i8> %res 1446} 1447 1448; CGP should allow a cross-block splat 
shift amount to be seen in SDAG.
; PR37426 - https://bugs.llvm.org/show_bug.cgi?id=37426
;
define void @sink_splatvar(ptr %p, i32 %shift_amt) {
; SSE-LABEL: sink_splatvar:
; SSE: # %bb.0: # %entry
; SSE-NEXT: movd %esi, %xmm0
; SSE-NEXT: movq $-1024, %rax # imm = 0xFC00
; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT: .p2align 4
; SSE-NEXT: .LBB8_1: # %loop
; SSE-NEXT: # =>This Inner Loop Header: Depth=1
; SSE-NEXT: movdqu 1024(%rdi,%rax), %xmm1
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,2,3,3]
; SSE-NEXT: psllq %xmm0, %xmm2
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
; SSE-NEXT: psllq %xmm0, %xmm1
; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,3],xmm2[1,3]
; SSE-NEXT: movups %xmm1, 1024(%rdi,%rax)
; SSE-NEXT: addq $16, %rax
; SSE-NEXT: jne .LBB8_1
; SSE-NEXT: # %bb.2: # %end
; SSE-NEXT: retq
;
; AVX1-LABEL: sink_splatvar:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vmovd %esi, %xmm0
; AVX1-NEXT: movq $-1024, %rax # imm = 0xFC00
; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: .p2align 4
; AVX1-NEXT: .LBB8_1: # %loop
; AVX1-NEXT: # =>This Inner Loop Header: Depth=1
; AVX1-NEXT: vmovdqu 1024(%rdi,%rax), %xmm1
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,2,3,3]
; AVX1-NEXT: vpsllq %xmm0, %xmm2, %xmm2
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
; AVX1-NEXT: vpsllq %xmm0, %xmm1, %xmm1
; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,3],xmm2[1,3]
; AVX1-NEXT: vmovups %xmm1, 1024(%rdi,%rax)
; AVX1-NEXT: addq $16, %rax
; AVX1-NEXT: jne .LBB8_1
; AVX1-NEXT: # %bb.2: # %end
; AVX1-NEXT: retq
;
; AVX2-LABEL: sink_splatvar:
; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vmovd %esi, %xmm0
; AVX2-NEXT: vpbroadcastd %xmm0, %xmm0
; AVX2-NEXT: movq $-1024, %rax # imm = 0xFC00
; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [31,31,31,31]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [32,32,32,32]
; AVX2-NEXT: vpsubd %xmm0, %xmm1, %xmm1
; AVX2-NEXT: .p2align 4
; AVX2-NEXT: .LBB8_1: # %loop
; AVX2-NEXT: # =>This Inner Loop Header: Depth=1
; AVX2-NEXT: vmovdqu 1024(%rdi,%rax), %xmm2
; AVX2-NEXT: vpsllvd %xmm0, %xmm2, %xmm3
; AVX2-NEXT: vpsrlvd %xmm1, %xmm2, %xmm2
; AVX2-NEXT: vpor %xmm2, %xmm3, %xmm2
; AVX2-NEXT: vmovdqu %xmm2, 1024(%rdi,%rax)
; AVX2-NEXT: addq $16, %rax
; AVX2-NEXT: jne .LBB8_1
; AVX2-NEXT: # %bb.2: # %end
; AVX2-NEXT: retq
;
; AVX512F-LABEL: sink_splatvar:
; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: vmovd %esi, %xmm0
; AVX512F-NEXT: vpbroadcastd %xmm0, %xmm0
; AVX512F-NEXT: movq $-1024, %rax # imm = 0xFC00
; AVX512F-NEXT: .p2align 4
; AVX512F-NEXT: .LBB8_1: # %loop
; AVX512F-NEXT: # =>This Inner Loop Header: Depth=1
; AVX512F-NEXT: vmovdqu 1024(%rdi,%rax), %xmm1
; AVX512F-NEXT: vprolvd %zmm0, %zmm1, %zmm1
; AVX512F-NEXT: vmovdqu %xmm1, 1024(%rdi,%rax)
; AVX512F-NEXT: addq $16, %rax
; AVX512F-NEXT: jne .LBB8_1
; AVX512F-NEXT: # %bb.2: # %end
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: sink_splatvar:
; AVX512VL: # %bb.0: # %entry
; AVX512VL-NEXT: vpbroadcastd %esi, %xmm0
; AVX512VL-NEXT: movq $-1024, %rax # imm = 0xFC00
; AVX512VL-NEXT: .p2align 4
; AVX512VL-NEXT: .LBB8_1: # %loop
; AVX512VL-NEXT: # =>This Inner Loop Header: Depth=1
; AVX512VL-NEXT:
vmovdqu 1024(%rdi,%rax), %xmm1 1539; AVX512VL-NEXT: vprolvd %xmm0, %xmm1, %xmm1 1540; AVX512VL-NEXT: vmovdqu %xmm1, 1024(%rdi,%rax) 1541; AVX512VL-NEXT: addq $16, %rax 1542; AVX512VL-NEXT: jne .LBB8_1 1543; AVX512VL-NEXT: # %bb.2: # %end 1544; AVX512VL-NEXT: retq 1545; 1546; AVX512BW-LABEL: sink_splatvar: 1547; AVX512BW: # %bb.0: # %entry 1548; AVX512BW-NEXT: vmovd %esi, %xmm0 1549; AVX512BW-NEXT: vpbroadcastd %xmm0, %xmm0 1550; AVX512BW-NEXT: movq $-1024, %rax # imm = 0xFC00 1551; AVX512BW-NEXT: .p2align 4 1552; AVX512BW-NEXT: .LBB8_1: # %loop 1553; AVX512BW-NEXT: # =>This Inner Loop Header: Depth=1 1554; AVX512BW-NEXT: vmovdqu 1024(%rdi,%rax), %xmm1 1555; AVX512BW-NEXT: vprolvd %zmm0, %zmm1, %zmm1 1556; AVX512BW-NEXT: vmovdqu %xmm1, 1024(%rdi,%rax) 1557; AVX512BW-NEXT: addq $16, %rax 1558; AVX512BW-NEXT: jne .LBB8_1 1559; AVX512BW-NEXT: # %bb.2: # %end 1560; AVX512BW-NEXT: vzeroupper 1561; AVX512BW-NEXT: retq 1562; 1563; AVX512VBMI2-LABEL: sink_splatvar: 1564; AVX512VBMI2: # %bb.0: # %entry 1565; AVX512VBMI2-NEXT: vmovd %esi, %xmm0 1566; AVX512VBMI2-NEXT: vpbroadcastd %xmm0, %xmm0 1567; AVX512VBMI2-NEXT: movq $-1024, %rax # imm = 0xFC00 1568; AVX512VBMI2-NEXT: .p2align 4 1569; AVX512VBMI2-NEXT: .LBB8_1: # %loop 1570; AVX512VBMI2-NEXT: # =>This Inner Loop Header: Depth=1 1571; AVX512VBMI2-NEXT: vmovdqu 1024(%rdi,%rax), %xmm1 1572; AVX512VBMI2-NEXT: vprolvd %zmm0, %zmm1, %zmm1 1573; AVX512VBMI2-NEXT: vmovdqu %xmm1, 1024(%rdi,%rax) 1574; AVX512VBMI2-NEXT: addq $16, %rax 1575; AVX512VBMI2-NEXT: jne .LBB8_1 1576; AVX512VBMI2-NEXT: # %bb.2: # %end 1577; AVX512VBMI2-NEXT: vzeroupper 1578; AVX512VBMI2-NEXT: retq 1579; 1580; AVX512VLBW-LABEL: sink_splatvar: 1581; AVX512VLBW: # %bb.0: # %entry 1582; AVX512VLBW-NEXT: vpbroadcastd %esi, %xmm0 1583; AVX512VLBW-NEXT: movq $-1024, %rax # imm = 0xFC00 1584; AVX512VLBW-NEXT: .p2align 4 1585; AVX512VLBW-NEXT: .LBB8_1: # %loop 1586; AVX512VLBW-NEXT: # =>This Inner Loop Header: Depth=1 1587; AVX512VLBW-NEXT: vmovdqu 1024(%rdi,%rax), %xmm1 1588; AVX512VLBW-NEXT: vprolvd %xmm0, %xmm1, %xmm1 1589; AVX512VLBW-NEXT: vmovdqu %xmm1, 1024(%rdi,%rax) 1590; AVX512VLBW-NEXT: addq $16, %rax 1591; AVX512VLBW-NEXT: jne .LBB8_1 1592; AVX512VLBW-NEXT: # %bb.2: # %end 1593; AVX512VLBW-NEXT: retq 1594; 1595; AVX512VLVBMI2-LABEL: sink_splatvar: 1596; AVX512VLVBMI2: # %bb.0: # %entry 1597; AVX512VLVBMI2-NEXT: vpbroadcastd %esi, %xmm0 1598; AVX512VLVBMI2-NEXT: movq $-1024, %rax # imm = 0xFC00 1599; AVX512VLVBMI2-NEXT: .p2align 4 1600; AVX512VLVBMI2-NEXT: .LBB8_1: # %loop 1601; AVX512VLVBMI2-NEXT: # =>This Inner Loop Header: Depth=1 1602; AVX512VLVBMI2-NEXT: vmovdqu 1024(%rdi,%rax), %xmm1 1603; AVX512VLVBMI2-NEXT: vprolvd %xmm0, %xmm1, %xmm1 1604; AVX512VLVBMI2-NEXT: vmovdqu %xmm1, 1024(%rdi,%rax) 1605; AVX512VLVBMI2-NEXT: addq $16, %rax 1606; AVX512VLVBMI2-NEXT: jne .LBB8_1 1607; AVX512VLVBMI2-NEXT: # %bb.2: # %end 1608; AVX512VLVBMI2-NEXT: retq 1609; 1610; XOPAVX1-LABEL: sink_splatvar: 1611; XOPAVX1: # %bb.0: # %entry 1612; XOPAVX1-NEXT: vmovd %esi, %xmm0 1613; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] 1614; XOPAVX1-NEXT: movq $-1024, %rax # imm = 0xFC00 1615; XOPAVX1-NEXT: .p2align 4 1616; XOPAVX1-NEXT: .LBB8_1: # %loop 1617; XOPAVX1-NEXT: # =>This Inner Loop Header: Depth=1 1618; XOPAVX1-NEXT: vprotd %xmm0, 1024(%rdi,%rax), %xmm1 1619; XOPAVX1-NEXT: vmovdqu %xmm1, 1024(%rdi,%rax) 1620; XOPAVX1-NEXT: addq $16, %rax 1621; XOPAVX1-NEXT: jne .LBB8_1 1622; XOPAVX1-NEXT: # %bb.2: # %end 1623; XOPAVX1-NEXT: retq 1624; 1625; XOPAVX2-LABEL: sink_splatvar: 1626; XOPAVX2: # 
%bb.0: # %entry 1627; XOPAVX2-NEXT: vmovd %esi, %xmm0 1628; XOPAVX2-NEXT: vpbroadcastd %xmm0, %xmm0 1629; XOPAVX2-NEXT: movq $-1024, %rax # imm = 0xFC00 1630; XOPAVX2-NEXT: .p2align 4 1631; XOPAVX2-NEXT: .LBB8_1: # %loop 1632; XOPAVX2-NEXT: # =>This Inner Loop Header: Depth=1 1633; XOPAVX2-NEXT: vprotd %xmm0, 1024(%rdi,%rax), %xmm1 1634; XOPAVX2-NEXT: vmovdqu %xmm1, 1024(%rdi,%rax) 1635; XOPAVX2-NEXT: addq $16, %rax 1636; XOPAVX2-NEXT: jne .LBB8_1 1637; XOPAVX2-NEXT: # %bb.2: # %end 1638; XOPAVX2-NEXT: retq 1639; 1640; X86-SSE2-LABEL: sink_splatvar: 1641; X86-SSE2: # %bb.0: # %entry 1642; X86-SSE2-NEXT: pushl %esi 1643; X86-SSE2-NEXT: .cfi_def_cfa_offset 8 1644; X86-SSE2-NEXT: .cfi_offset %esi, -8 1645; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax 1646; X86-SSE2-NEXT: xorl %ecx, %ecx 1647; X86-SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero 1648; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 1649; X86-SSE2-NEXT: xorl %edx, %edx 1650; X86-SSE2-NEXT: .p2align 4 1651; X86-SSE2-NEXT: .LBB8_1: # %loop 1652; X86-SSE2-NEXT: # =>This Inner Loop Header: Depth=1 1653; X86-SSE2-NEXT: movdqu (%eax,%ecx,4), %xmm1 1654; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,2,3,3] 1655; X86-SSE2-NEXT: psllq %xmm0, %xmm2 1656; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] 1657; X86-SSE2-NEXT: psllq %xmm0, %xmm1 1658; X86-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,3],xmm2[1,3] 1659; X86-SSE2-NEXT: movups %xmm1, (%eax,%ecx,4) 1660; X86-SSE2-NEXT: addl $4, %ecx 1661; X86-SSE2-NEXT: adcl $0, %edx 1662; X86-SSE2-NEXT: movl %ecx, %esi 1663; X86-SSE2-NEXT: xorl $256, %esi # imm = 0x100 1664; X86-SSE2-NEXT: orl %edx, %esi 1665; X86-SSE2-NEXT: jne .LBB8_1 1666; X86-SSE2-NEXT: # %bb.2: # %end 1667; X86-SSE2-NEXT: popl %esi 1668; X86-SSE2-NEXT: .cfi_def_cfa_offset 4 1669; X86-SSE2-NEXT: retl 1670entry: 1671 %ins = insertelement <4 x i32> undef, i32 %shift_amt, i32 0 1672 %splat = shufflevector <4 x i32> %ins, <4 x i32> undef, <4 x i32> zeroinitializer 1673 br label %loop 1674 1675loop: 1676 %index = phi i64 [ 0, %entry ], [ %inc, %loop ] 1677 %addr = getelementptr inbounds i32, ptr %p, i64 %index 1678 %x = load <4 x i32>, ptr %addr, align 4 1679 %fsh = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %x, <4 x i32> %x, <4 x i32> %splat) 1680 store <4 x i32> %fsh, ptr %addr, align 4 1681 %inc = add i64 %index, 4 1682 %iv = icmp eq i64 %inc, 256 1683 br i1 %iv, label %end, label %loop 1684 1685end: 1686 ret void 1687} 1688 1689; 1690; Constant Shifts 1691; 1692 1693define <2 x i64> @constant_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y) nounwind { 1694; SSE2-LABEL: constant_funnnel_v2i64: 1695; SSE2: # %bb.0: 1696; SSE2-NEXT: movdqa %xmm1, %xmm2 1697; SSE2-NEXT: psrlq $60, %xmm2 1698; SSE2-NEXT: psrlq $50, %xmm1 1699; SSE2-NEXT: shufpd {{.*#+}} xmm2 = xmm2[0],xmm1[1] 1700; SSE2-NEXT: movdqa %xmm0, %xmm1 1701; SSE2-NEXT: psllq $4, %xmm1 1702; SSE2-NEXT: psllq $14, %xmm0 1703; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] 1704; SSE2-NEXT: orpd %xmm2, %xmm0 1705; SSE2-NEXT: retq 1706; 1707; SSE41-LABEL: constant_funnnel_v2i64: 1708; SSE41: # %bb.0: 1709; SSE41-NEXT: movdqa %xmm1, %xmm2 1710; SSE41-NEXT: psrlq $50, %xmm2 1711; SSE41-NEXT: psrlq $60, %xmm1 1712; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1,2,3],xmm2[4,5,6,7] 1713; SSE41-NEXT: movdqa %xmm0, %xmm1 1714; SSE41-NEXT: psllq $14, %xmm1 1715; SSE41-NEXT: psllq $4, %xmm0 1716; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] 1717; SSE41-NEXT: por %xmm2, %xmm0 1718; SSE41-NEXT: retq 1719; 1720; AVX1-LABEL: constant_funnnel_v2i64: 1721; AVX1: # %bb.0: 
1722; AVX1-NEXT: vpsrlq $50, %xmm1, %xmm2 1723; AVX1-NEXT: vpsrlq $60, %xmm1, %xmm1 1724; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7] 1725; AVX1-NEXT: vpsllq $14, %xmm0, %xmm2 1726; AVX1-NEXT: vpsllq $4, %xmm0, %xmm0 1727; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7] 1728; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 1729; AVX1-NEXT: retq 1730; 1731; AVX2-LABEL: constant_funnnel_v2i64: 1732; AVX2: # %bb.0: 1733; AVX2-NEXT: vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 1734; AVX2-NEXT: vpsllvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1735; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 1736; AVX2-NEXT: retq 1737; 1738; AVX512F-LABEL: constant_funnnel_v2i64: 1739; AVX512F: # %bb.0: 1740; AVX512F-NEXT: vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 1741; AVX512F-NEXT: vpsllvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1742; AVX512F-NEXT: vpor %xmm1, %xmm0, %xmm0 1743; AVX512F-NEXT: retq 1744; 1745; AVX512VL-LABEL: constant_funnnel_v2i64: 1746; AVX512VL: # %bb.0: 1747; AVX512VL-NEXT: vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 1748; AVX512VL-NEXT: vpsllvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1749; AVX512VL-NEXT: vpor %xmm1, %xmm0, %xmm0 1750; AVX512VL-NEXT: retq 1751; 1752; AVX512BW-LABEL: constant_funnnel_v2i64: 1753; AVX512BW: # %bb.0: 1754; AVX512BW-NEXT: vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 1755; AVX512BW-NEXT: vpsllvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1756; AVX512BW-NEXT: vpor %xmm1, %xmm0, %xmm0 1757; AVX512BW-NEXT: retq 1758; 1759; AVX512VBMI2-LABEL: constant_funnnel_v2i64: 1760; AVX512VBMI2: # %bb.0: 1761; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 1762; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 1763; AVX512VBMI2-NEXT: vpmovsxbq {{.*#+}} xmm2 = [4,14] 1764; AVX512VBMI2-NEXT: vpshldvq %zmm2, %zmm1, %zmm0 1765; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 1766; AVX512VBMI2-NEXT: vzeroupper 1767; AVX512VBMI2-NEXT: retq 1768; 1769; AVX512VLBW-LABEL: constant_funnnel_v2i64: 1770; AVX512VLBW: # %bb.0: 1771; AVX512VLBW-NEXT: vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 1772; AVX512VLBW-NEXT: vpsllvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1773; AVX512VLBW-NEXT: vpor %xmm1, %xmm0, %xmm0 1774; AVX512VLBW-NEXT: retq 1775; 1776; AVX512VLVBMI2-LABEL: constant_funnnel_v2i64: 1777; AVX512VLVBMI2: # %bb.0: 1778; AVX512VLVBMI2-NEXT: vpshldvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm0 1779; AVX512VLVBMI2-NEXT: retq 1780; 1781; XOPAVX1-LABEL: constant_funnnel_v2i64: 1782; XOPAVX1: # %bb.0: 1783; XOPAVX1-NEXT: vpshlq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 1784; XOPAVX1-NEXT: vpshlq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1785; XOPAVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 1786; XOPAVX1-NEXT: retq 1787; 1788; XOPAVX2-LABEL: constant_funnnel_v2i64: 1789; XOPAVX2: # %bb.0: 1790; XOPAVX2-NEXT: vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 1791; XOPAVX2-NEXT: vpsllvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1792; XOPAVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 1793; XOPAVX2-NEXT: retq 1794; 1795; X86-SSE2-LABEL: constant_funnnel_v2i64: 1796; X86-SSE2: # %bb.0: 1797; X86-SSE2-NEXT: movdqa %xmm1, %xmm2 1798; X86-SSE2-NEXT: psrlq $60, %xmm2 1799; X86-SSE2-NEXT: psrlq $50, %xmm1 1800; X86-SSE2-NEXT: shufpd {{.*#+}} xmm2 = xmm2[0],xmm1[1] 1801; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 1802; X86-SSE2-NEXT: psllq $4, %xmm1 1803; X86-SSE2-NEXT: psllq $14, %xmm0 1804; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] 1805; X86-SSE2-NEXT: orpd %xmm2, %xmm0 1806; 
X86-SSE2-NEXT: retl 1807 %res = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> <i64 4, i64 14>) 1808 ret <2 x i64> %res 1809} 1810 1811define <4 x i32> @constant_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y) nounwind { 1812; SSE2-LABEL: constant_funnnel_v4i32: 1813; SSE2: # %bb.0: 1814; SSE2-NEXT: movdqa %xmm1, %xmm2 1815; SSE2-NEXT: psrld $25, %xmm2 1816; SSE2-NEXT: movdqa %xmm1, %xmm3 1817; SSE2-NEXT: psrld $26, %xmm3 1818; SSE2-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm2[1] 1819; SSE2-NEXT: movdqa %xmm1, %xmm2 1820; SSE2-NEXT: psrld $27, %xmm2 1821; SSE2-NEXT: psrld $28, %xmm1 1822; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] 1823; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm3[0,3] 1824; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] 1825; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 1826; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 1827; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 1828; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] 1829; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] 1830; SSE2-NEXT: por %xmm1, %xmm0 1831; SSE2-NEXT: retq 1832; 1833; SSE41-LABEL: constant_funnnel_v4i32: 1834; SSE41: # %bb.0: 1835; SSE41-NEXT: movdqa %xmm1, %xmm2 1836; SSE41-NEXT: psrld $25, %xmm2 1837; SSE41-NEXT: movdqa %xmm1, %xmm3 1838; SSE41-NEXT: psrld $27, %xmm3 1839; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm2[4,5,6,7] 1840; SSE41-NEXT: movdqa %xmm1, %xmm2 1841; SSE41-NEXT: psrld $26, %xmm2 1842; SSE41-NEXT: psrld $28, %xmm1 1843; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1,2,3],xmm2[4,5,6,7] 1844; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7] 1845; SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 1846; SSE41-NEXT: por %xmm2, %xmm0 1847; SSE41-NEXT: retq 1848; 1849; AVX1-LABEL: constant_funnnel_v4i32: 1850; AVX1: # %bb.0: 1851; AVX1-NEXT: vpsrld $25, %xmm1, %xmm2 1852; AVX1-NEXT: vpsrld $27, %xmm1, %xmm3 1853; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] 1854; AVX1-NEXT: vpsrld $26, %xmm1, %xmm3 1855; AVX1-NEXT: vpsrld $28, %xmm1, %xmm1 1856; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5,6,7] 1857; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] 1858; AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1859; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 1860; AVX1-NEXT: retq 1861; 1862; AVX2-LABEL: constant_funnnel_v4i32: 1863; AVX2: # %bb.0: 1864; AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 1865; AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1866; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 1867; AVX2-NEXT: retq 1868; 1869; AVX512F-LABEL: constant_funnnel_v4i32: 1870; AVX512F: # %bb.0: 1871; AVX512F-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 1872; AVX512F-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1873; AVX512F-NEXT: vpor %xmm1, %xmm0, %xmm0 1874; AVX512F-NEXT: retq 1875; 1876; AVX512VL-LABEL: constant_funnnel_v4i32: 1877; AVX512VL: # %bb.0: 1878; AVX512VL-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 1879; AVX512VL-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1880; AVX512VL-NEXT: vpor %xmm1, %xmm0, %xmm0 1881; AVX512VL-NEXT: retq 1882; 1883; AVX512BW-LABEL: constant_funnnel_v4i32: 1884; AVX512BW: # %bb.0: 1885; AVX512BW-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 1886; AVX512BW-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1887; AVX512BW-NEXT: vpor %xmm1, %xmm0, %xmm0 1888; 
AVX512BW-NEXT: retq 1889; 1890; AVX512VBMI2-LABEL: constant_funnnel_v4i32: 1891; AVX512VBMI2: # %bb.0: 1892; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 1893; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 1894; AVX512VBMI2-NEXT: vpmovsxbd {{.*#+}} xmm2 = [4,5,6,7] 1895; AVX512VBMI2-NEXT: vpshldvd %zmm2, %zmm1, %zmm0 1896; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 1897; AVX512VBMI2-NEXT: vzeroupper 1898; AVX512VBMI2-NEXT: retq 1899; 1900; AVX512VLBW-LABEL: constant_funnnel_v4i32: 1901; AVX512VLBW: # %bb.0: 1902; AVX512VLBW-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 1903; AVX512VLBW-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1904; AVX512VLBW-NEXT: vpor %xmm1, %xmm0, %xmm0 1905; AVX512VLBW-NEXT: retq 1906; 1907; AVX512VLVBMI2-LABEL: constant_funnnel_v4i32: 1908; AVX512VLVBMI2: # %bb.0: 1909; AVX512VLVBMI2-NEXT: vpshldvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm0 1910; AVX512VLVBMI2-NEXT: retq 1911; 1912; XOPAVX1-LABEL: constant_funnnel_v4i32: 1913; XOPAVX1: # %bb.0: 1914; XOPAVX1-NEXT: vpshld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 1915; XOPAVX1-NEXT: vpshld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1916; XOPAVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 1917; XOPAVX1-NEXT: retq 1918; 1919; XOPAVX2-LABEL: constant_funnnel_v4i32: 1920; XOPAVX2: # %bb.0: 1921; XOPAVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 1922; XOPAVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1923; XOPAVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 1924; XOPAVX2-NEXT: retq 1925; 1926; X86-SSE2-LABEL: constant_funnnel_v4i32: 1927; X86-SSE2: # %bb.0: 1928; X86-SSE2-NEXT: movdqa %xmm1, %xmm2 1929; X86-SSE2-NEXT: psrld $25, %xmm2 1930; X86-SSE2-NEXT: movdqa %xmm1, %xmm3 1931; X86-SSE2-NEXT: psrld $26, %xmm3 1932; X86-SSE2-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm2[1] 1933; X86-SSE2-NEXT: movdqa %xmm1, %xmm2 1934; X86-SSE2-NEXT: psrld $27, %xmm2 1935; X86-SSE2-NEXT: psrld $28, %xmm1 1936; X86-SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] 1937; X86-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm3[0,3] 1938; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] 1939; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 1940; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 1941; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2 1942; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] 1943; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] 1944; X86-SSE2-NEXT: por %xmm1, %xmm0 1945; X86-SSE2-NEXT: retl 1946 %res = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 4, i32 5, i32 6, i32 7>) 1947 ret <4 x i32> %res 1948} 1949 1950define <8 x i16> @constant_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y) nounwind { 1951; SSE-LABEL: constant_funnnel_v8i16: 1952; SSE: # %bb.0: 1953; SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [1,2,4,8,16,32,64,128] 1954; SSE-NEXT: psrlw $1, %xmm1 1955; SSE-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [2,4,8,16,32,64,128,256] 1956; SSE-NEXT: por %xmm1, %xmm0 1957; SSE-NEXT: retq 1958; 1959; AVX-LABEL: constant_funnnel_v8i16: 1960; AVX: # %bb.0: 1961; AVX-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [1,2,4,8,16,32,64,128] 1962; AVX-NEXT: vpsrlw $1, %xmm1, %xmm1 1963; AVX-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [2,4,8,16,32,64,128,256] 1964; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 1965; AVX-NEXT: retq 1966; 1967; AVX512F-LABEL: constant_funnnel_v8i16: 1968; AVX512F: # %bb.0: 1969; 
AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [1,2,4,8,16,32,64,128] 1970; AVX512F-NEXT: vpsrlw $1, %xmm1, %xmm1 1971; AVX512F-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [2,4,8,16,32,64,128,256] 1972; AVX512F-NEXT: vpor %xmm1, %xmm0, %xmm0 1973; AVX512F-NEXT: retq 1974; 1975; AVX512VL-LABEL: constant_funnnel_v8i16: 1976; AVX512VL: # %bb.0: 1977; AVX512VL-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [1,2,4,8,16,32,64,128] 1978; AVX512VL-NEXT: vpsrlw $1, %xmm1, %xmm1 1979; AVX512VL-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [2,4,8,16,32,64,128,256] 1980; AVX512VL-NEXT: vpor %xmm1, %xmm0, %xmm0 1981; AVX512VL-NEXT: retq 1982; 1983; AVX512BW-LABEL: constant_funnnel_v8i16: 1984; AVX512BW: # %bb.0: 1985; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 1986; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7] 1987; AVX512BW-NEXT: vpsllvw %zmm2, %zmm0, %zmm0 1988; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm2 = [15,14,13,12,11,10,9,8] 1989; AVX512BW-NEXT: vpsrlw $1, %xmm1, %xmm1 1990; AVX512BW-NEXT: vpsrlvw %zmm2, %zmm1, %zmm1 1991; AVX512BW-NEXT: vpor %xmm1, %xmm0, %xmm0 1992; AVX512BW-NEXT: vzeroupper 1993; AVX512BW-NEXT: retq 1994; 1995; AVX512VBMI2-LABEL: constant_funnnel_v8i16: 1996; AVX512VBMI2: # %bb.0: 1997; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 1998; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 1999; AVX512VBMI2-NEXT: vpmovsxbw {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7] 2000; AVX512VBMI2-NEXT: vpshldvw %zmm2, %zmm1, %zmm0 2001; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 2002; AVX512VBMI2-NEXT: vzeroupper 2003; AVX512VBMI2-NEXT: retq 2004; 2005; AVX512VLBW-LABEL: constant_funnnel_v8i16: 2006; AVX512VLBW: # %bb.0: 2007; AVX512VLBW-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 2008; AVX512VLBW-NEXT: vpsrlw $1, %xmm1, %xmm1 2009; AVX512VLBW-NEXT: vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 2010; AVX512VLBW-NEXT: vpor %xmm1, %xmm0, %xmm0 2011; AVX512VLBW-NEXT: retq 2012; 2013; AVX512VLVBMI2-LABEL: constant_funnnel_v8i16: 2014; AVX512VLVBMI2: # %bb.0: 2015; AVX512VLVBMI2-NEXT: vpshldvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm0 2016; AVX512VLVBMI2-NEXT: retq 2017; 2018; XOP-LABEL: constant_funnnel_v8i16: 2019; XOP: # %bb.0: 2020; XOP-NEXT: vpshlw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 2021; XOP-NEXT: vpsrlw $1, %xmm1, %xmm1 2022; XOP-NEXT: vpshlw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 2023; XOP-NEXT: vpor %xmm1, %xmm0, %xmm0 2024; XOP-NEXT: retq 2025; 2026; X86-SSE2-LABEL: constant_funnnel_v8i16: 2027; X86-SSE2: # %bb.0: 2028; X86-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [1,2,4,8,16,32,64,128] 2029; X86-SSE2-NEXT: psrlw $1, %xmm1 2030; X86-SSE2-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 # [2,4,8,16,32,64,128,256] 2031; X86-SSE2-NEXT: por %xmm1, %xmm0 2032; X86-SSE2-NEXT: retl 2033 %res = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>) 2034 ret <8 x i16> %res 2035} 2036 2037define <16 x i8> @constant_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y) nounwind { 2038; SSE-LABEL: constant_funnnel_v16i8: 2039; SSE: # %bb.0: 2040; SSE-NEXT: movdqa %xmm1, %xmm2 2041; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] 2042; SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [1,128,64,32,16,8,4,2] 2043; SSE-NEXT: psrlw $8, 
%xmm2 2044; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] 2045; SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [1,2,4,8,16,32,64,128] 2046; SSE-NEXT: psrlw $8, %xmm1 2047; SSE-NEXT: packuswb %xmm2, %xmm1 2048; SSE-NEXT: movdqa %xmm1, %xmm0 2049; SSE-NEXT: retq 2050; 2051; AVX-LABEL: constant_funnnel_v16i8: 2052; AVX: # %bb.0: 2053; AVX-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] 2054; AVX-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 # [1,128,64,32,16,8,4,2] 2055; AVX-NEXT: vpsrlw $8, %xmm2, %xmm2 2056; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] 2057; AVX-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [1,2,4,8,16,32,64,128] 2058; AVX-NEXT: vpsrlw $8, %xmm0, %xmm0 2059; AVX-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 2060; AVX-NEXT: retq 2061; 2062; AVX512F-LABEL: constant_funnnel_v16i8: 2063; AVX512F: # %bb.0: 2064; AVX512F-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] 2065; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 # [1,128,64,32,16,8,4,2] 2066; AVX512F-NEXT: vpsrlw $8, %xmm2, %xmm2 2067; AVX512F-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] 2068; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [1,2,4,8,16,32,64,128] 2069; AVX512F-NEXT: vpsrlw $8, %xmm0, %xmm0 2070; AVX512F-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 2071; AVX512F-NEXT: retq 2072; 2073; AVX512VL-LABEL: constant_funnnel_v16i8: 2074; AVX512VL: # %bb.0: 2075; AVX512VL-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] 2076; AVX512VL-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 # [1,128,64,32,16,8,4,2] 2077; AVX512VL-NEXT: vpsrlw $8, %xmm2, %xmm2 2078; AVX512VL-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] 2079; AVX512VL-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [1,2,4,8,16,32,64,128] 2080; AVX512VL-NEXT: vpsrlw $8, %xmm0, %xmm0 2081; AVX512VL-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 2082; AVX512VL-NEXT: retq 2083; 2084; AVX512BW-LABEL: constant_funnnel_v16i8: 2085; AVX512BW: # %bb.0: 2086; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,0,7,6,5,4,3,2,1] 2087; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero 2088; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero 2089; AVX512BW-NEXT: vpsllw $8, %ymm0, %ymm0 2090; AVX512BW-NEXT: vpor %ymm1, %ymm0, %ymm0 2091; AVX512BW-NEXT: vpsllvw %zmm2, 
%zmm0, %zmm0 2092; AVX512BW-NEXT: vpsrlw $8, %ymm0, %ymm0 2093; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 2094; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 2095; AVX512BW-NEXT: vzeroupper 2096; AVX512BW-NEXT: retq 2097; 2098; AVX512VBMI2-LABEL: constant_funnnel_v16i8: 2099; AVX512VBMI2: # %bb.0: 2100; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 2101; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 2102; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,64,1,65,2,66,3,67,4,68,5,69,6,70,7,71,8,72,9,73,10,74,11,75,12,76,13,77,14,78,15,79] 2103; AVX512VBMI2-NEXT: vpermt2b %zmm0, %zmm2, %zmm1 2104; AVX512VBMI2-NEXT: vpmovsxbw {{.*#+}} ymm0 = [0,1,2,3,4,5,6,7,0,7,6,5,4,3,2,1] 2105; AVX512VBMI2-NEXT: vpsllvw %zmm0, %zmm1, %zmm0 2106; AVX512VBMI2-NEXT: vpsrlw $8, %ymm0, %ymm0 2107; AVX512VBMI2-NEXT: vpmovwb %zmm0, %ymm0 2108; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 2109; AVX512VBMI2-NEXT: vzeroupper 2110; AVX512VBMI2-NEXT: retq 2111; 2112; AVX512VLBW-LABEL: constant_funnnel_v16i8: 2113; AVX512VLBW: # %bb.0: 2114; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero 2115; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero 2116; AVX512VLBW-NEXT: vpsllw $8, %ymm0, %ymm0 2117; AVX512VLBW-NEXT: vpor %ymm1, %ymm0, %ymm0 2118; AVX512VLBW-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 2119; AVX512VLBW-NEXT: vpsrlw $8, %ymm0, %ymm0 2120; AVX512VLBW-NEXT: vpmovwb %ymm0, %xmm0 2121; AVX512VLBW-NEXT: vzeroupper 2122; AVX512VLBW-NEXT: retq 2123; 2124; AVX512VLVBMI2-LABEL: constant_funnnel_v16i8: 2125; AVX512VLVBMI2: # %bb.0: 2126; AVX512VLVBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1 2127; AVX512VLVBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 2128; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,32,1,33,2,34,3,35,4,36,5,37,6,38,7,39,8,40,9,41,10,42,11,43,12,44,13,45,14,46,15,47] 2129; AVX512VLVBMI2-NEXT: vpermi2b %ymm0, %ymm1, %ymm2 2130; AVX512VLVBMI2-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm0 2131; AVX512VLVBMI2-NEXT: vpsrlw $8, %ymm0, %ymm0 2132; AVX512VLVBMI2-NEXT: vpmovwb %ymm0, %xmm0 2133; AVX512VLVBMI2-NEXT: vzeroupper 2134; AVX512VLVBMI2-NEXT: retq 2135; 2136; XOP-LABEL: constant_funnnel_v16i8: 2137; XOP: # %bb.0: 2138; XOP-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 2139; XOP-NEXT: vpshlb %xmm2, %xmm1, %xmm1 2140; XOP-NEXT: vpshlb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 2141; XOP-NEXT: vpshlb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 2142; XOP-NEXT: vpor %xmm1, %xmm0, %xmm0 2143; XOP-NEXT: retq 2144; 2145; X86-SSE2-LABEL: constant_funnnel_v16i8: 2146; X86-SSE2: # %bb.0: 2147; X86-SSE2-NEXT: movdqa %xmm1, %xmm2 2148; X86-SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] 2149; X86-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2 # [1,128,64,32,16,8,4,2] 2150; X86-SSE2-NEXT: psrlw $8, %xmm2 2151; X86-SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] 2152; X86-SSE2-NEXT: 
pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 # [1,2,4,8,16,32,64,128] 2153; X86-SSE2-NEXT: psrlw $8, %xmm1 2154; X86-SSE2-NEXT: packuswb %xmm2, %xmm1 2155; X86-SSE2-NEXT: movdqa %xmm1, %xmm0 2156; X86-SSE2-NEXT: retl 2157 %res = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1>) 2158 ret <16 x i8> %res 2159} 2160 2161; 2162; Uniform Constant Shifts 2163; 2164 2165define <2 x i64> @splatconstant_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y) nounwind { 2166; SSE-LABEL: splatconstant_funnnel_v2i64: 2167; SSE: # %bb.0: 2168; SSE-NEXT: psrlq $50, %xmm1 2169; SSE-NEXT: psllq $14, %xmm0 2170; SSE-NEXT: por %xmm1, %xmm0 2171; SSE-NEXT: retq 2172; 2173; AVX-LABEL: splatconstant_funnnel_v2i64: 2174; AVX: # %bb.0: 2175; AVX-NEXT: vpsrlq $50, %xmm1, %xmm1 2176; AVX-NEXT: vpsllq $14, %xmm0, %xmm0 2177; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 2178; AVX-NEXT: retq 2179; 2180; AVX512F-LABEL: splatconstant_funnnel_v2i64: 2181; AVX512F: # %bb.0: 2182; AVX512F-NEXT: vpsrlq $50, %xmm1, %xmm1 2183; AVX512F-NEXT: vpsllq $14, %xmm0, %xmm0 2184; AVX512F-NEXT: vpor %xmm1, %xmm0, %xmm0 2185; AVX512F-NEXT: retq 2186; 2187; AVX512VL-LABEL: splatconstant_funnnel_v2i64: 2188; AVX512VL: # %bb.0: 2189; AVX512VL-NEXT: vpsrlq $50, %xmm1, %xmm1 2190; AVX512VL-NEXT: vpsllq $14, %xmm0, %xmm0 2191; AVX512VL-NEXT: vpor %xmm1, %xmm0, %xmm0 2192; AVX512VL-NEXT: retq 2193; 2194; AVX512BW-LABEL: splatconstant_funnnel_v2i64: 2195; AVX512BW: # %bb.0: 2196; AVX512BW-NEXT: vpsrlq $50, %xmm1, %xmm1 2197; AVX512BW-NEXT: vpsllq $14, %xmm0, %xmm0 2198; AVX512BW-NEXT: vpor %xmm1, %xmm0, %xmm0 2199; AVX512BW-NEXT: retq 2200; 2201; AVX512VBMI2-LABEL: splatconstant_funnnel_v2i64: 2202; AVX512VBMI2: # %bb.0: 2203; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 2204; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 2205; AVX512VBMI2-NEXT: vpshldq $14, %zmm1, %zmm0, %zmm0 2206; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 2207; AVX512VBMI2-NEXT: vzeroupper 2208; AVX512VBMI2-NEXT: retq 2209; 2210; AVX512VLBW-LABEL: splatconstant_funnnel_v2i64: 2211; AVX512VLBW: # %bb.0: 2212; AVX512VLBW-NEXT: vpsrlq $50, %xmm1, %xmm1 2213; AVX512VLBW-NEXT: vpsllq $14, %xmm0, %xmm0 2214; AVX512VLBW-NEXT: vpor %xmm1, %xmm0, %xmm0 2215; AVX512VLBW-NEXT: retq 2216; 2217; AVX512VLVBMI2-LABEL: splatconstant_funnnel_v2i64: 2218; AVX512VLVBMI2: # %bb.0: 2219; AVX512VLVBMI2-NEXT: vpshldq $14, %xmm1, %xmm0, %xmm0 2220; AVX512VLVBMI2-NEXT: retq 2221; 2222; XOP-LABEL: splatconstant_funnnel_v2i64: 2223; XOP: # %bb.0: 2224; XOP-NEXT: vpsrlq $50, %xmm1, %xmm1 2225; XOP-NEXT: vpsllq $14, %xmm0, %xmm0 2226; XOP-NEXT: vpor %xmm1, %xmm0, %xmm0 2227; XOP-NEXT: retq 2228; 2229; X86-SSE2-LABEL: splatconstant_funnnel_v2i64: 2230; X86-SSE2: # %bb.0: 2231; X86-SSE2-NEXT: psrlq $50, %xmm1 2232; X86-SSE2-NEXT: psllq $14, %xmm0 2233; X86-SSE2-NEXT: por %xmm1, %xmm0 2234; X86-SSE2-NEXT: retl 2235 %res = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> <i64 14, i64 14>) 2236 ret <2 x i64> %res 2237} 2238 2239define <4 x i32> @splatconstant_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y) nounwind { 2240; SSE-LABEL: splatconstant_funnnel_v4i32: 2241; SSE: # %bb.0: 2242; SSE-NEXT: psrld $28, %xmm1 2243; SSE-NEXT: pslld $4, %xmm0 2244; SSE-NEXT: por %xmm1, %xmm0 2245; SSE-NEXT: retq 2246; 2247; AVX-LABEL: splatconstant_funnnel_v4i32: 2248; AVX: # %bb.0: 2249; AVX-NEXT: vpsrld $28, %xmm1, %xmm1 2250; AVX-NEXT: vpslld $4, %xmm0, %xmm0 2251; 
AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 2252; AVX-NEXT: retq 2253; 2254; AVX512F-LABEL: splatconstant_funnnel_v4i32: 2255; AVX512F: # %bb.0: 2256; AVX512F-NEXT: vpsrld $28, %xmm1, %xmm1 2257; AVX512F-NEXT: vpslld $4, %xmm0, %xmm0 2258; AVX512F-NEXT: vpor %xmm1, %xmm0, %xmm0 2259; AVX512F-NEXT: retq 2260; 2261; AVX512VL-LABEL: splatconstant_funnnel_v4i32: 2262; AVX512VL: # %bb.0: 2263; AVX512VL-NEXT: vpsrld $28, %xmm1, %xmm1 2264; AVX512VL-NEXT: vpslld $4, %xmm0, %xmm0 2265; AVX512VL-NEXT: vpor %xmm1, %xmm0, %xmm0 2266; AVX512VL-NEXT: retq 2267; 2268; AVX512BW-LABEL: splatconstant_funnnel_v4i32: 2269; AVX512BW: # %bb.0: 2270; AVX512BW-NEXT: vpsrld $28, %xmm1, %xmm1 2271; AVX512BW-NEXT: vpslld $4, %xmm0, %xmm0 2272; AVX512BW-NEXT: vpor %xmm1, %xmm0, %xmm0 2273; AVX512BW-NEXT: retq 2274; 2275; AVX512VBMI2-LABEL: splatconstant_funnnel_v4i32: 2276; AVX512VBMI2: # %bb.0: 2277; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 2278; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 2279; AVX512VBMI2-NEXT: vpshldd $4, %zmm1, %zmm0, %zmm0 2280; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 2281; AVX512VBMI2-NEXT: vzeroupper 2282; AVX512VBMI2-NEXT: retq 2283; 2284; AVX512VLBW-LABEL: splatconstant_funnnel_v4i32: 2285; AVX512VLBW: # %bb.0: 2286; AVX512VLBW-NEXT: vpsrld $28, %xmm1, %xmm1 2287; AVX512VLBW-NEXT: vpslld $4, %xmm0, %xmm0 2288; AVX512VLBW-NEXT: vpor %xmm1, %xmm0, %xmm0 2289; AVX512VLBW-NEXT: retq 2290; 2291; AVX512VLVBMI2-LABEL: splatconstant_funnnel_v4i32: 2292; AVX512VLVBMI2: # %bb.0: 2293; AVX512VLVBMI2-NEXT: vpshldd $4, %xmm1, %xmm0, %xmm0 2294; AVX512VLVBMI2-NEXT: retq 2295; 2296; XOP-LABEL: splatconstant_funnnel_v4i32: 2297; XOP: # %bb.0: 2298; XOP-NEXT: vpsrld $28, %xmm1, %xmm1 2299; XOP-NEXT: vpslld $4, %xmm0, %xmm0 2300; XOP-NEXT: vpor %xmm1, %xmm0, %xmm0 2301; XOP-NEXT: retq 2302; 2303; X86-SSE2-LABEL: splatconstant_funnnel_v4i32: 2304; X86-SSE2: # %bb.0: 2305; X86-SSE2-NEXT: psrld $28, %xmm1 2306; X86-SSE2-NEXT: pslld $4, %xmm0 2307; X86-SSE2-NEXT: por %xmm1, %xmm0 2308; X86-SSE2-NEXT: retl 2309 %res = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 4, i32 4, i32 4, i32 4>) 2310 ret <4 x i32> %res 2311} 2312 2313define <8 x i16> @splatconstant_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y) nounwind { 2314; SSE-LABEL: splatconstant_funnnel_v8i16: 2315; SSE: # %bb.0: 2316; SSE-NEXT: psrlw $9, %xmm1 2317; SSE-NEXT: psllw $7, %xmm0 2318; SSE-NEXT: por %xmm1, %xmm0 2319; SSE-NEXT: retq 2320; 2321; AVX-LABEL: splatconstant_funnnel_v8i16: 2322; AVX: # %bb.0: 2323; AVX-NEXT: vpsrlw $9, %xmm1, %xmm1 2324; AVX-NEXT: vpsllw $7, %xmm0, %xmm0 2325; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 2326; AVX-NEXT: retq 2327; 2328; AVX512F-LABEL: splatconstant_funnnel_v8i16: 2329; AVX512F: # %bb.0: 2330; AVX512F-NEXT: vpsrlw $9, %xmm1, %xmm1 2331; AVX512F-NEXT: vpsllw $7, %xmm0, %xmm0 2332; AVX512F-NEXT: vpor %xmm1, %xmm0, %xmm0 2333; AVX512F-NEXT: retq 2334; 2335; AVX512VL-LABEL: splatconstant_funnnel_v8i16: 2336; AVX512VL: # %bb.0: 2337; AVX512VL-NEXT: vpsrlw $9, %xmm1, %xmm1 2338; AVX512VL-NEXT: vpsllw $7, %xmm0, %xmm0 2339; AVX512VL-NEXT: vpor %xmm1, %xmm0, %xmm0 2340; AVX512VL-NEXT: retq 2341; 2342; AVX512BW-LABEL: splatconstant_funnnel_v8i16: 2343; AVX512BW: # %bb.0: 2344; AVX512BW-NEXT: vpsrlw $9, %xmm1, %xmm1 2345; AVX512BW-NEXT: vpsllw $7, %xmm0, %xmm0 2346; AVX512BW-NEXT: vpor %xmm1, %xmm0, %xmm0 2347; AVX512BW-NEXT: retq 2348; 2349; AVX512VBMI2-LABEL: splatconstant_funnnel_v8i16: 2350; AVX512VBMI2: # %bb.0: 2351; AVX512VBMI2-NEXT: # kill: def $xmm1 
killed $xmm1 def $zmm1
; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512VBMI2-NEXT: vpshldw $7, %zmm1, %zmm0, %zmm0
; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512VBMI2-NEXT: vzeroupper
; AVX512VBMI2-NEXT: retq
;
; AVX512VLBW-LABEL: splatconstant_funnnel_v8i16:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpsrlw $9, %xmm1, %xmm1
; AVX512VLBW-NEXT: vpsllw $7, %xmm0, %xmm0
; AVX512VLBW-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX512VLBW-NEXT: retq
;
; AVX512VLVBMI2-LABEL: splatconstant_funnnel_v8i16:
; AVX512VLVBMI2: # %bb.0:
; AVX512VLVBMI2-NEXT: vpshldw $7, %xmm1, %xmm0, %xmm0
; AVX512VLVBMI2-NEXT: retq
;
; XOP-LABEL: splatconstant_funnnel_v8i16:
; XOP: # %bb.0:
; XOP-NEXT: vpsrlw $9, %xmm1, %xmm1
; XOP-NEXT: vpsllw $7, %xmm0, %xmm0
; XOP-NEXT: vpor %xmm1, %xmm0, %xmm0
; XOP-NEXT: retq
;
; X86-SSE2-LABEL: splatconstant_funnnel_v8i16:
; X86-SSE2: # %bb.0:
; X86-SSE2-NEXT: psrlw $9, %xmm1
; X86-SSE2-NEXT: psllw $7, %xmm0
; X86-SSE2-NEXT: por %xmm1, %xmm0
; X86-SSE2-NEXT: retl
  %res = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>)
  ret <8 x i16> %res
}

define <16 x i8> @splatconstant_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y) nounwind {
; SSE-LABEL: splatconstant_funnnel_v16i8:
; SSE: # %bb.0:
; SSE-NEXT: psrlw $4, %xmm1
; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE-NEXT: psllw $4, %xmm0
; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT: por %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: splatconstant_funnnel_v16i8:
; AVX: # %bb.0:
; AVX-NEXT: vpsrlw $4, %xmm1, %xmm1
; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX-NEXT: vpsllw $4, %xmm0, %xmm0
; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
;
; AVX512F-LABEL: splatconstant_funnnel_v16i8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpsllw $4, %xmm0, %xmm2
; AVX512F-NEXT: vpsrlw $4, %xmm1, %xmm0
; AVX512F-NEXT: vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm2))
; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: splatconstant_funnnel_v16i8:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpsllw $4, %xmm0, %xmm2
; AVX512VL-NEXT: vpsrlw $4, %xmm1, %xmm0
; AVX512VL-NEXT: vpternlogd {{.*#+}} xmm0 = xmm0 ^ (mem & (xmm0 ^ xmm2))
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: splatconstant_funnnel_v16i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpsllw $4, %xmm0, %xmm2
; AVX512BW-NEXT: vpsrlw $4, %xmm1, %xmm0
; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm2))
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512VBMI2-LABEL: splatconstant_funnnel_v16i8:
; AVX512VBMI2: # %bb.0:
; AVX512VBMI2-NEXT: vpsllw $4, %xmm0, %xmm2
; AVX512VBMI2-NEXT: vpsrlw $4, %xmm1, %xmm0
; AVX512VBMI2-NEXT: vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm2))
; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512VBMI2-NEXT: vzeroupper
; AVX512VBMI2-NEXT: retq
;
; AVX512VLBW-LABEL: splatconstant_funnnel_v16i8:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpsllw $4, %xmm0, %xmm2
; AVX512VLBW-NEXT: vpsrlw $4, %xmm1, %xmm0
; AVX512VLBW-NEXT: vpternlogd {{.*#+}} xmm0 = xmm0 ^ (mem & (xmm0 ^ xmm2))
; AVX512VLBW-NEXT: retq
;
; AVX512VLVBMI2-LABEL: splatconstant_funnnel_v16i8:
; AVX512VLVBMI2: # %bb.0:
; AVX512VLVBMI2-NEXT: vpsllw $4, %xmm0, %xmm2
; AVX512VLVBMI2-NEXT: vpsrlw $4, %xmm1, %xmm0
; AVX512VLVBMI2-NEXT: vpternlogd {{.*#+}} xmm0 = xmm0 ^ (mem & (xmm0 ^ xmm2))
; AVX512VLVBMI2-NEXT: retq
;
; XOP-LABEL: splatconstant_funnnel_v16i8:
; XOP: # %bb.0:
; XOP-NEXT: vpsrlw $4, %xmm1, %xmm1
; XOP-NEXT: vpsllw $4, %xmm0, %xmm0
; XOP-NEXT: vpcmov {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm0, %xmm0
; XOP-NEXT: retq
;
; X86-SSE2-LABEL: splatconstant_funnnel_v16i8:
; X86-SSE2: # %bb.0:
; X86-SSE2-NEXT: psrlw $4, %xmm1
; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
; X86-SSE2-NEXT: psllw $4, %xmm0
; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-SSE2-NEXT: por %xmm1, %xmm0
; X86-SSE2-NEXT: retl
  %res = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>)
  ret <16 x i8> %res
}
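
; The tests in this file check the x86 lowering of the llvm.fshl.* funnel-shift
; intrinsics for 128-bit vector types. As a reference for readers, below is a
; minimal IR sketch (an illustrative addition, not covered by any CHECK lines)
; of the shift-based expansion that the non-VBMI2 v2i64 lowerings above follow
; for a single i64 lane. It assumes the LangRef definition
; fshl(X, Y, Z) = (X << (Z mod 64)) | (Y >> (64 - (Z mod 64))), with the Y term
; dropping out when Z mod 64 == 0; pre-shifting Y right by 1 lets the second
; shift use (~Z mod 64) so the zero-amount case needs no special handling.
; The function name is hypothetical and exists only for illustration.
define i64 @fshl_i64_expansion_sketch(i64 %x, i64 %y, i64 %z) {
  %amt = and i64 %z, 63          ; Z mod 64
  %notz = xor i64 %z, -1
  %invamt = and i64 %notz, 63    ; 63 - (Z mod 64)
  %hi = shl i64 %x, %amt         ; X << (Z mod 64)
  %y1 = lshr i64 %y, 1           ; drop one bit up front ...
  %lo = lshr i64 %y1, %invamt    ; ... so this equals Y >> (64 - (Z mod 64)), or 0 when Z mod 64 == 0
  %res = or i64 %hi, %lo
  ret i64 %res
}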