; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefixes=AVX512VL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vbmi,+avx512vbmi2 | FileCheck %s --check-prefixes=AVX512VBMI2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefixes=AVX512VLBW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vbmi,+avx512vbmi2,+avx512vl | FileCheck %s --check-prefixes=AVX512VLVBMI2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx10.1-256 | FileCheck %s --check-prefixes=AVX512VLVBMI2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx10.1-512 | FileCheck %s --check-prefixes=AVX512VLVBMI2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefixes=XOP,XOPAVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefixes=XOP,XOPAVX2

; Just one 32-bit run to make sure we do reasonable things for i64 cases.
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=X86-SSE2

declare <2 x i64> @llvm.fshr.v2i64(<2 x i64>, <2 x i64>, <2 x i64>)
declare <4 x i32> @llvm.fshr.v4i32(<4 x i32>, <4 x i32>, <4 x i32>)
declare <8 x i16> @llvm.fshr.v8i16(<8 x i16>, <8 x i16>, <8 x i16>)
declare <16 x i8> @llvm.fshr.v16i8(<16 x i8>, <16 x i8>, <16 x i8>)

;
; Variable Shifts
;

define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt) nounwind {
; SSE2-LABEL: var_funnnel_v2i64:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [63,63]
; SSE2-NEXT: movdqa %xmm2, %xmm4
; SSE2-NEXT: pand %xmm3, %xmm4
; SSE2-NEXT: movdqa %xmm1, %xmm5
; SSE2-NEXT: psrlq %xmm4, %xmm5
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,2,3]
; SSE2-NEXT: psrlq %xmm4, %xmm1
; SSE2-NEXT: shufpd {{.*#+}} xmm5 = xmm5[0],xmm1[1]
; SSE2-NEXT: pandn %xmm3, %xmm2
; SSE2-NEXT: paddq %xmm0, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psllq %xmm2, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
; SSE2-NEXT: psllq %xmm2, %xmm0
; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSE2-NEXT: orpd %xmm5, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: var_funnnel_v2i64:
; SSE41: # %bb.0:
; SSE41-NEXT: pmovsxbq {{.*#+}} xmm3 = [63,63]
; SSE41-NEXT: movdqa %xmm2, %xmm4
; SSE41-NEXT: pand %xmm3, %xmm4
; SSE41-NEXT: movdqa %xmm1, %xmm5
; SSE41-NEXT: psrlq %xmm4, %xmm5
; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,2,3]
; SSE41-NEXT: psrlq %xmm4, %xmm1
; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm1[4,5,6,7]
; SSE41-NEXT: pandn %xmm3, %xmm2
; SSE41-NEXT: paddq %xmm0, %xmm0
; SSE41-NEXT: movdqa %xmm0, %xmm1
; SSE41-NEXT: psllq %xmm2, %xmm1
; SSE41-NEXT:
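; A brief reminder of the semantics under test (per the LangRef definition of
; the funnel-shift intrinsics): fshr concatenates %x (high half) with %y (low
; half), shifts the double-width value right by the per-element amount modulo
; the element width, and returns the low half. An illustrative scalar sketch,
; not part of the autogenerated assertions in this file:
;
;   %r = call i8 @llvm.fshr.i8(i8 171, i8 205, i8 4)
;   ; concat 0xAB with 0xCD = 0xABCD, >> 4 = 0x0ABC, low byte = 0xBC (188)
;
; The lowerings checked here implement the equivalent expansion
;   (%y lshr (%amt & (BW-1))) | ((%x shl 1) shl (~%amt & (BW-1)))
; where BW is the element width, which is why the amounts are masked with
; [63,63], [31,31,31,31], etc. and %x is doubled (padd) before the left shift.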
pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] 65; SSE41-NEXT: psllq %xmm2, %xmm0 66; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] 67; SSE41-NEXT: por %xmm5, %xmm0 68; SSE41-NEXT: retq 69; 70; AVX1-LABEL: var_funnnel_v2i64: 71; AVX1: # %bb.0: 72; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm3 = [63,63] 73; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm4 74; AVX1-NEXT: vpsrlq %xmm4, %xmm1, %xmm5 75; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,3,2,3] 76; AVX1-NEXT: vpsrlq %xmm4, %xmm1, %xmm1 77; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0,1,2,3],xmm1[4,5,6,7] 78; AVX1-NEXT: vpandn %xmm3, %xmm2, %xmm2 79; AVX1-NEXT: vpaddq %xmm0, %xmm0, %xmm0 80; AVX1-NEXT: vpsllq %xmm2, %xmm0, %xmm3 81; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] 82; AVX1-NEXT: vpsllq %xmm2, %xmm0, %xmm0 83; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7] 84; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 85; AVX1-NEXT: retq 86; 87; AVX2-LABEL: var_funnnel_v2i64: 88; AVX2: # %bb.0: 89; AVX2-NEXT: vpmovsxbq {{.*#+}} xmm3 = [63,63] 90; AVX2-NEXT: vpand %xmm3, %xmm2, %xmm4 91; AVX2-NEXT: vpsrlvq %xmm4, %xmm1, %xmm1 92; AVX2-NEXT: vpandn %xmm3, %xmm2, %xmm2 93; AVX2-NEXT: vpaddq %xmm0, %xmm0, %xmm0 94; AVX2-NEXT: vpsllvq %xmm2, %xmm0, %xmm0 95; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 96; AVX2-NEXT: retq 97; 98; AVX512F-LABEL: var_funnnel_v2i64: 99; AVX512F: # %bb.0: 100; AVX512F-NEXT: vpmovsxbq {{.*#+}} xmm3 = [63,63] 101; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm4 102; AVX512F-NEXT: vpsrlvq %xmm4, %xmm1, %xmm1 103; AVX512F-NEXT: vpandn %xmm3, %xmm2, %xmm2 104; AVX512F-NEXT: vpaddq %xmm0, %xmm0, %xmm0 105; AVX512F-NEXT: vpsllvq %xmm2, %xmm0, %xmm0 106; AVX512F-NEXT: vpor %xmm1, %xmm0, %xmm0 107; AVX512F-NEXT: retq 108; 109; AVX512VL-LABEL: var_funnnel_v2i64: 110; AVX512VL: # %bb.0: 111; AVX512VL-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63] 112; AVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm4 113; AVX512VL-NEXT: vpsrlvq %xmm4, %xmm1, %xmm1 114; AVX512VL-NEXT: vpandn %xmm3, %xmm2, %xmm2 115; AVX512VL-NEXT: vpaddq %xmm0, %xmm0, %xmm0 116; AVX512VL-NEXT: vpsllvq %xmm2, %xmm0, %xmm0 117; AVX512VL-NEXT: vpor %xmm1, %xmm0, %xmm0 118; AVX512VL-NEXT: retq 119; 120; AVX512BW-LABEL: var_funnnel_v2i64: 121; AVX512BW: # %bb.0: 122; AVX512BW-NEXT: vpmovsxbq {{.*#+}} xmm3 = [63,63] 123; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm4 124; AVX512BW-NEXT: vpsrlvq %xmm4, %xmm1, %xmm1 125; AVX512BW-NEXT: vpandn %xmm3, %xmm2, %xmm2 126; AVX512BW-NEXT: vpaddq %xmm0, %xmm0, %xmm0 127; AVX512BW-NEXT: vpsllvq %xmm2, %xmm0, %xmm0 128; AVX512BW-NEXT: vpor %xmm1, %xmm0, %xmm0 129; AVX512BW-NEXT: retq 130; 131; AVX512VBMI2-LABEL: var_funnnel_v2i64: 132; AVX512VBMI2: # %bb.0: 133; AVX512VBMI2-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2 134; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 135; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 136; AVX512VBMI2-NEXT: vpshrdvq %zmm2, %zmm0, %zmm1 137; AVX512VBMI2-NEXT: vmovdqa %xmm1, %xmm0 138; AVX512VBMI2-NEXT: vzeroupper 139; AVX512VBMI2-NEXT: retq 140; 141; AVX512VLBW-LABEL: var_funnnel_v2i64: 142; AVX512VLBW: # %bb.0: 143; AVX512VLBW-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63] 144; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm4 145; AVX512VLBW-NEXT: vpsrlvq %xmm4, %xmm1, %xmm1 146; AVX512VLBW-NEXT: vpandn %xmm3, %xmm2, %xmm2 147; AVX512VLBW-NEXT: vpaddq %xmm0, %xmm0, %xmm0 148; AVX512VLBW-NEXT: vpsllvq %xmm2, %xmm0, %xmm0 149; AVX512VLBW-NEXT: vpor %xmm1, %xmm0, %xmm0 150; AVX512VLBW-NEXT: retq 151; 152; AVX512VLVBMI2-LABEL: var_funnnel_v2i64: 153; AVX512VLVBMI2: # %bb.0: 154; AVX512VLVBMI2-NEXT: vpshrdvq %xmm2, %xmm0, 
%xmm1 155; AVX512VLVBMI2-NEXT: vmovdqa %xmm1, %xmm0 156; AVX512VLVBMI2-NEXT: retq 157; 158; XOPAVX1-LABEL: var_funnnel_v2i64: 159; XOPAVX1: # %bb.0: 160; XOPAVX1-NEXT: vpmovsxbq {{.*#+}} xmm3 = [63,63] 161; XOPAVX1-NEXT: vpandn %xmm3, %xmm2, %xmm4 162; XOPAVX1-NEXT: vpaddq %xmm0, %xmm0, %xmm0 163; XOPAVX1-NEXT: vpshlq %xmm4, %xmm0, %xmm0 164; XOPAVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 165; XOPAVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 166; XOPAVX1-NEXT: vpsubq %xmm2, %xmm3, %xmm2 167; XOPAVX1-NEXT: vpshlq %xmm2, %xmm1, %xmm1 168; XOPAVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 169; XOPAVX1-NEXT: retq 170; 171; XOPAVX2-LABEL: var_funnnel_v2i64: 172; XOPAVX2: # %bb.0: 173; XOPAVX2-NEXT: vpmovsxbq {{.*#+}} xmm3 = [63,63] 174; XOPAVX2-NEXT: vpand %xmm3, %xmm2, %xmm4 175; XOPAVX2-NEXT: vpsrlvq %xmm4, %xmm1, %xmm1 176; XOPAVX2-NEXT: vpandn %xmm3, %xmm2, %xmm2 177; XOPAVX2-NEXT: vpaddq %xmm0, %xmm0, %xmm0 178; XOPAVX2-NEXT: vpsllvq %xmm2, %xmm0, %xmm0 179; XOPAVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 180; XOPAVX2-NEXT: retq 181; 182; X86-SSE2-LABEL: var_funnnel_v2i64: 183; X86-SSE2: # %bb.0: 184; X86-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [63,0,63,0] 185; X86-SSE2-NEXT: movdqa %xmm2, %xmm5 186; X86-SSE2-NEXT: pand %xmm4, %xmm5 187; X86-SSE2-NEXT: movdqa %xmm1, %xmm3 188; X86-SSE2-NEXT: psrlq %xmm5, %xmm3 189; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,3,2,3] 190; X86-SSE2-NEXT: psrlq %xmm5, %xmm1 191; X86-SSE2-NEXT: shufpd {{.*#+}} xmm3 = xmm3[0],xmm1[1] 192; X86-SSE2-NEXT: pandn %xmm4, %xmm2 193; X86-SSE2-NEXT: paddq %xmm0, %xmm0 194; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 195; X86-SSE2-NEXT: psllq %xmm2, %xmm1 196; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] 197; X86-SSE2-NEXT: psllq %xmm2, %xmm0 198; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] 199; X86-SSE2-NEXT: orpd %xmm3, %xmm0 200; X86-SSE2-NEXT: retl 201 %res = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt) 202 ret <2 x i64> %res 203} 204 205define <4 x i32> @var_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %amt) nounwind { 206; SSE2-LABEL: var_funnnel_v4i32: 207; SSE2: # %bb.0: 208; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [31,31,31,31] 209; SSE2-NEXT: movdqa %xmm2, %xmm5 210; SSE2-NEXT: pand %xmm4, %xmm5 211; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm5[2,3,3,3,4,5,6,7] 212; SSE2-NEXT: movdqa %xmm1, %xmm6 213; SSE2-NEXT: psrld %xmm3, %xmm6 214; SSE2-NEXT: pshuflw {{.*#+}} xmm7 = xmm5[0,1,1,1,4,5,6,7] 215; SSE2-NEXT: movdqa %xmm1, %xmm3 216; SSE2-NEXT: psrld %xmm7, %xmm3 217; SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm6[0] 218; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,3,2,3] 219; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm5[2,3,3,3,4,5,6,7] 220; SSE2-NEXT: movdqa %xmm1, %xmm7 221; SSE2-NEXT: psrld %xmm6, %xmm7 222; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,1,1,4,5,6,7] 223; SSE2-NEXT: psrld %xmm5, %xmm1 224; SSE2-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm7[1] 225; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm1[0,3] 226; SSE2-NEXT: pandn %xmm4, %xmm2 227; SSE2-NEXT: pslld $23, %xmm2 228; SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 229; SSE2-NEXT: cvttps2dq %xmm2, %xmm1 230; SSE2-NEXT: paddd %xmm0, %xmm0 231; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] 232; SSE2-NEXT: pmuludq %xmm1, %xmm0 233; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 234; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] 235; SSE2-NEXT: pmuludq %xmm2, %xmm1 236; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 237; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 238; SSE2-NEXT: por %xmm3, %xmm0 239; SSE2-NEXT: retq 
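; The SSE2 expansion above has no variable per-lane 32-bit shift, so the left
; shift by (~%amt & 31) is done as a multiply: pslld $23 moves each amount into
; the float exponent position, paddd with 1065353216 (0x3F800000, the bias for
; 1.0f) completes the exponent, and cvttps2dq converts back to integer, giving
; 2^amt per lane, which is then applied with pmuludq/pshufd (pmulld on SSE4.1).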
240; 241; SSE41-LABEL: var_funnnel_v4i32: 242; SSE41: # %bb.0: 243; SSE41-NEXT: pmovsxbd {{.*#+}} xmm3 = [31,31,31,31] 244; SSE41-NEXT: movdqa %xmm2, %xmm4 245; SSE41-NEXT: pand %xmm3, %xmm4 246; SSE41-NEXT: pshuflw {{.*#+}} xmm5 = xmm4[2,3,3,3,4,5,6,7] 247; SSE41-NEXT: movdqa %xmm1, %xmm6 248; SSE41-NEXT: psrld %xmm5, %xmm6 249; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm4[2,3,2,3] 250; SSE41-NEXT: pshuflw {{.*#+}} xmm7 = xmm5[2,3,3,3,4,5,6,7] 251; SSE41-NEXT: movdqa %xmm1, %xmm8 252; SSE41-NEXT: psrld %xmm7, %xmm8 253; SSE41-NEXT: pblendw {{.*#+}} xmm8 = xmm6[0,1,2,3],xmm8[4,5,6,7] 254; SSE41-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,1,1,4,5,6,7] 255; SSE41-NEXT: movdqa %xmm1, %xmm6 256; SSE41-NEXT: psrld %xmm4, %xmm6 257; SSE41-NEXT: pshuflw {{.*#+}} xmm4 = xmm5[0,1,1,1,4,5,6,7] 258; SSE41-NEXT: psrld %xmm4, %xmm1 259; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm1[4,5,6,7] 260; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1],xmm8[2,3],xmm6[4,5],xmm8[6,7] 261; SSE41-NEXT: pandn %xmm3, %xmm2 262; SSE41-NEXT: pslld $23, %xmm2 263; SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 264; SSE41-NEXT: cvttps2dq %xmm2, %xmm1 265; SSE41-NEXT: paddd %xmm0, %xmm0 266; SSE41-NEXT: pmulld %xmm1, %xmm0 267; SSE41-NEXT: por %xmm6, %xmm0 268; SSE41-NEXT: retq 269; 270; AVX1-LABEL: var_funnnel_v4i32: 271; AVX1: # %bb.0: 272; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [31,31,31,31] 273; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm4 274; AVX1-NEXT: vpsrldq {{.*#+}} xmm5 = xmm4[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 275; AVX1-NEXT: vpsrld %xmm5, %xmm1, %xmm5 276; AVX1-NEXT: vpsrlq $32, %xmm4, %xmm6 277; AVX1-NEXT: vpsrld %xmm6, %xmm1, %xmm6 278; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3],xmm5[4,5,6,7] 279; AVX1-NEXT: vpxor %xmm6, %xmm6, %xmm6 280; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm4[2],xmm6[2],xmm4[3],xmm6[3] 281; AVX1-NEXT: vpsrld %xmm6, %xmm1, %xmm6 282; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero 283; AVX1-NEXT: vpsrld %xmm4, %xmm1, %xmm1 284; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm6[4,5,6,7] 285; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm5[2,3],xmm1[4,5],xmm5[6,7] 286; AVX1-NEXT: vpandn %xmm3, %xmm2, %xmm2 287; AVX1-NEXT: vpslld $23, %xmm2, %xmm2 288; AVX1-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 289; AVX1-NEXT: vcvttps2dq %xmm2, %xmm2 290; AVX1-NEXT: vpaddd %xmm0, %xmm0, %xmm0 291; AVX1-NEXT: vpmulld %xmm2, %xmm0, %xmm0 292; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 293; AVX1-NEXT: retq 294; 295; AVX2-LABEL: var_funnnel_v4i32: 296; AVX2: # %bb.0: 297; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31] 298; AVX2-NEXT: vpand %xmm3, %xmm2, %xmm4 299; AVX2-NEXT: vpsrlvd %xmm4, %xmm1, %xmm1 300; AVX2-NEXT: vpandn %xmm3, %xmm2, %xmm2 301; AVX2-NEXT: vpaddd %xmm0, %xmm0, %xmm0 302; AVX2-NEXT: vpsllvd %xmm2, %xmm0, %xmm0 303; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 304; AVX2-NEXT: retq 305; 306; AVX512F-LABEL: var_funnnel_v4i32: 307; AVX512F: # %bb.0: 308; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31] 309; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm4 310; AVX512F-NEXT: vpsrlvd %xmm4, %xmm1, %xmm1 311; AVX512F-NEXT: vpandn %xmm3, %xmm2, %xmm2 312; AVX512F-NEXT: vpaddd %xmm0, %xmm0, %xmm0 313; AVX512F-NEXT: vpsllvd %xmm2, %xmm0, %xmm0 314; AVX512F-NEXT: vpor %xmm1, %xmm0, %xmm0 315; AVX512F-NEXT: retq 316; 317; AVX512VL-LABEL: var_funnnel_v4i32: 318; AVX512VL: # %bb.0: 319; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31] 320; AVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm4 321; AVX512VL-NEXT: vpsrlvd %xmm4, %xmm1, 
%xmm1 322; AVX512VL-NEXT: vpandn %xmm3, %xmm2, %xmm2 323; AVX512VL-NEXT: vpaddd %xmm0, %xmm0, %xmm0 324; AVX512VL-NEXT: vpsllvd %xmm2, %xmm0, %xmm0 325; AVX512VL-NEXT: vpor %xmm1, %xmm0, %xmm0 326; AVX512VL-NEXT: retq 327; 328; AVX512BW-LABEL: var_funnnel_v4i32: 329; AVX512BW: # %bb.0: 330; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31] 331; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm4 332; AVX512BW-NEXT: vpsrlvd %xmm4, %xmm1, %xmm1 333; AVX512BW-NEXT: vpandn %xmm3, %xmm2, %xmm2 334; AVX512BW-NEXT: vpaddd %xmm0, %xmm0, %xmm0 335; AVX512BW-NEXT: vpsllvd %xmm2, %xmm0, %xmm0 336; AVX512BW-NEXT: vpor %xmm1, %xmm0, %xmm0 337; AVX512BW-NEXT: retq 338; 339; AVX512VBMI2-LABEL: var_funnnel_v4i32: 340; AVX512VBMI2: # %bb.0: 341; AVX512VBMI2-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2 342; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 343; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 344; AVX512VBMI2-NEXT: vpshrdvd %zmm2, %zmm0, %zmm1 345; AVX512VBMI2-NEXT: vmovdqa %xmm1, %xmm0 346; AVX512VBMI2-NEXT: vzeroupper 347; AVX512VBMI2-NEXT: retq 348; 349; AVX512VLBW-LABEL: var_funnnel_v4i32: 350; AVX512VLBW: # %bb.0: 351; AVX512VLBW-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31] 352; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm4 353; AVX512VLBW-NEXT: vpsrlvd %xmm4, %xmm1, %xmm1 354; AVX512VLBW-NEXT: vpandn %xmm3, %xmm2, %xmm2 355; AVX512VLBW-NEXT: vpaddd %xmm0, %xmm0, %xmm0 356; AVX512VLBW-NEXT: vpsllvd %xmm2, %xmm0, %xmm0 357; AVX512VLBW-NEXT: vpor %xmm1, %xmm0, %xmm0 358; AVX512VLBW-NEXT: retq 359; 360; AVX512VLVBMI2-LABEL: var_funnnel_v4i32: 361; AVX512VLVBMI2: # %bb.0: 362; AVX512VLVBMI2-NEXT: vpshrdvd %xmm2, %xmm0, %xmm1 363; AVX512VLVBMI2-NEXT: vmovdqa %xmm1, %xmm0 364; AVX512VLVBMI2-NEXT: retq 365; 366; XOPAVX1-LABEL: var_funnnel_v4i32: 367; XOPAVX1: # %bb.0: 368; XOPAVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [31,31,31,31] 369; XOPAVX1-NEXT: vpandn %xmm3, %xmm2, %xmm4 370; XOPAVX1-NEXT: vpaddd %xmm0, %xmm0, %xmm0 371; XOPAVX1-NEXT: vpshld %xmm4, %xmm0, %xmm0 372; XOPAVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 373; XOPAVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 374; XOPAVX1-NEXT: vpsubd %xmm2, %xmm3, %xmm2 375; XOPAVX1-NEXT: vpshld %xmm2, %xmm1, %xmm1 376; XOPAVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 377; XOPAVX1-NEXT: retq 378; 379; XOPAVX2-LABEL: var_funnnel_v4i32: 380; XOPAVX2: # %bb.0: 381; XOPAVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31] 382; XOPAVX2-NEXT: vpand %xmm3, %xmm2, %xmm4 383; XOPAVX2-NEXT: vpsrlvd %xmm4, %xmm1, %xmm1 384; XOPAVX2-NEXT: vpandn %xmm3, %xmm2, %xmm2 385; XOPAVX2-NEXT: vpaddd %xmm0, %xmm0, %xmm0 386; XOPAVX2-NEXT: vpsllvd %xmm2, %xmm0, %xmm0 387; XOPAVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 388; XOPAVX2-NEXT: retq 389; 390; X86-SSE2-LABEL: var_funnnel_v4i32: 391; X86-SSE2: # %bb.0: 392; X86-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [31,31,31,31] 393; X86-SSE2-NEXT: movdqa %xmm2, %xmm5 394; X86-SSE2-NEXT: pand %xmm4, %xmm5 395; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm5[2,3,3,3,4,5,6,7] 396; X86-SSE2-NEXT: movdqa %xmm1, %xmm6 397; X86-SSE2-NEXT: psrld %xmm3, %xmm6 398; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm7 = xmm5[0,1,1,1,4,5,6,7] 399; X86-SSE2-NEXT: movdqa %xmm1, %xmm3 400; X86-SSE2-NEXT: psrld %xmm7, %xmm3 401; X86-SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm6[0] 402; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,3,2,3] 403; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm5[2,3,3,3,4,5,6,7] 404; X86-SSE2-NEXT: movdqa %xmm1, %xmm7 405; X86-SSE2-NEXT: psrld %xmm6, %xmm7 406; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,1,1,4,5,6,7] 407; X86-SSE2-NEXT: psrld %xmm5, 
%xmm1 408; X86-SSE2-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm7[1] 409; X86-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm1[0,3] 410; X86-SSE2-NEXT: pandn %xmm4, %xmm2 411; X86-SSE2-NEXT: pslld $23, %xmm2 412; X86-SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2 413; X86-SSE2-NEXT: cvttps2dq %xmm2, %xmm1 414; X86-SSE2-NEXT: paddd %xmm0, %xmm0 415; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] 416; X86-SSE2-NEXT: pmuludq %xmm1, %xmm0 417; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 418; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] 419; X86-SSE2-NEXT: pmuludq %xmm2, %xmm1 420; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 421; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 422; X86-SSE2-NEXT: por %xmm3, %xmm0 423; X86-SSE2-NEXT: retl 424 %res = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %amt) 425 ret <4 x i32> %res 426} 427 428define <8 x i16> @var_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %amt) nounwind { 429; SSE2-LABEL: var_funnnel_v8i16: 430; SSE2: # %bb.0: 431; SSE2-NEXT: movdqa %xmm2, %xmm4 432; SSE2-NEXT: psllw $12, %xmm4 433; SSE2-NEXT: movdqa %xmm4, %xmm3 434; SSE2-NEXT: psraw $15, %xmm3 435; SSE2-NEXT: movdqa %xmm3, %xmm5 436; SSE2-NEXT: pandn %xmm1, %xmm5 437; SSE2-NEXT: psrlw $8, %xmm1 438; SSE2-NEXT: pand %xmm1, %xmm3 439; SSE2-NEXT: por %xmm5, %xmm3 440; SSE2-NEXT: paddw %xmm4, %xmm4 441; SSE2-NEXT: movdqa %xmm4, %xmm1 442; SSE2-NEXT: psraw $15, %xmm1 443; SSE2-NEXT: movdqa %xmm1, %xmm5 444; SSE2-NEXT: pandn %xmm3, %xmm5 445; SSE2-NEXT: psrlw $4, %xmm3 446; SSE2-NEXT: pand %xmm1, %xmm3 447; SSE2-NEXT: por %xmm5, %xmm3 448; SSE2-NEXT: paddw %xmm4, %xmm4 449; SSE2-NEXT: movdqa %xmm4, %xmm1 450; SSE2-NEXT: psraw $15, %xmm1 451; SSE2-NEXT: movdqa %xmm1, %xmm5 452; SSE2-NEXT: pandn %xmm3, %xmm5 453; SSE2-NEXT: psrlw $2, %xmm3 454; SSE2-NEXT: pand %xmm1, %xmm3 455; SSE2-NEXT: por %xmm5, %xmm3 456; SSE2-NEXT: paddw %xmm4, %xmm4 457; SSE2-NEXT: psraw $15, %xmm4 458; SSE2-NEXT: movdqa %xmm4, %xmm1 459; SSE2-NEXT: pandn %xmm3, %xmm1 460; SSE2-NEXT: psrlw $1, %xmm3 461; SSE2-NEXT: pand %xmm4, %xmm3 462; SSE2-NEXT: por %xmm1, %xmm3 463; SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 464; SSE2-NEXT: movdqa %xmm2, %xmm1 465; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] 466; SSE2-NEXT: pslld $23, %xmm1 467; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216] 468; SSE2-NEXT: paddd %xmm4, %xmm1 469; SSE2-NEXT: cvttps2dq %xmm1, %xmm1 470; SSE2-NEXT: pslld $16, %xmm1 471; SSE2-NEXT: psrad $16, %xmm1 472; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3] 473; SSE2-NEXT: pslld $23, %xmm2 474; SSE2-NEXT: paddd %xmm4, %xmm2 475; SSE2-NEXT: cvttps2dq %xmm2, %xmm2 476; SSE2-NEXT: pslld $16, %xmm2 477; SSE2-NEXT: psrad $16, %xmm2 478; SSE2-NEXT: packssdw %xmm1, %xmm2 479; SSE2-NEXT: paddw %xmm0, %xmm0 480; SSE2-NEXT: pmullw %xmm2, %xmm0 481; SSE2-NEXT: por %xmm3, %xmm0 482; SSE2-NEXT: retq 483; 484; SSE41-LABEL: var_funnnel_v8i16: 485; SSE41: # %bb.0: 486; SSE41-NEXT: movdqa %xmm0, %xmm3 487; SSE41-NEXT: pmovsxbw {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15] 488; SSE41-NEXT: movdqa %xmm2, %xmm4 489; SSE41-NEXT: pand %xmm5, %xmm4 490; SSE41-NEXT: psllw $4, %xmm4 491; SSE41-NEXT: movdqa %xmm2, %xmm0 492; SSE41-NEXT: psllw $12, %xmm0 493; SSE41-NEXT: por %xmm4, %xmm0 494; SSE41-NEXT: movdqa %xmm0, %xmm4 495; SSE41-NEXT: paddw %xmm0, %xmm4 496; SSE41-NEXT: movdqa %xmm1, %xmm6 497; SSE41-NEXT: psrlw $8, %xmm6 498; SSE41-NEXT: pblendvb %xmm0, %xmm6, %xmm1 499; 
SSE41-NEXT: movdqa %xmm1, %xmm6 500; SSE41-NEXT: psrlw $4, %xmm6 501; SSE41-NEXT: movdqa %xmm4, %xmm0 502; SSE41-NEXT: pblendvb %xmm0, %xmm6, %xmm1 503; SSE41-NEXT: movdqa %xmm1, %xmm6 504; SSE41-NEXT: psrlw $2, %xmm6 505; SSE41-NEXT: paddw %xmm4, %xmm4 506; SSE41-NEXT: movdqa %xmm4, %xmm0 507; SSE41-NEXT: pblendvb %xmm0, %xmm6, %xmm1 508; SSE41-NEXT: movdqa %xmm1, %xmm6 509; SSE41-NEXT: psrlw $1, %xmm6 510; SSE41-NEXT: paddw %xmm4, %xmm4 511; SSE41-NEXT: movdqa %xmm4, %xmm0 512; SSE41-NEXT: pblendvb %xmm0, %xmm6, %xmm1 513; SSE41-NEXT: pandn %xmm5, %xmm2 514; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero 515; SSE41-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7] 516; SSE41-NEXT: pslld $23, %xmm2 517; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216] 518; SSE41-NEXT: paddd %xmm4, %xmm2 519; SSE41-NEXT: cvttps2dq %xmm2, %xmm2 520; SSE41-NEXT: pslld $23, %xmm0 521; SSE41-NEXT: paddd %xmm4, %xmm0 522; SSE41-NEXT: cvttps2dq %xmm0, %xmm0 523; SSE41-NEXT: packusdw %xmm2, %xmm0 524; SSE41-NEXT: paddw %xmm3, %xmm3 525; SSE41-NEXT: pmullw %xmm0, %xmm3 526; SSE41-NEXT: por %xmm1, %xmm3 527; SSE41-NEXT: movdqa %xmm3, %xmm0 528; SSE41-NEXT: retq 529; 530; AVX1-LABEL: var_funnnel_v8i16: 531; AVX1: # %bb.0: 532; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15] 533; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm4 534; AVX1-NEXT: vpsllw $4, %xmm4, %xmm4 535; AVX1-NEXT: vpsllw $12, %xmm2, %xmm5 536; AVX1-NEXT: vpor %xmm5, %xmm4, %xmm4 537; AVX1-NEXT: vpaddw %xmm4, %xmm4, %xmm5 538; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm6 539; AVX1-NEXT: vpblendvb %xmm4, %xmm6, %xmm1, %xmm1 540; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm4 541; AVX1-NEXT: vpblendvb %xmm5, %xmm4, %xmm1, %xmm1 542; AVX1-NEXT: vpsrlw $2, %xmm1, %xmm4 543; AVX1-NEXT: vpaddw %xmm5, %xmm5, %xmm5 544; AVX1-NEXT: vpblendvb %xmm5, %xmm4, %xmm1, %xmm1 545; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm4 546; AVX1-NEXT: vpaddw %xmm5, %xmm5, %xmm5 547; AVX1-NEXT: vpblendvb %xmm5, %xmm4, %xmm1, %xmm1 548; AVX1-NEXT: vpandn %xmm3, %xmm2, %xmm2 549; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm2[4,4,5,5,6,6,7,7] 550; AVX1-NEXT: vpslld $23, %xmm3, %xmm3 551; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216] 552; AVX1-NEXT: vpaddd %xmm4, %xmm3, %xmm3 553; AVX1-NEXT: vcvttps2dq %xmm3, %xmm3 554; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero 555; AVX1-NEXT: vpslld $23, %xmm2, %xmm2 556; AVX1-NEXT: vpaddd %xmm4, %xmm2, %xmm2 557; AVX1-NEXT: vcvttps2dq %xmm2, %xmm2 558; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 559; AVX1-NEXT: vpaddw %xmm0, %xmm0, %xmm0 560; AVX1-NEXT: vpmullw %xmm2, %xmm0, %xmm0 561; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 562; AVX1-NEXT: retq 563; 564; AVX2-LABEL: var_funnnel_v8i16: 565; AVX2: # %bb.0: 566; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero 567; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 568; AVX2-NEXT: vpslld $16, %ymm0, %ymm0 569; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7],ymm1[8],ymm0[9],ymm1[10],ymm0[11],ymm1[12],ymm0[13],ymm1[14],ymm0[15] 570; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm1 571; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero 572; 
AVX2-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0 573; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u] 574; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 575; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 576; AVX2-NEXT: vzeroupper 577; AVX2-NEXT: retq 578; 579; AVX512F-LABEL: var_funnnel_v8i16: 580; AVX512F: # %bb.0: 581; AVX512F-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero 582; AVX512F-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 583; AVX512F-NEXT: vpslld $16, %ymm0, %ymm0 584; AVX512F-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7],ymm1[8],ymm0[9],ymm1[10],ymm0[11],ymm1[12],ymm0[13],ymm1[14],ymm0[15] 585; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm1 586; AVX512F-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero 587; AVX512F-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0 588; AVX512F-NEXT: vpmovdw %zmm0, %ymm0 589; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 590; AVX512F-NEXT: vzeroupper 591; AVX512F-NEXT: retq 592; 593; AVX512VL-LABEL: var_funnnel_v8i16: 594; AVX512VL: # %bb.0: 595; AVX512VL-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero 596; AVX512VL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 597; AVX512VL-NEXT: vpslld $16, %ymm0, %ymm0 598; AVX512VL-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7],ymm1[8],ymm0[9],ymm1[10],ymm0[11],ymm1[12],ymm0[13],ymm1[14],ymm0[15] 599; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm2, %xmm1 600; AVX512VL-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero 601; AVX512VL-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0 602; AVX512VL-NEXT: vpmovdw %ymm0, %xmm0 603; AVX512VL-NEXT: vzeroupper 604; AVX512VL-NEXT: retq 605; 606; AVX512BW-LABEL: var_funnnel_v8i16: 607; AVX512BW: # %bb.0: 608; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 609; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15] 610; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm4 611; AVX512BW-NEXT: vpsrlvw %zmm4, %zmm1, %zmm1 612; AVX512BW-NEXT: vpandn %xmm3, %xmm2, %xmm2 613; AVX512BW-NEXT: vpaddw %xmm0, %xmm0, %xmm0 614; AVX512BW-NEXT: vpsllvw %zmm2, %zmm0, %zmm0 615; AVX512BW-NEXT: vpor %xmm1, %xmm0, %xmm0 616; AVX512BW-NEXT: vzeroupper 617; AVX512BW-NEXT: retq 618; 619; AVX512VBMI2-LABEL: var_funnnel_v8i16: 620; AVX512VBMI2: # %bb.0: 621; AVX512VBMI2-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2 622; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 623; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 624; AVX512VBMI2-NEXT: vpshrdvw %zmm2, %zmm0, %zmm1 625; AVX512VBMI2-NEXT: vmovdqa %xmm1, %xmm0 626; AVX512VBMI2-NEXT: vzeroupper 627; AVX512VBMI2-NEXT: retq 628; 629; AVX512VLBW-LABEL: var_funnnel_v8i16: 630; AVX512VLBW: # %bb.0: 631; AVX512VLBW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15] 632; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm4 633; AVX512VLBW-NEXT: vpsrlvw %xmm4, %xmm1, %xmm1 634; AVX512VLBW-NEXT: vpandn %xmm3, %xmm2, 
%xmm2 635; AVX512VLBW-NEXT: vpaddw %xmm0, %xmm0, %xmm0 636; AVX512VLBW-NEXT: vpsllvw %xmm2, %xmm0, %xmm0 637; AVX512VLBW-NEXT: vpor %xmm1, %xmm0, %xmm0 638; AVX512VLBW-NEXT: retq 639; 640; AVX512VLVBMI2-LABEL: var_funnnel_v8i16: 641; AVX512VLVBMI2: # %bb.0: 642; AVX512VLVBMI2-NEXT: vpshrdvw %xmm2, %xmm0, %xmm1 643; AVX512VLVBMI2-NEXT: vmovdqa %xmm1, %xmm0 644; AVX512VLVBMI2-NEXT: retq 645; 646; XOPAVX1-LABEL: var_funnnel_v8i16: 647; XOPAVX1: # %bb.0: 648; XOPAVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15] 649; XOPAVX1-NEXT: vpandn %xmm3, %xmm2, %xmm4 650; XOPAVX1-NEXT: vpaddw %xmm0, %xmm0, %xmm0 651; XOPAVX1-NEXT: vpshlw %xmm4, %xmm0, %xmm0 652; XOPAVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 653; XOPAVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 654; XOPAVX1-NEXT: vpsubw %xmm2, %xmm3, %xmm2 655; XOPAVX1-NEXT: vpshlw %xmm2, %xmm1, %xmm1 656; XOPAVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 657; XOPAVX1-NEXT: retq 658; 659; XOPAVX2-LABEL: var_funnnel_v8i16: 660; XOPAVX2: # %bb.0: 661; XOPAVX2-NEXT: vpbroadcastw {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15] 662; XOPAVX2-NEXT: vpandn %xmm3, %xmm2, %xmm4 663; XOPAVX2-NEXT: vpaddw %xmm0, %xmm0, %xmm0 664; XOPAVX2-NEXT: vpshlw %xmm4, %xmm0, %xmm0 665; XOPAVX2-NEXT: vpand %xmm3, %xmm2, %xmm2 666; XOPAVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 667; XOPAVX2-NEXT: vpsubw %xmm2, %xmm3, %xmm2 668; XOPAVX2-NEXT: vpshlw %xmm2, %xmm1, %xmm1 669; XOPAVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 670; XOPAVX2-NEXT: retq 671; 672; X86-SSE2-LABEL: var_funnnel_v8i16: 673; X86-SSE2: # %bb.0: 674; X86-SSE2-NEXT: movdqa %xmm2, %xmm4 675; X86-SSE2-NEXT: psllw $12, %xmm4 676; X86-SSE2-NEXT: movdqa %xmm4, %xmm3 677; X86-SSE2-NEXT: psraw $15, %xmm3 678; X86-SSE2-NEXT: movdqa %xmm3, %xmm5 679; X86-SSE2-NEXT: pandn %xmm1, %xmm5 680; X86-SSE2-NEXT: psrlw $8, %xmm1 681; X86-SSE2-NEXT: pand %xmm1, %xmm3 682; X86-SSE2-NEXT: por %xmm5, %xmm3 683; X86-SSE2-NEXT: paddw %xmm4, %xmm4 684; X86-SSE2-NEXT: movdqa %xmm4, %xmm1 685; X86-SSE2-NEXT: psraw $15, %xmm1 686; X86-SSE2-NEXT: movdqa %xmm1, %xmm5 687; X86-SSE2-NEXT: pandn %xmm3, %xmm5 688; X86-SSE2-NEXT: psrlw $4, %xmm3 689; X86-SSE2-NEXT: pand %xmm1, %xmm3 690; X86-SSE2-NEXT: por %xmm5, %xmm3 691; X86-SSE2-NEXT: paddw %xmm4, %xmm4 692; X86-SSE2-NEXT: movdqa %xmm4, %xmm1 693; X86-SSE2-NEXT: psraw $15, %xmm1 694; X86-SSE2-NEXT: movdqa %xmm1, %xmm5 695; X86-SSE2-NEXT: pandn %xmm3, %xmm5 696; X86-SSE2-NEXT: psrlw $2, %xmm3 697; X86-SSE2-NEXT: pand %xmm1, %xmm3 698; X86-SSE2-NEXT: por %xmm5, %xmm3 699; X86-SSE2-NEXT: paddw %xmm4, %xmm4 700; X86-SSE2-NEXT: psraw $15, %xmm4 701; X86-SSE2-NEXT: movdqa %xmm4, %xmm1 702; X86-SSE2-NEXT: pandn %xmm3, %xmm1 703; X86-SSE2-NEXT: psrlw $1, %xmm3 704; X86-SSE2-NEXT: pand %xmm4, %xmm3 705; X86-SSE2-NEXT: por %xmm1, %xmm3 706; X86-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2 707; X86-SSE2-NEXT: movdqa %xmm2, %xmm1 708; X86-SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] 709; X86-SSE2-NEXT: pslld $23, %xmm1 710; X86-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216] 711; X86-SSE2-NEXT: paddd %xmm4, %xmm1 712; X86-SSE2-NEXT: cvttps2dq %xmm1, %xmm1 713; X86-SSE2-NEXT: pslld $16, %xmm1 714; X86-SSE2-NEXT: psrad $16, %xmm1 715; X86-SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3] 716; X86-SSE2-NEXT: pslld $23, %xmm2 717; X86-SSE2-NEXT: paddd %xmm4, %xmm2 718; X86-SSE2-NEXT: cvttps2dq %xmm2, %xmm2 719; X86-SSE2-NEXT: pslld $16, %xmm2 720; X86-SSE2-NEXT: psrad $16, %xmm2 721; X86-SSE2-NEXT: packssdw %xmm1, %xmm2 722; X86-SSE2-NEXT: paddw %xmm0, %xmm0 723; X86-SSE2-NEXT: pmullw 
%xmm2, %xmm0 724; X86-SSE2-NEXT: por %xmm3, %xmm0 725; X86-SSE2-NEXT: retl 726 %res = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %amt) 727 ret <8 x i16> %res 728} 729 730define <16 x i8> @var_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %amt) nounwind { 731; SSE2-LABEL: var_funnnel_v16i8: 732; SSE2: # %bb.0: 733; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] 734; SSE2-NEXT: movdqa %xmm2, %xmm6 735; SSE2-NEXT: pand %xmm5, %xmm6 736; SSE2-NEXT: psllw $5, %xmm6 737; SSE2-NEXT: pxor %xmm4, %xmm4 738; SSE2-NEXT: pxor %xmm3, %xmm3 739; SSE2-NEXT: pcmpgtb %xmm6, %xmm3 740; SSE2-NEXT: movdqa %xmm3, %xmm7 741; SSE2-NEXT: pandn %xmm1, %xmm7 742; SSE2-NEXT: psrlw $4, %xmm1 743; SSE2-NEXT: pand %xmm1, %xmm3 744; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 745; SSE2-NEXT: por %xmm7, %xmm3 746; SSE2-NEXT: paddb %xmm6, %xmm6 747; SSE2-NEXT: pxor %xmm1, %xmm1 748; SSE2-NEXT: pcmpgtb %xmm6, %xmm1 749; SSE2-NEXT: movdqa %xmm1, %xmm7 750; SSE2-NEXT: pandn %xmm3, %xmm7 751; SSE2-NEXT: psrlw $2, %xmm3 752; SSE2-NEXT: pand %xmm1, %xmm3 753; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 754; SSE2-NEXT: por %xmm7, %xmm3 755; SSE2-NEXT: paddb %xmm6, %xmm6 756; SSE2-NEXT: pxor %xmm1, %xmm1 757; SSE2-NEXT: pcmpgtb %xmm6, %xmm1 758; SSE2-NEXT: movdqa %xmm1, %xmm6 759; SSE2-NEXT: pandn %xmm3, %xmm6 760; SSE2-NEXT: psrlw $1, %xmm3 761; SSE2-NEXT: pand %xmm1, %xmm3 762; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 763; SSE2-NEXT: por %xmm6, %xmm3 764; SSE2-NEXT: pandn %xmm5, %xmm2 765; SSE2-NEXT: psllw $5, %xmm2 766; SSE2-NEXT: pxor %xmm1, %xmm1 767; SSE2-NEXT: pcmpgtb %xmm2, %xmm1 768; SSE2-NEXT: paddb %xmm0, %xmm0 769; SSE2-NEXT: movdqa %xmm1, %xmm5 770; SSE2-NEXT: pandn %xmm0, %xmm5 771; SSE2-NEXT: psllw $4, %xmm0 772; SSE2-NEXT: pand %xmm1, %xmm0 773; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 774; SSE2-NEXT: por %xmm5, %xmm0 775; SSE2-NEXT: paddb %xmm2, %xmm2 776; SSE2-NEXT: pxor %xmm1, %xmm1 777; SSE2-NEXT: pcmpgtb %xmm2, %xmm1 778; SSE2-NEXT: movdqa %xmm1, %xmm5 779; SSE2-NEXT: pandn %xmm0, %xmm5 780; SSE2-NEXT: psllw $2, %xmm0 781; SSE2-NEXT: pand %xmm1, %xmm0 782; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 783; SSE2-NEXT: por %xmm5, %xmm0 784; SSE2-NEXT: paddb %xmm2, %xmm2 785; SSE2-NEXT: pcmpgtb %xmm2, %xmm4 786; SSE2-NEXT: movdqa %xmm4, %xmm1 787; SSE2-NEXT: pandn %xmm0, %xmm1 788; SSE2-NEXT: paddb %xmm0, %xmm0 789; SSE2-NEXT: pand %xmm4, %xmm0 790; SSE2-NEXT: por %xmm1, %xmm0 791; SSE2-NEXT: por %xmm3, %xmm0 792; SSE2-NEXT: retq 793; 794; SSE41-LABEL: var_funnnel_v16i8: 795; SSE41: # %bb.0: 796; SSE41-NEXT: movdqa %xmm2, %xmm3 797; SSE41-NEXT: movdqa %xmm0, %xmm2 798; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] 799; SSE41-NEXT: movdqa %xmm3, %xmm0 800; SSE41-NEXT: pand %xmm5, %xmm0 801; SSE41-NEXT: psllw $5, %xmm0 802; SSE41-NEXT: movdqa %xmm0, %xmm4 803; SSE41-NEXT: paddb %xmm0, %xmm4 804; SSE41-NEXT: movdqa %xmm1, %xmm6 805; SSE41-NEXT: psrlw $4, %xmm6 806; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6 807; SSE41-NEXT: pblendvb %xmm0, %xmm6, %xmm1 808; SSE41-NEXT: movdqa %xmm1, %xmm6 809; SSE41-NEXT: psrlw $2, %xmm6 810; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6 811; SSE41-NEXT: movdqa %xmm4, %xmm0 812; SSE41-NEXT: pblendvb %xmm0, %xmm6, %xmm1 813; SSE41-NEXT: movdqa %xmm1, %xmm6 814; SSE41-NEXT: psrlw $1, %xmm6 815; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6 816; SSE41-NEXT: paddb %xmm4, %xmm4 817; SSE41-NEXT: movdqa %xmm4, %xmm0 818; 
SSE41-NEXT: pblendvb %xmm0, %xmm6, %xmm1 819; SSE41-NEXT: pandn %xmm5, %xmm3 820; SSE41-NEXT: psllw $5, %xmm3 821; SSE41-NEXT: movdqa %xmm3, %xmm4 822; SSE41-NEXT: paddb %xmm3, %xmm4 823; SSE41-NEXT: paddb %xmm2, %xmm2 824; SSE41-NEXT: movdqa %xmm2, %xmm5 825; SSE41-NEXT: psllw $4, %xmm5 826; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm5 827; SSE41-NEXT: movdqa %xmm3, %xmm0 828; SSE41-NEXT: pblendvb %xmm0, %xmm5, %xmm2 829; SSE41-NEXT: movdqa %xmm2, %xmm3 830; SSE41-NEXT: psllw $2, %xmm3 831; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 832; SSE41-NEXT: movdqa %xmm4, %xmm0 833; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2 834; SSE41-NEXT: movdqa %xmm2, %xmm3 835; SSE41-NEXT: paddb %xmm2, %xmm3 836; SSE41-NEXT: paddb %xmm4, %xmm4 837; SSE41-NEXT: movdqa %xmm4, %xmm0 838; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2 839; SSE41-NEXT: por %xmm1, %xmm2 840; SSE41-NEXT: movdqa %xmm2, %xmm0 841; SSE41-NEXT: retq 842; 843; AVX1-LABEL: var_funnnel_v16i8: 844; AVX1: # %bb.0: 845; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] 846; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm4 847; AVX1-NEXT: vpsllw $5, %xmm4, %xmm4 848; AVX1-NEXT: vpaddb %xmm4, %xmm4, %xmm5 849; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm6 850; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6, %xmm6 851; AVX1-NEXT: vpblendvb %xmm4, %xmm6, %xmm1, %xmm1 852; AVX1-NEXT: vpsrlw $2, %xmm1, %xmm4 853; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4 854; AVX1-NEXT: vpblendvb %xmm5, %xmm4, %xmm1, %xmm1 855; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm4 856; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4 857; AVX1-NEXT: vpaddb %xmm5, %xmm5, %xmm5 858; AVX1-NEXT: vpblendvb %xmm5, %xmm4, %xmm1, %xmm1 859; AVX1-NEXT: vpandn %xmm3, %xmm2, %xmm2 860; AVX1-NEXT: vpsllw $5, %xmm2, %xmm2 861; AVX1-NEXT: vpaddb %xmm2, %xmm2, %xmm3 862; AVX1-NEXT: vpaddb %xmm0, %xmm0, %xmm0 863; AVX1-NEXT: vpsllw $4, %xmm0, %xmm4 864; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4 865; AVX1-NEXT: vpblendvb %xmm2, %xmm4, %xmm0, %xmm0 866; AVX1-NEXT: vpsllw $2, %xmm0, %xmm2 867; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 868; AVX1-NEXT: vpblendvb %xmm3, %xmm2, %xmm0, %xmm0 869; AVX1-NEXT: vpaddb %xmm0, %xmm0, %xmm2 870; AVX1-NEXT: vpaddb %xmm3, %xmm3, %xmm3 871; AVX1-NEXT: vpblendvb %xmm3, %xmm2, %xmm0, %xmm0 872; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 873; AVX1-NEXT: retq 874; 875; AVX2-LABEL: var_funnnel_v16i8: 876; AVX2: # %bb.0: 877; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] 878; AVX2-NEXT: vpand %xmm3, %xmm2, %xmm4 879; AVX2-NEXT: vpsllw $5, %xmm4, %xmm4 880; AVX2-NEXT: vpaddb %xmm4, %xmm4, %xmm5 881; AVX2-NEXT: vpsrlw $4, %xmm1, %xmm6 882; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6, %xmm6 883; AVX2-NEXT: vpblendvb %xmm4, %xmm6, %xmm1, %xmm1 884; AVX2-NEXT: vpsrlw $2, %xmm1, %xmm4 885; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4 886; AVX2-NEXT: vpblendvb %xmm5, %xmm4, %xmm1, %xmm1 887; AVX2-NEXT: vpsrlw $1, %xmm1, %xmm4 888; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4 889; AVX2-NEXT: vpaddb %xmm5, %xmm5, %xmm5 890; AVX2-NEXT: vpblendvb %xmm5, %xmm4, %xmm1, %xmm1 891; AVX2-NEXT: vpandn %xmm3, %xmm2, %xmm2 892; AVX2-NEXT: vpsllw $5, %xmm2, %xmm2 893; AVX2-NEXT: vpaddb %xmm2, %xmm2, %xmm3 894; AVX2-NEXT: vpaddb %xmm0, %xmm0, %xmm0 895; AVX2-NEXT: vpsllw $4, %xmm0, %xmm4 896; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4 897; AVX2-NEXT: vpblendvb %xmm2, %xmm4, %xmm0, %xmm0 898; AVX2-NEXT: vpsllw $2, 
%xmm0, %xmm2 899; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 900; AVX2-NEXT: vpblendvb %xmm3, %xmm2, %xmm0, %xmm0 901; AVX2-NEXT: vpaddb %xmm0, %xmm0, %xmm2 902; AVX2-NEXT: vpaddb %xmm3, %xmm3, %xmm3 903; AVX2-NEXT: vpblendvb %xmm3, %xmm2, %xmm0, %xmm0 904; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 905; AVX2-NEXT: retq 906; 907; AVX512F-LABEL: var_funnnel_v16i8: 908; AVX512F: # %bb.0: 909; AVX512F-NEXT: vpbroadcastb {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] 910; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm4 911; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero,xmm4[4],zero,zero,zero,xmm4[5],zero,zero,zero,xmm4[6],zero,zero,zero,xmm4[7],zero,zero,zero,xmm4[8],zero,zero,zero,xmm4[9],zero,zero,zero,xmm4[10],zero,zero,zero,xmm4[11],zero,zero,zero,xmm4[12],zero,zero,zero,xmm4[13],zero,zero,zero,xmm4[14],zero,zero,zero,xmm4[15],zero,zero,zero 912; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero 913; AVX512F-NEXT: vpsrlvd %zmm4, %zmm1, %zmm1 914; AVX512F-NEXT: vpandn %xmm3, %xmm2, %xmm2 915; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero 916; AVX512F-NEXT: vpaddb %xmm0, %xmm0, %xmm0 917; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero 918; AVX512F-NEXT: vpsllvd %zmm2, %zmm0, %zmm0 919; AVX512F-NEXT: vpord %zmm1, %zmm0, %zmm0 920; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 921; AVX512F-NEXT: vzeroupper 922; AVX512F-NEXT: retq 923; 924; AVX512VL-LABEL: var_funnnel_v16i8: 925; AVX512VL: # %bb.0: 926; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] 927; AVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm4 928; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero,xmm4[4],zero,zero,zero,xmm4[5],zero,zero,zero,xmm4[6],zero,zero,zero,xmm4[7],zero,zero,zero,xmm4[8],zero,zero,zero,xmm4[9],zero,zero,zero,xmm4[10],zero,zero,zero,xmm4[11],zero,zero,zero,xmm4[12],zero,zero,zero,xmm4[13],zero,zero,zero,xmm4[14],zero,zero,zero,xmm4[15],zero,zero,zero 929; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm1 = 
xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero 930; AVX512VL-NEXT: vpsrlvd %zmm4, %zmm1, %zmm1 931; AVX512VL-NEXT: vpandn %xmm3, %xmm2, %xmm2 932; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero 933; AVX512VL-NEXT: vpaddb %xmm0, %xmm0, %xmm0 934; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero 935; AVX512VL-NEXT: vpsllvd %zmm2, %zmm0, %zmm0 936; AVX512VL-NEXT: vpord %zmm1, %zmm0, %zmm0 937; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0 938; AVX512VL-NEXT: vzeroupper 939; AVX512VL-NEXT: retq 940; 941; AVX512BW-LABEL: var_funnnel_v16i8: 942; AVX512BW: # %bb.0: 943; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 944; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero 945; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero 946; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero 947; AVX512BW-NEXT: vpsllw $8, %ymm0, %ymm0 948; AVX512BW-NEXT: vpor %ymm1, %ymm0, %ymm0 949; AVX512BW-NEXT: vpsrlvw %zmm2, %zmm0, %zmm0 950; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 951; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 952; AVX512BW-NEXT: vzeroupper 953; AVX512BW-NEXT: retq 954; 955; AVX512VBMI2-LABEL: var_funnnel_v16i8: 956; AVX512VBMI2: # %bb.0: 957; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 958; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 959; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,64,1,65,2,66,3,67,4,68,5,69,6,70,7,71,8,72,9,73,10,74,11,75,12,76,13,77,14,78,15,79] 960; AVX512VBMI2-NEXT: vpermt2b %zmm0, %zmm3, %zmm1 961; AVX512VBMI2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm0 962; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero 963; AVX512VBMI2-NEXT: vpsrlvw %zmm0, %zmm1, %zmm0 964; 
AVX512VBMI2-NEXT: vpmovwb %zmm0, %ymm0 965; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 966; AVX512VBMI2-NEXT: vzeroupper 967; AVX512VBMI2-NEXT: retq 968; 969; AVX512VLBW-LABEL: var_funnnel_v16i8: 970; AVX512VLBW: # %bb.0: 971; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero 972; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero 973; AVX512VLBW-NEXT: vpsllw $8, %ymm0, %ymm0 974; AVX512VLBW-NEXT: vpor %ymm1, %ymm0, %ymm0 975; AVX512VLBW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm2, %xmm1 976; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero 977; AVX512VLBW-NEXT: vpsrlvw %ymm1, %ymm0, %ymm0 978; AVX512VLBW-NEXT: vpmovwb %ymm0, %xmm0 979; AVX512VLBW-NEXT: vzeroupper 980; AVX512VLBW-NEXT: retq 981; 982; AVX512VLVBMI2-LABEL: var_funnnel_v16i8: 983; AVX512VLVBMI2: # %bb.0: 984; AVX512VLVBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1 985; AVX512VLVBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 986; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,32,1,33,2,34,3,35,4,36,5,37,6,38,7,39,8,40,9,41,10,42,11,43,12,44,13,45,14,46,15,47] 987; AVX512VLVBMI2-NEXT: vpermi2b %ymm0, %ymm1, %ymm3 988; AVX512VLVBMI2-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm2, %xmm0 989; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero 990; AVX512VLVBMI2-NEXT: vpsrlvw %ymm0, %ymm3, %ymm0 991; AVX512VLVBMI2-NEXT: vpmovwb %ymm0, %xmm0 992; AVX512VLVBMI2-NEXT: vzeroupper 993; AVX512VLVBMI2-NEXT: retq 994; 995; XOPAVX1-LABEL: var_funnnel_v16i8: 996; XOPAVX1: # %bb.0: 997; XOPAVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] 998; XOPAVX1-NEXT: vpandn %xmm3, %xmm2, %xmm4 999; XOPAVX1-NEXT: vpaddb %xmm0, %xmm0, %xmm0 1000; XOPAVX1-NEXT: vpshlb %xmm4, %xmm0, %xmm0 1001; XOPAVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 1002; XOPAVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 1003; XOPAVX1-NEXT: vpsubb %xmm2, %xmm3, %xmm2 1004; XOPAVX1-NEXT: vpshlb %xmm2, %xmm1, %xmm1 1005; XOPAVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 1006; XOPAVX1-NEXT: retq 1007; 1008; XOPAVX2-LABEL: var_funnnel_v16i8: 1009; XOPAVX2: # %bb.0: 1010; XOPAVX2-NEXT: vpbroadcastb {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] 1011; XOPAVX2-NEXT: vpandn %xmm3, %xmm2, %xmm4 1012; XOPAVX2-NEXT: vpaddb %xmm0, %xmm0, %xmm0 1013; XOPAVX2-NEXT: vpshlb %xmm4, %xmm0, %xmm0 1014; XOPAVX2-NEXT: vpand %xmm3, %xmm2, %xmm2 1015; XOPAVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 1016; XOPAVX2-NEXT: vpsubb %xmm2, %xmm3, %xmm2 1017; XOPAVX2-NEXT: vpshlb %xmm2, %xmm1, %xmm1 1018; XOPAVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 1019; XOPAVX2-NEXT: retq 1020; 1021; X86-SSE2-LABEL: var_funnnel_v16i8: 1022; X86-SSE2: # %bb.0: 1023; X86-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] 1024; X86-SSE2-NEXT: movdqa %xmm2, %xmm6 
1025; X86-SSE2-NEXT: pand %xmm5, %xmm6 1026; X86-SSE2-NEXT: psllw $5, %xmm6 1027; X86-SSE2-NEXT: pxor %xmm4, %xmm4 1028; X86-SSE2-NEXT: pxor %xmm3, %xmm3 1029; X86-SSE2-NEXT: pcmpgtb %xmm6, %xmm3 1030; X86-SSE2-NEXT: movdqa %xmm3, %xmm7 1031; X86-SSE2-NEXT: pandn %xmm1, %xmm7 1032; X86-SSE2-NEXT: psrlw $4, %xmm1 1033; X86-SSE2-NEXT: pand %xmm1, %xmm3 1034; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm3 1035; X86-SSE2-NEXT: por %xmm7, %xmm3 1036; X86-SSE2-NEXT: paddb %xmm6, %xmm6 1037; X86-SSE2-NEXT: pxor %xmm1, %xmm1 1038; X86-SSE2-NEXT: pcmpgtb %xmm6, %xmm1 1039; X86-SSE2-NEXT: movdqa %xmm1, %xmm7 1040; X86-SSE2-NEXT: pandn %xmm3, %xmm7 1041; X86-SSE2-NEXT: psrlw $2, %xmm3 1042; X86-SSE2-NEXT: pand %xmm1, %xmm3 1043; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm3 1044; X86-SSE2-NEXT: por %xmm7, %xmm3 1045; X86-SSE2-NEXT: paddb %xmm6, %xmm6 1046; X86-SSE2-NEXT: pxor %xmm1, %xmm1 1047; X86-SSE2-NEXT: pcmpgtb %xmm6, %xmm1 1048; X86-SSE2-NEXT: movdqa %xmm1, %xmm6 1049; X86-SSE2-NEXT: pandn %xmm3, %xmm6 1050; X86-SSE2-NEXT: psrlw $1, %xmm3 1051; X86-SSE2-NEXT: pand %xmm1, %xmm3 1052; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm3 1053; X86-SSE2-NEXT: por %xmm6, %xmm3 1054; X86-SSE2-NEXT: pandn %xmm5, %xmm2 1055; X86-SSE2-NEXT: psllw $5, %xmm2 1056; X86-SSE2-NEXT: pxor %xmm1, %xmm1 1057; X86-SSE2-NEXT: pcmpgtb %xmm2, %xmm1 1058; X86-SSE2-NEXT: paddb %xmm0, %xmm0 1059; X86-SSE2-NEXT: movdqa %xmm1, %xmm5 1060; X86-SSE2-NEXT: pandn %xmm0, %xmm5 1061; X86-SSE2-NEXT: psllw $4, %xmm0 1062; X86-SSE2-NEXT: pand %xmm1, %xmm0 1063; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 1064; X86-SSE2-NEXT: por %xmm5, %xmm0 1065; X86-SSE2-NEXT: paddb %xmm2, %xmm2 1066; X86-SSE2-NEXT: pxor %xmm1, %xmm1 1067; X86-SSE2-NEXT: pcmpgtb %xmm2, %xmm1 1068; X86-SSE2-NEXT: movdqa %xmm1, %xmm5 1069; X86-SSE2-NEXT: pandn %xmm0, %xmm5 1070; X86-SSE2-NEXT: psllw $2, %xmm0 1071; X86-SSE2-NEXT: pand %xmm1, %xmm0 1072; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 1073; X86-SSE2-NEXT: por %xmm5, %xmm0 1074; X86-SSE2-NEXT: paddb %xmm2, %xmm2 1075; X86-SSE2-NEXT: pcmpgtb %xmm2, %xmm4 1076; X86-SSE2-NEXT: movdqa %xmm4, %xmm1 1077; X86-SSE2-NEXT: pandn %xmm0, %xmm1 1078; X86-SSE2-NEXT: paddb %xmm0, %xmm0 1079; X86-SSE2-NEXT: pand %xmm4, %xmm0 1080; X86-SSE2-NEXT: por %xmm1, %xmm0 1081; X86-SSE2-NEXT: por %xmm3, %xmm0 1082; X86-SSE2-NEXT: retl 1083 %res = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %amt) 1084 ret <16 x i8> %res 1085} 1086 1087; 1088; Uniform Variable Shifts 1089; 1090 1091define <2 x i64> @splatvar_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt) nounwind { 1092; SSE2-LABEL: splatvar_funnnel_v2i64: 1093; SSE2: # %bb.0: 1094; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [63,63] 1095; SSE2-NEXT: movdqa %xmm2, %xmm4 1096; SSE2-NEXT: pand %xmm3, %xmm4 1097; SSE2-NEXT: psrlq %xmm4, %xmm1 1098; SSE2-NEXT: pandn %xmm3, %xmm2 1099; SSE2-NEXT: paddq %xmm0, %xmm0 1100; SSE2-NEXT: psllq %xmm2, %xmm0 1101; SSE2-NEXT: por %xmm1, %xmm0 1102; SSE2-NEXT: retq 1103; 1104; SSE41-LABEL: splatvar_funnnel_v2i64: 1105; SSE41: # %bb.0: 1106; SSE41-NEXT: pmovsxbq {{.*#+}} xmm3 = [63,63] 1107; SSE41-NEXT: movdqa %xmm2, %xmm4 1108; SSE41-NEXT: pand %xmm3, %xmm4 1109; SSE41-NEXT: psrlq %xmm4, %xmm1 1110; SSE41-NEXT: pandn %xmm3, %xmm2 1111; SSE41-NEXT: paddq %xmm0, %xmm0 1112; SSE41-NEXT: psllq %xmm2, %xmm0 1113; SSE41-NEXT: por %xmm1, %xmm0 1114; SSE41-NEXT: retq 1115; 1116; AVX-LABEL: splatvar_funnnel_v2i64: 1117; AVX: # %bb.0: 1118; AVX-NEXT: vpmovsxbq {{.*#+}} xmm3 = [63,63] 1119; AVX-NEXT: 
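; In the splatvar tests of this "Uniform Variable Shifts" section the shift
; amount is a splat, so a single scalar count kept in the low element feeds the
; whole-vector shifts (psrlq/psllq and friends) instead of the per-element
; variable shifts used above. For v4i32 the inputs are first interleaved into
; 64-bit lanes (punpckhdq/punpckldq) so that a single 64-bit psrlq performs the
; concatenate-and-shift for each pair of elements directly.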
vpand %xmm3, %xmm2, %xmm4 1120; AVX-NEXT: vpsrlq %xmm4, %xmm1, %xmm1 1121; AVX-NEXT: vpandn %xmm3, %xmm2, %xmm2 1122; AVX-NEXT: vpaddq %xmm0, %xmm0, %xmm0 1123; AVX-NEXT: vpsllq %xmm2, %xmm0, %xmm0 1124; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 1125; AVX-NEXT: retq 1126; 1127; AVX512F-LABEL: splatvar_funnnel_v2i64: 1128; AVX512F: # %bb.0: 1129; AVX512F-NEXT: vpmovsxbq {{.*#+}} xmm3 = [63,63] 1130; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm4 1131; AVX512F-NEXT: vpsrlq %xmm4, %xmm1, %xmm1 1132; AVX512F-NEXT: vpandn %xmm3, %xmm2, %xmm2 1133; AVX512F-NEXT: vpaddq %xmm0, %xmm0, %xmm0 1134; AVX512F-NEXT: vpsllq %xmm2, %xmm0, %xmm0 1135; AVX512F-NEXT: vpor %xmm1, %xmm0, %xmm0 1136; AVX512F-NEXT: retq 1137; 1138; AVX512VL-LABEL: splatvar_funnnel_v2i64: 1139; AVX512VL: # %bb.0: 1140; AVX512VL-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63] 1141; AVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm4 1142; AVX512VL-NEXT: vpsrlq %xmm4, %xmm1, %xmm1 1143; AVX512VL-NEXT: vpandn %xmm3, %xmm2, %xmm2 1144; AVX512VL-NEXT: vpaddq %xmm0, %xmm0, %xmm0 1145; AVX512VL-NEXT: vpsllq %xmm2, %xmm0, %xmm0 1146; AVX512VL-NEXT: vpor %xmm1, %xmm0, %xmm0 1147; AVX512VL-NEXT: retq 1148; 1149; AVX512BW-LABEL: splatvar_funnnel_v2i64: 1150; AVX512BW: # %bb.0: 1151; AVX512BW-NEXT: vpmovsxbq {{.*#+}} xmm3 = [63,63] 1152; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm4 1153; AVX512BW-NEXT: vpsrlq %xmm4, %xmm1, %xmm1 1154; AVX512BW-NEXT: vpandn %xmm3, %xmm2, %xmm2 1155; AVX512BW-NEXT: vpaddq %xmm0, %xmm0, %xmm0 1156; AVX512BW-NEXT: vpsllq %xmm2, %xmm0, %xmm0 1157; AVX512BW-NEXT: vpor %xmm1, %xmm0, %xmm0 1158; AVX512BW-NEXT: retq 1159; 1160; AVX512VBMI2-LABEL: splatvar_funnnel_v2i64: 1161; AVX512VBMI2: # %bb.0: 1162; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 1163; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 1164; AVX512VBMI2-NEXT: vpbroadcastq %xmm2, %xmm2 1165; AVX512VBMI2-NEXT: vpshrdvq %zmm2, %zmm0, %zmm1 1166; AVX512VBMI2-NEXT: vmovdqa %xmm1, %xmm0 1167; AVX512VBMI2-NEXT: vzeroupper 1168; AVX512VBMI2-NEXT: retq 1169; 1170; AVX512VLBW-LABEL: splatvar_funnnel_v2i64: 1171; AVX512VLBW: # %bb.0: 1172; AVX512VLBW-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63] 1173; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm4 1174; AVX512VLBW-NEXT: vpsrlq %xmm4, %xmm1, %xmm1 1175; AVX512VLBW-NEXT: vpandn %xmm3, %xmm2, %xmm2 1176; AVX512VLBW-NEXT: vpaddq %xmm0, %xmm0, %xmm0 1177; AVX512VLBW-NEXT: vpsllq %xmm2, %xmm0, %xmm0 1178; AVX512VLBW-NEXT: vpor %xmm1, %xmm0, %xmm0 1179; AVX512VLBW-NEXT: retq 1180; 1181; AVX512VLVBMI2-LABEL: splatvar_funnnel_v2i64: 1182; AVX512VLVBMI2: # %bb.0: 1183; AVX512VLVBMI2-NEXT: vpbroadcastq %xmm2, %xmm2 1184; AVX512VLVBMI2-NEXT: vpshrdvq %xmm2, %xmm0, %xmm1 1185; AVX512VLVBMI2-NEXT: vmovdqa %xmm1, %xmm0 1186; AVX512VLVBMI2-NEXT: retq 1187; 1188; XOP-LABEL: splatvar_funnnel_v2i64: 1189; XOP: # %bb.0: 1190; XOP-NEXT: vpmovsxbq {{.*#+}} xmm3 = [63,63] 1191; XOP-NEXT: vpand %xmm3, %xmm2, %xmm4 1192; XOP-NEXT: vpsrlq %xmm4, %xmm1, %xmm1 1193; XOP-NEXT: vpandn %xmm3, %xmm2, %xmm2 1194; XOP-NEXT: vpaddq %xmm0, %xmm0, %xmm0 1195; XOP-NEXT: vpsllq %xmm2, %xmm0, %xmm0 1196; XOP-NEXT: vpor %xmm1, %xmm0, %xmm0 1197; XOP-NEXT: retq 1198; 1199; X86-SSE2-LABEL: splatvar_funnnel_v2i64: 1200; X86-SSE2: # %bb.0: 1201; X86-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [63,0,63,0] 1202; X86-SSE2-NEXT: movdqa %xmm2, %xmm4 1203; X86-SSE2-NEXT: pand %xmm3, %xmm4 1204; X86-SSE2-NEXT: psrlq %xmm4, %xmm1 1205; X86-SSE2-NEXT: pandn %xmm3, %xmm2 1206; X86-SSE2-NEXT: paddq %xmm0, %xmm0 1207; X86-SSE2-NEXT: psllq %xmm2, %xmm0 1208; X86-SSE2-NEXT: por %xmm1, %xmm0 1209; 
X86-SSE2-NEXT: retl 1210 %splat = shufflevector <2 x i64> %amt, <2 x i64> undef, <2 x i32> zeroinitializer 1211 %res = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %splat) 1212 ret <2 x i64> %res 1213} 1214 1215define <4 x i32> @splatvar_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %amt) nounwind { 1216; SSE-LABEL: splatvar_funnnel_v4i32: 1217; SSE: # %bb.0: 1218; SSE-NEXT: movdqa %xmm1, %xmm3 1219; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm0[2],xmm3[3],xmm0[3] 1220; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 1221; SSE-NEXT: psrlq %xmm2, %xmm3 1222; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 1223; SSE-NEXT: psrlq %xmm2, %xmm1 1224; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm3[0,2] 1225; SSE-NEXT: movaps %xmm1, %xmm0 1226; SSE-NEXT: retq 1227; 1228; AVX-LABEL: splatvar_funnnel_v4i32: 1229; AVX: # %bb.0: 1230; AVX-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] 1231; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 1232; AVX-NEXT: vpsrlq %xmm2, %xmm3, %xmm3 1233; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 1234; AVX-NEXT: vpsrlq %xmm2, %xmm0, %xmm0 1235; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm3[0,2] 1236; AVX-NEXT: retq 1237; 1238; AVX512F-LABEL: splatvar_funnnel_v4i32: 1239; AVX512F: # %bb.0: 1240; AVX512F-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] 1241; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 1242; AVX512F-NEXT: vpsrlq %xmm2, %xmm3, %xmm3 1243; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 1244; AVX512F-NEXT: vpsrlq %xmm2, %xmm0, %xmm0 1245; AVX512F-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm3[0,2] 1246; AVX512F-NEXT: retq 1247; 1248; AVX512VL-LABEL: splatvar_funnnel_v4i32: 1249; AVX512VL: # %bb.0: 1250; AVX512VL-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] 1251; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 1252; AVX512VL-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0 1253; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm1 1254; AVX512VL-NEXT: vpsrlq %xmm1, %ymm0, %ymm0 1255; AVX512VL-NEXT: vpmovqd %ymm0, %xmm0 1256; AVX512VL-NEXT: vzeroupper 1257; AVX512VL-NEXT: retq 1258; 1259; AVX512BW-LABEL: splatvar_funnnel_v4i32: 1260; AVX512BW: # %bb.0: 1261; AVX512BW-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] 1262; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 1263; AVX512BW-NEXT: vpsrlq %xmm2, %xmm3, %xmm3 1264; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 1265; AVX512BW-NEXT: vpsrlq %xmm2, %xmm0, %xmm0 1266; AVX512BW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm3[0,2] 1267; AVX512BW-NEXT: retq 1268; 1269; AVX512VBMI2-LABEL: splatvar_funnnel_v4i32: 1270; AVX512VBMI2: # %bb.0: 1271; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 1272; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 1273; AVX512VBMI2-NEXT: vpbroadcastd %xmm2, %xmm2 1274; AVX512VBMI2-NEXT: vpshrdvd %zmm2, %zmm0, %zmm1 1275; AVX512VBMI2-NEXT: vmovdqa %xmm1, %xmm0 1276; AVX512VBMI2-NEXT: vzeroupper 1277; AVX512VBMI2-NEXT: retq 1278; 1279; AVX512VLBW-LABEL: splatvar_funnnel_v4i32: 1280; AVX512VLBW: # %bb.0: 1281; AVX512VLBW-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] 1282; AVX512VLBW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 1283; AVX512VLBW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0 1284; AVX512VLBW-NEXT: vpand 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm1 1285; AVX512VLBW-NEXT: vpsrlq %xmm1, %ymm0, %ymm0 1286; AVX512VLBW-NEXT: vpmovqd %ymm0, %xmm0 1287; AVX512VLBW-NEXT: vzeroupper 1288; AVX512VLBW-NEXT: retq 1289; 1290; AVX512VLVBMI2-LABEL: splatvar_funnnel_v4i32: 1291; AVX512VLVBMI2: # %bb.0: 1292; AVX512VLVBMI2-NEXT: vpbroadcastd %xmm2, %xmm2 1293; AVX512VLVBMI2-NEXT: vpshrdvd %xmm2, %xmm0, %xmm1 1294; AVX512VLVBMI2-NEXT: vmovdqa %xmm1, %xmm0 1295; AVX512VLVBMI2-NEXT: retq 1296; 1297; XOP-LABEL: splatvar_funnnel_v4i32: 1298; XOP: # %bb.0: 1299; XOP-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] 1300; XOP-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 1301; XOP-NEXT: vpsrlq %xmm2, %xmm3, %xmm3 1302; XOP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 1303; XOP-NEXT: vpsrlq %xmm2, %xmm0, %xmm0 1304; XOP-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm3[0,2] 1305; XOP-NEXT: retq 1306; 1307; X86-SSE2-LABEL: splatvar_funnnel_v4i32: 1308; X86-SSE2: # %bb.0: 1309; X86-SSE2-NEXT: movdqa %xmm1, %xmm3 1310; X86-SSE2-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm0[2],xmm3[3],xmm0[3] 1311; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2 1312; X86-SSE2-NEXT: psrlq %xmm2, %xmm3 1313; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 1314; X86-SSE2-NEXT: psrlq %xmm2, %xmm1 1315; X86-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm3[0,2] 1316; X86-SSE2-NEXT: movaps %xmm1, %xmm0 1317; X86-SSE2-NEXT: retl 1318 %splat = shufflevector <4 x i32> %amt, <4 x i32> undef, <4 x i32> zeroinitializer 1319 %res = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %splat) 1320 ret <4 x i32> %res 1321} 1322 1323define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %amt) nounwind { 1324; SSE2-LABEL: splatvar_funnnel_v8i16: 1325; SSE2: # %bb.0: 1326; SSE2-NEXT: movd {{.*#+}} xmm3 = [15,0,0,0] 1327; SSE2-NEXT: movdqa %xmm2, %xmm4 1328; SSE2-NEXT: pand %xmm3, %xmm4 1329; SSE2-NEXT: psrlw %xmm4, %xmm1 1330; SSE2-NEXT: pandn %xmm3, %xmm2 1331; SSE2-NEXT: paddw %xmm0, %xmm0 1332; SSE2-NEXT: psllw %xmm2, %xmm0 1333; SSE2-NEXT: por %xmm1, %xmm0 1334; SSE2-NEXT: retq 1335; 1336; SSE41-LABEL: splatvar_funnnel_v8i16: 1337; SSE41: # %bb.0: 1338; SSE41-NEXT: pmovsxbq {{.*#+}} xmm3 = [15,0] 1339; SSE41-NEXT: movdqa %xmm2, %xmm4 1340; SSE41-NEXT: pand %xmm3, %xmm4 1341; SSE41-NEXT: psrlw %xmm4, %xmm1 1342; SSE41-NEXT: pandn %xmm3, %xmm2 1343; SSE41-NEXT: paddw %xmm0, %xmm0 1344; SSE41-NEXT: psllw %xmm2, %xmm0 1345; SSE41-NEXT: por %xmm1, %xmm0 1346; SSE41-NEXT: retq 1347; 1348; AVX-LABEL: splatvar_funnnel_v8i16: 1349; AVX: # %bb.0: 1350; AVX-NEXT: vpmovsxbq {{.*#+}} xmm3 = [15,0] 1351; AVX-NEXT: vpand %xmm3, %xmm2, %xmm4 1352; AVX-NEXT: vpsrlw %xmm4, %xmm1, %xmm1 1353; AVX-NEXT: vpandn %xmm3, %xmm2, %xmm2 1354; AVX-NEXT: vpaddw %xmm0, %xmm0, %xmm0 1355; AVX-NEXT: vpsllw %xmm2, %xmm0, %xmm0 1356; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 1357; AVX-NEXT: retq 1358; 1359; AVX512F-LABEL: splatvar_funnnel_v8i16: 1360; AVX512F: # %bb.0: 1361; AVX512F-NEXT: vpmovsxbq {{.*#+}} xmm3 = [15,0] 1362; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm4 1363; AVX512F-NEXT: vpsrlw %xmm4, %xmm1, %xmm1 1364; AVX512F-NEXT: vpandn %xmm3, %xmm2, %xmm2 1365; AVX512F-NEXT: vpaddw %xmm0, %xmm0, %xmm0 1366; AVX512F-NEXT: vpsllw %xmm2, %xmm0, %xmm0 1367; AVX512F-NEXT: vpor %xmm1, %xmm0, %xmm0 1368; AVX512F-NEXT: retq 1369; 1370; AVX512VL-LABEL: splatvar_funnnel_v8i16: 1371; AVX512VL: # %bb.0: 1372; AVX512VL-NEXT: vpmovsxbq {{.*#+}} xmm3 = [15,0] 1373; AVX512VL-NEXT: vpand %xmm3, 
%xmm2, %xmm4 1374; AVX512VL-NEXT: vpsrlw %xmm4, %xmm1, %xmm1 1375; AVX512VL-NEXT: vpandn %xmm3, %xmm2, %xmm2 1376; AVX512VL-NEXT: vpaddw %xmm0, %xmm0, %xmm0 1377; AVX512VL-NEXT: vpsllw %xmm2, %xmm0, %xmm0 1378; AVX512VL-NEXT: vpor %xmm1, %xmm0, %xmm0 1379; AVX512VL-NEXT: retq 1380; 1381; AVX512BW-LABEL: splatvar_funnnel_v8i16: 1382; AVX512BW: # %bb.0: 1383; AVX512BW-NEXT: vpmovsxbq {{.*#+}} xmm3 = [15,0] 1384; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm4 1385; AVX512BW-NEXT: vpsrlw %xmm4, %xmm1, %xmm1 1386; AVX512BW-NEXT: vpandn %xmm3, %xmm2, %xmm2 1387; AVX512BW-NEXT: vpaddw %xmm0, %xmm0, %xmm0 1388; AVX512BW-NEXT: vpsllw %xmm2, %xmm0, %xmm0 1389; AVX512BW-NEXT: vpor %xmm1, %xmm0, %xmm0 1390; AVX512BW-NEXT: retq 1391; 1392; AVX512VBMI2-LABEL: splatvar_funnnel_v8i16: 1393; AVX512VBMI2: # %bb.0: 1394; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 1395; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 1396; AVX512VBMI2-NEXT: vpbroadcastw %xmm2, %xmm2 1397; AVX512VBMI2-NEXT: vpshrdvw %zmm2, %zmm0, %zmm1 1398; AVX512VBMI2-NEXT: vmovdqa %xmm1, %xmm0 1399; AVX512VBMI2-NEXT: vzeroupper 1400; AVX512VBMI2-NEXT: retq 1401; 1402; AVX512VLBW-LABEL: splatvar_funnnel_v8i16: 1403; AVX512VLBW: # %bb.0: 1404; AVX512VLBW-NEXT: vpmovsxbq {{.*#+}} xmm3 = [15,0] 1405; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm4 1406; AVX512VLBW-NEXT: vpsrlw %xmm4, %xmm1, %xmm1 1407; AVX512VLBW-NEXT: vpandn %xmm3, %xmm2, %xmm2 1408; AVX512VLBW-NEXT: vpaddw %xmm0, %xmm0, %xmm0 1409; AVX512VLBW-NEXT: vpsllw %xmm2, %xmm0, %xmm0 1410; AVX512VLBW-NEXT: vpor %xmm1, %xmm0, %xmm0 1411; AVX512VLBW-NEXT: retq 1412; 1413; AVX512VLVBMI2-LABEL: splatvar_funnnel_v8i16: 1414; AVX512VLVBMI2: # %bb.0: 1415; AVX512VLVBMI2-NEXT: vpbroadcastw %xmm2, %xmm2 1416; AVX512VLVBMI2-NEXT: vpshrdvw %xmm2, %xmm0, %xmm1 1417; AVX512VLVBMI2-NEXT: vmovdqa %xmm1, %xmm0 1418; AVX512VLVBMI2-NEXT: retq 1419; 1420; XOP-LABEL: splatvar_funnnel_v8i16: 1421; XOP: # %bb.0: 1422; XOP-NEXT: vpmovsxbq {{.*#+}} xmm3 = [15,0] 1423; XOP-NEXT: vpand %xmm3, %xmm2, %xmm4 1424; XOP-NEXT: vpsrlw %xmm4, %xmm1, %xmm1 1425; XOP-NEXT: vpandn %xmm3, %xmm2, %xmm2 1426; XOP-NEXT: vpaddw %xmm0, %xmm0, %xmm0 1427; XOP-NEXT: vpsllw %xmm2, %xmm0, %xmm0 1428; XOP-NEXT: vpor %xmm1, %xmm0, %xmm0 1429; XOP-NEXT: retq 1430; 1431; X86-SSE2-LABEL: splatvar_funnnel_v8i16: 1432; X86-SSE2: # %bb.0: 1433; X86-SSE2-NEXT: movd {{.*#+}} xmm3 = [15,0,0,0] 1434; X86-SSE2-NEXT: movdqa %xmm2, %xmm4 1435; X86-SSE2-NEXT: pand %xmm3, %xmm4 1436; X86-SSE2-NEXT: psrlw %xmm4, %xmm1 1437; X86-SSE2-NEXT: pandn %xmm3, %xmm2 1438; X86-SSE2-NEXT: paddw %xmm0, %xmm0 1439; X86-SSE2-NEXT: psllw %xmm2, %xmm0 1440; X86-SSE2-NEXT: por %xmm1, %xmm0 1441; X86-SSE2-NEXT: retl 1442 %splat = shufflevector <8 x i16> %amt, <8 x i16> undef, <8 x i32> zeroinitializer 1443 %res = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %splat) 1444 ret <8 x i16> %res 1445} 1446 1447define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %amt) nounwind { 1448; SSE2-LABEL: splatvar_funnnel_v16i8: 1449; SSE2: # %bb.0: 1450; SSE2-NEXT: movdqa %xmm1, %xmm4 1451; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15] 1452; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 1453; SSE2-NEXT: psrlw %xmm2, %xmm4 1454; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] 1455; SSE2-NEXT: pand %xmm3, %xmm4 1456; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = 
xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] 1457; SSE2-NEXT: psrlw %xmm2, %xmm1 1458; SSE2-NEXT: pand %xmm1, %xmm3 1459; SSE2-NEXT: packuswb %xmm4, %xmm3 1460; SSE2-NEXT: movdqa %xmm3, %xmm0 1461; SSE2-NEXT: retq 1462; 1463; SSE41-LABEL: splatvar_funnnel_v16i8: 1464; SSE41: # %bb.0: 1465; SSE41-NEXT: movdqa %xmm1, %xmm4 1466; SSE41-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15] 1467; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 1468; SSE41-NEXT: psrlw %xmm2, %xmm4 1469; SSE41-NEXT: pmovzxbw {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] 1470; SSE41-NEXT: pand %xmm3, %xmm4 1471; SSE41-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] 1472; SSE41-NEXT: psrlw %xmm2, %xmm1 1473; SSE41-NEXT: pand %xmm1, %xmm3 1474; SSE41-NEXT: packuswb %xmm4, %xmm3 1475; SSE41-NEXT: movdqa %xmm3, %xmm0 1476; SSE41-NEXT: retq 1477; 1478; AVX1-LABEL: splatvar_funnnel_v16i8: 1479; AVX1: # %bb.0: 1480; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] 1481; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 1482; AVX1-NEXT: vpsrlw %xmm2, %xmm3, %xmm3 1483; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] 1484; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3 1485; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] 1486; AVX1-NEXT: vpsrlw %xmm2, %xmm0, %xmm0 1487; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0 1488; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0 1489; AVX1-NEXT: retq 1490; 1491; AVX2-LABEL: splatvar_funnnel_v16i8: 1492; AVX2: # %bb.0: 1493; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] 1494; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 1495; AVX2-NEXT: vpsrlw %xmm2, %xmm3, %xmm3 1496; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] 1497; AVX2-NEXT: vpand %xmm4, %xmm3, %xmm3 1498; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] 1499; AVX2-NEXT: vpsrlw %xmm2, %xmm0, %xmm0 1500; AVX2-NEXT: vpand %xmm4, %xmm0, %xmm0 1501; AVX2-NEXT: vpackuswb %xmm3, %xmm0, %xmm0 1502; AVX2-NEXT: retq 1503; 1504; AVX512F-LABEL: splatvar_funnnel_v16i8: 1505; AVX512F: # %bb.0: 1506; AVX512F-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] 1507; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 1508; AVX512F-NEXT: vpsrlw %xmm2, %xmm3, %xmm3 1509; AVX512F-NEXT: vpbroadcastw {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] 1510; AVX512F-NEXT: vpand %xmm4, %xmm3, %xmm3 1511; AVX512F-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] 1512; AVX512F-NEXT: vpsrlw %xmm2, %xmm0, %xmm0 1513; AVX512F-NEXT: vpand %xmm4, %xmm0, 
%xmm0 1514; AVX512F-NEXT: vpackuswb %xmm3, %xmm0, %xmm0 1515; AVX512F-NEXT: retq 1516; 1517; AVX512VL-LABEL: splatvar_funnnel_v16i8: 1518; AVX512VL: # %bb.0: 1519; AVX512VL-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] 1520; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 1521; AVX512VL-NEXT: vpsrlw %xmm2, %xmm3, %xmm3 1522; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] 1523; AVX512VL-NEXT: vpand %xmm4, %xmm3, %xmm3 1524; AVX512VL-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] 1525; AVX512VL-NEXT: vpsrlw %xmm2, %xmm0, %xmm0 1526; AVX512VL-NEXT: vpand %xmm4, %xmm0, %xmm0 1527; AVX512VL-NEXT: vpackuswb %xmm3, %xmm0, %xmm0 1528; AVX512VL-NEXT: retq 1529; 1530; AVX512BW-LABEL: splatvar_funnnel_v16i8: 1531; AVX512BW: # %bb.0: 1532; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] 1533; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 1534; AVX512BW-NEXT: vpsrlw %xmm2, %xmm3, %xmm3 1535; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] 1536; AVX512BW-NEXT: vpand %xmm4, %xmm3, %xmm3 1537; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] 1538; AVX512BW-NEXT: vpsrlw %xmm2, %xmm0, %xmm0 1539; AVX512BW-NEXT: vpand %xmm4, %xmm0, %xmm0 1540; AVX512BW-NEXT: vpackuswb %xmm3, %xmm0, %xmm0 1541; AVX512BW-NEXT: retq 1542; 1543; AVX512VBMI2-LABEL: splatvar_funnnel_v16i8: 1544; AVX512VBMI2: # %bb.0: 1545; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,2,4,6,8,10,12,14,64,66,68,70,72,74,76,78] 1546; AVX512VBMI2-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] 1547; AVX512VBMI2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 1548; AVX512VBMI2-NEXT: vpsrlw %xmm2, %xmm4, %xmm4 1549; AVX512VBMI2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] 1550; AVX512VBMI2-NEXT: vpsrlw %xmm2, %xmm0, %xmm0 1551; AVX512VBMI2-NEXT: vpermt2b %zmm4, %zmm3, %zmm0 1552; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 1553; AVX512VBMI2-NEXT: vzeroupper 1554; AVX512VBMI2-NEXT: retq 1555; 1556; AVX512VLBW-LABEL: splatvar_funnnel_v16i8: 1557; AVX512VLBW: # %bb.0: 1558; AVX512VLBW-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] 1559; AVX512VLBW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] 1560; AVX512VLBW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0 1561; AVX512VLBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm1 1562; AVX512VLBW-NEXT: vpsrlw %xmm1, %ymm0, %ymm0 1563; AVX512VLBW-NEXT: vpmovwb %ymm0, %xmm0 1564; AVX512VLBW-NEXT: vzeroupper 1565; AVX512VLBW-NEXT: retq 1566; 1567; AVX512VLVBMI2-LABEL: splatvar_funnnel_v16i8: 1568; AVX512VLVBMI2: # %bb.0: 1569; 
AVX512VLVBMI2-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] 1570; AVX512VLVBMI2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] 1571; AVX512VLVBMI2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0 1572; AVX512VLVBMI2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm1 1573; AVX512VLVBMI2-NEXT: vpsrlw %xmm1, %ymm0, %ymm0 1574; AVX512VLVBMI2-NEXT: vpmovwb %ymm0, %xmm0 1575; AVX512VLVBMI2-NEXT: vzeroupper 1576; AVX512VLVBMI2-NEXT: retq 1577; 1578; XOP-LABEL: splatvar_funnnel_v16i8: 1579; XOP: # %bb.0: 1580; XOP-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] 1581; XOP-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 1582; XOP-NEXT: vpsrlw %xmm2, %xmm3, %xmm3 1583; XOP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] 1584; XOP-NEXT: vpsrlw %xmm2, %xmm0, %xmm0 1585; XOP-NEXT: vpperm {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],xmm3[0,2,4,6,8,10,12,14] 1586; XOP-NEXT: retq 1587; 1588; X86-SSE2-LABEL: splatvar_funnnel_v16i8: 1589; X86-SSE2: # %bb.0: 1590; X86-SSE2-NEXT: movdqa %xmm1, %xmm4 1591; X86-SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15] 1592; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2 1593; X86-SSE2-NEXT: psrlw %xmm2, %xmm4 1594; X86-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] 1595; X86-SSE2-NEXT: pand %xmm3, %xmm4 1596; X86-SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] 1597; X86-SSE2-NEXT: psrlw %xmm2, %xmm1 1598; X86-SSE2-NEXT: pand %xmm1, %xmm3 1599; X86-SSE2-NEXT: packuswb %xmm4, %xmm3 1600; X86-SSE2-NEXT: movdqa %xmm3, %xmm0 1601; X86-SSE2-NEXT: retl 1602 %splat = shufflevector <16 x i8> %amt, <16 x i8> undef, <16 x i32> zeroinitializer 1603 %res = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %splat) 1604 ret <16 x i8> %res 1605} 1606 1607; 1608; Constant Shifts 1609; 1610 1611define <2 x i64> @constant_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y) nounwind { 1612; SSE2-LABEL: constant_funnnel_v2i64: 1613; SSE2: # %bb.0: 1614; SSE2-NEXT: movdqa %xmm1, %xmm2 1615; SSE2-NEXT: psrlq $4, %xmm2 1616; SSE2-NEXT: psrlq $14, %xmm1 1617; SSE2-NEXT: shufpd {{.*#+}} xmm2 = xmm2[0],xmm1[1] 1618; SSE2-NEXT: movdqa %xmm0, %xmm1 1619; SSE2-NEXT: psllq $60, %xmm1 1620; SSE2-NEXT: psllq $50, %xmm0 1621; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] 1622; SSE2-NEXT: orpd %xmm2, %xmm0 1623; SSE2-NEXT: retq 1624; 1625; SSE41-LABEL: constant_funnnel_v2i64: 1626; SSE41: # %bb.0: 1627; SSE41-NEXT: movdqa %xmm1, %xmm2 1628; SSE41-NEXT: psrlq $14, %xmm2 1629; SSE41-NEXT: psrlq $4, %xmm1 1630; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1,2,3],xmm2[4,5,6,7] 1631; SSE41-NEXT: movdqa %xmm0, %xmm1 1632; SSE41-NEXT: psllq $50, %xmm1 1633; SSE41-NEXT: psllq $60, %xmm0 1634; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] 1635; SSE41-NEXT: por %xmm2, %xmm0 1636; SSE41-NEXT: retq 1637; 1638; AVX1-LABEL: constant_funnnel_v2i64: 
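; Note (illustrative comment, not an autogenerated check): with the per-lane
; amounts <4, 14> used by constant_funnnel_v2i64, fshr(x, y) is equivalent to
; (y lshr <4, 14>) | (x shl <60, 50>). Targets without per-lane variable
; 64-bit shifts (the SSE blocks above and the AVX1 block below) perform each
; lane's immediate shift separately and blend the halves, whereas AVX2 and
; later use vpsrlvq/vpsllvq directly.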
1639; AVX1: # %bb.0: 1640; AVX1-NEXT: vpsrlq $14, %xmm1, %xmm2 1641; AVX1-NEXT: vpsrlq $4, %xmm1, %xmm1 1642; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7] 1643; AVX1-NEXT: vpsllq $50, %xmm0, %xmm2 1644; AVX1-NEXT: vpsllq $60, %xmm0, %xmm0 1645; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7] 1646; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 1647; AVX1-NEXT: retq 1648; 1649; AVX2-LABEL: constant_funnnel_v2i64: 1650; AVX2: # %bb.0: 1651; AVX2-NEXT: vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 1652; AVX2-NEXT: vpsllvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1653; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 1654; AVX2-NEXT: retq 1655; 1656; AVX512F-LABEL: constant_funnnel_v2i64: 1657; AVX512F: # %bb.0: 1658; AVX512F-NEXT: vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 1659; AVX512F-NEXT: vpsllvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1660; AVX512F-NEXT: vpor %xmm1, %xmm0, %xmm0 1661; AVX512F-NEXT: retq 1662; 1663; AVX512VL-LABEL: constant_funnnel_v2i64: 1664; AVX512VL: # %bb.0: 1665; AVX512VL-NEXT: vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 1666; AVX512VL-NEXT: vpsllvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1667; AVX512VL-NEXT: vpor %xmm1, %xmm0, %xmm0 1668; AVX512VL-NEXT: retq 1669; 1670; AVX512BW-LABEL: constant_funnnel_v2i64: 1671; AVX512BW: # %bb.0: 1672; AVX512BW-NEXT: vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 1673; AVX512BW-NEXT: vpsllvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1674; AVX512BW-NEXT: vpor %xmm1, %xmm0, %xmm0 1675; AVX512BW-NEXT: retq 1676; 1677; AVX512VBMI2-LABEL: constant_funnnel_v2i64: 1678; AVX512VBMI2: # %bb.0: 1679; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 1680; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 1681; AVX512VBMI2-NEXT: vpmovsxbq {{.*#+}} xmm2 = [4,14] 1682; AVX512VBMI2-NEXT: vpshrdvq %zmm2, %zmm0, %zmm1 1683; AVX512VBMI2-NEXT: vmovdqa %xmm1, %xmm0 1684; AVX512VBMI2-NEXT: vzeroupper 1685; AVX512VBMI2-NEXT: retq 1686; 1687; AVX512VLBW-LABEL: constant_funnnel_v2i64: 1688; AVX512VLBW: # %bb.0: 1689; AVX512VLBW-NEXT: vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 1690; AVX512VLBW-NEXT: vpsllvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1691; AVX512VLBW-NEXT: vpor %xmm1, %xmm0, %xmm0 1692; AVX512VLBW-NEXT: retq 1693; 1694; AVX512VLVBMI2-LABEL: constant_funnnel_v2i64: 1695; AVX512VLVBMI2: # %bb.0: 1696; AVX512VLVBMI2-NEXT: vpshrdvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 1697; AVX512VLVBMI2-NEXT: vmovdqa %xmm1, %xmm0 1698; AVX512VLVBMI2-NEXT: retq 1699; 1700; XOPAVX1-LABEL: constant_funnnel_v2i64: 1701; XOPAVX1: # %bb.0: 1702; XOPAVX1-NEXT: vpshlq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 1703; XOPAVX1-NEXT: vpshlq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1704; XOPAVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 1705; XOPAVX1-NEXT: retq 1706; 1707; XOPAVX2-LABEL: constant_funnnel_v2i64: 1708; XOPAVX2: # %bb.0: 1709; XOPAVX2-NEXT: vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 1710; XOPAVX2-NEXT: vpsllvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1711; XOPAVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 1712; XOPAVX2-NEXT: retq 1713; 1714; X86-SSE2-LABEL: constant_funnnel_v2i64: 1715; X86-SSE2: # %bb.0: 1716; X86-SSE2-NEXT: movdqa %xmm1, %xmm2 1717; X86-SSE2-NEXT: psrlq $4, %xmm2 1718; X86-SSE2-NEXT: psrlq $14, %xmm1 1719; X86-SSE2-NEXT: shufpd {{.*#+}} xmm2 = xmm2[0],xmm1[1] 1720; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 1721; X86-SSE2-NEXT: psllq $60, %xmm1 1722; X86-SSE2-NEXT: psllq $50, %xmm0 1723; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] 1724; 
X86-SSE2-NEXT: orpd %xmm2, %xmm0 1725; X86-SSE2-NEXT: retl 1726 %res = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> <i64 4, i64 14>) 1727 ret <2 x i64> %res 1728} 1729 1730define <4 x i32> @constant_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y) nounwind { 1731; SSE2-LABEL: constant_funnnel_v4i32: 1732; SSE2: # %bb.0: 1733; SSE2-NEXT: movdqa %xmm1, %xmm2 1734; SSE2-NEXT: psrld $7, %xmm2 1735; SSE2-NEXT: movdqa %xmm1, %xmm3 1736; SSE2-NEXT: psrld $6, %xmm3 1737; SSE2-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm2[1] 1738; SSE2-NEXT: movdqa %xmm1, %xmm2 1739; SSE2-NEXT: psrld $5, %xmm2 1740; SSE2-NEXT: psrld $4, %xmm1 1741; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] 1742; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm3[0,3] 1743; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] 1744; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 1745; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 1746; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 1747; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] 1748; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] 1749; SSE2-NEXT: por %xmm1, %xmm0 1750; SSE2-NEXT: retq 1751; 1752; SSE41-LABEL: constant_funnnel_v4i32: 1753; SSE41: # %bb.0: 1754; SSE41-NEXT: movdqa %xmm1, %xmm2 1755; SSE41-NEXT: psrld $7, %xmm2 1756; SSE41-NEXT: movdqa %xmm1, %xmm3 1757; SSE41-NEXT: psrld $5, %xmm3 1758; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm2[4,5,6,7] 1759; SSE41-NEXT: movdqa %xmm1, %xmm2 1760; SSE41-NEXT: psrld $6, %xmm2 1761; SSE41-NEXT: psrld $4, %xmm1 1762; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1,2,3],xmm2[4,5,6,7] 1763; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7] 1764; SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 1765; SSE41-NEXT: por %xmm2, %xmm0 1766; SSE41-NEXT: retq 1767; 1768; AVX1-LABEL: constant_funnnel_v4i32: 1769; AVX1: # %bb.0: 1770; AVX1-NEXT: vpsrld $7, %xmm1, %xmm2 1771; AVX1-NEXT: vpsrld $5, %xmm1, %xmm3 1772; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] 1773; AVX1-NEXT: vpsrld $6, %xmm1, %xmm3 1774; AVX1-NEXT: vpsrld $4, %xmm1, %xmm1 1775; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5,6,7] 1776; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] 1777; AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1778; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 1779; AVX1-NEXT: retq 1780; 1781; AVX2-LABEL: constant_funnnel_v4i32: 1782; AVX2: # %bb.0: 1783; AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 1784; AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1785; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 1786; AVX2-NEXT: retq 1787; 1788; AVX512F-LABEL: constant_funnnel_v4i32: 1789; AVX512F: # %bb.0: 1790; AVX512F-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 1791; AVX512F-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1792; AVX512F-NEXT: vpor %xmm1, %xmm0, %xmm0 1793; AVX512F-NEXT: retq 1794; 1795; AVX512VL-LABEL: constant_funnnel_v4i32: 1796; AVX512VL: # %bb.0: 1797; AVX512VL-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 1798; AVX512VL-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1799; AVX512VL-NEXT: vpor %xmm1, %xmm0, %xmm0 1800; AVX512VL-NEXT: retq 1801; 1802; AVX512BW-LABEL: constant_funnnel_v4i32: 1803; AVX512BW: # %bb.0: 1804; AVX512BW-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 1805; AVX512BW-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1806; AVX512BW-NEXT: vpor %xmm1, 
%xmm0, %xmm0 1807; AVX512BW-NEXT: retq 1808; 1809; AVX512VBMI2-LABEL: constant_funnnel_v4i32: 1810; AVX512VBMI2: # %bb.0: 1811; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 1812; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 1813; AVX512VBMI2-NEXT: vpmovsxbd {{.*#+}} xmm2 = [4,5,6,7] 1814; AVX512VBMI2-NEXT: vpshrdvd %zmm2, %zmm0, %zmm1 1815; AVX512VBMI2-NEXT: vmovdqa %xmm1, %xmm0 1816; AVX512VBMI2-NEXT: vzeroupper 1817; AVX512VBMI2-NEXT: retq 1818; 1819; AVX512VLBW-LABEL: constant_funnnel_v4i32: 1820; AVX512VLBW: # %bb.0: 1821; AVX512VLBW-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 1822; AVX512VLBW-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1823; AVX512VLBW-NEXT: vpor %xmm1, %xmm0, %xmm0 1824; AVX512VLBW-NEXT: retq 1825; 1826; AVX512VLVBMI2-LABEL: constant_funnnel_v4i32: 1827; AVX512VLVBMI2: # %bb.0: 1828; AVX512VLVBMI2-NEXT: vpshrdvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 1829; AVX512VLVBMI2-NEXT: vmovdqa %xmm1, %xmm0 1830; AVX512VLVBMI2-NEXT: retq 1831; 1832; XOPAVX1-LABEL: constant_funnnel_v4i32: 1833; XOPAVX1: # %bb.0: 1834; XOPAVX1-NEXT: vpshld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 1835; XOPAVX1-NEXT: vpshld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1836; XOPAVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 1837; XOPAVX1-NEXT: retq 1838; 1839; XOPAVX2-LABEL: constant_funnnel_v4i32: 1840; XOPAVX2: # %bb.0: 1841; XOPAVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 1842; XOPAVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1843; XOPAVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 1844; XOPAVX2-NEXT: retq 1845; 1846; X86-SSE2-LABEL: constant_funnnel_v4i32: 1847; X86-SSE2: # %bb.0: 1848; X86-SSE2-NEXT: movdqa %xmm1, %xmm2 1849; X86-SSE2-NEXT: psrld $7, %xmm2 1850; X86-SSE2-NEXT: movdqa %xmm1, %xmm3 1851; X86-SSE2-NEXT: psrld $6, %xmm3 1852; X86-SSE2-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm2[1] 1853; X86-SSE2-NEXT: movdqa %xmm1, %xmm2 1854; X86-SSE2-NEXT: psrld $5, %xmm2 1855; X86-SSE2-NEXT: psrld $4, %xmm1 1856; X86-SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] 1857; X86-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm3[0,3] 1858; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] 1859; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 1860; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 1861; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2 1862; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] 1863; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] 1864; X86-SSE2-NEXT: por %xmm1, %xmm0 1865; X86-SSE2-NEXT: retl 1866 %res = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 4, i32 5, i32 6, i32 7>) 1867 ret <4 x i32> %res 1868} 1869 1870define <8 x i16> @constant_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y) nounwind { 1871; SSE2-LABEL: constant_funnnel_v8i16: 1872; SSE2: # %bb.0: 1873; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,65535,65535,65535,65535,65535,65535,65535] 1874; SSE2-NEXT: pandn %xmm1, %xmm2 1875; SSE2-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 1876; SSE2-NEXT: por %xmm1, %xmm2 1877; SSE2-NEXT: paddw %xmm0, %xmm0 1878; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [32768,16384,8192,4096,2048,1024,512,256] 1879; SSE2-NEXT: por %xmm2, %xmm0 1880; SSE2-NEXT: retq 1881; 1882; SSE41-LABEL: constant_funnnel_v8i16: 1883; SSE41: # %bb.0: 1884; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [u,32768,16384,8192,4096,2048,1024,512] 1885; SSE41-NEXT: pmulhuw %xmm1, %xmm2 1886; SSE41-NEXT: pblendw {{.*#+}} xmm2 = 
xmm1[0],xmm2[1,2,3,4,5,6,7] 1887; SSE41-NEXT: paddw %xmm0, %xmm0 1888; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [32768,16384,8192,4096,2048,1024,512,256] 1889; SSE41-NEXT: por %xmm2, %xmm0 1890; SSE41-NEXT: retq 1891; 1892; AVX-LABEL: constant_funnnel_v8i16: 1893; AVX: # %bb.0: 1894; AVX-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2 # [u,32768,16384,8192,4096,2048,1024,512] 1895; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5,6,7] 1896; AVX-NEXT: vpaddw %xmm0, %xmm0, %xmm0 1897; AVX-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [32768,16384,8192,4096,2048,1024,512,256] 1898; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 1899; AVX-NEXT: retq 1900; 1901; AVX512F-LABEL: constant_funnnel_v8i16: 1902; AVX512F: # %bb.0: 1903; AVX512F-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2 # [u,32768,16384,8192,4096,2048,1024,512] 1904; AVX512F-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5,6,7] 1905; AVX512F-NEXT: vpaddw %xmm0, %xmm0, %xmm0 1906; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [32768,16384,8192,4096,2048,1024,512,256] 1907; AVX512F-NEXT: vpor %xmm1, %xmm0, %xmm0 1908; AVX512F-NEXT: retq 1909; 1910; AVX512VL-LABEL: constant_funnnel_v8i16: 1911; AVX512VL: # %bb.0: 1912; AVX512VL-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2 # [u,32768,16384,8192,4096,2048,1024,512] 1913; AVX512VL-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5,6,7] 1914; AVX512VL-NEXT: vpaddw %xmm0, %xmm0, %xmm0 1915; AVX512VL-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [32768,16384,8192,4096,2048,1024,512,256] 1916; AVX512VL-NEXT: vpor %xmm1, %xmm0, %xmm0 1917; AVX512VL-NEXT: retq 1918; 1919; AVX512BW-LABEL: constant_funnnel_v8i16: 1920; AVX512BW: # %bb.0: 1921; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 1922; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7] 1923; AVX512BW-NEXT: vpsrlvw %zmm2, %zmm1, %zmm1 1924; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm2 = [15,14,13,12,11,10,9,8] 1925; AVX512BW-NEXT: vpaddw %xmm0, %xmm0, %xmm0 1926; AVX512BW-NEXT: vpsllvw %zmm2, %zmm0, %zmm0 1927; AVX512BW-NEXT: vpor %xmm1, %xmm0, %xmm0 1928; AVX512BW-NEXT: vzeroupper 1929; AVX512BW-NEXT: retq 1930; 1931; AVX512VBMI2-LABEL: constant_funnnel_v8i16: 1932; AVX512VBMI2: # %bb.0: 1933; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 1934; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 1935; AVX512VBMI2-NEXT: vpmovsxbw {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7] 1936; AVX512VBMI2-NEXT: vpshrdvw %zmm2, %zmm0, %zmm1 1937; AVX512VBMI2-NEXT: vmovdqa %xmm1, %xmm0 1938; AVX512VBMI2-NEXT: vzeroupper 1939; AVX512VBMI2-NEXT: retq 1940; 1941; AVX512VLBW-LABEL: constant_funnnel_v8i16: 1942; AVX512VLBW: # %bb.0: 1943; AVX512VLBW-NEXT: vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 1944; AVX512VLBW-NEXT: vpaddw %xmm0, %xmm0, %xmm0 1945; AVX512VLBW-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1946; AVX512VLBW-NEXT: vpor %xmm1, %xmm0, %xmm0 1947; AVX512VLBW-NEXT: retq 1948; 1949; AVX512VLVBMI2-LABEL: constant_funnnel_v8i16: 1950; AVX512VLVBMI2: # %bb.0: 1951; AVX512VLVBMI2-NEXT: vpshrdvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 1952; AVX512VLVBMI2-NEXT: vmovdqa %xmm1, %xmm0 1953; AVX512VLVBMI2-NEXT: retq 1954; 1955; XOP-LABEL: constant_funnnel_v8i16: 1956; XOP: # %bb.0: 1957; XOP-NEXT: vpshlw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 1958; XOP-NEXT: vpaddw %xmm0, %xmm0, %xmm0 1959; XOP-NEXT: vpshlw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1960; XOP-NEXT: vpor %xmm1, %xmm0, 
%xmm0 1961; XOP-NEXT: retq 1962; 1963; X86-SSE2-LABEL: constant_funnnel_v8i16: 1964; X86-SSE2: # %bb.0: 1965; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,65535,65535,65535,65535,65535,65535,65535] 1966; X86-SSE2-NEXT: pandn %xmm1, %xmm2 1967; X86-SSE2-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 1968; X86-SSE2-NEXT: por %xmm1, %xmm2 1969; X86-SSE2-NEXT: paddw %xmm0, %xmm0 1970; X86-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [32768,16384,8192,4096,2048,1024,512,256] 1971; X86-SSE2-NEXT: por %xmm2, %xmm0 1972; X86-SSE2-NEXT: retl 1973 %res = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>) 1974 ret <8 x i16> %res 1975} 1976 1977define <16 x i8> @constant_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y) nounwind { 1978; SSE2-LABEL: constant_funnnel_v16i8: 1979; SSE2: # %bb.0: 1980; SSE2-NEXT: pxor %xmm2, %xmm2 1981; SSE2-NEXT: movdqa %xmm1, %xmm3 1982; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] 1983; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 # [256,2,4,8,16,32,64,128] 1984; SSE2-NEXT: psrlw $8, %xmm3 1985; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] 1986; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [256,128,64,32,16,8,4,2] 1987; SSE2-NEXT: psrlw $8, %xmm1 1988; SSE2-NEXT: packuswb %xmm3, %xmm1 1989; SSE2-NEXT: paddb %xmm0, %xmm0 1990; SSE2-NEXT: movdqa %xmm0, %xmm2 1991; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 1992; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [128,1,2,4,8,16,32,64] 1993; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] 1994; SSE2-NEXT: pand %xmm3, %xmm2 1995; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1996; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [128,64,32,16,8,4,2,1] 1997; SSE2-NEXT: pand %xmm3, %xmm0 1998; SSE2-NEXT: packuswb %xmm2, %xmm0 1999; SSE2-NEXT: por %xmm1, %xmm0 2000; SSE2-NEXT: retq 2001; 2002; SSE41-LABEL: constant_funnnel_v16i8: 2003; SSE41: # %bb.0: 2004; SSE41-NEXT: pxor %xmm2, %xmm2 2005; SSE41-NEXT: pmovzxbw {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero 2006; SSE41-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] 2007; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [256,2,4,8,16,32,64,128] 2008; SSE41-NEXT: psrlw $8, %xmm1 2009; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 # [256,128,64,32,16,8,4,2] 2010; SSE41-NEXT: psrlw $8, %xmm3 2011; SSE41-NEXT: packuswb %xmm1, %xmm3 2012; SSE41-NEXT: paddb %xmm0, %xmm0 2013; SSE41-NEXT: movdqa %xmm0, %xmm1 2014; SSE41-NEXT: pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [0,64,0,16,0,4,0,1,0,1,0,4,0,16,0,64] 2015; SSE41-NEXT: psllw $8, %xmm1 2016; SSE41-NEXT: por %xmm3, %xmm1 2017; SSE41-NEXT: pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [128,0,32,0,8,0,2,0,128,0,2,0,8,0,32,0] 2018; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 2019; SSE41-NEXT: por %xmm1, %xmm0 2020; SSE41-NEXT: retq 2021; 2022; AVX1-LABEL: constant_funnnel_v16i8: 2023; AVX1: # %bb.0: 2024; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 2025; 
AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] 2026; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 # [256,2,4,8,16,32,64,128] 2027; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2 2028; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero 2029; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [256,128,64,32,16,8,4,2] 2030; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1 2031; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1 2032; AVX1-NEXT: vpaddb %xmm0, %xmm0, %xmm0 2033; AVX1-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 # [0,64,0,16,0,4,0,1,0,1,0,4,0,16,0,64] 2034; AVX1-NEXT: vpsllw $8, %xmm2, %xmm2 2035; AVX1-NEXT: vpor %xmm1, %xmm2, %xmm1 2036; AVX1-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [128,0,32,0,8,0,2,0,128,0,2,0,8,0,32,0] 2037; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 2038; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 2039; AVX1-NEXT: retq 2040; 2041; AVX2-LABEL: constant_funnnel_v16i8: 2042; AVX2: # %bb.0: 2043; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero 2044; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [256,128,64,32,16,8,4,2,256,2,4,8,16,32,64,128] 2045; AVX2-NEXT: vpsrlw $8, %ymm1, %ymm1 2046; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 2047; AVX2-NEXT: vpackuswb %xmm2, %xmm1, %xmm1 2048; AVX2-NEXT: vpaddb %xmm0, %xmm0, %xmm0 2049; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero 2050; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [128,64,32,16,8,4,2,1,128,1,2,4,8,16,32,64] 2051; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 2052; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 2053; AVX2-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 2054; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 2055; AVX2-NEXT: vzeroupper 2056; AVX2-NEXT: retq 2057; 2058; AVX512F-LABEL: constant_funnnel_v16i8: 2059; AVX512F: # %bb.0: 2060; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero 2061; AVX512F-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 2062; AVX512F-NEXT: vpaddb %xmm0, %xmm0, %xmm0 2063; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero 2064; AVX512F-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 2065; AVX512F-NEXT: vpord %zmm1, %zmm0, %zmm0 2066; 
AVX512F-NEXT: vpmovdb %zmm0, %xmm0 2067; AVX512F-NEXT: vzeroupper 2068; AVX512F-NEXT: retq 2069; 2070; AVX512VL-LABEL: constant_funnnel_v16i8: 2071; AVX512VL: # %bb.0: 2072; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero 2073; AVX512VL-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 2074; AVX512VL-NEXT: vpaddb %xmm0, %xmm0, %xmm0 2075; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero 2076; AVX512VL-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 2077; AVX512VL-NEXT: vpord %zmm1, %zmm0, %zmm0 2078; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0 2079; AVX512VL-NEXT: vzeroupper 2080; AVX512VL-NEXT: retq 2081; 2082; AVX512BW-LABEL: constant_funnnel_v16i8: 2083; AVX512BW: # %bb.0: 2084; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,0,7,6,5,4,3,2,1] 2085; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero 2086; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero 2087; AVX512BW-NEXT: vpsllw $8, %ymm0, %ymm0 2088; AVX512BW-NEXT: vpor %ymm1, %ymm0, %ymm0 2089; AVX512BW-NEXT: vpsrlvw %zmm2, %zmm0, %zmm0 2090; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 2091; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 2092; AVX512BW-NEXT: vzeroupper 2093; AVX512BW-NEXT: retq 2094; 2095; AVX512VBMI2-LABEL: constant_funnnel_v16i8: 2096; AVX512VBMI2: # %bb.0: 2097; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 2098; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 2099; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,64,1,65,2,66,3,67,4,68,5,69,6,70,7,71,8,72,9,73,10,74,11,75,12,76,13,77,14,78,15,79] 2100; AVX512VBMI2-NEXT: vpermt2b %zmm0, %zmm2, %zmm1 2101; AVX512VBMI2-NEXT: vpmovsxbw {{.*#+}} ymm0 = [0,1,2,3,4,5,6,7,0,7,6,5,4,3,2,1] 2102; AVX512VBMI2-NEXT: vpsrlvw %zmm0, %zmm1, %zmm0 2103; AVX512VBMI2-NEXT: vpmovwb %zmm0, %ymm0 2104; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 2105; AVX512VBMI2-NEXT: vzeroupper 2106; AVX512VBMI2-NEXT: retq 2107; 2108; AVX512VLBW-LABEL: constant_funnnel_v16i8: 2109; AVX512VLBW: # %bb.0: 2110; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero 2111; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm0 = 
xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero 2112; AVX512VLBW-NEXT: vpsllw $8, %ymm0, %ymm0 2113; AVX512VLBW-NEXT: vpor %ymm1, %ymm0, %ymm0 2114; AVX512VLBW-NEXT: vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 2115; AVX512VLBW-NEXT: vpmovwb %ymm0, %xmm0 2116; AVX512VLBW-NEXT: vzeroupper 2117; AVX512VLBW-NEXT: retq 2118; 2119; AVX512VLVBMI2-LABEL: constant_funnnel_v16i8: 2120; AVX512VLVBMI2: # %bb.0: 2121; AVX512VLVBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1 2122; AVX512VLVBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 2123; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,32,1,33,2,34,3,35,4,36,5,37,6,38,7,39,8,40,9,41,10,42,11,43,12,44,13,45,14,46,15,47] 2124; AVX512VLVBMI2-NEXT: vpermi2b %ymm0, %ymm1, %ymm2 2125; AVX512VLVBMI2-NEXT: vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm0 2126; AVX512VLVBMI2-NEXT: vpmovwb %ymm0, %xmm0 2127; AVX512VLVBMI2-NEXT: vzeroupper 2128; AVX512VLVBMI2-NEXT: retq 2129; 2130; XOP-LABEL: constant_funnnel_v16i8: 2131; XOP: # %bb.0: 2132; XOP-NEXT: vpshlb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 2133; XOP-NEXT: vpaddb %xmm0, %xmm0, %xmm0 2134; XOP-NEXT: vpshlb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 2135; XOP-NEXT: vpor %xmm1, %xmm0, %xmm0 2136; XOP-NEXT: retq 2137; 2138; X86-SSE2-LABEL: constant_funnnel_v16i8: 2139; X86-SSE2: # %bb.0: 2140; X86-SSE2-NEXT: pxor %xmm2, %xmm2 2141; X86-SSE2-NEXT: movdqa %xmm1, %xmm3 2142; X86-SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] 2143; X86-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm3 # [256,2,4,8,16,32,64,128] 2144; X86-SSE2-NEXT: psrlw $8, %xmm3 2145; X86-SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] 2146; X86-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 # [256,128,64,32,16,8,4,2] 2147; X86-SSE2-NEXT: psrlw $8, %xmm1 2148; X86-SSE2-NEXT: packuswb %xmm3, %xmm1 2149; X86-SSE2-NEXT: paddb %xmm0, %xmm0 2150; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 2151; X86-SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 2152; X86-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2 # [128,1,2,4,8,16,32,64] 2153; X86-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] 2154; X86-SSE2-NEXT: pand %xmm3, %xmm2 2155; X86-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 2156; X86-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [128,64,32,16,8,4,2,1] 2157; X86-SSE2-NEXT: pand %xmm3, %xmm0 2158; X86-SSE2-NEXT: packuswb %xmm2, %xmm0 2159; X86-SSE2-NEXT: por %xmm1, %xmm0 2160; X86-SSE2-NEXT: retl 2161 %res = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1>) 2162 ret <16 x i8> %res 2163} 2164 2165; 2166; Uniform Constant Shifts 2167; 2168 2169define <2 x i64> @splatconstant_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y) nounwind { 2170; SSE-LABEL: splatconstant_funnnel_v2i64: 2171; SSE: # %bb.0: 2172; SSE-NEXT: psrlq $14, %xmm1 2173; SSE-NEXT: psllq $50, %xmm0 2174; SSE-NEXT: por %xmm1, %xmm0 2175; SSE-NEXT: retq 2176; 2177; AVX-LABEL: splatconstant_funnnel_v2i64: 2178; AVX: # %bb.0: 2179; AVX-NEXT: vpsrlq $14, 
%xmm1, %xmm1 2180; AVX-NEXT: vpsllq $50, %xmm0, %xmm0 2181; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 2182; AVX-NEXT: retq 2183; 2184; AVX512F-LABEL: splatconstant_funnnel_v2i64: 2185; AVX512F: # %bb.0: 2186; AVX512F-NEXT: vpsrlq $14, %xmm1, %xmm1 2187; AVX512F-NEXT: vpsllq $50, %xmm0, %xmm0 2188; AVX512F-NEXT: vpor %xmm1, %xmm0, %xmm0 2189; AVX512F-NEXT: retq 2190; 2191; AVX512VL-LABEL: splatconstant_funnnel_v2i64: 2192; AVX512VL: # %bb.0: 2193; AVX512VL-NEXT: vpsrlq $14, %xmm1, %xmm1 2194; AVX512VL-NEXT: vpsllq $50, %xmm0, %xmm0 2195; AVX512VL-NEXT: vpor %xmm1, %xmm0, %xmm0 2196; AVX512VL-NEXT: retq 2197; 2198; AVX512BW-LABEL: splatconstant_funnnel_v2i64: 2199; AVX512BW: # %bb.0: 2200; AVX512BW-NEXT: vpsrlq $14, %xmm1, %xmm1 2201; AVX512BW-NEXT: vpsllq $50, %xmm0, %xmm0 2202; AVX512BW-NEXT: vpor %xmm1, %xmm0, %xmm0 2203; AVX512BW-NEXT: retq 2204; 2205; AVX512VBMI2-LABEL: splatconstant_funnnel_v2i64: 2206; AVX512VBMI2: # %bb.0: 2207; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 2208; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 2209; AVX512VBMI2-NEXT: vpshrdq $14, %zmm0, %zmm1, %zmm0 2210; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 2211; AVX512VBMI2-NEXT: vzeroupper 2212; AVX512VBMI2-NEXT: retq 2213; 2214; AVX512VLBW-LABEL: splatconstant_funnnel_v2i64: 2215; AVX512VLBW: # %bb.0: 2216; AVX512VLBW-NEXT: vpsrlq $14, %xmm1, %xmm1 2217; AVX512VLBW-NEXT: vpsllq $50, %xmm0, %xmm0 2218; AVX512VLBW-NEXT: vpor %xmm1, %xmm0, %xmm0 2219; AVX512VLBW-NEXT: retq 2220; 2221; AVX512VLVBMI2-LABEL: splatconstant_funnnel_v2i64: 2222; AVX512VLVBMI2: # %bb.0: 2223; AVX512VLVBMI2-NEXT: vpshrdq $14, %xmm0, %xmm1, %xmm0 2224; AVX512VLVBMI2-NEXT: retq 2225; 2226; XOP-LABEL: splatconstant_funnnel_v2i64: 2227; XOP: # %bb.0: 2228; XOP-NEXT: vpsrlq $14, %xmm1, %xmm1 2229; XOP-NEXT: vpsllq $50, %xmm0, %xmm0 2230; XOP-NEXT: vpor %xmm1, %xmm0, %xmm0 2231; XOP-NEXT: retq 2232; 2233; X86-SSE2-LABEL: splatconstant_funnnel_v2i64: 2234; X86-SSE2: # %bb.0: 2235; X86-SSE2-NEXT: psrlq $14, %xmm1 2236; X86-SSE2-NEXT: psllq $50, %xmm0 2237; X86-SSE2-NEXT: por %xmm1, %xmm0 2238; X86-SSE2-NEXT: retl 2239 %res = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> <i64 14, i64 14>) 2240 ret <2 x i64> %res 2241} 2242 2243define <4 x i32> @splatconstant_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y) nounwind { 2244; SSE-LABEL: splatconstant_funnnel_v4i32: 2245; SSE: # %bb.0: 2246; SSE-NEXT: psrld $4, %xmm1 2247; SSE-NEXT: pslld $28, %xmm0 2248; SSE-NEXT: por %xmm1, %xmm0 2249; SSE-NEXT: retq 2250; 2251; AVX-LABEL: splatconstant_funnnel_v4i32: 2252; AVX: # %bb.0: 2253; AVX-NEXT: vpsrld $4, %xmm1, %xmm1 2254; AVX-NEXT: vpslld $28, %xmm0, %xmm0 2255; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 2256; AVX-NEXT: retq 2257; 2258; AVX512F-LABEL: splatconstant_funnnel_v4i32: 2259; AVX512F: # %bb.0: 2260; AVX512F-NEXT: vpsrld $4, %xmm1, %xmm1 2261; AVX512F-NEXT: vpslld $28, %xmm0, %xmm0 2262; AVX512F-NEXT: vpor %xmm1, %xmm0, %xmm0 2263; AVX512F-NEXT: retq 2264; 2265; AVX512VL-LABEL: splatconstant_funnnel_v4i32: 2266; AVX512VL: # %bb.0: 2267; AVX512VL-NEXT: vpsrld $4, %xmm1, %xmm1 2268; AVX512VL-NEXT: vpslld $28, %xmm0, %xmm0 2269; AVX512VL-NEXT: vpor %xmm1, %xmm0, %xmm0 2270; AVX512VL-NEXT: retq 2271; 2272; AVX512BW-LABEL: splatconstant_funnnel_v4i32: 2273; AVX512BW: # %bb.0: 2274; AVX512BW-NEXT: vpsrld $4, %xmm1, %xmm1 2275; AVX512BW-NEXT: vpslld $28, %xmm0, %xmm0 2276; AVX512BW-NEXT: vpor %xmm1, %xmm0, %xmm0 2277; AVX512BW-NEXT: retq 2278; 2279; AVX512VBMI2-LABEL: splatconstant_funnnel_v4i32: 2280; 
AVX512VBMI2: # %bb.0: 2281; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 2282; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 2283; AVX512VBMI2-NEXT: vpshrdd $4, %zmm0, %zmm1, %zmm0 2284; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 2285; AVX512VBMI2-NEXT: vzeroupper 2286; AVX512VBMI2-NEXT: retq 2287; 2288; AVX512VLBW-LABEL: splatconstant_funnnel_v4i32: 2289; AVX512VLBW: # %bb.0: 2290; AVX512VLBW-NEXT: vpsrld $4, %xmm1, %xmm1 2291; AVX512VLBW-NEXT: vpslld $28, %xmm0, %xmm0 2292; AVX512VLBW-NEXT: vpor %xmm1, %xmm0, %xmm0 2293; AVX512VLBW-NEXT: retq 2294; 2295; AVX512VLVBMI2-LABEL: splatconstant_funnnel_v4i32: 2296; AVX512VLVBMI2: # %bb.0: 2297; AVX512VLVBMI2-NEXT: vpshrdd $4, %xmm0, %xmm1, %xmm0 2298; AVX512VLVBMI2-NEXT: retq 2299; 2300; XOP-LABEL: splatconstant_funnnel_v4i32: 2301; XOP: # %bb.0: 2302; XOP-NEXT: vpsrld $4, %xmm1, %xmm1 2303; XOP-NEXT: vpslld $28, %xmm0, %xmm0 2304; XOP-NEXT: vpor %xmm1, %xmm0, %xmm0 2305; XOP-NEXT: retq 2306; 2307; X86-SSE2-LABEL: splatconstant_funnnel_v4i32: 2308; X86-SSE2: # %bb.0: 2309; X86-SSE2-NEXT: psrld $4, %xmm1 2310; X86-SSE2-NEXT: pslld $28, %xmm0 2311; X86-SSE2-NEXT: por %xmm1, %xmm0 2312; X86-SSE2-NEXT: retl 2313 %res = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 4, i32 4, i32 4, i32 4>) 2314 ret <4 x i32> %res 2315} 2316 2317define <8 x i16> @splatconstant_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y) nounwind { 2318; SSE-LABEL: splatconstant_funnnel_v8i16: 2319; SSE: # %bb.0: 2320; SSE-NEXT: psrlw $7, %xmm1 2321; SSE-NEXT: psllw $9, %xmm0 2322; SSE-NEXT: por %xmm1, %xmm0 2323; SSE-NEXT: retq 2324; 2325; AVX-LABEL: splatconstant_funnnel_v8i16: 2326; AVX: # %bb.0: 2327; AVX-NEXT: vpsrlw $7, %xmm1, %xmm1 2328; AVX-NEXT: vpsllw $9, %xmm0, %xmm0 2329; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 2330; AVX-NEXT: retq 2331; 2332; AVX512F-LABEL: splatconstant_funnnel_v8i16: 2333; AVX512F: # %bb.0: 2334; AVX512F-NEXT: vpsrlw $7, %xmm1, %xmm1 2335; AVX512F-NEXT: vpsllw $9, %xmm0, %xmm0 2336; AVX512F-NEXT: vpor %xmm1, %xmm0, %xmm0 2337; AVX512F-NEXT: retq 2338; 2339; AVX512VL-LABEL: splatconstant_funnnel_v8i16: 2340; AVX512VL: # %bb.0: 2341; AVX512VL-NEXT: vpsrlw $7, %xmm1, %xmm1 2342; AVX512VL-NEXT: vpsllw $9, %xmm0, %xmm0 2343; AVX512VL-NEXT: vpor %xmm1, %xmm0, %xmm0 2344; AVX512VL-NEXT: retq 2345; 2346; AVX512BW-LABEL: splatconstant_funnnel_v8i16: 2347; AVX512BW: # %bb.0: 2348; AVX512BW-NEXT: vpsrlw $7, %xmm1, %xmm1 2349; AVX512BW-NEXT: vpsllw $9, %xmm0, %xmm0 2350; AVX512BW-NEXT: vpor %xmm1, %xmm0, %xmm0 2351; AVX512BW-NEXT: retq 2352; 2353; AVX512VBMI2-LABEL: splatconstant_funnnel_v8i16: 2354; AVX512VBMI2: # %bb.0: 2355; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 2356; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 2357; AVX512VBMI2-NEXT: vpshrdw $7, %zmm0, %zmm1, %zmm0 2358; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 2359; AVX512VBMI2-NEXT: vzeroupper 2360; AVX512VBMI2-NEXT: retq 2361; 2362; AVX512VLBW-LABEL: splatconstant_funnnel_v8i16: 2363; AVX512VLBW: # %bb.0: 2364; AVX512VLBW-NEXT: vpsrlw $7, %xmm1, %xmm1 2365; AVX512VLBW-NEXT: vpsllw $9, %xmm0, %xmm0 2366; AVX512VLBW-NEXT: vpor %xmm1, %xmm0, %xmm0 2367; AVX512VLBW-NEXT: retq 2368; 2369; AVX512VLVBMI2-LABEL: splatconstant_funnnel_v8i16: 2370; AVX512VLVBMI2: # %bb.0: 2371; AVX512VLVBMI2-NEXT: vpshrdw $7, %xmm0, %xmm1, %xmm0 2372; AVX512VLVBMI2-NEXT: retq 2373; 2374; XOP-LABEL: splatconstant_funnnel_v8i16: 2375; XOP: # %bb.0: 2376; XOP-NEXT: vpsrlw $7, %xmm1, %xmm1 2377; XOP-NEXT: vpsllw 
$9, %xmm0, %xmm0 2378; XOP-NEXT: vpor %xmm1, %xmm0, %xmm0 2379; XOP-NEXT: retq 2380; 2381; X86-SSE2-LABEL: splatconstant_funnnel_v8i16: 2382; X86-SSE2: # %bb.0: 2383; X86-SSE2-NEXT: psrlw $7, %xmm1 2384; X86-SSE2-NEXT: psllw $9, %xmm0 2385; X86-SSE2-NEXT: por %xmm1, %xmm0 2386; X86-SSE2-NEXT: retl 2387 %res = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>) 2388 ret <8 x i16> %res 2389} 2390 2391define <16 x i8> @splatconstant_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y) nounwind { 2392; SSE-LABEL: splatconstant_funnnel_v16i8: 2393; SSE: # %bb.0: 2394; SSE-NEXT: psrlw $4, %xmm1 2395; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 2396; SSE-NEXT: psllw $4, %xmm0 2397; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 2398; SSE-NEXT: por %xmm1, %xmm0 2399; SSE-NEXT: retq 2400; 2401; AVX-LABEL: splatconstant_funnnel_v16i8: 2402; AVX: # %bb.0: 2403; AVX-NEXT: vpsrlw $4, %xmm1, %xmm1 2404; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 2405; AVX-NEXT: vpsllw $4, %xmm0, %xmm0 2406; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 2407; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 2408; AVX-NEXT: retq 2409; 2410; AVX512F-LABEL: splatconstant_funnnel_v16i8: 2411; AVX512F: # %bb.0: 2412; AVX512F-NEXT: vpsllw $4, %xmm0, %xmm2 2413; AVX512F-NEXT: vpsrlw $4, %xmm1, %xmm0 2414; AVX512F-NEXT: vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm2)) 2415; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 2416; AVX512F-NEXT: vzeroupper 2417; AVX512F-NEXT: retq 2418; 2419; AVX512VL-LABEL: splatconstant_funnnel_v16i8: 2420; AVX512VL: # %bb.0: 2421; AVX512VL-NEXT: vpsllw $4, %xmm0, %xmm2 2422; AVX512VL-NEXT: vpsrlw $4, %xmm1, %xmm0 2423; AVX512VL-NEXT: vpternlogd {{.*#+}} xmm0 = xmm0 ^ (mem & (xmm0 ^ xmm2)) 2424; AVX512VL-NEXT: retq 2425; 2426; AVX512BW-LABEL: splatconstant_funnnel_v16i8: 2427; AVX512BW: # %bb.0: 2428; AVX512BW-NEXT: vpsllw $4, %xmm0, %xmm2 2429; AVX512BW-NEXT: vpsrlw $4, %xmm1, %xmm0 2430; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm2)) 2431; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 2432; AVX512BW-NEXT: vzeroupper 2433; AVX512BW-NEXT: retq 2434; 2435; AVX512VBMI2-LABEL: splatconstant_funnnel_v16i8: 2436; AVX512VBMI2: # %bb.0: 2437; AVX512VBMI2-NEXT: vpsllw $4, %xmm0, %xmm2 2438; AVX512VBMI2-NEXT: vpsrlw $4, %xmm1, %xmm0 2439; AVX512VBMI2-NEXT: vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm2)) 2440; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 2441; AVX512VBMI2-NEXT: vzeroupper 2442; AVX512VBMI2-NEXT: retq 2443; 2444; AVX512VLBW-LABEL: splatconstant_funnnel_v16i8: 2445; AVX512VLBW: # %bb.0: 2446; AVX512VLBW-NEXT: vpsllw $4, %xmm0, %xmm2 2447; AVX512VLBW-NEXT: vpsrlw $4, %xmm1, %xmm0 2448; AVX512VLBW-NEXT: vpternlogd {{.*#+}} xmm0 = xmm0 ^ (mem & (xmm0 ^ xmm2)) 2449; AVX512VLBW-NEXT: retq 2450; 2451; AVX512VLVBMI2-LABEL: splatconstant_funnnel_v16i8: 2452; AVX512VLVBMI2: # %bb.0: 2453; AVX512VLVBMI2-NEXT: vpsllw $4, %xmm0, %xmm2 2454; AVX512VLVBMI2-NEXT: vpsrlw $4, %xmm1, %xmm0 2455; AVX512VLVBMI2-NEXT: vpternlogd {{.*#+}} xmm0 = xmm0 ^ (mem & (xmm0 ^ xmm2)) 2456; AVX512VLVBMI2-NEXT: retq 2457; 2458; XOP-LABEL: splatconstant_funnnel_v16i8: 2459; XOP: # %bb.0: 2460; XOP-NEXT: vpsrlw $4, %xmm1, %xmm1 2461; XOP-NEXT: vpsllw $4, %xmm0, %xmm0 2462; XOP-NEXT: vpcmov {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm0, %xmm0 2463; XOP-NEXT: retq 2464; 2465; X86-SSE2-LABEL: splatconstant_funnnel_v16i8: 2466; X86-SSE2: # %bb.0: 2467; 
X86-SSE2-NEXT: psrlw $4, %xmm1 2468; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 2469; X86-SSE2-NEXT: psllw $4, %xmm0 2470; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 2471; X86-SSE2-NEXT: por %xmm1, %xmm0 2472; X86-SSE2-NEXT: retl 2473 %res = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>) 2474 ret <16 x i8> %res 2475} 2476
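;
; Note (illustrative sketch, not part of the autogenerated assertions): for a
; uniform constant amount c with 0 < c < BW, a funnel shift right reduces per
; element to two immediate shifts and an OR, which is the pattern the
; splatconstant checks above verify (e.g. psrlw $7 / psllw $9 for v8i16).
; A hypothetical manual expansion of splatconstant_funnnel_v8i16 in IR, kept
; commented out so it does not change the test:
;
; define <8 x i16> @splatconstant_funnnel_v8i16_expanded(<8 x i16> %x, <8 x i16> %y) {
;   ; low part: y logically shifted right by the amount (7)
;   %lo = lshr <8 x i16> %y, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
;   ; high part: x shifted left by BW - amount (16 - 7 = 9)
;   %hi = shl <8 x i16> %x, <i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9>
;   %r = or <8 x i16> %hi, %lo
;   ret <8 x i16> %r
; }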