; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --no_x86_scrub_sp --no_x86_scrub_mem_shuffle
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse3 | FileCheck %s --check-prefix=SSE3
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=SSSE3
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop | FileCheck %s --check-prefixes=AVX,AVXNOVLBW,XOP
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVXNOVLBW,AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVXNOVLBW,AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX,AVXNOVLBW,AVX512
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefixes=AVX,AVXNOVLBW,AVX512
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vbmi | FileCheck %s --check-prefixes=AVX,AVXNOVLBW,AVX512
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefixes=AVX,AVX512VL,AVX512VLBW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+avx512vbmi | FileCheck %s --check-prefixes=AVX,AVX512VL,VLVBMI

; Variable permute of a 2 x i64 vector: each lane of %indices is extracted and
; used as a run-time element index into %v, and the two fetched scalars are
; reassembled into the result. The insertelement chain writes both lanes, so
; the poison seed vector is never observable in the returned value.
define <2 x i64> @var_shuffle_v2i64(<2 x i64> %v, <2 x i64> %indices) nounwind {
; SSE3-LABEL: var_shuffle_v2i64:
; SSE3:       # %bb.0:
; SSE3-NEXT:    movq %xmm1, %rax
; SSE3-NEXT:    andl $1, %eax
; SSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; SSE3-NEXT:    movq %xmm1, %rcx
; SSE3-NEXT:    andl $1, %ecx
; SSE3-NEXT:    movaps %xmm0, -24(%rsp)
; SSE3-NEXT:    movsd -24(%rsp,%rax,8), %xmm0 # xmm0 = mem[0],zero
; SSE3-NEXT:    movsd -24(%rsp,%rcx,8), %xmm1 # xmm1 = mem[0],zero
; SSE3-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE3-NEXT:    retq
;
; SSSE3-LABEL: var_shuffle_v2i64:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movq %xmm1, %rax
; SSSE3-NEXT:    andl $1, %eax
; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; SSSE3-NEXT:    movq %xmm1, %rcx
; SSSE3-NEXT:    andl $1, %ecx
; SSSE3-NEXT:    movaps %xmm0, -24(%rsp)
; SSSE3-NEXT:    movsd -24(%rsp,%rax,8), %xmm0 # xmm0 = mem[0],zero
; SSSE3-NEXT:    movsd -24(%rsp,%rcx,8), %xmm1 # xmm1 = mem[0],zero
; SSSE3-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: var_shuffle_v2i64:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pxor %xmm2, %xmm2
; SSE41-NEXT:    pcmpeqq %xmm1, %xmm2
; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[0,1,0,1]
; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE41-NEXT:    movdqa %xmm2, %xmm0
; SSE41-NEXT:    blendvpd %xmm0, %xmm3, %xmm1
; SSE41-NEXT:    movapd %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: var_shuffle_v2i64:
; AVX:       # %bb.0:
; AVX-NEXT:    vpaddq %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vpermilpd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %index0 = extractelement <2 x i64> %indices, i32 0
  %index1 = extractelement <2 x i64> %indices, i32 1
  %v0 = extractelement <2 x i64> %v, i64 %index0
  %v1 = extractelement <2 x i64> %v, i64 %index1
  %ret0 = insertelement <2 x i64> poison, i64 %v0, i32 0
  %ret1 = insertelement <2 x i64> %ret0, i64 %v1, i32 1
  ret <2 x i64> %ret1
}

; Zeroing variant of the 2 x i64 variable permute. Lanes of %indices that
; compare unsigned-greater-than 3 are replaced by -1 before being used as
; element indices, and the same compare mask selects zero for those result
; lanes. The threshold is the literal 3 written in the icmp below.
define <2 x i64> @var_shuffle_zero_v2i64(<2 x i64> %v, <2 x i64> %indices) nounwind {
; SSE3-LABEL: var_shuffle_zero_v2i64:
; SSE3:       # %bb.0:
; SSE3-NEXT:    movdqa {{.*#+}} xmm2 = [9223372039002259456,9223372039002259456]
; SSE3-NEXT:    pxor %xmm1, %xmm2
; SSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
; SSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2]
; SSE3-NEXT:    pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
; SSE3-NEXT:    pand %xmm4, %xmm3
; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; SSE3-NEXT:    por %xmm3, %xmm2
; SSE3-NEXT:    por %xmm2, %xmm1
; SSE3-NEXT:    movq %xmm1, %rax
; SSE3-NEXT:    andl $1, %eax
; SSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; SSE3-NEXT:    movq %xmm1, %rcx
; SSE3-NEXT:    andl $1, %ecx
; SSE3-NEXT:    movaps %xmm0, -24(%rsp)
; SSE3-NEXT:    movq -24(%rsp,%rax,8), %xmm0 # xmm0 = mem[0],zero
; SSE3-NEXT:    movq -24(%rsp,%rcx,8), %xmm1 # xmm1 = mem[0],zero
; SSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE3-NEXT:    pandn %xmm0, %xmm2
; SSE3-NEXT:    movdqa %xmm2, %xmm0
; SSE3-NEXT:    retq
;
; SSSE3-LABEL: var_shuffle_zero_v2i64:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [9223372039002259456,9223372039002259456]
; SSSE3-NEXT:    pxor %xmm1, %xmm2
; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
; SSSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2]
; SSSE3-NEXT:    pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
; SSSE3-NEXT:    pand %xmm4, %xmm3
; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; SSSE3-NEXT:    por %xmm3, %xmm2
; SSSE3-NEXT:    por %xmm2, %xmm1
; SSSE3-NEXT:    movq %xmm1, %rax
; SSSE3-NEXT:    andl $1, %eax
; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; SSSE3-NEXT:    movq %xmm1, %rcx
; SSSE3-NEXT:    andl $1, %ecx
; SSSE3-NEXT:    movaps %xmm0, -24(%rsp)
; SSSE3-NEXT:    movq -24(%rsp,%rax,8), %xmm0 # xmm0 = mem[0],zero
; SSSE3-NEXT:    movq -24(%rsp,%rcx,8), %xmm1 # xmm1 = mem[0],zero
; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSSE3-NEXT:    pandn %xmm0, %xmm2
; SSSE3-NEXT:    movdqa %xmm2, %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: var_shuffle_zero_v2i64:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [9223372039002259456,9223372039002259456]
; SSE41-NEXT:    pxor %xmm1, %xmm2
; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
; SSE41-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2]
; SSE41-NEXT:    pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
; SSE41-NEXT:    pand %xmm4, %xmm3
; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; SSE41-NEXT:    por %xmm3, %xmm2
; SSE41-NEXT:    por %xmm2, %xmm1
; SSE41-NEXT:    pxor %xmm3, %xmm3
; SSE41-NEXT:    pcmpeqq %xmm1, %xmm3
; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,1,0,1]
; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[2,3,2,3]
; SSE41-NEXT:    movdqa %xmm3, %xmm0
; SSE41-NEXT:    blendvpd %xmm0, %xmm1, %xmm4
; SSE41-NEXT:    pandn %xmm4, %xmm2
; SSE41-NEXT:    movdqa %xmm2, %xmm0
; SSE41-NEXT:    retq
;
; XOP-LABEL: var_shuffle_zero_v2i64:
; XOP:       # %bb.0:
; XOP-NEXT:    vpcomgtuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2
; XOP-NEXT:    vpor %xmm1, %xmm2, %xmm1
; XOP-NEXT:    vpaddq %xmm1, %xmm1, %xmm1
; XOP-NEXT:    vpermilpd %xmm1, %xmm0, %xmm0
; XOP-NEXT:    vpandn %xmm0, %xmm2, %xmm0
; XOP-NEXT:    retq
;
; AVX1-LABEL: var_shuffle_zero_v2i64:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2
; AVX1-NEXT:    vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX1-NEXT:    vpor %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vpaddq %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vpermilpd %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpandn %xmm0, %xmm2, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: var_shuffle_zero_v2i64:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2
; AVX2-NEXT:    vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX2-NEXT:    vpor %xmm1, %xmm2, %xmm1
; AVX2-NEXT:    vpaddq %xmm1, %xmm1, %xmm1
; AVX2-NEXT:    vpermilpd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpandn %xmm0, %xmm2, %xmm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: var_shuffle_zero_v2i64:
; AVX512:       # %bb.0:
; AVX512-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512-NEXT:    vpmovsxbq {{.*#+}} xmm2 = [3,3]
; AVX512-NEXT:    vpcmpnleuq %zmm2, %zmm1, %k1
; AVX512-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX512-NEXT:    vmovdqa64 %zmm2, %zmm1 {%k1}
; AVX512-NEXT:    vpaddq %xmm1, %xmm1, %xmm1
; AVX512-NEXT:    vpermilpd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k1}
; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
;
; AVX512VL-LABEL: var_shuffle_zero_v2i64:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm1, %k1
; AVX512VL-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX512VL-NEXT:    vmovdqa64 %xmm2, %xmm1 {%k1}
; AVX512VL-NEXT:    vpaddq %xmm1, %xmm1, %xmm1
; AVX512VL-NEXT:    vpermilpd %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512VL-NEXT:    vmovdqa64 %xmm1, %xmm0 {%k1}
; AVX512VL-NEXT:    retq
  %cmp = icmp ugt <2 x i64> %indices, <i64 3, i64 3>
  %or = select <2 x i1> %cmp, <2 x i64> <i64 -1, i64 -1>, <2 x i64> %indices
  %idx0 = extractelement <2 x i64> %or, i64 0
  %idx1 = extractelement <2 x i64> %or, i64 1
  %elt0 = extractelement <2 x i64> %v, i64 %idx0
  %elt1 = extractelement <2 x i64> %v, i64 %idx1
  %vec0 = insertelement <2 x i64> poison, i64 %elt0, i64 0
  %vec1 = insertelement <2 x i64> %vec0, i64 %elt1, i64 1
  %res = select <2 x i1> %cmp, <2 x i64> zeroinitializer, <2 x i64> %vec1
  ret <2 x i64> %res
}

; Variable permute of a 4 x i32 vector: each lane of %indices is extracted and
; used as a run-time element index into %v, and the four fetched scalars are
; reassembled into the result. The insertelement chain writes all four lanes,
; so the poison seed vector is never observable in the returned value.
define <4 x i32> @var_shuffle_v4i32(<4 x i32> %v, <4 x i32> %indices) nounwind {
; SSE3-LABEL: var_shuffle_v4i32:
; SSE3:       # %bb.0:
; SSE3-NEXT:    movd %xmm1, %eax
; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,1,1]
; SSE3-NEXT:    movd %xmm2, %ecx
; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
; SSE3-NEXT:    movd %xmm2, %edx
; SSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[3,3,3,3]
; SSE3-NEXT:    movd %xmm1, %esi
; SSE3-NEXT:    movaps %xmm0, -24(%rsp)
; SSE3-NEXT:    andl $3, %eax
; SSE3-NEXT:    andl $3, %ecx
; SSE3-NEXT:    andl $3, %edx
; SSE3-NEXT:    andl $3, %esi
; SSE3-NEXT:    movss -24(%rsp,%rsi,4), %xmm0 # xmm0 = mem[0],zero,zero,zero
; SSE3-NEXT:    movss -24(%rsp,%rdx,4), %xmm1 # xmm1 = mem[0],zero,zero,zero
; SSE3-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE3-NEXT:    movss -24(%rsp,%rax,4), %xmm0 # xmm0 = mem[0],zero,zero,zero
; SSE3-NEXT:    movss -24(%rsp,%rcx,4), %xmm2 # xmm2 = mem[0],zero,zero,zero
; SSE3-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE3-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE3-NEXT:    retq
;
; SSSE3-LABEL: var_shuffle_v4i32:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [67372036,67372036,67372036,67372036]
; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
; SSSE3-NEXT:    pmuludq %xmm2, %xmm1
; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSSE3-NEXT:    pmuludq %xmm2, %xmm3
; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3]
; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; SSSE3-NEXT:    paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSSE3-NEXT:    pshufb %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: var_shuffle_v4i32:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE41-NEXT:    paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE41-NEXT:    pshufb %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: var_shuffle_v4i32:
; AVX:       # %bb.0:
; AVX-NEXT:    vpermilps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %index0 = extractelement <4 x i32> %indices, i32 0
  %index1 = extractelement <4 x i32> %indices, i32 1
  %index2 = extractelement <4 x i32> %indices, i32 2
  %index3 = extractelement <4 x i32> %indices, i32 3
  %v0 = extractelement <4 x i32> %v, i32 %index0
  %v1 = extractelement <4 x i32> %v, i32 %index1
  %v2 = extractelement <4 x i32> %v, i32 %index2
  %v3 = extractelement <4 x i32> %v, i32 %index3
  %ret0 = insertelement <4 x i32> poison, i32 %v0, i32 0
  %ret1 = insertelement <4 x i32> %ret0, i32 %v1, i32 1
  %ret2 = insertelement <4 x i32> %ret1, i32 %v2, i32 2
  %ret3 = insertelement <4 x i32> %ret2, i32 %v3, i32 3
  ret <4 x i32> %ret3
}
267 268define <4 x i32> @var_shuffle_zero_v4i32(<4 x i32> %v, <4 x i32> %indices) nounwind { 269; SSE3-LABEL: var_shuffle_zero_v4i32: 270; SSE3: # %bb.0: 271; SSE3-NEXT: movaps %xmm0, %xmm2 272; SSE3-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,2147483648,2147483648,2147483648] 273; SSE3-NEXT: pxor %xmm1, %xmm0 274; SSE3-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 275; SSE3-NEXT: por %xmm0, %xmm1 276; SSE3-NEXT: movd %xmm1, %eax 277; SSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,1,1] 278; SSE3-NEXT: movd %xmm3, %ecx 279; SSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3] 280; SSE3-NEXT: movd %xmm3, %edx 281; SSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,3,3,3] 282; SSE3-NEXT: movd %xmm1, %esi 283; SSE3-NEXT: movaps %xmm2, -24(%rsp) 284; SSE3-NEXT: andl $3, %eax 285; SSE3-NEXT: andl $3, %ecx 286; SSE3-NEXT: andl $3, %edx 287; SSE3-NEXT: andl $3, %esi 288; SSE3-NEXT: movd -24(%rsp,%rsi,4), %xmm1 # xmm1 = mem[0],zero,zero,zero 289; SSE3-NEXT: movd -24(%rsp,%rdx,4), %xmm2 # xmm2 = mem[0],zero,zero,zero 290; SSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] 291; SSE3-NEXT: movd -24(%rsp,%rax,4), %xmm1 # xmm1 = mem[0],zero,zero,zero 292; SSE3-NEXT: movd -24(%rsp,%rcx,4), %xmm3 # xmm3 = mem[0],zero,zero,zero 293; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] 294; SSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] 295; SSE3-NEXT: pandn %xmm1, %xmm0 296; SSE3-NEXT: retq 297; 298; SSSE3-LABEL: var_shuffle_zero_v4i32: 299; SSSE3: # %bb.0: 300; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] 301; SSSE3-NEXT: pxor %xmm1, %xmm2 302; SSSE3-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 303; SSSE3-NEXT: por %xmm2, %xmm1 304; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [67372036,67372036,67372036,67372036] 305; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3] 306; SSSE3-NEXT: pmuludq %xmm3, %xmm1 307; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 308; SSSE3-NEXT: pmuludq %xmm3, %xmm4 309; SSSE3-NEXT: pshufd 
{{.*#+}} xmm3 = xmm4[0,2,2,3] 310; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] 311; SSSE3-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 312; SSSE3-NEXT: por %xmm2, %xmm1 313; SSSE3-NEXT: pshufb %xmm1, %xmm0 314; SSSE3-NEXT: retq 315; 316; SSE41-LABEL: var_shuffle_zero_v4i32: 317; SSE41: # %bb.0: 318; SSE41-NEXT: pmovsxbd {{.*#+}} xmm2 = [4,4,4,4] 319; SSE41-NEXT: pmaxud %xmm1, %xmm2 320; SSE41-NEXT: pcmpeqd %xmm1, %xmm2 321; SSE41-NEXT: por %xmm2, %xmm1 322; SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 323; SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 324; SSE41-NEXT: por %xmm2, %xmm1 325; SSE41-NEXT: pshufb %xmm1, %xmm0 326; SSE41-NEXT: retq 327; 328; XOP-LABEL: var_shuffle_zero_v4i32: 329; XOP: # %bb.0: 330; XOP-NEXT: vpcomgtud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2 331; XOP-NEXT: vpor %xmm1, %xmm2, %xmm1 332; XOP-NEXT: vpermilps %xmm1, %xmm0, %xmm0 333; XOP-NEXT: vpandn %xmm0, %xmm2, %xmm0 334; XOP-NEXT: retq 335; 336; AVX1-LABEL: var_shuffle_zero_v4i32: 337; AVX1: # %bb.0: 338; AVX1-NEXT: vpmaxud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2 339; AVX1-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm2 340; AVX1-NEXT: vpor %xmm1, %xmm2, %xmm1 341; AVX1-NEXT: vpermilps %xmm1, %xmm0, %xmm0 342; AVX1-NEXT: vpandn %xmm0, %xmm2, %xmm0 343; AVX1-NEXT: retq 344; 345; AVX2-LABEL: var_shuffle_zero_v4i32: 346; AVX2: # %bb.0: 347; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [4,4,4,4] 348; AVX2-NEXT: vpmaxud %xmm2, %xmm1, %xmm2 349; AVX2-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm2 350; AVX2-NEXT: vpor %xmm1, %xmm2, %xmm1 351; AVX2-NEXT: vpermilps %xmm1, %xmm0, %xmm0 352; AVX2-NEXT: vpandn %xmm0, %xmm2, %xmm0 353; AVX2-NEXT: retq 354; 355; AVX512-LABEL: var_shuffle_zero_v4i32: 356; AVX512: # %bb.0: 357; AVX512-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 358; AVX512-NEXT: vpcmpnleud {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %k1 359; AVX512-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 360; AVX512-NEXT: vmovdqa32 %zmm2, %zmm1 {%k1} 361; AVX512-NEXT: 
vpermilps %xmm1, %xmm0, %xmm0 362; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1 363; AVX512-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} 364; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 365; AVX512-NEXT: vzeroupper 366; AVX512-NEXT: retq 367; 368; AVX512VL-LABEL: var_shuffle_zero_v4i32: 369; AVX512VL: # %bb.0: 370; AVX512VL-NEXT: vpcmpnleud {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %k1 371; AVX512VL-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 372; AVX512VL-NEXT: vmovdqa32 %xmm2, %xmm1 {%k1} 373; AVX512VL-NEXT: vpermilps %xmm1, %xmm0, %xmm0 374; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 375; AVX512VL-NEXT: vmovdqa32 %xmm1, %xmm0 {%k1} 376; AVX512VL-NEXT: retq 377 %cmp = icmp ugt <4 x i32> %indices, <i32 3, i32 3, i32 3, i32 3> 378 %or = select <4 x i1> %cmp, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, <4 x i32> %indices 379 %idx0 = extractelement <4 x i32> %or, i64 0 380 %idx1 = extractelement <4 x i32> %or, i64 1 381 %idx2 = extractelement <4 x i32> %or, i64 2 382 %idx3 = extractelement <4 x i32> %or, i64 3 383 %elt0 = extractelement <4 x i32> %v, i32 %idx0 384 %elt1 = extractelement <4 x i32> %v, i32 %idx1 385 %elt2 = extractelement <4 x i32> %v, i32 %idx2 386 %elt3 = extractelement <4 x i32> %v, i32 %idx3 387 %vec0 = insertelement <4 x i32> poison, i32 %elt0, i32 0 388 %vec1 = insertelement <4 x i32> %vec0, i32 %elt1, i32 1 389 %vec2 = insertelement <4 x i32> %vec1, i32 %elt2, i32 2 390 %vec3 = insertelement <4 x i32> %vec2, i32 %elt3, i32 3 391 %res = select <4 x i1> %cmp, <4 x i32> zeroinitializer, <4 x i32> %vec3 392 ret <4 x i32> %res 393} 394 395define <8 x i16> @var_shuffle_v8i16(<8 x i16> %v, <8 x i16> %indices) nounwind { 396; SSE3-LABEL: var_shuffle_v8i16: 397; SSE3: # %bb.0: 398; SSE3-NEXT: pextrw $0, %xmm1, %eax 399; SSE3-NEXT: pextrw $1, %xmm1, %ecx 400; SSE3-NEXT: pextrw $2, %xmm1, %edx 401; SSE3-NEXT: pextrw $3, %xmm1, %esi 402; SSE3-NEXT: pextrw $4, %xmm1, %edi 403; SSE3-NEXT: pextrw $5, %xmm1, %r8d 404; SSE3-NEXT: pextrw $6, %xmm1, %r9d 405; SSE3-NEXT: 
pextrw $7, %xmm1, %r10d 406; SSE3-NEXT: movaps %xmm0, -24(%rsp) 407; SSE3-NEXT: andl $7, %eax 408; SSE3-NEXT: andl $7, %ecx 409; SSE3-NEXT: andl $7, %edx 410; SSE3-NEXT: andl $7, %esi 411; SSE3-NEXT: andl $7, %edi 412; SSE3-NEXT: andl $7, %r8d 413; SSE3-NEXT: andl $7, %r9d 414; SSE3-NEXT: andl $7, %r10d 415; SSE3-NEXT: movzwl -24(%rsp,%r10,2), %r10d 416; SSE3-NEXT: movd %r10d, %xmm0 417; SSE3-NEXT: movzwl -24(%rsp,%r9,2), %r9d 418; SSE3-NEXT: movd %r9d, %xmm1 419; SSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] 420; SSE3-NEXT: movzwl -24(%rsp,%r8,2), %r8d 421; SSE3-NEXT: movd %r8d, %xmm0 422; SSE3-NEXT: movzwl -24(%rsp,%rdi,2), %edi 423; SSE3-NEXT: movd %edi, %xmm2 424; SSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] 425; SSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] 426; SSE3-NEXT: movzwl -24(%rsp,%rsi,2), %esi 427; SSE3-NEXT: movd %esi, %xmm0 428; SSE3-NEXT: movzwl -24(%rsp,%rdx,2), %edx 429; SSE3-NEXT: movd %edx, %xmm1 430; SSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] 431; SSE3-NEXT: movzwl -24(%rsp,%rcx,2), %ecx 432; SSE3-NEXT: movd %ecx, %xmm3 433; SSE3-NEXT: movzwl -24(%rsp,%rax,2), %eax 434; SSE3-NEXT: movd %eax, %xmm0 435; SSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] 436; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 437; SSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] 438; SSE3-NEXT: retq 439; 440; SSSE3-LABEL: var_shuffle_v8i16: 441; SSSE3: # %bb.0: 442; SSSE3-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [514,514,514,514,514,514,514,514] 443; SSSE3-NEXT: paddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 444; SSSE3-NEXT: pshufb %xmm1, %xmm0 445; SSSE3-NEXT: retq 446; 447; SSE41-LABEL: var_shuffle_v8i16: 448; SSE41: # %bb.0: 449; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # 
[514,514,514,514,514,514,514,514] 450; SSE41-NEXT: paddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 451; SSE41-NEXT: pshufb %xmm1, %xmm0 452; SSE41-NEXT: retq 453; 454; AVXNOVLBW-LABEL: var_shuffle_v8i16: 455; AVXNOVLBW: # %bb.0: 456; AVXNOVLBW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [514,514,514,514,514,514,514,514] 457; AVXNOVLBW-NEXT: vpaddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 458; AVXNOVLBW-NEXT: vpshufb %xmm1, %xmm0, %xmm0 459; AVXNOVLBW-NEXT: retq 460; 461; AVX512VL-LABEL: var_shuffle_v8i16: 462; AVX512VL: # %bb.0: 463; AVX512VL-NEXT: vpermw %xmm0, %xmm1, %xmm0 464; AVX512VL-NEXT: retq 465 %index0 = extractelement <8 x i16> %indices, i32 0 466 %index1 = extractelement <8 x i16> %indices, i32 1 467 %index2 = extractelement <8 x i16> %indices, i32 2 468 %index3 = extractelement <8 x i16> %indices, i32 3 469 %index4 = extractelement <8 x i16> %indices, i32 4 470 %index5 = extractelement <8 x i16> %indices, i32 5 471 %index6 = extractelement <8 x i16> %indices, i32 6 472 %index7 = extractelement <8 x i16> %indices, i32 7 473 %v0 = extractelement <8 x i16> %v, i16 %index0 474 %v1 = extractelement <8 x i16> %v, i16 %index1 475 %v2 = extractelement <8 x i16> %v, i16 %index2 476 %v3 = extractelement <8 x i16> %v, i16 %index3 477 %v4 = extractelement <8 x i16> %v, i16 %index4 478 %v5 = extractelement <8 x i16> %v, i16 %index5 479 %v6 = extractelement <8 x i16> %v, i16 %index6 480 %v7 = extractelement <8 x i16> %v, i16 %index7 481 %ret0 = insertelement <8 x i16> undef, i16 %v0, i32 0 482 %ret1 = insertelement <8 x i16> %ret0, i16 %v1, i32 1 483 %ret2 = insertelement <8 x i16> %ret1, i16 %v2, i32 2 484 %ret3 = insertelement <8 x i16> %ret2, i16 %v3, i32 3 485 %ret4 = insertelement <8 x i16> %ret3, i16 %v4, i32 4 486 %ret5 = insertelement <8 x i16> %ret4, i16 %v5, i32 5 487 %ret6 = insertelement <8 x i16> %ret5, i16 %v6, i32 6 488 %ret7 = insertelement <8 x i16> %ret6, i16 %v7, i32 7 489 ret <8 x i16> %ret7 490} 491 492define <8 x i16> 
@var_shuffle_zero_v8i16(<8 x i16> %v, <8 x i16> %indices) nounwind { 493; SSE3-LABEL: var_shuffle_zero_v8i16: 494; SSE3: # %bb.0: 495; SSE3-NEXT: movdqa %xmm0, %xmm2 496; SSE3-NEXT: movdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8] 497; SSE3-NEXT: psubusw %xmm1, %xmm3 498; SSE3-NEXT: pxor %xmm0, %xmm0 499; SSE3-NEXT: pcmpeqw %xmm3, %xmm0 500; SSE3-NEXT: por %xmm0, %xmm1 501; SSE3-NEXT: pextrw $0, %xmm1, %eax 502; SSE3-NEXT: pextrw $1, %xmm1, %ecx 503; SSE3-NEXT: pextrw $2, %xmm1, %edx 504; SSE3-NEXT: pextrw $3, %xmm1, %esi 505; SSE3-NEXT: pextrw $4, %xmm1, %edi 506; SSE3-NEXT: pextrw $5, %xmm1, %r8d 507; SSE3-NEXT: pextrw $6, %xmm1, %r9d 508; SSE3-NEXT: pextrw $7, %xmm1, %r10d 509; SSE3-NEXT: movdqa %xmm2, -24(%rsp) 510; SSE3-NEXT: andl $7, %eax 511; SSE3-NEXT: andl $7, %ecx 512; SSE3-NEXT: andl $7, %edx 513; SSE3-NEXT: andl $7, %esi 514; SSE3-NEXT: andl $7, %edi 515; SSE3-NEXT: andl $7, %r8d 516; SSE3-NEXT: andl $7, %r9d 517; SSE3-NEXT: andl $7, %r10d 518; SSE3-NEXT: movzwl -24(%rsp,%r10,2), %r10d 519; SSE3-NEXT: movd %r10d, %xmm1 520; SSE3-NEXT: movzwl -24(%rsp,%r9,2), %r9d 521; SSE3-NEXT: movd %r9d, %xmm2 522; SSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] 523; SSE3-NEXT: movzwl -24(%rsp,%r8,2), %r8d 524; SSE3-NEXT: movd %r8d, %xmm1 525; SSE3-NEXT: movzwl -24(%rsp,%rdi,2), %edi 526; SSE3-NEXT: movd %edi, %xmm3 527; SSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] 528; SSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] 529; SSE3-NEXT: movzwl -24(%rsp,%rsi,2), %esi 530; SSE3-NEXT: movd %esi, %xmm1 531; SSE3-NEXT: movzwl -24(%rsp,%rdx,2), %edx 532; SSE3-NEXT: movd %edx, %xmm2 533; SSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] 534; SSE3-NEXT: movzwl -24(%rsp,%rcx,2), %ecx 535; SSE3-NEXT: movd %ecx, %xmm1 536; SSE3-NEXT: movzwl -24(%rsp,%rax,2), %eax 537; SSE3-NEXT: movd %eax, %xmm4 538; SSE3-NEXT: 
punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] 539; SSE3-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] 540; SSE3-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm3[0] 541; SSE3-NEXT: pandn %xmm4, %xmm0 542; SSE3-NEXT: retq 543; 544; SSSE3-LABEL: var_shuffle_zero_v8i16: 545; SSSE3: # %bb.0: 546; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [8,8,8,8,8,8,8,8] 547; SSSE3-NEXT: psubusw %xmm1, %xmm2 548; SSSE3-NEXT: pxor %xmm3, %xmm3 549; SSSE3-NEXT: pcmpeqw %xmm2, %xmm3 550; SSSE3-NEXT: por %xmm3, %xmm1 551; SSSE3-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [514,514,514,514,514,514,514,514] 552; SSSE3-NEXT: paddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 553; SSSE3-NEXT: por %xmm3, %xmm1 554; SSSE3-NEXT: pshufb %xmm1, %xmm0 555; SSSE3-NEXT: retq 556; 557; SSE41-LABEL: var_shuffle_zero_v8i16: 558; SSE41: # %bb.0: 559; SSE41-NEXT: pmovsxbw {{.*#+}} xmm2 = [8,8,8,8,8,8,8,8] 560; SSE41-NEXT: pmaxuw %xmm1, %xmm2 561; SSE41-NEXT: pcmpeqw %xmm1, %xmm2 562; SSE41-NEXT: por %xmm2, %xmm1 563; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [514,514,514,514,514,514,514,514] 564; SSE41-NEXT: paddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 565; SSE41-NEXT: por %xmm2, %xmm1 566; SSE41-NEXT: pshufb %xmm1, %xmm0 567; SSE41-NEXT: retq 568; 569; XOP-LABEL: var_shuffle_zero_v8i16: 570; XOP: # %bb.0: 571; XOP-NEXT: vpcomgtuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2 572; XOP-NEXT: vpor %xmm1, %xmm2, %xmm1 573; XOP-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [514,514,514,514,514,514,514,514] 574; XOP-NEXT: vpaddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 575; XOP-NEXT: vpor %xmm2, %xmm1, %xmm1 576; XOP-NEXT: vpshufb %xmm1, %xmm0, %xmm0 577; XOP-NEXT: retq 578; 579; AVX1-LABEL: var_shuffle_zero_v8i16: 580; AVX1: # %bb.0: 581; AVX1-NEXT: vpmaxuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2 582; AVX1-NEXT: vpcmpeqw %xmm2, %xmm1, %xmm2 583; AVX1-NEXT: vpor %xmm1, %xmm2, %xmm1 584; AVX1-NEXT: vpmullw 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [514,514,514,514,514,514,514,514] 585; AVX1-NEXT: vpaddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 586; AVX1-NEXT: vpor %xmm2, %xmm1, %xmm1 587; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0 588; AVX1-NEXT: retq 589; 590; AVX2-LABEL: var_shuffle_zero_v8i16: 591; AVX2: # %bb.0: 592; AVX2-NEXT: vpmaxuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2 593; AVX2-NEXT: vpcmpeqw %xmm2, %xmm1, %xmm2 594; AVX2-NEXT: vpor %xmm1, %xmm2, %xmm1 595; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [514,514,514,514,514,514,514,514] 596; AVX2-NEXT: vpaddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 597; AVX2-NEXT: vpor %xmm2, %xmm1, %xmm1 598; AVX2-NEXT: vpshufb %xmm1, %xmm0, %xmm0 599; AVX2-NEXT: retq 600; 601; AVX512VL-LABEL: var_shuffle_zero_v8i16: 602; AVX512VL: # %bb.0: 603; AVX512VL-NEXT: vpcmpnleuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %k1 604; AVX512VL-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 605; AVX512VL-NEXT: vmovdqu16 %xmm2, %xmm1 {%k1} 606; AVX512VL-NEXT: vpermw %xmm0, %xmm1, %xmm0 607; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 608; AVX512VL-NEXT: vmovdqu16 %xmm1, %xmm0 {%k1} 609; AVX512VL-NEXT: retq 610 %cmp = icmp ugt <8 x i16> %indices, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7> 611 %or = select <8 x i1> %cmp, <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>, <8 x i16> %indices 612 %idx0 = extractelement <8 x i16> %or, i64 0 613 %idx1 = extractelement <8 x i16> %or, i64 1 614 %idx2 = extractelement <8 x i16> %or, i64 2 615 %idx3 = extractelement <8 x i16> %or, i64 3 616 %idx4 = extractelement <8 x i16> %or, i64 4 617 %idx5 = extractelement <8 x i16> %or, i64 5 618 %idx6 = extractelement <8 x i16> %or, i64 6 619 %idx7 = extractelement <8 x i16> %or, i64 7 620 %elt0 = extractelement <8 x i16> %v, i16 %idx0 621 %elt1 = extractelement <8 x i16> %v, i16 %idx1 622 %elt2 = extractelement <8 x i16> %v, i16 %idx2 623 %elt3 = extractelement <8 x i16> %v, i16 %idx3 624 %elt4 = 
extractelement <8 x i16> %v, i16 %idx4 625 %elt5 = extractelement <8 x i16> %v, i16 %idx5 626 %elt6 = extractelement <8 x i16> %v, i16 %idx6 627 %elt7 = extractelement <8 x i16> %v, i16 %idx7 628 %vec0 = insertelement <8 x i16> poison, i16 %elt0, i64 0 629 %vec1 = insertelement <8 x i16> %vec0, i16 %elt1, i64 1 630 %vec2 = insertelement <8 x i16> %vec1, i16 %elt2, i64 2 631 %vec3 = insertelement <8 x i16> %vec2, i16 %elt3, i64 3 632 %vec4 = insertelement <8 x i16> %vec3, i16 %elt4, i64 4 633 %vec5 = insertelement <8 x i16> %vec4, i16 %elt5, i64 5 634 %vec6 = insertelement <8 x i16> %vec5, i16 %elt6, i64 6 635 %vec7 = insertelement <8 x i16> %vec6, i16 %elt7, i64 7 636 %res = select <8 x i1> %cmp, <8 x i16> zeroinitializer, <8 x i16> %vec7 637 ret <8 x i16> %res 638} 639 640define <16 x i8> @var_shuffle_v16i8(<16 x i8> %v, <16 x i8> %indices) nounwind { 641; SSE3-LABEL: var_shuffle_v16i8: 642; SSE3: # %bb.0: 643; SSE3-NEXT: movaps %xmm1, -40(%rsp) 644; SSE3-NEXT: movaps %xmm0, -24(%rsp) 645; SSE3-NEXT: movzbl -25(%rsp), %eax 646; SSE3-NEXT: andl $15, %eax 647; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax 648; SSE3-NEXT: movd %eax, %xmm1 649; SSE3-NEXT: movzbl -26(%rsp), %eax 650; SSE3-NEXT: andl $15, %eax 651; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax 652; SSE3-NEXT: movd %eax, %xmm2 653; SSE3-NEXT: movzbl -27(%rsp), %eax 654; SSE3-NEXT: andl $15, %eax 655; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax 656; SSE3-NEXT: movd %eax, %xmm4 657; SSE3-NEXT: movzbl -28(%rsp), %eax 658; SSE3-NEXT: andl $15, %eax 659; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax 660; SSE3-NEXT: movd %eax, %xmm3 661; SSE3-NEXT: movzbl -29(%rsp), %eax 662; SSE3-NEXT: andl $15, %eax 663; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax 664; SSE3-NEXT: movd %eax, %xmm6 665; SSE3-NEXT: movzbl -30(%rsp), %eax 666; SSE3-NEXT: andl $15, %eax 667; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax 668; SSE3-NEXT: movd %eax, %xmm7 669; SSE3-NEXT: movzbl -31(%rsp), %eax 670; SSE3-NEXT: andl $15, %eax 671; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax 
672; SSE3-NEXT: movd %eax, %xmm8 673; SSE3-NEXT: movzbl -32(%rsp), %eax 674; SSE3-NEXT: andl $15, %eax 675; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax 676; SSE3-NEXT: movd %eax, %xmm5 677; SSE3-NEXT: movzbl -33(%rsp), %eax 678; SSE3-NEXT: andl $15, %eax 679; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax 680; SSE3-NEXT: movd %eax, %xmm9 681; SSE3-NEXT: movzbl -34(%rsp), %eax 682; SSE3-NEXT: andl $15, %eax 683; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax 684; SSE3-NEXT: movd %eax, %xmm10 685; SSE3-NEXT: movzbl -35(%rsp), %eax 686; SSE3-NEXT: andl $15, %eax 687; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax 688; SSE3-NEXT: movd %eax, %xmm12 689; SSE3-NEXT: movzbl -36(%rsp), %eax 690; SSE3-NEXT: andl $15, %eax 691; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax 692; SSE3-NEXT: movd %eax, %xmm11 693; SSE3-NEXT: movzbl -37(%rsp), %eax 694; SSE3-NEXT: andl $15, %eax 695; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax 696; SSE3-NEXT: movd %eax, %xmm13 697; SSE3-NEXT: movzbl -38(%rsp), %eax 698; SSE3-NEXT: andl $15, %eax 699; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax 700; SSE3-NEXT: movd %eax, %xmm14 701; SSE3-NEXT: movzbl -39(%rsp), %eax 702; SSE3-NEXT: andl $15, %eax 703; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax 704; SSE3-NEXT: movd %eax, %xmm15 705; SSE3-NEXT: movzbl -40(%rsp), %eax 706; SSE3-NEXT: andl $15, %eax 707; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax 708; SSE3-NEXT: movd %eax, %xmm0 709; SSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] 710; SSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] 711; SSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] 712; SSE3-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] 713; SSE3-NEXT: 
punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm8[0],xmm5[1],xmm8[1],xmm5[2],xmm8[2],xmm5[3],xmm8[3],xmm5[4],xmm8[4],xmm5[5],xmm8[5],xmm5[6],xmm8[6],xmm5[7],xmm8[7] 714; SSE3-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3] 715; SSE3-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] 716; SSE3-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3],xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] 717; SSE3-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3],xmm11[4],xmm12[4],xmm11[5],xmm12[5],xmm11[6],xmm12[6],xmm11[7],xmm12[7] 718; SSE3-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] 719; SSE3-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3],xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] 720; SSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3],xmm0[4],xmm15[4],xmm0[5],xmm15[5],xmm0[6],xmm15[6],xmm0[7],xmm15[7] 721; SSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3] 722; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1] 723; SSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm5[0] 724; SSE3-NEXT: retq 725; 726; SSSE3-LABEL: var_shuffle_v16i8: 727; SSSE3: # %bb.0: 728; SSSE3-NEXT: pshufb %xmm1, %xmm0 729; SSSE3-NEXT: retq 730; 731; SSE41-LABEL: var_shuffle_v16i8: 732; SSE41: # %bb.0: 733; SSE41-NEXT: pshufb %xmm1, %xmm0 734; SSE41-NEXT: retq 735; 736; AVX-LABEL: var_shuffle_v16i8: 737; AVX: # %bb.0: 738; AVX-NEXT: vpshufb %xmm1, %xmm0, %xmm0 739; AVX-NEXT: retq 740 %index0 = extractelement <16 x i8> %indices, i32 0 741 %index1 = extractelement <16 x i8> %indices, i32 1 742 %index2 = extractelement <16 x i8> %indices, i32 2 743 %index3 = 
extractelement <16 x i8> %indices, i32 3 744 %index4 = extractelement <16 x i8> %indices, i32 4 745 %index5 = extractelement <16 x i8> %indices, i32 5 746 %index6 = extractelement <16 x i8> %indices, i32 6 747 %index7 = extractelement <16 x i8> %indices, i32 7 748 %index8 = extractelement <16 x i8> %indices, i32 8 749 %index9 = extractelement <16 x i8> %indices, i32 9 750 %index10 = extractelement <16 x i8> %indices, i32 10 751 %index11 = extractelement <16 x i8> %indices, i32 11 752 %index12 = extractelement <16 x i8> %indices, i32 12 753 %index13 = extractelement <16 x i8> %indices, i32 13 754 %index14 = extractelement <16 x i8> %indices, i32 14 755 %index15 = extractelement <16 x i8> %indices, i32 15 756 %v0 = extractelement <16 x i8> %v, i8 %index0 757 %v1 = extractelement <16 x i8> %v, i8 %index1 758 %v2 = extractelement <16 x i8> %v, i8 %index2 759 %v3 = extractelement <16 x i8> %v, i8 %index3 760 %v4 = extractelement <16 x i8> %v, i8 %index4 761 %v5 = extractelement <16 x i8> %v, i8 %index5 762 %v6 = extractelement <16 x i8> %v, i8 %index6 763 %v7 = extractelement <16 x i8> %v, i8 %index7 764 %v8 = extractelement <16 x i8> %v, i8 %index8 765 %v9 = extractelement <16 x i8> %v, i8 %index9 766 %v10 = extractelement <16 x i8> %v, i8 %index10 767 %v11 = extractelement <16 x i8> %v, i8 %index11 768 %v12 = extractelement <16 x i8> %v, i8 %index12 769 %v13 = extractelement <16 x i8> %v, i8 %index13 770 %v14 = extractelement <16 x i8> %v, i8 %index14 771 %v15 = extractelement <16 x i8> %v, i8 %index15 772 %ret0 = insertelement <16 x i8> undef, i8 %v0, i32 0 773 %ret1 = insertelement <16 x i8> %ret0, i8 %v1, i32 1 774 %ret2 = insertelement <16 x i8> %ret1, i8 %v2, i32 2 775 %ret3 = insertelement <16 x i8> %ret2, i8 %v3, i32 3 776 %ret4 = insertelement <16 x i8> %ret3, i8 %v4, i32 4 777 %ret5 = insertelement <16 x i8> %ret4, i8 %v5, i32 5 778 %ret6 = insertelement <16 x i8> %ret5, i8 %v6, i32 6 779 %ret7 = insertelement <16 x i8> %ret6, i8 %v7, i32 7 780 %ret8 = 
; Review note for @var_shuffle_zero_v16i8, whose define appears on the following line.
; The IR compares every index byte against 15 (unsigned), replaces each byte that is
; greater with -1, uses the resulting bytes as variable extractelement indices into %v,
; and finally selects zero for exactly the lanes whose original index was above 15.
; The generated assertions expect a byte-by-byte stack lookup (andl $15 per lane) on the
; SSE3 run and a single pshufb / vpshufb on the SSSE3, SSE4.1, XOP, AVX1 and AVX2 runs.
insertelement <16 x i8> %ret7, i8 %v8, i32 8 781 %ret9 = insertelement <16 x i8> %ret8, i8 %v9, i32 9 782 %ret10 = insertelement <16 x i8> %ret9, i8 %v10, i32 10 783 %ret11 = insertelement <16 x i8> %ret10, i8 %v11, i32 11 784 %ret12 = insertelement <16 x i8> %ret11, i8 %v12, i32 12 785 %ret13 = insertelement <16 x i8> %ret12, i8 %v13, i32 13 786 %ret14 = insertelement <16 x i8> %ret13, i8 %v14, i32 14 787 %ret15 = insertelement <16 x i8> %ret14, i8 %v15, i32 15 788 ret <16 x i8> %ret15 789} 790 791define <16 x i8> @var_shuffle_zero_v16i8(<16 x i8> %v, <16 x i8> %indices) nounwind { 792; SSE3-LABEL: var_shuffle_zero_v16i8: 793; SSE3: # %bb.0: 794; SSE3-NEXT: movaps %xmm0, %xmm2 795; SSE3-NEXT: movdqa {{.*#+}} xmm0 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] 796; SSE3-NEXT: pmaxub %xmm1, %xmm0 797; SSE3-NEXT: pcmpeqb %xmm1, %xmm0 798; SSE3-NEXT: por %xmm0, %xmm1 799; SSE3-NEXT: movdqa %xmm1, -40(%rsp) 800; SSE3-NEXT: movaps %xmm2, -24(%rsp) 801; SSE3-NEXT: movzbl -25(%rsp), %eax 802; SSE3-NEXT: andl $15, %eax 803; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax 804; SSE3-NEXT: movd %eax, %xmm1 805; SSE3-NEXT: movzbl -26(%rsp), %eax 806; SSE3-NEXT: andl $15, %eax 807; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax 808; SSE3-NEXT: movd %eax, %xmm2 809; SSE3-NEXT: movzbl -27(%rsp), %eax 810; SSE3-NEXT: andl $15, %eax 811; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax 812; SSE3-NEXT: movd %eax, %xmm4 813; SSE3-NEXT: movzbl -28(%rsp), %eax 814; SSE3-NEXT: andl $15, %eax 815; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax 816; SSE3-NEXT: movd %eax, %xmm3 817; SSE3-NEXT: movzbl -29(%rsp), %eax 818; SSE3-NEXT: andl $15, %eax 819; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax 820; SSE3-NEXT: movd %eax, %xmm6 821; SSE3-NEXT: movzbl -30(%rsp), %eax 822; SSE3-NEXT: andl $15, %eax 823; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax 824; SSE3-NEXT: movd %eax, %xmm7 825; SSE3-NEXT: movzbl -31(%rsp), %eax 826; SSE3-NEXT: andl $15, %eax 827; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax 828; SSE3-NEXT: movd %eax, %xmm8 829; 
SSE3-NEXT: movzbl -32(%rsp), %eax 830; SSE3-NEXT: andl $15, %eax 831; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax 832; SSE3-NEXT: movd %eax, %xmm5 833; SSE3-NEXT: movzbl -33(%rsp), %eax 834; SSE3-NEXT: andl $15, %eax 835; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax 836; SSE3-NEXT: movd %eax, %xmm9 837; SSE3-NEXT: movzbl -34(%rsp), %eax 838; SSE3-NEXT: andl $15, %eax 839; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax 840; SSE3-NEXT: movd %eax, %xmm10 841; SSE3-NEXT: movzbl -35(%rsp), %eax 842; SSE3-NEXT: andl $15, %eax 843; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax 844; SSE3-NEXT: movd %eax, %xmm12 845; SSE3-NEXT: movzbl -36(%rsp), %eax 846; SSE3-NEXT: andl $15, %eax 847; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax 848; SSE3-NEXT: movd %eax, %xmm11 849; SSE3-NEXT: movzbl -37(%rsp), %eax 850; SSE3-NEXT: andl $15, %eax 851; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax 852; SSE3-NEXT: movd %eax, %xmm13 853; SSE3-NEXT: movzbl -38(%rsp), %eax 854; SSE3-NEXT: andl $15, %eax 855; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax 856; SSE3-NEXT: movd %eax, %xmm14 857; SSE3-NEXT: movzbl -39(%rsp), %eax 858; SSE3-NEXT: andl $15, %eax 859; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax 860; SSE3-NEXT: movd %eax, %xmm15 861; SSE3-NEXT: movzbl -40(%rsp), %eax 862; SSE3-NEXT: andl $15, %eax 863; SSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] 864; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax 865; SSE3-NEXT: movd %eax, %xmm1 866; SSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] 867; SSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] 868; SSE3-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] 869; SSE3-NEXT: punpcklbw {{.*#+}} xmm5 = 
xmm5[0],xmm8[0],xmm5[1],xmm8[1],xmm5[2],xmm8[2],xmm5[3],xmm8[3],xmm5[4],xmm8[4],xmm5[5],xmm8[5],xmm5[6],xmm8[6],xmm5[7],xmm8[7] 870; SSE3-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3] 871; SSE3-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] 872; SSE3-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3],xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] 873; SSE3-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3],xmm11[4],xmm12[4],xmm11[5],xmm12[5],xmm11[6],xmm12[6],xmm11[7],xmm12[7] 874; SSE3-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] 875; SSE3-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3],xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] 876; SSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1],xmm1[2],xmm15[2],xmm1[3],xmm15[3],xmm1[4],xmm15[4],xmm1[5],xmm15[5],xmm1[6],xmm15[6],xmm1[7],xmm15[7] 877; SSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm14[0],xmm1[1],xmm14[1],xmm1[2],xmm14[2],xmm1[3],xmm14[3] 878; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm11[0],xmm1[1],xmm11[1] 879; SSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm5[0] 880; SSE3-NEXT: pandn %xmm1, %xmm0 881; SSE3-NEXT: retq 882; 883; SSSE3-LABEL: var_shuffle_zero_v16i8: 884; SSSE3: # %bb.0: 885; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] 886; SSSE3-NEXT: pmaxub %xmm1, %xmm2 887; SSSE3-NEXT: pcmpeqb %xmm1, %xmm2 888; SSSE3-NEXT: por %xmm1, %xmm2 889; SSSE3-NEXT: pshufb %xmm2, %xmm0 890; SSSE3-NEXT: retq 891; 892; SSE41-LABEL: var_shuffle_zero_v16i8: 893; SSE41: # %bb.0: 894; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] 895; SSE41-NEXT: pmaxub %xmm1, %xmm2 896; SSE41-NEXT: 
pcmpeqb %xmm1, %xmm2 897; SSE41-NEXT: por %xmm1, %xmm2 898; SSE41-NEXT: pshufb %xmm2, %xmm0 899; SSE41-NEXT: retq 900; 901; XOP-LABEL: var_shuffle_zero_v16i8: 902; XOP: # %bb.0: 903; XOP-NEXT: vpcomgtub {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2 904; XOP-NEXT: vpor %xmm1, %xmm2, %xmm1 905; XOP-NEXT: vpshufb %xmm1, %xmm0, %xmm0 906; XOP-NEXT: retq 907; 908; AVX1-LABEL: var_shuffle_zero_v16i8: 909; AVX1: # %bb.0: 910; AVX1-NEXT: vpmaxub {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2 911; AVX1-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm2 912; AVX1-NEXT: vpor %xmm1, %xmm2, %xmm1 913; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0 914; AVX1-NEXT: retq 915; 916; AVX2-LABEL: var_shuffle_zero_v16i8: 917; AVX2: # %bb.0: 918; AVX2-NEXT: vpmaxub {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2 919; AVX2-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm2 920; AVX2-NEXT: vpor %xmm1, %xmm2, %xmm1 921; AVX2-NEXT: vpshufb %xmm1, %xmm0, %xmm0 922; AVX2-NEXT: retq 923; 924; AVX512VL-LABEL: var_shuffle_zero_v16i8: 925; AVX512VL: # %bb.0: 926; AVX512VL-NEXT: vpcmpnleub {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %k1 927; AVX512VL-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 928; AVX512VL-NEXT: vmovdqu8 %xmm2, %xmm1 {%k1} 929; AVX512VL-NEXT: vpshufb %xmm1, %xmm0, %xmm0 930; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 931; AVX512VL-NEXT: vmovdqu8 %xmm1, %xmm0 {%k1} 932; AVX512VL-NEXT: retq 933 %cmp = icmp ugt <16 x i8> %indices, <i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15> 934 %or = select <16 x i1> %cmp, <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, <16 x i8> %indices 935 %idx0 = extractelement <16 x i8> %or, i64 0 936 %idx1 = extractelement <16 x i8> %or, i64 1 937 %idx2 = extractelement <16 x i8> %or, i64 2 938 %idx3 = extractelement <16 x i8> %or, i64 3 939 %idx4 = extractelement <16 x i8> %or, i64 4 940 %idx5 = extractelement <16 x i8> %or, i64 5 941 %idx6 = extractelement <16 x i8> %or, i64 6 
942 %idx7 = extractelement <16 x i8> %or, i64 7 943 %idx8 = extractelement <16 x i8> %or, i64 8 944 %idx9 = extractelement <16 x i8> %or, i64 9 945 %idxA = extractelement <16 x i8> %or, i64 10 946 %idxB = extractelement <16 x i8> %or, i64 11 947 %idxC = extractelement <16 x i8> %or, i64 12 948 %idxD = extractelement <16 x i8> %or, i64 13 949 %idxE = extractelement <16 x i8> %or, i64 14 950 %idxF = extractelement <16 x i8> %or, i64 15 951 %elt0 = extractelement <16 x i8> %v, i8 %idx0 952 %elt1 = extractelement <16 x i8> %v, i8 %idx1 953 %elt2 = extractelement <16 x i8> %v, i8 %idx2 954 %elt3 = extractelement <16 x i8> %v, i8 %idx3 955 %elt4 = extractelement <16 x i8> %v, i8 %idx4 956 %elt5 = extractelement <16 x i8> %v, i8 %idx5 957 %elt6 = extractelement <16 x i8> %v, i8 %idx6 958 %elt7 = extractelement <16 x i8> %v, i8 %idx7 959 %elt8 = extractelement <16 x i8> %v, i8 %idx8 960 %elt9 = extractelement <16 x i8> %v, i8 %idx9 961 %eltA = extractelement <16 x i8> %v, i8 %idxA 962 %eltB = extractelement <16 x i8> %v, i8 %idxB 963 %eltC = extractelement <16 x i8> %v, i8 %idxC 964 %eltD = extractelement <16 x i8> %v, i8 %idxD 965 %eltE = extractelement <16 x i8> %v, i8 %idxE 966 %eltF = extractelement <16 x i8> %v, i8 %idxF 967 %vec0 = insertelement <16 x i8> poison, i8 %elt0, i64 0 968 %vec1 = insertelement <16 x i8> %vec0, i8 %elt1, i64 1 969 %vec2 = insertelement <16 x i8> %vec1, i8 %elt2, i64 2 970 %vec3 = insertelement <16 x i8> %vec2, i8 %elt3, i64 3 971 %vec4 = insertelement <16 x i8> %vec3, i8 %elt4, i64 4 972 %vec5 = insertelement <16 x i8> %vec4, i8 %elt5, i64 5 973 %vec6 = insertelement <16 x i8> %vec5, i8 %elt6, i64 6 974 %vec7 = insertelement <16 x i8> %vec6, i8 %elt7, i64 7 975 %vec8 = insertelement <16 x i8> %vec7, i8 %elt8, i64 8 976 %vec9 = insertelement <16 x i8> %vec8, i8 %elt9, i64 9 977 %vecA = insertelement <16 x i8> %vec9, i8 %eltA, i64 10 978 %vecB = insertelement <16 x i8> %vecA, i8 %eltB, i64 11 979 %vecC = insertelement <16 x i8> %vecB, i8 
; Review note for @var_shuffle_v2f64, whose define appears on the following line.
; The IR reads the two i64 elements of %indices, uses each one as a variable
; extractelement index into the <2 x double> %v, and inserts the two doubles into
; lanes 0 and 1 of the result; no range check is applied to the indices in the IR.
; The generated assertions expect a stack spill with indexed movsd / movhps loads on
; the SSE3 and SSSE3 runs, a pcmpeqq + blendvpd sequence on the SSE4.1 run, and
; vpaddq followed by vpermilpd on every run that shares the AVX prefix.
%eltC, i64 12 980 %vecD = insertelement <16 x i8> %vecC, i8 %eltD, i64 13 981 %vecE = insertelement <16 x i8> %vecD, i8 %eltE, i64 14 982 %vecF = insertelement <16 x i8> %vecE, i8 %eltF, i64 15 983 %res = select <16 x i1> %cmp, <16 x i8> zeroinitializer, <16 x i8> %vecF 984 ret <16 x i8> %res 985} 986 987define <2 x double> @var_shuffle_v2f64(<2 x double> %v, <2 x i64> %indices) nounwind { 988; SSE3-LABEL: var_shuffle_v2f64: 989; SSE3: # %bb.0: 990; SSE3-NEXT: movq %xmm1, %rax 991; SSE3-NEXT: andl $1, %eax 992; SSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] 993; SSE3-NEXT: movq %xmm1, %rcx 994; SSE3-NEXT: andl $1, %ecx 995; SSE3-NEXT: movaps %xmm0, -24(%rsp) 996; SSE3-NEXT: movsd -24(%rsp,%rax,8), %xmm0 # xmm0 = mem[0],zero 997; SSE3-NEXT: movhps -24(%rsp,%rcx,8), %xmm0 # xmm0 = xmm0[0,1],mem[0,1] 998; SSE3-NEXT: retq 999; 1000; SSSE3-LABEL: var_shuffle_v2f64: 1001; SSSE3: # %bb.0: 1002; SSSE3-NEXT: movq %xmm1, %rax 1003; SSSE3-NEXT: andl $1, %eax 1004; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] 1005; SSSE3-NEXT: movq %xmm1, %rcx 1006; SSSE3-NEXT: andl $1, %ecx 1007; SSSE3-NEXT: movaps %xmm0, -24(%rsp) 1008; SSSE3-NEXT: movsd -24(%rsp,%rax,8), %xmm0 # xmm0 = mem[0],zero 1009; SSSE3-NEXT: movhps -24(%rsp,%rcx,8), %xmm0 # xmm0 = xmm0[0,1],mem[0,1] 1010; SSSE3-NEXT: retq 1011; 1012; SSE41-LABEL: var_shuffle_v2f64: 1013; SSE41: # %bb.0: 1014; SSE41-NEXT: movdqa %xmm0, %xmm2 1015; SSE41-NEXT: pxor %xmm0, %xmm0 1016; SSE41-NEXT: pcmpeqq %xmm1, %xmm0 1017; SSE41-NEXT: movddup {{.*#+}} xmm1 = xmm2[0,0] 1018; SSE41-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1,1] 1019; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2 1020; SSE41-NEXT: movapd %xmm2, %xmm0 1021; SSE41-NEXT: retq 1022; 1023; AVX-LABEL: var_shuffle_v2f64: 1024; AVX: # %bb.0: 1025; AVX-NEXT: vpaddq %xmm1, %xmm1, %xmm1 1026; AVX-NEXT: vpermilpd %xmm1, %xmm0, %xmm0 1027; AVX-NEXT: retq 1028 %index0 = extractelement <2 x i64> %indices, i32 0 1029 %index1 = extractelement <2 x i64> %indices, i32 1 1030 %v0 = extractelement 
; Review note for @var_shuffle_zero_v2f64, whose define appears on the following line.
; The IR compares each i64 index against 3 (unsigned), replaces every index that is
; greater with -1, uses the result as a variable extractelement index into the
; <2 x double> %v, and finally selects zero for the lanes whose original index was above 3.
; TODO confirm intent - %v has only two lanes, so indices 2 and 3 pass the compare
; while still lying outside the vector; a bound of 1 would match the lane count.
; The bound cannot be edited by hand, because the AVX512 assertions materialise the
; constant [3,3] and would have to be regenerated with update_llc_test_checks.py.
<2 x double> %v, i64 %index0 1031 %v1 = extractelement <2 x double> %v, i64 %index1 1032 %ret0 = insertelement <2 x double> undef, double %v0, i32 0 1033 %ret1 = insertelement <2 x double> %ret0, double %v1, i32 1 1034 ret <2 x double> %ret1 1035} 1036 1037define <2 x double> @var_shuffle_zero_v2f64(<2 x double> %v, <2 x i64> %indices) nounwind { 1038; SSE3-LABEL: var_shuffle_zero_v2f64: 1039; SSE3: # %bb.0: 1040; SSE3-NEXT: movdqa {{.*#+}} xmm2 = [9223372039002259456,9223372039002259456] 1041; SSE3-NEXT: pxor %xmm1, %xmm2 1042; SSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] 1043; SSE3-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 1044; SSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2] 1045; SSE3-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 1046; SSE3-NEXT: pand %xmm4, %xmm3 1047; SSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] 1048; SSE3-NEXT: por %xmm3, %xmm2 1049; SSE3-NEXT: por %xmm2, %xmm1 1050; SSE3-NEXT: movq %xmm1, %rax 1051; SSE3-NEXT: andl $1, %eax 1052; SSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] 1053; SSE3-NEXT: movq %xmm1, %rcx 1054; SSE3-NEXT: andl $1, %ecx 1055; SSE3-NEXT: movaps %xmm0, -24(%rsp) 1056; SSE3-NEXT: movsd -24(%rsp,%rax,8), %xmm0 # xmm0 = mem[0],zero 1057; SSE3-NEXT: movhps -24(%rsp,%rcx,8), %xmm0 # xmm0 = xmm0[0,1],mem[0,1] 1058; SSE3-NEXT: pandn %xmm0, %xmm2 1059; SSE3-NEXT: movdqa %xmm2, %xmm0 1060; SSE3-NEXT: retq 1061; 1062; SSSE3-LABEL: var_shuffle_zero_v2f64: 1063; SSSE3: # %bb.0: 1064; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [9223372039002259456,9223372039002259456] 1065; SSSE3-NEXT: pxor %xmm1, %xmm2 1066; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] 1067; SSSE3-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 1068; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2] 1069; SSSE3-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 1070; SSSE3-NEXT: pand %xmm4, %xmm3 1071; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] 1072; SSSE3-NEXT: por %xmm3, %xmm2 1073; SSSE3-NEXT: por %xmm2, %xmm1 1074; SSSE3-NEXT: movq 
%xmm1, %rax 1075; SSSE3-NEXT: andl $1, %eax 1076; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] 1077; SSSE3-NEXT: movq %xmm1, %rcx 1078; SSSE3-NEXT: andl $1, %ecx 1079; SSSE3-NEXT: movaps %xmm0, -24(%rsp) 1080; SSSE3-NEXT: movsd -24(%rsp,%rax,8), %xmm0 # xmm0 = mem[0],zero 1081; SSSE3-NEXT: movhps -24(%rsp,%rcx,8), %xmm0 # xmm0 = xmm0[0,1],mem[0,1] 1082; SSSE3-NEXT: pandn %xmm0, %xmm2 1083; SSSE3-NEXT: movdqa %xmm2, %xmm0 1084; SSSE3-NEXT: retq 1085; 1086; SSE41-LABEL: var_shuffle_zero_v2f64: 1087; SSE41: # %bb.0: 1088; SSE41-NEXT: movapd %xmm0, %xmm2 1089; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [9223372039002259456,9223372039002259456] 1090; SSE41-NEXT: pxor %xmm1, %xmm0 1091; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] 1092; SSE41-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 1093; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,2,2] 1094; SSE41-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 1095; SSE41-NEXT: pand %xmm3, %xmm4 1096; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] 1097; SSE41-NEXT: por %xmm4, %xmm3 1098; SSE41-NEXT: por %xmm3, %xmm1 1099; SSE41-NEXT: pxor %xmm0, %xmm0 1100; SSE41-NEXT: pcmpeqq %xmm1, %xmm0 1101; SSE41-NEXT: movddup {{.*#+}} xmm1 = xmm2[0,0] 1102; SSE41-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1,1] 1103; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2 1104; SSE41-NEXT: pandn %xmm2, %xmm3 1105; SSE41-NEXT: movdqa %xmm3, %xmm0 1106; SSE41-NEXT: retq 1107; 1108; XOP-LABEL: var_shuffle_zero_v2f64: 1109; XOP: # %bb.0: 1110; XOP-NEXT: vpcomgtuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2 1111; XOP-NEXT: vpor %xmm1, %xmm2, %xmm1 1112; XOP-NEXT: vpaddq %xmm1, %xmm1, %xmm1 1113; XOP-NEXT: vpermilpd %xmm1, %xmm0, %xmm0 1114; XOP-NEXT: vpandn %xmm0, %xmm2, %xmm0 1115; XOP-NEXT: retq 1116; 1117; AVX1-LABEL: var_shuffle_zero_v2f64: 1118; AVX1: # %bb.0: 1119; AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2 1120; AVX1-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 1121; AVX1-NEXT: vpor %xmm1, %xmm2, %xmm1 1122; 
AVX1-NEXT: vpaddq %xmm1, %xmm1, %xmm1 1123; AVX1-NEXT: vpermilpd %xmm1, %xmm0, %xmm0 1124; AVX1-NEXT: vpandn %xmm0, %xmm2, %xmm0 1125; AVX1-NEXT: retq 1126; 1127; AVX2-LABEL: var_shuffle_zero_v2f64: 1128; AVX2: # %bb.0: 1129; AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2 1130; AVX2-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 1131; AVX2-NEXT: vpor %xmm1, %xmm2, %xmm1 1132; AVX2-NEXT: vpaddq %xmm1, %xmm1, %xmm1 1133; AVX2-NEXT: vpermilpd %xmm1, %xmm0, %xmm0 1134; AVX2-NEXT: vpandn %xmm0, %xmm2, %xmm0 1135; AVX2-NEXT: retq 1136; 1137; AVX512-LABEL: var_shuffle_zero_v2f64: 1138; AVX512: # %bb.0: 1139; AVX512-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 1140; AVX512-NEXT: vpmovsxbq {{.*#+}} xmm2 = [3,3] 1141; AVX512-NEXT: vpcmpnleuq %zmm2, %zmm1, %k1 1142; AVX512-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 1143; AVX512-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1} 1144; AVX512-NEXT: vpaddq %xmm1, %xmm1, %xmm1 1145; AVX512-NEXT: vpermilpd %xmm1, %xmm0, %xmm0 1146; AVX512-NEXT: vxorpd %xmm1, %xmm1, %xmm1 1147; AVX512-NEXT: vmovapd %zmm1, %zmm0 {%k1} 1148; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 1149; AVX512-NEXT: vzeroupper 1150; AVX512-NEXT: retq 1151; 1152; AVX512VL-LABEL: var_shuffle_zero_v2f64: 1153; AVX512VL: # %bb.0: 1154; AVX512VL-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm1, %k1 1155; AVX512VL-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 1156; AVX512VL-NEXT: vmovdqa64 %xmm2, %xmm1 {%k1} 1157; AVX512VL-NEXT: vpaddq %xmm1, %xmm1, %xmm1 1158; AVX512VL-NEXT: vpermilpd %xmm1, %xmm0, %xmm0 1159; AVX512VL-NEXT: vxorpd %xmm1, %xmm1, %xmm1 1160; AVX512VL-NEXT: vmovapd %xmm1, %xmm0 {%k1} 1161; AVX512VL-NEXT: retq 1162 %cmp = icmp ugt <2 x i64> %indices, <i64 3, i64 3> 1163 %or = select <2 x i1> %cmp, <2 x i64> <i64 -1, i64 -1>, <2 x i64> %indices 1164 %idx0 = extractelement <2 x i64> %or, i64 0 1165 %idx1 = extractelement <2 x i64> %or, i64 1 1166 %elt0 = extractelement <2 x double> %v, i64 %idx0 1167 %elt1 = extractelement <2 x double> 
; Review note for @var_shuffle_v4f32, whose define appears on the following line.
; The IR reads the four i32 elements of %indices, uses each one as a variable
; extractelement index into the <4 x float> %v, and inserts the four floats into
; lanes 0..3 of the result; no range check is applied to the indices in the IR.
; The generated assertions expect a stack spill with four indexed movss loads
; (each index masked with andl $3) on the SSE3 run, a multiply by 67372036
; (0x04040404) plus a constant add feeding pshufb on the SSSE3 and SSE4.1 runs
; (presumably expanding each dword index into four byte indices - confirm against
; the X86 lowering), and a single vpermilps on every run that shares the AVX prefix.
%v, i64 %idx1 1168 %vec0 = insertelement <2 x double> poison, double %elt0, i64 0 1169 %vec1 = insertelement <2 x double> %vec0, double %elt1, i64 1 1170 %res = select <2 x i1> %cmp, <2 x double> zeroinitializer, <2 x double> %vec1 1171 ret <2 x double> %res 1172} 1173 1174define <4 x float> @var_shuffle_v4f32(<4 x float> %v, <4 x i32> %indices) nounwind { 1175; SSE3-LABEL: var_shuffle_v4f32: 1176; SSE3: # %bb.0: 1177; SSE3-NEXT: movd %xmm1, %eax 1178; SSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,1,1] 1179; SSE3-NEXT: movd %xmm2, %ecx 1180; SSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] 1181; SSE3-NEXT: movd %xmm2, %edx 1182; SSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,3,3,3] 1183; SSE3-NEXT: movd %xmm1, %esi 1184; SSE3-NEXT: movaps %xmm0, -24(%rsp) 1185; SSE3-NEXT: andl $3, %eax 1186; SSE3-NEXT: andl $3, %ecx 1187; SSE3-NEXT: andl $3, %edx 1188; SSE3-NEXT: andl $3, %esi 1189; SSE3-NEXT: movss -24(%rsp,%rsi,4), %xmm0 # xmm0 = mem[0],zero,zero,zero 1190; SSE3-NEXT: movss -24(%rsp,%rdx,4), %xmm1 # xmm1 = mem[0],zero,zero,zero 1191; SSE3-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 1192; SSE3-NEXT: movss -24(%rsp,%rax,4), %xmm0 # xmm0 = mem[0],zero,zero,zero 1193; SSE3-NEXT: movss -24(%rsp,%rcx,4), %xmm2 # xmm2 = mem[0],zero,zero,zero 1194; SSE3-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] 1195; SSE3-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] 1196; SSE3-NEXT: retq 1197; 1198; SSSE3-LABEL: var_shuffle_v4f32: 1199; SSSE3: # %bb.0: 1200; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [67372036,67372036,67372036,67372036] 1201; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3] 1202; SSSE3-NEXT: pmuludq %xmm2, %xmm1 1203; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 1204; SSSE3-NEXT: pmuludq %xmm2, %xmm3 1205; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3] 1206; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] 1207; SSSE3-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 1208; SSSE3-NEXT: pshufb %xmm1, %xmm0 1209; 
; Review note for @var_shuffle_zero_v4f32, whose define appears on the following line.
; The IR compares each i32 index against 3 (unsigned), replaces every index that is
; greater with -1, uses the result as a variable extractelement index into the
; <4 x float> %v, and finally selects zero for the lanes whose original index was above 3.
; The generated assertions expect the stack-lookup sequence followed by pandn on the
; SSE3 run, a pshufb whose control vector is OR-ed with the compare mask on the SSSE3
; and SSE4.1 runs, vpermilps followed by vpandn on the XOP, AVX1 and AVX2 runs, and
; vpermilps with k-register masked moves on the AVX512 and AVX512VL runs.
SSSE3-NEXT: retq 1210; 1211; SSE41-LABEL: var_shuffle_v4f32: 1212; SSE41: # %bb.0: 1213; SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 1214; SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 1215; SSE41-NEXT: pshufb %xmm1, %xmm0 1216; SSE41-NEXT: retq 1217; 1218; AVX-LABEL: var_shuffle_v4f32: 1219; AVX: # %bb.0: 1220; AVX-NEXT: vpermilps %xmm1, %xmm0, %xmm0 1221; AVX-NEXT: retq 1222 %index0 = extractelement <4 x i32> %indices, i32 0 1223 %index1 = extractelement <4 x i32> %indices, i32 1 1224 %index2 = extractelement <4 x i32> %indices, i32 2 1225 %index3 = extractelement <4 x i32> %indices, i32 3 1226 %v0 = extractelement <4 x float> %v, i32 %index0 1227 %v1 = extractelement <4 x float> %v, i32 %index1 1228 %v2 = extractelement <4 x float> %v, i32 %index2 1229 %v3 = extractelement <4 x float> %v, i32 %index3 1230 %ret0 = insertelement <4 x float> undef, float %v0, i32 0 1231 %ret1 = insertelement <4 x float> %ret0, float %v1, i32 1 1232 %ret2 = insertelement <4 x float> %ret1, float %v2, i32 2 1233 %ret3 = insertelement <4 x float> %ret2, float %v3, i32 3 1234 ret <4 x float> %ret3 1235} 1236 1237define <4 x float> @var_shuffle_zero_v4f32(<4 x float> %v, <4 x i32> %indices) nounwind { 1238; SSE3-LABEL: var_shuffle_zero_v4f32: 1239; SSE3: # %bb.0: 1240; SSE3-NEXT: movaps %xmm0, %xmm2 1241; SSE3-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,2147483648,2147483648,2147483648] 1242; SSE3-NEXT: pxor %xmm1, %xmm0 1243; SSE3-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 1244; SSE3-NEXT: por %xmm0, %xmm1 1245; SSE3-NEXT: movd %xmm1, %eax 1246; SSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,1,1] 1247; SSE3-NEXT: movd %xmm3, %ecx 1248; SSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3] 1249; SSE3-NEXT: movd %xmm3, %edx 1250; SSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,3,3,3] 1251; SSE3-NEXT: movd %xmm1, %esi 1252; SSE3-NEXT: movaps %xmm2, -24(%rsp) 1253; SSE3-NEXT: andl $3, %eax 1254; SSE3-NEXT: andl $3, %ecx 1255; SSE3-NEXT: andl $3, %edx 1256; SSE3-NEXT: andl $3, 
%esi 1257; SSE3-NEXT: movd -24(%rsp,%rsi,4), %xmm1 # xmm1 = mem[0],zero,zero,zero 1258; SSE3-NEXT: movd -24(%rsp,%rdx,4), %xmm2 # xmm2 = mem[0],zero,zero,zero 1259; SSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] 1260; SSE3-NEXT: movd -24(%rsp,%rax,4), %xmm1 # xmm1 = mem[0],zero,zero,zero 1261; SSE3-NEXT: movd -24(%rsp,%rcx,4), %xmm3 # xmm3 = mem[0],zero,zero,zero 1262; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] 1263; SSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] 1264; SSE3-NEXT: pandn %xmm1, %xmm0 1265; SSE3-NEXT: retq 1266; 1267; SSSE3-LABEL: var_shuffle_zero_v4f32: 1268; SSSE3: # %bb.0: 1269; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] 1270; SSSE3-NEXT: pxor %xmm1, %xmm2 1271; SSSE3-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 1272; SSSE3-NEXT: por %xmm2, %xmm1 1273; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [67372036,67372036,67372036,67372036] 1274; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3] 1275; SSSE3-NEXT: pmuludq %xmm3, %xmm1 1276; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 1277; SSSE3-NEXT: pmuludq %xmm3, %xmm4 1278; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,2,2,3] 1279; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] 1280; SSSE3-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 1281; SSSE3-NEXT: por %xmm2, %xmm1 1282; SSSE3-NEXT: pshufb %xmm1, %xmm0 1283; SSSE3-NEXT: retq 1284; 1285; SSE41-LABEL: var_shuffle_zero_v4f32: 1286; SSE41: # %bb.0: 1287; SSE41-NEXT: pmovsxbd {{.*#+}} xmm2 = [4,4,4,4] 1288; SSE41-NEXT: pmaxud %xmm1, %xmm2 1289; SSE41-NEXT: pcmpeqd %xmm1, %xmm2 1290; SSE41-NEXT: por %xmm2, %xmm1 1291; SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 1292; SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 1293; SSE41-NEXT: por %xmm2, %xmm1 1294; SSE41-NEXT: pshufb %xmm1, %xmm0 1295; SSE41-NEXT: retq 1296; 1297; XOP-LABEL: var_shuffle_zero_v4f32: 1298; XOP: # %bb.0: 1299; XOP-NEXT: vpcomgtud 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2 1300; XOP-NEXT: vpor %xmm1, %xmm2, %xmm1 1301; XOP-NEXT: vpermilps %xmm1, %xmm0, %xmm0 1302; XOP-NEXT: vpandn %xmm0, %xmm2, %xmm0 1303; XOP-NEXT: retq 1304; 1305; AVX1-LABEL: var_shuffle_zero_v4f32: 1306; AVX1: # %bb.0: 1307; AVX1-NEXT: vpmaxud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2 1308; AVX1-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm2 1309; AVX1-NEXT: vpor %xmm1, %xmm2, %xmm1 1310; AVX1-NEXT: vpermilps %xmm1, %xmm0, %xmm0 1311; AVX1-NEXT: vpandn %xmm0, %xmm2, %xmm0 1312; AVX1-NEXT: retq 1313; 1314; AVX2-LABEL: var_shuffle_zero_v4f32: 1315; AVX2: # %bb.0: 1316; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [4,4,4,4] 1317; AVX2-NEXT: vpmaxud %xmm2, %xmm1, %xmm2 1318; AVX2-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm2 1319; AVX2-NEXT: vpor %xmm1, %xmm2, %xmm1 1320; AVX2-NEXT: vpermilps %xmm1, %xmm0, %xmm0 1321; AVX2-NEXT: vpandn %xmm0, %xmm2, %xmm0 1322; AVX2-NEXT: retq 1323; 1324; AVX512-LABEL: var_shuffle_zero_v4f32: 1325; AVX512: # %bb.0: 1326; AVX512-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 1327; AVX512-NEXT: vpcmpnleud {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %k1 1328; AVX512-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 1329; AVX512-NEXT: vmovdqa32 %zmm2, %zmm1 {%k1} 1330; AVX512-NEXT: vpermilps %xmm1, %xmm0, %xmm0 1331; AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1 1332; AVX512-NEXT: vmovaps %zmm1, %zmm0 {%k1} 1333; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 1334; AVX512-NEXT: vzeroupper 1335; AVX512-NEXT: retq 1336; 1337; AVX512VL-LABEL: var_shuffle_zero_v4f32: 1338; AVX512VL: # %bb.0: 1339; AVX512VL-NEXT: vpcmpnleud {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %k1 1340; AVX512VL-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 1341; AVX512VL-NEXT: vmovdqa32 %xmm2, %xmm1 {%k1} 1342; AVX512VL-NEXT: vpermilps %xmm1, %xmm0, %xmm0 1343; AVX512VL-NEXT: vxorps %xmm1, %xmm1, %xmm1 1344; AVX512VL-NEXT: vmovaps %xmm1, %xmm0 {%k1} 1345; AVX512VL-NEXT: retq 1346 %cmp = icmp ugt <4 x i32> %indices, <i32 3, i32 3, i32 3, i32 3> 1347 %or = select <4 x 
; Review note for @var_shuffle_v16i8_from_v16i8_v32i8, whose define appears on the
; following line.
; The function takes a <16 x i8> data vector %v and a <32 x i8> index vector, but the
; IR only extracts elements 0..15 of %indices; each of those bytes is used as a variable
; extractelement index into %v and the results are inserted into lanes 0..15.
; The generated assertions expect the same byte-by-byte stack lookup as the plain
; v16i8 case on the SSE3 run, a single pshufb on the SSSE3 and SSE4.1 runs, and
; vpshufb followed by vzeroupper on every run that shares the AVX prefix.
i1> %cmp, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, <4 x i32> %indices 1348 %idx0 = extractelement <4 x i32> %or, i64 0 1349 %idx1 = extractelement <4 x i32> %or, i64 1 1350 %idx2 = extractelement <4 x i32> %or, i64 2 1351 %idx3 = extractelement <4 x i32> %or, i64 3 1352 %elt0 = extractelement <4 x float> %v, i32 %idx0 1353 %elt1 = extractelement <4 x float> %v, i32 %idx1 1354 %elt2 = extractelement <4 x float> %v, i32 %idx2 1355 %elt3 = extractelement <4 x float> %v, i32 %idx3 1356 %vec0 = insertelement <4 x float> poison, float %elt0, i64 0 1357 %vec1 = insertelement <4 x float> %vec0, float %elt1, i64 1 1358 %vec2 = insertelement <4 x float> %vec1, float %elt2, i64 2 1359 %vec3 = insertelement <4 x float> %vec2, float %elt3, i64 3 1360 %res = select <4 x i1> %cmp, <4 x float> zeroinitializer, <4 x float> %vec3 1361 ret <4 x float> %res 1362} 1363 1364define <16 x i8> @var_shuffle_v16i8_from_v16i8_v32i8(<16 x i8> %v, <32 x i8> %indices) nounwind { 1365; SSE3-LABEL: var_shuffle_v16i8_from_v16i8_v32i8: 1366; SSE3: # %bb.0: 1367; SSE3-NEXT: movaps %xmm1, -40(%rsp) 1368; SSE3-NEXT: movaps %xmm0, -24(%rsp) 1369; SSE3-NEXT: movzbl -25(%rsp), %eax 1370; SSE3-NEXT: andl $15, %eax 1371; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax 1372; SSE3-NEXT: movd %eax, %xmm1 1373; SSE3-NEXT: movzbl -26(%rsp), %eax 1374; SSE3-NEXT: andl $15, %eax 1375; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax 1376; SSE3-NEXT: movd %eax, %xmm2 1377; SSE3-NEXT: movzbl -27(%rsp), %eax 1378; SSE3-NEXT: andl $15, %eax 1379; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax 1380; SSE3-NEXT: movd %eax, %xmm4 1381; SSE3-NEXT: movzbl -28(%rsp), %eax 1382; SSE3-NEXT: andl $15, %eax 1383; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax 1384; SSE3-NEXT: movd %eax, %xmm3 1385; SSE3-NEXT: movzbl -29(%rsp), %eax 1386; SSE3-NEXT: andl $15, %eax 1387; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax 1388; SSE3-NEXT: movd %eax, %xmm6 1389; SSE3-NEXT: movzbl -30(%rsp), %eax 1390; SSE3-NEXT: andl $15, %eax 1391; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax 
1392; SSE3-NEXT: movd %eax, %xmm7 1393; SSE3-NEXT: movzbl -31(%rsp), %eax 1394; SSE3-NEXT: andl $15, %eax 1395; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax 1396; SSE3-NEXT: movd %eax, %xmm8 1397; SSE3-NEXT: movzbl -32(%rsp), %eax 1398; SSE3-NEXT: andl $15, %eax 1399; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax 1400; SSE3-NEXT: movd %eax, %xmm5 1401; SSE3-NEXT: movzbl -33(%rsp), %eax 1402; SSE3-NEXT: andl $15, %eax 1403; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax 1404; SSE3-NEXT: movd %eax, %xmm9 1405; SSE3-NEXT: movzbl -34(%rsp), %eax 1406; SSE3-NEXT: andl $15, %eax 1407; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax 1408; SSE3-NEXT: movd %eax, %xmm10 1409; SSE3-NEXT: movzbl -35(%rsp), %eax 1410; SSE3-NEXT: andl $15, %eax 1411; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax 1412; SSE3-NEXT: movd %eax, %xmm12 1413; SSE3-NEXT: movzbl -36(%rsp), %eax 1414; SSE3-NEXT: andl $15, %eax 1415; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax 1416; SSE3-NEXT: movd %eax, %xmm11 1417; SSE3-NEXT: movzbl -37(%rsp), %eax 1418; SSE3-NEXT: andl $15, %eax 1419; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax 1420; SSE3-NEXT: movd %eax, %xmm13 1421; SSE3-NEXT: movzbl -38(%rsp), %eax 1422; SSE3-NEXT: andl $15, %eax 1423; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax 1424; SSE3-NEXT: movd %eax, %xmm14 1425; SSE3-NEXT: movzbl -39(%rsp), %eax 1426; SSE3-NEXT: andl $15, %eax 1427; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax 1428; SSE3-NEXT: movd %eax, %xmm15 1429; SSE3-NEXT: movzbl -40(%rsp), %eax 1430; SSE3-NEXT: andl $15, %eax 1431; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax 1432; SSE3-NEXT: movd %eax, %xmm0 1433; SSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] 1434; SSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] 1435; SSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] 
1436; SSE3-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] 1437; SSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm8[0],xmm5[1],xmm8[1],xmm5[2],xmm8[2],xmm5[3],xmm8[3],xmm5[4],xmm8[4],xmm5[5],xmm8[5],xmm5[6],xmm8[6],xmm5[7],xmm8[7] 1438; SSE3-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3] 1439; SSE3-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] 1440; SSE3-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3],xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] 1441; SSE3-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3],xmm11[4],xmm12[4],xmm11[5],xmm12[5],xmm11[6],xmm12[6],xmm11[7],xmm12[7] 1442; SSE3-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] 1443; SSE3-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3],xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] 1444; SSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3],xmm0[4],xmm15[4],xmm0[5],xmm15[5],xmm0[6],xmm15[6],xmm0[7],xmm15[7] 1445; SSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3] 1446; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1] 1447; SSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm5[0] 1448; SSE3-NEXT: retq 1449; 1450; SSSE3-LABEL: var_shuffle_v16i8_from_v16i8_v32i8: 1451; SSSE3: # %bb.0: 1452; SSSE3-NEXT: pshufb %xmm1, %xmm0 1453; SSSE3-NEXT: retq 1454; 1455; SSE41-LABEL: var_shuffle_v16i8_from_v16i8_v32i8: 1456; SSE41: # %bb.0: 1457; SSE41-NEXT: pshufb %xmm1, %xmm0 1458; SSE41-NEXT: retq 1459; 1460; AVX-LABEL: var_shuffle_v16i8_from_v16i8_v32i8: 1461; 
AVX: # %bb.0: 1462; AVX-NEXT: vpshufb %xmm1, %xmm0, %xmm0 1463; AVX-NEXT: vzeroupper 1464; AVX-NEXT: retq 1465 %index0 = extractelement <32 x i8> %indices, i32 0 1466 %index1 = extractelement <32 x i8> %indices, i32 1 1467 %index2 = extractelement <32 x i8> %indices, i32 2 1468 %index3 = extractelement <32 x i8> %indices, i32 3 1469 %index4 = extractelement <32 x i8> %indices, i32 4 1470 %index5 = extractelement <32 x i8> %indices, i32 5 1471 %index6 = extractelement <32 x i8> %indices, i32 6 1472 %index7 = extractelement <32 x i8> %indices, i32 7 1473 %index8 = extractelement <32 x i8> %indices, i32 8 1474 %index9 = extractelement <32 x i8> %indices, i32 9 1475 %index10 = extractelement <32 x i8> %indices, i32 10 1476 %index11 = extractelement <32 x i8> %indices, i32 11 1477 %index12 = extractelement <32 x i8> %indices, i32 12 1478 %index13 = extractelement <32 x i8> %indices, i32 13 1479 %index14 = extractelement <32 x i8> %indices, i32 14 1480 %index15 = extractelement <32 x i8> %indices, i32 15 1481 %v0 = extractelement <16 x i8> %v, i8 %index0 1482 %v1 = extractelement <16 x i8> %v, i8 %index1 1483 %v2 = extractelement <16 x i8> %v, i8 %index2 1484 %v3 = extractelement <16 x i8> %v, i8 %index3 1485 %v4 = extractelement <16 x i8> %v, i8 %index4 1486 %v5 = extractelement <16 x i8> %v, i8 %index5 1487 %v6 = extractelement <16 x i8> %v, i8 %index6 1488 %v7 = extractelement <16 x i8> %v, i8 %index7 1489 %v8 = extractelement <16 x i8> %v, i8 %index8 1490 %v9 = extractelement <16 x i8> %v, i8 %index9 1491 %v10 = extractelement <16 x i8> %v, i8 %index10 1492 %v11 = extractelement <16 x i8> %v, i8 %index11 1493 %v12 = extractelement <16 x i8> %v, i8 %index12 1494 %v13 = extractelement <16 x i8> %v, i8 %index13 1495 %v14 = extractelement <16 x i8> %v, i8 %index14 1496 %v15 = extractelement <16 x i8> %v, i8 %index15 1497 %ret0 = insertelement <16 x i8> undef, i8 %v0, i32 0 1498 %ret1 = insertelement <16 x i8> %ret0, i8 %v1, i32 1 1499 %ret2 = insertelement <16 x i8> 
%ret1, i8 %v2, i32 2 1500 %ret3 = insertelement <16 x i8> %ret2, i8 %v3, i32 3 1501 %ret4 = insertelement <16 x i8> %ret3, i8 %v4, i32 4 1502 %ret5 = insertelement <16 x i8> %ret4, i8 %v5, i32 5 1503 %ret6 = insertelement <16 x i8> %ret5, i8 %v6, i32 6 1504 %ret7 = insertelement <16 x i8> %ret6, i8 %v7, i32 7 1505 %ret8 = insertelement <16 x i8> %ret7, i8 %v8, i32 8 1506 %ret9 = insertelement <16 x i8> %ret8, i8 %v9, i32 9 1507 %ret10 = insertelement <16 x i8> %ret9, i8 %v10, i32 10 1508 %ret11 = insertelement <16 x i8> %ret10, i8 %v11, i32 11 1509 %ret12 = insertelement <16 x i8> %ret11, i8 %v12, i32 12 1510 %ret13 = insertelement <16 x i8> %ret12, i8 %v13, i32 13 1511 %ret14 = insertelement <16 x i8> %ret13, i8 %v14, i32 14 1512 %ret15 = insertelement <16 x i8> %ret14, i8 %v15, i32 15 1513 ret <16 x i8> %ret15 1514} 1515
; Variable byte shuffle whose source vector is wider than its result.
; Each of the 16 i8 lanes of %indices is read with a constant-index
; extractelement and then used as the variable index of an extractelement
; from the 32-byte %v; the 16 selected bytes are assembled, lane by lane,
; into the <16 x i8> return value with insertelement (starting from undef).
; In the expected output below, the scalarized lowerings mask every index
; with 31 before the stack lookup, while the run lines with vector byte
; permutes available fold the whole pattern into shuffle instructions.
; The assertions that follow are autogenerated by
; utils/update_llc_test_checks.py (see the NOTE on the first line of the
; file): regenerate them with that script instead of editing them by hand.
1516define <16 x i8> @var_shuffle_v16i8_from_v32i8_v16i8(<32 x i8> %v, <16 x i8> %indices) nounwind { 1517; SSE3-LABEL: var_shuffle_v16i8_from_v32i8_v16i8: 1518; SSE3: # %bb.0: 1519; SSE3-NEXT: pushq %rbp 1520; SSE3-NEXT: pushq %r15 1521; SSE3-NEXT: pushq %r14 1522; SSE3-NEXT: pushq %r13 1523; SSE3-NEXT: pushq %r12 1524; SSE3-NEXT: pushq %rbx 1525; SSE3-NEXT: subq $424, %rsp # imm = 0x1A8 1526; SSE3-NEXT: movaps %xmm2, -128(%rsp) 1527; SSE3-NEXT: movaps %xmm1, 400(%rsp) 1528; SSE3-NEXT: movaps %xmm0, 384(%rsp) 1529; SSE3-NEXT: movzbl -128(%rsp), %eax 1530; SSE3-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill 1531; SSE3-NEXT: movaps %xmm1, 368(%rsp) 1532; SSE3-NEXT: movaps %xmm0, 352(%rsp) 1533; SSE3-NEXT: movzbl -127(%rsp), %ecx 1534; SSE3-NEXT: movaps %xmm1, 336(%rsp) 1535; SSE3-NEXT: movaps %xmm0, 320(%rsp) 1536; SSE3-NEXT: movzbl -126(%rsp), %edx 1537; SSE3-NEXT: movaps %xmm1, 304(%rsp) 1538; SSE3-NEXT: movaps %xmm0, 288(%rsp) 1539; SSE3-NEXT: movzbl -125(%rsp), %esi 1540; SSE3-NEXT: movaps %xmm1, 272(%rsp) 1541; SSE3-NEXT: movaps %xmm0, 256(%rsp) 1542; SSE3-NEXT: movzbl -124(%rsp), %edi 1543; SSE3-NEXT: movaps %xmm1, 240(%rsp) 1544; 
SSE3-NEXT: movaps %xmm0, 224(%rsp) 1545; SSE3-NEXT: movzbl -123(%rsp), %r8d 1546; SSE3-NEXT: movaps %xmm1, 208(%rsp) 1547; SSE3-NEXT: movaps %xmm0, 192(%rsp) 1548; SSE3-NEXT: movzbl -122(%rsp), %r9d 1549; SSE3-NEXT: movaps %xmm1, 176(%rsp) 1550; SSE3-NEXT: movaps %xmm0, 160(%rsp) 1551; SSE3-NEXT: movzbl -121(%rsp), %r10d 1552; SSE3-NEXT: movaps %xmm1, 144(%rsp) 1553; SSE3-NEXT: movaps %xmm0, 128(%rsp) 1554; SSE3-NEXT: movzbl -120(%rsp), %r11d 1555; SSE3-NEXT: movaps %xmm1, 112(%rsp) 1556; SSE3-NEXT: movaps %xmm0, 96(%rsp) 1557; SSE3-NEXT: movzbl -119(%rsp), %ebx 1558; SSE3-NEXT: movaps %xmm1, 80(%rsp) 1559; SSE3-NEXT: movaps %xmm0, 64(%rsp) 1560; SSE3-NEXT: movzbl -118(%rsp), %r14d 1561; SSE3-NEXT: movaps %xmm1, 48(%rsp) 1562; SSE3-NEXT: movaps %xmm0, 32(%rsp) 1563; SSE3-NEXT: movzbl -117(%rsp), %r15d 1564; SSE3-NEXT: movaps %xmm1, 16(%rsp) 1565; SSE3-NEXT: movaps %xmm0, (%rsp) 1566; SSE3-NEXT: movzbl -116(%rsp), %r12d 1567; SSE3-NEXT: movaps %xmm1, -16(%rsp) 1568; SSE3-NEXT: movaps %xmm0, -32(%rsp) 1569; SSE3-NEXT: movzbl -115(%rsp), %r13d 1570; SSE3-NEXT: movaps %xmm1, -48(%rsp) 1571; SSE3-NEXT: movaps %xmm0, -64(%rsp) 1572; SSE3-NEXT: movzbl -114(%rsp), %ebp 1573; SSE3-NEXT: movaps %xmm1, -80(%rsp) 1574; SSE3-NEXT: movaps %xmm0, -96(%rsp) 1575; SSE3-NEXT: movzbl -113(%rsp), %eax 1576; SSE3-NEXT: andl $31, %eax 1577; SSE3-NEXT: movzbl -96(%rsp,%rax), %eax 1578; SSE3-NEXT: movd %eax, %xmm1 1579; SSE3-NEXT: andl $31, %ebp 1580; SSE3-NEXT: movzbl -64(%rsp,%rbp), %eax 1581; SSE3-NEXT: movd %eax, %xmm2 1582; SSE3-NEXT: andl $31, %r13d 1583; SSE3-NEXT: movzbl -32(%rsp,%r13), %eax 1584; SSE3-NEXT: movd %eax, %xmm4 1585; SSE3-NEXT: andl $31, %r12d 1586; SSE3-NEXT: movzbl (%rsp,%r12), %eax 1587; SSE3-NEXT: movd %eax, %xmm3 1588; SSE3-NEXT: andl $31, %r15d 1589; SSE3-NEXT: movzbl 32(%rsp,%r15), %eax 1590; SSE3-NEXT: movd %eax, %xmm6 1591; SSE3-NEXT: andl $31, %r14d 1592; SSE3-NEXT: movzbl 64(%rsp,%r14), %eax 1593; SSE3-NEXT: movd %eax, %xmm7 1594; SSE3-NEXT: andl $31, %ebx 
1595; SSE3-NEXT: movzbl 96(%rsp,%rbx), %eax 1596; SSE3-NEXT: movd %eax, %xmm8 1597; SSE3-NEXT: andl $31, %r11d 1598; SSE3-NEXT: movzbl 128(%rsp,%r11), %eax 1599; SSE3-NEXT: movd %eax, %xmm5 1600; SSE3-NEXT: andl $31, %r10d 1601; SSE3-NEXT: movzbl 160(%rsp,%r10), %eax 1602; SSE3-NEXT: movd %eax, %xmm9 1603; SSE3-NEXT: andl $31, %r9d 1604; SSE3-NEXT: movzbl 192(%rsp,%r9), %eax 1605; SSE3-NEXT: movd %eax, %xmm10 1606; SSE3-NEXT: andl $31, %r8d 1607; SSE3-NEXT: movzbl 224(%rsp,%r8), %eax 1608; SSE3-NEXT: movd %eax, %xmm12 1609; SSE3-NEXT: andl $31, %edi 1610; SSE3-NEXT: movzbl 256(%rsp,%rdi), %eax 1611; SSE3-NEXT: movd %eax, %xmm11 1612; SSE3-NEXT: andl $31, %esi 1613; SSE3-NEXT: movzbl 288(%rsp,%rsi), %eax 1614; SSE3-NEXT: movd %eax, %xmm13 1615; SSE3-NEXT: andl $31, %edx 1616; SSE3-NEXT: movzbl 320(%rsp,%rdx), %eax 1617; SSE3-NEXT: movd %eax, %xmm14 1618; SSE3-NEXT: andl $31, %ecx 1619; SSE3-NEXT: movzbl 352(%rsp,%rcx), %eax 1620; SSE3-NEXT: movd %eax, %xmm15 1621; SSE3-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload 1622; SSE3-NEXT: andl $31, %eax 1623; SSE3-NEXT: movzbl 384(%rsp,%rax), %eax 1624; SSE3-NEXT: movd %eax, %xmm0 1625; SSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] 1626; SSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] 1627; SSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] 1628; SSE3-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] 1629; SSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm8[0],xmm5[1],xmm8[1],xmm5[2],xmm8[2],xmm5[3],xmm8[3],xmm5[4],xmm8[4],xmm5[5],xmm8[5],xmm5[6],xmm8[6],xmm5[7],xmm8[7] 1630; SSE3-NEXT: punpcklwd {{.*#+}} xmm5 = 
xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3] 1631; SSE3-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] 1632; SSE3-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3],xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] 1633; SSE3-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3],xmm11[4],xmm12[4],xmm11[5],xmm12[5],xmm11[6],xmm12[6],xmm11[7],xmm12[7] 1634; SSE3-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] 1635; SSE3-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3],xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] 1636; SSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3],xmm0[4],xmm15[4],xmm0[5],xmm15[5],xmm0[6],xmm15[6],xmm0[7],xmm15[7] 1637; SSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3] 1638; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1] 1639; SSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm5[0] 1640; SSE3-NEXT: addq $424, %rsp # imm = 0x1A8 1641; SSE3-NEXT: popq %rbx 1642; SSE3-NEXT: popq %r12 1643; SSE3-NEXT: popq %r13 1644; SSE3-NEXT: popq %r14 1645; SSE3-NEXT: popq %r15 1646; SSE3-NEXT: popq %rbp 1647; SSE3-NEXT: retq 1648; 1649; SSSE3-LABEL: var_shuffle_v16i8_from_v32i8_v16i8: 1650; SSSE3: # %bb.0: 1651; SSSE3-NEXT: pushq %rbp 1652; SSSE3-NEXT: pushq %r15 1653; SSSE3-NEXT: pushq %r14 1654; SSSE3-NEXT: pushq %r13 1655; SSSE3-NEXT: pushq %r12 1656; SSSE3-NEXT: pushq %rbx 1657; SSSE3-NEXT: subq $424, %rsp # imm = 0x1A8 1658; SSSE3-NEXT: movaps %xmm2, -128(%rsp) 1659; SSSE3-NEXT: movaps %xmm1, 400(%rsp) 1660; SSSE3-NEXT: movaps %xmm0, 384(%rsp) 1661; SSSE3-NEXT: movzbl -128(%rsp), %eax 1662; SSSE3-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 
8-byte Spill 1663; SSSE3-NEXT: movaps %xmm1, 368(%rsp) 1664; SSSE3-NEXT: movaps %xmm0, 352(%rsp) 1665; SSSE3-NEXT: movzbl -127(%rsp), %ecx 1666; SSSE3-NEXT: movaps %xmm1, 336(%rsp) 1667; SSSE3-NEXT: movaps %xmm0, 320(%rsp) 1668; SSSE3-NEXT: movzbl -126(%rsp), %edx 1669; SSSE3-NEXT: movaps %xmm1, 304(%rsp) 1670; SSSE3-NEXT: movaps %xmm0, 288(%rsp) 1671; SSSE3-NEXT: movzbl -125(%rsp), %esi 1672; SSSE3-NEXT: movaps %xmm1, 272(%rsp) 1673; SSSE3-NEXT: movaps %xmm0, 256(%rsp) 1674; SSSE3-NEXT: movzbl -124(%rsp), %edi 1675; SSSE3-NEXT: movaps %xmm1, 240(%rsp) 1676; SSSE3-NEXT: movaps %xmm0, 224(%rsp) 1677; SSSE3-NEXT: movzbl -123(%rsp), %r8d 1678; SSSE3-NEXT: movaps %xmm1, 208(%rsp) 1679; SSSE3-NEXT: movaps %xmm0, 192(%rsp) 1680; SSSE3-NEXT: movzbl -122(%rsp), %r9d 1681; SSSE3-NEXT: movaps %xmm1, 176(%rsp) 1682; SSSE3-NEXT: movaps %xmm0, 160(%rsp) 1683; SSSE3-NEXT: movzbl -121(%rsp), %r10d 1684; SSSE3-NEXT: movaps %xmm1, 144(%rsp) 1685; SSSE3-NEXT: movaps %xmm0, 128(%rsp) 1686; SSSE3-NEXT: movzbl -120(%rsp), %r11d 1687; SSSE3-NEXT: movaps %xmm1, 112(%rsp) 1688; SSSE3-NEXT: movaps %xmm0, 96(%rsp) 1689; SSSE3-NEXT: movzbl -119(%rsp), %ebx 1690; SSSE3-NEXT: movaps %xmm1, 80(%rsp) 1691; SSSE3-NEXT: movaps %xmm0, 64(%rsp) 1692; SSSE3-NEXT: movzbl -118(%rsp), %r14d 1693; SSSE3-NEXT: movaps %xmm1, 48(%rsp) 1694; SSSE3-NEXT: movaps %xmm0, 32(%rsp) 1695; SSSE3-NEXT: movzbl -117(%rsp), %r15d 1696; SSSE3-NEXT: movaps %xmm1, 16(%rsp) 1697; SSSE3-NEXT: movaps %xmm0, (%rsp) 1698; SSSE3-NEXT: movzbl -116(%rsp), %r12d 1699; SSSE3-NEXT: movaps %xmm1, -16(%rsp) 1700; SSSE3-NEXT: movaps %xmm0, -32(%rsp) 1701; SSSE3-NEXT: movzbl -115(%rsp), %r13d 1702; SSSE3-NEXT: movaps %xmm1, -48(%rsp) 1703; SSSE3-NEXT: movaps %xmm0, -64(%rsp) 1704; SSSE3-NEXT: movzbl -114(%rsp), %ebp 1705; SSSE3-NEXT: movaps %xmm1, -80(%rsp) 1706; SSSE3-NEXT: movaps %xmm0, -96(%rsp) 1707; SSSE3-NEXT: movzbl -113(%rsp), %eax 1708; SSSE3-NEXT: andl $31, %eax 1709; SSSE3-NEXT: movzbl -96(%rsp,%rax), %eax 1710; SSSE3-NEXT: 
movd %eax, %xmm1 1711; SSSE3-NEXT: andl $31, %ebp 1712; SSSE3-NEXT: movzbl -64(%rsp,%rbp), %eax 1713; SSSE3-NEXT: movd %eax, %xmm2 1714; SSSE3-NEXT: andl $31, %r13d 1715; SSSE3-NEXT: movzbl -32(%rsp,%r13), %eax 1716; SSSE3-NEXT: movd %eax, %xmm4 1717; SSSE3-NEXT: andl $31, %r12d 1718; SSSE3-NEXT: movzbl (%rsp,%r12), %eax 1719; SSSE3-NEXT: movd %eax, %xmm3 1720; SSSE3-NEXT: andl $31, %r15d 1721; SSSE3-NEXT: movzbl 32(%rsp,%r15), %eax 1722; SSSE3-NEXT: movd %eax, %xmm6 1723; SSSE3-NEXT: andl $31, %r14d 1724; SSSE3-NEXT: movzbl 64(%rsp,%r14), %eax 1725; SSSE3-NEXT: movd %eax, %xmm7 1726; SSSE3-NEXT: andl $31, %ebx 1727; SSSE3-NEXT: movzbl 96(%rsp,%rbx), %eax 1728; SSSE3-NEXT: movd %eax, %xmm8 1729; SSSE3-NEXT: andl $31, %r11d 1730; SSSE3-NEXT: movzbl 128(%rsp,%r11), %eax 1731; SSSE3-NEXT: movd %eax, %xmm5 1732; SSSE3-NEXT: andl $31, %r10d 1733; SSSE3-NEXT: movzbl 160(%rsp,%r10), %eax 1734; SSSE3-NEXT: movd %eax, %xmm9 1735; SSSE3-NEXT: andl $31, %r9d 1736; SSSE3-NEXT: movzbl 192(%rsp,%r9), %eax 1737; SSSE3-NEXT: movd %eax, %xmm10 1738; SSSE3-NEXT: andl $31, %r8d 1739; SSSE3-NEXT: movzbl 224(%rsp,%r8), %eax 1740; SSSE3-NEXT: movd %eax, %xmm12 1741; SSSE3-NEXT: andl $31, %edi 1742; SSSE3-NEXT: movzbl 256(%rsp,%rdi), %eax 1743; SSSE3-NEXT: movd %eax, %xmm11 1744; SSSE3-NEXT: andl $31, %esi 1745; SSSE3-NEXT: movzbl 288(%rsp,%rsi), %eax 1746; SSSE3-NEXT: movd %eax, %xmm13 1747; SSSE3-NEXT: andl $31, %edx 1748; SSSE3-NEXT: movzbl 320(%rsp,%rdx), %eax 1749; SSSE3-NEXT: movd %eax, %xmm14 1750; SSSE3-NEXT: andl $31, %ecx 1751; SSSE3-NEXT: movzbl 352(%rsp,%rcx), %eax 1752; SSSE3-NEXT: movd %eax, %xmm15 1753; SSSE3-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload 1754; SSSE3-NEXT: andl $31, %eax 1755; SSSE3-NEXT: movzbl 384(%rsp,%rax), %eax 1756; SSSE3-NEXT: movd %eax, %xmm0 1757; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] 1758; SSSE3-NEXT: 
punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] 1759; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] 1760; SSSE3-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] 1761; SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm8[0],xmm5[1],xmm8[1],xmm5[2],xmm8[2],xmm5[3],xmm8[3],xmm5[4],xmm8[4],xmm5[5],xmm8[5],xmm5[6],xmm8[6],xmm5[7],xmm8[7] 1762; SSSE3-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3] 1763; SSSE3-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] 1764; SSSE3-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3],xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] 1765; SSSE3-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3],xmm11[4],xmm12[4],xmm11[5],xmm12[5],xmm11[6],xmm12[6],xmm11[7],xmm12[7] 1766; SSSE3-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] 1767; SSSE3-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3],xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] 1768; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3],xmm0[4],xmm15[4],xmm0[5],xmm15[5],xmm0[6],xmm15[6],xmm0[7],xmm15[7] 1769; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3] 1770; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1] 1771; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm5[0] 1772; SSSE3-NEXT: addq $424, %rsp # imm = 0x1A8 1773; SSSE3-NEXT: popq %rbx 1774; SSSE3-NEXT: popq 
%r12 1775; SSSE3-NEXT: popq %r13 1776; SSSE3-NEXT: popq %r14 1777; SSSE3-NEXT: popq %r15 1778; SSSE3-NEXT: popq %rbp 1779; SSSE3-NEXT: retq 1780; 1781; SSE41-LABEL: var_shuffle_v16i8_from_v32i8_v16i8: 1782; SSE41: # %bb.0: 1783; SSE41-NEXT: subq $392, %rsp # imm = 0x188 1784; SSE41-NEXT: movd %xmm2, %eax 1785; SSE41-NEXT: movaps %xmm1, 368(%rsp) 1786; SSE41-NEXT: movaps %xmm0, 352(%rsp) 1787; SSE41-NEXT: andl $31, %eax 1788; SSE41-NEXT: movaps %xmm1, 336(%rsp) 1789; SSE41-NEXT: movaps %xmm0, 320(%rsp) 1790; SSE41-NEXT: movaps %xmm1, 304(%rsp) 1791; SSE41-NEXT: movaps %xmm0, 288(%rsp) 1792; SSE41-NEXT: movaps %xmm1, 272(%rsp) 1793; SSE41-NEXT: movaps %xmm0, 256(%rsp) 1794; SSE41-NEXT: movaps %xmm1, 240(%rsp) 1795; SSE41-NEXT: movaps %xmm0, 224(%rsp) 1796; SSE41-NEXT: movaps %xmm1, 208(%rsp) 1797; SSE41-NEXT: movaps %xmm0, 192(%rsp) 1798; SSE41-NEXT: movaps %xmm1, 176(%rsp) 1799; SSE41-NEXT: movaps %xmm0, 160(%rsp) 1800; SSE41-NEXT: movaps %xmm1, 144(%rsp) 1801; SSE41-NEXT: movaps %xmm0, 128(%rsp) 1802; SSE41-NEXT: movaps %xmm1, 112(%rsp) 1803; SSE41-NEXT: movaps %xmm0, 96(%rsp) 1804; SSE41-NEXT: movaps %xmm1, 80(%rsp) 1805; SSE41-NEXT: movaps %xmm0, 64(%rsp) 1806; SSE41-NEXT: movaps %xmm1, 48(%rsp) 1807; SSE41-NEXT: movaps %xmm0, 32(%rsp) 1808; SSE41-NEXT: movaps %xmm1, 16(%rsp) 1809; SSE41-NEXT: movaps %xmm0, (%rsp) 1810; SSE41-NEXT: movaps %xmm1, -16(%rsp) 1811; SSE41-NEXT: movaps %xmm0, -32(%rsp) 1812; SSE41-NEXT: movaps %xmm1, -48(%rsp) 1813; SSE41-NEXT: movaps %xmm0, -64(%rsp) 1814; SSE41-NEXT: movaps %xmm1, -80(%rsp) 1815; SSE41-NEXT: movaps %xmm0, -96(%rsp) 1816; SSE41-NEXT: movaps %xmm1, -112(%rsp) 1817; SSE41-NEXT: movaps %xmm0, -128(%rsp) 1818; SSE41-NEXT: movzbl 352(%rsp,%rax), %eax 1819; SSE41-NEXT: movd %eax, %xmm0 1820; SSE41-NEXT: pextrb $1, %xmm2, %eax 1821; SSE41-NEXT: andl $31, %eax 1822; SSE41-NEXT: pinsrb $1, 320(%rsp,%rax), %xmm0 1823; SSE41-NEXT: pextrb $2, %xmm2, %eax 1824; SSE41-NEXT: andl $31, %eax 1825; SSE41-NEXT: pinsrb $2, 
288(%rsp,%rax), %xmm0 1826; SSE41-NEXT: pextrb $3, %xmm2, %eax 1827; SSE41-NEXT: andl $31, %eax 1828; SSE41-NEXT: pinsrb $3, 256(%rsp,%rax), %xmm0 1829; SSE41-NEXT: pextrb $4, %xmm2, %eax 1830; SSE41-NEXT: andl $31, %eax 1831; SSE41-NEXT: pinsrb $4, 224(%rsp,%rax), %xmm0 1832; SSE41-NEXT: pextrb $5, %xmm2, %eax 1833; SSE41-NEXT: andl $31, %eax 1834; SSE41-NEXT: pinsrb $5, 192(%rsp,%rax), %xmm0 1835; SSE41-NEXT: pextrb $6, %xmm2, %eax 1836; SSE41-NEXT: andl $31, %eax 1837; SSE41-NEXT: pinsrb $6, 160(%rsp,%rax), %xmm0 1838; SSE41-NEXT: pextrb $7, %xmm2, %eax 1839; SSE41-NEXT: andl $31, %eax 1840; SSE41-NEXT: pinsrb $7, 128(%rsp,%rax), %xmm0 1841; SSE41-NEXT: pextrb $8, %xmm2, %eax 1842; SSE41-NEXT: andl $31, %eax 1843; SSE41-NEXT: pinsrb $8, 96(%rsp,%rax), %xmm0 1844; SSE41-NEXT: pextrb $9, %xmm2, %eax 1845; SSE41-NEXT: andl $31, %eax 1846; SSE41-NEXT: pinsrb $9, 64(%rsp,%rax), %xmm0 1847; SSE41-NEXT: pextrb $10, %xmm2, %eax 1848; SSE41-NEXT: andl $31, %eax 1849; SSE41-NEXT: pinsrb $10, 32(%rsp,%rax), %xmm0 1850; SSE41-NEXT: pextrb $11, %xmm2, %eax 1851; SSE41-NEXT: andl $31, %eax 1852; SSE41-NEXT: pinsrb $11, (%rsp,%rax), %xmm0 1853; SSE41-NEXT: pextrb $12, %xmm2, %eax 1854; SSE41-NEXT: andl $31, %eax 1855; SSE41-NEXT: pinsrb $12, -32(%rsp,%rax), %xmm0 1856; SSE41-NEXT: pextrb $13, %xmm2, %eax 1857; SSE41-NEXT: andl $31, %eax 1858; SSE41-NEXT: pinsrb $13, -64(%rsp,%rax), %xmm0 1859; SSE41-NEXT: pextrb $14, %xmm2, %eax 1860; SSE41-NEXT: andl $31, %eax 1861; SSE41-NEXT: pinsrb $14, -96(%rsp,%rax), %xmm0 1862; SSE41-NEXT: pextrb $15, %xmm2, %eax 1863; SSE41-NEXT: andl $31, %eax 1864; SSE41-NEXT: pinsrb $15, -128(%rsp,%rax), %xmm0 1865; SSE41-NEXT: addq $392, %rsp # imm = 0x188 1866; SSE41-NEXT: retq 1867; 1868; XOP-LABEL: var_shuffle_v16i8_from_v32i8_v16i8: 1869; XOP: # %bb.0: 1870; XOP-NEXT: vextractf128 $1, %ymm0, %xmm2 1871; XOP-NEXT: vpperm %xmm1, %xmm2, %xmm0, %xmm0 1872; XOP-NEXT: vzeroupper 1873; XOP-NEXT: retq 1874; 1875; AVX1-LABEL: 
var_shuffle_v16i8_from_v32i8_v16i8: 1876; AVX1: # %bb.0: 1877; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 1878; AVX1-NEXT: vpshufb %xmm1, %xmm2, %xmm2 1879; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0 1880; AVX1-NEXT: vpcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 1881; AVX1-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 1882; AVX1-NEXT: vzeroupper 1883; AVX1-NEXT: retq 1884; 1885; AVX2-LABEL: var_shuffle_v16i8_from_v32i8_v16i8: 1886; AVX2: # %bb.0: 1887; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 1888; AVX2-NEXT: vpshufb %xmm1, %xmm2, %xmm2 1889; AVX2-NEXT: vpshufb %xmm1, %xmm0, %xmm0 1890; AVX2-NEXT: vpcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 1891; AVX2-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 1892; AVX2-NEXT: vzeroupper 1893; AVX2-NEXT: retq 1894; 1895; AVX512-LABEL: var_shuffle_v16i8_from_v32i8_v16i8: 1896; AVX512: # %bb.0: 1897; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm2 1898; AVX512-NEXT: vpshufb %xmm1, %xmm2, %xmm2 1899; AVX512-NEXT: vpshufb %xmm1, %xmm0, %xmm0 1900; AVX512-NEXT: vpcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 1901; AVX512-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 1902; AVX512-NEXT: vzeroupper 1903; AVX512-NEXT: retq 1904; 1905; AVX512VLBW-LABEL: var_shuffle_v16i8_from_v32i8_v16i8: 1906; AVX512VLBW: # %bb.0: 1907; AVX512VLBW-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1 1908; AVX512VLBW-NEXT: vextracti128 $1, %ymm0, %xmm2 1909; AVX512VLBW-NEXT: vpshufb %xmm1, %xmm2, %xmm2 1910; AVX512VLBW-NEXT: vpshufb %xmm1, %xmm0, %xmm0 1911; AVX512VLBW-NEXT: vpcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %k1 1912; AVX512VLBW-NEXT: vmovdqu8 %ymm2, %ymm0 {%k1} 1913; AVX512VLBW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 1914; AVX512VLBW-NEXT: vzeroupper 1915; AVX512VLBW-NEXT: retq 1916; 1917; VLVBMI-LABEL: var_shuffle_v16i8_from_v32i8_v16i8: 1918; VLVBMI: # %bb.0: 1919; VLVBMI-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1 1920; VLVBMI-NEXT: vpermb %ymm0, %ymm1, %ymm0 1921; VLVBMI-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 1922; 
VLVBMI-NEXT: vzeroupper 1923; VLVBMI-NEXT: retq 1924 %index0 = extractelement <16 x i8> %indices, i32 0 1925 %index1 = extractelement <16 x i8> %indices, i32 1 1926 %index2 = extractelement <16 x i8> %indices, i32 2 1927 %index3 = extractelement <16 x i8> %indices, i32 3 1928 %index4 = extractelement <16 x i8> %indices, i32 4 1929 %index5 = extractelement <16 x i8> %indices, i32 5 1930 %index6 = extractelement <16 x i8> %indices, i32 6 1931 %index7 = extractelement <16 x i8> %indices, i32 7 1932 %index8 = extractelement <16 x i8> %indices, i32 8 1933 %index9 = extractelement <16 x i8> %indices, i32 9 1934 %index10 = extractelement <16 x i8> %indices, i32 10 1935 %index11 = extractelement <16 x i8> %indices, i32 11 1936 %index12 = extractelement <16 x i8> %indices, i32 12 1937 %index13 = extractelement <16 x i8> %indices, i32 13 1938 %index14 = extractelement <16 x i8> %indices, i32 14 1939 %index15 = extractelement <16 x i8> %indices, i32 15 1940 %v0 = extractelement <32 x i8> %v, i8 %index0 1941 %v1 = extractelement <32 x i8> %v, i8 %index1 1942 %v2 = extractelement <32 x i8> %v, i8 %index2 1943 %v3 = extractelement <32 x i8> %v, i8 %index3 1944 %v4 = extractelement <32 x i8> %v, i8 %index4 1945 %v5 = extractelement <32 x i8> %v, i8 %index5 1946 %v6 = extractelement <32 x i8> %v, i8 %index6 1947 %v7 = extractelement <32 x i8> %v, i8 %index7 1948 %v8 = extractelement <32 x i8> %v, i8 %index8 1949 %v9 = extractelement <32 x i8> %v, i8 %index9 1950 %v10 = extractelement <32 x i8> %v, i8 %index10 1951 %v11 = extractelement <32 x i8> %v, i8 %index11 1952 %v12 = extractelement <32 x i8> %v, i8 %index12 1953 %v13 = extractelement <32 x i8> %v, i8 %index13 1954 %v14 = extractelement <32 x i8> %v, i8 %index14 1955 %v15 = extractelement <32 x i8> %v, i8 %index15 1956 %ret0 = insertelement <16 x i8> undef, i8 %v0, i32 0 1957 %ret1 = insertelement <16 x i8> %ret0, i8 %v1, i32 1 1958 %ret2 = insertelement <16 x i8> %ret1, i8 %v2, i32 2 1959 %ret3 = insertelement <16 x i8> 
%ret2, i8 %v3, i32 3 1960 %ret4 = insertelement <16 x i8> %ret3, i8 %v4, i32 4 1961 %ret5 = insertelement <16 x i8> %ret4, i8 %v5, i32 5 1962 %ret6 = insertelement <16 x i8> %ret5, i8 %v6, i32 6 1963 %ret7 = insertelement <16 x i8> %ret6, i8 %v7, i32 7 1964 %ret8 = insertelement <16 x i8> %ret7, i8 %v8, i32 8 1965 %ret9 = insertelement <16 x i8> %ret8, i8 %v9, i32 9 1966 %ret10 = insertelement <16 x i8> %ret9, i8 %v10, i32 10 1967 %ret11 = insertelement <16 x i8> %ret10, i8 %v11, i32 11 1968 %ret12 = insertelement <16 x i8> %ret11, i8 %v12, i32 12 1969 %ret13 = insertelement <16 x i8> %ret12, i8 %v13, i32 13 1970 %ret14 = insertelement <16 x i8> %ret13, i8 %v14, i32 14 1971 %ret15 = insertelement <16 x i8> %ret14, i8 %v15, i32 15 1972 ret <16 x i8> %ret15 1973} 1974
; Takes no arguments and returns void. The body loads a <4 x i64> through an
; undef pointer, bitcasts it to <8 x i32>, and uses a shufflevector with mask
; <2, 12> to form a <2 x i32> (mask element 12 selects from the undef second
; operand). Both lanes are masked with 7 and each is then used as the variable
; i32 index of an extractelement from the original <4 x i64>. The two extracted
; i64 values are packed into a <2 x i64>, passed through a select whose
; condition and true operand are both undef, and stored through an undef
; pointer.
; NOTE(review): the function name and the undef-heavy body suggest a reduced
; regression test for widening i32 shuffle indices to the i64 element width;
; confirm against the change that introduced it before altering the IR.
; The assertions that follow are autogenerated by
; utils/update_llc_test_checks.py (see the NOTE on the first line of the
; file): regenerate them with that script instead of editing them by hand.
1975define void @indices_convert() { 1976; SSE3-LABEL: indices_convert: 1977; SSE3: # %bb.0: # %bb 1978; SSE3-NEXT: movaps (%rax), %xmm0 1979; SSE3-NEXT: movaps %xmm0, -24(%rsp) 1980; SSE3-NEXT: movaps %xmm0, -40(%rsp) 1981; SSE3-NEXT: movl (%rax), %eax 1982; SSE3-NEXT: movaps %xmm0, -56(%rsp) 1983; SSE3-NEXT: movaps %xmm0, -72(%rsp) 1984; SSE3-NEXT: andl $3, %eax 1985; SSE3-NEXT: shll $3, %eax 1986; SSE3-NEXT: movsd -72(%rsp,%rax), %xmm0 # xmm0 = mem[0],zero 1987; SSE3-NEXT: movsd -40(%rsp,%rax), %xmm1 # xmm1 = mem[0],zero 1988; SSE3-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] 1989; SSE3-NEXT: movups %xmm1, (%rax) 1990; SSE3-NEXT: retq 1991; 1992; SSSE3-LABEL: indices_convert: 1993; SSSE3: # %bb.0: # %bb 1994; SSSE3-NEXT: movaps (%rax), %xmm0 1995; SSSE3-NEXT: movaps %xmm0, -24(%rsp) 1996; SSSE3-NEXT: movaps %xmm0, -40(%rsp) 1997; SSSE3-NEXT: movl (%rax), %eax 1998; SSSE3-NEXT: movaps %xmm0, -56(%rsp) 1999; SSSE3-NEXT: movaps %xmm0, -72(%rsp) 2000; SSSE3-NEXT: andl $3, %eax 2001; SSSE3-NEXT: shll $3, %eax 2002; SSSE3-NEXT: movsd -72(%rsp,%rax), %xmm0 # xmm0 = mem[0],zero 2003; SSSE3-NEXT: movsd -40(%rsp,%rax), %xmm1 # xmm1 = mem[0],zero 2004; SSSE3-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] 2005; 
SSSE3-NEXT: movups %xmm1, (%rax) 2006; SSSE3-NEXT: retq 2007; 2008; SSE41-LABEL: indices_convert: 2009; SSE41: # %bb.0: # %bb 2010; SSE41-NEXT: movaps (%rax), %xmm0 2011; SSE41-NEXT: extractps $2, %xmm0, %eax 2012; SSE41-NEXT: movaps %xmm0, -24(%rsp) 2013; SSE41-NEXT: movaps %xmm0, -40(%rsp) 2014; SSE41-NEXT: andl $3, %eax 2015; SSE41-NEXT: extractps $3, %xmm0, %ecx 2016; SSE41-NEXT: movaps %xmm0, -56(%rsp) 2017; SSE41-NEXT: movaps %xmm0, -72(%rsp) 2018; SSE41-NEXT: andl $3, %ecx 2019; SSE41-NEXT: movsd -72(%rsp,%rcx,8), %xmm0 # xmm0 = mem[0],zero 2020; SSE41-NEXT: movsd -40(%rsp,%rax,8), %xmm1 # xmm1 = mem[0],zero 2021; SSE41-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] 2022; SSE41-NEXT: movups %xmm1, (%rax) 2023; SSE41-NEXT: retq 2024; 2025; XOP-LABEL: indices_convert: 2026; XOP: # %bb.0: # %bb 2027; XOP-NEXT: vmovdqa (%rax), %xmm0 2028; XOP-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] 2029; XOP-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 2030; XOP-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero 2031; XOP-NEXT: vpaddq %xmm1, %xmm1, %xmm1 2032; XOP-NEXT: vpermil2pd $0, %xmm1, %xmm0, %xmm0, %xmm0 2033; XOP-NEXT: vmovupd %xmm0, (%rax) 2034; XOP-NEXT: retq 2035; 2036; AVX1-LABEL: indices_convert: 2037; AVX1: # %bb.0: # %bb 2038; AVX1-NEXT: vmovdqa (%rax), %xmm0 2039; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] 2040; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 2041; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero 2042; AVX1-NEXT: vpaddq %xmm1, %xmm1, %xmm1 2043; AVX1-NEXT: vpermilpd %xmm1, %xmm0, %xmm0 2044; AVX1-NEXT: vmovupd %xmm0, (%rax) 2045; AVX1-NEXT: retq 2046; 2047; AVX2-LABEL: indices_convert: 2048; AVX2: # %bb.0: # %bb 2049; AVX2-NEXT: vpbroadcastq (%rax), %xmm0 2050; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [7,7,7,7] 2051; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 2052; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero 2053; AVX2-NEXT: vpaddq %xmm0, %xmm0, %xmm0 2054; AVX2-NEXT: vmovapd 
(%rax), %xmm1 2055; AVX2-NEXT: vpermilpd %xmm0, %xmm1, %xmm0 2056; AVX2-NEXT: vmovupd %xmm0, (%rax) 2057; AVX2-NEXT: retq 2058; 2059; AVX512-LABEL: indices_convert: 2060; AVX512: # %bb.0: # %bb 2061; AVX512-NEXT: vmovdqa (%rax), %ymm0 2062; AVX512-NEXT: vpbroadcastq (%rax), %xmm1 2063; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm2 = [7,7,7,7] 2064; AVX512-NEXT: vpand %xmm2, %xmm1, %xmm1 2065; AVX512-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero 2066; AVX512-NEXT: vpermq %zmm0, %zmm1, %zmm0 2067; AVX512-NEXT: vmovdqu %xmm0, (%rax) 2068; AVX512-NEXT: vzeroupper 2069; AVX512-NEXT: retq 2070; 2071; AVX512VL-LABEL: indices_convert: 2072; AVX512VL: # %bb.0: # %bb 2073; AVX512VL-NEXT: vpbroadcastq (%rax), %xmm0 2074; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 2075; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero 2076; AVX512VL-NEXT: vpermq (%rax), %ymm0, %ymm0 2077; AVX512VL-NEXT: vmovdqu %xmm0, (%rax) 2078; AVX512VL-NEXT: vzeroupper 2079; AVX512VL-NEXT: retq 2080bb: 2081 %0 = load <4 x i64>, ptr undef, align 32 2082 %1 = bitcast <4 x i64> %0 to <8 x i32> 2083 %2 = shufflevector <8 x i32> %1, <8 x i32> undef, <2 x i32> <i32 2, i32 12> 2084 %3 = and <2 x i32> %2, <i32 7, i32 7> 2085 %4 = extractelement <2 x i32> %3, i32 0 2086 %vecext.i8.1 = extractelement <4 x i64> %0, i32 %4 2087 %5 = extractelement <2 x i32> %3, i32 1 2088 %vecext.i8.2 = extractelement <4 x i64> %0, i32 %5 2089 %6 = insertelement <2 x i64> poison, i64 %vecext.i8.1, i32 0 2090 %7 = insertelement <2 x i64> %6, i64 %vecext.i8.2, i32 1 2091 %8 = select <2 x i1> undef, <2 x i64> undef, <2 x i64> %7 2092 store <2 x i64> %8, ptr undef, align 8 2093 ret void 2094} 2095