; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.2 | FileCheck %s --check-prefixes=SSE,SSE42
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2-FAST
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2-FAST
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512,AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL

define void @shuffle_v16i8_to_v8i8_1(ptr %L, ptr %S) nounwind {
; SSE2-LABEL: shuffle_v16i8_to_v8i8_1:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa (%rdi), %xmm0
; SSE2-NEXT: psrlw $8, %xmm0
; SSE2-NEXT: packuswb %xmm0, %xmm0
; SSE2-NEXT: movq %xmm0, (%rsi)
; SSE2-NEXT: retq
;
; SSE42-LABEL: shuffle_v16i8_to_v8i8_1:
; SSE42: # %bb.0:
; SSE42-NEXT: movdqa (%rdi), %xmm0
; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
; SSE42-NEXT: movq %xmm0, (%rsi)
; SSE42-NEXT: retq
;
; AVX-LABEL: shuffle_v16i8_to_v8i8_1:
; AVX: # %bb.0:
; AVX-NEXT: vmovdqa (%rdi), %xmm0
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
; AVX-NEXT: vmovq %xmm0, (%rsi)
; AVX-NEXT: retq
;
; AVX512-LABEL: shuffle_v16i8_to_v8i8_1:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovdqa (%rdi), %xmm0
; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
; AVX512-NEXT: vmovq %xmm0, (%rsi)
; AVX512-NEXT: retq
  %vec = load <16 x i8>, ptr %L
  %strided.vec = shufflevector <16 x i8> %vec, <16 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  store <8 x i8> %strided.vec, ptr %S
  ret void
}

define void @shuffle_v8i16_to_v4i16_1(ptr %L, ptr %S) nounwind {
; SSE2-LABEL: shuffle_v8i16_to_v4i16_1:
; SSE2: # %bb.0:
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = mem[3,1,2,3,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7]
; SSE2-NEXT: movq %xmm0, (%rsi)
; SSE2-NEXT: retq
;
; SSE42-LABEL: shuffle_v8i16_to_v4i16_1:
; SSE42: # %bb.0:
; SSE42-NEXT: movdqa (%rdi), %xmm0
; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u]
; SSE42-NEXT: movq %xmm0, (%rsi)
; SSE42-NEXT: retq
;
; AVX-LABEL: shuffle_v8i16_to_v4i16_1:
; AVX: # %bb.0:
; AVX-NEXT: vmovdqa (%rdi), %xmm0
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u]
; AVX-NEXT: vmovq %xmm0, (%rsi)
; AVX-NEXT: retq
;
; AVX512-LABEL: shuffle_v8i16_to_v4i16_1:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovdqa (%rdi), %xmm0
; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u]
; AVX512-NEXT: vmovq %xmm0, (%rsi)
; AVX512-NEXT: retq
  %vec = load <8 x i16>, ptr %L
  %strided.vec = shufflevector <8 x i16> %vec, <8 x i16> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  store <4 x i16> %strided.vec, ptr %S
  ret void
}

define void @shuffle_v4i32_to_v2i32_1(ptr %L, ptr %S) nounwind {
; SSE-LABEL: shuffle_v4i32_to_v2i32_1:
; SSE: # %bb.0:
; SSE-NEXT: pshufd {{.*#+}} xmm0 = mem[1,3,2,3]
; SSE-NEXT: movq %xmm0, (%rsi)
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_v4i32_to_v2i32_1:
; AVX: # %bb.0:
; AVX-NEXT: vpermilps {{.*#+}} xmm0 = mem[1,3,2,3]
; AVX-NEXT: vmovlps %xmm0, (%rsi)
; AVX-NEXT: retq
;
; AVX512-LABEL: shuffle_v4i32_to_v2i32_1:
; AVX512: # %bb.0:
; AVX512-NEXT: vpermilps {{.*#+}} xmm0 = mem[1,3,2,3]
; AVX512-NEXT: vmovlps %xmm0, (%rsi)
; AVX512-NEXT: retq
  %vec = load <4 x i32>, ptr %L
  %strided.vec = shufflevector <4 x i32> %vec, <4 x i32> undef, <2 x i32> <i32 1, i32 3>
  store <2 x i32> %strided.vec, ptr %S
  ret void
}

define void @shuffle_v16i8_to_v4i8_1(ptr %L, ptr %S) nounwind {
; SSE2-LABEL: shuffle_v16i8_to_v4i8_1:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa (%rdi), %xmm0
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE2-NEXT: packuswb %xmm0, %xmm0
; SSE2-NEXT: movd %xmm0, (%rsi)
; SSE2-NEXT: retq
;
; SSE42-LABEL: shuffle_v16i8_to_v4i8_1:
; SSE42: # %bb.0:
; SSE42-NEXT: movdqa (%rdi), %xmm0
; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u]
; SSE42-NEXT: movd %xmm0, (%rsi)
; SSE42-NEXT: retq
;
; AVX-LABEL: shuffle_v16i8_to_v4i8_1:
; AVX: # %bb.0:
; AVX-NEXT: vmovdqa (%rdi), %xmm0
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX-NEXT: vmovd %xmm0, (%rsi)
; AVX-NEXT: retq
;
; AVX512-LABEL: shuffle_v16i8_to_v4i8_1:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovdqa (%rdi), %xmm0
; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512-NEXT: vmovd %xmm0, (%rsi)
; AVX512-NEXT: retq
  %vec = load <16 x i8>, ptr %L
  %strided.vec = shufflevector <16 x i8> %vec, <16 x i8> undef, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
  store <4 x i8> %strided.vec, ptr %S
  ret void
}

define void @shuffle_v16i8_to_v4i8_2(ptr %L, ptr %S) nounwind {
; SSE2-LABEL: shuffle_v16i8_to_v4i8_2:
; SSE2: # %bb.0:
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = mem[3,1,2,3,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,7]
; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7]
; SSE2-NEXT: packuswb %xmm0, %xmm0
; SSE2-NEXT: movd %xmm0, (%rsi)
; SSE2-NEXT: retq
;
; SSE42-LABEL: shuffle_v16i8_to_v4i8_2:
; SSE42: # %bb.0:
; SSE42-NEXT: movdqa (%rdi), %xmm0
; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u]
; SSE42-NEXT: movd %xmm0, (%rsi)
; SSE42-NEXT: retq
;
; AVX-LABEL: shuffle_v16i8_to_v4i8_2:
; AVX: # %bb.0:
; AVX-NEXT: vmovdqa (%rdi), %xmm0
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX-NEXT: vmovd %xmm0, (%rsi)
; AVX-NEXT: retq
;
; AVX512-LABEL: shuffle_v16i8_to_v4i8_2:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovdqa (%rdi), %xmm0
; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512-NEXT: vmovd %xmm0, (%rsi)
; AVX512-NEXT: retq
  %vec = load <16 x i8>, ptr %L
  %strided.vec = shufflevector <16 x i8> %vec, <16 x i8> undef, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
  store <4 x i8> %strided.vec, ptr %S
  ret void
}

define void @shuffle_v16i8_to_v4i8_3(ptr %L, ptr %S) nounwind {
; SSE2-LABEL: shuffle_v16i8_to_v4i8_3:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa (%rdi), %xmm0
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[3,1,2,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE2-NEXT: packuswb %xmm0, %xmm0
; SSE2-NEXT: movd %xmm0, (%rsi)
; SSE2-NEXT: retq
;
; SSE42-LABEL: shuffle_v16i8_to_v4i8_3:
; SSE42: # %bb.0:
; SSE42-NEXT: movdqa (%rdi), %xmm0
; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u]
; SSE42-NEXT: movd %xmm0, (%rsi)
; SSE42-NEXT: retq
;
; AVX-LABEL: shuffle_v16i8_to_v4i8_3:
; AVX: # %bb.0:
; AVX-NEXT: vmovdqa (%rdi), %xmm0
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX-NEXT: vmovd %xmm0, (%rsi)
; AVX-NEXT: retq
;
; AVX512-LABEL: shuffle_v16i8_to_v4i8_3:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovdqa (%rdi), %xmm0
; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512-NEXT: vmovd %xmm0, (%rsi)
; AVX512-NEXT: retq
  %vec = load <16 x i8>, ptr %L
  %strided.vec = shufflevector <16 x i8> %vec, <16 x i8> undef, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
  store <4 x i8> %strided.vec, ptr %S
  ret void
}

define void @shuffle_v8i16_to_v2i16_1(ptr %L, ptr %S) nounwind {
; SSE-LABEL: shuffle_v8i16_to_v2i16_1:
; SSE: # %bb.0:
; SSE-NEXT: pshufd {{.*#+}} xmm0 = mem[0,2,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7]
; SSE-NEXT: movd %xmm0, (%rsi)
; SSE-NEXT: retq
;
; AVX1-LABEL: shuffle_v8i16_to_v2i16_1:
; AVX1: # %bb.0:
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,2,2,3]
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7]
; AVX1-NEXT: vmovd %xmm0, (%rsi)
; AVX1-NEXT: retq
;
; AVX2-SLOW-LABEL: shuffle_v8i16_to_v2i16_1:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,2,2,3]
; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7]
; AVX2-SLOW-NEXT: vmovd %xmm0, (%rsi)
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-LABEL: shuffle_v8i16_to_v2i16_1:
; AVX2-FAST: # %bb.0:
; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm0
; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,10,11,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX2-FAST-NEXT: vmovd %xmm0, (%rsi)
; AVX2-FAST-NEXT: retq
;
; AVX512F-LABEL: shuffle_v8i16_to_v2i16_1:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,2,2,3]
; AVX512F-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7]
; AVX512F-NEXT: vmovd %xmm0, (%rsi)
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v8i16_to_v2i16_1:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0
; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,10,11,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512VL-NEXT: vmovd %xmm0, (%rsi)
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v8i16_to_v2i16_1:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,10,11,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT: vmovd %xmm0, (%rsi)
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v8i16_to_v2i16_1:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0
; AVX512BWVL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,10,11,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512BWVL-NEXT: vmovd %xmm0, (%rsi)
; AVX512BWVL-NEXT: retq
  %vec = load <8 x i16>, ptr %L
  %strided.vec = shufflevector <8 x i16> %vec, <8 x i16> undef, <2 x i32> <i32 1, i32 5>
  store <2 x i16> %strided.vec, ptr %S
  ret void
}

define void @shuffle_v8i16_to_v2i16_2(ptr %L, ptr %S) nounwind {
; SSE-LABEL: shuffle_v8i16_to_v2i16_2:
; SSE: # %bb.0:
; SSE-NEXT: pshufd {{.*#+}} xmm0 = mem[3,1,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7]
; SSE-NEXT: movd %xmm0, (%rsi)
; SSE-NEXT: retq
;
; AVX1-LABEL: shuffle_v8i16_to_v2i16_2:
; AVX1: # %bb.0:
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = mem[3,1,2,3]
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7]
; AVX1-NEXT: vmovd %xmm0, (%rsi)
; AVX1-NEXT: retq
;
; AVX2-SLOW-LABEL: shuffle_v8i16_to_v2i16_2:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = mem[3,1,2,3]
; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7]
; AVX2-SLOW-NEXT: vmovd %xmm0, (%rsi)
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-LABEL: shuffle_v8i16_to_v2i16_2:
; AVX2-FAST: # %bb.0:
; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm0
; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,12,13,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX2-FAST-NEXT: vmovd %xmm0, (%rsi)
; AVX2-FAST-NEXT: retq
;
; AVX512F-LABEL: shuffle_v8i16_to_v2i16_2:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = mem[3,1,2,3]
; AVX512F-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7]
; AVX512F-NEXT: vmovd %xmm0, (%rsi)
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v8i16_to_v2i16_2:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0
; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,12,13,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512VL-NEXT: vmovd %xmm0, (%rsi)
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v8i16_to_v2i16_2:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,12,13,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT: vmovd %xmm0, (%rsi)
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v8i16_to_v2i16_2:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0
; AVX512BWVL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,12,13,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512BWVL-NEXT: vmovd %xmm0, (%rsi)
; AVX512BWVL-NEXT: retq
  %vec = load <8 x i16>, ptr %L
  %strided.vec = shufflevector <8 x i16> %vec, <8 x i16> undef, <2 x i32> <i32 2, i32 6>
  store <2 x i16> %strided.vec, ptr %S
  ret void
}

define void @shuffle_v8i16_to_v2i16_3(ptr %L, ptr %S) nounwind {
; SSE-LABEL: shuffle_v8i16_to_v2i16_3:
; SSE: # %bb.0:
; SSE-NEXT: pshufd {{.*#+}} xmm0 = mem[3,1,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
; SSE-NEXT: movd %xmm0, (%rsi)
; SSE-NEXT: retq
;
; AVX1-LABEL: shuffle_v8i16_to_v2i16_3:
; AVX1: # %bb.0:
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = mem[3,1,2,3]
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
; AVX1-NEXT: vmovd %xmm0, (%rsi)
; AVX1-NEXT: retq
;
; AVX2-SLOW-LABEL: shuffle_v8i16_to_v2i16_3:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = mem[3,1,2,3]
; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
; AVX2-SLOW-NEXT: vmovd %xmm0, (%rsi)
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-LABEL: shuffle_v8i16_to_v2i16_3:
; AVX2-FAST: # %bb.0:
; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm0
; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,7,14,15,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX2-FAST-NEXT: vmovd %xmm0, (%rsi)
; AVX2-FAST-NEXT: retq
;
; AVX512F-LABEL: shuffle_v8i16_to_v2i16_3:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = mem[3,1,2,3]
; AVX512F-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
; AVX512F-NEXT: vmovd %xmm0, (%rsi)
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v8i16_to_v2i16_3:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0
; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,7,14,15,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512VL-NEXT: vmovd %xmm0, (%rsi)
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v8i16_to_v2i16_3:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,7,14,15,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT: vmovd %xmm0, (%rsi)
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v8i16_to_v2i16_3:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0
; AVX512BWVL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,7,14,15,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512BWVL-NEXT: vmovd %xmm0, (%rsi)
; AVX512BWVL-NEXT: retq
  %vec = load <8 x i16>, ptr %L
  %strided.vec = shufflevector <8 x i16> %vec, <8 x i16> undef, <2 x i32> <i32 3, i32 7>
  store <2 x i16> %strided.vec, ptr %S
  ret void
}

define void @shuffle_v16i8_to_v2i8_1(ptr %L, ptr %S) nounwind {
; SSE2-LABEL: shuffle_v16i8_to_v2i8_1:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa (%rdi), %xmm0
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
; SSE2-NEXT: packuswb %xmm0, %xmm0
; SSE2-NEXT: movd %xmm0, %eax
; SSE2-NEXT: movw %ax, (%rsi)
; SSE2-NEXT: retq
;
; SSE42-LABEL: shuffle_v16i8_to_v2i8_1:
; SSE42: # %bb.0:
; SSE42-NEXT: movdqa (%rdi), %xmm0
; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[1,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; SSE42-NEXT: pextrw $0, %xmm0, (%rsi)
; SSE42-NEXT: retq
;
; AVX-LABEL: shuffle_v16i8_to_v2i8_1:
; AVX: # %bb.0:
; AVX-NEXT: vmovdqa (%rdi), %xmm0
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX-NEXT: vpextrw $0, %xmm0, (%rsi)
; AVX-NEXT: retq
;
; AVX512-LABEL: shuffle_v16i8_to_v2i8_1:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovdqa (%rdi), %xmm0
; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512-NEXT: vpextrw $0, %xmm0, (%rsi)
; AVX512-NEXT: retq
  %vec = load <16 x i8>, ptr %L
  %strided.vec = shufflevector <16 x i8> %vec, <16 x i8> undef, <2 x i32> <i32 1, i32 9>
  store <2 x i8> %strided.vec, ptr %S
  ret void
}

define void @shuffle_v16i8_to_v2i8_2(ptr %L, ptr %S) nounwind {
; SSE2-LABEL: shuffle_v16i8_to_v2i8_2:
; SSE2: # %bb.0:
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = mem[0,2,2,3]
; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7]
; SSE2-NEXT: packuswb %xmm0, %xmm0
; SSE2-NEXT: movd %xmm0, %eax
; SSE2-NEXT: movw %ax, (%rsi)
; SSE2-NEXT: retq
;
; SSE42-LABEL: shuffle_v16i8_to_v2i8_2:
; SSE42: # %bb.0:
; SSE42-NEXT: movdqa (%rdi), %xmm0
; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[2,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; SSE42-NEXT: pextrw $0, %xmm0, (%rsi)
; SSE42-NEXT: retq
;
; AVX-LABEL: shuffle_v16i8_to_v2i8_2:
; AVX: # %bb.0:
; AVX-NEXT: vmovdqa (%rdi), %xmm0
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX-NEXT: vpextrw $0, %xmm0, (%rsi)
; AVX-NEXT: retq
;
; AVX512-LABEL: shuffle_v16i8_to_v2i8_2:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovdqa (%rdi), %xmm0
; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512-NEXT: vpextrw $0, %xmm0, (%rsi)
; AVX512-NEXT: retq
  %vec = load <16 x i8>, ptr %L
  %strided.vec = shufflevector <16 x i8> %vec, <16 x i8> undef, <2 x i32> <i32 2, i32 10>
  store <2 x i8> %strided.vec, ptr %S
  ret void
}

define void @shuffle_v16i8_to_v2i8_3(ptr %L, ptr %S) nounwind {
; SSE2-LABEL: shuffle_v16i8_to_v2i8_3:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa (%rdi), %xmm0
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE2-NEXT: packuswb %xmm0, %xmm0
; SSE2-NEXT: movd %xmm0, %eax
; SSE2-NEXT: movw %ax, (%rsi)
; SSE2-NEXT: retq
;
; SSE42-LABEL: shuffle_v16i8_to_v2i8_3:
; SSE42: # %bb.0:
; SSE42-NEXT: movdqa (%rdi), %xmm0
; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[3,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; SSE42-NEXT: pextrw $0, %xmm0, (%rsi)
; SSE42-NEXT: retq
;
; AVX-LABEL: shuffle_v16i8_to_v2i8_3:
; AVX: # %bb.0:
; AVX-NEXT: vmovdqa (%rdi), %xmm0
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[3,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX-NEXT: vpextrw $0, %xmm0, (%rsi)
; AVX-NEXT: retq
;
; AVX512-LABEL: shuffle_v16i8_to_v2i8_3:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovdqa (%rdi), %xmm0
; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[3,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512-NEXT: vpextrw $0, %xmm0, (%rsi)
; AVX512-NEXT: retq
  %vec = load <16 x i8>, ptr %L
  %strided.vec = shufflevector <16 x i8> %vec, <16 x i8> undef, <2 x i32> <i32 3, i32 11>
  store <2 x i8> %strided.vec, ptr %S
  ret void
}

define void @shuffle_v16i8_to_v2i8_4(ptr %L, ptr %S) nounwind {
; SSE2-LABEL: shuffle_v16i8_to_v2i8_4:
; SSE2: # %bb.0:
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = mem[3,1,2,3]
; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7]
; SSE2-NEXT: packuswb %xmm0, %xmm0
; SSE2-NEXT: movd %xmm0, %eax
; SSE2-NEXT: movw %ax, (%rsi)
; SSE2-NEXT: retq
;
; SSE42-LABEL: shuffle_v16i8_to_v2i8_4:
; SSE42: # %bb.0:
; SSE42-NEXT: movdqa (%rdi), %xmm0
; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[4,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; SSE42-NEXT: pextrw $0, %xmm0, (%rsi)
; SSE42-NEXT: retq
;
; AVX-LABEL: shuffle_v16i8_to_v2i8_4:
; AVX: # %bb.0:
; AVX-NEXT: vmovdqa (%rdi), %xmm0
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX-NEXT: vpextrw $0, %xmm0, (%rsi)
; AVX-NEXT: retq
;
; AVX512-LABEL: shuffle_v16i8_to_v2i8_4:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovdqa (%rdi), %xmm0
; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512-NEXT: vpextrw $0, %xmm0, (%rsi)
; AVX512-NEXT: retq
  %vec = load <16 x i8>, ptr %L
  %strided.vec = shufflevector <16 x i8> %vec, <16 x i8> undef, <2 x i32> <i32 4, i32 12>
  store <2 x i8> %strided.vec, ptr %S
  ret void
}

define void @shuffle_v16i8_to_v2i8_5(ptr %L, ptr %S) nounwind {
; SSE2-LABEL: shuffle_v16i8_to_v2i8_5:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa (%rdi), %xmm0
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
; SSE2-NEXT: packuswb %xmm0, %xmm0
; SSE2-NEXT: movd %xmm0, %eax
; SSE2-NEXT: movw %ax, (%rsi)
; SSE2-NEXT: retq
;
; SSE42-LABEL: shuffle_v16i8_to_v2i8_5:
; SSE42: # %bb.0:
; SSE42-NEXT: movdqa (%rdi), %xmm0
; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[5,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; SSE42-NEXT: pextrw $0, %xmm0, (%rsi)
; SSE42-NEXT: retq
;
; AVX-LABEL: shuffle_v16i8_to_v2i8_5:
; AVX: # %bb.0:
; AVX-NEXT: vmovdqa (%rdi), %xmm0
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[5,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX-NEXT: vpextrw $0, %xmm0, (%rsi)
; AVX-NEXT: retq
;
; AVX512-LABEL: shuffle_v16i8_to_v2i8_5:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovdqa (%rdi), %xmm0
; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[5,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512-NEXT: vpextrw $0, %xmm0, (%rsi)
; AVX512-NEXT: retq
  %vec = load <16 x i8>, ptr %L
  %strided.vec = shufflevector <16 x i8> %vec, <16 x i8> undef, <2 x i32> <i32 5, i32 13>
  store <2 x i8> %strided.vec, ptr %S
  ret void
}

define void @shuffle_v16i8_to_v2i8_6(ptr %L, ptr %S) nounwind {
; SSE2-LABEL: shuffle_v16i8_to_v2i8_6:
; SSE2: # %bb.0:
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = mem[3,1,2,3]
; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
; SSE2-NEXT: packuswb %xmm0, %xmm0
; SSE2-NEXT: movd %xmm0, %eax
; SSE2-NEXT: movw %ax, (%rsi)
; SSE2-NEXT: retq
;
; SSE42-LABEL: shuffle_v16i8_to_v2i8_6:
; SSE42: # %bb.0:
; SSE42-NEXT: movdqa (%rdi), %xmm0
; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[6,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; SSE42-NEXT: pextrw $0, %xmm0, (%rsi)
; SSE42-NEXT: retq
;
; AVX-LABEL: shuffle_v16i8_to_v2i8_6:
; AVX: # %bb.0:
; AVX-NEXT: vmovdqa (%rdi), %xmm0
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX-NEXT: vpextrw $0, %xmm0, (%rsi)
; AVX-NEXT: retq
;
; AVX512-LABEL: shuffle_v16i8_to_v2i8_6:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovdqa (%rdi), %xmm0
; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512-NEXT: vpextrw $0, %xmm0, (%rsi)
; AVX512-NEXT: retq
  %vec = load <16 x i8>, ptr %L
  %strided.vec = shufflevector <16 x i8> %vec, <16 x i8> undef, <2 x i32> <i32 6, i32 14>
  store <2 x i8> %strided.vec, ptr %S
  ret void
}

define void @shuffle_v16i8_to_v2i8_7(ptr %L, ptr %S) nounwind {
; SSE2-LABEL: shuffle_v16i8_to_v2i8_7:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa (%rdi), %xmm0
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
; SSE2-NEXT: psrlw $8, %xmm0
; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[3,3,3,3]
; SSE2-NEXT: packuswb %xmm0, %xmm0
; SSE2-NEXT: movd %xmm0, %eax
; SSE2-NEXT: movw %ax, (%rsi)
; SSE2-NEXT: retq
;
; SSE42-LABEL: shuffle_v16i8_to_v2i8_7:
; SSE42: # %bb.0:
; SSE42-NEXT: movdqa (%rdi), %xmm0
; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; SSE42-NEXT: pextrw $0, %xmm0, (%rsi)
; SSE42-NEXT: retq
;
; AVX-LABEL: shuffle_v16i8_to_v2i8_7:
; AVX: # %bb.0:
; AVX-NEXT: vmovdqa (%rdi), %xmm0
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX-NEXT: vpextrw $0, %xmm0, (%rsi)
; AVX-NEXT: retq
;
; AVX512-LABEL: shuffle_v16i8_to_v2i8_7:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovdqa (%rdi), %xmm0
; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512-NEXT: vpextrw $0, %xmm0, (%rsi)
; AVX512-NEXT: retq
  %vec = load <16 x i8>, ptr %L
  %strided.vec = shufflevector <16 x i8> %vec, <16 x i8> undef, <2 x i32> <i32 7, i32 15>
  store <2 x i8> %strided.vec, ptr %S
  ret void
}