; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-FAST
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-FAST
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512,AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL

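; Strided-extract-with-offset tests: each function loads a 256-bit vector,
; keeps every 2nd, 4th, or 8th element starting at a non-zero offset, and
; stores the narrowed result. The first three functions halve the element
; count (stride 2, offset 1) for i8, i16, and i32 elements.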
define void @shuffle_v32i8_to_v16i8_1(ptr %L, ptr %S) nounwind {
; AVX-LABEL: shuffle_v32i8_to_v16i8_1:
; AVX: # %bb.0:
; AVX-NEXT: vmovdqa (%rdi), %xmm0
; AVX-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX-NEXT: vmovq {{.*#+}} xmm2 = [1,3,5,7,9,11,13,15,0,0,0,0,0,0,0,0]
; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX-NEXT: vmovdqa %xmm0, (%rsi)
; AVX-NEXT: retq
;
; AVX512F-LABEL: shuffle_v32i8_to_v16i8_1:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512F-NEXT: vmovq {{.*#+}} xmm2 = [1,3,5,7,9,11,13,15,0,0,0,0,0,0,0,0]
; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512F-NEXT: vmovdqa %xmm0, (%rsi)
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v32i8_to_v16i8_1:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0
; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512VL-NEXT: vmovq {{.*#+}} xmm2 = [1,3,5,7,9,11,13,15,0,0,0,0,0,0,0,0]
; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512VL-NEXT: vmovdqa %xmm0, (%rsi)
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v32i8_to_v16i8_1:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT: vpsrlw $8, %ymm0, %ymm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v32i8_to_v16i8_1:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vpsrlw $8, (%rdi), %ymm0
; AVX512BWVL-NEXT: vpmovwb %ymm0, (%rsi)
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
  %vec = load <32 x i8>, ptr %L
  %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
  store <16 x i8> %strided.vec, ptr %S
  ret void
}

define void @shuffle_v16i16_to_v8i16_1(ptr %L, ptr %S) nounwind {
; AVX-LABEL: shuffle_v16i16_to_v8i16_1:
; AVX: # %bb.0:
; AVX-NEXT: vmovdqa (%rdi), %xmm0
; AVX-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX-NEXT: vpsrld $16, %xmm1, %xmm1
; AVX-NEXT: vpsrld $16, %xmm0, %xmm0
; AVX-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovdqa %xmm0, (%rsi)
; AVX-NEXT: retq
;
; AVX512F-LABEL: shuffle_v16i16_to_v8i16_1:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
; AVX512F-NEXT: vpsrld $16, %ymm0, %ymm0
; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
; AVX512F-NEXT: vmovdqa %xmm0, (%rsi)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_to_v8i16_1:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpsrld $16, (%rdi), %ymm0
; AVX512VL-NEXT: vpmovdw %ymm0, (%rsi)
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v16i16_to_v8i16_1:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT: vpsrld $16, %ymm0, %ymm0
; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0
; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v16i16_to_v8i16_1:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vpsrld $16, (%rdi), %ymm0
; AVX512BWVL-NEXT: vpmovdw %ymm0, (%rsi)
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
  %vec = load <16 x i16>, ptr %L
  %strided.vec = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  store <8 x i16> %strided.vec, ptr %S
  ret void
}

define void @shuffle_v8i32_to_v4i32_1(ptr %L, ptr %S) nounwind {
; AVX-LABEL: shuffle_v8i32_to_v4i32_1:
; AVX: # %bb.0:
; AVX-NEXT: vmovaps (%rdi), %xmm0
; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],mem[1,3]
; AVX-NEXT: vmovaps %xmm0, (%rsi)
; AVX-NEXT: retq
;
; AVX512-LABEL: shuffle_v8i32_to_v4i32_1:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovaps (%rdi), %xmm0
; AVX512-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],mem[1,3]
; AVX512-NEXT: vmovaps %xmm0, (%rsi)
; AVX512-NEXT: retq
  %vec = load <8 x i32>, ptr %L
  %strided.vec = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  store <4 x i32> %strided.vec, ptr %S
  ret void
}

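; <32 x i8> -> <8 x i8>: keep every 4th byte, at offsets 1 and 2. On AVX512
; targets this becomes a 32-bit element shift (vpsrld) plus a vpmovdb truncate.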
define void @shuffle_v32i8_to_v8i8_1(ptr %L, ptr %S) nounwind {
; AVX-LABEL: shuffle_v32i8_to_v8i8_1:
; AVX: # %bb.0:
; AVX-NEXT: vmovdqa (%rdi), %xmm0
; AVX-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX-NEXT: vmovd {{.*#+}} xmm2 = [1,5,9,13,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX-NEXT: vmovq %xmm0, (%rsi)
; AVX-NEXT: retq
;
; AVX512F-LABEL: shuffle_v32i8_to_v8i8_1:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
; AVX512F-NEXT: vpsrld $8, %ymm0, %ymm0
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: vmovq %xmm0, (%rsi)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v32i8_to_v8i8_1:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpsrld $8, (%rdi), %ymm0
; AVX512VL-NEXT: vpmovdb %ymm0, (%rsi)
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v32i8_to_v8i8_1:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT: vpsrld $8, %ymm0, %ymm0
; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0
; AVX512BW-NEXT: vmovq %xmm0, (%rsi)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v32i8_to_v8i8_1:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vpsrld $8, (%rdi), %ymm0
; AVX512BWVL-NEXT: vpmovdb %ymm0, (%rsi)
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
  %vec = load <32 x i8>, ptr %L
  %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <8 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29>
  store <8 x i8> %strided.vec, ptr %S
  ret void
}

define void @shuffle_v32i8_to_v8i8_2(ptr %L, ptr %S) nounwind {
; AVX-LABEL: shuffle_v32i8_to_v8i8_2:
; AVX: # %bb.0:
; AVX-NEXT: vmovdqa (%rdi), %xmm0
; AVX-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX-NEXT: vmovd {{.*#+}} xmm2 = [2,6,10,14,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX-NEXT: vmovq %xmm0, (%rsi)
; AVX-NEXT: retq
;
; AVX512F-LABEL: shuffle_v32i8_to_v8i8_2:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
; AVX512F-NEXT: vpsrld $16, %ymm0, %ymm0
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: vmovq %xmm0, (%rsi)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v32i8_to_v8i8_2:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpsrld $16, (%rdi), %ymm0
; AVX512VL-NEXT: vpmovdb %ymm0, (%rsi)
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v32i8_to_v8i8_2:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT: vpsrld $16, %ymm0, %ymm0
; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0
; AVX512BW-NEXT: vmovq %xmm0, (%rsi)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v32i8_to_v8i8_2:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vpsrld $16, (%rdi), %ymm0
; AVX512BWVL-NEXT: vpmovdb %ymm0, (%rsi)
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
  %vec = load <32 x i8>, ptr %L
  %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <8 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30>
  store <8 x i8> %strided.vec, ptr %S
  ret void
}

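; The same stride-4 byte extract at offset 3 (vpsrld $24 on AVX512).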
define void @shuffle_v32i8_to_v8i8_3(ptr %L, ptr %S) nounwind {
; AVX-LABEL: shuffle_v32i8_to_v8i8_3:
; AVX: # %bb.0:
; AVX-NEXT: vmovdqa (%rdi), %xmm0
; AVX-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX-NEXT: vmovd {{.*#+}} xmm2 = [3,7,11,15,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX-NEXT: vmovq %xmm0, (%rsi)
; AVX-NEXT: retq
;
; AVX512F-LABEL: shuffle_v32i8_to_v8i8_3:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
; AVX512F-NEXT: vpsrld $24, %ymm0, %ymm0
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: vmovq %xmm0, (%rsi)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v32i8_to_v8i8_3:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpsrld $24, (%rdi), %ymm0
; AVX512VL-NEXT: vpmovdb %ymm0, (%rsi)
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v32i8_to_v8i8_3:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT: vpsrld $24, %ymm0, %ymm0
; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0
; AVX512BW-NEXT: vmovq %xmm0, (%rsi)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v32i8_to_v8i8_3:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vpsrld $24, (%rdi), %ymm0
; AVX512BWVL-NEXT: vpmovdb %ymm0, (%rsi)
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
  %vec = load <32 x i8>, ptr %L
  %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <8 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31>
  store <8 x i8> %strided.vec, ptr %S
  ret void
}

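; <16 x i16> -> <4 x i16>: keep every 4th word, at offsets 1 and 2. AVX512
; uses a 64-bit element shift (vpsrlq) plus a vpmovqw truncate.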
define void @shuffle_v16i16_to_v4i16_1(ptr %L, ptr %S) nounwind {
; AVX1-LABEL: shuffle_v16i16_to_v4i16_1:
; AVX1: # %bb.0:
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,2,2,3]
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = mem[0,2,2,3]
; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7]
; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; AVX1-NEXT: vmovq %xmm0, (%rsi)
; AVX1-NEXT: retq
;
; AVX2-SLOW-LABEL: shuffle_v16i16_to_v4i16_1:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,2,2,3]
; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7]
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = mem[0,2,2,3]
; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7]
; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; AVX2-SLOW-NEXT: vmovq %xmm0, (%rsi)
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-LABEL: shuffle_v16i16_to_v4i16_1:
; AVX2-FAST: # %bb.0:
; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm0
; AVX2-FAST-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15]
; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX2-FAST-NEXT: vmovq %xmm0, (%rsi)
; AVX2-FAST-NEXT: retq
;
; AVX512F-LABEL: shuffle_v16i16_to_v4i16_1:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
; AVX512F-NEXT: vpsrlq $16, %ymm0, %ymm0
; AVX512F-NEXT: vpmovqw %zmm0, %xmm0
; AVX512F-NEXT: vmovq %xmm0, (%rsi)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_to_v4i16_1:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpsrlq $16, (%rdi), %ymm0
; AVX512VL-NEXT: vpmovqw %ymm0, (%rsi)
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v16i16_to_v4i16_1:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT: vpsrlq $16, %ymm0, %ymm0
; AVX512BW-NEXT: vpmovqw %zmm0, %xmm0
; AVX512BW-NEXT: vmovq %xmm0, (%rsi)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v16i16_to_v4i16_1:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vpsrlq $16, (%rdi), %ymm0
; AVX512BWVL-NEXT: vpmovqw %ymm0, (%rsi)
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
  %vec = load <16 x i16>, ptr %L
  %strided.vec = shufflevector <16 x i16> %vec, <16 x i16> undef, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
  store <4 x i16> %strided.vec, ptr %S
  ret void
}

define void @shuffle_v16i16_to_v4i16_2(ptr %L, ptr %S) nounwind {
; AVX1-LABEL: shuffle_v16i16_to_v4i16_2:
; AVX1: # %bb.0:
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = mem[3,1,2,3]
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = mem[3,1,2,3]
; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7]
; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; AVX1-NEXT: vmovq %xmm0, (%rsi)
; AVX1-NEXT: retq
;
; AVX2-SLOW-LABEL: shuffle_v16i16_to_v4i16_2:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = mem[3,1,2,3]
; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7]
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = mem[3,1,2,3]
; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7]
; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; AVX2-SLOW-NEXT: vmovq %xmm0, (%rsi)
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-LABEL: shuffle_v16i16_to_v4i16_2:
; AVX2-FAST: # %bb.0:
; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm0
; AVX2-FAST-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [4,5,12,13,4,5,6,7,8,9,10,11,12,13,14,15]
; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX2-FAST-NEXT: vmovq %xmm0, (%rsi)
; AVX2-FAST-NEXT: retq
;
; AVX512F-LABEL: shuffle_v16i16_to_v4i16_2:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
; AVX512F-NEXT: vpsrlq $32, %ymm0, %ymm0
; AVX512F-NEXT: vpmovqw %zmm0, %xmm0
; AVX512F-NEXT: vmovq %xmm0, (%rsi)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_to_v4i16_2:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpsrlq $32, (%rdi), %ymm0
; AVX512VL-NEXT: vpmovqw %ymm0, (%rsi)
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v16i16_to_v4i16_2:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT: vpsrlq $32, %ymm0, %ymm0
; AVX512BW-NEXT: vpmovqw %zmm0, %xmm0
; AVX512BW-NEXT: vmovq %xmm0, (%rsi)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v16i16_to_v4i16_2:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vpsrlq $32, (%rdi), %ymm0
; AVX512BWVL-NEXT: vpmovqw %ymm0, (%rsi)
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
  %vec = load <16 x i16>, ptr %L
  %strided.vec = shufflevector <16 x i16> %vec, <16 x i16> undef, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
  store <4 x i16> %strided.vec, ptr %S
  ret void
}

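; Stride-4 word extract at offset 3 (vpsrlq $48 on AVX512).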
define void @shuffle_v16i16_to_v4i16_3(ptr %L, ptr %S) nounwind {
; AVX1-LABEL: shuffle_v16i16_to_v4i16_3:
; AVX1: # %bb.0:
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = mem[3,1,2,3]
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = mem[3,1,2,3]
; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; AVX1-NEXT: vmovq %xmm0, (%rsi)
; AVX1-NEXT: retq
;
; AVX2-SLOW-LABEL: shuffle_v16i16_to_v4i16_3:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = mem[3,1,2,3]
; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = mem[3,1,2,3]
; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; AVX2-SLOW-NEXT: vmovq %xmm0, (%rsi)
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-LABEL: shuffle_v16i16_to_v4i16_3:
; AVX2-FAST: # %bb.0:
; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm0
; AVX2-FAST-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [6,7,14,15,4,5,6,7,8,9,10,11,12,13,14,15]
; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX2-FAST-NEXT: vmovq %xmm0, (%rsi)
; AVX2-FAST-NEXT: retq
;
; AVX512F-LABEL: shuffle_v16i16_to_v4i16_3:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
; AVX512F-NEXT: vpsrlq $48, %ymm0, %ymm0
; AVX512F-NEXT: vpmovqw %zmm0, %xmm0
; AVX512F-NEXT: vmovq %xmm0, (%rsi)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_to_v4i16_3:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpsrlq $48, (%rdi), %ymm0
; AVX512VL-NEXT: vpmovqw %ymm0, (%rsi)
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v16i16_to_v4i16_3:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT: vpsrlq $48, %ymm0, %ymm0
; AVX512BW-NEXT: vpmovqw %zmm0, %xmm0
; AVX512BW-NEXT: vmovq %xmm0, (%rsi)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v16i16_to_v4i16_3:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vpsrlq $48, (%rdi), %ymm0
; AVX512BWVL-NEXT: vpmovqw %ymm0, (%rsi)
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
  %vec = load <16 x i16>, ptr %L
  %strided.vec = shufflevector <16 x i16> %vec, <16 x i16> undef, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
  store <4 x i16> %strided.vec, ptr %S
  ret void
}

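; <32 x i8> -> <4 x i8>: keep every 8th byte, at offsets 1 and 2. AVX512
; shifts the i64 lanes (vpsrlq) and truncates with vpmovqb.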
define void @shuffle_v32i8_to_v4i8_1(ptr %L, ptr %S) nounwind {
; AVX1-LABEL: shuffle_v32i8_to_v4i8_1:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovdqa (%rdi), %xmm0
; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX1-NEXT: vmovd {{.*#+}} xmm2 = [1,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX1-NEXT: vmovd %xmm0, (%rsi)
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v32i8_to_v4i8_1:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa (%rdi), %xmm0
; AVX2-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm2 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX2-NEXT: vmovd %xmm0, (%rsi)
; AVX2-NEXT: retq
;
; AVX512F-LABEL: shuffle_v32i8_to_v4i8_1:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
; AVX512F-NEXT: vpsrlq $8, %ymm0, %ymm0
; AVX512F-NEXT: vpmovqb %zmm0, %xmm0
; AVX512F-NEXT: vmovd %xmm0, (%rsi)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v32i8_to_v4i8_1:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpsrlq $8, (%rdi), %ymm0
; AVX512VL-NEXT: vpmovqb %ymm0, (%rsi)
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v32i8_to_v4i8_1:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT: vpsrlq $8, %ymm0, %ymm0
; AVX512BW-NEXT: vpmovqb %zmm0, %xmm0
; AVX512BW-NEXT: vmovd %xmm0, (%rsi)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v32i8_to_v4i8_1:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vpsrlq $8, (%rdi), %ymm0
; AVX512BWVL-NEXT: vpmovqb %ymm0, (%rsi)
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
  %vec = load <32 x i8>, ptr %L
  %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <4 x i32> <i32 1, i32 9, i32 17, i32 25>
  store <4 x i8> %strided.vec, ptr %S
  ret void
}

define void @shuffle_v32i8_to_v4i8_2(ptr %L, ptr %S) nounwind {
; AVX1-LABEL: shuffle_v32i8_to_v4i8_2:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovdqa (%rdi), %xmm0
; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX1-NEXT: vmovd {{.*#+}} xmm2 = [2,10,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX1-NEXT: vmovd %xmm0, (%rsi)
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v32i8_to_v4i8_2:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa (%rdi), %xmm0
; AVX2-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm2 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX2-NEXT: vmovd %xmm0, (%rsi)
; AVX2-NEXT: retq
;
; AVX512F-LABEL: shuffle_v32i8_to_v4i8_2:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
; AVX512F-NEXT: vpsrlq $16, %ymm0, %ymm0
; AVX512F-NEXT: vpmovqb %zmm0, %xmm0
; AVX512F-NEXT: vmovd %xmm0, (%rsi)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v32i8_to_v4i8_2:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpsrlq $16, (%rdi), %ymm0
; AVX512VL-NEXT: vpmovqb %ymm0, (%rsi)
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v32i8_to_v4i8_2:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT: vpsrlq $16, %ymm0, %ymm0
; AVX512BW-NEXT: vpmovqb %zmm0, %xmm0
; AVX512BW-NEXT: vmovd %xmm0, (%rsi)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v32i8_to_v4i8_2:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vpsrlq $16, (%rdi), %ymm0
; AVX512BWVL-NEXT: vpmovqb %ymm0, (%rsi)
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
  %vec = load <32 x i8>, ptr %L
  %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <4 x i32> <i32 2, i32 10, i32 18, i32 26>
  store <4 x i8> %strided.vec, ptr %S
  ret void
}

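; Stride-8 byte extracts at offsets 3 and 4 (vpsrlq $24 and $32 on AVX512).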
define void @shuffle_v32i8_to_v4i8_3(ptr %L, ptr %S) nounwind {
; AVX1-LABEL: shuffle_v32i8_to_v4i8_3:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovdqa (%rdi), %xmm0
; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX1-NEXT: vmovd {{.*#+}} xmm2 = [3,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX1-NEXT: vmovd %xmm0, (%rsi)
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v32i8_to_v4i8_3:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa (%rdi), %xmm0
; AVX2-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm2 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX2-NEXT: vmovd %xmm0, (%rsi)
; AVX2-NEXT: retq
;
; AVX512F-LABEL: shuffle_v32i8_to_v4i8_3:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
; AVX512F-NEXT: vpsrlq $24, %ymm0, %ymm0
; AVX512F-NEXT: vpmovqb %zmm0, %xmm0
; AVX512F-NEXT: vmovd %xmm0, (%rsi)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v32i8_to_v4i8_3:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpsrlq $24, (%rdi), %ymm0
; AVX512VL-NEXT: vpmovqb %ymm0, (%rsi)
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v32i8_to_v4i8_3:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT: vpsrlq $24, %ymm0, %ymm0
; AVX512BW-NEXT: vpmovqb %zmm0, %xmm0
; AVX512BW-NEXT: vmovd %xmm0, (%rsi)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v32i8_to_v4i8_3:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vpsrlq $24, (%rdi), %ymm0
; AVX512BWVL-NEXT: vpmovqb %ymm0, (%rsi)
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
  %vec = load <32 x i8>, ptr %L
  %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <4 x i32> <i32 3, i32 11, i32 19, i32 27>
  store <4 x i8> %strided.vec, ptr %S
  ret void
}

define void @shuffle_v32i8_to_v4i8_4(ptr %L, ptr %S) nounwind {
; AVX1-LABEL: shuffle_v32i8_to_v4i8_4:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovdqa (%rdi), %xmm0
; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX1-NEXT: vmovd {{.*#+}} xmm2 = [4,12,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX1-NEXT: vmovd %xmm0, (%rsi)
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v32i8_to_v4i8_4:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa (%rdi), %xmm0
; AVX2-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm2 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX2-NEXT: vmovd %xmm0, (%rsi)
; AVX2-NEXT: retq
;
; AVX512F-LABEL: shuffle_v32i8_to_v4i8_4:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
; AVX512F-NEXT: vpsrlq $32, %ymm0, %ymm0
; AVX512F-NEXT: vpmovqb %zmm0, %xmm0
; AVX512F-NEXT: vmovd %xmm0, (%rsi)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v32i8_to_v4i8_4:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpsrlq $32, (%rdi), %ymm0
; AVX512VL-NEXT: vpmovqb %ymm0, (%rsi)
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v32i8_to_v4i8_4:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT: vpsrlq $32, %ymm0, %ymm0
; AVX512BW-NEXT: vpmovqb %zmm0, %xmm0
; AVX512BW-NEXT: vmovd %xmm0, (%rsi)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v32i8_to_v4i8_4:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vpsrlq $32, (%rdi), %ymm0
; AVX512BWVL-NEXT: vpmovqb %ymm0, (%rsi)
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
  %vec = load <32 x i8>, ptr %L
  %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <4 x i32> <i32 4, i32 12, i32 20, i32 28>
  store <4 x i8> %strided.vec, ptr %S
  ret void
}

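; Stride-8 byte extract at offset 5 (vpsrlq $40 on AVX512).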
define void @shuffle_v32i8_to_v4i8_5(ptr %L, ptr %S) nounwind {
; AVX1-LABEL: shuffle_v32i8_to_v4i8_5:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovdqa (%rdi), %xmm0
; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX1-NEXT: vmovd {{.*#+}} xmm2 = [5,13,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX1-NEXT: vmovd %xmm0, (%rsi)
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v32i8_to_v4i8_5:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa (%rdi), %xmm0
; AVX2-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm2 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX2-NEXT: vmovd %xmm0, (%rsi)
; AVX2-NEXT: retq
;
; AVX512F-LABEL: shuffle_v32i8_to_v4i8_5:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
; AVX512F-NEXT: vpsrlq $40, %ymm0, %ymm0
; AVX512F-NEXT: vpmovqb %zmm0, %xmm0
; AVX512F-NEXT: vmovd %xmm0, (%rsi)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v32i8_to_v4i8_5:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpsrlq $40, (%rdi), %ymm0
; AVX512VL-NEXT: vpmovqb %ymm0, (%rsi)
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v32i8_to_v4i8_5:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT: vpsrlq $40, %ymm0, %ymm0
; AVX512BW-NEXT: vpmovqb %zmm0, %xmm0
; AVX512BW-NEXT: vmovd %xmm0, (%rsi)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v32i8_to_v4i8_5:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vpsrlq $40, (%rdi), %ymm0
; AVX512BWVL-NEXT: vpmovqb %ymm0, (%rsi)
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
  %vec = load <32 x i8>, ptr %L
  %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <4 x i32> <i32 5, i32 13, i32 21, i32 29>
  store <4 x i8> %strided.vec, ptr %S
  ret void
}

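; Stride-8 byte extract at offset 6 (vpsrlq $48 on AVX512).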
define void @shuffle_v32i8_to_v4i8_6(ptr %L, ptr %S) nounwind {
; AVX1-LABEL: shuffle_v32i8_to_v4i8_6:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovdqa (%rdi), %xmm0
; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX1-NEXT: vmovd {{.*#+}} xmm2 = [6,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX1-NEXT: vmovd %xmm0, (%rsi)
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v32i8_to_v4i8_6:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa (%rdi), %xmm0
; AVX2-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm2 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX2-NEXT: vmovd %xmm0, (%rsi)
; AVX2-NEXT: retq
;
; AVX512F-LABEL: shuffle_v32i8_to_v4i8_6:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
; AVX512F-NEXT: vpsrlq $48, %ymm0, %ymm0
; AVX512F-NEXT: vpmovqb %zmm0, %xmm0
; AVX512F-NEXT: vmovd %xmm0, (%rsi)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v32i8_to_v4i8_6:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpsrlq $48, (%rdi), %ymm0
; AVX512VL-NEXT: vpmovqb %ymm0, (%rsi)
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v32i8_to_v4i8_6:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT: vpsrlq $48, %ymm0, %ymm0
; AVX512BW-NEXT: vpmovqb %zmm0, %xmm0
; AVX512BW-NEXT: vmovd %xmm0, (%rsi)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v32i8_to_v4i8_6:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vpsrlq $48, (%rdi), %ymm0
; AVX512BWVL-NEXT: vpmovqb %ymm0, (%rsi)
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
  %vec = load <32 x i8>, ptr %L
  %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <4 x i32> <i32 6, i32 14, i32 22, i32 30>
  store <4 x i8> %strided.vec, ptr %S
  ret void
}

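; Stride-8 byte extract at offset 7 (vpsrlq $56 on AVX512).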
define void @shuffle_v32i8_to_v4i8_7(ptr %L, ptr %S) nounwind {
; AVX1-LABEL: shuffle_v32i8_to_v4i8_7:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovdqa (%rdi), %xmm0
; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX1-NEXT: vmovd {{.*#+}} xmm2 = [7,15,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX1-NEXT: vmovd %xmm0, (%rsi)
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v32i8_to_v4i8_7:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa (%rdi), %xmm0
; AVX2-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm2 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX2-NEXT: vmovd %xmm0, (%rsi)
; AVX2-NEXT: retq
;
; AVX512F-LABEL: shuffle_v32i8_to_v4i8_7:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
; AVX512F-NEXT: vpsrlq $56, %ymm0, %ymm0
; AVX512F-NEXT: vpmovqb %zmm0, %xmm0
; AVX512F-NEXT: vmovd %xmm0, (%rsi)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v32i8_to_v4i8_7:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpsrlq $56, (%rdi), %ymm0
; AVX512VL-NEXT: vpmovqb %ymm0, (%rsi)
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v32i8_to_v4i8_7:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT: vpsrlq $56, %ymm0, %ymm0
; AVX512BW-NEXT: vpmovqb %zmm0, %xmm0
; AVX512BW-NEXT: vmovd %xmm0, (%rsi)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v32i8_to_v4i8_7:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vpsrlq $56, (%rdi), %ymm0
; AVX512BWVL-NEXT: vpmovqb %ymm0, (%rsi)
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
  %vec = load <32 x i8>, ptr %L
  %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <4 x i32> <i32 7, i32 15, i32 23, i32 31>
  store <4 x i8> %strided.vec, ptr %S
  ret void
}