; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512f | FileCheck %s --check-prefixes=AVX512,AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512BW

; trunc(concat(x,y)) -> pack

define <16 x i16> @trunc_concat_packssdw_256(<8 x i32> %a0, <8 x i32> %a1) nounwind {
; AVX1-LABEL: trunc_concat_packssdw_256:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpsrad $17, %xmm0, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vpsrad $17, %xmm0, %xmm0
; AVX1-NEXT:    vpsrad $23, %xmm1, %xmm3
; AVX1-NEXT:    vpackssdw %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT:    vpsrad $23, %xmm1, %xmm1
; AVX1-NEXT:    vpackssdw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: trunc_concat_packssdw_256:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsrad $17, %ymm0, %ymm0
; AVX2-NEXT:    vpsrad $23, %ymm1, %ymm1
; AVX2-NEXT:    vpackssdw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: trunc_concat_packssdw_256:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpsrad $17, %ymm0, %ymm0
; AVX512-NEXT:    vpsrad $23, %ymm1, %ymm1
; AVX512-NEXT:    vperm2i128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
; AVX512-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512-NEXT:    vpmovdw %zmm0, %ymm0
; AVX512-NEXT:    retq
  %1 = ashr <8 x i32> %a0, <i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17>
  %2 = ashr <8 x i32> %a1, <i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23>
  %3 = shufflevector <8 x i32> %1, <8 x i32> %2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
  %4 = trunc <16 x i32> %3 to <16 x i16>
  ret <16 x i16> %4
}

define <16 x i16> @trunc_concat_packusdw_256(<8 x i32> %a0, <8 x i32> %a1) nounwind {
; AVX1-LABEL: trunc_concat_packusdw_256:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpsrld $17, %xmm0, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vpsrld $17, %xmm0, %xmm0
; AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT:    vpackusdw %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpackusdw %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: trunc_concat_packusdw_256:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsrld $17, %ymm0, %ymm0
; AVX2-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15]
; AVX2-NEXT:    vpand %ymm2, %ymm1, %ymm1
; AVX2-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: trunc_concat_packusdw_256:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpsrld $17, %ymm0, %ymm0
; AVX512-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm1
; AVX512-NEXT:    vperm2i128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
; AVX512-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512-NEXT:    vpmovdw %zmm0, %ymm0
; AVX512-NEXT:    retq
  %1 = lshr <8 x i32> %a0, <i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17>
  %2 = and <8 x i32> %a1, <i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
  %3 = shufflevector <8 x i32> %1, <8 x i32> %2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
  %4 = trunc <16 x i32> %3 to <16 x i16>
  ret <16 x i16> %4
}

define <32 x i8> @trunc_concat_packsswb_256(<16 x i16> %a0, <16 x i16> %a1) nounwind {
; AVX1-LABEL: trunc_concat_packsswb_256:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpsraw $15, %xmm0, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vpsraw $15, %xmm0, %xmm0
; AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT:    vpacksswb %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpacksswb %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: trunc_concat_packsswb_256:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsraw $15, %ymm0, %ymm0
; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX2-NEXT:    vpacksswb %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: trunc_concat_packsswb_256:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpsraw $15, %ymm0, %ymm0
; AVX512F-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm1
; AVX512F-NEXT:    vpacksswb %ymm1, %ymm0, %ymm0
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: trunc_concat_packsswb_256:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpsraw $15, %ymm0, %ymm0
; AVX512BW-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm1
; AVX512BW-NEXT:    vperm2i128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
; AVX512BW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512BW-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT:    retq
  %1 = ashr <16 x i16> %a0, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
  %2 = and <16 x i16> %a1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %3 = shufflevector <16 x i16> %1, <16 x i16> %2, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  %4 = trunc <32 x i16> %3 to <32 x i8>
  ret <32 x i8> %4
}

define <32 x i8> @trunc_concat_packuswb_256(<16 x i16> %a0, <16 x i16> %a1) nounwind {
; AVX1-LABEL: trunc_concat_packuswb_256:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpsrlw $15, %xmm0, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vpsrlw $15, %xmm0, %xmm0
; AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT:    vpackuswb %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpackuswb %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: trunc_concat_packuswb_256:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsrlw $15, %ymm0, %ymm0
; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX2-NEXT:    vpackuswb %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: trunc_concat_packuswb_256:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpsrlw $15, %ymm0, %ymm0
; AVX512F-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm1
; AVX512F-NEXT:    vpackuswb %ymm1, %ymm0, %ymm0
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: trunc_concat_packuswb_256:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpsrlw $15, %ymm0, %ymm0
; AVX512BW-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm1
; AVX512BW-NEXT:    vperm2i128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
; AVX512BW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512BW-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT:    retq
  %1 = lshr <16 x i16> %a0, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
  %2 = and <16 x i16> %a1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %3 = shufflevector <16 x i16> %1, <16 x i16> %2, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  %4 = trunc <32 x i16> %3 to <32 x i8>
  ret <32 x i8> %4
}

; concat(trunc(x),trunc(y)) -> pack

define <16 x i16> @concat_trunc_packssdw_256(<8 x i32> %a0, <8 x i32> %a1) nounwind {
; AVX1-LABEL: concat_trunc_packssdw_256:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpsrad $17, %xmm0, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vpsrad $17, %xmm0, %xmm0
; AVX1-NEXT:    vpsrad $23, %xmm1, %xmm3
; AVX1-NEXT:    vpackssdw %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT:    vpsrad $23, %xmm1, %xmm1
; AVX1-NEXT:    vpackssdw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: concat_trunc_packssdw_256:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsrad $17, %ymm0, %ymm0
; AVX2-NEXT:    vpsrad $23, %ymm1, %ymm1
; AVX2-NEXT:    vpackssdw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: concat_trunc_packssdw_256:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpsrad $17, %ymm0, %ymm0
; AVX512-NEXT:    vpsrad $23, %ymm1, %ymm1
; AVX512-NEXT:    vpmovdw %ymm0, %xmm0
; AVX512-NEXT:    vpmovdw %ymm1, %xmm1
; AVX512-NEXT:    vpunpckhqdq {{.*#+}} xmm2 = xmm0[1],xmm1[1]
; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
; AVX512-NEXT:    retq
  %1 = ashr <8 x i32> %a0, <i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17>
  %2 = ashr <8 x i32> %a1, <i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23>
  %3 = trunc <8 x i32> %1 to <8 x i16>
  %4 = trunc <8 x i32> %2 to <8 x i16>
  %5 = shufflevector <8 x i16> %3, <8 x i16> %4, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
  ret <16 x i16> %5
}

define <16 x i16> @concat_trunc_packusdw_256(<8 x i32> %a0, <8 x i32> %a1) nounwind {
; AVX1-LABEL: concat_trunc_packusdw_256:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vpsrld $17, %xmm2, %xmm2
; AVX1-NEXT:    vpsrld $17, %xmm0, %xmm0
; AVX1-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vpackusdw %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX1-NEXT:    vpunpckhqdq {{.*#+}} xmm2 = xmm0[1],xmm1[1]
; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: concat_trunc_packusdw_256:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsrld $17, %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
; AVX2-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX2-NEXT:    vpunpckhqdq {{.*#+}} xmm2 = xmm0[1],xmm1[1]
; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: concat_trunc_packusdw_256:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpsrld $17, %ymm0, %ymm0
; AVX512-NEXT:    vpmovdw %ymm0, %xmm0
; AVX512-NEXT:    vpmovdw %ymm1, %xmm1
; AVX512-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1
; AVX512-NEXT:    vpunpckhqdq {{.*#+}} xmm2 = xmm0[1],xmm1[1]
; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
; AVX512-NEXT:    retq
  %1 = lshr <8 x i32> %a0, <i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17>
  %2 = and <8 x i32> %a1, <i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
  %3 = trunc <8 x i32> %1 to <8 x i16>
  %4 = trunc <8 x i32> %2 to <8 x i16>
  %5 = shufflevector <8 x i16> %3, <8 x i16> %4, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
  ret <16 x i16> %5
}

define <32 x i8> @concat_trunc_packsswb_256(<16 x i16> %a0, <16 x i16> %a1) nounwind {
; AVX1-LABEL: concat_trunc_packsswb_256:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vpsraw $15, %xmm2, %xmm2
; AVX1-NEXT:    vpsraw $15, %xmm0, %xmm0
; AVX1-NEXT:    vpacksswb %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vpackuswb %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX1-NEXT:    vpunpckhqdq {{.*#+}} xmm2 = xmm0[1],xmm1[1]
; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: concat_trunc_packsswb_256:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsraw $15, %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
; AVX2-NEXT:    vpacksswb %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
; AVX2-NEXT:    vpackuswb %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX2-NEXT:    vpunpckhqdq {{.*#+}} xmm2 = xmm0[1],xmm1[1]
; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: concat_trunc_packsswb_256:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpsraw $15, %ymm0, %ymm0
; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
; AVX512F-NEXT:    vpmovdb %zmm1, %xmm1
; AVX512F-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1
; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} xmm2 = xmm0[1],xmm1[1]
; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512F-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: concat_trunc_packsswb_256:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpsraw $15, %ymm0, %ymm0
; AVX512BW-NEXT:    vpmovwb %ymm0, %xmm0
; AVX512BW-NEXT:    vpmovwb %ymm1, %xmm1
; AVX512BW-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1
; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} xmm2 = xmm0[1],xmm1[1]
; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512BW-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
; AVX512BW-NEXT:    retq
  %1 = ashr <16 x i16> %a0, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
  %2 = and <16 x i16> %a1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %3 = trunc <16 x i16> %1 to <16 x i8>
  %4 = trunc <16 x i16> %2 to <16 x i8>
  %5 = shufflevector <16 x i8> %3, <16 x i8> %4, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  ret <32 x i8> %5
}

define <32 x i8> @concat_trunc_packuswb_256(<16 x i16> %a0, <16 x i16> %a1) nounwind {
; AVX1-LABEL: concat_trunc_packuswb_256:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vpsrlw $15, %xmm2, %xmm2
; AVX1-NEXT:    vpsrlw $15, %xmm0, %xmm0
; AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vpackuswb %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX1-NEXT:    vpunpckhqdq {{.*#+}} xmm2 = xmm0[1],xmm1[1]
; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: concat_trunc_packuswb_256:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsrlw $15, %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
; AVX2-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
; AVX2-NEXT:    vpackuswb %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX2-NEXT:    vpunpckhqdq {{.*#+}} xmm2 = xmm0[1],xmm1[1]
; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: concat_trunc_packuswb_256:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpsrlw $15, %ymm0, %ymm0
; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
; AVX512F-NEXT:    vpmovdb %zmm1, %xmm1
; AVX512F-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1
; AVX512F-NEXT:    vpunpckhqdq {{.*#+}} xmm2 = xmm0[1],xmm1[1]
; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512F-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: concat_trunc_packuswb_256:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpsrlw $15, %ymm0, %ymm0
; AVX512BW-NEXT:    vpmovwb %ymm0, %xmm0
; AVX512BW-NEXT:    vpmovwb %ymm1, %xmm1
; AVX512BW-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1
; AVX512BW-NEXT:    vpunpckhqdq {{.*#+}} xmm2 = xmm0[1],xmm1[1]
; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512BW-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
; AVX512BW-NEXT:    retq
  %1 = lshr <16 x i16> %a0, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
  %2 = and <16 x i16> %a1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %3 = trunc <16 x i16> %1 to <16 x i8>
  %4 = trunc <16 x i16> %2 to <16 x i8>
  %5 = shufflevector <16 x i8> %3, <16 x i8> %4, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  ret <32 x i8> %5
}